Example no. 1
import joblib  # model (de)serialisation; 'out' is a project-local helper module.

def persist_model(clf, dataset_name, flipped=False):
    out.create_dir('./pickled_models')
    out.create_dir('./pickled_models/{}'.format(dataset_name))
    joblib.dump(
        clf, "./pickled_models/{}/{}.pkl".format(
            dataset_name,
            clf.filename() if not flipped else clf.filename() + '_flipped'))
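
A minimal usage sketch, assuming a stand-in for the project's model wrappers
(which expose the filename() method this snippet relies on); the 'adult'
dataset name is illustrative:

class DummyModel:
    def filename(self):
        return 'dummy_model'

persist_model(DummyModel(), 'adult')                # ./pickled_models/adult/dummy_model.pkl
persist_model(DummyModel(), 'adult', flipped=True)  # ./pickled_models/adult/dummy_model_flipped.pkl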
Example no. 2
import os
import urllib.parse
import urllib.request

# 'output' is a project-local helper module (assumed available here).

def check_data_file(dataset, base_url, fname):
    files_dir = os.path.dirname(os.path.realpath(
        __file__)) + '/data/' + dataset  # data directory next to this file
    output.create_dir(files_dir)
    files = os.listdir(files_dir)  # current listing of the data directory
    print("Looking for file '%s' in the data directory..." % fname)
    full_file = "{}/{}".format(files_dir, fname)

    if fname not in files:
        print("'{}' not found! Downloading ...".format(fname))
        url = base_url + urllib.parse.quote(fname)
        response = urllib.request.urlopen(url)
        content_charset = response.info().get_content_charset()
        if content_charset is not None:
            # text file: decode with the charset the server reported
            data = response.read().decode(content_charset, 'ignore')
            write_spec = "w"
        else:
            # binary file
            data = response.read()
            write_spec = "wb"
        with open(full_file, write_spec) as file_out:
            file_out.write(data)
        print("'%s' downloaded and saved locally." % fname)
    else:
        print("File found in the data directory.")

    return full_file
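
A hedged usage sketch; the UCI base URL and file name below are illustrative,
not taken from the original code:

base = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/'
path = check_data_file('adult', base, 'adult.data')  # downloads on first call
print(path)  # <directory of this file>/data/adult/adult.data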
Example no. 3
    def __init__(self, dataset, val_split=0.):
        self.hyperparam_store_loc = 'hyperparams/prob_class_params'
        output.create_dir('hyperparams')

        self.ds_name = dataset
        self.val_split = val_split
        self.X = None
Example no. 4
# The plotting snippets below assume these module-level imports, plus
# project-local helpers ('out', 'colors', 'get_wiki_link'); shown once here.
import operator
import matplotlib.pyplot as plt
import numpy as np

def plot_one_var_vs_other_together(res_dir,
                                   sens_dict,
                                   nosens_dict,
                                   x_label,
                                   y_label,
                                   format='png'):
    plt.rcParams['font.size'] = 24
    plt.rcParams['pdf.fonttype'] = 42
    plt.rcParams['ps.fonttype'] = 42
    plt.rcParams['axes.labelsize'] = 22
    plt.rcParams['axes.labelweight'] = 'bold'
    plt.rcParams['axes.titlesize'] = 15
    plt.rcParams['axes.linewidth'] = 3
    plt.rcParams['xtick.labelsize'] = 16
    plt.rcParams['ytick.labelsize'] = 16
    plt.rcParams['legend.fontsize'] = 10
    plt.rcParams['figure.titlesize'] = 28
    plt.rcParams['lines.linewidth'] = 3.0

    plots_dir = res_dir + "/disparity_plots"
    out.create_dir(plots_dir)
    filename = '{}_vs_{}_together'.format('_'.join(x_label.split()),
                                          '_'.join(y_label.split()))
    figpath = plots_dir + '/' + filename + '.' + format
    fig = plt.figure(figsize=(4, 4))
    ax = fig.add_subplot(111)
    idx = 0
    for model, vals in sens_dict.items():
        ax.plot(vals[0],
                vals[1],
                color=colors[idx],
                label='{} (Women)'.format(model.shortfilename()))
        ax.plot(nosens_dict[model][0],
                nosens_dict[model][1],
                linestyle=':',
                color=colors[idx],
                label='{} (Men)'.format(model.shortfilename()))
        idx += 1
    ax.set_xlabel(x_label + ' (' + r'$\delta$' + ')')
    ax.set_ylabel(y_label)
    # box = ax.get_position()
    # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    # ax.legend(loc='lower right', bbox_to_anchor=(1, 0.5))
    ax.legend(loc='lower right')
    plt.savefig(figpath, format=format, bbox_inches='tight')
    plt.savefig(plots_dir + '/' + filename + '.pdf',
                format='pdf',
                bbox_inches='tight')

    # The return value links to a separate legend image, so the legend has to
    # be written out (this block was previously commented out).
    fig_legend = plt.figure(figsize=(3, 3))
    handles, labels = ax.get_legend_handles_labels()
    fig_legend.legend(handles, labels, loc='center', ncol=1)
    fig_legend.savefig(plots_dir + '/' + filename + "_legend." + format,
                       format=format, bbox_inches='tight')
    fig_legend.savefig(plots_dir + '/' + filename + "_legend.pdf",
                       format='pdf', bbox_inches='tight')
    plt.close(fig_legend)
    plt.close(fig)

    return "{}\n\n{}".format(
        get_wiki_link(figpath),
        get_wiki_link(plots_dir + '/' + filename + "_legend." + format))
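
Both dictionaries are assumed to map a model object (with a shortfilename()
method) to an (x_values, y_values) pair; a toy sketch, where `model` stands in
for one of the project's model wrappers:

taus = [0.1, 0.2, 0.3]
sens_dict = {model: (taus, [0.50, 0.42, 0.30])}    # curve for women
nosens_dict = {model: (taus, [0.45, 0.35, 0.28])}  # curve for men
plot_one_var_vs_other_together('results/adult', sens_dict, nosens_dict,
                               'Utility threshold', 'Disparity')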
Example no. 5
    def find_neighbourhoods(self, X, Y, tau_sens, tau_nosens, model=None):
        out.create_dir('./params')
        # Build the cache key once so load and save agree (the original save
        # call always used model.filename(), crashing when model is None).
        # KMeans and adjusted_mutual_info_score come from scikit-learn.
        params_key = ('{}_{}_{}'.format(model.filename(), tau_sens, tau_nosens)
                      if model is not None else
                      '{}_{}'.format(tau_sens, tau_nosens))
        k_means_params = aeio.load_params('./params/KMeans', params_key)
        if k_means_params is None:
            k_means_params = self.find_best_params(
                X, Y, kmeans_param_grid, adjusted_mutual_info_score, KMeans)
            aeio.save_params('./params/KMeans', params_key, k_means_params)
        alg = KMeans(**k_means_params, n_jobs=-1, random_state=42)
        alg.fit(X)
        return alg.cluster_centers_, alg.labels_, k_means_params
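
The module-level kmeans_param_grid consumed by find_best_params is not shown;
a plausible shape, assuming a standard scikit-learn-style grid:

kmeans_param_grid = {
    'n_clusters': [2, 4, 8, 16],  # candidate neighbourhood counts
    'n_init': [10],
}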
Example no. 6
    def __init__(self):
        self.dataset, self.models_other_than_rules = exp.base_exp(
            return_vars=True)
        self.prediction_task = exp.dataset_info[self.dataset]['prediction_task']
        self.res_dir = 'results/{}'.format(self.dataset)
        out.create_dir(self.res_dir)
        self.res_file_path = self.res_dir + '/res_utilities_thresholds.txt'
        self.seg_file_path = self.res_dir + '/res_segregation.txt'
        self.wiki_parent_path = "Actionable-Explanations/Simple-Explanations-{}".format(
            self.dataset)
        self.sens_group_desc = exp.dataset_info[self.dataset]['sens_f']
        self.cost_groups = {
            cf.ONE_GROUP_IND: "all",
            0: self.sens_group_desc[0],
            1: self.sens_group_desc[1]
        }
        self.segregation_indices = [si.Atkinson, si.Centralization, si.Clustering]
Example no. 7
# Imports assumed for this snippet; pydotplus and scikit-learn's
# export_graphviz render the fitted tree to a PNG.
from io import StringIO

import pydotplus
from sklearn.tree import export_graphviz

def plot_dtree(res_dir, clf, feature_info):
    plots_dir = res_dir + "/disparity_plots"
    out.create_dir(plots_dir)
    filename = "{}_viz".format(clf.filename())
    figpath = plots_dir + '/' + filename + '.png'
    dot_data = StringIO()
    export_graphviz(clf.clf,
                    out_file=dot_data,
                    filled=True,
                    rounded=True,
                    special_characters=True,
                    feature_names=get_feature_names(feature_info))
    graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
    graph.write_png(figpath)
    return get_wiki_link(figpath)
Example no. 8
    def __init__(self, subsample_size_test=None, subsample_size_train=None):
        self.dataset, self.models_other_than_rules = exp.base_exp(
            return_vars=True)
        self.res_dir = 'results/{}'.format(self.dataset)
        out.create_dir(self.res_dir)
        self.res_dir = self.res_dir if not exp.FAIRNESS_CONSTRAINTS else '{}/FC'.format(
            self.res_dir)
        out.create_dir(self.res_dir)
        self.res_file_path = self.res_dir + '/res_lti.txt'
        self.wiki_parent_path = "Actionable-Explanations/Simple-Explanations-{}".format(
            self.dataset)
        self.subsample_size_test = subsample_size_test
        self.subsample_size_train = subsample_size_train
        self.sens_group_desc = exp.dataset_info[self.dataset]['sens_f']
        self.prediction_task = exp.dataset_info[
            self.dataset]['prediction_task']
        self.cost_groups = {
            cf.ONE_GROUP_IND: "all",
            0: self.sens_group_desc[0],
            1: self.sens_group_desc[1]
        }
Example no. 9
def plot_covar_matrix(res_dir, X, feature_info, format='png'):
    plt.rcParams['font.size'] = 12
    plt.rcParams['pdf.fonttype'] = 42
    plt.rcParams['ps.fonttype'] = 42
    plt.rcParams['axes.labelsize'] = 16
    plt.rcParams['axes.labelweight'] = 'bold'
    plt.rcParams['axes.titlesize'] = 10
    plt.rcParams['axes.linewidth'] = 1
    plt.rcParams['xtick.labelsize'] = 10
    plt.rcParams['ytick.labelsize'] = 10
    plt.rcParams['legend.fontsize'] = 16
    plt.rcParams['figure.titlesize'] = 15
    plt.rcParams['lines.linewidth'] = 1.0

    covar_mat = np.cov(X, rowvar=False)
    assert covar_mat.shape[0] == X.shape[1] and covar_mat.shape[1] == X.shape[1]
    plots_dir = res_dir + "/disparity_plots"
    out.create_dir(plots_dir)
    filename = 'training_set_feature_covar_mat'
    figpath = plots_dir + '/' + filename + '.' + format
    plt.figure(figsize=(20, 18))
    import seaborn as sns  # imported lazily; only this plot needs seaborn
    ax = sns.heatmap(covar_mat,
                     annot=True,
                     xticklabels=get_feature_names(feature_info),
                     yticklabels=get_feature_names(feature_info))
    ax.xaxis.tick_top()  # x axis on top
    ax.xaxis.set_label_position('top')
    ax.tick_params(length=0)
    plt.xticks(rotation=1)
    plt.title('Covariance Matrix')
    plt.savefig(figpath, format=format, bbox_inches='tight')
    plt.savefig(plots_dir + '/' + filename + '.pdf',
                format='pdf',
                bbox_inches='tight')
    plt.close()  # free the large figure before returning
    return get_wiki_link(figpath)
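
With rowvar=False, np.cov treats each column as a feature, which is exactly
what the shape assertion above checks; a quick self-contained confirmation:

import numpy as np

X_toy = np.array([[1.0, 2.0, 3.0],
                  [2.0, 4.0, 5.0],
                  [3.0, 6.0, 9.0]])
cov = np.cov(X_toy, rowvar=False)  # one row and column per feature
assert cov.shape == (X_toy.shape[1], X_toy.shape[1])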
Example no. 10
def get_segregation_plots_new(res_dir,
                              outer_seg_index_mapping,
                              fc,
                              format='png'):
    """
    This function was written very very close to the deadline.
    """
    plt.rcParams['font.size'] = 24
    plt.rcParams['pdf.fonttype'] = 42
    plt.rcParams['ps.fonttype'] = 42
    plt.rcParams['axes.labelsize'] = 22
    plt.rcParams['axes.labelweight'] = 'bold'
    plt.rcParams['axes.titlesize'] = 15
    plt.rcParams['axes.linewidth'] = 3
    plt.rcParams['xtick.labelsize'] = 12
    plt.rcParams['ytick.labelsize'] = 12
    plt.rcParams['legend.fontsize'] = 10
    plt.rcParams['figure.titlesize'] = 28
    plt.rcParams['lines.linewidth'] = 3.0

    plots_dir = res_dir + "/segregation_plots"
    out.create_dir(plots_dir)
    x_title = r'$\tau$'

    print("Initially passed dict: {}".format(outer_seg_index_mapping))

    for tau_nosens, seg_index_mapping in outer_seg_index_mapping.items():
        if tau_nosens > 0:
            continue
        count = 3
        if fc:
            for index_type, mapping in seg_index_mapping.items():
                # plot_title = str(index_type)
                plot_title = ''
                fig = plt.figure(figsize=(4, 4))
                ax = fig.add_subplot(111)
                ax.get_yaxis().get_major_formatter().set_useOffset(False)
                x_vals, y_vals, y_vals_old = [], [], []
                for model, tau_mapping in mapping.items():
                    if '0.00' in tau_mapping:
                        x_vals.append(model.tau)
                        y_vals.append(tau_mapping.pop('0.00'))
                        for inner_model, inner_tau_mapping in mapping.items():
                            if 'Original Population' in inner_tau_mapping and model.tau == inner_model.tau:
                                y_vals_old.append(
                                    inner_tau_mapping['Original Population'])

                print(x_vals, y_vals_old, y_vals)
                x_vals, y_vals, y_vals_old = list(
                    zip(*(sorted(zip(x_vals, y_vals, y_vals_old),
                                 key=operator.itemgetter(0)))))

                ax.plot(x_vals,
                        y_vals,
                        color=colors[count],
                        marker='o',
                        label='Impacted Population')
                ax.plot(x_vals,
                        y_vals_old,
                        color=colors[count + 1],
                        marker='o',
                        label='Initial Population')

                ax.set_title(plot_title)
                ax.set_ylabel(str(index_type))
                ax.set_xlabel(x_title)
                ax.set_xticks(x_vals)
                ax.set_xticklabels(list(map(str, x_vals)))
                # box = ax.get_position()
                # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
                if 'atkinson' in index_type.shortname().lower():
                    ax.legend(loc='lower left')
                elif 'aci' in index_type.shortname().lower():
                    ax.legend(loc='upper right')
                else:
                    ax.legend(loc='lower right')
                filename = "segregation_{}_fc".format(index_type.shortname())
                figpath = plots_dir + '/' + filename + '.' + format
                plt.savefig(figpath, format=format, bbox_inches='tight')
                plt.savefig(plots_dir + '/' + filename + '.pdf',
                            format='pdf',
                            bbox_inches='tight')
                plt.close(fig)

                yield get_wiki_link(figpath)
        else:
            for index_type, mapping in seg_index_mapping.items():
                x_labels, y_old, y_new = [], [], []
                # plot_title = str(index_type)
                for model, tau_mapping in mapping.items():
                    if '0.00' in tau_mapping:
                        print("Outer: {}".format(model.filename()))
                        # if index_type.shortname() == 'centralization':
                        #     x_labels.append('{}\n{}'.format(model.shortfilename(), 'Cent.'))
                        # elif index_type.shortname() == 'atkinson':
                        #     x_labels.append('{}\n{}'.format(model.shortfilename(), 'Atkinson'))
                        # else:
                        #     x_labels.append('{}\n{}'.format(model.shortfilename(), index_type.shortname()))
                        x_labels.append(model.shortfilename())
                        y_new.append(tau_mapping['0.00'])
                        for inner_model, inner_tau_mapping in mapping.items():
                            if ('Original Population' in inner_tau_mapping and
                                    model.filename() == inner_model.filename()):
                                print("Added")
                                y_old.append(
                                    inner_tau_mapping['Original Population'])
                                break

                fig = plt.figure(figsize=(4, 4))
                ax = fig.add_subplot(111)
                ax.get_yaxis().get_major_formatter().set_useOffset(False)
                ind = np.arange(len(y_old))
                width = 0.25

                min_y = min(min(y_new), min(y_old))
                max_y = max(max(y_new), max(y_old))
                range_y = max_y - min_y
                ax.set_ylim(min_y - range_y, max_y + range_y)

                ax.bar(ind + width,
                       y_new,
                       width=width,
                       color=colors[0],
                       label='Impacted Population')
                ax.bar(ind,
                       y_old,
                       width=width,
                       color=colors[1],
                       label='Initial Population')
                ax.set_xticks(ind + width / 2)
                ax.set_ylabel(str(index_type))
                ax.set_xticklabels(x_labels)
                ax.legend(loc='upper right')
                filename = "segregation_{}".format(index_type.shortname())
                figpath = plots_dir + '/' + filename + '.' + format
                plt.savefig(figpath, format=format, bbox_inches='tight')
                plt.savefig(plots_dir + '/' + filename + '.pdf',
                            format='pdf',
                            bbox_inches='tight')
                plt.close(fig)
                yield get_wiki_link(figpath)
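
The nesting consumed above is easiest to see as a toy literal; the real keys
are index and model objects, so the strings here only illustrate the structure:

outer_seg_index_mapping = {
    0.0: {                           # tau_nosens
        'Atkinson': {                # index_type
            'LogReg_tau_0.1': {      # model
                'Original Population': 0.31,
                '0.00': 0.28,        # value after flipping at this tau_sens
            },
        },
    },
}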
Example no. 11
def get_segregation_plots(res_dir, outer_seg_index_mapping, format='png'):
    plt.rcParams['font.size'] = 24
    plt.rcParams['pdf.fonttype'] = 42
    plt.rcParams['ps.fonttype'] = 42
    plt.rcParams['axes.labelsize'] = 22
    plt.rcParams['axes.labelweight'] = 'bold'
    plt.rcParams['axes.titlesize'] = 15
    plt.rcParams['axes.linewidth'] = 3
    plt.rcParams['xtick.labelsize'] = 10
    plt.rcParams['ytick.labelsize'] = 12
    plt.rcParams['legend.fontsize'] = 18
    plt.rcParams['figure.titlesize'] = 28
    plt.rcParams['lines.linewidth'] = 3.0

    plots_dir = res_dir + "/segregation_plots"
    out.create_dir(plots_dir)
    # '~s' marks the non-sensitive group; written as valid mathtext here
    # (the original r'$C$_~s' left the subscript outside math mode).
    x_title, y_title = r'$C_s$', r'$C_{\sim s}$'

    threed_plot_mapping = {
    }  # mapping from { SSI : {model: ([<x_vals>], [<y_vals>], [<z_vals>])} }
    plot_strings_mapping = {}  # mapping from { SSI : [<str1>, <str2>....] }
    strings_to_write = []

    print("Initially passed dict: {}".format(outer_seg_index_mapping))

    for tau_nosens, seg_index_mapping in outer_seg_index_mapping.items():
        plot_strings_mapping[tau_nosens] = {}
        for index_type, mapping in seg_index_mapping.items():
            if index_type.shortname() not in threed_plot_mapping:
                threed_plot_mapping[index_type.shortname()] = {}
            # plot_title = str(index_type)
            plot_title = ''
            fig = plt.figure(figsize=(4, 4))
            ax = fig.add_subplot(111)
            ax.get_yaxis().get_major_formatter().set_useOffset(False)
            # ax.set_yscale('log')
            count = 0  # (min_y/max_y were initialised here but never used)
            for model, tau_mapping in mapping.items():
                y_vals_old = tau_mapping.pop('Original Population')
                x_labels, y_vals = list(zip(*tau_mapping.items()))
                x_labels = list(map(float, x_labels))
                x_labels, y_vals = list(
                    zip(*(sorted(zip(x_labels, y_vals),
                                 key=operator.itemgetter(0)))))
                y_vals_old = [y_vals_old] * len(
                    x_labels) if y_vals_old is not None else None
                x_vals = np.arange(1, len(x_labels) + 1, 1)
                ax.plot(x_vals,
                        y_vals,
                        color=colors[count],
                        marker='o',
                        label=model.filename())
                if y_vals_old is not None:
                    ax.plot(x_vals,
                            y_vals_old,
                            color=colors[-1],
                            linestyle='dashed',
                            marker='o',
                            label='Original Population')

                if model in threed_plot_mapping[index_type.shortname()]:
                    threed_plot_mapping[
                        index_type.shortname()][model][0] += list(
                            map(float, x_labels))
                    threed_plot_mapping[index_type.shortname(
                    )][model][1] += [tau_nosens] * len(x_labels)
                    threed_plot_mapping[
                        index_type.shortname()][model][2] += list(
                            y_vals)  # X, Y, Z
                    threed_plot_mapping[index_type.shortname(
                    )]['Original Population'][0] += list(map(float, x_labels))
                    threed_plot_mapping[index_type.shortname(
                    )]['Original Population'][1] += [tau_nosens
                                                     ] * len(x_labels)
                    threed_plot_mapping[index_type.shortname(
                    )]['Original Population'][2] += list(y_vals_old)
                else:
                    threed_plot_mapping[index_type.shortname()][model] = [
                        list(map(float, x_labels)),
                        [tau_nosens] * len(x_labels),
                        list(y_vals)
                    ]  # X, Y, Z
                    threed_plot_mapping[
                        index_type.shortname()]['Original Population'] = [
                            list(map(float, x_labels)),
                            [tau_nosens] * len(x_labels),
                            list(y_vals_old)
                        ]  # X, Y, Z

                count += 1
            ax.set_title(plot_title)
            ax.set_ylabel(str(index_type))
            ax.set_xlabel(x_title)
            ax.set_xticks(x_vals)
            ax.set_xticklabels(x_labels)
            # box = ax.get_position()
            # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
            # ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

            filename = "segregation_{}_{}".format(index_type.shortname(),
                                                  tau_nosens)
            figpath = plots_dir + '/' + filename + '.' + format
            plt.savefig(figpath, format=format, bbox_inches='tight')
            plt.savefig(plots_dir + '/' + filename + '.pdf',
                        format='pdf',
                        bbox_inches='tight')
            fig_legend = plt.figure(figsize=(4, 3))
            handles, labels = ax.get_legend_handles_labels()
            # pass loc as a keyword; the positional form is deprecated
            fig_legend.legend(handles, labels, loc='center', ncol=1)
            fig_legend.savefig(plots_dir + '/' + filename + "_legend." + format,
                               format=format,
                               bbox_inches='tight')
            fig_legend.savefig(plots_dir + '/' + filename + "_legend.pdf",
                               format='pdf',
                               bbox_inches='tight')
            plt.close(fig_legend)
            plt.close(fig)

            if index_type.shortname() in plot_strings_mapping[tau_nosens]:
                plot_strings_mapping[tau_nosens][
                    index_type.shortname()].append("{}\n\n{}".format(
                        get_wiki_link(figpath),
                        get_wiki_link(plots_dir + '/' + filename + "_legend." +
                                      format)))
            else:
                plot_strings_mapping[tau_nosens][index_type.shortname()] = [
                    "{}\n\n{}".format(
                        get_wiki_link(figpath),
                        get_wiki_link(plots_dir + '/' + filename + "_legend." +
                                      format))
                ]

    print("3D plot mapping: {}".format(threed_plot_mapping))
    print("Plot string mapping: {}".format(plot_strings_mapping))

    for seg_index, model_mapping in threed_plot_mapping.items():
        strings_to_write.append("== {} ==".format(str(seg_index)))

        for tau_nosens, seg_index_mapping in plot_strings_mapping.items():
            strings_to_write.append(
                "=== Tau for non-sens: {} ===".format(tau_nosens))
            strings_to_write += seg_index_mapping[seg_index]

        # plot_title = str(seg_index)
        plot_title = ''
        filename = "segregation_{}_3d".format(seg_index)
        figpath = plots_dir + '/' + filename + '.' + format
        fig = plt.figure()
        ax = fig.add_subplot(111, projection='3d')
        count = 0
        for model, x_y_z in model_mapping.items():
            x_meshgrid, y_meshgrid = np.meshgrid(x_y_z[0], x_y_z[1])
            _, z_meshgrid = np.meshgrid(x_y_z[0], x_y_z[2])
            ax.plot_wireframe(
                x_meshgrid,
                y_meshgrid,
                z_meshgrid,
                label=model if isinstance(model, str) else model.filename(),
                linestyle='dashed' if isinstance(model, str) else 'solid',
                color=colors[count])
            count += 1
        ax.set_title(plot_title)
        ax.set_xlabel(x_title)
        ax.set_ylabel(y_title)
        box = ax.get_position()
        ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
        plt.savefig(figpath, format=format, bbox_inches='tight')
        plt.savefig(plots_dir + '/' + filename + '.pdf',
                    format='pdf',
                    bbox_inches='tight')
        plt.close(fig)
        strings_to_write.append(get_wiki_link(figpath))

    return strings_to_write
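
The 3-D section builds wireframes from 1-D value lists via np.meshgrid; a
minimal standalone sketch of that idiom on toy data:

import matplotlib.pyplot as plt
import numpy as np

xs, taus, zs = [0.1, 0.2, 0.3], [0.0, 0.1, 0.2], [0.50, 0.42, 0.30]
x_m, y_m = np.meshgrid(xs, taus)
_, z_m = np.meshgrid(xs, zs)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot_wireframe(x_m, y_m, z_m)
plt.close(fig)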
Example no. 12
def get_pdf_plots(res_dir,
                  X,
                  sens_group,
                  taus_for_sens,
                  feature_info,
                  data_for_pdf,
                  tau_nosens,
                  separate_legend=False,
                  y_title='',
                  combine_all_plots=False,
                  plot_title='',
                  filename='',
                  format='png'):
    assert len(feature_info) == X.shape[1]
    plt.rcParams['font.size'] = 24
    plt.rcParams['pdf.fonttype'] = 42
    plt.rcParams['ps.fonttype'] = 42
    plt.rcParams['axes.labelsize'] = 22
    plt.rcParams['axes.labelweight'] = 'bold'
    plt.rcParams['axes.titlesize'] = 15
    plt.rcParams['axes.linewidth'] = 3
    plt.rcParams['xtick.labelsize'] = 10
    plt.rcParams['ytick.labelsize'] = 12
    plt.rcParams['legend.fontsize'] = 18
    plt.rcParams['figure.titlesize'] = 28
    plt.rcParams['lines.linewidth'] = 3.0

    plots_dir = res_dir + "/pdf_before_after_plots"
    out.create_dir(plots_dir)

    X_sens, X_nosens = X[sens_group], X[~sens_group]

    for i in range(len(feature_info)):
        for model, tau_sens_to_population in data_for_pdf.items():
            plot_title = "{}, feature name: {} ({}), C_~s = {}".format(
                model.filename(), feature_info[i][0], feature_info[i][1],
                tau_nosens)
            filename = "pdf_{}_{}_{}".format(model.filename(), tau_nosens,
                                             feature_info[i][0])
            figpath = plots_dir + '/' + filename + '.' + format
            figpath_full = plots_dir + '/' + filename + '_complete_population.' + format
            fig1, (ax1, ax2) = plt.subplots(1,
                                            2,
                                            sharey=True,
                                            sharex=True,
                                            figsize=(6, 6))
            fig3 = plt.figure(figsize=(4, 4))
            ax3 = fig3.add_subplot(111)
            fig1.suptitle(plot_title)
            ax1.set_title("Sens Group")
            ax2.set_title("Non-Sens Group")
            x_vals = sorted(list(set(X[:, i])))
            y_vals_sens = [
                np.count_nonzero(X_sens[:, i] == x_val) / len(X_sens)
                for x_val in x_vals
            ]
            y_vals_nosens = [
                np.count_nonzero(X_nosens[:, i] == x_val) / len(X_nosens)
                for x_val in x_vals
            ]
            y_vals_all = [
                np.count_nonzero(X[:, i] == x_val) / len(X) for x_val in x_vals
            ]
            ax1.plot(x_vals,
                     y_vals_sens,
                     color=colors[0],
                     marker='',
                     linestyle='dashed')
            ax2.plot(x_vals,
                     y_vals_nosens,
                     color=colors[0],
                     marker='',
                     linestyle='dashed',
                     label='Original Population')
            ax3.plot(x_vals,
                     y_vals_all,
                     color=colors[0],
                     marker='',
                     linestyle='dashed',
                     label='Original Population')
            for tau_sens, population in tau_sens_to_population.items():
                idx = taus_for_sens.index(tau_sens)
                population_sens, population_nosens = population[
                    sens_group], population[~sens_group]
                y_vals_sens = [
                    np.count_nonzero(population_sens[:, i] == x_val) /
                    len(population_sens) for x_val in x_vals
                ]
                y_vals_nosens = [
                    np.count_nonzero(population_nosens[:, i] == x_val) /
                    len(population_nosens) for x_val in x_vals
                ]
                y_vals_all = [
                    np.count_nonzero(population[:, i] == x_val) /
                    len(population) for x_val in x_vals
                ]
                ax1.plot(x_vals, y_vals_sens, color=colors[idx + 1], marker='')
                ax2.plot(x_vals,
                         y_vals_nosens,
                         color=colors[idx + 1],
                         marker='',
                         label='{}'.format(tau_sens))
                ax3.plot(x_vals,
                         y_vals_all,
                         color=colors[idx + 1],
                         marker='',
                         label='{}'.format(tau_sens))
            if not separate_legend:
                box2, box3 = ax2.get_position(), ax3.get_position()
                ax2.set_position(
                    [box2.x0, box2.y0, box2.width * 0.8, box2.height])
                ax2.legend(loc='center left', bbox_to_anchor=(1, 0.5))
                ax3.set_position(
                    [box3.x0, box3.y0, box3.width * 0.8, box3.height])
                ax3.legend(loc='center left', bbox_to_anchor=(1, 0.5))
            fig1.savefig(figpath, format=format, bbox_inches='tight')
            fig1.savefig(plots_dir + '/' + filename + '.pdf',
                         format='pdf',
                         bbox_inches='tight')
            fig3.savefig(figpath_full, format=format, bbox_inches='tight')
            fig3.savefig(plots_dir + '/' + filename +
                         '_complete_population.pdf',
                         format='pdf',
                         bbox_inches='tight')
            if separate_legend:
                # Restored from the commented-out draft; note the draft was
                # also missing the '.' before the format extension.
                fig_legend = plt.figure(figsize=(4, 3))
                handles, labels = ax2.get_legend_handles_labels()
                fig_legend.legend(handles, labels, loc='center', ncol=1)
                fig_legend.savefig(plots_dir + '/' + filename + "_legend." + format,
                                   format=format, bbox_inches='tight')
                fig_legend.savefig(plots_dir + '/' + filename + "_legend.pdf",
                                   format='pdf', bbox_inches='tight')
                plt.close(fig_legend)
            plt.close(fig1)
            plt.close(fig3)
            wiki_string = "\n{}\n\n{}\n\n".format(get_wiki_link(figpath),
                                                  get_wiki_link(figpath_full))
            yield wiki_string
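
Input shapes assumed by get_pdf_plots, sketched with toy arrays (`model`
stands in for a project model wrapper with a filename() method, and the
generator has to be consumed for any plot to be written):

import numpy as np

X = np.array([[0., 1.], [1., 1.], [2., 0.]])
sens_group = np.array([True, False, True])         # boolean row mask over X
feature_info = [('age', 'int'), ('hours', 'int')]  # (name, type) per column
data_for_pdf = {model: {0.1: X.copy()}}            # model -> {tau_sens: new X}
for wiki in get_pdf_plots('results/adult', X, sens_group, [0.1],
                          feature_info, data_for_pdf, 0.0):
    print(wiki)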
Example no. 13
def get_abs_clustering_plots(res_dir,
                             thresholds,
                             new_abs_clustering_index,
                             old_abs_clustering_index,
                             tau_nosens,
                             y_title,
                             plot_title='',
                             filename='abs_index_utility_threshold',
                             format='png'):
    plt.rcParams['font.size'] = 24
    plt.rcParams['pdf.fonttype'] = 42
    plt.rcParams['ps.fonttype'] = 42
    plt.rcParams['axes.labelsize'] = 22
    plt.rcParams['axes.labelweight'] = 'bold'
    plt.rcParams['axes.titlesize'] = 15
    plt.rcParams['axes.linewidth'] = 3
    plt.rcParams['xtick.labelsize'] = 10
    plt.rcParams['ytick.labelsize'] = 12
    plt.rcParams['legend.fontsize'] = 18
    plt.rcParams['figure.titlesize'] = 28
    plt.rcParams['lines.linewidth'] = 3.0

    plots_dir = res_dir + "/disparity_plots"
    out.create_dir(plots_dir)
    # '~s' marks the non-sensitive group; valid mathtext replaces r'$C$_~s'.
    x_title, plot_title = r'$C_s$', r'$C_{\sim s}$ = ' + str(tau_nosens)

    fig = plt.figure(figsize=(6, 6))
    ax = fig.add_subplot(111)
    count = 0  # (min_y/max_y were initialised here but never used)
    for k, v in new_abs_clustering_index.items():
        y_vals, y_vals_old = v, [old_abs_clustering_index[k]] * len(
            thresholds) if old_abs_clustering_index is not None else None
        x_labels = [
            "{:.2f}".format(float(x)) if float(10000 * x) % 100 == 0 else ''
            for x in thresholds
        ]
        x_vals = np.arange(1, len(x_labels) + 1, 1)
        ax.plot(x_vals,
                y_vals,
                color=colors[count],
                marker='o',
                label=k.filename())
        if y_vals_old is not None:
            ax.plot(x_vals,
                    y_vals_old,
                    color=colors[count],
                    linestyle='dashed',
                    marker='o')
        count += 1
    ax.set_title(plot_title)
    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)
    ax.set_xticks(x_vals)
    ax.set_xticklabels(x_labels)
    # if 'fpr' in y_title.lower():
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    figpath = plots_dir + '/' + filename + '.' + format
    plt.savefig(figpath, format=format, bbox_inches='tight')
    plt.savefig(plots_dir + '/' + filename + '.pdf',
                format='pdf',
                bbox_inches='tight')
    plt.clf()
    plt.close()
    wiki_string = "\n{}\n\n".format(get_wiki_link(figpath))
    return wiki_string
Example no. 14
def get_utility_threshold_plots(res_dir,
                                models,
                                col_headings,
                                values,
                                values_old,
                                tau_nosens,
                                plot_title='',
                                filename='utility_threshold',
                                format='png'):
    plt.rcParams['font.size'] = 24
    plt.rcParams['pdf.fonttype'] = 42
    plt.rcParams['ps.fonttype'] = 42
    plt.rcParams['axes.labelsize'] = 20
    plt.rcParams['axes.labelweight'] = 'bold'
    plt.rcParams['axes.titlesize'] = 15
    plt.rcParams['axes.linewidth'] = 1
    plt.rcParams['xtick.labelsize'] = 10
    plt.rcParams['ytick.labelsize'] = 12
    plt.rcParams['legend.fontsize'] = 14
    plt.rcParams['figure.titlesize'] = 28
    plt.rcParams['lines.linewidth'] = 1.0

    plots_dir = res_dir + "/disparity_plots"
    out.create_dir(plots_dir)
    values = np.array(values)
    x_title, y_title = r'$C_s$', col_headings[2].split("<<BR>>")[0].strip()
    # plot_title = r'$C$_~s = ' + str(tau_nosens)
    plot_title = ''

    fig = plt.figure(figsize=(4, 4))
    ax = fig.add_subplot(111)
    all_model_names = [str(x) for x in models]
    for i in range(len(all_model_names)):
        mask = np.where(values[:, 0] == all_model_names[i])[0]
        x_labels = values[mask, 1].flatten()
        y_vals = list(values[mask, 2].flatten())
        y_vals_old = list(values_old[mask, 2].flatten())
        y_vals_old = [float(val.split('(')[0].strip()) for val in y_vals_old]
        x_labels = ["{:.2f}".format(float(x)) for x in x_labels]
        x_vals = np.arange(1, len(x_labels) + 1, 1)
        y_vals = [float(val.split('(')[0].strip()) for val in y_vals]
        ax.plot(x_vals,
                y_vals,
                color=colors[i],
                marker='o',
                label=all_model_names[i].split("<<BR>>")[0])
        ax.plot(x_vals,
                y_vals_old,
                color=colors[i],
                linestyle='dashed',
                marker='o')
    ax.set_title(plot_title)
    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)
    ax.set_xticks(x_vals)
    ax.set_xticklabels(x_labels)
    # if 'fpr' in y_title.lower():
    # box = ax.get_position()
    # ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    # ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

    figpath = plots_dir + '/' + filename + '.' + format
    plt.savefig(figpath, format=format, bbox_inches='tight')
    plt.savefig(plots_dir + '/' + filename + '.pdf',
                format='pdf',
                bbox_inches='tight')
    fig_legend = plt.figure(figsize=(3, 3))
    handles, labels = ax.get_legend_handles_labels()
    # pass loc as a keyword; the positional form is deprecated
    fig_legend.legend(handles, labels, loc='center', ncol=1)
    fig_legend.savefig(plots_dir + '/' + filename + "_legend." + format,
                       format=format,
                       bbox_inches='tight')
    fig_legend.savefig(plots_dir + '/' + filename + "_legend.pdf",
                       format='pdf',
                       bbox_inches='tight')
    plt.clf()
    plt.close()
    wiki_string = "\n{}\n\n{}\n\n".format(
        get_wiki_link(figpath),
        get_wiki_link(plots_dir + '/' + filename + "_legend." + format))
    return wiki_string
Example no. 15
def get_disparity_plots(res_dir,
                        col_headings,
                        values,
                        plot_title='',
                        filename='all_disp_in_one',
                        format='png'):
    plt.rcParams['font.size'] = 24
    plt.rcParams['pdf.fonttype'] = 42
    plt.rcParams['ps.fonttype'] = 42
    plt.rcParams['axes.labelsize'] = 22
    plt.rcParams['axes.labelweight'] = 'bold'
    plt.rcParams['axes.titlesize'] = 15
    plt.rcParams['axes.linewidth'] = 3
    plt.rcParams['xtick.labelsize'] = 14
    plt.rcParams['ytick.labelsize'] = 14
    plt.rcParams['legend.fontsize'] = 18
    plt.rcParams['figure.titlesize'] = 28
    plt.rcParams['lines.linewidth'] = 3.0

    plots_dir = res_dir + "/disparity_plots"
    out.create_dir(plots_dir)
    values = np.array(values)
    x_labels = list(values[:, 0])
    x_labels = [label.split("<<BR>>")[0].strip() for label in x_labels]
    try:
        x_labels[x_labels.index("LogReg")] = "Log\nReg"
        x_labels[x_labels.index("NeuralNet")] = "Neural\nNet"
    except ValueError:
        # a label is absent for this dataset; skip the renaming
        pass
    x_title, y_title = "Model", "Disparity"
    wiki_string = ""
    width = 0.2
    x_vals = np.array(range(1, 2 * len(x_labels), 2))

    order_of_cols = get_column_heading_order(values[0, 1:].flatten())
    values[:, 1:] = values[:, 1:][:, order_of_cols]
    col_headings = np.array(col_headings)
    print(col_headings, type(col_headings), order_of_cols)
    col_headings[1:] = col_headings[1:][order_of_cols]
    col_headings = list(col_headings)

    fig = plt.figure()
    ax = fig.add_subplot(111)
    all_rects, all_disp_types = [], []

    for i in range(1, len(col_headings)):
        if 'statistical' in col_headings[i].lower():
            disparity_type = "Statistical\nDisparity"
        else:
            disparity_type = col_headings[i].split("<<BR>>")[0].replace(
                'Disparity ', 'Disparity\n').replace(' (', '\n(')
        y_vals = list(values[:, i].flatten())
        y_vals = ([float(val.split('(')[0].strip()) for val in y_vals])
        rect = ax.bar(x_vals + (i - 1) * width,
                      y_vals,
                      width,
                      color=colors[i - 1])
        all_rects.append(rect)
        all_disp_types.append(disparity_type)
    ax.set_title(plot_title)
    ax.set_yscale('log')
    ax.set_xlabel(x_title)
    ax.set_ylabel(y_title)
    ax.set_xticks(x_vals + (len(col_headings) - 2) * width / 2)
    ax.set_xticklabels(x_labels)
    box = ax.get_position()
    ax.set_position([box.x0, box.y0, box.width * 0.8, box.height])
    ax.legend(all_rects,
              all_disp_types,
              loc='center left',
              bbox_to_anchor=(1, 0.5))
    figpath = plots_dir + '/' + filename + '.' + format
    plt.savefig(figpath, format=format, bbox_inches='tight')
    plt.savefig(plots_dir + '/' + filename + '.pdf',
                format='pdf',
                bbox_inches='tight')
    plt.clf()
    plt.close()
    wiki_string += "\n{}\n\n".format(get_wiki_link(figpath))
    return wiki_string
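
A hedged sketch of the tabular input get_disparity_plots expects: each cell is
a '<mean> (<std>)' string and headings use the wiki's <<BR>> line breaks (the
heading texts below are illustrative):

col_headings = ['Model', 'Statistical Disparity<<BR>>(test)',
                'Disparity in Utility<<BR>>(test)']
values = [
    ['LogReg<<BR>>misc', '0.12 (0.01)', '0.30 (0.02)'],
    ['NeuralNet<<BR>>misc', '0.10 (0.01)', '0.25 (0.02)'],
]
wiki_text = get_disparity_plots('results/adult', col_headings, values)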
Example no. 16
    def run(self):
        learning_env = dec_rule_env.DecRuleEnv(self.dataset, self.sens_group_desc)
        learning_env.load_data(feature_engineering=True)
        self.initialize_variables(learning_env)

        # The '_fc' pickle variant is used when fairness constraints are on.
        seg_pkl_path = self.res_dir + (
            '/plots_pickled_data/seg_index_mapping.pkl'
            if not exp.FAIRNESS_CONSTRAINTS else
            '/plots_pickled_data/seg_index_mapping_fc.pkl')
        if os.path.exists(seg_pkl_path):
            self.seg_index_mapping = joblib.load(seg_pkl_path)
            seg_index_mapping_loaded = True
        else:
            seg_index_mapping_loaded = False

        all_models = self.models_other_than_rules
        with open(self.res_file_path, 'w') as group_res_file:
            group_res_file.write("= Disparity in effort analysis vs different utility thresholds =\n\n")
        with open(self.seg_file_path, 'w') as seg_res_file:
            seg_res_file.write("= Measuring Long Term Impact through Segregation =\n\n")

        for tau_nosens in self.taus_for_nosens:
            self.prev_labels = None
            self.disparity_table_values, self.disparity_table_values_old = [], []
            self.number_flipped_sens = {}  # mapping of model : number of people flipped
            self.number_flipped_nosens = {}  # mapping of model : number of people flipped
            self.new_abs_clustering_index = {}  # mapping of model : list of new abs_clustering_index
            if not seg_index_mapping_loaded:
                # seg_index -> {model: {'Original Population': v, 'tau_1': v, ...}}
                self.seg_index_mapping[tau_nosens] = {
                    k(self.sens_group_train, self.feature_info): {}
                    for k in self.segregation_indices}
            self.new_abs_clustering_index_params = {} # mapping of model : list of best params for clustering to find neighbourhoods
            self.old_abs_clustering_index = {} # mapping of model : initial abs_clustering index
            self.data_for_pdf = {} # mapping of model : dictionary (see below for details of this dictionary)
            with open(self.res_file_path, 'a') as group_res_file:
                group_res_file.write("== Utility threshold for non sensitive people = {:.2f} ==\n\n".format(tau_nosens))
            for model in all_models:
                ### set values in dicts
                # self.old_abs_clustering_index[model], initial_population_params = self.gini_index(self.x_train, self.y_train, self.sens_group_train, 'inf', 'inf')
                # self.old_ssi[model], self.new_ssi[model] = si.ssi(self.x_train, self.y_train, self.sens_group_train, 'inf', 'inf'), []
                self.number_flipped_sens[model], self.number_flipped_nosens[model] = [], []
                self.new_abs_clustering_index[model] = []
                self.new_abs_clustering_index_params[model] = []
                self.data_for_pdf[model] = {} # mapping of tau_sens : list of populations (x_train_new)

                all_flipped_x_test, all_flipped_y_test = self.get_flipped_dataset(model, 'test')
                all_flipped_x_train, all_flipped_y_train = self.get_flipped_dataset(model, 'train')
                utilities, utilities_train = self.get_utilities(model, 'test'), self.get_utilities(model, 'train')

                clf = model
                exists, loaded_clf = aeio.load_model(clf, self.dataset)
                if exists:
                    clf = loaded_clf
                    print ("Loaded {}...".format(str(clf)))
                else:
                    raise ValueError("Run experiment.py first")

                if isinstance(clf, lm.LinReg) or isinstance(clf, lm.LogReg):
                    with open(self.res_file_path, 'a') as group_res_file:
                        group_res_file.write("\n{}\n".format(out.get_table(**aeio.get_regression_weights(self.feature_info, clf))))
                elif isinstance(clf, lm.DTReg) or isinstance(clf, lm.DT):
                    with open(self.res_file_path, 'a') as group_res_file:
                        group_res_file.write("\n{}\n\n".format(aeio.plot_dtree(self.res_dir, clf, self.feature_info)))
                        group_res_file.write("{}\n\n".format(aeio.plot_covar_matrix(self.res_dir, self.x_train, self.feature_info)))

                users_preds, users_preds_train = clf.predict(self.x_test).astype(bool if self.prediction_task == exp.CLASSIFICATION else float), \
                    clf.predict(self.x_train).astype(bool if self.prediction_task == exp.CLASSIFICATION else float)

                cost_funcs, _ = exp.dataset_info[self.dataset]['cost_funcs'](self.feature_info, 
                    self.x_train, self.sens_group_train, exp.dataset_info[self.dataset]['variable_constraints'])
                cost_funcs_rev, _ = exp.dataset_info[self.dataset]['cost_funcs'](self.feature_info, 
                    self.x_train, self.sens_group_train, exp.dataset_info[self.dataset]['variable_constraints_rev'])
                
                if not seg_index_mapping_loaded:
                    for k,v in self.seg_index_mapping[tau_nosens].items():
                        v[model] = {'Original Population': k.val(X=self.x_train, y=self.y_train, cost_funcs=cost_funcs, cost_funcs_rev=cost_funcs_rev, 
                            anchor_indices=self.get_anchor_indices(model, 'train'), y_pred=users_preds_train)}

                sub_filter_sens, sub_filter_sens_train = np.zeros(len(self.x_test), dtype=bool), np.zeros(len(self.x_train), dtype=bool)
                sub_filter_nosens, sub_filter_nosens_train = np.zeros(len(self.x_test), dtype=bool), np.zeros(len(self.x_train), dtype=bool)
                sub_filter_sens[np.where(np.logical_and(self.sens_group_test, users_preds < self.y_test))[0]] = 1
                sub_filter_nosens[np.where(np.logical_and(~self.sens_group_test, users_preds < self.y_test))[0]] = 1
                sub_filter_sens_train[np.where(np.logical_and(self.sens_group_train, users_preds_train < self.y_train))[0]] = 1
                sub_filter_nosens_train[np.where(np.logical_and(~self.sens_group_train, users_preds_train < self.y_train))[0]] = 1
                print(set(utilities))  # debug: distinct utility values observed
                # explanations_given_sens = np.where(np.logical_and(~np.all(all_flipped_x_test == self.x_test, axis=1), self.sens_group_test))[0]
                # explanations_given_nosens = np.where(np.logical_and(~np.all(all_flipped_x_test == self.x_test, axis=1), ~self.sens_group_test))[0]
                with open(self.res_file_path, 'a') as group_res_file:
                    group_res_file.write(" * For {}, # test explanations given = {} ({} sens, {} non-sens)\n\n".format(str(model), 
                        len(utilities[sub_filter_sens]) + len(utilities[sub_filter_nosens]), len(utilities[sub_filter_sens]), len(utilities[sub_filter_nosens])))
                    group_res_file.write(" * For {}, # train explanations given = {} ({} sens, {} non-sens)\n\n".format(str(model), 
                        len(utilities_train[sub_filter_sens_train]) + len(utilities_train[sub_filter_nosens_train]), len(utilities_train[sub_filter_sens_train]), 
                        len(utilities_train[sub_filter_nosens_train])))
                for tau_sens in self.taus_for_sens:
                    sens_flipped, nonsens_flipped = (np.where(np.logical_and(utilities > tau_sens, self.sens_group_test))[0], 
                        np.where(np.logical_and(utilities > tau_nosens, ~self.sens_group_test))[0])
                    sens_flipped_train, nonsens_flipped_train = (np.where(np.logical_and(utilities_train > tau_sens, self.sens_group_train))[0], 
                        np.where(np.logical_and(utilities_train > tau_nosens, ~self.sens_group_train))[0])
                    sens_utility_old, nosens_utility_old = np.mean(utilities[sens_flipped]), np.mean(utilities[nonsens_flipped])
                    new_x_test, new_y_test = self.x_test.copy(), self.y_test.copy()
                    new_x_test[sens_flipped,:], new_y_test[sens_flipped] = all_flipped_x_test[sens_flipped,:], all_flipped_y_test[sens_flipped]
                    new_x_test[nonsens_flipped,:], new_y_test[nonsens_flipped] = all_flipped_x_test[nonsens_flipped,:], all_flipped_y_test[nonsens_flipped]
                    new_x_train, new_y_train = self.x_train.copy(), self.y_train.copy()
                    new_x_train[sens_flipped_train,:], new_y_train[sens_flipped_train] = all_flipped_x_train[sens_flipped_train,:], all_flipped_y_train[sens_flipped_train]
                    new_x_train[nonsens_flipped_train,:], new_y_train[nonsens_flipped_train] = all_flipped_x_train[nonsens_flipped_train,:], all_flipped_y_train[nonsens_flipped_train]
                    # Find the abs_clustering/gini index of new population
                    # index_val, clustering_params = self.gini_index(new_x_train, new_y_train, self.sens_group_train, 'inf', 'inf')
                    # self.new_abs_clustering_index[model].append(index_val)
                    # self.new_abs_clustering_index_params[model].append(clustering_params)
                    self.data_for_pdf[model][tau_sens] = new_x_train
                    # Plot data distribution in 2D
                    # self.plot_data_points(new_x_train, self.sens_group_train, tau_sens, tau_nosens)
                    try:
                        new_y_pred, new_y_pred_train = model.predict(new_x_test).astype(bool if self.prediction_task == exp.CLASSIFICATION else float), \
                            model.predict(new_x_train).astype(bool if self.prediction_task == exp.CLASSIFICATION else float)
                    except Exception:
                        # model.predict failed; fall back to the persisted model
                        _, clf = aeio.load_model(model, self.dataset)
                        new_y_pred, new_y_pred_train = clf.predict(new_x_test).astype(bool if self.prediction_task == exp.CLASSIFICATION else float), \
                            clf.predict(new_x_train).astype(bool if self.prediction_task == exp.CLASSIFICATION else float)
                    
                    cost_funcs, _ = exp.dataset_info[self.dataset]['cost_funcs'](self.feature_info, 
                        new_x_train, self.sens_group_train, exp.dataset_info[self.dataset]['variable_constraints'])
                    cost_funcs_rev, _ = exp.dataset_info[self.dataset]['cost_funcs'](self.feature_info, 
                        new_x_train, self.sens_group_train, exp.dataset_info[self.dataset]['variable_constraints_rev'])

                    if not seg_index_mapping_loaded:
                        for k,v in self.seg_index_mapping[tau_nosens].items():
                            v[model]['{:.2f}'.format(tau_sens)] = k.val(X=new_x_train, y=new_y_train, cost_funcs=cost_funcs, 
                                cost_funcs_rev=cost_funcs_rev, anchor_indices=self.get_anchor_indices(model, 'train'), y_pred=new_y_pred_train)

                    with open(self.res_file_path, 'a') as group_res_file:
                        group_res_file.write(" * For {} test set, with sens tau = {:.2f}, # flipped users = {} ({} sens, {} non-sens)\n\n".format(str(model), tau_sens,
                            len(sens_flipped) + len(nonsens_flipped), len(sens_flipped), len(nonsens_flipped)))
                        group_res_file.write(" * For {} train set, with sens tau = {:.2f}, # flipped users = {} ({} sens, {} non-sens)\n\n".format(str(model), tau_sens,
                            len(sens_flipped_train) + len(nonsens_flipped_train), len(sens_flipped_train), len(nonsens_flipped_train)))

                    self.number_flipped_sens[model].append(len(sens_flipped_train)/np.count_nonzero(users_preds_train < self.y_train))
                    self.number_flipped_nosens[model].append(len(nonsens_flipped_train)/np.count_nonzero(users_preds_train < self.y_train))
                    double_flipped_utilities = self.get_double_flipped_utilities(model, tau_sens, tau_nosens)
                    if double_flipped_utilities is not None:
                        sens_flipped_new, nonsens_flipped_new = (np.where(np.logical_and(double_flipped_utilities != 0, self.sens_group_test))[0], 
                            np.where(np.logical_and(double_flipped_utilities != 0, ~self.sens_group_test))[0])
                        # sens_flipped_new, nonsens_flipped_new = (np.where(np.logical_and(double_flipped_utilities > tau_sens, self.sens_group_test))[0], 
                        #     np.where(np.logical_and(double_flipped_utilities > tau_nosens, ~self.sens_group_test))[0])
                        sens_utility_new, nosens_utility_new = np.mean(double_flipped_utilities[sens_flipped_new]), np.mean(double_flipped_utilities[nonsens_flipped_new])
                        if len(self.disparity_table_heading) <= 2:
                            heading, formats, values = eval_formula.get_disparity_measures(new_y_test, new_y_pred, self.sens_group_test, sens_utility_new if not np.isnan(sens_utility_new) else 0., 
                                nosens_utility_new if not np.isnan(nosens_utility_new) else 0., self.prediction_task, return_heading_and_formats=True)
                            self.disparity_table_heading += heading
                            self.disparity_table_formats += formats
                        else:
                            values = eval_formula.get_disparity_measures(new_y_test, new_y_pred, self.sens_group_test, sens_utility_new if not np.isnan(sens_utility_new) else 0., 
                                nosens_utility_new if not np.isnan(nosens_utility_new) else 0., self.prediction_task, return_heading_and_formats=False)
                        old_values = eval_formula.get_disparity_measures(self.y_test, users_preds, self.sens_group_test, sens_utility_old if not np.isnan(sens_utility_old) else 0., 
                                nosens_utility_old if not np.isnan(nosens_utility_old) else 0., self.prediction_task, return_heading_and_formats=False)
                        self.disparity_table_values.append([str(model), "{:.2f}".format(tau_sens)] + values)
                        self.disparity_table_values_old.append([str(model), "{:.2f}".format(tau_sens)] + old_values)

            # with open(self.res_file_path, 'a') as group_res_file:
            #     group_res_file.write("{}\n\n".format(out.get_table(self.disparity_table_heading, 
            #         self.disparity_table_values, val_format=self.disparity_table_formats)))
            #     for i in range(3, len(self.disparity_table_heading)):
            #         self.disparity_table_values = np.array(self.disparity_table_values)
            #         self.disparity_table_values_old = np.array(self.disparity_table_values_old)
            #         heading = '_'.join(self.disparity_table_heading[i].split("<<BR>>")[0].strip().lower().split(" "))
            #         group_res_file.write(aeio.get_utility_threshold_plots(self.res_dir, all_models, self.disparity_table_heading[:2] + [self.disparity_table_heading[i]], 
            #             np.append(self.disparity_table_values[:,:2], self.disparity_table_values[:,i:i+1], axis=1),
            #             np.append(self.disparity_table_values_old[:,:2], self.disparity_table_values_old[:,i:i+1], axis=1),
            #             tau_nosens, filename='utility_threshold_{}_{}'.format(tau_nosens, heading), plot_title=''))
                # group_res_file.write("\n{}\n\n".format(aeio.get_abs_clustering_plots(self.res_dir, self.taus_for_sens, self.number_flipped_sens, 
                #     self.number_flipped_nosens, tau_nosens, 'Fraction of Flipped Users', filename='number_of_users_flipped_{}'.format(tau_nosens), plot_title='')))
                # for wiki_path in aeio.get_pdf_plots(self.res_dir, self.x_train, self.sens_group_train, self.taus_for_sens, self.feature_info, self.data_for_pdf, tau_nosens):
                #     group_res_file.write("\n{}\n\n".format(wiki_path))

        out.create_dir(self.res_dir + '/plots_pickled_data')
        joblib.dump(self.seg_index_mapping, seg_pkl_path)

        with open(self.seg_file_path, 'a') as seg_res_file:
            for wiki_path in aeio.get_segregation_plots_new(self.res_dir, self.seg_index_mapping, exp.FAIRNESS_CONSTRAINTS):
                seg_res_file.write("\n{}\n\n".format(wiki_path))

        for ext in ('.png', '.pdf'):
            for sub in ('disparity_plots', 'pdf_before_after_plots',
                        'segregation_plots'):
                out.upload_results([self.res_dir + '/' + sub], 'results',
                                   aeio.SERVER_PROJECT_PATH, ext)

    def run(self, test_or_train):
        learning_env = dec_rule_env.DecRuleEnv(self.dataset,
                                               self.sens_group_desc)
        learning_env.load_data(feature_engineering=True)
        self.initialize_variables(learning_env)
        self.set_vars(test_or_train)

        with open(self.res_file_path, 'w') as res_file:
            res_file.write(
                "= Effort, Reward and Utilities as functions of one another =\n\n"
            )

        model_to_utility_sens, model_to_utility_nosens = {}, {}
        model_to_reward_sens, model_to_reward_nosens = {}, {}
        model_to_effort_sens, model_to_effort_nosens = {}, {}

        for model in self.models_other_than_rules:
            sens_utils_with_effort, nosens_utils_with_effort = [], []
            sens_reward_with_effort, nosens_reward_with_effort = [], []
            sens_effort_with_reward, nosens_effort_with_reward = [], []
            model_start = time.time()
            exists, loaded_clf = aeio.load_model(model, self.dataset)
            if exists:
                model = loaded_clf
                print("Loaded {}...".format(str(model)))
            else:
                print("Training {}...".format(str(model)))
                model.train(self.x_train, self.y_train)
                aeio.persist_model(model, self.dataset)
            model_end = time.time()

            y_test_pred = model.predict(self.users).astype(
                bool if exp.dataset_info[self.dataset]['prediction_task'] ==
                exp.CLASSIFICATION else float)
            y_train_pred = model.predict(self.role_model_users).astype(
                bool if exp.dataset_info[self.dataset]['prediction_task'] ==
                exp.CLASSIFICATION else float)

            print("Model: {}, MAE: {}, MSE: {}".format(
                model, mean_absolute_error(self.users_gt, y_test_pred),
                mean_squared_error(self.users_gt, y_test_pred)))
            # continue

            self.role_model_users_pred = y_train_pred  # This should not change
            self.users_preds = y_test_pred if test_or_train == 'test' else y_train_pred  # change this based on which group's explanations are needed (test or train)

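            # Keep only users whose prediction falls below their ground-truth
            # label (the under-served users), split by sensitive-group
            # membership and optionally capped at self.subsample_size.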
            sub_filter_sens = np.zeros(len(self.users), dtype=bool)
            sub_filter_nosens = np.zeros(len(self.users), dtype=bool)
            sens_neg = np.where(np.logical_and(self.users_sens_group,
                                               self.users_preds < self.users_gt))[0]
            nosens_neg = np.where(np.logical_and(~self.users_sens_group,
                                                 self.users_preds < self.users_gt))[0]
            if self.subsample_size is not None:
                sens_neg = sens_neg[:self.subsample_size]
                nosens_neg = nosens_neg[:self.subsample_size]
            sub_filter_sens[sens_neg] = True
            sub_filter_nosens[nosens_neg] = True
            filtered_users_sens = self.users[sub_filter_sens]
            user_gt_labels_sens = self.users_gt[sub_filter_sens]
            user_predicted_labels_sens = self.users_preds[sub_filter_sens]
            filtered_users_nosens = self.users[sub_filter_nosens]
            user_gt_labels_nosens = self.users_gt[sub_filter_nosens]
            user_predicted_labels_nosens = self.users_preds[sub_filter_nosens]

            ind_start = time.time()

            for delta in self.effort_deltas:
                sens_rewards, sens_utils, nosens_rewards, nosens_utils = [], [], [], []
                for i, user in enumerate(filtered_users_sens):
                    print("Computing for user",
                          np.where(sub_filter_sens)[0][i])
                    user = np.array([user])
                    index_in_users = np.where(sub_filter_sens)[0][i]

                    # optimizer = ge.SamplingMethod(np.array([1]), self.feature_info, self.cost_funcs, self.cost_funcs_rev,
                    #     exp.dataset_info[self.dataset]['variable_constraints'], model, self.dataset)
                    # role_model, role_model_gt, role_model_pred = optimizer.sampling_based_explanations(
                    #     user,
                    #     self.role_model_users,
                    #     self.role_model_users_gt_labels,
                    #     self.role_model_users_pred,
                    #     user_gt_labels_sens[i],
                    #     user_predicted_labels_sens[i],
                    #     user_sens_group=1,
                    #     return_only_user=True)
                    role_model, role_model_effort, role_model_reward, role_model_utility = \
                        self.sampling_based_explanations(
                            user,
                            self.role_model_users,
                            self.role_model_users_gt_labels,
                            self.role_model_users_pred,
                            user_gt_labels_sens[i],
                            user_predicted_labels_sens[i],
                            user_sens_group=1,
                            cost_sens_group=np.array([1]),
                            variable_to_optimize='reward',
                            variable_to_threshold='effort',
                            threshold_value=delta
                        )
                    assert np.isclose(role_model_utility, role_model_reward - role_model_effort)
                    sens_rewards.append(role_model_reward)
                    print(
                        "[Sens] Model: {}, Effort threshold: {}, Effort value: {}, Max Reward: {}"
                        .format(model, delta, role_model_effort,
                                role_model_reward))
                    # break
                    # role_model, role_model_effort, role_model_reward, role_model_utility = \
                    #     self.sampling_based_explanations(
                    #         user,
                    #         self.role_model_users,
                    #         self.role_model_users_gt_labels,
                    #         self.role_model_users_pred,
                    #         user_gt_labels_sens[i],
                    #         user_predicted_labels_sens[i],
                    #         user_sens_group=1,
                    #         cost_sens_group=np.array([1]),
                    #         variable_to_optimize='utility',
                    #         variable_to_threshold='effort',
                    #         threshold_value=delta
                    #     )
                    # sens_utils.append(role_model_utility)

                    # x_new, x_new_utility, x_new_effort, x_new_reward, x_new_gt, x_new_pred = \
                    #     self.generate_new_feature_vector(model, user.flatten(), self.users_gt[index_in_users], self.users_preds[index_in_users],
                    #         role_model, role_model_gt, role_model_pred,
                    #         1, optimizer, 'reward',
                    #         'effort', delta)
                    # sens_rewards.append(x_new_reward)
                    # x_new, x_new_utility, x_new_effort, x_new_reward, x_new_gt, x_new_pred = \
                    #     self.generate_new_feature_vector(model, user.flatten(), self.users_gt[index_in_users], self.users_preds[index_in_users],
                    #         role_model, role_model_gt, role_model_pred,
                    #         1, optimizer, 'utility',
                    #         'effort', delta)
                    # sens_utils.append(x_new_utility)

                    dir_up_cols_generator = cf.get_up_cols(
                        exp.dataset_info[self.dataset]['variable_constraints'],
                        self.feature_info)
                    for dir_up_cols in dir_up_cols_generator:
                        assert np.all(
                            role_model[dir_up_cols] >=
                            user.flatten()[dir_up_cols])  # sanity check

                for i, user in enumerate(filtered_users_nosens):
                    print("Computing for user",
                          np.where(sub_filter_nosens)[0][i])
                    user = np.array([user])
                    index_in_users = np.where(sub_filter_nosens)[0][i]

                    # optimizer = ge.SamplingMethod(np.array([0]), self.feature_info, self.cost_funcs, self.cost_funcs_rev,
                    #     exp.dataset_info[self.dataset]['variable_constraints'], model, self.dataset)
                    # role_model, role_model_gt, role_model_pred = optimizer.sampling_based_explanations(
                    #     user,
                    #     self.role_model_users,
                    #     self.role_model_users_gt_labels,
                    #     self.role_model_users_pred,
                    #     user_gt_labels_nosens[i],
                    #     user_predicted_labels_nosens[i],
                    #     user_sens_group=0,
                    #     return_only_user=True)

                    role_model, role_model_effort, role_model_reward, role_model_utility = \
                        self.sampling_based_explanations(
                            user,
                            self.role_model_users,
                            self.role_model_users_gt_labels,
                            self.role_model_users_pred,
                            user_gt_labels_nosens[i],
                            user_predicted_labels_nosens[i],
                            user_sens_group=0,
                            cost_sens_group=np.array([0]),
                            variable_to_optimize='reward',
                            variable_to_threshold='effort',
                            threshold_value=delta
                        )
                    assert np.isclose(role_model_utility, role_model_reward - role_model_effort)
                    nosens_rewards.append(role_model_reward)
                    print(
                        "[Nosens] Model: {}, Effort threshold: {}, Effort value: {}, Max Reward: {}"
                        .format(model, delta, role_model_effort,
                                role_model_reward))
                    # break
                    # role_model, role_model_effort, role_model_reward, role_model_utility = \
                    #     self.sampling_based_explanations(
                    #         user,
                    #         self.role_model_users,
                    #         self.role_model_users_gt_labels,
                    #         self.role_model_users_pred,
                    #         user_gt_labels_nosens[i],
                    #         user_predicted_labels_nosens[i],
                    #         user_sens_group=0,
                    #         cost_sens_group=np.array([0]),
                    #         variable_to_optimize='utility',
                    #         variable_to_threshold='effort',
                    #         threshold_value=delta
                    #     )
                    # nosens_utils.append(role_model_utility)

                    # x_new, x_new_utility, x_new_effort, x_new_reward, x_new_gt, x_new_pred = \
                    #     self.generate_new_feature_vector(model, user.flatten(), self.users_gt[index_in_users], self.users_preds[index_in_users],
                    #         role_model, role_model_gt, role_model_pred,
                    #         0, optimizer, 'reward',
                    #         'effort', delta)
                    # nosens_rewards.append(x_new_reward)
                    # x_new, x_new_utility, x_new_effort, x_new_reward, x_new_gt, x_new_pred = \
                    #     self.generate_new_feature_vector(model, user.flatten(), self.users_gt[index_in_users], self.users_preds[index_in_users],
                    #         role_model, role_model_gt, role_model_pred,
                    #         0, optimizer, 'utility',
                    #         'effort', delta)
                    # nosens_utils.append(x_new_utility)

                    dir_up_cols_generator = cf.get_up_cols(
                        exp.dataset_info[self.dataset]['variable_constraints'],
                        self.feature_info)
                    for dir_up_cols in dir_up_cols_generator:
                        assert np.all(
                            role_model[dir_up_cols] >=
                            user.flatten()[dir_up_cols])  # sanity check
                # sens_utils_with_effort.append(np.mean(sens_utils))
                sens_reward_with_effort.append(np.mean(sens_rewards))
                # nosens_utils_with_effort.append(np.mean(nosens_utils))
                nosens_reward_with_effort.append(np.mean(nosens_rewards))

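            # Utilities are derived as reward minus the effort threshold, i.e.
            # assuming the optimizer spends the full effort budget delta.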
            sens_reward_with_effort = np.array(sens_reward_with_effort)
            nosens_reward_with_effort = np.array(nosens_reward_with_effort)
            sens_utils_with_effort = sens_reward_with_effort - self.effort_deltas
            nosens_utils_with_effort = nosens_reward_with_effort - self.effort_deltas
            model_to_utility_sens[model] = [
                self.effort_deltas, sens_utils_with_effort
            ]
            model_to_utility_nosens[model] = [
                self.effort_deltas, nosens_utils_with_effort
            ]
            model_to_reward_sens[model] = [
                self.effort_deltas, sens_reward_with_effort
            ]
            model_to_reward_nosens[model] = [
                self.effort_deltas, nosens_reward_with_effort
            ]

            for delta in self.reward_deltas:
                sens_efforts, nosens_efforts = [], []
                for i, user in enumerate(filtered_users_sens):
                    print("Computing for user",
                          np.where(sub_filter_sens)[0][i])
                    user = np.array([user])
                    index_in_users = np.where(sub_filter_sens)[0][i]

                    # optimizer = ge.SamplingMethod(np.array([1]), self.feature_info, self.cost_funcs, self.cost_funcs_rev,
                    #     exp.dataset_info[self.dataset]['variable_constraints'], model, self.dataset)
                    # role_model, role_model_gt, role_model_pred = optimizer.sampling_based_explanations(
                    #     user,
                    #     self.role_model_users,
                    #     self.role_model_users_gt_labels,
                    #     self.role_model_users_pred,
                    #     user_gt_labels_sens[i],
                    #     user_predicted_labels_sens[i],
                    #     user_sens_group=1,
                    #     return_only_user=True)

                    role_model, role_model_effort, role_model_reward, role_model_utility = \
                        self.sampling_based_explanations(
                            user,
                            self.role_model_users,
                            self.role_model_users_gt_labels,
                            self.role_model_users_pred,
                            user_gt_labels_sens[i],
                            user_predicted_labels_sens[i],
                            user_sens_group=1,
                            cost_sens_group=np.array([1]),
                            variable_to_optimize='effort',
                            variable_to_threshold='reward',
                            threshold_value=delta
                        )
                    sens_efforts.append(role_model_effort)

                    # x_new, x_new_utility, x_new_effort, x_new_reward, x_new_gt, x_new_pred = \
                    #     self.generate_new_feature_vector(model, user.flatten(), self.users_gt[index_in_users], self.users_preds[index_in_users],
                    #         role_model, role_model_gt, role_model_pred,
                    #         1, optimizer, 'effort',
                    #         'reward', delta)
                    # sens_efforts.append(x_new_effort)

                    dir_up_cols_generator = cf.get_up_cols(
                        exp.dataset_info[self.dataset]['variable_constraints'],
                        self.feature_info)
                    for dir_up_cols in dir_up_cols_generator:
                        assert np.all(
                            role_model[dir_up_cols] >=
                            user.flatten()[dir_up_cols])  # sanity check

                for i, user in enumerate(filtered_users_nosens):
                    print("Computing for user",
                          np.where(sub_filter_nosens)[0][i])
                    user = np.array([user])
                    index_in_users = np.where(sub_filter_nosens)[0][i]

                    # optimizer = ge.SamplingMethod(np.array([0]), self.feature_info, self.cost_funcs, self.cost_funcs_rev,
                    #     exp.dataset_info[self.dataset]['variable_constraints'], model, self.dataset)
                    # role_model, role_model_gt, role_model_pred = optimizer.sampling_based_explanations(
                    #     user,
                    #     self.role_model_users,
                    #     self.role_model_users_gt_labels,
                    #     self.role_model_users_pred,
                    #     user_gt_labels_nosens[i],
                    #     user_predicted_labels_nosens[i],
                    #     user_sens_group=0,
                    #     return_only_user=True)

                    role_model, role_model_effort, role_model_reward, role_model_utility = \
                        self.sampling_based_explanations(
                            user,
                            self.role_model_users,
                            self.role_model_users_gt_labels,
                            self.role_model_users_pred,
                            user_gt_labels_nosens[i],
                            user_predicted_labels_nosens[i],
                            user_sens_group=0,
                            cost_sens_group=np.array([0]),
                            variable_to_optimize='effort',
                            variable_to_threshold='reward',
                            threshold_value=delta
                        )
                    nosens_efforts.append(role_model_effort)

                    # x_new, x_new_utility, x_new_effort, x_new_reward, x_new_gt, x_new_pred = \
                    #     self.generate_new_feature_vector(model, user.flatten(), self.users_gt[index_in_users], self.users_preds[index_in_users],
                    #         role_model, role_model_gt, role_model_pred,
                    #         0, optimizer, 'effort',
                    #         'reward', delta)
                    # nosens_efforts.append(x_new_effort)

                    dir_up_cols_generator = cf.get_up_cols(
                        exp.dataset_info[self.dataset]['variable_constraints'],
                        self.feature_info)
                    for dir_up_cols in dir_up_cols_generator:
                        assert np.all(
                            role_model[dir_up_cols] >=
                            user.flatten()[dir_up_cols])  # sanity check
                sens_effort_with_reward.append(np.nanmean(sens_efforts))
                nosens_effort_with_reward.append(np.nanmean(nosens_efforts))

            model_to_effort_sens[model] = [
                self.reward_deltas, sens_effort_with_reward
            ]
            model_to_effort_nosens[model] = [
                self.reward_deltas, nosens_effort_with_reward
            ]

            with open(self.res_file_path, 'a') as res_file:
                res_file.write("== {} ==\n\n".format(str(model)))
                res_file.write("{}\n\n{}\n\n{}\n\n".format(
                    aeio.plot_one_var_vs_other(self.res_dir, model,
                                               self.effort_deltas,
                                               sens_utils_with_effort,
                                               nosens_utils_with_effort,
                                               'Effort', 'Average Utility'),
                    aeio.plot_one_var_vs_other(self.res_dir, model,
                                               self.effort_deltas,
                                               sens_reward_with_effort,
                                               nosens_reward_with_effort,
                                               'Effort', 'Average Reward'),
                    aeio.plot_one_var_vs_other(self.res_dir, model,
                                               self.reward_deltas,
                                               sens_effort_with_reward,
                                               nosens_effort_with_reward,
                                               'Reward', 'Average Effort')))
        # model_to_utility_sens = joblib.load(self.res_dir + '/plots_pickled_data/model_to_utility_sens.pkl' if not exp.FAIRNESS_CONSTRAINTS else self.res_dir + '/plots_pickled_data/model_to_utility_sens_fc.pkl')
        # model_to_utility_nosens = joblib.load(self.res_dir + '/plots_pickled_data/model_to_utility_nosens.pkl' if not exp.FAIRNESS_CONSTRAINTS else self.res_dir + '/plots_pickled_data/model_to_utility_nosens_fc.pkl')
        # model_to_reward_sens = joblib.load(self.res_dir + '/plots_pickled_data/model_to_reward_sens.pkl' if not exp.FAIRNESS_CONSTRAINTS else self.res_dir + '/plots_pickled_data/model_to_reward_sens_fc.pkl')
        # model_to_reward_nosens = joblib.load(self.res_dir + '/plots_pickled_data/model_to_reward_nosens.pkl' if not exp.FAIRNESS_CONSTRAINTS else self.res_dir + '/plots_pickled_data/model_to_reward_nosens_fc.pkl')
        # model_to_effort_nosens = joblib.load(self.res_dir + '/plots_pickled_data/model_to_effort_nosens.pkl' if not exp.FAIRNESS_CONSTRAINTS else self.res_dir + '/plots_pickled_data/model_to_effort_nosens_fc.pkl')
        # model_to_effort_sens = joblib.load(self.res_dir + '/plots_pickled_data/model_to_effort_sens.pkl' if not exp.FAIRNESS_CONSTRAINTS else self.res_dir + '/plots_pickled_data/model_to_effort_sens_fc.pkl')
        with open(self.res_file_path, 'a') as res_file:
            res_file.write("== All Models in One ==\n\n")
            res_file.write("{}\n\n{}\n\n{}\n\n".format(
                aeio.plot_one_var_vs_other_together(self.res_dir,
                                                    model_to_utility_sens,
                                                    model_to_utility_nosens,
                                                    'Effort',
                                                    'Average Utility'),
                aeio.plot_one_var_vs_other_together(self.res_dir,
                                                    model_to_reward_sens,
                                                    model_to_reward_nosens,
                                                    'Effort',
                                                    'Average Reward'),
                aeio.plot_one_var_vs_other_together(self.res_dir,
                                                    model_to_effort_sens,
                                                    model_to_effort_nosens,
                                                    'Reward',
                                                    'Average Effort')))
        # out.upload_results([self.res_dir + '/disparity_plots'], 'results', aeio.SERVER_PROJECT_PATH, '.png')
        # out.upload_results([self.res_dir + '/disparity_plots'], 'results', aeio.SERVER_PROJECT_PATH, '.pdf')
        out.create_dir(self.res_dir + '/plots_pickled_data')
        fc_suffix = '_fc' if exp.FAIRNESS_CONSTRAINTS else ''
        for name, data in (('model_to_utility_sens', model_to_utility_sens),
                           ('model_to_utility_nosens', model_to_utility_nosens),
                           ('model_to_reward_sens', model_to_reward_sens),
                           ('model_to_reward_nosens', model_to_reward_nosens),
                           ('model_to_effort_nosens', model_to_effort_nosens),
                           ('model_to_effort_sens', model_to_effort_sens)):
            joblib.dump(data, '{}/plots_pickled_data/{}{}.pkl'.format(
                self.res_dir, name, fc_suffix))
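
A note on the pattern above: `run` sweeps a grid of thresholds and, for each user, solves the same constrained search in two symmetric forms: maximize reward subject to an effort budget, and minimize effort subject to a reward target. The snippet below is a minimal sketch of that threshold-sweep pattern over a pool of candidate role models; the callables `effort_of` and `reward_of` and the brute-force scan are illustrative assumptions, not the repository's `sampling_based_explanations`.

import numpy as np

def best_reward_under_effort_budget(user, role_models, effort_of, reward_of, delta):
    """Among role models reachable within effort budget `delta`, return
    (effort, reward, utility) of the reward-maximizing candidate."""
    best, best_reward = (np.nan, np.nan, np.nan), -np.inf
    for candidate in role_models:
        effort = effort_of(user, candidate)
        if effort > delta:
            continue  # candidate exceeds the effort budget
        reward = reward_of(user, candidate)
        if reward > best_reward:
            best_reward = reward
            best = (effort, reward, reward - effort)  # utility = reward - effort
    return best

# Averaging the per-user optima for each budget in `effort_deltas` yields the
# per-group reward-vs-effort curves that `run` plots:
# curve = [np.nanmean([best_reward_under_effort_budget(u, pool, effort_of, reward_of, d)[1]
#                      for u in group_users]) for d in effort_deltas]
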
Example n. 18
0
def evaluate_population_splits(datasets, methods):
    feature_split_order = {'Compas': ['sex', 'race', 'age']}
    #feature_split_order = {'Compas': ['sex', 'race', 'age', 'c'],
    #        #'Adult': ['marital', 'relationship', 'workclass', 'education']}
    #        'Adult': ['marital', 'relationship']}#, 'workclass']}#, 'education']}

    methods.remove('Oracle')

    for dataset, sens_features in datasets:
        print("Evaluating dataset {}".format(dataset))

        out_dir = 'results/intergroup_splits/' + dataset + '/'
        output.create_dir(out_dir)

        split_features = feature_split_order[dataset]
        class_env = ProbClassEnv(dataset, val_split=0.7)
        class_env.load_data()

        x_labels = None
        intergroup_inequality_fracs = {}
        intragroup_inequality_fracs = {}
        for method in methods:
            print("Training method", method)
            class_env.setup_model(method)

            eval_func = para.wrap_function(eval_seed_benefits)
            results = para.map_parallel(eval_func,
                                        seeds,
                                        invariant_data=(class_env,
                                                        split_features))

            x_labels = results[0][0]
            intergroup_fracs = []
            intragroup_fracs = defaultdict(lambda: defaultdict(list))
            for _, intergroup_frac, intragroup_frac in results:
                intergroup_fracs.append(intergroup_frac)
                # TODO: relies on ordered dicts
                for sens_comb, frac in zip(x_labels, intragroup_frac):
                    for group, group_share in frac.items():
                        intragroup_fracs[sens_comb][group].append(group_share)
            #intergroup_inequality_fracs[method] = para.mean_with_conf(intergroup_fracs, axis=0)
            intergroup_inequality_fracs[method] = para.aggregate_results(
                intergroup_fracs, axis=0)
            intragroup_inequality_fracs[method] = {
                sens_comb: {
                    group: para.aggregate_results(group_fracs, np.mean, axis=0)
                    for group, group_fracs in fracs.items()
                }
                for sens_comb, fracs in intragroup_fracs.items()
            }

        iu.plot_curves(iu.FIG_TYPE_INTERGROUP_SPLITS,
                       "comparison",
                       x_labels,
                       "Feature combinations",
                       intergroup_inequality_fracs,
                       "Contribution (%)",
                       bars=True,
                       colors=method_colors)

        # Plot the intragroup inequalities for the various groups
        for method in methods:
            for feature_comb, intergroup_ineq, intragroup_ineqs in zip(
                    x_labels, intergroup_inequality_fracs[method],
                    intragroup_inequality_fracs[method].values()):
                if isinstance(intergroup_ineq, tuple):
                    intergroup_ineq = intergroup_ineq[0]
                # TODO: relies on dict order
                iu.plot_pie(iu.FIG_TYPE_INTERGROUP_SPLITS,
                            "{}_{}_breakdown".format(method, feature_comb),
                            [intergroup_ineq] +
                            list(intragroup_ineqs.values()),
                            ["between-group"] + list(intragroup_ineqs.keys()))

        iu.plot_results(out_dir, dataset)  #, output_channel="show")

        wiki_file_loc = iu.get_wiki_file(out_dir)
        with open(wiki_file_loc, 'a') as wiki_file:
            iu.emit_curves(
                wiki_file, out_dir, dataset, iu.FIG_TYPE_INTERGROUP_SPLITS,
                "Contribution of between-group unfairness to the overall individual unfairness. The numbers in parentheses after the feature combinations denote the number of population subgroups obtained from splitting the population on all the features."
            )
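
The caption above refers to splitting the population on every value combination of the chosen features. Below is a minimal sketch of that splitting step, assuming the data sits in a pandas DataFrame with one column per feature (the DataFrame and column names are illustrative):

import pandas as pd

def split_population(df, features):
    """Map each observed value combination of `features` to the row
    indices of the subgroup it induces."""
    return {comb: list(rows) for comb, rows in df.groupby(features).groups.items()}

# Splitting Compas-style data on sex and race yields one subgroup per
# (sex, race) combination actually present in the data:
df = pd.DataFrame({'sex': [0, 0, 1, 1], 'race': [0, 1, 0, 1]})
print(len(split_population(df, ['sex', 'race'])))  # -> 4 subgroups
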
Example n. 19
0
def evaluate_inequality_decomposition(datasets, methods):
    for dataset, sens_features in datasets:
        print("Evaluating dataset {}".format(dataset))

        out_dir = 'results/class_ineq_decomp/' + dataset + '/'
        output.create_dir(out_dir)

        class_env = ProbClassEnv(dataset, val_split=.7)
        class_env.load_data()

        x_range = np.arange(len(class_env.y_test) + 1) / len(class_env.y_test)
        x_label = "Fraction of rejected users ($\\tau$)"
        y1_label = "Between-group\nunfairness ($\\mathcal{E}^2_\\beta$)"
        y2_label = "Accuracy"

        sens_feature_combs = iu.powerset(sens_features)
        method_plots = {",".join(sens_feature_comb): {'intergroup_ineq': {}, 'accuracy': {}} \
                for sens_feature_comb in sens_feature_combs}

        for method in methods:
            print("Evaluating method", method)

            class_env.setup_model(method)
            class_env.train_model()
            #class_env.calibrate_probabilities(calibration_method)

            sens_feature_plots = {'intergroup_ineq': {}}
            for sens_feature_comb in sens_feature_combs:
                print("Evaluating sens feature combination", sens_feature_comb)

                decomp_res = class_env.evaluate_inequality_decomp(
                    sens_feature_comb)

                # method plots
                method_key = ','.join(sens_feature_comb)
                method_plots[method_key]['intergroup_ineq'][(
                    method, '(Unfairness)')] = decomp_res['intergroup_ineq']
                method_plots[method_key]['accuracy'][(
                    method, '(Accuracy)')] = decomp_res['accuracy']

                # sens_feature plots
                sens_feature_label = ', '.join(
                    sens_feature_comb) + " inequality"
                sens_feature_plots['intergroup_ineq'][
                    sens_feature_label] = decomp_res['intergroup_ineq']
                if 'accuracy' not in sens_feature_plots:
                    sens_feature_plots['accuracy'] = decomp_res['accuracy']

            iu.plot_curves(iu.FIG_TYPE_INTERGROUP_INEQ,
                           "method_{}".format(method), x_range, x_label,
                           sens_feature_plots['intergroup_ineq'], y1_label,
                           {"Accuracy": sens_feature_plots['accuracy']},
                           y2_label)

        linestyles = {'(Unfairness)': '-', '(Accuracy)': ':'}
        for sens_feature_comb, method_results in method_plots.items():
            iu.plot_curves(iu.FIG_TYPE_INTERGROUP_INEQ,
                           "feature_{}".format(sens_feature_comb),
                           x_range,
                           x_label,
                           method_results['intergroup_ineq'],
                           y1_label,
                           method_results['accuracy'],
                           y2_label,
                           colors=method_colors,
                           linestyles=linestyles)

        iu.plot_results(out_dir, dataset)

        wiki_file_loc = iu.get_wiki_file(out_dir)
        with open(wiki_file_loc, 'a') as wiki_file:
            iu.emit_curves(
                wiki_file, out_dir, dataset, iu.FIG_TYPE_INEQ_DECOMP,
                "Inequality decomposition of the overall GE_2 into intergroup and intragroup inequality"
            )
            iu.emit_curves(
                wiki_file, out_dir, dataset, iu.FIG_TYPE_INTERGROUP_INEQ,
                "== Intergroup inequalities for methods and feature combinations =="
            )

        iu.clear_figures()
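
For reference, the GE_2 quantity decomposed above is the generalized entropy index with alpha = 2 over per-user benefits; it splits exactly into a between-group term (computed on group means) plus a weighted sum of within-group terms. The sketch below is a textbook restatement under the assumption of positive benefits (e.g. b_i = y_hat_i - y_i + 1), not the repository's `evaluate_inequality_decomp`:

import numpy as np

def ge2(b):
    """Generalized entropy index with alpha = 2 over positive benefits b."""
    mu = b.mean()
    return np.mean((b / mu) ** 2 - 1.0) / 2.0

def ge2_decomposition(b, groups):
    """Split ge2(b) into (between_group, within_group); the two terms
    sum to ge2(b) exactly. `groups` holds one subgroup id per user."""
    n, mu = len(b), b.mean()
    b_between = np.empty_like(b, dtype=float)  # benefits replaced by group means
    within = 0.0
    for g in np.unique(groups):
        mask = groups == g
        mu_g = b[mask].mean()
        b_between[mask] = mu_g
        within += (mask.sum() / n) * (mu_g / mu) ** 2 * ge2(b[mask])
    return ge2(b_between), within

b = np.array([1., 2., 3., 4.])
groups = np.array([0, 0, 1, 1])
between, within = ge2_decomposition(b, groups)
assert np.isclose(between + within, ge2(b))  # 0.08 + 0.02 == 0.10
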
Example n. 20
0
def evaluate_prob_classification(datasets, methods):
    for dataset, sens_features in datasets:
        print("Evaluating dataset {}".format(dataset))

        out_dir = 'results/prob_class/' + dataset + '/'
        output.create_dir(out_dir)

        class_env = ProbClassEnv(dataset, val_split=0.7)
        class_env.load_data()
        sens_feature_combs = iu.powerset(sens_features)

        method_results = {}
        hyperparams = {}
        col_names = []
        for method in methods:
            print('\nEvaluating {}'.format(method))

            params = class_env.setup_model(method)
            hyperparams[method] = params

            for calibration_method in calibration_methods:

                eval_kernel = para.wrap_function(eval_prob_class_kernel)
                results = para.map_parallel(
                    eval_kernel,
                    seeds,
                    invariant_data=(class_env, calibration_method, method,
                                    sens_feature_combs),
                    run_parallel=True)

                accuracies, ineqs, col_names, rejection_res = \
                        para.extract_positions(results, range(4))

                avg_ineqs = para.aggregate_results(ineqs, axis=0)
                table_row = [para.aggregate_results(accuracies, axis=0)] + \
                        list(avg_ineqs)
                col_names = ["Acc"] + col_names[0]

                def rejection_curve_aggregator(curves):
                    agg_curves = defaultdict(list)
                    for rejection_curves in curves:
                        for metric, metric_res in rejection_curves.items():
                            agg_curves[metric].append(metric_res)
                    agg_curves = {metric: para.aggregate_results(metric_res) \
                            for metric, metric_res in agg_curves.items()}
                    return agg_curves

                curve_types = ['overall'] + [
                    ','.join(feature_comb)
                    for feature_comb in sens_feature_combs
                ]
                rejection_curves = {curve_type: curves for curve_type, curves in \
                        zip(curve_types, para.extract_positions(
                            rejection_res, curve_types))}
                avg_rejection_curves = {curve_type: rejection_curve_aggregator(rejection_res) for \
                                curve_type, rejection_res in \
                                rejection_curves.items()}
                for curve_type, curves in avg_rejection_curves.items():
                    color = method_colors[method]
                    iu.plot_rejection_curves(curves,
                                             method,
                                             fig_name=curve_type,
                                             color=color)

                method_name = method  #+ '_' + calibration_method
                method_results[method_name] = table_row

        iu.plot_results(out_dir, dataset)
        wiki_file_loc = iu.get_wiki_file(out_dir)
        with open(wiki_file_loc, 'a') as wiki_file:
            col_format = ['3'] * len(col_names)
            col_format[0] = '2'
            iu.write_wiki_results(wiki_file, col_names, method_results,
                                  col_format, hyperparams,
                                  regression_methods_info)

            iu.emit_acc_fairness_curves(wiki_file, out_dir, dataset)

            lorenz_desc = "error (y_hat - y)"
            iu.emit_lorenz_curves(wiki_file, out_dir, dataset, lorenz_desc)

        iu.clear_figures()
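
The nested `rejection_curve_aggregator` above collapses a list of per-seed {metric: curve} dicts into one {metric: aggregated curve} dict. Below is a minimal standalone equivalent, assuming equal-length numeric curves and using a plain point-wise mean in place of `para.aggregate_results`:

import numpy as np
from collections import defaultdict

def aggregate_rejection_curves(per_seed_curves):
    """Average each metric's rejection curve point-wise across seeds."""
    stacked = defaultdict(list)
    for curves in per_seed_curves:  # one {metric: curve} dict per seed
        for metric, curve in curves.items():
            stacked[metric].append(curve)
    return {metric: np.mean(cs, axis=0) for metric, cs in stacked.items()}

seed_a = {'accuracy': [0.7, 0.8], 'ineq': [0.3, 0.2]}
seed_b = {'accuracy': [0.9, 1.0], 'ineq': [0.1, 0.0]}
print(aggregate_rejection_curves([seed_a, seed_b]))
# {'accuracy': array([0.8, 0.9]), 'ineq': array([0.2, 0.1])}
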
def evaluate_models(dataset,
                    models,
                    test_or_train,
                    subsample_size=None,
                    num_investigation_users=None):
    """
    Main analysis function.
    Loads the selected dataset, generates explanations for individuals,
    and analyzes efforts.
    """

    models_to_individual_explanation_strings_sens, models_to_individual_explanation_strings_nosens = {}, {} # global mapping

    res_dir = 'results/{}'.format(dataset)
    out.create_dir(res_dir)
    res_dir = res_dir if not FAIRNESS_CONSTRAINTS else '{}/FC'.format(res_dir)
    out.create_dir(res_dir)
    res_file_path = res_dir + '/res.txt'
    wiki_parent_path = "Actionable-Explanations/Simple-Explanations-{}".format(
        dataset)

    sens_group_desc = dataset_info[dataset]['sens_f']
    learning_env = dec_rule_env.DecRuleEnv(dataset, sens_group_desc)
    learning_env.load_data()

    feature_info = learning_env.feature_info
    print("\n\nfeature_info original:{}\n\n".format(learning_env.feature_info))
    x_test_original = learning_env.x_test
    y_test = (learning_env.y_test
              ).astype(bool if dataset_info[dataset]['prediction_task'] ==
                       CLASSIFICATION else float)
    x_train_original = learning_env.x_train
    y_train = (learning_env.y_train
               ).astype(bool if dataset_info[dataset]['prediction_task'] ==
                        CLASSIFICATION else float)
    scaler = MinMaxScaler()
    scaler.fit(x_train_original)
    x_train = scaler.transform(x_train_original)

    with open('processed_student_data.csv', 'w') as fp:
        pd.DataFrame(data=np.append(x_train,
                                    y_train.reshape(x_train.shape[0], 1),
                                    axis=1),
                     columns=aeio.get_feature_names(feature_info) +
                     ['G3']).to_csv(fp, index=False)

    x_test = scaler.transform(x_test_original)

    sens_group = ~learning_env.x_control[sens_group_desc[-1]]
    sens_group_train = ~learning_env.x_control_train[sens_group_desc[-1]]
    sens_group_test = ~learning_env.x_control_test[sens_group_desc[-1]]
    ds_statistics = get_dataset_statistics_temp(
        learning_env.y, sens_group, dataset_info[dataset]['prediction_task'])
    users = np.append(x_train, x_test, axis=0)
    user_gt_labels = np.append(
        y_train, y_test,
        axis=0).astype(bool if dataset_info[dataset]['prediction_task'] ==
                       CLASSIFICATION else float)

    ## Use these for the remaining analysis. To switch the analysis between
    ## test and train, change the variable assignments here and also update
    ## `role_model_users_pred` and `users_preds` below.
    role_model_users = x_train  # This should not change
    role_model_users_gt_labels = y_train  # This should not change
    role_model_users_sens_group = sens_group_train  # This should not change
    users = x_test if test_or_train == 'test' else x_train  # change this based on which group's explanations are needed (test or train)
    users_sens_group = sens_group_test if test_or_train == 'test' else sens_group_train  # change this based on which group's explanations are needed (test or train)
    users_gt = y_test if test_or_train == 'test' else y_train  # change this based on which group's explanations are needed (test or train)

    # If not already found, search for common negative users for in-depth analysis
    common_negative_users_sens_filename = "./common_negative_users/{}/random_{}_users_sens.txt".format(dataset, test_or_train) if not FAIRNESS_CONSTRAINTS \
        else "./common_negative_users/{}/random_{}_users_sens_fc.txt".format(dataset, test_or_train)
    common_negative_users_nosens_filename = "./common_negative_users/{}/random_{}_users_nosens.txt".format(dataset, test_or_train) if not FAIRNESS_CONSTRAINTS \
                else "./common_negative_users/{}/random_{}_users_nosens_fc.txt".format(dataset, test_or_train)
    if not os.path.exists(
            common_negative_users_sens_filename) or not os.path.exists(
                common_negative_users_nosens_filename):
        out.create_dir('./common_negative_users')
        out.create_dir('./common_negative_users/{}'.format(dataset))
        overall_negative_sens, overall_negative_nosens = None, None
        for m in models:
            clf = m
            exists, loaded_clf = aeio.load_model(clf, dataset)
            if exists:
                clf = loaded_clf
                print("Loaded {}...".format(str(clf)))
            else:
                print("Training {}...".format(str(clf)))
                clf.train(role_model_users, role_model_users_gt_labels)
                aeio.persist_model(clf, dataset)
            y_pred = clf.predict(users).astype(
                bool if dataset_info[dataset]['prediction_task'] ==
                CLASSIFICATION else float)
            m_negatives_sens = set(
                np.where(np.logical_and(y_pred < users_gt,
                                        users_sens_group))[0])
            m_negatives_nosens = set(
                np.where(np.logical_and(y_pred < users_gt,
                                        ~users_sens_group))[0])
            if overall_negative_nosens is None:
                overall_negative_nosens, overall_negative_sens = m_negatives_nosens, m_negatives_sens
            else:
                overall_negative_sens = overall_negative_sens.intersection(
                    m_negatives_sens)
                overall_negative_nosens = overall_negative_nosens.intersection(
                    m_negatives_nosens)
        print("\n{}\n".format(np.array(list(overall_negative_sens))))
        print("\n{}\n".format(np.array(list(overall_negative_nosens))))
        np.savetxt(common_negative_users_sens_filename,
                   np.array(list(overall_negative_sens)))
        np.savetxt(common_negative_users_nosens_filename,
                   np.array(list(overall_negative_nosens)))

    with open(res_file_path, 'w') as res_file:
        res_file.write("Sensitive group: {}\n\n{}".format(
            sens_group_desc[1], ds_statistics))

        # feature_desc = du.get_feature_descriptions(feature_info)
        feature_desc = get_feature_descriptions_temp(feature_info)
        res_file.write(feature_desc)

    analysis = [
        ("group-efforts", role_model_users_sens_group)
    ]  #[("overall-efforts", None)] # Only run for individual users for now

    # TODO: keep an eye on this mapping
    cost_groups = {
        cf.ONE_GROUP_IND: "all",
        0: sens_group_desc[0],
        1: sens_group_desc[1]
    }  # cf.ONE_GROUP_IND = -1

    for (analysis_name, cost_sens_group) in analysis:
        group_res_file_path = "{}/{}_res_{}.txt".format(
            res_dir, analysis_name, test_or_train)

        cost_funcs, feature_val_costs = dataset_info[dataset]['cost_funcs'](
            feature_info, role_model_users, cost_sens_group,
            dataset_info[dataset]['variable_constraints'])
        cost_funcs_rev, feature_val_costs_rev = dataset_info[dataset][
            'cost_funcs'](feature_info, role_model_users, cost_sens_group,
                          dataset_info[dataset]['variable_constraints_rev'])

        print("{}\n".format(feature_val_costs))
        print("{}\n".format(cost_funcs))

        with open(group_res_file_path, 'w') as group_res_file:
            group_res_file.write(
                "== Cost functions ==\n\nCosts are computed as (fraction < new value) - (fraction < old value), where values are ordered in the direction of increasing effort it takes to reach them.\n\n"
            )
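            # The cost above is an empirical-CDF difference: for a reference
            # column `ref`, moving a feature from `old` to `new` costs
            #     np.mean(ref < new) - np.mean(ref < old),
            # so e.g. moving from the 20th to the 80th percentile costs ~0.6.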
            # Note: this loop is never entered for the generic cost function.
            for sens_group_val, costs in feature_val_costs.items():
                group_label = cost_groups[sens_group_val]  # avoid shadowing the outer sens_group_desc
                group_res_file.write(
                    "=== Costs for group {}: ===\n\n{}\n".format(
                        group_label, aeio.get_cost_func_desc(costs)))
                group_res_file.write(
                    "=== Reverse costs for group {}: ===\n\n{}\n".format(
                        group_label,
                        aeio.get_cost_func_desc(
                            feature_val_costs_rev[sens_group_val])))

        # Randomly choose num_investigation_users to check their feature
        # values and the explanations generated for them
        np.random.seed(seed)
        investigation_users_sens = np.loadtxt(common_negative_users_sens_filename).astype(int)[:num_investigation_users] \
            if num_investigation_users is not None else np.loadtxt(common_negative_users_sens_filename).astype(int)  # np.random.randint(len(filtered_users_sens), size=num_investigation_users) # Hardcoded so the same users are used across all models
        investigation_users_nosens = np.loadtxt(common_negative_users_nosens_filename).astype(int)[:num_investigation_users] \
            if num_investigation_users is not None else np.loadtxt(common_negative_users_nosens_filename).astype(int)  # np.random.randint(len(filtered_users_nosens), size=num_investigation_users) # Hardcoded so the same users are used across all models
        # Values for individual users

        #print("Users:\n", group_desc)
        disparity_table_heading = ["model"]
        disparity_table_formats = [None]
        disparity_table_values = []

        with open(group_res_file_path, 'a') as group_res_file:
            group_res_file.write("=== Individual explanations: ===\n\n")
            group_res_file.write(
                "All disparities are calculated as abs(sens_val - nosens_val)\n\n"
            )
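        # Worked instance of the disparity above: a sens-group utility of 0.40
        # against a nosens-group utility of 0.25 gives abs(0.40 - 0.25) = 0.15.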

        for model in models:
            model_start = time.time()
            clf = model
            exists, loaded_clf = aeio.load_model(clf, dataset)
            if exists:
                clf = loaded_clf
                print("Loaded {}...".format(str(clf)))
            else:
                print("Training {}...".format(str(clf)))

                # TODO: ugly
                if isinstance(model, lm.FCLogReg):
                    clf.train(role_model_users, role_model_users_gt_labels,
                              learning_env.x_control_train)
                else:
                    clf.train(role_model_users, role_model_users_gt_labels)
                aeio.persist_model(clf, dataset)

            model_end = time.time()
            performance_stats = eval_formula.eval_model(
                clf, users, users_gt, dataset_info[dataset]['prediction_task'])
            y_test_pred = clf.predict(users).astype(
                bool if dataset_info[dataset]['prediction_task'] ==
                CLASSIFICATION else float)
            y_train_pred = clf.predict(role_model_users).astype(
                bool if dataset_info[dataset]['prediction_task'] ==
                CLASSIFICATION else float)

            ####Common var names; change these if you want to change analysis from train to test or vice versa
            role_model_users_pred = y_train_pred  # This should not change
            users_preds = y_test_pred if test_or_train == 'test' else y_train_pred  # change this based on which group's explanations are needed (test or train)

            with open(res_file_path, 'a') as res_file:
                res_file.write("Performance of {}\n\n{}\n\n".format(
                    str(clf), aeio.get_dict_listing(performance_stats)))
                res_file.write("Training {} took {:.2f} secs".format(
                    str(clf), model_end - model_start))

            investigation_explanations_sens, investigation_explanations_nosens = [], []

            sub_filter_sens = np.zeros(len(users), dtype=bool)
            sub_filter_nosens = np.zeros(len(users), dtype=bool)
            sens_neg = np.where(np.logical_and(users_sens_group,
                                               users_preds < users_gt))[0]
            nosens_neg = np.where(np.logical_and(~users_sens_group,
                                                 users_preds < users_gt))[0]
            if subsample_size is not None:
                sens_neg = sens_neg[:subsample_size]
                nosens_neg = nosens_neg[:subsample_size]
            sub_filter_sens[sens_neg] = True
            sub_filter_nosens[nosens_neg] = True
            sub_filter_sens[investigation_users_sens] = True
            sub_filter_nosens[investigation_users_nosens] = True
            filtered_users_sens = users[sub_filter_sens]
            user_gt_labels_sens = users_gt[sub_filter_sens]
            user_predicted_labels_sens = users_preds[sub_filter_sens]
            filtered_users_nosens = users[sub_filter_nosens]
            user_gt_labels_nosens = users_gt[sub_filter_nosens]
            user_predicted_labels_nosens = users_preds[sub_filter_nosens]

            user_utility_sens, user_utility_nosens, anchor_indices_sens, anchor_indices_nosens = [], [], [], []
            fp_count_sens, fp_count_nosens = 0, 0

            all_users_flipped = users.copy()
            all_users_flipped_labels = users_gt.copy()

            ind_start = time.time()

            for i, user in enumerate(filtered_users_sens):
                # if i % 100 == 0:
                print("Computing for user", np.where(sub_filter_sens)[0][i])
                user = np.array([user])
                index_in_users = np.where(sub_filter_sens)[0][i]

                optimizer = ge.SamplingMethod(
                    np.array([1]), feature_info, cost_funcs, cost_funcs_rev,
                    dataset_info[dataset]['variable_constraints'], clf,
                    dataset)
                new_feature_vector, utility, effort, false_positive, role_model_gt, anchor_index = optimizer.sampling_based_explanations(
                    user,
                    role_model_users,
                    role_model_users_gt_labels,
                    role_model_users_pred,
                    user_gt_labels_sens[i],
                    user_predicted_labels_sens[i],
                    user_sens_group=1)
                dir_up_cols_generator = cf.get_up_cols(
                    dataset_info[dataset]['variable_constraints'],
                    feature_info)
                for dir_up_cols in dir_up_cols_generator:
                    assert np.all(new_feature_vector[dir_up_cols] >=
                                  user.flatten()[dir_up_cols])  # sanity check
                new_predicted_label = clf.predict(
                    [new_feature_vector])[0] if dataset_info[dataset][
                        'prediction_task'] == REGRESSION else bool(
                            clf.predict([new_feature_vector])[0])
                all_users_flipped[index_in_users] = new_feature_vector
                all_users_flipped_labels[index_in_users] = role_model_gt
                if false_positive:
                    fp_count_sens += 1
                tar_nec_vars = np.where(new_feature_vector != user[0])[0]
                tar_vals = new_feature_vector[tar_nec_vars]
                old_vals = user[0][tar_nec_vars]
                user_utility_sens.append(utility)
                anchor_indices_sens.append(anchor_index)
                if np.where(sub_filter_sens)[0][i] in investigation_users_sens:
                    feature_wise_effort, explanation = aeio.get_feature_wise_effort(
                        feature_info, user[0], tar_nec_vars, tar_vals,
                        cost_funcs, cost_funcs_rev, True, role_model_gt,
                        new_predicted_label, user_gt_labels_sens[i],
                        user_predicted_labels_sens[i])
                    individual_feature_costs_str = aeio.get_feature_wise_str(
                        feature_wise_effort, explanation)
                    user_explanation = \
                        " * User {} explanation, utility: {:.3f}, effort: {:.3f}\n{}\n  * User gt label: {}, user_predicted_label:{}, role model gt label: {}, role model predicted label: {}; explanation:\n{}\n  * Old feature vals for user {}:\n{}\n".format(
                            np.where(np.where(sub_filter_sens)[0][i] == investigation_users_sens)[0][0], utility, effort, individual_feature_costs_str,
                            user_gt_labels_sens[i], user_predicted_labels_sens[i], role_model_gt, new_predicted_label,
                            aeio.get_conditions_str(feature_info, tar_nec_vars, tar_vals, scaler=scaler, level=3), np.where(np.where(sub_filter_sens)[0][i] == investigation_users_sens)[0][0],
                            aeio.get_conditions_str(feature_info, tar_nec_vars, old_vals, scaler=scaler, level=3))
                    investigation_explanations_sens.append(user_explanation)

            for i, user in enumerate(filtered_users_nosens):
                # if i % 100 == 0:
                print("Computing for user", np.where(sub_filter_nosens)[0][i])
                user = np.array([user])
                index_in_users = np.where(sub_filter_nosens)[0][i]

                optimizer = ge.SamplingMethod(
                    np.array([0]), feature_info, cost_funcs, cost_funcs_rev,
                    dataset_info[dataset]['variable_constraints'], clf,
                    dataset)
                new_feature_vector, utility, effort, false_positive, role_model_gt, anchor_index = optimizer.sampling_based_explanations(
                    user,
                    role_model_users,
                    role_model_users_gt_labels,
                    role_model_users_pred,
                    user_gt_labels_nosens[i],
                    user_predicted_labels_nosens[i],
                    user_sens_group=0)
                dir_up_cols_generator = cf.get_up_cols(
                    dataset_info[dataset]['variable_constraints'],
                    feature_info)
                for dir_up_cols in dir_up_cols_generator:
                    assert np.all(new_feature_vector[dir_up_cols] >=
                                  user.flatten()[dir_up_cols])  # sanity check
                new_predicted_label = clf.predict(
                    [new_feature_vector])[0] if dataset_info[dataset][
                        'prediction_task'] == REGRESSION else bool(
                            clf.predict([new_feature_vector])[0])
                all_users_flipped[index_in_users] = new_feature_vector
                all_users_flipped_labels[index_in_users] = role_model_gt
                if false_positive:
                    fp_count_nosens += 1
                tar_nec_vars = np.where(new_feature_vector != user[0])[0]
                tar_vals = new_feature_vector[tar_nec_vars]
                old_vals = user[0][tar_nec_vars]
                user_utility_nosens.append(utility)
                anchor_indices_nosens.append(anchor_index)
                if np.where(sub_filter_nosens)[0][i] in investigation_users_nosens:
                    feature_wise_effort, explanation = aeio.get_feature_wise_effort(
                        feature_info, user[0], tar_nec_vars, tar_vals,
                        cost_funcs, cost_funcs_rev, False, role_model_gt,
                        new_predicted_label, user_gt_labels_nosens[i],
                        user_predicted_labels_nosens[i])
                    individual_feature_costs_str = aeio.get_feature_wise_str(
                        feature_wise_effort, explanation)
                    user_explanation = \
                        " * User {} explanation, utility: {:.3f}, effort: {:.3f}\n{}\n  * User gt label: {}, user_predicted_label:{}, role model gt label: {}, role model predicted label: {}; explanation:\n{}\n  * Old feature vals for user {}:\n{}\n".format(
                            np.where(np.where(sub_filter_nosens)[0][i] == investigation_users_nosens)[0][0], utility, effort, individual_feature_costs_str,
                            user_gt_labels_nosens[i], user_predicted_labels_nosens[i], role_model_gt, new_predicted_label,
                            aeio.get_conditions_str(feature_info, tar_nec_vars, tar_vals, scaler=scaler, level=3), np.where(np.where(sub_filter_nosens)[0][i] == investigation_users_nosens)[0][0],
                            aeio.get_conditions_str(feature_info, tar_nec_vars, old_vals, scaler=scaler, level=3))
                    investigation_explanations_nosens.append(user_explanation)

            user_utility_sens = np.array(user_utility_sens)
            user_utility_nosens = np.array(user_utility_nosens)
            anchor_indices_sens = np.array(anchor_indices_sens)
            anchor_indices_nosens = np.array(anchor_indices_nosens)

            summary_of_useful_explanations = (
                "Total explanations given: {} ({} sens group, {} non-sens "
                "group); Useful explanations (utility > 0): {} ({} sens "
                "group, {} non-sens group)".format(
                    len(filtered_users_sens) + len(filtered_users_nosens),
                    len(filtered_users_sens), len(filtered_users_nosens),
                    np.count_nonzero(user_utility_sens > 0) +
                    np.count_nonzero(user_utility_nosens > 0),
                    np.count_nonzero(user_utility_sens > 0),
                    np.count_nonzero(user_utility_nosens > 0)))

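            # Scatter the per-group utilities and anchor indices back into
            # full-length arrays covering all users.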
            all_users_utilities = np.zeros(len(users))
            all_users_utilities[sub_filter_sens] = user_utility_sens
            all_users_utilities[sub_filter_nosens] = user_utility_nosens
            all_user_anchors = np.full(len(users), fill_value=np.nan)
            all_user_anchors[sub_filter_sens] = anchor_indices_sens
            all_user_anchors[sub_filter_nosens] = anchor_indices_nosens
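            # Persist the counterfactual ("flipped") feature vectors, labels,
            # utilities, and anchors for this model.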
            out.create_dir('./flipped_datasets')
            out.create_dir('./flipped_datasets/{}'.format(dataset))
            np.savetxt(
                './flipped_datasets/{}/{}_all_x_{}.txt'.format(
                    dataset, clf.filename(), test_or_train), all_users_flipped)
            np.savetxt(
                './flipped_datasets/{}/{}_all_y_{}.txt'.format(
                    dataset, clf.filename(), test_or_train),
                all_users_flipped_labels)
            np.savetxt(
                './flipped_datasets/{}/{}_utilities_{}.txt'.format(
                    dataset, clf.filename(), test_or_train),
                all_users_utilities)
            np.savetxt(
                './flipped_datasets/{}/{}_anchors_{}.txt'.format(
                    dataset, clf.filename(), test_or_train), all_user_anchors)

            user_explanations_sens = "".join(investigation_explanations_sens)
            models_to_individual_explanation_strings_sens[
                str(clf)] = user_explanations_sens

            user_explanations_nosens = "".join(
                investigation_explanations_nosens)
            models_to_individual_explanation_strings_nosens[
                str(clf)] = user_explanations_nosens

            ind_end = time.time()

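            # Emit the disparity-table heading and formats only once, on the
            # first model.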
            if len(disparity_table_heading) <= 1:
                heading, formats, values = eval_formula.get_disparity_measures(
                    users_gt,
                    users_preds,
                    users_sens_group,
                    np.nanmean(user_utility_sens),
                    np.nanmean(user_utility_nosens),
                    dataset_info[dataset]['prediction_task'],
                    return_heading_and_formats=True)
                disparity_table_heading += heading
                disparity_table_formats += formats
            else:
                values = eval_formula.get_disparity_measures(
                    users_gt,
                    users_preds,
                    users_sens_group,
                    np.nanmean(user_utility_sens),
                    np.nanmean(user_utility_nosens),
                    dataset_info[dataset]['prediction_task'],
                    return_heading_and_formats=False)
            disparity_table_values.append([str(clf)] + values)

            with open(group_res_file_path, 'a') as group_res_file:
                group_res_file.write("  * For {}:\n\n".format(str(clf)))
                group_res_file.write(
                    "   * Computation of explanations for {} ({} sens, "
                    "{} nonsens) individual users took {:.2f} seconds.\n\n"
                    "   * # False positive role models = {} sens, "
                    "{} nonsens.\n\n".format(
                        len(filtered_users_sens) + len(filtered_users_nosens),
                        len(filtered_users_sens), len(filtered_users_nosens),
                        ind_end - ind_start, fp_count_sens, fp_count_nosens))
                group_res_file.write(
                    "   * {}\n\n".format(summary_of_useful_explanations))

        with open(group_res_file_path, 'a') as group_res_file:
            group_res_file.write(
                "Disparity measures of different models:\n\n{}\n".format(
                    out.get_table(disparity_table_heading,
                                  disparity_table_values,
                                  val_format=disparity_table_formats)))
            group_res_file.write(
                aeio.get_disparity_plots(
                    res_dir,
                    disparity_table_heading,
                    disparity_table_values,
                    filename='all_disp_in_one_{}'.format(test_or_train)))
            if num_investigation_users is None or num_investigation_users > 0:
                for k, v in models_to_individual_explanation_strings_sens.items():
                    group_res_file.write(
                        "Randomly chosen sens users' explanations for {}:\n\n{}\n\n"
                        .format(k, v))
                for k, v in models_to_individual_explanation_strings_nosens.items():
                    group_res_file.write(
                        "Randomly chosen nonsens users' explanations for {}:\n\n{}\n\n"
                        .format(k, v))
Example n. 22
0
    def generate_explanations_for_test(self, model, x_train_flipped,
                                       y_train_flipped, x_test_flipped,
                                       y_test_flipped, threshold_sens,
                                       threshold_nosens, cost_funcs,
                                       cost_funcs_rev):
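        # Predict on the already-flipped train/test sets; cast to bool for
        # classification tasks and to float for regression.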
        y_test_pred = model.predict(x_test_flipped).astype(
            bool if self.prediction_task == exp.CLASSIFICATION else float)
        y_train_pred = model.predict(x_train_flipped).astype(
            bool if self.prediction_task == exp.CLASSIFICATION else float)
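        # Users whose prediction falls below their original test label get an
        # explanation; optionally subsample to bound the workload.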
        explanations_mask = np.zeros(len(x_test_flipped), dtype=bool)
        underpredicted = np.where(y_test_pred < self.y_test)[0]
        if self.subsample_size_test is not None:
            underpredicted = underpredicted[:self.subsample_size_test]
        explanations_mask[underpredicted] = True
        filtered_users = x_test_flipped[explanations_mask]
        filtered_users_sens_group = self.sens_group_test[explanations_mask]
        filtered_users_gt_labels = y_test_flipped[explanations_mask]
        filtered_users_pred_labels = y_test_pred[explanations_mask]
        user_utilities = np.zeros(len(filtered_users))


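        # Copy the already-flipped test set so a second round of flips can be
        # applied and saved separately.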
        x_test_flipped_twice = x_test_flipped.copy()
        y_test_flipped_twice = y_test_flipped.copy()

        for i, user in enumerate(filtered_users):
            user = np.array([user])
            index_in_users = np.where(explanations_mask)[0][i]

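            # The optimizer receives this user's actual sensitive-group
            # membership rather than a fixed group id.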
            optimizer = ge.SamplingMethod(
                np.array([filtered_users_sens_group[i]]), self.feature_info,
                cost_funcs, cost_funcs_rev,
                exp.dataset_info[self.dataset]['variable_constraints'], model,
                self.dataset)
            (i, index_in_users, user, new_feature_vector, utility, effort,
             false_positive, role_model_gt, anchor) = self.get_explanations_test(
                 i, index_in_users, optimizer, user, y_train_pred,
                 filtered_users_gt_labels, filtered_users_pred_labels,
                 filtered_users_sens_group, x_train_flipped, y_train_flipped)
            print("User {} retrieved".format(i))
            user_utilities[i] = utility
            x_test_flipped_twice[index_in_users] = new_feature_vector
            y_test_flipped_twice[index_in_users] = role_model_gt

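        # Map utilities back onto the full test set and persist the
        # twice-flipped data and utilities.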
        user_utilities = np.array(user_utilities)
        all_utilities = np.zeros(len(self.x_test))
        all_utilities[explanations_mask] = user_utilities
        out.create_dir('./flipped_datasets')
        out.create_dir('./flipped_datasets/{}'.format(self.dataset))
        np.savetxt(
            './flipped_datasets/{}/{}_x_test_twice_{}_{}.txt'.format(
                self.dataset, model.filename(), threshold_sens,
                threshold_nosens), x_test_flipped_twice)
        np.savetxt(
            './flipped_datasets/{}/{}_y_test_twice_{}_{}.txt'.format(
                self.dataset, model.filename(), threshold_sens,
                threshold_nosens), y_test_flipped_twice)
        np.savetxt(
            './flipped_datasets/{}/{}_new_utilities_test_{}_{}.txt'.format(
                self.dataset, model.filename(), threshold_sens,
                threshold_nosens), all_utilities)