Example #1
import torch
from box import Box
from torch.utils.data import TensorDataset

# load_adult, load_health, and `logger` come from the surrounding project
# (Example #5 imports load_adult from src.common.data.adult).


def get_dataset(data_config, device="cpu"):
    """Take a data config and return train/valid/test TensorDatasets plus metadata."""
    dataname = data_config.name
    val_size = data_config.val_size

    if dataname == "adult":
        data = load_adult(val_size=val_size)
        c_size = 2
        c_type = "binary"
    elif dataname == "health":
        data = load_health(val_size=val_size)
        c_size = 9
        c_type = "one_hot"
    else:
        logger.error(f"Invalid data name {dataname} specified")
        raise Exception(f"Invalid data name {dataname} specified")

    train, valid, test = data["train"], data["valid"], data["test"]
    if valid is None:
        # fall back to the test split when no validation split was created
        valid = data["test"]

    def to_tensor_dataset(split):
        return TensorDataset(
            torch.tensor(split[0]).float().to(device),
            torch.tensor(split[1]).long().to(device),
            torch.tensor(split[2]).long().to(device),
        )

    return (
        Box({
            "train": to_tensor_dataset(train),
            "test": to_tensor_dataset(test),
            "valid": to_tensor_dataset(valid),
        }),
        {
            "input_shape": train[0].shape[1:],
            "c_size": c_size,
            "c_type": c_type,
            "y_size": 2,
            "y_type": "binary",
        },
    )
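For reference, a minimal usage sketch; the config keys `name` and `val_size` are the ones read above, everything else here is hypothetical:

# Hypothetical usage of get_dataset (not part of the original file).
from box import Box
from torch.utils.data import DataLoader

datasets, meta = get_dataset(Box({"name": "adult", "val_size": 0.2}), device="cpu")
loader = DataLoader(datasets.train, batch_size=64, shuffle=True)
x, c, y = next(iter(loader))  # features, sensitive attribute, label (per Example #5's indexing)
print(meta["input_shape"], meta["c_size"], meta["c_type"])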
Example #2
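This example begins partway through `get_optimal_front`. A minimal sketch of the missing prologue, assuming a cvxpy program that maximizes accuracy subject to a demographic-parity budget `dp` (the objective, sweep grid, and variable setup are assumptions inferred from the fragment, not the original code):

import cvxpy
import numpy

def get_optimal_front(Y, C):
    # estimate the joint distribution p(Y=y, C=c) from the data
    p_y_c = numpy.zeros((len(numpy.unique(Y)), len(numpy.unique(C))))
    for c in range(p_y_c.shape[1]):
        for y in range(p_y_c.shape[0]):
            p_y_c[y, c] = numpy.logical_and(Y == y, C == c).mean()
    p_c = p_y_c.sum(axis=0)  # group marginals p(C=c)

    solution = []
    for dp in numpy.linspace(0, 1, 21):  # assumed budget grid
        # delta[i]: label mass moved away from y=1 within group i
        delta = cvxpy.Variable(p_y_c.shape[1])
        # one plausible objective: accuracy = 1 - total mass flipped
        objective = cvxpy.Maximize(1 - cvxpy.sum(cvxpy.abs(delta)))
        constraints = []
        # ... the original fragment picks up from here: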
        # pairwise demographic-parity constraints: the adjusted positive rates
        # of every pair of groups (i, j) must differ by at most dp
        for i in range(p_y_c.shape[1]):
            for j in range(i + 1, p_y_c.shape[1]):
                constraints.extend([
                    -dp <= (p_y_c[1, i] - delta[i]) / p_c[i] -
                    (p_y_c[1, j] - delta[j]) / p_c[j],
                    (p_y_c[1, i] - delta[i]) / p_c[i] -
                    (p_y_c[1, j] - delta[j]) / p_c[j] <= dp,
                ])
        prob = cvxpy.Problem(objective, constraints)
        result = prob.solve()
        solution.append([result, dp])
        print(f"DP: {dp}, sol : {result}")

    return solution


if __name__ == "__main__":
    for data in ["adult", "health"]:
        # compute ideal areas
        if data == "adult":
            adult = load_adult(0.2)
            Y = adult["test"][2]
            C = adult["test"][1]
        elif data == "health":
            health = load_health(0.2)
            Y = health["test"][2]
            C = health["test"][1]

        solution = get_optimal_front(Y, C)
Example #3
# Assumes the original script's module-level imports and helpers: numpy,
# fairlearn.metrics.demographic_parity_difference, texttable.Texttable,
# latextable, os, and project utilities (get_pareto_front,
# get_dataframe_from_results, get_optimal_front, os_utils, FIGURES_FOLDER).
def area_over_curve_lp():
    def compute_ideal_area(Y, C):
        len_c = len(numpy.unique(C))
        len_y = len(numpy.unique(Y))
        p_y_c = numpy.zeros((len_y, len_c))

        for c in range(len_c):
            for y in range(len_y):
                p_y_c[y, c] = numpy.logical_and(Y == y, C == c).mean()
        print(p_y_c)

        # average joint mass p(y=1, C=c) across groups (computed but unused below)
        desired_rate = p_y_c[1, :].mean()
        errors = p_y_c[1, :] - desired_rate

        majority_acc = max(numpy.mean(Y == 1), 1 - numpy.mean(Y == 1))
        # dp of a perfect predictor, i.e. the dataset's own demographic parity gap
        max_dp = demographic_parity_difference(Y, Y, sensitive_features=C)

        solution = get_optimal_front(Y, C)
        # add no error and max_dp to the solution
        solution.append([1, max_dp])

        solution = numpy.array(solution)

        # sort by dp
        solution = solution[solution[:, 1].argsort()]

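        # worked example with hypothetical numbers: majority_acc = 0.76 and
        # solution = [[0.80, 0.05], [0.84, 0.19]] give one rectangle,
        # area = (0.80 - 0.76) * (0.19 - 0.05) = 0.0056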
        area = numpy.sum(
            # (acc - majority_acc) * (dp_next - dp_cur): rectangles under the front
            (solution[:-1, 0] - majority_acc) *
            (solution[1:, 1] - solution[0:-1, 1]))
        return area, majority_acc, max_dp

    # Methods
    methods = [
        "fcrl", "cvib_supervised", "lag-fairness", "maxent_arl", "laftr",
        "adv_forgetting"
    ]

    # compute the area-over-curve table
    area = {}
    for data in ["adult", "health"]:
        # compute ideal areas
        if data == "adult":
            adult = load_adult(0.2)
            Y = adult["test"][2]
            C = adult["test"][1]
        elif data == "health":
            health = load_health(0.2)
            Y = health["test"][2]
            C = health["test"][1]

        norm_area, majority_acc, max_dp = compute_ideal_area(Y, C)

        area[data] = {}
        for key in [
                "nn_1_layer", "nn_2_layer", "random_forest", "svm",
                "logistic_regression"
        ]:
            area[data][key] = {}
            for m in methods:
                if data == "health" and m == "laftr":
                    continue
                t = numpy.load(f"result/eval/{data}/{m}.npy",
                               allow_pickle=True).item()
                df = get_dataframe_from_results(t)

                # get pareto front
                pareto = df[[f'{key}_normalized_acc',
                             f'{key}_normalized_dp']].values
                # drop nan
                pareto = pareto[~numpy.isnan(pareto).any(axis=1)]
                pareto = get_pareto_front(pareto)
                pareto = numpy.array(pareto)
                pareto = pareto[pareto[:, 1].argsort()]

                # reject pareto points whose dp exceeds the dataset's own dp
                THRESH = 1.0
                cut = pareto.shape[0]
                while cut > 0 and pareto[cut - 1, 1] > THRESH * max_dp:
                    cut -= 1
                if cut == 0:
                    area[data][key][m] = 0
                    print(f"No point found below dp_max for {m}, {data}")
                    continue
                pareto = pareto[:cut]

                # prepend (majority_acc, 0) as a reference point for the horizontal bars,
                # and append (pareto[-1, 0], max_dp), i.e. the best accuracy at the data's own dp
                pareto = numpy.concatenate(
                    [[[majority_acc, 0]], pareto, [[pareto[-1, 0], max_dp]]],
                    axis=0)

                # get area by summing rectangles:
                # (acc - baseline) * (dp_next - dp_cur)
                area[data][key][m] = numpy.sum(
                    (pareto[:-1, 0] - pareto[0, 0]) *
                    (pareto[1:, 1] - pareto[0:-1, 1]))

                # normalize
                area[data][key][m] /= norm_area

    # dump to table
    for key in [
            "nn_1_layer", "nn_2_layer", "random_forest", "svm",
            "logistic_regression"
    ]:
        table = Texttable()
        table.set_cols_align(["l", "c", "c"])
        table.header(["Method", "UCI Adult", "Heritage Health"])
        display_name = {
            "fcrl": "FCRL (Ours)",
            "cvib_supervised": "CVIB",
            "lag-fairness": "MIFR",
            "maxent_arl": "MaxEnt-ARL",
            "laftr": "LAFTR",
            "adv_forgetting": "Adversarial Forgetting",
        }
        for m in methods:
            # laftr has no health results (skipped above), so report N/A
            health_val = "N/A" if m == "laftr" else area["health"][key][m]
            table.add_row([display_name[m], area["adult"][key][m], health_val])

        os_utils.safe_makedirs(os.path.join(FIGURES_FOLDER, "table"))
        with open(os.path.join(FIGURES_FOLDER, "table", f"{key}.better.tex"),
                  'w') as f:
            f.write(
                latextable.draw_latex(
                    table,
                    caption="Area Over Parity Accuracy Curve",
                    label=f"AOPAC_{key}"))
Example #4
# Assumes the same module-level imports and helpers as Example #3, plus
# matplotlib's pyplot and the plotting constants COLOR, MARKER,
# SCATTER_MARKERSIZE, FORMAT.
def figure9():
    def compute_ideal_stats(Y, C):
        len_c = len(numpy.unique(C))
        len_y = len(numpy.unique(Y))
        p_y_c = numpy.zeros((len_y, len_c))

        for c in range(len_c):
            for y in range(len_y):
                p_y_c[y, c] = numpy.logical_and(Y == y, C == c).mean()
        print(p_y_c)

        # average joint mass p(y=1, C=c) across groups (computed but unused below)
        desired_rate = p_y_c[1, :].mean()
        errors = p_y_c[1, :] - desired_rate

        majority_acc = max(numpy.mean(Y == 1), 1 - numpy.mean(Y == 1))
        max_dp = demographic_parity_difference(Y, Y, sensitive_features=C)

        return 0, majority_acc, max_dp  # the area slot is unused by figure9

    # save the current font sizes so they can be restored at the end
    fontsize = pyplot.rcParams.get("font.size")
    xlabelsize = pyplot.rcParams.get("xtick.labelsize")
    ylabelsize = pyplot.rcParams.get("ytick.labelsize")
    labelsize = pyplot.rcParams.get("axes.labelsize")
    titlesize = pyplot.rcParams.get("axes.titlesize")

    pyplot.rcParams.update({
        "font.size": 12,
        "xtick.labelsize": 16,
        "ytick.labelsize": 16,
        "axes.labelsize": 16,
        "axes.titlesize": 20
    })

    for data in ["adult", "health"]:
        # compute ideal stats
        if data == "adult":
            adult = load_adult(0.2)
            Y = adult["test"][2]
            C = adult["test"][1]
        elif data == "health":
            health = load_health(0.2)
            Y = health["test"][2]
            C = health["test"][1]

        _, RANDOM_ACC, MAX_DP = compute_ideal_stats(Y, C)
        t = numpy.load(f"result/eval/{data}/fcrl.npy",
                       allow_pickle=True).item()
        df = get_dataframe_from_results(t)
        for idx, key in enumerate(["nn_1_layer"]):
            figure = pyplot.figure(figsize=(16, 8))
            ax = figure.add_subplot(1, 1, 1)
            pareto = get_pareto_front(
                df[[f'{key}_normalized_acc', f'{key}_normalized_dp']].values)
            pareto = numpy.array(pareto)
            pareto = pareto[pareto[:, 1].argsort()]

            # plot the points
            df.plot(kind="scatter",
                    x=f'{key}_normalized_acc',
                    y=f'{key}_normalized_dp',
                    c="none",
                    edgecolors=COLOR[0],
                    linewidth=2,
                    marker=MARKER[0],
                    ax=ax,
                    s=SCATTER_MARKERSIZE,
                    label='All Models')
            ax.scatter(pareto[:, 0],
                       pareto[:, 1],
                       label="Pareto Front",
                       c="none",
                       edgecolors=COLOR[1],
                       linewidth=2,
                       marker=MARKER[1],
                       s=SCATTER_MARKERSIZE)

            # horizontal bars: each Pareto point spans from the majority-class
            # accuracy to its own accuracy, up to the next point's dp
            ax.barh(y=pareto[:-1, 1],
                    width=pareto[:-1, 0] - RANDOM_ACC,
                    height=pareto[1:, 1] - pareto[:-1, 1],
                    left=RANDOM_ACC,
                    color="yellow",
                    alpha=0.2,
                    align="edge",
                    edgecolor="red")
            ax.barh(y=pareto[-1, 1],
                    height=MAX_DP - pareto[-1, 1],
                    width=pareto[-1, 0] - RANDOM_ACC,
                    left=RANDOM_ACC,
                    color="yellow",
                    alpha=0.2,
                    align="edge",
                    edgecolor="red")

            # naive ideal front: accuracy 1 at every dp
            ax.plot([1, 1], [MAX_DP, 0], color="red", label="Ideal")

            # tighter ideal front from the LP
            solution = get_optimal_front(Y, C)
            solution.append([1, MAX_DP])
            solution = numpy.array(solution)
            solution = solution[solution[:, 1].argsort()]
            ax.plot(solution[:, 0],
                    solution[:, 1],
                    color="cyan",
                    label="Ideal (LP)")

            # box
            ax.plot([RANDOM_ACC, RANDOM_ACC], [0, MAX_DP],
                    color="gray",
                    linestyle="--")
            ax.plot([RANDOM_ACC, 1], [0, 0], color="gray", linestyle="--")
            ax.plot([RANDOM_ACC, 1], [MAX_DP, MAX_DP],
                    color="gray",
                    linestyle="--")

            ax.set_xlabel("Accuracy")
            ax.set_ylabel(r"$\Delta_{DP}$")
            # ax.set_title(
            # "Acc Vs $\Delta_{DP}$" + f" ({'UCI Adult' if data == 'adult' else 'Heritage Health'})")
            ax.legend()
            ax.set_xlim(left=RANDOM_ACC - 0.005, right=1.005)
            os_utils.safe_makedirs(os.path.join(FIGURES_FOLDER, "appendix"))
            pyplot.savefig(os.path.join(FIGURES_FOLDER, "appendix",
                                        f"pareto_{data}_{key}.{FORMAT}"),
                           bbox_inches='tight')
            pyplot.close()

    # restore the saved font sizes
    pyplot.rcParams.update({
        "font.size": fontsize,
        "xtick.labelsize": xlabelsize,
        "ytick.labelsize": ylabelsize,
        "axes.labelsize": labelsize,
        "axes.titlesize": titlesize
    })
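The manual rcParams save/restore in figure9 can also be expressed with matplotlib's rc_context, which restores the previous values automatically on exit; a minimal sketch:

from matplotlib import pyplot

# pyplot.rc_context applies the overrides only inside the with-block
with pyplot.rc_context({"font.size": 12, "xtick.labelsize": 16,
                        "ytick.labelsize": 16, "axes.labelsize": 16,
                        "axes.titlesize": 20}):
    pass  # build and save the figures here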
Example #5
import os

import numpy as np

from src.common.data.adult import load_adult

if __name__=="__main__":
    data  = load_adult(val_size=0)
    f_out_np = 'laftr/data/adult/adult.npz'
    train = data["train"]
    test = data["test"]

    D = {"training": {}, "test":{}}
    D["training"]["X"] = train[0]
    D["training"]["Y"] = train[2]
    D["training"]["A"] = train[1]

    D["test"]["X"] = test[0]
    D["test"]["Y"] = test[2]
    D["test"]["A"] = test[1]

    # Since we don't want to use the validation strategy, this split could be
    # reduced to a minimum so that all the training data is used; it should
    # not matter much.
    n = D['training']['X'].shape[0]
    shuf = np.random.permutation(n)
    valid_pct = 0.2
    valid_ct = int(n * valid_pct)
    valid_inds = shuf[:valid_ct]
    train_inds = shuf[valid_ct:]

    os.makedirs(os.path.dirname(f_out_np), exist_ok=True)  # np.savez needs the directory to exist
    np.savez(f_out_np, x_train=D['training']['X'], x_test=D['test']['X'],
             y_train=D['training']['Y'], y_test=D['test']['Y'],
             attr_train=D['training']['A'], attr_test=D['test']['A'],
             train_inds=train_inds, valid_inds=valid_inds)
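A quick sanity check of the exported archive (a sketch; the key names are the ones written by np.savez above):

import numpy as np

d = np.load('laftr/data/adult/adult.npz')
print({k: d[k].shape for k in d.files})
# every training row should land in exactly one of the two index sets
assert len(d['train_inds']) + len(d['valid_inds']) == d['x_train'].shape[0]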