def run_nfr_cv(n_dim, batch_size, n_iter, C, alpha, emd_method=emd_samples):
    global X, P, y, df, X_test

    reps = {}

    X_no_p = df.drop(['Y', 'P'], axis=1).values

    # convert the global arrays to torch tensors
    X = torch.tensor(X).float()
    P = torch.tensor(P).long()
    # train-test split
    data_train, data_test = split_data_np(
        (X.data.cpu().numpy(), P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    X_train_no_p = X_train[:, :-1]
    X_test_no_p = X_test[:, :-1]
    X_u = X[P == 1]
    X_n = X[P == 0]

    # NFR: fair representation trained with the adversary enabled.
    model_nfr = FairRep(len(X[0]), n_dim)
    return train_rep(model_nfr,
                     0.01,
                     X,
                     P,
                     n_iter,
                     c_iter=10,
                     batch_size=batch_size,
                     alpha=alpha,
                     C_reg=C)
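
# A minimal usage sketch (assumption: the globals X, P, y and df have been
# loaded beforehand, e.g. from the COMPAS data frame; the argument values
# below are illustrative only, not the original settings):
#
#   rep_result = run_nfr_cv(n_dim=30, batch_size=500, n_iter=1000, C=1.0, alpha=10)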
Example #2
def test_in_one(n_dim,
                batch_size,
                n_iter,
                C,
                alpha,
                compute_emd=True,
                k_nbrs=3,
                emd_method=emd_samples):
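    """Train AE, AE-P and NFR representations on the global data, fit a
    logistic regression on the raw features (with and without the protected
    attribute P) and on each learned representation, and collect the
    accuracy/fairness metrics for every model.

    Returns (results, y_hats, reps), each a dict keyed by model name.
    """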
    global X, P, y, df, X_test

    reps = {}
    y_hats = {}

    # features with the label Y and protected attribute P dropped
    X_no_p = torch.tensor(df.drop(['Y', 'P'], axis=1).values).float()

    # convert the global arrays to torch tensors
    X = torch.tensor(X).float()
    P = torch.tensor(P).long()
    # train-test split
    data_train, data_test = split_data_np(
        (X.data.cpu().numpy(), P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    X_train_no_p = X_train[:, :-1]
    X_test_no_p = X_test[:, :-1]
    X_u = X[P == 1]
    X_n = X[P == 0]

    # AE: plain autoencoder baseline (no adversary, alpha = 0).
    model_ae = FairRep(len(X[0]), n_dim)
    train_rep(model_ae,
              0.01,
              X,
              P,
              n_iter,
              10,
              batch_size,
              alpha=0,
              C_reg=0,
              compute_emd=compute_emd,
              adv=False,
              verbose=True)

    # AE-P: autoencoder on the features with the protected attribute removed.
    model_ae_P = FairRep(len(X[0]) - 1, n_dim - 1)
    train_rep(model_ae_P,
              0.01,
              X_no_p,
              P,
              n_iter,
              10,
              batch_size,
              alpha=0,
              C_reg=0,
              compute_emd=compute_emd,
              adv=False,
              verbose=True)

    # NFR: fair representation trained with the adversary enabled.
    model_nfr = FairRep(len(X[0]), n_dim)
    train_rep(model_nfr,
              0.01,
              X,
              P,
              n_iter,
              10,
              batch_size,
              alpha=alpha,
              C_reg=0,
              compute_emd=compute_emd,
              adv=True,
              verbose=True)
    results = {}

    print('begin testing.')
    X_ori_np = X.data.cpu().numpy()
    # Original: logistic regression on the raw features.
    model_name = 'compas_Original'
    print('logistic regression on the original...')
    lin_model, y_test_scores, performance = get_model_preds(
        X_train, y_train, P_train, X_test, y_test, P_test, model_name)
    y_hats[model_name] = get_preds_on_full_dataset(X, lin_model)
    reps[model_name] = None
    print(X_train.shape, X_test.shape)
    save_decision_boundary_plot(np.concatenate((X_train, X_test)),
                                np.concatenate((y_train, y_test)),
                                np.concatenate((P_train, P_test)), model_name)

    performance.append(emd_method(X_n, X_u))
    performance.append(
        get_consistency(X.data.cpu().numpy(), lin_model, n_neighbors=k_nbrs))
    performance.append(stat_diff(X.data.cpu().numpy(), P, lin_model))
    performance.append(equal_odds(X.data.cpu().numpy(), y, P, lin_model))
    # make_cal_plot(X.data.cpu().numpy(), y, P, lin_model, model_name)

    results[model_name] = performance

    # Original-P.
    model_name = 'compas_Original-P'
    print('logistic regression on the original-P')
    lin_model, y_test_scores, performance = get_model_preds(
        X_train_no_p, y_train, P_train, X_test_no_p, y_test, P_test,
        model_name)
    y_hats[model_name] = get_preds_on_full_dataset(X[:, :-1], lin_model)
    reps[model_name] = None
    save_decision_boundary_plot(np.concatenate((X_train_no_p, X_test_no_p)),
                                np.concatenate((y_train, y_test)),
                                np.concatenate((P_train, P_test)), model_name)

    performance.append(emd_method(X_n[:, :-1], X_u[:, :-1]))
    print('calculating consistency...')
    performance.append(
        get_consistency(X[:, :-1].data.cpu().numpy(),
                        lin_model,
                        n_neighbors=k_nbrs))
    print('calculating stat diff...')
    performance.append(stat_diff(X[:, :-1].data.cpu().numpy(), P, lin_model))
    performance.append(
        equal_odds(X[:, :-1].data.cpu().numpy(), y, P, lin_model))
    # make_cal_plot(X[:, :-1].data.cpu().numpy(), y, P, lin_model, model_name)

    results[model_name] = performance

    # use encoder
    model_name = 'compas_AE'

    U_0 = model_ae.encoder(X[P == 0]).data
    U_1 = model_ae.encoder(X[P == 1]).data
    U = model_ae.encoder(X).data

    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test

    print('logistic regression on AE...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    save_decision_boundary_plot(np.concatenate((X_train, X_test)),
                                np.concatenate((y_train, y_test)),
                                np.concatenate((P_train, P_test)), model_name)
    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
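    # equivalent to lin_model.predict_proba(X_test)[:, 1] for a binary
    # sklearn LogisticRegression; kept in the manual form used throughout.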
    y_hats[model_name] = get_preds_on_full_dataset(U, lin_model)
    reps[model_name] = U

    def calc_perf(y_test, y_test_scores, P_test, U, U_0, U_1, U_np, lin_model,
                  X_test, model_name):
        print('logistic regression evaluation...')
        performance = list(
            evaluate_performance_sim(y_test, y_test_scores, P_test))
        print('calculating emd...')
        performance.append(emd_method(U_0, U_1))
        print('calculating consistency...')
        performance.append(
            get_consistency(U_np,
                            lin_model,
                            n_neighbors=k_nbrs,
                            based_on=X_ori_np))
        print('calculating stat diff...')
        performance.append(stat_diff(X_test, P_test, lin_model))
        print('calculating equal odds...')
        performance.append(equal_odds(X_test, y_test, P_test, lin_model))
        # make_cal_plot(X_test, y_test, P_test, lin_model, model_name)
        return performance
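
    # calc_perf returns the metrics from evaluate_performance_sim followed, in
    # order, by the representation EMD, kNN consistency, stat_diff and the
    # equal_odds measure.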

    performance = calc_perf(y_test, y_test_scores, P_test, U, U_0, U_1, U_np,
                            lin_model, X_test, model_name)
    results[model_name] = performance

    # AE minus P
    model_name = 'compas_AE_P'
    U_0 = model_ae_P.encoder(X[:, :-1][P == 0]).data
    U_1 = model_ae_P.encoder(X[:, :-1][P == 1]).data
    U = model_ae_P.encoder(X[:, :-1]).data
    print('ae-p emd afterwards: ' + str(emd_method(U_0, U_1)))
    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test

    print('logistic regression on AE-P...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    save_decision_boundary_plot(np.concatenate((X_train, X_test)),
                                np.concatenate((y_train, y_test)),
                                np.concatenate((P_train, P_test)), model_name)

    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    y_hats[model_name] = get_preds_on_full_dataset(U, lin_model)
    reps[model_name] = U

    performance = calc_perf(y_test, y_test_scores, P_test, U, U_0, U_1, U_np,
                            lin_model, X_test, model_name)
    results[model_name] = performance

    model_name = 'compas_NFR'
    U_0 = model_nfr.encoder(X[P == 0]).data
    U_1 = model_nfr.encoder(X[P == 1]).data
    U = model_nfr.encoder(X).data
    print('nfr emd afterwards: ' + str(emd_method(U_0, U_1)))

    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on NFR...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    save_decision_boundary_plot(np.concatenate((X_train, X_test)),
                                np.concatenate((y_train, y_test)),
                                np.concatenate((P_train, P_test)), model_name)

    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    y_hats[model_name] = get_preds_on_full_dataset(U, lin_model)
    reps[model_name] = U

    performance = calc_perf(y_test, y_test_scores, P_test, U, U_0, U_1, U_np,
                            lin_model, X_test, model_name)
    results[model_name] = performance

    return results, y_hats, reps
Example #3
def test_in_one(n_dim,
                batch_size,
                n_iter,
                C,
                alpha,
                compute_emd=True,
                k_nbrs=3,
                emd_method=emd_samples):
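    """Variant of test_in_one: train AE, AE-P and NFR representations, fit a
    logistic regression on the original features (with and without the
    protected attribute) and on each representation, and return only the
    results dict of per-model metrics."""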
    global X, P, y
    # AE: plain autoencoder baseline (no adversary, alpha = 0).
    model_ae = FairRep(len(X[0]), n_dim)
    #model_ae.cuda()
    X = torch.tensor(X).float()
    P = torch.tensor(P).long()
    # group-wise views of the data, used for the EMD metric below
    X_u = X[P == 1]
    X_n = X[P == 0]
    train_rep(model_ae,
              0.01,
              X,
              P,
              n_iter,
              10,
              batch_size,
              alpha=0,
              C_reg=0,
              compute_emd=compute_emd,
              adv=False,
              verbose=True)
    # AE-P: autoencoder on the features with the protected attribute removed.
    model_ae_P = FairRep(len(X[0]) - 1, n_dim - 1)
    #model_ae_P.cuda()
    train_rep(model_ae_P,
              0.01,
              X[:, :-1],
              P,
              n_iter,
              10,
              batch_size,
              alpha=0,
              C_reg=0,
              compute_emd=compute_emd,
              adv=False,
              verbose=True)
    # NFR: fair representation trained with the adversary enabled.
    model_nfr = FairRep(len(X[0]), n_dim)
    #model_nfr.cuda()
    train_rep(model_nfr,
              0.01,
              X,
              P,
              n_iter,
              10,
              batch_size,
              alpha=alpha,
              C_reg=0,
              compute_emd=compute_emd,
              adv=True,
              verbose=True)
    results = {}

    print('begin testing.')
    X_ori_np = X.data.cpu().numpy()
    # Original.
    data_train, data_test = split_data_np(
        (X.data.cpu().numpy(), P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on the original...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)
    #print(lin_model.coef_.shape)
    #int(X_train.shape)

    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(X_n, X_u))
    print('calculating consistency...')
    performance.append(
        get_consistency(X.data.cpu().numpy(), lin_model, n_neighbors=k_nbrs))
    print('calculating stat diff...')
    performance.append(stat_diff(X.data.cpu().numpy(), P, lin_model))
    results['Original'] = performance
    # Original-P.
    data_train, data_test = split_data_np(
        (X[:, :-1].data.cpu().numpy(), P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on the original-P')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)

    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(X_n[:, :-1], X_u[:, :-1]))
    print('calculating consistency...')
    performance.append(
        get_consistency(X[:, :-1].data.cpu().numpy(),
                        lin_model,
                        n_neighbors=k_nbrs))
    print('calculating stat diff...')
    performance.append(stat_diff(X[:, :-1].data.cpu().numpy(), P, lin_model))
    results['Original-P'] = performance
    U_0 = model_ae.encoder(X[P == 0]).data
    U_1 = model_ae.encoder(X[P == 1]).data
    U = model_ae.encoder(X).data
    print('ae emd afterwards: ' + str(emd_method(U_0, U_1)))
    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test

    print('logistic regression on AE...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)

    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(U_0, U_1))
    print('calculating consistency...')
    performance.append(
        get_consistency(U_np, lin_model, n_neighbors=k_nbrs,
                        based_on=X_ori_np))
    print('calculating stat diff...')
    performance.append(stat_diff(X_test, P_test, lin_model))
    results['AE'] = performance

    U_0 = model_ae_P.encoder(X[:, :-1][P == 0]).data
    U_1 = model_ae_P.encoder(X[:, :-1][P == 1]).data
    U = model_ae_P.encoder(X[:, :-1]).data
    print('ae-p emd afterwards: ' + str(emd_method(U_0, U_1)))
    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test

    print('logistic regression on AE-P...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)

    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(U_0, U_1))
    print('calculating consistency...')
    performance.append(
        get_consistency(U_np, lin_model, n_neighbors=k_nbrs,
                        based_on=X_ori_np))
    print('calculating stat diff...')
    performance.append(stat_diff(X_test, P_test, lin_model))
    results['AE_P'] = performance

    U_0 = model_nfr.encoder(X[P == 0]).data
    U_1 = model_nfr.encoder(X[P == 1]).data
    U = model_nfr.encoder(X).data
    print('nfr emd afterwards: ' + str(emd_method(U_0, U_1)))

    U_np = U.cpu().numpy()
    data_train, data_test = split_data_np((U_np, P.data.cpu().numpy(), y), 0.7)
    X_train, P_train, y_train = data_train
    X_test, P_test, y_test = data_test
    print('logistic regression on NFR...')
    lin_model = LogisticRegression(C=C, solver='sag', max_iter=2000)
    lin_model.fit(X_train, y_train)

    y_test_scores = sigmoid(
        (X_test.dot(lin_model.coef_.T) + lin_model.intercept_).flatten())
    print('logistic regression evaluation...')
    performance = list(evaluate_performance_sim(y_test, y_test_scores, P_test))
    print('calculating emd...')
    performance.append(emd_method(U_0, U_1))
    print('calculating consistency...')
    performance.append(
        get_consistency(U_np, lin_model, n_neighbors=k_nbrs,
                        based_on=X_ori_np))
    print('calculating stat diff...')
    performance.append(stat_diff(X_test, P_test, lin_model))
    results['NFR'] = performance

    return results
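
# A minimal usage sketch (assumption: the module-level globals X, P and y were
# populated beforehand from the data set; the argument values below are
# illustrative rather than the original settings):
#
#   results = test_in_one(n_dim=30, batch_size=500, n_iter=1000, C=1.0, alpha=10)
#   for name, performance in results.items():
#       print(name, performance)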