Python get_adult_data Examples, DataPreprocessing.get_adult_data Python Examples

Example #1

0

Show file

File: run_experiment.py Project: davevanveen/fairness_logistic_regression

def run_fairness_regression(s_id, writer_name, **kwargs):
    """Fit a FairLogisticRegression on the adult data and print its test MSE.

    Args:
        s_id: Passed unchanged to ``get_adult_data``.
        writer_name: Used as both ``log_dir`` and ``comment`` of the
            ``SummaryWriter`` that is handed to ``fit``.
        **kwargs: Forwarded unchanged to ``FairLogisticRegression``.
    """
    writer = SummaryWriter(log_dir=writer_name, comment=writer_name)

    # Import data as pandas dataframes
    s, x_train, y_train, x_test, y_test = get_adult_data(s_id)

    # Convert the dataframes into PyTorch variables and cuda-fy if available.
    # `.values` is used instead of `.as_matrix()`: the latter was deprecated
    # in pandas 0.23 and removed in 1.0, while `.values` works on both.
    x_train = Variable(torch.from_numpy(x_train.values))
    y_train = Variable(torch.from_numpy(y_train.values).long())
    x_test = Variable(torch.from_numpy(x_test.values))

    if torch.cuda.is_available():
        x_train = x_train.cuda()
        y_train = y_train.cuda()
        x_test = x_test.cuda()

    # We'll only compare y_test as a numpy array, so don't bother to convert
    y_test = y_test.values

    # Instantiate and fit the model
    flr = FairLogisticRegression(**kwargs)
    flr.fit(x_train, y_train, s, writer=writer)

    # Predict x_test, then bring the result back to the CPU as a numpy array
    # so it can be compared against y_test
    y_pred = flr.predict(x_test).data.cpu().numpy()
    mse = mean_squared_error(y_test, y_pred)

    print('MSE: {}'.format(mse))

Example #2

0

Show file

File: test_utils.py Project: davevanveen/fairness_logistic_regression

def get_adult_test_data(s_ids):
    """Load the adult training data as PyTorch variables for use in tests.

    Args:
        s_ids: Passed unchanged to ``get_adult_data``.

    Returns:
        A tuple ``(s, x, y)``: ``s`` is the first value returned by
        ``get_adult_data``; ``x`` and ``y`` are the first two values returned
        by ``split_data(x_train, y_train, 0.5)``.
    """
    # Only the training portion is used here; the test portion is discarded.
    s, x_train, y_train, _, _ = get_adult_data(s_ids)

    # `.values` replaces `.as_matrix()`, which pandas deprecated in 0.23 and
    # removed in 1.0.
    x_train = Variable(torch.from_numpy(x_train.values))
    y_train = Variable(torch.from_numpy(y_train.values).long())

    if torch.cuda.is_available():
        x_train = x_train.cuda()
        y_train = y_train.cuda()

    x, y, _, _ = split_data(x_train, y_train, 0.5)

    return s, x, y

Example #3

0

Show file

def update_df(json_fn, pkl_fn, csv_fn):
    """Add a 'Novel Penalty' column to a results CSV and save an updated copy.

    For every row of the CSV, the model stored under ``row.ID_String`` is
    looked up and its ``fairness_penalty(..., penalty_type='novel')`` is
    summed over minibatches of the adult test data.

    Args:
        json_fn: Path to a JSON file; its ``'s_ids'`` and ``'batch_size'``
            entries are read.
        pkl_fn: Path to a pickle file holding a mapping from ID string to
            model.
        csv_fn: Path to the results CSV. The output is written to
            ``csv_fn[:-4] + '_updated.csv'``, i.e. the last four characters
            (expected to be ``.csv``) are replaced.
    """
    with open(json_fn, 'r') as json_file:
        my_kws = json.load(json_file)
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only point this at model files from a trusted source.
    with open(pkl_fn, 'rb') as pickle_file:
        models = pickle.load(pickle_file)

    df = pd.read_csv(csv_fn)

    # Load and convert the data. `.values` replaces `.as_matrix()`, which
    # pandas deprecated in 0.23 and removed in 1.0.
    s, _, _1, x_test, y_test = get_adult_data(my_kws['s_ids'])
    x_test = Variable(torch.from_numpy(x_test.values))
    y_test = Variable(torch.from_numpy(y_test.values).long())

    # Create a dataloader to iterate over minibatches
    ds = TensorDataset(x_test.data.cpu(), y_test.data.cpu())
    loader = DataLoader(ds, batch_size=my_kws['batch_size'], shuffle=False)

    # The CUDA check does not change between rows, so evaluate it once
    use_cuda = torch.cuda.is_available()

    current_df_novel_penalties = []

    for _, row in df.iterrows():
        # Get the current model
        current_model = models[row.ID_String]
        if use_cuda:
            current_model.model = current_model.model.cuda()

        # Calculate the penalty
        penalty = 0.
        # Load across batches for computational efficiency
        for data in loader:
            inputs, labels = to_Variables(*data)
            penalty += current_model.fairness_penalty(inputs,
                                                      labels,
                                                      inputs,
                                                      labels,
                                                      s,
                                                      penalty_type='novel')

        # Add the value to the current list.
        # NOTE(review): `.data[0]` matches the Variable-era PyTorch API this
        # file targets; on newer PyTorch a 0-dim result would need `.item()`
        # instead -- confirm the supported PyTorch version.
        current_df_novel_penalties.append(penalty.data[0])

    # Now add the complete column to the current dataframe and write it to a file
    df['Novel Penalty'] = current_df_novel_penalties
    df.to_csv(csv_fn[:-4] + '_updated.csv')

Example #4

0

Show file

# Open the models and pick one or more to show the more fair version of the plots in the presentation
import sys
sys.path.append('../fair_regression')

import torch
import pickle
import pandas as pd

from torch.autograd import Variable
from DataPreprocessing import get_adult_data


# Only the test portion of the data is needed for the plots
s, _, _1, x_test, y_test = get_adult_data(['Sex_Female'])

# NOTE: pickle.load can execute arbitrary code embedded in the file; only
# load model files produced by a trusted run.
with open('../fair_regression/save_temporary_models.pkl', 'rb') as f:
    model_dict = pickle.load(f)

# Build the model input before any extra columns are added to x_test.
# `.values` replaces `.as_matrix()`, which pandas deprecated in 0.23 and
# removed in 1.0.
torch_xt = Variable(torch.from_numpy(x_test.values))
if torch.cuda.is_available():
    torch_xt = torch_xt.cuda()

# Pick out models from fold 1 (doesn't really matter which one)
model_str = 'fold: 1 pen: {:0.5g} type: {}'

model_types = ['plain', 'indiv', 'group']
penalties = [1e-3, 1e-2, 1e-1]

# Keep the ground-truth labels next to the features for later comparison
x_test['True Y'] = y_test

new_df = pd.DataFrame()
for t in model_types:
    for p in penalties:

Example #5

0

Show file

File: run_cv.py Project: davevanveen/fairness_logistic_regression

    def __init__(self,
                 s_ids=['Sex_Female'],
                 n_epochs=256,
                 batch_size=512,
                 plain_batch_size=32,
                 ftol=1e-6,
                 batch_fairness=True,
                 l_fair_logspace=[-6, 2, 4],
                 cv=3,
                 csv_fn='save_temporary_results.csv',
                 models_fn='save_temporary_models.pkl',
                 pred_fn='save_temporary_preds.csv'):
        """Load the adult data and store the cross-validation settings.

        Args:
            s_ids: Passed unchanged to ``get_adult_data``.
            n_epochs: Stored on the instance and in ``shared_kwargs``.
            batch_size: Stored on the instance and, as ``'minibatch_size'``,
                in ``shared_kwargs``.
            plain_batch_size: Stored on the instance.
            ftol: Stored on the instance and in ``shared_kwargs``.
            batch_fairness: Stored on the instance and in ``shared_kwargs``.
            l_fair_logspace: Positional arguments for ``np.logspace``; the
                resulting array becomes ``self.penalties``.
            cv: Number of splits. ``KFold(cv)`` is created when ``cv > 1``,
                otherwise ``self.cv`` is ``None``.
            csv_fn: Stored on the instance.
            models_fn: Stored on the instance.
            pred_fn: Stored on the instance.

        The list defaults are not mutated in this method.
        """
        # Load data based on inputs
        self.s, x_train, y_train, self.x_test, y_test = get_adult_data(s_ids)

        # Convert the dataframes into PyTorch variables. `.values` replaces
        # `.as_matrix()`, which pandas deprecated in 0.23 and removed in 1.0.
        x = Variable(torch.from_numpy(x_train.values))
        y = Variable(torch.from_numpy(y_train.values).long())
        xt = Variable(torch.from_numpy(self.x_test.values))

        # Move to the GPU when one is available
        if torch.cuda.is_available():
            x = x.cuda()
            y = y.cuda()
            xt = xt.cuda()

        # Always set these attributes: previously they were only assigned
        # inside the CUDA branch, so they did not exist on CPU-only machines.
        self.x = x
        self.y = y
        self.xt = xt

        # Add the true values to x_test for later analysis
        self.x_test['Y True'] = y_test

        # K-fold Cross Validation for FairLogReg fairness search
        self.shared_kwargs = {
            'ftol': ftol,
            'n_epochs': n_epochs,
            'minibatch_size': batch_size,
            'batch_fairness': batch_fairness,
        }

        self.df_template = {
            'Type': [],
            'MSE': [],
            'Score': [],
            'Group Penalty': [],
            'Individual Penalty': [],
            'ID_String': [],
            'l_fair': None,
        }
        self.penalties = np.logspace(*l_fair_logspace)
        self.splits = cv
        # A single split means no cross validation, so no KFold is built
        self.cv = KFold(cv) if cv > 1 else None

        self.models = {}

        self.batch_size = batch_size
        self.plain_batch_size = plain_batch_size
        self.ftol = ftol
        self.n_epochs = n_epochs
        self.batch_fairness = batch_fairness
        self.csv_fn = csv_fn
        self.models_fn = models_fn
        self.pred_fn = pred_fn