def run_fairness_regression(s_id, writer_name, **kwargs):
    """Fit a FairLogisticRegression on the adult data and print its test MSE.

    Args:
        s_id: Forwarded unchanged to ``get_adult_data``.
        writer_name: Used as both ``log_dir`` and ``comment`` of the
            ``SummaryWriter`` handed to ``fit``.
        **kwargs: Forwarded to the ``FairLogisticRegression`` constructor.

    Returns:
        None. The mean squared error on the test split is printed.
    """
    writer = SummaryWriter(log_dir=writer_name, comment=writer_name)
    try:
        # Import data as pandas objects.
        s, x_train, y_train, x_test, y_test = get_adult_data(s_id)

        # `.values` replaces `as_matrix()`, which was deprecated in
        # pandas 0.23 and removed in pandas 1.0.
        x_train = Variable(torch.from_numpy(x_train.values))
        y_train = Variable(torch.from_numpy(y_train.values).long())
        x_test = Variable(torch.from_numpy(x_test.values))

        if torch.cuda.is_available():
            x_train = x_train.cuda()
            y_train = y_train.cuda()
            x_test = x_test.cuda()

        # y_test is only ever compared as a numpy array, so it is never
        # turned into a torch object.
        y_test = y_test.values

        # Instantiate and fit the model.
        flr = FairLogisticRegression(**kwargs)
        flr.fit(x_train, y_train, s, writer=writer)
    finally:
        # Release the writer even if loading or fitting fails.
        writer.close()

    # Predict on x_test, then bring the result back to a numpy array.
    y_pred = flr.predict(x_test).data.cpu().numpy()

    mse = mean_squared_error(y_test, y_pred)
    print('MSE: {}'.format(mse))
def get_adult_test_data(s_ids):
    """Return ``s`` plus the first two outputs of splitting the training data.

    The training split returned by ``get_adult_data`` is converted to torch
    Variables (moved to the GPU when one is available) and passed through
    ``split_data(x_train, y_train, 0.5)``; only the first two results of
    that call are returned.

    Args:
        s_ids: Forwarded unchanged to ``get_adult_data``.

    Returns:
        Tuple ``(s, x, y)`` where ``s`` comes straight from
        ``get_adult_data`` and ``x``, ``y`` are the first two values
        returned by ``split_data``.
    """
    # The test split is not needed here, so it is neither converted nor
    # copied to the GPU.
    s, x_train, y_train, _, _ = get_adult_data(s_ids)

    # `.values` replaces `as_matrix()`, which was deprecated in
    # pandas 0.23 and removed in pandas 1.0.
    x_train = Variable(torch.from_numpy(x_train.values))
    y_train = Variable(torch.from_numpy(y_train.values).long())

    if torch.cuda.is_available():
        x_train = x_train.cuda()
        y_train = y_train.cuda()

    # NOTE(review): 0.5 is presumably the split fraction -- confirm
    # against split_data.
    x, y, _, _ = split_data(x_train, y_train, 0.5)

    return s, x, y
def update_df(json_fn, pkl_fn, csv_fn):
    """Add a 'Novel Penalty' column to a results CSV and write a new file.

    For every row of the CSV, the model stored under ``row.ID_String`` in
    the pickled model dictionary is evaluated with
    ``fairness_penalty(..., penalty_type='novel')`` over the test split in
    minibatches, and the per-batch values are summed.

    Args:
        json_fn: Path to a JSON file providing the keys ``'s_ids'`` and
            ``'batch_size'``.
        pkl_fn: Path to a pickle holding a mapping from ID string to model.
        csv_fn: Path to the results CSV; it must have an ``ID_String``
            column.

    Returns:
        None. The augmented frame is written next to ``csv_fn`` with
        ``_updated.csv`` replacing the original extension.
    """
    import os

    with open(json_fn, 'r') as json_file:
        my_kws = json.load(json_file)

    # NOTE(review): pickle.load can execute arbitrary code; only use this
    # on model files produced by a trusted run.
    with open(pkl_fn, 'rb') as pickle_file:
        models = pickle.load(pickle_file)

    df = pd.read_csv(csv_fn)

    # Load and convert the data. `.values` replaces `as_matrix()`, which
    # was deprecated in pandas 0.23 and removed in pandas 1.0.
    s, _, _, x_test, y_test = get_adult_data(my_kws['s_ids'])
    x_test = torch.from_numpy(x_test.values)
    y_test = torch.from_numpy(y_test.values).long()

    # Create a dataloader to iterate over minibatches (kept on the CPU).
    ds = TensorDataset(x_test, y_test)
    loader = DataLoader(ds, batch_size=my_kws['batch_size'], shuffle=False)

    use_cuda = torch.cuda.is_available()

    novel_penalties = []
    for _, row in df.iterrows():
        # Get the current model.
        current_model = models[row.ID_String]
        if use_cuda:
            current_model.model = current_model.model.cuda()

        # Accumulate the penalty across batches.
        penalty = 0.
        for data in loader:
            inputs, labels = to_Variables(*data)
            penalty += current_model.fairness_penalty(
                inputs, labels, inputs, labels, s, penalty_type='novel')

        if isinstance(penalty, float):
            # The loader produced no batches, so nothing was accumulated.
            value = penalty
        else:
            # Flattening first handles both 1-element and 0-dim results;
            # indexing a 0-dim tensor directly raises in newer torch.
            value = float(penalty.data.view(-1)[0])
        novel_penalties.append(value)

    # Add the complete column to the dataframe and write it to a new file.
    df['Novel Penalty'] = novel_penalties
    root, _ = os.path.splitext(csv_fn)
    df.to_csv(root + '_updated.csv')
# Open the models and pick one or more to show the more fair version of the plots in the presentation import sys sys.path.append('../fair_regression') import torch import pickle import pandas as pd from torch.autograd import Variable from DataPreprocessing import get_adult_data s, _, _1, x_test, y_test = get_adult_data(['Sex_Female']) with open('../fair_regression/save_temporary_models.pkl', 'rb') as f: model_dict = pickle.load(f) torch_xt = Variable(torch.from_numpy(x_test.as_matrix())) if torch.cuda.is_available(): torch_xt = torch_xt.cuda() # Pick out models from fold 1 (doesn't really matter which one) model_str = 'fold: 1 pen: {:0.5g} type: {}' model_types = ['plain', 'indiv', 'group'] penalties = [1e-3, 1e-2, 1e-1] x_test['True Y'] = y_test new_df = pd.DataFrame() for t in model_types: for p in penalties:
def __init__(self, s_ids=None, n_epochs=256, batch_size=512,
             plain_batch_size=32, ftol=1e-6, batch_fairness=True,
             l_fair_logspace=None, cv=3,
             csv_fn='save_temporary_results.csv',
             models_fn='save_temporary_models.pkl',
             pred_fn='save_temporary_preds.csv'):
    """Load the adult data and store the settings for the fairness search.

    Args:
        s_ids: Forwarded to ``get_adult_data``. Defaults to
            ``['Sex_Female']``.
        n_epochs: Stored on the instance and in ``shared_kwargs``.
        batch_size: Stored on the instance and, as ``'minibatch_size'``,
            in ``shared_kwargs``.
        plain_batch_size: Stored on the instance.
        ftol: Stored on the instance and in ``shared_kwargs``.
        batch_fairness: Stored on the instance and in ``shared_kwargs``.
        l_fair_logspace: Positional arguments for ``np.logspace`` used to
            build ``self.penalties``. Defaults to ``[-6, 2, 4]``.
        cv: Number of splits; a ``KFold(cv)`` is created only when
            ``cv > 1``, otherwise ``self.cv`` is ``None``.
        csv_fn: Stored on the instance as ``self.csv_fn``.
        models_fn: Stored on the instance as ``self.models_fn``.
        pred_fn: Stored on the instance as ``self.pred_fn``.
    """
    # Build fresh lists per call instead of sharing mutable defaults.
    if s_ids is None:
        s_ids = ['Sex_Female']
    if l_fair_logspace is None:
        l_fair_logspace = [-6, 2, 4]

    # Load data based on inputs.
    self.s, x_train, y_train, self.x_test, y_test = get_adult_data(s_ids)

    # Convert the dataframes into PyTorch variables. `.values` replaces
    # `as_matrix()`, which was deprecated in pandas 0.23 and removed in
    # pandas 1.0.
    x = Variable(torch.from_numpy(x_train.values))
    y = Variable(torch.from_numpy(y_train.values).long())
    xt = Variable(torch.from_numpy(self.x_test.values))

    # Always set the attributes; previously they were only assigned when
    # CUDA was available, leaving CPU-only instances without x / y / xt.
    if torch.cuda.is_available():
        x = x.cuda()
        y = y.cuda()
        xt = xt.cuda()
    self.x = x
    self.y = y
    self.xt = xt

    # Add the true values to x_test for later analysis. This happens
    # after `xt` was built, so `xt` does not contain this column.
    self.x_test['Y True'] = y_test

    # K-fold cross validation for the FairLogReg fairness search.
    self.shared_kwargs = {
        'ftol': ftol,
        'n_epochs': n_epochs,
        'minibatch_size': batch_size,
        'batch_fairness': batch_fairness,
    }

    self.df_template = {
        'Type': [],
        'MSE': [],
        'Score': [],
        'Group Penalty': [],
        'Individual Penalty': [],
        'ID_String': [],
        'l_fair': None,
    }

    self.penalties = np.logspace(*l_fair_logspace)
    self.splits = cv
    self.cv = KFold(cv) if cv > 1 else None

    self.models = {}

    self.batch_size = batch_size
    self.plain_batch_size = plain_batch_size
    self.ftol = ftol
    self.n_epochs = n_epochs
    self.batch_fairness = batch_fairness
    self.csv_fn = csv_fn
    self.models_fn = models_fn
    self.pred_fn = pred_fn