def adjust(self, dataset: DataTuple) -> DataTuple:
    """Take a datatuple and make the labels {0, 1}."""
    y_col = dataset.y.columns[0]
    assert dataset.y[y_col].nunique() == 2

    # make a copy of the dataset so that the input is not modified
    dataset = dataset.replace(y=dataset.y.copy())
    # remember the original label values so that `post` can restore them later
    self.min_val = dataset.y.to_numpy().min().item()
    self.max_val = dataset.y.to_numpy().max().item()
    dataset.y[y_col] = dataset.y[y_col].replace(self.min_val, 0)
    dataset.y[y_col] = dataset.y[y_col].replace(self.max_val, 1)
    return DataTuple(x=dataset.x, s=dataset.s, y=dataset.y, name=dataset.name)
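# Usage sketch for `adjust` (assumptions: the owner class is called
# `LabelBinarizer` here purely for illustration, and the raw labels are
# {-1, 1} — neither name nor encoding is confirmed by this module):
#
#     data = DataTuple(
#         x=pd.DataFrame({"feat": [0.1, 0.5]}),
#         s=pd.DataFrame({"s": [0, 1]}),
#         y=pd.DataFrame({"y": [-1, 1]}),
#         name="demo",
#     )
#     binarizer = LabelBinarizer()  # hypothetical owner class
#     adjusted = binarizer.adjust(data)  # adjusted.y["y"] now takes values {0, 1}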
def bin_cont_feats(data: DataTuple) -> DataTuple:
    """Bin the continuous features.

    Given a datatuple, bin the columns that hold continuous features and
    return them in a fresh DataTuple.
    """
    # group the columns by their prefix (the part before the first "_"),
    # so that one-hot encoded categoricals form one group each
    groups: List[List[str]] = [
        list(group) for _, group in groupby(data.x.columns, lambda x: x.split("_")[0])
    ]

    copy: pd.DataFrame = data.x.copy()
    for group in groups:
        # if there is only one element in the group, then it corresponds to a continuous feature
        if len(group) == 1 and data.x[group[0]].nunique() > 2:
            # bin into 5 equal-width intervals and one-hot encode the bins
            copy[group] = pd.cut(data.x[group].to_numpy()[:, 0], 5)
            copy = pd.concat([copy, pd.get_dummies(copy[group])], axis="columns")
            copy = copy.drop(group, axis="columns")
    return data.replace(x=copy)
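# Usage sketch for `bin_cont_feats` (assuming column names follow the
# "<prefix>_<category>" convention for one-hot groups, as the groupby key
# implies; the column names below are made up for illustration):
#
#     x = pd.DataFrame({
#         "age": [23, 45, 31, 60],            # continuous -> binned into 5 buckets
#         "workclass_Private": [1, 0, 1, 0],  # already one-hot -> left unchanged
#         "workclass_Public": [0, 1, 0, 1],
#     })
#     data = DataTuple(x=x, s=s_df, y=y_df, name="demo")
#     binned = bin_cont_feats(data)  # "age" replaced by 5 dummy columns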
def post(self, dataset: DataTuple) -> DataTuple:
    """Inverse of adjust."""
    y_col = dataset.y.columns[0]
    transformed_y = self.post_only_labels(dataset.y[y_col])
    return dataset.replace(y=pd.DataFrame(transformed_y, columns=[y_col]))
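# Round-trip sketch combining `adjust` and `post` (again assuming the
# hypothetical `LabelBinarizer` owner class from the sketch above):
#
#     binarizer = LabelBinarizer()
#     binary = binarizer.adjust(data)    # labels mapped to {0, 1}
#     restored = binarizer.post(binary)  # labels mapped back to the originals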
def run(
    args: TuningLrArgs,
    debiasing_args: Union[None, DPFlags, EOFlags],
    train: DataTuple,
    test: TestTuple,
    device: torch.device,
    use_cuda: bool,
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    # seed all RNGs for reproducibility
    np.random.seed(args.seed)  # numpy
    torch.manual_seed(args.seed)  # cpu vars
    random.seed(args.seed)  # Python
    if use_cuda:
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)  # gpu vars
        torch.backends.cudnn.deterministic = True  # needed
        torch.backends.cudnn.benchmark = False

    in_dim = train.x.shape[1]
    if args.use_s:
        # append the sensitive attribute to the input features
        train = train.replace(x=pd.concat([train.x, train.s], axis="columns"))
        test = test.replace(x=pd.concat([test.x, test.s], axis="columns"))
        in_dim += 1

    train_ds = CustomDataset(train)
    test_ds = TestDataset(test)
    train_dl = DataLoader(train_ds, batch_size=args.batch_size, pin_memory=True, shuffle=True)
    test_dl = DataLoader(test_ds, batch_size=10000, pin_memory=True)

    debiasing_params = None
    label_likelihood = None
    if debiasing_args is not None:
        # fall back to the acceptance rates observed in the training set
        # if no biased acceptance rates were given
        if debiasing_args.biased_acceptance_s0 is None:
            biased_acceptance_s0 = float(
                train.y[train.y.columns[0]].loc[train.s[train.s.columns[0]] == 0].mean()
            )
            debiasing_args = debiasing_args._replace(biased_acceptance_s0=biased_acceptance_s0)
        if debiasing_args.biased_acceptance_s1 is None:
            biased_acceptance_s1 = float(
                train.y[train.y.columns[0]].loc[train.s[train.s.columns[0]] == 1].mean()
            )
            debiasing_args = debiasing_args._replace(biased_acceptance_s1=biased_acceptance_s1)
        if isinstance(debiasing_args, DPFlags):
            debiasing_params, label_likelihood = debiasing_params_target_rate(debiasing_args)
        else:
            debiasing_params, label_likelihood = debiasing_params_target_tpr(debiasing_args)

    model = nn.Linear(in_dim, 1)
    model.to(device)
    optimizer: Optimizer
    if args.use_sgd:
        optimizer = SGD(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
    else:
        optimizer = RAdam(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)
    fit(
        model=model,
        train_data=train_dl,
        optimizer=optimizer,
        epochs=args.epochs,
        device=device,
        debiasing_params=debiasing_params,
        # lr_milestones=dict(milestones=[30, 60, 90, 120], gamma=0.3),
    )
    predictions = predict_dataset(model, test_dl, device)
    return predictions.cpu().numpy(), label_likelihood
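# Invocation sketch for `run` (the `TuningLrArgs` instance is assumed to carry
# the fields actually read above — seed, use_s, batch_size, use_sgd,
# learning_rate, weight_decay, epochs; the DPFlags arguments are elided
# because their signature is not shown in this module):
#
#     use_cuda = torch.cuda.is_available()
#     device = torch.device("cuda" if use_cuda else "cpu")
#     preds, label_likelihood = run(
#         args=args,                    # a TuningLrArgs instance
#         debiasing_args=DPFlags(...),  # or EOFlags(...), or None for no debiasing
#         train=train_data,
#         test=test_data,
#         device=device,
#         use_cuda=use_cuda,
#     )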