Example no. 1
0
    def adjust(self, dataset: DataTuple) -> DataTuple:
        """Take a datatuple and make the labels [0, 1].

        Records the original label values in ``self.min_val`` and
        ``self.max_val`` so the mapping can later be inverted (see ``post``).

        :param dataset: DataTuple whose ``y`` column holds exactly two
            distinct label values.
        :returns: a new DataTuple with the smaller label mapped to 0 and the
            larger one mapped to 1.
        """
        y_col = dataset.y.columns[0]
        assert dataset.y[y_col].nunique() == 2

        # make a copy of the labels so the caller's dataframe is not mutated
        dataset = dataset.replace(y=dataset.y.copy())

        self.min_val = dataset.y.to_numpy().min().item()
        self.max_val = dataset.y.to_numpy().max().item()

        # Replace both values in a single pass. Two sequential `replace`
        # calls would be wrong whenever max_val == 0 (e.g. labels {-1, 0}):
        # the first call maps -1 -> 0 and the second would then map *all*
        # entries to 1.
        dataset.y[y_col] = dataset.y[y_col].replace(
            {self.min_val: 0, self.max_val: 1})

        return DataTuple(x=dataset.x,
                         s=dataset.s,
                         y=dataset.y,
                         name=dataset.name)
Example no. 2
0
def bin_cont_feats(data: DataTuple) -> DataTuple:
    """Bin the continuous fetures.

    Given a datatuple, bin the columns that have ordinal features
    and return as afresh new DataTuple.
    """
    groups: List[List[str]] = [
        list(group)
        for _, group in groupby(data.x.columns, lambda x: x.split("_")[0])
    ]

    copy: pd.DataFrame = data.x.copy()

    for group in groups:
        # if there is only one element in the group, then it corresponds to a continuous feature
        if len(group) == 1 and data.x[group[0]].nunique() > 2:
            copy[group] = pd.cut(data.x[group].to_numpy()[:, 0], 5)
            copy = pd.concat([copy, pd.get_dummies(copy[group])],
                             axis="columns")
            copy = copy.drop(group, axis="columns")

    return data.replace(x=copy)
Example no. 3
0
 def post(self, dataset: DataTuple) -> DataTuple:
     """Inverse of adjust."""
     label_col = dataset.y.columns[0]
     restored = self.post_only_labels(dataset.y[label_col])
     restored_frame = pd.DataFrame(restored, columns=[label_col])
     return dataset.replace(y=restored_frame)
Example no. 4
0
def run(
    args: TuningLrArgs,
    debiasing_args: Union[None, DPFlags, EOFlags],
    train: DataTuple,
    test: TestTuple,
    device,
    use_cuda: bool,
) -> Tuple[np.ndarray, Optional[np.ndarray]]:
    """Train a single linear layer (optionally with debiasing) and predict on `test`.

    Seeds all RNG sources, optionally appends the sensitive attribute to the
    features, derives any missing biased-acceptance rates from the training
    data, fits an `nn.Linear` model, and returns its predictions.

    Args:
        args: run configuration (seed, batch size, learning rate, epochs,
            `use_s`, `use_sgd`, weight decay).
        debiasing_args: `None` for plain training; otherwise `DPFlags` or
            `EOFlags` selecting the debiasing scheme.
        train: training data with features `x`, sensitive attribute `s`,
            and labels `y`.
        test: test data; only its features (and optionally `s`) are used.
        device: device the model is moved to (presumably a `torch.device`
            or device string — TODO confirm against callers).
        use_cuda: if True, also seed the CUDA RNGs and force cuDNN into
            deterministic mode.

    Returns:
        Tuple of (predictions as a numpy array, label likelihood produced
        by the debiasing-parameter computation, or `None` when no
        debiasing is requested).
    """
    # Seed every source of randomness so runs are reproducible.
    np.random.seed(args.seed)  # cpu vars
    torch.manual_seed(args.seed)  # cpu  vars
    random.seed(args.seed)  # Python
    if use_cuda:
        torch.cuda.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)  # gpu vars
        torch.backends.cudnn.deterministic = True  # needed
        torch.backends.cudnn.benchmark = False

    in_dim = train.x.shape[1]
    if args.use_s:
        # Expose the sensitive attribute to the model as one extra feature.
        train = train.replace(x=pd.concat([train.x, train.s], axis="columns"))
        test = test.replace(x=pd.concat([test.x, test.s], axis="columns"))
        in_dim += 1
    train_ds = CustomDataset(train)
    test_ds = TestDataset(test)
    train_dl = DataLoader(train_ds,
                          batch_size=args.batch_size,
                          pin_memory=True,
                          shuffle=True)
    test_dl = DataLoader(test_ds, batch_size=10000, pin_memory=True)

    debiasing_params = None
    label_likelihood = None
    if debiasing_args is not None:
        # Fill in any missing biased-acceptance rate with the empirical
        # mean label of the corresponding sensitive group (s == 0 / s == 1)
        # in the training data.
        if debiasing_args.biased_acceptance_s0 is None:
            biased_acceptance_s0 = float(train.y[train.y.columns[0]].loc[
                train.s[train.s.columns[0]] == 0].mean())
            debiasing_args = debiasing_args._replace(
                biased_acceptance_s0=biased_acceptance_s0)
        if debiasing_args.biased_acceptance_s1 is None:
            biased_acceptance_s1 = float(train.y[train.y.columns[0]].loc[
                train.s[train.s.columns[0]] == 1].mean())
            debiasing_args = debiasing_args._replace(
                biased_acceptance_s1=biased_acceptance_s1)
        # print(debiasing_args)
        # DPFlags selects the target-rate computation; any other flag type
        # (EOFlags) selects the target-TPR computation.
        if isinstance(debiasing_args, DPFlags):
            debiasing_params, label_likelihood = debiasing_params_target_rate(
                debiasing_args)
        else:
            debiasing_params, label_likelihood = debiasing_params_target_tpr(
                debiasing_args)

    # A single linear layer producing one output logit per sample.
    model = nn.Linear(in_dim, 1)
    model.to(device)
    optimizer: Optimizer
    if args.use_sgd:
        optimizer = SGD(model.parameters(),
                        lr=args.learning_rate,
                        weight_decay=args.weight_decay)
    else:
        optimizer = RAdam(model.parameters(),
                          lr=args.learning_rate,
                          weight_decay=args.weight_decay)
    fit(
        model=model,
        train_data=train_dl,
        optimizer=optimizer,
        epochs=args.epochs,
        device=device,
        debiasing_params=debiasing_params,
        # lr_milestones=dict(milestones=[30, 60, 90, 120], gamma=0.3),
    )
    predictions = predict_dataset(model, test_dl, device)
    return predictions.cpu().numpy(), label_likelihood