Example #1
0
    def __call__(
            self,
            data: DataTuple,
            split_id: int = 0
    ) -> Tuple[DataTuple, DataTuple, Dict[str, float]]:
        """Split ``data`` into a train part and a test part.

        Args:
            data: the data to split
            split_id: passed to ``self._get_seed`` to obtain the random seed of this split

        Returns:
            A tuple of the train part, the test part and a dict holding the seed that was used.
        """
        seed = self._get_seed(split_id)
        train_idx, test_idx = generate_proportional_split_indexes(
            data, train_percentage=self.train_percentage, random_seed=seed)

        def _subset(indexes, suffix: str) -> DataTuple:
            """Select the given rows of x, s and y and reset their index."""
            return DataTuple(
                x=data.x.iloc[indexes].reset_index(drop=True),
                s=data.s.iloc[indexes].reset_index(drop=True),
                y=data.y.iloc[indexes].reset_index(drop=True),
                name=f"{data.name} - {suffix}",
            )

        train = _subset(train_idx, "Train")
        test = _subset(test_idx, "Test")

        # sanity check: the two parts together are as long as the input
        assert len(data) == len(train) + len(test)

        return train, test, {"seed": seed}
    def fit(self, train: DataTuple) -> Tuple[PreAlgorithm, DataTuple]:
        """Fit the pre-processing model by running the external script in "fit" mode.

        The training data is written to a temporary ``.npz`` file, the command built by
        ``self._fit_script_command`` is run with ``--mode fit`` appended, and the
        transformed training data is loaded from ``transformed_train.npz`` afterwards.
        The model location is stored in ``self.model_path``.

        Args:
            train: training data

        Returns:
            a tuple of this algorithm instance and the transformed training data
        """
        self.model_path = self.model_dir / f"model_{self.name}.joblib"
        with TemporaryDirectory() as tmpdir:
            tmp_path = Path(tmpdir)
            # ================================ write data to files ================================
            train_path = tmp_path / "train.npz"
            train.to_npz(train_path)

            # ========================== generate commandline arguments ===========================
            transformed_train_path = tmp_path / "transformed_train.npz"
            cmd = self._fit_script_command(train_path, transformed_train_path, self.model_path)

            # ============================= run the generated command =============================
            self._call_script(cmd + ["--mode", "fit"])

            # ================================== load results =====================================
            # must happen before the temporary directory is removed
            transformed_train = DataTuple.from_npz(transformed_train_path)

        # prefix the name of the algorithm to the dataset name
        transformed_train = transformed_train.replace(
            name=None if train.name is None else f"{self.name}: {train.name}"
        )
        return self, transformed_train
Example #3
0
def main() -> None:
    """Run the VFAE model as a standalone program.

    The behaviour depends on ``args.mode``:

    * ``"run"``: load train and test data, train and transform in one go, save both results.
    * ``"fit"``: fit an encoder on the training data, save the transformed training data
      and dump the encoder to ``args.model``.
    * ``"transform"``: load the encoder from ``args.model`` and save the transformed data.

    Raises:
        RuntimeError: if ``args.mode`` is none of the modes above.
    """
    args = VfaeArgs(explicit_bool=True).parse_args()
    set_seed(args.seed)
    if args.mode == "run":
        assert args.train is not None
        assert args.new_train is not None
        assert args.test is not None
        assert args.new_test is not None
        train, test = load_data_from_flags(args)
        save_transformations(train_and_transform(train, test, args), args)
    elif args.mode == "fit":
        assert args.model is not None
        assert args.train is not None
        assert args.new_train is not None
        train = DataTuple.from_npz(Path(args.train))
        enc = fit(train, args)
        transformed_train = transform(enc, train, args)
        transformed_train.to_npz(Path(args.new_train))
        dump(enc, Path(args.model))
    elif args.mode == "transform":
        assert args.model is not None
        assert args.test is not None
        assert args.new_test is not None
        test = DataTuple.from_npz(Path(args.test))
        model = load(Path(args.model))
        transformed_test = transform(model, test, args)
        transformed_test.to_npz(Path(args.new_test))
    else:
        # fail loudly instead of silently doing nothing
        raise RuntimeError(f"Unknown mode: {args.mode}")
Example #4
0
def main() -> None:
    """Run the Beutel model as a standalone program.

    The behaviour depends on ``args.mode``:

    * ``"run"``: load train and test data, pass them to `train_and_transform` and save
      the result.
    * ``"fit"``: fit on the training data, save the transformed training data and dump
      the encoder to ``args.model``.
    * ``"transform"``: load the encoder from ``args.model`` and save the transformed data.

    Raises:
        RuntimeError: if ``args.mode`` is none of the modes above.
    """
    args = BeutelArgs().parse_args()
    if args.mode == "run":
        assert args.train is not None
        assert args.new_train is not None
        assert args.test is not None
        assert args.new_test is not None
        train, test = load_data_from_flags(args)
        save_transformations(train_and_transform(train, test, args), args)
    elif args.mode == "fit":
        assert args.model is not None
        assert args.train is not None
        assert args.new_train is not None
        train = DataTuple.from_npz(Path(args.train))
        transformed_train, enc = fit(train, args)
        transformed_train.to_npz(Path(args.new_train))
        dump(enc, Path(args.model))
    elif args.mode == "transform":
        assert args.model is not None
        assert args.test is not None
        assert args.new_test is not None
        test = DataTuple.from_npz(Path(args.test))
        model = load(Path(args.model))
        transformed_test = transform(test, model, args)
        transformed_test.to_npz(Path(args.new_test))
    else:
        # fail loudly instead of silently doing nothing
        raise RuntimeError(f"Unknown mode: {args.mode}")
Example #5
0
def domain_split(datatup: DataTuple,
                 tr_cond: str,
                 te_cond: str,
                 seed: int = 888) -> Tuple[DataTuple, DataTuple]:
    """Split a datatuple into a train and a test part based on two conditions.

    Args:
        datatup: DataTuple to split
        tr_cond: condition selecting the rows of the training domain
        te_cond: condition selecting the rows of the test domain
        seed: random state used when sampling rows from the training domain

    Returns:
        Tuple of (train, test). The test part holds every row that meets the test
        condition plus the rows of the training domain that were not sampled for
        the train part.
    """
    features = datatup.x

    tr_domain = dataset_from_cond(features, tr_cond)
    te_domain = dataset_from_cond(features, te_cond)

    total_rows = features.shape[0]
    # the two conditions together must account for every row
    assert tr_domain.shape[0] + te_domain.shape[0] == total_rows

    test_frac = te_domain.shape[0] / total_rows
    train_frac = 1 - test_frac

    # fraction of the training domain that is kept for the train part
    keep_frac = (1 - (test_frac * 2)) / train_frac

    kept = tr_domain.sample(frac=keep_frac, random_state=seed)
    held_out = tr_domain.drop(kept.index,
                              axis="index")  # type: ignore[arg-type]

    test_rows = pd.concat([held_out, te_domain], axis="index")

    def _pick(frame: pd.DataFrame, index: pd.Index) -> pd.DataFrame:
        """Select rows by position and reset the index of the result."""
        # NOTE(review): index labels are used as positions here, which presumably
        # assumes a default integer index on the input -- confirm
        return frame.iloc[index].reset_index(
            drop=True)  # type: ignore[call-overload]

    train_datatup = DataTuple(x=_pick(datatup.x, kept.index),
                              s=_pick(datatup.s, kept.index),
                              y=_pick(datatup.y, kept.index),
                              name=datatup.name)

    test_datatup = DataTuple(x=_pick(datatup.x, test_rows.index),
                             s=_pick(datatup.s, test_rows.index),
                             y=_pick(datatup.y, test_rows.index),
                             name=datatup.name)

    return train_datatup, test_datatup
Example #6
0
def fold_data(data: DataTuple,
              folds: int) -> Iterator[Tuple[DataTuple, DataTuple]]:
    """Yield consecutive (train, validation) splits of ``data``, one per fold.

    The rows are cut into ``folds`` contiguous blocks without shuffling. The first
    ``num_rows % folds`` blocks get one extra row. Each block serves once as the
    validation part while all remaining rows form the train part.

    Args:
        data: the data to split
        folds: number of folds

    Yields:
        Tuples of (train fold, validation fold).
    """
    num_rows = data.x.shape[0]
    indices: np.ndarray = np.arange(num_rows)

    fold_sizes: np.ndarray = np.full(folds, num_rows // folds, dtype=np.int32)
    fold_sizes[:num_rows % folds] += np.int32(1)

    def _rows(frame: pd.DataFrame, inds: np.ndarray) -> pd.DataFrame:
        """Select rows by position and reset the index of the result."""
        return frame.iloc[inds].reset_index(
            drop=True)  # type: ignore[call-overload]

    current = 0
    for i, fold_size in enumerate(fold_sizes):
        start, stop = current, int(current + fold_size)
        val_inds: np.ndarray = indices[start:stop]
        # everything outside [start, stop) is training data; joining the two slices
        # avoids a per-element membership test and keeps an integer dtype even when empty
        train_inds: np.ndarray = np.concatenate((indices[:start], indices[stop:]))

        train_x = _rows(data.x, train_inds)
        train_s = _rows(data.s, train_inds)
        train_y = _rows(data.y, train_inds)

        assert train_x.shape == (len(train_inds), data.x.shape[1])
        assert train_s.shape == (len(train_inds), data.s.shape[1])
        assert train_y.shape == (len(train_inds), data.y.shape[1])

        val_x = _rows(data.x, val_inds)
        val_s = _rows(data.s, val_inds)
        val_y = _rows(data.y, val_inds)

        assert val_x.shape == (len(val_inds), data.x.shape[1])
        assert val_s.shape == (len(val_inds), data.s.shape[1])
        assert val_y.shape == (len(val_inds), data.y.shape[1])

        yield DataTuple(x=train_x,
                        s=train_s,
                        y=train_y,
                        name=f"{data.name} - train fold {i}"), DataTuple(
                            x=val_x,
                            s=val_s,
                            y=val_y,
                            name=f"{data.name} - test fold {i}")

        current = stop
def train_and_transform(train: DataTuple, test: TestTuple,
                        flags: VfaeArgs) -> Tuple[DataTuple, TestTuple]:
    """Train a VFAE network on ``train`` and encode both ``train`` and ``test`` with it.

    Args:
        train: training data
        test: test data
        flags: settings; the dataset name, batch size, layer sizes, the supervised
            flag and the number of epochs are read from it

    Returns:
        Tuple of Encoded Train Dataset and Test Dataset.
    """
    dataset = get_dataset_obj_by_name(flags.dataset)()

    # wrap the data in loaders
    train_data = CustomDataset(train)
    test_data = TestDataset(test)
    train_loader = DataLoader(train_data, batch_size=flags.batch_size)
    test_loader = DataLoader(test_data, batch_size=flags.batch_size)

    # build the network and its optimizer
    network = VFAENetwork(
        dataset,
        flags.supervised,
        train_data.xdim,
        latent_dims=50,
        z1_enc_size=flags.z1_enc_size,
        z2_enc_size=flags.z2_enc_size,
        z1_dec_size=flags.z1_dec_size,
    )
    model = network.to("cpu")
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # training loop
    num_epochs = int(flags.epochs)
    for epoch in range(num_epochs):
        train_model(epoch, model, train_loader, optimizer, flags)

    def _encode(x_batch, s_batch) -> List[List[float]]:
        """Encode one batch to z1 and return it as nested lists."""
        mu, logvar = model.encode_z1(x_batch, s_batch)
        return model.reparameterize(mu, logvar).data.tolist()

    # encode both datasets with the trained model
    encoded_train: List[List[float]] = []
    encoded_test: List[List[float]] = []
    model.eval()
    with torch.no_grad():
        for x_batch, s_batch, _ in train_loader:
            encoded_train.extend(_encode(x_batch, s_batch))
        for x_batch, s_batch in test_loader:
            encoded_test.extend(_encode(x_batch, s_batch))

    new_train = DataTuple(x=pd.DataFrame(encoded_train),
                          s=train.s,
                          y=train.y,
                          name=f"VFAE: {train.name}")
    new_test = TestTuple(x=pd.DataFrame(encoded_test),
                         s=test.s,
                         name=f"VFAE: {test.name}")
    return new_train, new_test
Example #8
0
def transform(model: VFAENetwork, dataset: T, flags) -> T:
    """Encode ``dataset`` with the given model and return it with the encoding as features.

    The mean of the z1 encoding is used as the new representation; no sample is drawn
    from the latent distribution.

    Args:
        model: the network used for encoding
        dataset: a ``DataTuple`` or a ``TestTuple``
        flags: settings; only ``batch_size`` is read here

    Returns:
        A tuple of the same kind as ``dataset`` whose ``x`` holds the encoding and whose
        name is prefixed with ``"VFAE: "``.

    Raises:
        TypeError: if ``dataset`` is neither a ``DataTuple`` nor a ``TestTuple``.
    """
    # DataTuple is checked first, exactly as before, so labelled data takes this branch
    has_labels = isinstance(dataset, DataTuple)

    data: Union[CustomDataset, TestDataset]
    if has_labels:
        data = CustomDataset(dataset)
    elif isinstance(dataset, TestTuple):
        data = TestDataset(dataset)
    else:
        raise TypeError(
            f"dataset must be a DataTuple or a TestTuple, got {type(dataset).__name__}")
    loader = DataLoader(data, batch_size=flags.batch_size, shuffle=False)

    encoded: List[List[float]] = []
    model.eval()
    with torch.no_grad():
        for sample in loader:
            if has_labels:
                _x, _s, _ = sample
            else:
                _x, _s = sample
            z1_mu, _ = model.encode_z1(_x, _s)
            encoded += z1_mu.data.tolist()

    if has_labels:
        return DataTuple(x=pd.DataFrame(encoded),
                         s=dataset.s,
                         y=dataset.y,
                         name=f"VFAE: {dataset.name}")
    return TestTuple(x=pd.DataFrame(encoded),
                     s=dataset.s,
                     name=f"VFAE: {dataset.name}")
Example #9
0
def main():
    """Run the Agarwal model as a standalone program.

    Loads the train and test data from the paths in the parsed arguments, trains and
    predicts, and writes the hard predictions to ``args.predictions``.
    """
    args: AgarwalArgs = AgarwalArgs().parse_args()

    train = DataTuple.from_npz(Path(args.train))
    test = TestTuple.from_npz(Path(args.test))

    results = train_and_predict(train, test, args)
    predictions = Prediction(hard=results["preds"])
    predictions.to_npz(Path(args.predictions))
def train_and_transform(
    train: DataTuple, test: TestTuple, flags: ZemelArgs
) -> (Tuple[DataTuple, TestTuple]):
    """Train the Zemel model and return the transformed features of the train and test sets.

    The parameters are found by minimising ``LFR_optim_objective`` with L-BFGS-B using
    a numerically approximated gradient. The first ``flags.clusters`` entries of the
    solution are used as ``w`` and the remainder is reshaped to the prototypes.
    """
    np.random.seed(flags.seed)

    sens_col = train.s.columns[0]
    train_is_s0 = train.s[sens_col] == 0
    train_is_s1 = train.s[sens_col] == 1
    training_sensitive = train.x.loc[train_is_s0].to_numpy()
    training_nonsensitive = train.x.loc[train_is_s1].to_numpy()
    ytrain_sensitive = train.y.loc[train_is_s0].to_numpy()
    ytrain_nonsensitive = train.y.loc[train_is_s1].to_numpy()

    print_interval = 100
    verbose = False

    _, features_dim = train.x.shape
    num_clusters = flags.clusters
    num_prototype_params = features_dim * num_clusters

    # starting point: one value per cluster followed by the flattened prototypes
    x0 = np.random.uniform(size=num_clusters + num_prototype_params)
    # the per-cluster values are bounded to [0, 1], the prototypes are unbounded
    bounds = [(0, 1)] * num_clusters + [(None, None)] * num_prototype_params  # type: ignore[operator]
    LFR_optim_objective.steps = 0  # type: ignore[attr-defined]

    objective_args = (
        training_nonsensitive,
        training_sensitive,
        ytrain_nonsensitive[:, 0],
        ytrain_sensitive[:, 0],
        flags.clusters,
        flags.Ax,
        flags.Ay,
        flags.Az,
        print_interval,
        verbose,
    )
    result = optim.fmin_l_bfgs_b(
        LFR_optim_objective,
        x0=x0,
        epsilon=1e-5,
        args=objective_args,
        bounds=bounds,
        approx_grad=True,
        maxfun=flags.maxfun,
        maxiter=flags.max_iter,
        disp=verbose,
    )
    learned_model = result[0]
    w = learned_model[: flags.clusters]
    prototypes = learned_model[flags.clusters :].reshape((flags.clusters, features_dim))

    test_group = test.s[sens_col]
    testing_sensitive = test.x.loc[test_group == 0].to_numpy()
    testing_nonsensitive = test.x.loc[test_group == 1].to_numpy()

    train_transformed = trans(prototypes, w, training_nonsensitive, training_sensitive, train)
    test_transformed = trans(prototypes, w, testing_nonsensitive, testing_sensitive, test)

    new_train = DataTuple(x=train_transformed, s=train.s, y=train.y, name=train.name)
    new_test = TestTuple(x=test_transformed, s=test.s, name=test.name)
    return new_train, new_test
Example #11
0
 def __init__(self, data: DataTuple):
     """Store the features, sensitive attributes and labels of ``data``.

     Everything except the labels is taken from ``_get_info`` applied to the data
     without ``y``; the labels are stored as a float32 array.
     """
     super().__init__()
     features_only = data.remove_y()
     info = _get_info(features_only)
     self.x, self.s, self.num, self.xdim, self.sdim, self.x_names, self.s_names = info

     labels = data.y
     self.y = labels.to_numpy(dtype=np.float32)
     self.ydim = labels.shape[1]
     self.y_names = labels.columns
Example #12
0
def query_dt(datatup: DataTuple, query_str: str) -> DataTuple:
    """Apply the query ``query_str`` to the joined data frame of ``datatup``.

    The filtering itself is delegated to ``dataset_from_cond``.
    """
    assert isinstance(query_str, str)
    assert isinstance(datatup, DataTuple)

    return datatup.apply_to_joined_df(
        lambda joined: dataset_from_cond(joined, cond=query_str))
Example #13
0
    def __call__(
            self,
            data: DataTuple,
            split_id: int = 0
    ) -> Tuple[DataTuple, DataTuple, Dict[str, float]]:
        """Split ``data`` in order: the leading rows become train, the rest test.

        Args:
            data: the data to split
            split_id: ignored

        Returns:
            A tuple of the train part, the test part and an empty info dict.
        """
        del split_id
        num_train = round(self.train_percentage * len(data))

        def _head(df):
            return df.iloc[:num_train].reset_index(drop=True)

        def _tail(df):
            return df.iloc[num_train:].reset_index(drop=True)

        train = data.apply_to_joined_df(_head).replace(name=f"{data.name} - Train")
        test = data.apply_to_joined_df(_tail).replace(name=f"{data.name} - Test")

        # sanity check: the two parts together are as long as the input
        assert len(train) + len(test) == len(data)
        return train, test, {}
Example #14
0
def transform(data: T, prototypes: np.ndarray, w: np.ndarray) -> T:
    """Transform the features of ``data`` with the given prototypes and ``w``.

    The rows are divided by the value (0 or 1) of the first column of ``data.s`` and
    handed to ``trans``; the result replaces ``x`` in a new tuple of the same kind.
    """
    group = data.s[data.s.columns[0]]
    rows_s0 = data.x.loc[group == 0].to_numpy()
    rows_s1 = data.x.loc[group == 1].to_numpy()
    new_x = trans(prototypes, w, rows_s1, rows_s0, data)

    if isinstance(data, DataTuple):
        return DataTuple(x=new_x, s=data.s, y=data.y, name=data.name)
    if isinstance(data, TestTuple):
        return TestTuple(x=new_x, s=data.s, name=data.name)
Example #15
0
    def fit(self: _IA, train: DataTuple) -> _IA:
        """Fit the algorithm on the given data by running the external script.

        The training data is written to a temporary ``.npz`` file and the command built
        by ``self._fit_script_command`` is run with ``--mode fit`` appended. The model
        location is stored in ``self.model_path``.

        Args:
            train: training data

        Returns:
            this algorithm instance
        """
        self.model_path = self.model_dir / f"model_{self.name}.joblib"
        with TemporaryDirectory() as tmpdir:
            train_path = Path(tmpdir) / "train.npz"
            train.to_npz(train_path)
            fit_cmd = self._fit_script_command(train_path, self.model_path)
            # wait for script to run
            self._call_script(fit_cmd + ["--mode", "fit"])
        return self
Example #16
0
def main() -> None:
    """Run the Agarwal model as a standalone program.

    Supports three modes: ``"run"`` (train and predict in one go), ``"fit"`` (train and
    store the model) and ``"predict"`` (load a stored model and predict).

    Raises:
        RuntimeError: if cloudpickle cannot be imported or the mode is unknown.
    """
    args: AgarwalArgs = AgarwalArgs().parse_args()
    random.seed(args.seed)
    np.random.seed(args.seed)
    try:
        # Need to install cloudpickle for now. See https://github.com/fairlearn/fairlearn/issues/569
        import cloudpickle
    except ImportError as e:
        raise RuntimeError(
            "In order to use Agarwal, install fairlearn and cloudpickle."
        ) from e

    mode = args.mode

    if mode == "run":
        assert args.train is not None
        assert args.test is not None
        assert args.predictions is not None
        train = DataTuple.from_npz(Path(args.train))
        test = TestTuple.from_npz(Path(args.test))
        results = train_and_predict(train, test, args)
        Prediction(hard=results["preds"]).to_npz(Path(args.predictions))
        return

    if mode == "fit":
        assert args.train is not None
        assert args.model is not None
        train_data = DataTuple.from_npz(Path(args.train))
        fitted = fit(train_data, args)
        with working_dir(Path(args.model)):
            pickled = cloudpickle.dumps(fitted)
        dump(pickled, Path(args.model))
        return

    if mode == "predict":
        assert args.model is not None
        assert args.predictions is not None
        assert args.test is not None
        test_data = TestTuple.from_npz(Path(args.test))
        pickled = load(Path(args.model))
        # NOTE: unpickling can execute arbitrary code; only load trusted model files
        with working_dir(Path(args.model)):
            fitted = cloudpickle.loads(pickled)
        results = predict(fitted, test_data)
        Prediction(hard=results["preds"]).to_npz(Path(args.predictions))
        return

    raise RuntimeError(f"Unknown mode: {args.mode}")
    async def run_async(self, train: DataTuple, test: TestTuple) -> Prediction:
        """Run the algorithm on the given data asynchronously via the external script.

        Train and test data are written to temporary ``.npz`` files, the command built
        by ``self._script_command`` is awaited, and the predictions are loaded from
        ``predictions.npz``.

        Args:
            train: training data
            test: test data

        Returns:
            predictions
        """
        with TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)
            train_path, test_path, pred_path = (
                workdir / file_name
                for file_name in ("train.npz", "test.npz", "predictions.npz")
            )
            train.to_npz(train_path)
            test.to_npz(test_path)
            command = self._script_command(train_path, test_path, pred_path)
            # wait for the script to finish
            await self._call_script(command)
            # load while the temporary directory still exists
            predictions = Prediction.from_npz(pred_path)
        return predictions
Example #18
0
    def adjust(self, dataset: DataTuple) -> DataTuple:
        """Take a datatuple and make the labels [0,1].

        The smallest label value is mapped to 0 and the largest to 1. Both original
        values are remembered in ``self.min_val`` and ``self.max_val``. The input is
        not modified; a new DataTuple with a copied ``y`` is returned.

        Args:
            dataset: data whose first label column holds exactly two distinct values

        Returns:
            A DataTuple with the same ``x``, ``s`` and ``name`` and relabelled ``y``.
        """
        y_col = dataset.y.columns[0]
        labels = dataset.y[y_col]
        assert labels.nunique() == 2

        self.min_val = dataset.y.to_numpy().min().item()
        self.max_val = dataset.y.to_numpy().max().item()

        # Both masks are computed from the ORIGINAL labels. Replacing sequentially
        # (min -> 0, then max -> 1) goes wrong when max_val is 0: the freshly written
        # zeros would be turned into ones by the second replacement.
        is_min = labels == self.min_val
        is_max = labels == self.max_val

        # work on a copy so the caller's data stays untouched
        new_y = dataset.y.copy()
        new_y[y_col] = labels.mask(is_min, 0).mask(is_max, 1)

        return DataTuple(x=dataset.x,
                         s=dataset.s,
                         y=new_y,
                         name=dataset.name)
Example #19
0
    def run(self, train: DataTuple, test: TestTuple) -> pd.DataFrame:
        """Train a linear model on ``train`` and predict on ``test``.

        All random number generators are seeded with a fixed value first. If
        ``self.use_s`` is set, the sensitive attributes are appended to the features.
        If ``self.fair`` is set, the debiasing parameters are computed before training;
        acceptance rates that are not given are estimated from the training labels.

        Args:
            train: training data
            test: test data

        Returns:
            A data frame with a single column ``"preds"`` holding the predictions.
        """
        seed = 42
        np.random.seed(seed)  # cpu vars
        torch.manual_seed(seed)  # cpu  vars
        random.seed(seed)  # Python

        if self.use_s:
            train = train.make_copy_with(x=pd.concat([train.x, train.s], axis="columns"))
            test = test.make_copy_with(x=pd.concat([test.x, test.s], axis="columns"))
        # read the input size AFTER the optional concatenation so that it is also
        # right when `s` has more than one column
        in_dim = train.x.shape[1]

        train_loader = DataLoader(
            CustomDataset(train), batch_size=self.batch_size, pin_memory=True, shuffle=True
        )
        test_loader = DataLoader(TestDataset(test), batch_size=10000, pin_memory=True)

        if self.fair:
            debiasing_args = self.debiasing_args
            labels = train.y[train.y.columns[0]]
            sens = train.s[train.s.columns[0]]
            if debiasing_args.biased_acceptance_s0 is None:
                # fall back to the mean label of the group with s == 0
                biased_acceptance_s0 = float(labels.loc[sens == 0].mean())
                debiasing_args = debiasing_args._replace(biased_acceptance_s0=biased_acceptance_s0)
            if debiasing_args.biased_acceptance_s1 is None:
                # fall back to the mean label of the group with s == 1
                biased_acceptance_s1 = float(labels.loc[sens == 1].mean())
                debiasing_args = debiasing_args._replace(biased_acceptance_s1=biased_acceptance_s1)
            if isinstance(debiasing_args, DPFlags):
                self.debiasing_params = debiasing_params_target_rate(debiasing_args)
            else:
                self.debiasing_params = debiasing_params_target_tpr(debiasing_args)

        model = nn.Linear(in_dim, 1)
        model.to(self.device)
        optimizer: Optimizer
        if self.use_sgd:
            optimizer = SGD(
                model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay
            )
        else:
            optimizer = RAdam(
                model.parameters(), lr=self.learning_rate, weight_decay=self.weight_decay
            )
        self._fit(
            model=model,
            train_data=train_loader,
            optimizer=optimizer,
        )
        predictions = self.predict_dataset(model, test_loader)
        return pd.DataFrame(predictions.numpy(), columns=["preds"])
def scale_continuous(dataset: Dataset,
                     datatuple: DataTuple,
                     scaler: ScalerType,
                     inverse: bool = False) -> Tuple[DataTuple, ScalerType]:
    """Use a scaler on just the continuous features.

    Args:
        dataset: Dataset object; its ``continuous_features`` select the columns to scale.
        datatuple: DataTuple whose features are scaled. It is not modified.
        scaler: scaler object providing ``fit_transform`` and ``inverse_transform``.
        inverse: if ``True``, apply ``inverse_transform``; otherwise ``fit_transform``.

    Returns:
        Tuple of the DataTuple with scaled features (``s``, ``y`` and ``name`` are
        carried over) and the scaler.
    """
    # all features are cast to float64 on a copy before scaling
    new_feats = datatuple.x.copy().astype('float64')
    cont_cols = dataset.continuous_features
    if inverse:
        new_feats[cont_cols] = scaler.inverse_transform(new_feats[cont_cols])
    else:
        new_feats[cont_cols] = scaler.fit_transform(new_feats[cont_cols])
    # keep the name so that the scaled data can still be identified
    scaled = DataTuple(x=new_feats, s=datatuple.s, y=datatuple.y, name=datatuple.name)
    return scaled, scaler
    async def run_async(self, train: DataTuple,
                        test: TestTuple) -> Tuple[DataTuple, TestTuple]:
        """Generate fair features with the given data asynchronously.

        Both datasets are written to temporary ``.npz`` files, the command built by
        ``self._script_command`` is awaited, and the transformed datasets are loaded
        from the output files named in that command.

        Args:
            train: training data
            test: test data

        Returns:
            a tuple of the pre-processed training data and the test data
        """
        with TemporaryDirectory() as tmpdir:
            workdir = Path(tmpdir)

            # input files
            train_path = workdir / "train.npz"
            test_path = workdir / "test.npz"
            train.to_npz(train_path)
            test.to_npz(test_path)

            # output files and the command producing them
            new_train_path = workdir / "transformed_train.npz"
            new_test_path = workdir / "transformed_test.npz"
            command = self._script_command(
                train_path, test_path, new_train_path, new_test_path)

            await self._call_script(command)

            # load while the temporary directory still exists
            new_train = DataTuple.from_npz(new_train_path)
            new_test = TestTuple.from_npz(new_test_path)

        # prefix the name of the algorithm to the dataset name
        train_name = None if train.name is None else f"{self.name}: {train.name}"
        test_name = None if test.name is None else f"{self.name}: {test.name}"
        return new_train.replace(name=train_name), new_test.replace(name=test_name)
Example #22
0
def encode_dataset(enc: nn.Module, dataloader: torch.utils.data.DataLoader,
                   datatuple: DataTuple) -> DataTuple:
    """Encode every batch of ``dataloader`` with ``enc`` and return the result as a DataTuple.

    ``s`` and ``y`` are taken from ``datatuple``; the name is prefixed with ``"Beutel: "``.
    """
    encoded_rows: List[Any] = []

    for batch in dataloader:
        # only the first element of each batch is fed to the encoder
        embedding, _, _ = batch
        encoded_rows.extend(enc(embedding).data.numpy().tolist())

    encoded_x = pd.DataFrame(encoded_rows)
    return DataTuple(
        x=encoded_x,
        s=datatuple.s,
        y=datatuple.y,
        name=f"Beutel: {datatuple.name}",
    )
Example #23
0
def metric_per_sensitive_attribute(
        prediction: Prediction,
        actual: DataTuple,
        metric: Metric,
        use_sens_name: bool = True) -> Dict[str, float]:
    """Compute a metric repeatedly on subsets of the data that share a sensitive attribute.

    Args:
        prediction: predictions for the rows of ``actual``
        actual: the data with the true labels
        metric: the metric to compute
        use_sens_name: if ``True``, result keys start with the name of the sensitive
            column, otherwise with ``"S"``

    Returns:
        A dict mapping ``"<prefix>_<value>"`` to the score on the rows where the
        sensitive column has that value.

    Raises:
        MetricNotApplicable: if the metric cannot be applied per sensitive attribute.
    """
    if not metric.apply_per_sensitive:
        raise MetricNotApplicable(
            f"Metric {metric.name} is not applicable per sensitive "
            f"attribute, apply to whole dataset instead")

    num_rows = actual.s.shape[0]
    assert num_rows == actual.x.shape[0]
    assert num_rows == actual.y.shape[0]
    assert prediction.hard.shape[0] == actual.y.shape[0]

    s_columns: List[str] = list(actual.s.columns)
    y_columns: List[str] = list(actual.y.columns)
    assert len(y_columns) == 1

    is_soft = isinstance(prediction, SoftPrediction)
    scores: Dict[str, float] = {}

    for y_col in y_columns:
        for s_col in s_columns:
            prefix = s_col if use_sens_name else "S"
            for unique_s in actual.s[s_col].unique():
                mask: pd.Series = actual.s[s_col] == unique_s

                # restrict features, sensitive attribute and labels to this group
                x_part = pd.DataFrame(
                    actual.x.loc[mask][actual.x.columns],
                    columns=actual.x.columns).reset_index(drop=True)
                s_part = pd.DataFrame(actual.s.loc[mask][s_col],
                                      columns=[s_col]).reset_index(drop=True)
                y_part = pd.DataFrame(actual.y.loc[mask][y_col],
                                      columns=[y_col]).reset_index(drop=True)
                subset = DataTuple(x=x_part, s=s_part, y=y_part, name=actual.name)

                # restrict the predictions to the same rows
                pred_y: Prediction
                if is_soft:
                    pred_y = SoftPrediction(
                        soft=prediction.soft.loc[mask].reset_index(drop=True),
                        info=prediction.info)
                else:
                    pred_y = Prediction(
                        hard=prediction.hard.loc[mask].reset_index(drop=True),
                        info=prediction.info)

                scores[prefix + "_" + str(unique_s)] = metric.score(pred_y, subset)

    return scores
Example #24
0
def scale_continuous(
    dataset: Dataset,
    datatuple: DataTuple,
    scaler: ScalerType,
    inverse: bool = False,
    fit: bool = True,
) -> Tuple[DataTuple, ScalerType]:
    """Use a scaler on just the continuous features.

    Args:
        dataset:
            Dataset object. Used to find the continuous features.
        datatuple:
            DataTuple on which to scale the continuous features. It is not modified.
        scaler:
            Scaler object to scale the features. Must fit the SKLearn scaler API.
        inverse:
            Should the scaling be reversed?
        fit:
            If not `inverse`, should the scaler be fit to the data? If `True`, do
            `fit_transform` operation, else just `transform`.

    Returns:
        Tuple of (scaled) DataTuple, and the Scaler (which may have been fit to the data).
        ``s``, ``y`` and ``name`` of the input are carried over.

    Examples:
        >>> dataset = adult()
        >>> datatuple = dataset.load()
        >>> train, test = train_test_split(datatuple)
        >>> train, scaler = scale_continuous(dataset, train, scaler)
        >>> test, scaler = scale_continuous(dataset, test, scaler, fit=False)
    """
    # all features are cast to float64 on a copy before scaling
    new_feats = datatuple.x.copy().astype('float64')
    cont_cols = dataset.continuous_features
    if inverse:
        new_feats[cont_cols] = scaler.inverse_transform(new_feats[cont_cols])
    elif fit:
        new_feats[cont_cols] = scaler.fit_transform(new_feats[cont_cols])
    else:
        new_feats[cont_cols] = scaler.transform(new_feats[cont_cols])
    # keep the name so that the scaled data can still be identified
    scaled = DataTuple(x=new_feats, s=datatuple.s, y=datatuple.y, name=datatuple.name)
    return scaled, scaler
Example #25
0
def main() -> None:
    """LFR Model.

    Learning fair representations is a pre-processing technique that finds a
    latent representation which encodes the data well but obfuscates information
    about protected attributes [2]_.

    The behaviour depends on ``args.mode``:

    * ``"run"``: load train and test data, train and transform in one go, save both results.
    * ``"fit"``: fit the model, save the transformed training data and dump the model.
    * ``"transform"``: load a dumped model and save the transformed data.

    Raises:
        RuntimeError: if ``args.mode`` is none of the modes above.

    References:
        .. [2] R. Zemel, Y. Wu, K. Swersky, T. Pitassi, and C. Dwork,  "Learning
           Fair Representations." International Conference on Machine Learning,
           2013.
    Based on code from https://github.com/zjelveh/learning-fair-representations
    Which in turn, we've got from AIF360
    """
    args = ZemelArgs()
    args.parse_args()
    if args.mode == "run":
        assert args.train is not None
        assert args.new_train is not None
        assert args.test is not None
        assert args.new_test is not None
        train, test = load_data_from_flags(args)
        save_transformations(train_and_transform(train, test, args), args)
    elif args.mode == "fit":
        assert args.model is not None
        assert args.train is not None
        assert args.new_train is not None
        train = DataTuple.from_npz(Path(args.train))
        model = fit(train, args)
        # same helper as in "transform" mode, so both modes encode data identically
        transformed_train = transform(train, model.prototypes, model.w)
        transformed_train.to_npz(Path(args.new_train))
        dump(model, Path(args.model))
    elif args.mode == "transform":
        assert args.model is not None
        assert args.test is not None
        assert args.new_test is not None
        test = DataTuple.from_npz(Path(args.test))
        model = load(Path(args.model))
        transformed_test = transform(test, model.prototypes, model.w)
        transformed_test.to_npz(Path(args.new_test))
    else:
        # fail loudly instead of silently doing nothing
        raise RuntimeError(f"Unknown mode: {args.mode}")
Example #26
0
def train_and_transform(train: DataTuple, test: TestTuple,
                        flags: ZemelArgs) -> (Tuple[DataTuple, TestTuple]):
    """Fit on the training split, then transform both splits.

    Args:
        train: training data; the first column of ``train.s`` is used as the
            sensitive attribute.
        test: test data, partitioned on that same sensitive column.
        flags: settings forwarded unchanged to ``fit``.

    Returns:
        The transformed training data (keeping ``s``, ``y`` and the name of
        ``train``) and the transformed test data (keeping ``s`` and the name
        of ``test``).
    """
    prototypes, w = fit(train, flags)
    sens_col = train.s.columns[0]

    def _partition(features, sens):
        """Return the feature rows where s == 1 and where s == 0, as arrays."""
        rows_s0 = features.loc[sens[sens_col] == 0].to_numpy()
        rows_s1 = features.loc[sens[sens_col] == 1].to_numpy()
        # ``trans`` takes the s == 1 rows first, then the s == 0 rows
        return rows_s1, rows_s0

    # partition both splits before transforming either of them
    train_parts = _partition(train.x, train.s)
    test_parts = _partition(test.x, test.s)

    new_train_x = trans(prototypes, w, *train_parts, train)
    new_test_x = trans(prototypes, w, *test_parts, test)

    transformed_train = DataTuple(x=new_train_x,
                                  s=train.s,
                                  y=train.y,
                                  name=train.name)
    transformed_test = TestTuple(x=new_test_x, s=test.s, name=test.name)
    return transformed_train, transformed_test
Example #27
0
def bin_cont_feats(data: DataTuple) -> DataTuple:
    """Bin the continuous features.

    Consecutive columns of ``data.x`` are grouped by the part of their name
    before the first underscore. A group made of a single column that holds
    more than two distinct values is treated as continuous: it is cut into
    5 bins with ``pd.cut`` and replaced by the dummy columns of those bins.
    All other columns are kept as they are.

    Returns:
        A new DataTuple (via ``data.replace``) whose ``x`` holds the binned
        features.
    """
    binned: pd.DataFrame = data.x.copy()

    for _, members in groupby(data.x.columns,
                              lambda name: name.split("_")[0]):
        group: List[str] = list(members)

        # several columns sharing a prefix, or a column with at most two
        # distinct values, is not a continuous feature -- leave it alone
        if len(group) != 1 or data.x[group[0]].nunique() <= 2:
            continue

        binned[group] = pd.cut(data.x[group].to_numpy()[:, 0], 5)
        dummies = pd.get_dummies(binned[group])
        binned = pd.concat([binned, dummies],
                           axis="columns").drop(group, axis="columns")

    return data.replace(x=binned)
Example #28
0
def main() -> None:
    """Run the FWD model as a standalone program on tabular data.

    The parsed ``DroArgs`` select one of three modes:

    * ``run``: load train and test data, train and predict, and write the
      predictions to ``args.predictions``.
    * ``fit``: load the training data, fit a model and dump it to
      ``args.model``.
    * ``predict``: load a dumped model and the test data, and write the
      predictions to ``args.predictions``.

    Any other mode does nothing.
    """
    args = DroArgs().parse_args()
    mode = args.mode

    if mode == "run":
        assert args.train is not None
        assert args.test is not None
        assert args.predictions is not None
        train, test = load_data_from_flags(args)
        predictions = train_and_predict(train, test, args)
        predictions.to_npz(Path(args.predictions))
        return

    if mode == "fit":
        assert args.train is not None
        assert args.model is not None
        train_data = DataTuple.from_npz(Path(args.train))
        fitted = fit(train_data, args)
        dump(fitted, Path(args.model))
        return

    if mode == "predict":
        assert args.model is not None
        assert args.predictions is not None
        assert args.test is not None
        test_data = TestTuple.from_npz(Path(args.test))
        loaded = load(Path(args.model))
        predictions = predict(loaded, test_data, args)
        predictions.to_npz(Path(args.predictions))
def concat_datatuples(first_dt: DataTuple, second_dt: DataTuple) -> DataTuple:
    """Stack two datatuples row-wise and shuffle the rows with a fixed seed.

    Both datatuples must have the same columns in ``x``, ``s`` and ``y``;
    this is checked with assertions.

    Returns:
        A DataTuple holding the rows of both inputs in shuffled order with a
        fresh index, carrying the name of ``first_dt``.
    """
    for part in ("x", "s", "y"):
        first_cols = getattr(first_dt, part).columns
        second_cols = getattr(second_dt, part).columns
        assert (first_cols == second_cols).all()

    x_cols: pd.Index = first_dt.x.columns
    s_cols: pd.Index = first_dt.s.columns
    y_cols: pd.Index = first_dt.y.columns

    # glue x, s and y side by side so rows stay together while shuffling
    stacked: pd.DataFrame = pd.concat(
        [
            pd.concat([dt.x, dt.s, dt.y], axis="columns")
            for dt in (first_dt, second_dt)
        ],
        axis="index",
    )
    shuffled = stacked.sample(frac=1.0, random_state=1)
    shuffled = shuffled.reset_index(drop=True)

    return DataTuple(
        x=shuffled[x_cols],
        s=shuffled[s_cols],
        y=shuffled[y_cols],
        name=first_dt.name,
    )
def upsample(
    dataset: DataTuple, test: TestTuple,
    strategy: Literal["uniform", "preferential", "naive"]
) -> Tuple[DataTuple, TestTuple]:
    """Upsample a datatuple.

    The training data is split into one group per (s, y) combination (using
    the first column of ``dataset.s`` and of ``dataset.y``), a resampling
    ratio is computed for every non-empty group, and the groups are resampled
    and recombined.

    Ratios:
        * ``"naive"``: size of the largest group divided by the group's size.
        * anything else: ``count(y == y_val) * count(s == s_val)`` divided by
          ``count(group) * count(all rows)``, rounded to 8 decimals.

    Resampling:
        * ``"preferential"``: rows are picked deterministically. Each group is
          sorted by the soft predictions of ``LRProb().run(dataset, dataset)``
          (ascending when ``s <= 0``, descending otherwise) and the first
          ``int(ratio * group_size)`` rows are kept; when the ratio exceeds 1
          the whole group is added once beforehand and the ratio is reduced
          by 1.
        * anything else: each group is sampled with replacement
          (``random_state=1``) at its ratio and the groups are merged with
          ``concat_datatuples``.

    Args:
        dataset: training data to upsample.
        test: test data; only repackaged, never resampled.
        strategy: which ratio / resampling scheme to use (see above).

    Returns:
        The upsampled training data and a TestTuple rebuilt from ``test``'s
        ``x``, ``s`` and ``name``.
    """
    s_col = dataset.s.columns[0]
    y_col = dataset.y.columns[0]

    s_vals: List[int] = list(map(int, dataset.s[s_col].unique()))
    y_vals: List[int] = list(map(int, dataset.y[y_col].unique()))

    # one sub-datatuple per (s, y) combination that actually occurs
    data: Dict[Tuple[int, int], DataTuple] = {}
    for s, y in itertools.product(s_vals, y_vals):
        s_y_mask = (dataset.s[s_col] == s) & (dataset.y[y_col] == y)
        if not s_y_mask.any():
            # a combination without rows has nothing to upsample and would
            # only cause a division by zero when its ratio is computed
            continue
        data[(s, y)] = DataTuple(
            x=dataset.x.loc[s_y_mask].reset_index(drop=True),
            s=dataset.s.loc[s_y_mask].reset_index(drop=True),
            y=dataset.y.loc[s_y_mask].reset_index(drop=True),
            name=dataset.name,
        )

    # loop-invariant quantities for the ratio computation
    group_sizes: List[int] = [group.x.shape[0] for group in data.values()]
    largest_group = max(group_sizes, default=0)
    num_samples = dataset.y.count().to_numpy()[0]

    percentages: Dict[Tuple[int, int], float] = {}
    for key, group in data.items():
        if strategy == "naive":
            percentages[key] = largest_group / group.x.shape[0]
        else:
            s_val, y_val = key

            y_eq_y = dataset.y.loc[dataset.y[y_col] ==
                                   y_val].count().to_numpy()[0]
            s_eq_s = dataset.s.loc[dataset.s[s_col] ==
                                   s_val].count().to_numpy()[0]
            num_batch = group.y.count().to_numpy()[0]

            percentages[key] = round(
                (y_eq_y * s_eq_s / (num_batch * num_samples)), 8)

    x_columns: pd.Index = dataset.x.columns
    s_columns: pd.Index = dataset.s.columns
    y_columns: pd.Index = dataset.y.columns

    upsampled_datatuple: Optional[DataTuple] = None

    if strategy == "preferential":
        ranker = LRProb()
        rank: SoftPrediction = ranker.run(dataset, dataset)

        ranked_data = pd.concat([dataset.x, dataset.s, dataset.y],
                                axis="columns")
        # NOTE(review): this concat aligns on the index, so it presumably
        # relies on ``dataset`` having a default RangeIndex -- confirm.
        ranked_data = pd.concat(
            [ranked_data, pd.DataFrame(rank.soft, columns=["preds"])],
            axis="columns")

        selected: List[pd.DataFrame] = []
        for (s_val, y_val), fraction in percentages.items():
            s_y_mask = (dataset.s[s_col] == s_val) & (dataset.y[y_col]
                                                      == y_val)
            group_rows = ranked_data.loc[s_y_mask]

            if fraction > 1.0:
                # keep one full copy of the group, rank-select the remainder
                selected.append(group_rows)
                fraction -= 1.0

            weight = group_rows[y_col].count()
            ordered = group_rows.sort_values(by=["preds"],
                                             ascending=s_val <= 0)
            selected.append(ordered.iloc[:int(fraction * weight)])

        if selected:
            # a single concat instead of growing the frame piece by piece
            upsampled_dataframes: pd.DataFrame = pd.concat(
                [df.drop(["preds"], axis="columns") for df in selected],
                axis="index").reset_index(drop=True)
            upsampled_datatuple = DataTuple(
                x=upsampled_dataframes[x_columns],
                s=upsampled_dataframes[s_columns],
                y=upsampled_dataframes[y_columns],
                name=dataset.name,
            )
    else:
        for key, group in data.items():
            all_data: pd.DataFrame = pd.concat([group.x, group.s, group.y],
                                               axis="columns")
            all_data = all_data.sample(frac=percentages[key],
                                       random_state=1,
                                       replace=True).reset_index(drop=True)
            resampled = DataTuple(x=all_data[x_columns],
                                  s=all_data[s_columns],
                                  y=all_data[y_columns],
                                  name=dataset.name)
            if upsampled_datatuple is None:
                upsampled_datatuple = resampled
            else:
                upsampled_datatuple = concat_datatuples(
                    upsampled_datatuple, resampled)

    assert upsampled_datatuple is not None
    return upsampled_datatuple, TestTuple(x=test.x, s=test.s, name=test.name)