Example #1
    def __call__(
            self,
            data: DataTuple,
            split_id: int = 0
    ) -> Tuple[DataTuple, DataTuple, Dict[str, float]]:
        random_seed = self._get_seed(split_id)
        train_indexes, test_indexes = generate_proportional_split_indexes(
            data,
            train_percentage=self.train_percentage,
            random_seed=random_seed)

        train: DataTuple = DataTuple(
            x=data.x.iloc[train_indexes].reset_index(drop=True),
            s=data.s.iloc[train_indexes].reset_index(drop=True),
            y=data.y.iloc[train_indexes].reset_index(drop=True),
            name=f"{data.name} - Train",
        )

        test: DataTuple = DataTuple(
            x=data.x.iloc[test_indexes].reset_index(drop=True),
            s=data.s.iloc[test_indexes].reset_index(drop=True),
            y=data.y.iloc[test_indexes].reset_index(drop=True),
            name=f"{data.name} - Test",
        )

        # assert that no data points got lost anywhere
        assert len(data) == len(train) + len(test)

        split_info: Dict[str, float] = {"seed": random_seed}

        return train, test, split_info
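A minimal usage sketch for this splitter. The class it belongs to is not shown above; `ProportionalSplit`, its constructor, and the import paths below are assumptions, and the toy `DataTuple` is built from plain pandas DataFrames as in the other examples.

import pandas as pd
from ethicml import DataTuple  # import path assumed
from ethicml.preprocessing import ProportionalSplit  # class name and path assumed

toy = DataTuple(
    x=pd.DataFrame({"feat": list(range(10))}),
    s=pd.DataFrame({"sens": [0, 1] * 5}),
    y=pd.DataFrame({"label": [0, 0, 1, 1, 0, 1, 0, 1, 1, 0]}),
    name="toy",
)
splitter = ProportionalSplit(train_percentage=0.8)  # constructor signature assumed
train, test, split_info = splitter(toy, split_id=0)
assert len(train) + len(test) == len(toy)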
Example #2
def domain_split(datatup: DataTuple,
                 tr_cond: str,
                 te_cond: str,
                 seed: int = 888) -> Tuple[DataTuple, DataTuple]:
    """Splits a datatuple based on a condition.

    Args:
        datatup: DataTuple
        tr_cond: condition for the training set
        te_cond: condition for the test set

    Returns:
        Tuple of DataTuple split into train and test. The test is all those that meet
        the test condition plus the same percentage again of the train set.
    """
    dataset = datatup.x

    train_dataset = dataset_from_cond(dataset, tr_cond)
    test_dataset = dataset_from_cond(dataset, te_cond)

    assert train_dataset.shape[0] + test_dataset.shape[0] == dataset.shape[0]

    test_pct = test_dataset.shape[0] / dataset.shape[0]
    train_pct = 1 - test_pct

    train_train_pcnt = (1 - (test_pct * 2)) / train_pct

    train_train = train_dataset.sample(frac=train_train_pcnt,
                                       random_state=seed)
    test_train = train_dataset.drop(train_train.index,
                                    axis="index")  # type: ignore[arg-type]

    test = pd.concat([test_train, test_dataset], axis="index")

    train_x = datatup.x.iloc[train_train.index].reset_index(
        drop=True)  # type: ignore[call-overload]
    train_s = datatup.s.iloc[train_train.index].reset_index(
        drop=True)  # type: ignore[call-overload]
    train_y = datatup.y.iloc[train_train.index].reset_index(
        drop=True)  # type: ignore[call-overload]

    train_datatup = DataTuple(x=train_x,
                              s=train_s,
                              y=train_y,
                              name=datatup.name)

    test_x = datatup.x.iloc[test.index].reset_index(
        drop=True)  # type: ignore[call-overload]
    test_s = datatup.s.iloc[test.index].reset_index(
        drop=True)  # type: ignore[call-overload]
    test_y = datatup.y.iloc[test.index].reset_index(
        drop=True)  # type: ignore[call-overload]

    test_datatup = DataTuple(x=test_x, s=test_s, y=test_y, name=datatup.name)

    return train_datatup, test_datatup
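A hedged usage sketch. `dataset_from_cond` is not shown here; the sketch assumes the conditions are pandas-style query strings over the feature columns, and the column name used is hypothetical.

# Sketch only: the condition syntax and the column name are assumptions.
train_dt, test_dt = domain_split(
    datatup=toy,                       # a DataTuple, e.g. the toy tuple built above
    tr_cond="education_Masters == 0",  # hypothetical feature column
    te_cond="education_Masters == 1",
    seed=888,
)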
Example #3
def fold_data(data: DataTuple,
              folds: int) -> Iterator[Tuple[DataTuple, DataTuple]]:
    """So much love to sklearn for making their source code open."""
    indices: np.ndarray = np.arange(data.x.shape[0])

    fold_sizes: np.ndarray = np.full(folds,
                                     data.x.shape[0] // folds,
                                     dtype=np.int32)
    fold_sizes[:data.x.shape[0] % folds] += np.int32(1)

    current = 0
    for i, fold_size in enumerate(fold_sizes):
        start, stop = current, int(current + fold_size)
        val_inds: np.ndarray = indices[start:stop]
        train_inds = np.array([j for j in indices
                               if j not in val_inds])  # probably inefficient

        train_x = data.x.iloc[train_inds].reset_index(
            drop=True)  # type: ignore[call-overload]
        train_s = data.s.iloc[train_inds].reset_index(
            drop=True)  # type: ignore[call-overload]
        train_y = data.y.iloc[train_inds].reset_index(
            drop=True)  # type: ignore[call-overload]

        assert train_x.shape == (len(train_inds), data.x.shape[1])
        assert train_s.shape == (len(train_inds), data.s.shape[1])
        assert train_y.shape == (len(train_inds), data.y.shape[1])

        val_x = data.x.iloc[val_inds].reset_index(
            drop=True)  # type: ignore[call-overload]
        val_s = data.s.iloc[val_inds].reset_index(
            drop=True)  # type: ignore[call-overload]
        val_y = data.y.iloc[val_inds].reset_index(
            drop=True)  # type: ignore[call-overload]

        assert val_x.shape == (len(val_inds), data.x.shape[1])
        assert val_s.shape == (len(val_inds), data.s.shape[1])
        assert val_y.shape == (len(val_inds), data.y.shape[1])

        yield DataTuple(x=train_x,
                        s=train_s,
                        y=train_y,
                        name=f"{data.name} - train fold {i}"), DataTuple(
                            x=val_x,
                            s=val_s,
                            y=val_y,
                            name=f"{data.name} - test fold {i}")

        current = stop
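A short sketch of consuming the generator for k-fold cross-validation, reusing the toy DataTuple from the first sketch; the model training step is a placeholder.

# Each iteration yields a (train, validation) pair of DataTuples with disjoint rows.
for fold_idx, (train_fold, val_fold) in enumerate(fold_data(toy, folds=3)):
    # fit a model on train_fold and evaluate on val_fold here (placeholder)
    print(fold_idx, len(train_fold.x), len(val_fold.x))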
Example #4
def train_and_transform(train: DataTuple, test: TestTuple,
                        flags: VfaeArgs) -> Tuple[DataTuple, TestTuple]:
    """Train the model and transform the dataset.

    Args:
        train:
        test:
        flags:

    Returns:
        Tuple of Encoded Train Dataset and Test Dataset.
    """
    dataset = get_dataset_obj_by_name(flags.dataset)()

    # Set up the data
    train_data = CustomDataset(train)
    train_loader = DataLoader(train_data, batch_size=flags.batch_size)

    test_data = TestDataset(test)
    test_loader = DataLoader(test_data, batch_size=flags.batch_size)

    # Build Network
    model = VFAENetwork(
        dataset,
        flags.supervised,
        train_data.xdim,
        latent_dims=50,
        z1_enc_size=flags.z1_enc_size,
        z2_enc_size=flags.z2_enc_size,
        z1_dec_size=flags.z1_dec_size,
    ).to("cpu")
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Run Network
    for epoch in range(int(flags.epochs)):
        train_model(epoch, model, train_loader, optimizer, flags)

    # Transform output
    post_train: List[List[float]] = []
    post_test: List[List[float]] = []
    model.eval()
    with torch.no_grad():
        for _x, _s, _ in train_loader:
            z1_mu, z1_logvar = model.encode_z1(_x, _s)
            z1 = model.reparameterize(z1_mu, z1_logvar)
            post_train += z1.data.tolist()
        for _x, _s in test_loader:
            z1_mu, z1_logvar = model.encode_z1(_x, _s)
            z1 = model.reparameterize(z1_mu, z1_logvar)
            post_test += z1.data.tolist()

    return (
        DataTuple(x=pd.DataFrame(post_train),
                  s=train.s,
                  y=train.y,
                  name=f"VFAE: {train.name}"),
        TestTuple(x=pd.DataFrame(post_test),
                  s=test.s,
                  name=f"VFAE: {test.name}"),
    )
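`VFAENetwork.reparameterize` is not shown in these examples. Below is a sketch of the conventional VAE reparameterisation trick, which is assumed (not confirmed by the source) to match what the model does.

import torch

def reparameterize(mu: torch.Tensor, logvar: torch.Tensor) -> torch.Tensor:
    # z = mu + eps * std with std = exp(0.5 * logvar) and eps ~ N(0, I)
    std = torch.exp(0.5 * logvar)
    eps = torch.randn_like(std)
    return mu + eps * std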
Example #5
def transform(model: VFAENetwork, dataset: T, flags) -> T:
    """Transform the dataset."""
    data: Union[CustomDataset, TestDataset]
    if isinstance(dataset, DataTuple):
        data = CustomDataset(dataset)
        loader = DataLoader(data, batch_size=flags.batch_size, shuffle=False)
    elif isinstance(dataset, TestTuple):
        data = TestDataset(dataset)
        loader = DataLoader(data, batch_size=flags.batch_size, shuffle=False)
    else:
        raise TypeError(f"Unsupported dataset type: {type(dataset)}")

    post_train: List[List[float]] = []
    model.eval()
    with torch.no_grad():
        for sample in loader:
            if isinstance(dataset, DataTuple):
                _x, _s, _ = sample
            elif isinstance(dataset, TestTuple):
                _x, _s = sample
            z1_mu, z1_logvar = model.encode_z1(_x, _s)
            # z1 = model.reparameterize(z1_mu, z1_logvar)
            post_train += z1_mu.data.tolist()

    if isinstance(dataset, DataTuple):
        return DataTuple(x=pd.DataFrame(post_train),
                         s=dataset.s,
                         y=dataset.y,
                         name=f"VFAE: {dataset.name}")
    elif isinstance(dataset, TestTuple):
        return TestTuple(x=pd.DataFrame(post_train),
                         s=dataset.s,
                         name=f"VFAE: {dataset.name}")
Example #6
def train_and_transform(
    train: DataTuple, test: TestTuple, flags: ZemelArgs
) -> Tuple[DataTuple, TestTuple]:
    """Train the Zemel model and return the transformed features of the train and test sets."""
    np.random.seed(flags.seed)

    sens_col = train.s.columns[0]
    training_sensitive = train.x.loc[train.s[sens_col] == 0].to_numpy()
    training_nonsensitive = train.x.loc[train.s[sens_col] == 1].to_numpy()
    ytrain_sensitive = train.y.loc[train.s[sens_col] == 0].to_numpy()
    ytrain_nonsensitive = train.y.loc[train.s[sens_col] == 1].to_numpy()

    print_interval = 100
    verbose = False

    num_train_samples, features_dim = train.x.shape

    # Initialize the LFR optim objective parameters
    parameters_initialization = np.random.uniform(
        size=flags.clusters + features_dim * flags.clusters
    )
    bnd = [(0, 1)] * flags.clusters + [(None, None)] * features_dim * flags.clusters  # type: ignore[operator]
    LFR_optim_objective.steps = 0  # type: ignore[attr-defined]

    learned_model = optim.fmin_l_bfgs_b(
        LFR_optim_objective,
        x0=parameters_initialization,
        epsilon=1e-5,
        args=(
            training_nonsensitive,
            training_sensitive,
            ytrain_nonsensitive[:, 0],
            ytrain_sensitive[:, 0],
            flags.clusters,
            flags.Ax,
            flags.Ay,
            flags.Az,
            print_interval,
            verbose,
        ),
        bounds=bnd,
        approx_grad=True,
        maxfun=flags.maxfun,
        maxiter=flags.max_iter,
        disp=verbose,
    )[0]
    w = learned_model[: flags.clusters]
    prototypes = learned_model[flags.clusters :].reshape((flags.clusters, features_dim))

    testing_sensitive = test.x.loc[test.s[sens_col] == 0].to_numpy()
    testing_nonsensitive = test.x.loc[test.s[sens_col] == 1].to_numpy()

    train_transformed = trans(prototypes, w, training_nonsensitive, training_sensitive, train)
    test_transformed = trans(prototypes, w, testing_nonsensitive, testing_sensitive, test)

    return (
        DataTuple(x=train_transformed, s=train.s, y=train.y, name=train.name),
        TestTuple(x=test_transformed, s=test.s, name=test.name),
    )
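The L-BFGS-B result is a flat vector that packs the cluster weights `w` followed by the prototype matrix. A small numpy sketch of the unpacking performed above, with toy numbers in place of the optimiser output:

import numpy as np

clusters, features_dim = 3, 4
flat = np.arange(clusters + clusters * features_dim, dtype=float)  # stand-in for learned_model
w = flat[:clusters]                                                # first `clusters` entries
prototypes = flat[clusters:].reshape((clusters, features_dim))     # one row per prototype
assert w.shape == (clusters,) and prototypes.shape == (clusters, features_dim)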
Example #7
def transform(data: T, prototypes: np.ndarray, w: np.ndarray) -> T:
    """Transform."""
    sens_col = data.s.columns[0]
    data_sens = data.x.loc[data.s[sens_col] == 0].to_numpy()
    data_nons = data.x.loc[data.s[sens_col] == 1].to_numpy()
    transformed = trans(prototypes, w, data_nons, data_sens, data)
    if isinstance(data, DataTuple):
        return DataTuple(x=transformed, s=data.s, y=data.y, name=data.name)
    elif isinstance(data, TestTuple):
        return TestTuple(x=transformed, s=data.s, name=data.name)
    raise TypeError(f"Unsupported data type: {type(data)}")
Example #8
def scale_continuous(dataset: Dataset,
                     datatuple: DataTuple,
                     scaler: ScalerType,
                     inverse: bool = False) -> Tuple[DataTuple, ScalerType]:
    """Use a scaler on just the continuous features."""
    new_feats = datatuple.x.copy().astype('float64')
    if inverse:
        new_feats[dataset.continuous_features] = scaler.inverse_transform(
            new_feats[dataset.continuous_features])
    else:
        new_feats[dataset.continuous_features] = scaler.fit_transform(
            new_feats[dataset.continuous_features])
    return DataTuple(x=new_feats, s=datatuple.s, y=datatuple.y), scaler
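A usage sketch, assuming a scikit-learn scaler such as `StandardScaler` (which satisfies the fit/transform API this function expects); `dataset` and `train` stand for a Dataset object and a DataTuple obtained as in the docstring of the later scale_continuous variant.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
train_scaled, scaler = scale_continuous(dataset, train, scaler)
# the same fitted scaler can undo the scaling
train_restored, _ = scale_continuous(dataset, train_scaled, scaler, inverse=True)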
Example #9
def encode_dataset(enc: nn.Module, dataloader: torch.utils.data.DataLoader,
                   datatuple: DataTuple) -> DataTuple:
    """Encode a dataset."""
    data_to_return: List[Any] = []

    for embedding, _, _ in dataloader:
        data_to_return += enc(embedding).data.numpy().tolist()

    return DataTuple(
        x=pd.DataFrame(data_to_return),
        s=datatuple.s,
        y=datatuple.y,
        name=f"Beutel: {datatuple.name}",
    )
Example #10
def main() -> None:
    """LFR Model.

    Learning fair representations is a pre-processing technique that finds a
    latent representation which encodes the data well but obfuscates information
    about protected attributes [2]_.

    References:
        .. [2] R. Zemel, Y. Wu, K. Swersky, T. Pitassi, and C. Dwork,  "Learning
           Fair Representations." International Conference on Machine Learning,
           2013.

    Based on code from https://github.com/zjelveh/learning-fair-representations,
    which in turn we took from AIF360.
    """
    args = ZemelArgs()
    args.parse_args()
    if args.mode == "run":
        assert args.train is not None
        assert args.new_train is not None
        assert args.test is not None
        assert args.new_test is not None
        train, test = load_data_from_flags(args)
        save_transformations(train_and_transform(train, test, args), args)
    elif args.mode == "fit":
        assert args.model is not None
        assert args.train is not None
        assert args.new_train is not None
        train = DataTuple.from_npz(Path(args.train))
        model = fit(train, args)
        sens_col = train.s.columns[0]
        training_sensitive = train.x.loc[train.s[sens_col] == 0].to_numpy()
        training_nonsensitive = train.x.loc[train.s[sens_col] == 1].to_numpy()
        train_transformed = trans(model.prototypes, model.w,
                                  training_nonsensitive, training_sensitive,
                                  train)
        data = DataTuple(x=train_transformed,
                         s=train.s,
                         y=train.y,
                         name=train.name)
        data.to_npz(Path(args.new_train))
        dump(model, Path(args.model))
    elif args.mode == "transform":
        assert args.model is not None
        assert args.test is not None
        assert args.new_test is not None
        test = DataTuple.from_npz(Path(args.test))
        model = load(Path(args.model))
        transformed_test = transform(test, model.prototypes, model.w)
        transformed_test.to_npz(Path(args.new_test))
Example #11
def metric_per_sensitive_attribute(
        prediction: Prediction,
        actual: DataTuple,
        metric: Metric,
        use_sens_name: bool = True) -> Dict[str, float]:
    """Compute a metric repeatedly on subsets of the data that share a senstitive attribute."""
    if not metric.apply_per_sensitive:
        raise MetricNotApplicable(
            f"Metric {metric.name} is not applicable per sensitive "
            f"attribute, apply to whole dataset instead")

    assert actual.s.shape[0] == actual.x.shape[0]
    assert actual.s.shape[0] == actual.y.shape[0]
    assert prediction.hard.shape[0] == actual.y.shape[0]

    per_sensitive_attr: Dict[str, float] = {}

    s_columns: List[str] = list(actual.s.columns)
    y_columns: List[str] = list(actual.y.columns)
    assert len(y_columns) == 1

    for y_col in y_columns:
        for s_col in s_columns:
            for unique_s in actual.s[s_col].unique():
                mask: pd.Series = actual.s[s_col] == unique_s
                subset = DataTuple(
                    x=pd.DataFrame(
                        actual.x.loc[mask][actual.x.columns],
                        columns=actual.x.columns).reset_index(drop=True),
                    s=pd.DataFrame(actual.s.loc[mask][s_col],
                                   columns=[s_col]).reset_index(drop=True),
                    y=pd.DataFrame(actual.y.loc[mask][y_col],
                                   columns=[y_col]).reset_index(drop=True),
                    name=actual.name,
                )
                pred_y: Prediction
                if isinstance(prediction, SoftPrediction):
                    pred_y = SoftPrediction(
                        soft=prediction.soft.loc[mask].reset_index(drop=True),
                        info=prediction.info)
                else:
                    pred_y = Prediction(
                        hard=prediction.hard.loc[mask].reset_index(drop=True),
                        info=prediction.info)
                key = (s_col if use_sens_name else "S") + "_" + str(unique_s)
                per_sensitive_attr[key] = metric.score(pred_y, subset)

    return per_sensitive_attr
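A hedged usage sketch: `Accuracy` stands in for any `Metric` with `apply_per_sensitive` set, and the import paths are assumptions. The degenerate predictions simply copy the labels, so every group scores 1.0.

from ethicml import Accuracy, Prediction  # import paths assumed

preds = Prediction(hard=toy.y[toy.y.columns[0]])  # trivial predictions (= labels), for illustration
per_group = metric_per_sensitive_attribute(preds, toy, Accuracy())
print(per_group)  # e.g. {"sens_0": 1.0, "sens_1": 1.0}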
Example #12
def scale_continuous(
    dataset: Dataset,
    datatuple: DataTuple,
    scaler: ScalerType,
    inverse: bool = False,
    fit: bool = True,
) -> Tuple[DataTuple, ScalerType]:
    """Use a scaler on just the continuous features.

    Args:
        dataset:
            Dataset object. Used to find the continuous features.
        datatuple:
            DataTuple on which to scale the continuous features.
        scaler:
            Scaler object to scale the features. Must fit the SKLearn scaler API.
        inverse:
            Should the scaling be reversed?
        fit:
            If not `inverse`, should the scaler be fit to the data? If `True`, do
            `fit_transform` operation, else just `transform`.

    Returns:
        Tuple of (scaled) DataTuple, and the Scaler (which may have been fit to the data).

    Examples:
        >>> dataset = adult()
        >>> datatuple = dataset.load()
        >>> train, test = train_test_split(datatuple)
        >>> scaler = StandardScaler()
        >>> train, scaler = scale_continuous(dataset, train, scaler)
        >>> test, scaler = scale_continuous(dataset, test, scaler, fit=False)
    """
    new_feats = datatuple.x.copy().astype('float64')
    if inverse:
        new_feats[dataset.continuous_features] = scaler.inverse_transform(
            new_feats[dataset.continuous_features])
    elif fit:
        new_feats[dataset.continuous_features] = scaler.fit_transform(
            new_feats[dataset.continuous_features])
    else:
        new_feats[dataset.continuous_features] = scaler.transform(
            new_feats[dataset.continuous_features])
    return DataTuple(x=new_feats, s=datatuple.s, y=datatuple.y), scaler
Example #13
    def adjust(self, dataset: DataTuple) -> DataTuple:
        """Take a datatuple and make the labels [0,1]."""
        y_col = dataset.y.columns[0]
        assert dataset.y[y_col].nunique() == 2

        # make copy of dataset
        dataset = dataset.replace(y=dataset.y.copy())

        self.min_val = dataset.y.to_numpy().min().item()
        self.max_val = dataset.y.to_numpy().max().item()

        y_col = dataset.y.columns[0]

        dataset.y[y_col] = dataset.y[y_col].replace(self.min_val, 0)
        dataset.y[y_col] = dataset.y[y_col].replace(self.max_val, 1)

        return DataTuple(x=dataset.x,
                         s=dataset.s,
                         y=dataset.y,
                         name=dataset.name)
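What the replacement does, shown on a toy label column: the smaller of the two label values becomes 0 and the larger becomes 1.

import pandas as pd

y = pd.DataFrame({"label": [-1, 1, 1, -1]})
min_val, max_val = y["label"].min(), y["label"].max()
y["label"] = y["label"].replace(min_val, 0).replace(max_val, 1)
print(y["label"].tolist())  # [0, 1, 1, 0]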
Example #14
def train_and_transform(train: DataTuple, test: TestTuple,
                        flags: ZemelArgs) -> Tuple[DataTuple, TestTuple]:
    """Train and transform."""
    prototypes, w = fit(train, flags)
    sens_col = train.s.columns[0]

    training_sensitive = train.x.loc[train.s[sens_col] == 0].to_numpy()
    training_nonsensitive = train.x.loc[train.s[sens_col] == 1].to_numpy()

    testing_sensitive = test.x.loc[test.s[sens_col] == 0].to_numpy()
    testing_nonsensitive = test.x.loc[test.s[sens_col] == 1].to_numpy()

    train_transformed = trans(prototypes, w, training_nonsensitive,
                              training_sensitive, train)
    test_transformed = trans(prototypes, w, testing_nonsensitive,
                             testing_sensitive, test)

    return (
        DataTuple(x=train_transformed, s=train.s, y=train.y, name=train.name),
        TestTuple(x=test_transformed, s=test.s, name=test.name),
    )
Example #15
def concat_datatuples(first_dt: DataTuple, second_dt: DataTuple) -> DataTuple:
    """Given 2 datatuples, concatenate them and shuffle."""
    assert (first_dt.x.columns == second_dt.x.columns).all()
    assert (first_dt.s.columns == second_dt.s.columns).all()
    assert (first_dt.y.columns == second_dt.y.columns).all()

    x_columns: pd.Index = first_dt.x.columns
    s_columns: pd.Index = first_dt.s.columns
    y_columns: pd.Index = first_dt.y.columns

    a_combined: pd.DataFrame = pd.concat([first_dt.x, first_dt.s, first_dt.y],
                                         axis="columns")
    b_combined: pd.DataFrame = pd.concat(
        [second_dt.x, second_dt.s, second_dt.y], axis="columns")

    combined: pd.DataFrame = pd.concat([a_combined, b_combined], axis="index")
    combined = combined.sample(frac=1.0, random_state=1).reset_index(drop=True)

    return DataTuple(x=combined[x_columns],
                     s=combined[s_columns],
                     y=combined[y_columns],
                     name=first_dt.name)
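A small sketch combining two toy DataTuples (built as in the earlier sketches); both must share identical column names, as the asserts require.

import pandas as pd

part_a = DataTuple(
    x=pd.DataFrame({"feat": [1, 2]}),
    s=pd.DataFrame({"sens": [0, 1]}),
    y=pd.DataFrame({"label": [0, 1]}),
    name="part a",
)
part_b = DataTuple(
    x=pd.DataFrame({"feat": [3, 4]}),
    s=pd.DataFrame({"sens": [1, 0]}),
    y=pd.DataFrame({"label": [1, 0]}),
    name="part b",
)
combined = concat_datatuples(part_a, part_b)
assert len(combined.x) == 4  # rows from both tuples, shuffled with a fixed random state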
Example #16
    def _backend_load(
        self, dataframe: pd.DataFrame, *, labels_as_features: bool, ordered: bool
    ) -> DataTuple:
        # +++ BELOW HERE IS A COPY OF DATASET LOAD +++

        assert isinstance(dataframe, pd.DataFrame)

        feature_split = self.feature_split if not ordered else self.ordered_features
        if labels_as_features:
            feature_split_x = feature_split["x"] + feature_split["s"] + feature_split["y"]
        else:
            feature_split_x = feature_split["x"]

        # =========================================================================================
        # Check whether we have to generate some complementary columns for binary features.
        # This happens when we have for example several races: race-asian-pac-islander etc, but we
        # want to have an attribute called "race_other" that summarizes them all. Now the problem
        # is that this cannot be done before this point, because only here have we actually loaded
        # the data. So, we have to do it here, with all the information we can piece together.

        disc_feature_groups = self._discrete_feature_groups
        if disc_feature_groups is not None:
            for group in disc_feature_groups.values():
                if len(group) == 1:
                    continue
                for feature in group:
                    if feature in dataframe.columns:
                        continue  # nothing to do
                    missing_feature = feature

                    existing_features = [other for other in group if other in dataframe.columns]
                    assert len(existing_features) == len(group) - 1, "at most 1 feature missing"
                    # the dummy feature is the inverse of the existing feature
                    or_combination = dataframe[existing_features[0]] == 1
                    for other in existing_features[1:]:
                        or_combination |= dataframe[other] == 1
                    inverse: pd.Series = 1 - or_combination
                    dataframe = pd.concat(
                        [dataframe, inverse.to_frame(name=missing_feature)], axis="columns"
                    )

        # =========================================================================================
        x_data = dataframe[feature_split_x]
        s_data = dataframe[feature_split["s"]]
        y_data = dataframe[feature_split["y"]]

        if self._map_to_binary:
            s_data = (s_data + 1) // 2  # map from {-1, 1} to {0, 1}
            y_data = (y_data + 1) // 2  # map from {-1, 1} to {0, 1}

        if self._invert_s:
            assert s_data.nunique().values[0] == 2, "s must be binary"
            s_data = 1 - s_data

        # the following operations remove rows if a label group is not properly one-hot encoded
        s_data, s_mask = self._maybe_combine_labels(s_data, label_type="s")
        if s_mask is not None:
            x_data = x_data.loc[s_mask].reset_index(drop=True)
            s_data = s_data.loc[s_mask].reset_index(drop=True)
            y_data = y_data.loc[s_mask].reset_index(drop=True)
        y_data, y_mask = self._maybe_combine_labels(y_data, label_type="y")
        if y_mask is not None:
            x_data = x_data.loc[y_mask].reset_index(drop=True)
            s_data = s_data.loc[y_mask].reset_index(drop=True)
            y_data = y_data.loc[y_mask].reset_index(drop=True)

        return DataTuple(x=x_data, s=s_data, y=y_data, name=self.name)
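The complementary-column step above, isolated in a small pandas sketch with a hypothetical "race" group where "race_other" is the missing dummy:

import pandas as pd

df = pd.DataFrame({"race_White": [1, 0, 0], "race_Black": [0, 1, 0]})
or_combination = (df["race_White"] == 1) | (df["race_Black"] == 1)
inverse = 1 - or_combination  # 1 exactly where none of the existing dummies is set
df = pd.concat([df, inverse.to_frame(name="race_other")], axis="columns")
print(df["race_other"].tolist())  # [0, 0, 1]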
Example #17
def _calders_algorithm(
        dataset: DataTuple, test: TestTuple, good_class: int,
        disadvantaged_group: int) -> Tuple[DataTuple, TestTuple]:
    s_col = dataset.s.columns[0]
    y_col = dataset.y.columns[0]

    s_vals: List[int] = list(map(int, dataset.s[s_col].unique()))
    y_vals: List[int] = list(map(int, dataset.y[y_col].unique()))

    assert len(s_vals) == 2
    assert len(y_vals) == 2
    s_0, s_1 = s_vals
    y_0, y_1 = y_vals

    bad_class = y_0 if good_class == y_1 else y_1
    advantaged_group = s_0 if disadvantaged_group == s_1 else s_1

    groups = ((s_0, y_0), (s_0, y_1), (s_1, y_0), (s_1, y_1))
    data: Dict[Tuple[int, int], DataTuple] = {}
    for s, y in groups:
        s_y_mask = (dataset.s[s_col] == s) & (dataset.y[y_col] == y)
        data[(s, y)] = DataTuple(
            x=dataset.x.loc[s_y_mask].reset_index(drop=True),
            s=dataset.s.loc[s_y_mask].reset_index(drop=True),
            y=dataset.y.loc[s_y_mask].reset_index(drop=True),
            name=dataset.name,
        )

    dis_group = (disadvantaged_group, bad_class)
    adv_group = (advantaged_group, good_class)

    massaging_candidates = concat_dt([data[dis_group], data[adv_group]])

    ranker = LRProb()
    rank: SoftPrediction = ranker.run(dataset, massaging_candidates)

    dis_group_len = len(data[dis_group])
    adv_group_len = len(data[adv_group])

    dis_group_rank = rank.soft.iloc[:dis_group_len]
    adv_group_rank = rank.soft.iloc[dis_group_len:].reset_index(drop=True)
    assert len(adv_group_rank) == adv_group_len

    # sort the ranking
    dis_group_rank.sort_values(ascending=False, inplace=True)
    adv_group_rank.sort_values(inplace=True)

    # use the rank to sort the data
    for group, ranking in [(dis_group, dis_group_rank),
                           (adv_group, adv_group_rank)]:
        unsorted_data = data[group]
        data[group] = DataTuple(
            x=unsorted_data.x.reindex(index=ranking.index).reset_index(
                drop=True),
            s=unsorted_data.s.reindex(index=ranking.index).reset_index(
                drop=True),
            y=unsorted_data.y.reindex(index=ranking.index).reset_index(
                drop=True),
            name=unsorted_data.name,
        )

    all_disadvantaged = len(
        data[(disadvantaged_group, good_class)]) + dis_group_len
    all_advantaged = adv_group_len + len(data[(advantaged_group, bad_class)])
    dis_group_good_len = all_disadvantaged - dis_group_len

    # ensure that the ratio of good_class to bad_class is the same in both groups.
    # for this, we have to swap some labels
    num_to_swap = round((adv_group_len * all_disadvantaged -
                         dis_group_good_len * all_advantaged) / len(dataset))
    data[dis_group].y.iloc[:num_to_swap] = good_class
    data[adv_group].y.iloc[:num_to_swap] = bad_class

    return concat_dt(list(data.values())), test
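A worked instance of the `num_to_swap` formula with toy counts; swapping that many labels equalises the share of the good class in the two groups.

# Toy counts: 40 disadvantaged rows (10 with the good class), 60 advantaged rows (30 with the good class).
all_disadvantaged, all_advantaged = 40, 60
dis_group_good_len, adv_group_len = 10, 30   # adv_group_len counts advantaged rows with the good class
n = all_disadvantaged + all_advantaged

num_to_swap = round(
    (adv_group_len * all_disadvantaged - dis_group_good_len * all_advantaged) / n
)
print(num_to_swap)  # 6
# after promoting 6 disadvantaged rows and demoting 6 advantaged rows:
print((dis_group_good_len + num_to_swap) / all_disadvantaged)  # 0.4
print((adv_group_len - num_to_swap) / all_advantaged)          # 0.4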
Example #18
def upsample(
    dataset: DataTuple, test: TestTuple,
    strategy: Literal["uniform", "preferential", "naive"]
) -> Tuple[DataTuple, TestTuple]:
    """Upsample a datatuple."""
    s_col = dataset.s.columns[0]
    y_col = dataset.y.columns[0]

    s_vals: List[int] = list(map(int, dataset.s[s_col].unique()))
    y_vals: List[int] = list(map(int, dataset.y[y_col].unique()))

    groups = itertools.product(s_vals, y_vals)

    data: Dict[Tuple[int, int], DataTuple] = {}
    for s, y in groups:
        s_y_mask = (dataset.s[s_col] == s) & (dataset.y[y_col] == y)
        data[(s, y)] = DataTuple(
            x=dataset.x.loc[s_y_mask].reset_index(drop=True),
            s=dataset.s.loc[s_y_mask].reset_index(drop=True),
            y=dataset.y.loc[s_y_mask].reset_index(drop=True),
            name=dataset.name,
        )

    percentages: Dict[Tuple[int, int], float] = {}

    vals: List[int] = [val.x.shape[0] for val in data.values()]

    for key, val in data.items():
        if strategy == "naive":
            percentages[key] = max(vals) / val.x.shape[0]
        else:
            s_val: int = key[0]
            y_val: int = key[1]

            y_eq_y = dataset.y.loc[dataset.y[y_col] ==
                                   y_val].count().to_numpy()[0]
            s_eq_s = dataset.s.loc[dataset.s[s_col] ==
                                   s_val].count().to_numpy()[0]

            num_samples = dataset.y.count().to_numpy()[0]
            num_batch = val.y.count().to_numpy()[0]

            percentages[key] = round(
                (y_eq_y * s_eq_s / (num_batch * num_samples)), 8)

    x_columns: pd.Index = dataset.x.columns
    s_columns: pd.Index = dataset.s.columns
    y_columns: pd.Index = dataset.y.columns

    upsampled: Dict[Tuple[int, int], DataTuple] = {}
    for key, val in data.items():
        all_data: pd.DataFrame = pd.concat([val.x, val.s, val.y],
                                           axis="columns")
        all_data = all_data.sample(frac=percentages[key],
                                   random_state=1,
                                   replace=True).reset_index(drop=True)
        upsampled[key] = DataTuple(x=all_data[x_columns],
                                   s=all_data[s_columns],
                                   y=all_data[y_columns],
                                   name=dataset.name)

    upsampled_datatuple: Optional[DataTuple] = None
    for key, val in upsampled.items():
        if upsampled_datatuple is None:
            upsampled_datatuple = val
        else:
            upsampled_datatuple = concat_datatuples(upsampled_datatuple, val)

    if strategy == "preferential":
        ranker = LRProb()
        rank: SoftPrediction = ranker.run(dataset, dataset)

        selected: List[pd.DataFrame] = []

        all_data = pd.concat([dataset.x, dataset.s, dataset.y], axis="columns")
        all_data = pd.concat(
            [all_data, pd.DataFrame(rank.soft, columns=["preds"])],
            axis="columns")

        for key, val in data.items():

            s_val = key[0]
            y_val = key[1]
            s_y_mask = (dataset.s[s_col] == s_val) & (dataset.y[y_col]
                                                      == y_val)

            ascending = False
            if s_val <= 0:
                ascending = True

            if percentages[key] > 1.0:
                selected.append(all_data.loc[s_y_mask])
                percentages[key] -= 1.0

            weight = all_data.loc[s_y_mask][y_col].count()
            selected.append(all_data.loc[s_y_mask].sort_values(
                by=["preds"],
                ascending=ascending).iloc[:int(percentages[key] * weight)])

        upsampled_dataframes: pd.DataFrame
        for i, df in enumerate(selected):
            if i == 0:
                upsampled_dataframes = df.drop(["preds"], axis="columns")
            else:
                upsampled_dataframes = pd.concat(
                    [upsampled_dataframes,
                     df.drop(["preds"], axis="columns")],
                    axis="index").reset_index(drop=True)
        upsampled_datatuple = DataTuple(
            x=upsampled_dataframes[x_columns],
            s=upsampled_dataframes[s_columns],
            y=upsampled_dataframes[y_columns],
            name=dataset.name,
        )

    assert upsampled_datatuple is not None
    return upsampled_datatuple, TestTuple(x=test.x, s=test.s, name=test.name)
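For the non-naive strategies, the sampling fraction of a group is the size it would have if s and y were independent, divided by its actual size; a worked instance with toy counts:

num_samples = 100  # total rows
s_eq_s = 50        # rows with s == 0
y_eq_y = 60        # rows with y == 1
num_batch = 20     # rows with s == 0 and y == 1

frac = round(y_eq_y * s_eq_s / (num_batch * num_samples), 8)
print(frac)  # 1.5 -> this group is resampled (with replacement) to 150% of its size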
Example #19
    def __call__(
            self,
            data: DataTuple,
            split_id: int = 0
    ) -> Tuple[DataTuple, DataTuple, Dict[str, float]]:
        random_seed = self._get_seed(split_id)
        random = RandomState(seed=random_seed)

        s_col = data.s.columns[0]
        y_col = data.y.columns[0]

        s_vals: List[int] = list(map(int, data.s[s_col].unique()))
        y_vals: List[int] = list(map(int, data.y[y_col].unique()))

        train_indexes: List[np.ndarray] = []
        test_indexes: List[np.ndarray] = []

        num_test: Dict[Tuple[int, int], int] = {}
        # find out how many samples are available for the test set
        for s, y in itertools.product(s_vals, y_vals):
            # find all indices for this group
            idx = ((data.s[s_col] == s) &
                   (data.y[y_col] == y)).to_numpy().nonzero()[0]
            # how many elements are in this "quadrant"
            quadrant_size = len(idx)
            # compute how many elements would be available for the test set
            num_test[(s,
                      y)] = round(quadrant_size * (1 - self.train_percentage))

        # compute how much we should take for the test set to make it balanced
        if self.balance_type == "P(s|y)=0.5":
            minimize_over_s = {
                y: min(num_test[(s, y)] for s in s_vals)
                for y in y_vals
            }
            num_test_balanced = {(s, y): minimize_over_s[y]
                                 for s in s_vals for y in y_vals}
        elif self.balance_type == "P(y|s)=0.5":
            minimize_over_y = {
                s: min(num_test[(s, y)] for y in y_vals)
                for s in s_vals
            }
            num_test_balanced = {(s, y): minimize_over_y[s]
                                 for s in s_vals for y in y_vals}
        elif self.balance_type == "P(s,y)=0.25":
            smallest_quadrant = min(num_test[(s, y)] for s in s_vals
                                    for y in y_vals)
            num_test_balanced = {(s, y): smallest_quadrant
                                 for s in s_vals for y in y_vals}
        else:
            raise ValueError("Unknown balance_type")

        num_dropped = 0
        # iterate over all combinations of s and y
        for s, y in itertools.product(s_vals, y_vals):
            # find all indices for this group
            idx = ((data.s[s_col] == s) &
                   (data.y[y_col] == y)).to_numpy().nonzero()[0]

            # shuffle and take subsets
            random.shuffle(idx)
            split_indexes: int = round(len(idx) * self.train_percentage)
            # append index subsets to the list of train indices
            train_indexes.append(idx[:split_indexes])
            test_indexes.append(idx[split_indexes:(split_indexes +
                                                   num_test_balanced[(s, y)])])
            num_dropped += num_test[(s, y)] - num_test_balanced[(s, y)]

        train_idx = np.concatenate(train_indexes, axis=0)
        test_idx = np.concatenate(test_indexes, axis=0)

        train: DataTuple = DataTuple(
            x=data.x.iloc[train_idx].reset_index(drop=True),
            s=data.s.iloc[train_idx].reset_index(drop=True),
            y=data.y.iloc[train_idx].reset_index(drop=True),
            name=f"{data.name} - Train",
        )

        test: DataTuple = DataTuple(
            x=data.x.iloc[test_idx].reset_index(drop=True),
            s=data.s.iloc[test_idx].reset_index(drop=True),
            y=data.y.iloc[test_idx].reset_index(drop=True),
            name=f"{data.name} - Test",
        )

        unbalanced_test_len = round(len(data) * (1 - self.train_percentage))
        split_info = {
            "seed": random_seed,
            "percent_dropped": num_dropped / unbalanced_test_len,
            self.balance_type: 1,
        }

        return train, test, split_info
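How the three balance types trim the per-quadrant test counts (mirroring the `num_test_balanced` computation above), on toy numbers:

num_test = {(0, 0): 10, (0, 1): 30, (1, 0): 20, (1, 1): 40}
s_vals, y_vals = [0, 1], [0, 1]

# P(s|y)=0.5: within each y, both s groups get the size of the smaller one
per_y = {y: min(num_test[(s, y)] for s in s_vals) for y in y_vals}
print({(s, y): per_y[y] for s in s_vals for y in y_vals})
# {(0, 0): 10, (0, 1): 30, (1, 0): 10, (1, 1): 30}

# P(y|s)=0.5: within each s, both y groups get the size of the smaller one
per_s = {s: min(num_test[(s, y)] for y in y_vals) for s in s_vals}
print({(s, y): per_s[s] for s in s_vals for y in y_vals})
# {(0, 0): 10, (0, 1): 10, (1, 0): 20, (1, 1): 20}

# P(s,y)=0.25: every quadrant gets the size of the smallest quadrant
print({(s, y): min(num_test.values()) for s in s_vals for y in y_vals})
# {(0, 0): 10, (0, 1): 10, (1, 0): 10, (1, 1): 10}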
Example #20
def train_test_split(data: DataTuple,
                     train_percentage: float = 0.8,
                     random_seed: int = 0) -> Tuple[DataTuple, DataTuple]:
    """Split a data tuple into two datatuple along the rows of the DataFrames.

    Args:
        data: data tuple to split
        train_percentage: percentage for train split
        random_seed: seed to make splitting reproducible

    Returns:
        train split and test split
    """
    # ======================== concatenate the datatuple to one dataframe =========================
    # save the column names for later
    x_columns: pd.Index = data.x.columns
    s_columns: pd.Index = data.s.columns
    y_columns: pd.Index = data.y.columns

    all_data: pd.DataFrame = pd.concat([data.x, data.s, data.y],
                                       axis="columns")

    all_data = shuffle_df(all_data, random_state=1)

    # ============================== split the concatenated dataframe =============================
    # permute
    all_data = shuffle_df(all_data, random_state=random_seed)

    # split
    train_len = int(train_percentage * len(all_data))
    all_data_train = all_data.iloc[:train_len]  # type: ignore[call-overload]
    all_data_test = all_data.iloc[train_len:]  # type: ignore[call-overload]

    assert isinstance(all_data_train, pd.DataFrame)
    assert isinstance(all_data_test, pd.DataFrame)

    all_data_train = all_data_train.reset_index(drop=True)
    all_data_test = all_data_test.reset_index(drop=True)

    # ================================== assemble train and test ==================================
    train: DataTuple = DataTuple(
        x=all_data_train[x_columns],
        s=all_data_train[s_columns],
        y=all_data_train[y_columns],
        name=f"{data.name} - Train",
    )

    test: DataTuple = DataTuple(
        x=all_data_test[x_columns],
        s=all_data_test[s_columns],
        y=all_data_test[y_columns],
        name=f"{data.name} - Test",
    )

    assert isinstance(train.x, pd.DataFrame)
    assert isinstance(test.x, pd.DataFrame)
    assert_index_equal(train.x.columns, x_columns)
    assert_index_equal(test.x.columns, x_columns)

    assert isinstance(train.s, pd.DataFrame)
    assert isinstance(test.s, pd.DataFrame)
    assert_index_equal(train.s.columns, s_columns)
    assert_index_equal(test.s.columns, s_columns)

    assert isinstance(train.y, pd.DataFrame)
    assert isinstance(test.y, pd.DataFrame)
    assert_index_equal(train.y.columns, y_columns)
    assert_index_equal(test.y.columns, y_columns)

    return train, test
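A minimal usage sketch with a toy DataTuple built from pandas frames, as in the earlier sketches (the DataTuple import path is not shown in these examples):

import pandas as pd

toy = DataTuple(
    x=pd.DataFrame({"feat": list(range(10))}),
    s=pd.DataFrame({"sens": [0, 1] * 5}),
    y=pd.DataFrame({"label": [0, 1] * 5}),
    name="toy",
)
train, test = train_test_split(toy, train_percentage=0.8, random_seed=0)
assert len(train.x) == 8 and len(test.x) == 2
assert train.name == "toy - Train" and test.name == "toy - Test"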