Example #1
def test_dp_flip_inverted_s(toy_train_test: TrainValPair) -> None:
    """Test the dem par flipping method."""
    train, test = toy_train_test
    train = train.replace(s=1 - train.s)
    test = test.replace(s=1 - test.s)
    train_test = em.concat_tt([train, test], ignore_index=True)

    in_model: InAlgorithm = LR()
    assert in_model is not None
    assert in_model.name == "Logistic Regression (C=1.0)"

    predictions: Prediction = in_model.run(train, train_test)

    # separate out predictions on the train set and predictions on the test set
    pred_train = predictions.hard.iloc[: train.y.shape[0]]
    pred_test = predictions.hard.iloc[train.y.shape[0] :].reset_index(drop=True)
    assert np.count_nonzero(pred_test.values == 1) == 44
    assert np.count_nonzero(pred_test.values == 0) == 36

    post_model: PostAlgorithm = DPFlip()
    assert post_model.name == "DemPar. Post Process"
    fair_preds = post_model.run(Prediction(pred_train), train, Prediction(pred_test), test)
    assert np.count_nonzero(fair_preds.hard.values == 1) == 57
    assert np.count_nonzero(fair_preds.hard.values == 0) == 23
    diffs = em.diff_per_sensitive_attribute(
        em.metric_per_sensitive_attribute(fair_preds, test, ProbPos())
    )
    for diff in diffs.values():
        assert pytest.approx(diff, abs=1e-2) == 0
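These test snippets omit their module-level imports. A minimal header that would make Example #1 and the similar snippets that follow self-contained might look like the sketch below; the exact import paths depend on the EthicML version, so treat this as an assumption rather than the suite's actual header.

# Assumed imports for the snippets in this section; exact paths may differ between
# EthicML versions. Fixtures such as temp_dir and toy_train_test, and the NPZ
# constant, are expected to come from the surrounding test module.
import numpy as np
import pandas as pd
import pytest

import ethicml as em
from ethicml import LR, DPFlip, InAlgorithm, PostAlgorithm, Prediction, ProbPos, TrainValPair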
Example #2
def test_predictions_loaded(temp_dir) -> None:
    """Test that predictions can be saved and loaded."""
    preds = Prediction(hard=pd.Series([1]))
    preds.to_npz(temp_dir / NPZ)
    loaded = Prediction.from_npz(temp_dir / NPZ)
    pd.testing.assert_series_equal(preds.hard, loaded.hard,
                                   check_dtype=False)  # type: ignore[call-arg]
Example #3
def test_post_sep_fit_pred(
    toy_train_test: TrainValPair, post_model: PostAlgorithm, name: str, num_pos: int
) -> None:
    """Test the dem par flipping method."""
    train, test = toy_train_test
    train_test = em.concat_tt([train, test], ignore_index=True)

    in_model: InAlgorithm = LR()
    assert in_model is not None
    assert in_model.name == "Logistic Regression (C=1.0)"

    predictions: Prediction = in_model.run(train, train_test)

    # separate out predictions on the train set and predictions on the test set
    pred_train = predictions.hard.iloc[: train.y.shape[0]]
    pred_test = predictions.hard.iloc[train.y.shape[0] :].reset_index(drop=True)
    assert np.count_nonzero(pred_test.values == 1) == 44
    assert np.count_nonzero(pred_test.values == 0) == 36

    assert post_model.name == name
    fair_model = post_model.fit(Prediction(pred_train), train)
    fair_preds = fair_model.predict(Prediction(pred_test), test)
    assert np.count_nonzero(fair_preds.hard.values == 1) == num_pos
    assert np.count_nonzero(fair_preds.hard.values == 0) == len(fair_preds) - num_pos
    diffs = em.diff_per_sensitive_attribute(
        em.metric_per_sensitive_attribute(fair_preds, test, ProbPos())
    )
    if isinstance(post_model, DPFlip):
        for diff in diffs.values():
            assert pytest.approx(diff, abs=1e-2) == 0
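Example #3 expects pytest to supply post_model, name, and num_pos. A hypothetical parametrization in this spirit is sketched below; only the DPFlip row is grounded in the counts from Example #1, and the set of models in the real suite may differ.

# Hypothetical parametrization for test_post_sep_fit_pred (not taken from the real suite).
@pytest.mark.parametrize(
    "post_model,name,num_pos",
    [
        (DPFlip(), "DemPar. Post Process", 57),  # counts match Example #1
        # ...further post-processing algorithms and their expected positive counts
    ],
)
def test_post_sep_fit_pred(
    toy_train_test: TrainValPair, post_model: PostAlgorithm, name: str, num_pos: int
) -> None:
    ...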
Example #4
def test_dependence_measures_adult() -> None:
    """Test dependence measures."""
    data = load_data(em.adult(split="Sex"))
    train_percentage = 0.75
    unbalanced, balanced, _ = BalancedTestSplit(train_percentage=train_percentage)(data)

    fair_prediction = Prediction(hard=balanced.y["salary_>50K"])  # predict the balanced label
    unfair_prediction = Prediction(hard=unbalanced.y["salary_>50K"])  # predict the normal label
    extremely_unfair_prediction = Prediction(hard=unbalanced.s["sex_Male"])  # predict s

    # measure the dependence between s and the prediction in several ways
    assert _compute_di(fair_prediction, balanced) == approx(1, abs=1e-15)
    assert _compute_di(unfair_prediction, unbalanced) == approx(0.364, abs=3e-3)
    assert _compute_di(extremely_unfair_prediction, unbalanced) == approx(0, abs=3e-3)
    assert _compute_inv_cv(fair_prediction, balanced) == approx(0, abs=1e-15)
    assert _compute_inv_cv(unfair_prediction, unbalanced) == approx(0.199, abs=3e-3)
    assert _compute_inv_cv(extremely_unfair_prediction, unbalanced) == approx(1, abs=3e-3)
    nmi = NMI()
    assert nmi.score(fair_prediction, balanced) == approx(0, abs=1e-15)
    assert nmi.score(unfair_prediction, unbalanced) == approx(0.0432, abs=3e-4)
    assert nmi.score(extremely_unfair_prediction, unbalanced) == approx(1, abs=3e-4)
    yanovich = Yanovich()
    assert yanovich.score(fair_prediction, balanced) == approx(0, abs=1e-15)
    assert yanovich.score(unfair_prediction, unbalanced) == approx(0.0396, abs=3e-4)
    assert yanovich.score(extremely_unfair_prediction, unbalanced) == approx(1, abs=3e-4)
    renyi = RenyiCorrelation()
    assert renyi.score(fair_prediction, balanced) == approx(0, abs=1e-15)
    assert renyi.score(unfair_prediction, unbalanced) == approx(0.216, abs=3e-4)
    assert renyi.score(extremely_unfair_prediction, unbalanced) == approx(1, abs=3e-4)
Example #5
def test_dependence_measures(simple_data: DataTuple) -> None:
    """Test dependence measures."""
    train_percentage = 0.75
    unbalanced, balanced, _ = BalancedTestSplit(train_percentage=train_percentage)(simple_data)

    fair_prediction = Prediction(hard=balanced.y["y"])  # predict the balanced label
    unfair_prediction = Prediction(hard=unbalanced.y["y"])  # predict the normal label
    extremely_unfair_prediction = Prediction(hard=unbalanced.s["s"])  # predict s

    # measure the dependence between s and the prediction in several ways
    assert _compute_di(fair_prediction, balanced) == approx(1, abs=1e-15)
    assert _compute_di(unfair_prediction, unbalanced) == approx(0.602, abs=3e-3)
    assert _compute_di(extremely_unfair_prediction, unbalanced) == approx(0, abs=3e-3)
    assert _compute_inv_cv(fair_prediction, balanced) == approx(0, abs=1e-15)
    assert _compute_inv_cv(unfair_prediction, unbalanced) == approx(0.265, abs=3e-3)
    assert _compute_inv_cv(extremely_unfair_prediction, unbalanced) == approx(1, abs=3e-3)
    nmi = NMI()
    assert nmi.score(fair_prediction, balanced) == approx(0, abs=1e-15)
    assert nmi.score(unfair_prediction, unbalanced) == approx(0.0437, abs=3e-4)
    assert nmi.score(extremely_unfair_prediction, unbalanced) == approx(1, abs=3e-4)
    yanovich = Yanovich()
    assert yanovich.score(fair_prediction, balanced) == approx(0, abs=1e-15)
    assert yanovich.score(unfair_prediction, unbalanced) == approx(0.0702, abs=3e-4)
    assert yanovich.score(extremely_unfair_prediction, unbalanced) == approx(1, abs=3e-4)
    renyi = RenyiCorrelation()
    assert renyi.score(fair_prediction, balanced) == approx(0, abs=1e-15)
    assert renyi.score(unfair_prediction, unbalanced) == approx(0.234, abs=3e-4)
    assert renyi.score(extremely_unfair_prediction, unbalanced) == approx(1, abs=3e-4)
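The helpers _compute_di and _compute_inv_cv are private to the test module and not shown here. Under the assumption that they wrap EthicML's per-sensitive-attribute positive rates, they plausibly reduce to the following sketch of the disparate-impact ratio and the inverted CV difference; the names and exact normalisation are assumptions, not the real implementation.

# Sketch only: what the DI / inverse-CV helpers presumably measure, built from
# public EthicML metrics. The actual private helpers may be implemented differently.
def _compute_di_sketch(preds: Prediction, actual: DataTuple) -> float:
    """Disparate impact: min/max ratio of positive rates across sensitive groups."""
    rates = list(em.metric_per_sensitive_attribute(preds, actual, em.ProbPos()).values())
    return min(rates) / max(rates)


def _compute_inv_cv_sketch(preds: Prediction, actual: DataTuple) -> float:
    """Inverted CV score: absolute difference of positive rates across sensitive groups."""
    rates = list(em.metric_per_sensitive_attribute(preds, actual, em.ProbPos()).values())
    return abs(rates[0] - rates[1])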
Example #6
def compute_metrics(
    predictions: em.Prediction,
    actual: em.DataTuple,
    s_dim: int,
) -> dict[str, float]:
    """Compute accuracy and fairness metrics and log them.

    Args:
        args: args object
        predictions: predictions in a format that is compatible with EthicML
        actual: labels for the predictions
        model_name: name of the model used
        step: step of training (needed for logging to W&B)
        s_dim: dimension of s
        exp_name: name of the experiment
        save_summary: if True, a summary will be saved to wandb
        use_wandb: whether to use wandb at all
        additional_entries: entries that should go with in the summary
    Returns:
        dictionary with the computed metrics
    """

    predictions._info = {}
    metrics = em.run_metrics(
        predictions,
        actual,
        metrics=[em.Accuracy(), em.TPR(), em.TNR(), em.RenyiCorrelation()],
        per_sens_metrics=[em.Accuracy(), em.ProbPos(), em.TPR(), em.TNR()],
        diffs_and_ratios=s_dim < 4,  # this just gets too much with higher s dim
    )
    # replace the slash; it's causing problems
    metrics = {k.replace("/", "÷"): v for k, v in metrics.items()}
    print_metrics(metrics)
    return metrics
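A hypothetical call to this helper, using EthicML's toy dataset and a plain logistic-regression baseline; the dataset, split, and s_dim=1 are illustrative assumptions, not taken from the surrounding code.

# Hypothetical usage of compute_metrics on EthicML's toy data.
train, test = em.train_test_split(load_data(em.toy()))
preds = em.LR().run(train, test)
scores = compute_metrics(preds, test, s_dim=1)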
Example #7
def compute_metrics(
    cfg: BaseArgs,
    predictions: em.Prediction,
    actual: em.DataTuple,
    exp_name: str,
    model_name: str,
    step: int,
    save_to_csv: Optional[Path] = None,
    results_csv: str = "",
    use_wandb: bool = False,
    additional_entries: Optional[Mapping[str, float]] = None,
) -> Dict[str, float]:
    """Compute accuracy and fairness metrics and log them.

    Args:
        args: args object
        predictions: predictions in a format that is compatible with EthicML
        actual: labels for the predictions
        exp_name: name of the experiment
        model_name: name of the model used
        step: step of training (needed for logging to W&B)
        save_to_csv: if a path is given, the results are saved to a CSV file
        results_csv: name of the CSV file
    Returns:
        dictionary with the computed metrics
    """

    predictions._info = {}
    metrics = em.run_metrics(
        predictions,
        actual,
        metrics=[em.Accuracy(), em.TPR(), em.TNR(), em.RenyiCorrelation()],
        per_sens_metrics=[em.Accuracy(), em.ProbPos(), em.TPR(), em.TNR()],
        diffs_and_ratios=cfg.misc._s_dim < 4,  # this just gets too much with higher s dim
    )
    # replace the slash; it's causing problems
    metrics = {k.replace("/", "÷"): v for k, v in metrics.items()}

    if use_wandb:
        wandb_log(cfg.misc, {f"{k} ({model_name})": v for k, v in metrics.items()}, step=step)

    if save_to_csv is not None:
        # full_name = f"{args.dataset}_{exp_name}"
        # exp_name += "_s" if pred_s else "_y"
        # if hasattr(args, "eval_on_recon"):
        #     exp_name += "_on_recons" if args.eval_on_recon else "_on_encodings"

        manual_entries = {
            "seed": str(getattr(cfg.misc, "seed", cfg.misc.data_split_seed)),
            "data": exp_name,
            "method": f'"{model_name}"',
            "wandb_url": str(wandb.run.get_url())
            if use_wandb and cfg.misc.use_wandb
            else "(None)",
        }

        external = additional_entries or {}

        if results_csv:
            assert isinstance(save_to_csv, Path)
            save_to_csv.mkdir(exist_ok=True, parents=True)
            results = {**metrics, **external}

            results_path = save_to_csv / f"{cfg.data.dataset.name}_{model_name}_{results_csv}"
            values = ",".join(
                list(manual_entries.values()) +
                [str(v) for v in results.values()])
            if not results_path.is_file():
                with results_path.open("w") as f:
                    # ========= header =========
                    f.write(",".join(
                        list(manual_entries) + [str(k)
                                                for k in results]) + "\n")
                    f.write(values + "\n")
            else:
                with results_path.open("a") as f:  # append to existing file
                    f.write(values + "\n")
            log.info(f"Results have been written to {results_path.resolve()}")
        if use_wandb:
            for metric_name, value in metrics.items():
                wandb.run.summary[f"{model_name}_{metric_name}"] = value
            # external metrics are without prefix
            for metric_name, value in external.items():
                wandb.run.summary[metric_name] = value

    log.info(f"Results for {exp_name} ({model_name}):")
    print_metrics({f"{k} ({model_name})": v for k, v in metrics.items()})
    log.info("")  # empty line
    return metrics
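For reference, a rough sketch of the CSV rows this function appends: the columns are the manual entries followed by the metric keys and any additional entries. Everything below is a placeholder, not real output.

# Illustrative CSV shape only (placeholders, not real output):
#
#   seed,data,method,wandb_url,<metric name 1>,<metric name 2>,...,<additional entry names>
#   <seed>,<exp_name>,"<model_name>",<wandb url or (None)>,<metric values>,...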
Example #8
def test_predictions_info_loaded_bad(temp_dir) -> None:
    """Test that predictions can be saved and loaded."""
    preds = Prediction(hard=pd.Series([1]),
                       info={"sample": np.array([1, 2, 3])})  # type: ignore
    with pytest.raises(AssertionError):
        preds.to_npz(temp_dir / NPZ)
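For contrast, a sketch of a round trip that should succeed, assuming the assertion in Example #8 is there to restrict info to scalar float values; this is modelled on Example #2 and is not taken from the real suite.

# Sketch of a valid round trip: a scalar float info value is assumed to serialise fine.
def test_predictions_info_loaded(temp_dir) -> None:
    preds = Prediction(hard=pd.Series([1]), info={"sample": 3.0})
    preds.to_npz(temp_dir / NPZ)
    loaded = Prediction.from_npz(temp_dir / NPZ)
    assert loaded.info["sample"] == 3.0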