    def inference_epoch_end(self, outputs: EPOCH_OUTPUT, stage: Stage) -> Dict[str, float]:
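        # `aggregate_over_epoch` and `hard_prediction` are helpers defined
        # elsewhere in the project: the former gathers the per-step outputs
        # for the given key, the latter converts logits to hard class labels.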
        targets_all = aggregate_over_epoch(outputs=outputs, metric="targets")
        subgroup_inf_all = aggregate_over_epoch(outputs=outputs,
                                                metric="subgroup_inf")
        logits_y_all = aggregate_over_epoch(outputs=outputs, metric="logits_y")

        preds_y_all = hard_prediction(logits_y_all)

        dt = em.DataTuple(
            x=pd.DataFrame(
                torch.rand_like(subgroup_inf_all).detach().cpu().numpy(),
                columns=["x0"],
            ),
            s=pd.DataFrame(subgroup_inf_all.detach().cpu().numpy(),
                           columns=["s"]),
            y=pd.DataFrame(targets_all.detach().cpu().numpy(), columns=["y"]),
        )

        return em.run_metrics(
            predictions=em.Prediction(
                hard=pd.Series(preds_y_all.detach().cpu().numpy())),
            actual=dt,
            metrics=[em.Accuracy(),
                     em.RenyiCorrelation(),
                     em.Yanovich()],
            per_sens_metrics=[em.Accuracy(),
                              em.ProbPos(),
                              em.TPR()],
        )
    def _inference_epoch_end(self, output_results: List[Dict[str, Tensor]],
                             stage: str) -> None:
        all_y = torch.cat([_r["y"] for _r in output_results], 0)
        all_s = torch.cat([_r["s"] for _r in output_results], 0)
        all_preds = torch.cat([_r["preds"] for _r in output_results], 0)

        dt = em.DataTuple(
            x=pd.DataFrame(torch.rand_like(all_s,
                                           dtype=float).detach().cpu().numpy(),
                           columns=["x0"]),
            s=pd.DataFrame(all_s.detach().cpu().numpy(), columns=["s"]),
            y=pd.DataFrame(all_y.detach().cpu().numpy(), columns=["y"]),
        )

        results = em.run_metrics(
            predictions=em.Prediction(
                hard=pd.Series(all_preds.detach().cpu().numpy())),
            actual=dt,
            metrics=[em.Accuracy(),
                     em.RenyiCorrelation(),
                     em.Yanovich()],
            per_sens_metrics=[em.Accuracy(),
                              em.ProbPos(),
                              em.TPR()],
        )

        tm_acc = self.val_acc if stage == "val" else self.test_acc
        acc = tm_acc.compute().item()
        results_dict = {f"{stage}/acc": acc}
        results_dict.update(
            {f"{stage}/{self.target}_{k}": v
             for k, v in results.items()})

        self.log_dict(results_dict)
    def inference_epoch_end(self, outputs: EPOCH_OUTPUT, stage: Stage) -> Dict[str, float]:
        targets_all = aggregate_over_epoch(outputs=outputs, metric="targets")
        subgroup_inf_all = aggregate_over_epoch(outputs=outputs, metric="subgroup_inf")
        preds_all = aggregate_over_epoch(outputs=outputs, metric="preds")

        mean_preds = preds_all.mean(-1)
        mean_preds_s0 = preds_all[subgroup_inf_all == 0].mean(-1)
        mean_preds_s1 = preds_all[subgroup_inf_all == 1].mean(-1)

        dt = em.DataTuple(
            x=pd.DataFrame(
                torch.rand_like(subgroup_inf_all, dtype=torch.float).detach().cpu().numpy(),
                columns=["x0"],
            ),
            s=pd.DataFrame(subgroup_inf_all.detach().cpu().numpy(), columns=["s"]),
            y=pd.DataFrame(targets_all.detach().cpu().numpy(), columns=["y"]),
        )

        results_dict = em.run_metrics(
            predictions=em.Prediction(hard=pd.Series((preds_all > 0).detach().cpu().numpy())),
            actual=dt,
            metrics=[em.Accuracy(), em.RenyiCorrelation(), em.Yanovich()],
            per_sens_metrics=[em.Accuracy(), em.ProbPos(), em.TPR()],
        )

        results_dict.update(
            {
                "DP_Gap": float((mean_preds_s0 - mean_preds_s1).abs().item()),
                "mean_pred": float(mean_preds.item()),
            }
        )
        return results_dict
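The three epoch-end hooks above share one pattern: collect targets, sensitive attributes and predictions over the epoch, wrap them in EthicML's DataTuple and Prediction, and pass them to em.run_metrics. Below is a minimal, self-contained sketch of that pattern on random toy data; the imports (including the `ethicml as em` alias) and the sample construction are assumptions, since the original snippets rely on module-level imports and project helpers that are not shown here.

import pandas as pd
import torch
import ethicml as em  # assumed alias used throughout these snippets

n = 32
targets = torch.randint(0, 2, (n,))    # binary labels y
subgroups = torch.randint(0, 2, (n,))  # sensitive attribute s
preds = torch.randint(0, 2, (n,))      # hard predictions

dt = em.DataTuple(
    x=pd.DataFrame(torch.rand(n).numpy(), columns=["x0"]),  # placeholder features
    s=pd.DataFrame(subgroups.numpy(), columns=["s"]),
    y=pd.DataFrame(targets.numpy(), columns=["y"]),
)
results = em.run_metrics(
    predictions=em.Prediction(hard=pd.Series(preds.numpy())),
    actual=dt,
    metrics=[em.Accuracy()],
    per_sens_metrics=[em.Accuracy(), em.ProbPos(), em.TPR()],
)
print(results)  # mapping of metric name to float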
Example #4
def test_run_alg_suite():
    """Test run alg suite."""
    dataset = em.adult(split="Race-Binary")
    datasets: List[em.Dataset] = [dataset, em.toy()]
    preprocess_models: List[em.PreAlgorithm] = [em.Upsampler()]
    inprocess_models: List[em.InAlgorithm] = [em.LR(), em.SVM(kernel="linear")]
    postprocess_models: List[em.PostAlgorithm] = []
    metrics: List[em.Metric] = [em.Accuracy(), em.CV()]
    per_sens_metrics: List[em.Metric] = [em.Accuracy(), em.TPR()]
    parallel_results = em.evaluate_models_async(
        datasets=datasets,
        preprocess_models=preprocess_models,
        inprocess_models=inprocess_models,
        postprocess_models=postprocess_models,
        metrics=metrics,
        per_sens_metrics=per_sens_metrics,
        repeats=1,
        test_mode=True,
        topic="pytest",
    )
    results = em.evaluate_models(
        datasets,
        preprocess_models,
        inprocess_models,
        postprocess_models,
        metrics,
        per_sens_metrics,
        repeats=1,
        test_mode=True,
        delete_prev=True,
        topic="pytest",
    )
    pd.testing.assert_frame_equal(parallel_results, results, check_like=True)

    files = os.listdir(Path(".") / "results")
    file_names = [
        "pytest_Adult Race-Binary_Upsample uniform.csv",
        "pytest_Adult Race-Binary_no_transform.csv",
        "pytest_Toy_Upsample uniform.csv",
        "pytest_Toy_no_transform.csv",
    ]
    assert len(files) == 4
    assert sorted(files) == file_names

    for file in file_names:
        written_file = pd.read_csv(Path(f"./results/{file}"))
        assert (written_file["seed"][0], written_file["seed"][1]) == (0, 0)
        assert written_file.shape == (2, 16)

    reloaded = em.load_results("Adult Race-Binary", "Upsample uniform", "pytest")
    assert reloaded is not None
    read = pd.read_csv(Path(".") / "results" / "pytest_Adult Race-Binary_Upsample uniform.csv")
    read = read.set_index(["dataset", "scaler", "transform", "model", "split_id"])
    pd.testing.assert_frame_equal(reloaded, read)
Example #5
def test_run_alg_suite_wrong_metrics():
    """Test run alg suite wrong metrics."""
    datasets: List[em.Dataset] = [em.toy(), em.adult()]
    preprocess_models: List[em.PreAlgorithm] = [em.Upsampler()]
    inprocess_models: List[em.InAlgorithm] = [em.SVM(kernel="linear"), em.LR()]
    postprocess_models: List[em.PostAlgorithm] = []
    metrics: List[em.Metric] = [em.Accuracy(), em.CV()]
    per_sens_metrics: List[em.Metric] = [em.Accuracy(), em.TPR(), em.CV()]
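    # em.CV() itself compares the sensitive groups, so requesting it per
    # sensitive group is expected to raise MetricNotApplicable below.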
    with pytest.raises(em.MetricNotApplicable):
        em.evaluate_models(
            datasets,
            preprocess_models,
            inprocess_models,
            postprocess_models,
            metrics,
            per_sens_metrics,
            repeats=1,
            test_mode=True,
        )
Example #6
def compute_metrics(
    predictions: em.Prediction,
    actual: em.DataTuple,
    s_dim: int,
) -> dict[str, float]:
    """Compute accuracy and fairness metrics and log them.

    Args:
        args: args object
        predictions: predictions in a format that is compatible with EthicML
        actual: labels for the predictions
        model_name: name of the model used
        step: step of training (needed for logging to W&B)
        s_dim: dimension of s
        exp_name: name of the experiment
        save_summary: if True, a summary will be saved to wandb
        use_wandb: whether to use wandb at all
        additional_entries: entries that should go with in the summary
    Returns:
        dictionary with the computed metrics
    """

    predictions._info = {}
    metrics = em.run_metrics(
        predictions,
        actual,
        metrics=[em.Accuracy(),
                 em.TPR(),
                 em.TNR(),
                 em.RenyiCorrelation()],
        per_sens_metrics=[em.Accuracy(),
                          em.ProbPos(),
                          em.TPR(),
                          em.TNR()],
        diffs_and_ratios=s_dim < 4,  # this just gets too much with higher s dim
    )
    # replace the slash; it's causing problems
    metrics = {k.replace("/", "÷"): v for k, v in metrics.items()}
    print_metrics(metrics)
    return metrics
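A hypothetical usage of compute_metrics above, with a tiny hand-made DataTuple and Prediction. print_metrics is an external helper in the original project, so a stand-in stub is defined here purely for illustration.

def print_metrics(metrics: dict) -> None:
    # hypothetical stand-in for the project's real print_metrics helper
    for name, value in metrics.items():
        print(f"{name}: {value:.4f}")

toy_preds = em.Prediction(hard=pd.Series([0, 1, 1, 0]))
toy_actual = em.DataTuple(
    x=pd.DataFrame([[0.1], [0.2], [0.3], [0.4]], columns=["x0"]),
    s=pd.DataFrame([0, 0, 1, 1], columns=["s"]),
    y=pd.DataFrame([0, 1, 0, 1], columns=["y"]),
)
scores = compute_metrics(toy_preds, toy_actual, s_dim=1)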
Example #7
def test_run_alg_suite_scaler():
    """Test run alg suite."""
    dataset = em.adult(split="Race-Binary")
    datasets: List[em.Dataset] = [dataset, em.toy()]
    preprocess_models: List[em.PreAlgorithm] = [em.Upsampler()]
    inprocess_models: List[em.InAlgorithm] = [em.LR(), em.SVM(kernel="linear")]
    postprocess_models: List[em.PostAlgorithm] = []
    metrics: List[em.Metric] = [em.Accuracy(), em.CV()]
    per_sens_metrics: List[em.Metric] = [em.Accuracy(), em.TPR()]
    results_no_scaler = em.evaluate_models(
        datasets,
        preprocess_models,
        inprocess_models,
        postprocess_models,
        metrics,
        per_sens_metrics,
        repeats=1,
        test_mode=True,
        delete_prev=True,
        topic="pytest",
    )
    results_scaler = em.evaluate_models(
        datasets,
        preprocess_models,
        inprocess_models,
        postprocess_models,
        metrics,
        per_sens_metrics,
        scaler=StandardScaler(),
        repeats=1,
        test_mode=True,
        delete_prev=True,
        topic="pytest",
    )
    with pytest.raises(AssertionError):
        pd.testing.assert_frame_equal(results_scaler,
                                      results_no_scaler,
                                      check_like=True)
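The scaler test assumes a module-level import that is not shown in the snippet; presumably:

from sklearn.preprocessing import StandardScaler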
Example #8
def compute_metrics(
    cfg: BaseArgs,
    predictions: em.Prediction,
    actual: em.DataTuple,
    exp_name: str,
    model_name: str,
    step: int,
    save_to_csv: Optional[Path] = None,
    results_csv: str = "",
    use_wandb: bool = False,
    additional_entries: Optional[Mapping[str, float]] = None,
) -> Dict[str, float]:
    """Compute accuracy and fairness metrics and log them.

    Args:
        args: args object
        predictions: predictions in a format that is compatible with EthicML
        actual: labels for the predictions
        exp_name: name of the experiment
        model_name: name of the model used
        step: step of training (needed for logging to W&B)
        save_to_csv: if a path is given, the results are saved to a CSV file
        results_csv: name of the CSV file
    Returns:
        dictionary with the computed metrics
    """

    predictions._info = {}
    metrics = em.run_metrics(
        predictions,
        actual,
        metrics=[em.Accuracy(),
                 em.TPR(),
                 em.TNR(),
                 em.RenyiCorrelation()],
        per_sens_metrics=[em.Accuracy(),
                          em.ProbPos(),
                          em.TPR(),
                          em.TNR()],
        diffs_and_ratios=cfg.misc._s_dim < 4,  # this just gets too much with higher s dim
    )
    # replace the slash; it's causing problems
    metrics = {k.replace("/", "÷"): v for k, v in metrics.items()}

    if use_wandb:
        wandb_log(cfg.misc,
                  {f"{k} ({model_name})": v
                   for k, v in metrics.items()},
                  step=step)

    if save_to_csv is not None:
        # full_name = f"{args.dataset}_{exp_name}"
        # exp_name += "_s" if pred_s else "_y"
        # if hasattr(args, "eval_on_recon"):
        #     exp_name += "_on_recons" if args.eval_on_recon else "_on_encodings"

        manual_entries = {
            "seed": str(getattr(cfg.misc, "seed", cfg.misc.data_split_seed)),
            "data": exp_name,
            "method": f'"{model_name}"',
            "wandb_url": (
                str(wandb.run.get_url()) if use_wandb and cfg.misc.use_wandb else "(None)"
            ),
        }

        external = additional_entries or {}

        if results_csv:
            assert isinstance(save_to_csv, Path)
            save_to_csv.mkdir(exist_ok=True, parents=True)
            results = {**metrics, **external}

            results_path = save_to_csv / f"{cfg.data.dataset.name}_{model_name}_{results_csv}"
            values = ",".join(
                list(manual_entries.values()) +
                [str(v) for v in results.values()])
            if not results_path.is_file():
                with results_path.open("w") as f:
                    # ========= header =========
                    f.write(",".join(
                        list(manual_entries) + [str(k)
                                                for k in results]) + "\n")
                    f.write(values + "\n")
            else:
                with results_path.open("a") as f:  # append to existing file
                    f.write(values + "\n")
            log.info(f"Results have been written to {results_path.resolve()}")
        if use_wandb:
            for metric_name, value in metrics.items():
                wandb.run.summary[f"{model_name}_{metric_name}"] = value
            # external metrics are without prefix
            for metric_name, value in external.items():
                wandb.run.summary[metric_name] = value

    log.info(f"Results for {exp_name} ({model_name}):")
    print_metrics({f"{k} ({model_name})": v for k, v in metrics.items()})
    log.info("")  # empty line
    return metrics
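The CSV handling above writes a header row the first time the file is created and appends plain rows afterwards. A minimal standalone sketch of that pattern, with a hypothetical path and entries:

from pathlib import Path

results_path = Path("results") / "example_metrics.csv"  # hypothetical file
results_path.parent.mkdir(exist_ok=True, parents=True)
row = {"seed": "0", "data": "demo", "Accuracy": "0.85"}  # hypothetical entries
values = ",".join(row.values())
if not results_path.is_file():
    with results_path.open("w") as f:
        f.write(",".join(row) + "\n")  # header: the keys, comma-separated
        f.write(values + "\n")
else:
    with results_path.open("a") as f:  # append to the existing file
        f.write(values + "\n")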
Example #9
def test_run_alg_suite_no_pipeline():
    """Run alg suite while avoiding the 'fair pipeline'."""
    datasets: List[em.Dataset] = [em.toy(), em.adult()]
    preprocess_models: List[em.PreAlgorithm] = [em.Upsampler()]
    inprocess_models: List[em.InAlgorithm] = [
        em.Kamiran(classifier="LR"), em.LR()
    ]
    postprocess_models: List[em.PostAlgorithm] = []
    metrics: List[em.Metric] = [em.Accuracy(), em.CV()]
    per_sens_metrics: List[em.Metric] = [em.Accuracy(), em.TPR()]

    parallel_results = em.evaluate_models_async(
        datasets=datasets,
        preprocess_models=preprocess_models,
        inprocess_models=inprocess_models,
        postprocess_models=postprocess_models,
        metrics=metrics,
        per_sens_metrics=per_sens_metrics,
        repeats=1,
        test_mode=True,
        topic="pytest",
        fair_pipeline=False,
    )
    results = em.evaluate_models(
        datasets,
        preprocess_models,
        inprocess_models,
        postprocess_models,
        metrics,
        per_sens_metrics,
        repeats=1,
        test_mode=True,
        topic="pytest",
        fair_pipeline=False,
        delete_prev=True,
    )
    pd.testing.assert_frame_equal(parallel_results, results, check_like=True)

    num_datasets = 2
    num_preprocess = 1
    num_fair_inprocess = 1
    num_unfair_inprocess = 1
    expected_num = num_datasets * (num_fair_inprocess +
                                   (num_preprocess + 1) * num_unfair_inprocess)
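    # 2 datasets * (1 fair in-process run + (1 pre-process + 1 raw) * 1 unfair
    # in-process run) = 6 expected results.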
    assert len(results) == expected_num

    kc_name = "Kamiran & Calders LR"

    assert len(em.filter_results(results, [kc_name])) == 2  # results for Toy and Adult
    # Kamiran, LR and Upsampler:
    assert len(em.filter_results(results, ["Toy"], index="dataset")) == 3
    different_name = em.filter_and_map_results(results, {kc_name: "Kamiran & Calders"})
    assert len(em.filter_results(different_name, [kc_name])) == 0
    assert len(em.filter_results(different_name, ["Kamiran & Calders"])) == 2

    pd.testing.assert_frame_equal(
        em.filter_results(results, [kc_name]),
        results.query(f"model == '{kc_name}'"),
    )