def test_dp_flip_inverted_s(toy_train_test: TrainValPair) -> None:
    """Test the demographic parity flipping method with an inverted sensitive attribute."""
    train, test = toy_train_test
    train = train.replace(s=1 - train.s)
    test = test.replace(s=1 - test.s)
    train_test = em.concat_tt([train, test], ignore_index=True)

    in_model: InAlgorithm = LR()
    assert in_model is not None
    assert in_model.name == "Logistic Regression (C=1.0)"

    predictions: Prediction = in_model.run(train, train_test)

    # separate out predictions on train set and predictions on test set
    pred_train = predictions.hard.iloc[: train.y.shape[0]]
    pred_test = predictions.hard.iloc[train.y.shape[0] :].reset_index(drop=True)
    assert np.count_nonzero(pred_test.values == 1) == 44
    assert np.count_nonzero(pred_test.values == 0) == 36

    post_model: PostAlgorithm = DPFlip()
    assert post_model.name == "DemPar. Post Process"
    fair_preds = post_model.run(Prediction(pred_train), train, Prediction(pred_test), test)
    assert np.count_nonzero(fair_preds.hard.values == 1) == 57
    assert np.count_nonzero(fair_preds.hard.values == 0) == 23

    diffs = em.diff_per_sensitive_attribute(
        em.metric_per_sensitive_attribute(fair_preds, test, ProbPos())
    )
    for diff in diffs.values():
        assert pytest.approx(diff, abs=1e-2) == 0
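# A minimal sketch of the `toy_train_test` fixture assumed by the test above. It is an
# assumption that the fixture loads EthicML's toy dataset and splits it with a fixed
# seed; the real conftest may use a different train percentage or seed.
@pytest.fixture(scope="session")
def toy_train_test() -> TrainValPair:
    """Load the toy dataset and split it into a train/test pair (illustrative only)."""
    data: em.DataTuple = em.toy().load()
    train, test = em.train_test_split(data, train_percentage=0.8, random_seed=0)
    return train, test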
def test_predictions_loaded(temp_dir) -> None:
    """Test that predictions can be saved and loaded."""
    preds = Prediction(hard=pd.Series([1]))
    preds.to_npz(temp_dir / NPZ)
    loaded = Prediction.from_npz(temp_dir / NPZ)
    pd.testing.assert_series_equal(preds.hard, loaded.hard, check_dtype=False)  # type: ignore[call-arg]
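# The `temp_dir` fixture and the `NPZ` filename used above are not shown in this
# section; the bodies below are illustrative assumptions built on pytest's built-in
# `tmp_path` fixture, not the definitions from the real conftest.
from pathlib import Path

NPZ = "test.npz"  # hypothetical filename used for the round-trip tests


@pytest.fixture
def temp_dir(tmp_path: Path) -> Path:
    """Provide a temporary directory for saving prediction files (illustrative only)."""
    return tmp_path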
def test_post_sep_fit_pred(
    toy_train_test: TrainValPair, post_model: PostAlgorithm, name: str, num_pos: int
) -> None:
    """Test the separate fit and predict steps of post-processing methods."""
    train, test = toy_train_test
    train_test = em.concat_tt([train, test], ignore_index=True)

    in_model: InAlgorithm = LR()
    assert in_model is not None
    assert in_model.name == "Logistic Regression (C=1.0)"

    predictions: Prediction = in_model.run(train, train_test)

    # separate out predictions on train set and predictions on test set
    pred_train = predictions.hard.iloc[: train.y.shape[0]]
    pred_test = predictions.hard.iloc[train.y.shape[0] :].reset_index(drop=True)
    assert np.count_nonzero(pred_test.values == 1) == 44
    assert np.count_nonzero(pred_test.values == 0) == 36

    assert post_model.name == name
    fair_model = post_model.fit(Prediction(pred_train), train)
    fair_preds = fair_model.predict(Prediction(pred_test), test)
    assert np.count_nonzero(fair_preds.hard.values == 1) == num_pos
    assert np.count_nonzero(fair_preds.hard.values == 0) == len(fair_preds) - num_pos

    diffs = em.diff_per_sensitive_attribute(
        em.metric_per_sensitive_attribute(fair_preds, test, ProbPos())
    )
    if isinstance(post_model, DPFlip):
        for diff in diffs.values():
            assert pytest.approx(diff, abs=1e-2) == 0
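# The (post_model, name, num_pos) triples for the test above are presumably supplied by
# a pytest parametrisation along these lines. DPFlip and its display name appear in the
# tests in this file, but the expected positive count below is a placeholder, not
# necessarily the value used in the real suite.
POST_MODEL_PARAMS = [
    pytest.param(DPFlip(), "DemPar. Post Process", 57, id="dp_flip"),  # placeholder count
]
# ...applied as:  @pytest.mark.parametrize("post_model,name,num_pos", POST_MODEL_PARAMS)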
def test_dependence_measures_adult() -> None:
    """Test dependence measures on the Adult dataset."""
    data = load_data(em.adult(split="Sex"))
    train_percentage = 0.75
    unbalanced, balanced, _ = BalancedTestSplit(train_percentage=train_percentage)(data)

    fair_prediction = Prediction(hard=balanced.y["salary_>50K"])  # predict the balanced label
    unfair_prediction = Prediction(hard=unbalanced.y["salary_>50K"])  # predict the normal label
    extremely_unfair_prediction = Prediction(hard=unbalanced.s["sex_Male"])  # predict s

    # measure the dependence between s and the prediction in several ways
    assert _compute_di(fair_prediction, balanced) == approx(1, abs=1e-15)
    assert _compute_di(unfair_prediction, unbalanced) == approx(0.364, abs=3e-3)
    assert _compute_di(extremely_unfair_prediction, unbalanced) == approx(0, abs=3e-3)
    assert _compute_inv_cv(fair_prediction, balanced) == approx(0, abs=1e-15)
    assert _compute_inv_cv(unfair_prediction, unbalanced) == approx(0.199, abs=3e-3)
    assert _compute_inv_cv(extremely_unfair_prediction, unbalanced) == approx(1, abs=3e-3)

    nmi = NMI()
    assert nmi.score(fair_prediction, balanced) == approx(0, abs=1e-15)
    assert nmi.score(unfair_prediction, unbalanced) == approx(0.0432, abs=3e-4)
    assert nmi.score(extremely_unfair_prediction, unbalanced) == approx(1, abs=3e-4)

    yanovich = Yanovich()
    assert yanovich.score(fair_prediction, balanced) == approx(0, abs=1e-15)
    assert yanovich.score(unfair_prediction, unbalanced) == approx(0.0396, abs=3e-4)
    assert yanovich.score(extremely_unfair_prediction, unbalanced) == approx(1, abs=3e-4)

    renyi = RenyiCorrelation()
    assert renyi.score(fair_prediction, balanced) == approx(0, abs=1e-15)
    assert renyi.score(unfair_prediction, unbalanced) == approx(0.216, abs=3e-4)
    assert renyi.score(extremely_unfair_prediction, unbalanced) == approx(1, abs=3e-4)
def test_dependence_measures(simple_data: DataTuple) -> None:
    """Test dependence measures."""
    train_percentage = 0.75
    unbalanced, balanced, _ = BalancedTestSplit(train_percentage=train_percentage)(simple_data)

    fair_prediction = Prediction(hard=balanced.y["y"])  # predict the balanced label
    unfair_prediction = Prediction(hard=unbalanced.y["y"])  # predict the normal label
    extremely_unfair_prediction = Prediction(hard=unbalanced.s["s"])  # predict s

    # measure the dependence between s and the prediction in several ways
    assert _compute_di(fair_prediction, balanced) == approx(1, abs=1e-15)
    assert _compute_di(unfair_prediction, unbalanced) == approx(0.602, abs=3e-3)
    assert _compute_di(extremely_unfair_prediction, unbalanced) == approx(0, abs=3e-3)
    assert _compute_inv_cv(fair_prediction, balanced) == approx(0, abs=1e-15)
    assert _compute_inv_cv(unfair_prediction, unbalanced) == approx(0.265, abs=3e-3)
    assert _compute_inv_cv(extremely_unfair_prediction, unbalanced) == approx(1, abs=3e-3)

    nmi = NMI()
    assert nmi.score(fair_prediction, balanced) == approx(0, abs=1e-15)
    assert nmi.score(unfair_prediction, unbalanced) == approx(0.0437, abs=3e-4)
    assert nmi.score(extremely_unfair_prediction, unbalanced) == approx(1, abs=3e-4)

    yanovich = Yanovich()
    assert yanovich.score(fair_prediction, balanced) == approx(0, abs=1e-15)
    assert yanovich.score(unfair_prediction, unbalanced) == approx(0.0702, abs=3e-4)
    assert yanovich.score(extremely_unfair_prediction, unbalanced) == approx(1, abs=3e-4)

    renyi = RenyiCorrelation()
    assert renyi.score(fair_prediction, balanced) == approx(0, abs=1e-15)
    assert renyi.score(unfair_prediction, unbalanced) == approx(0.234, abs=3e-4)
    assert renyi.score(extremely_unfair_prediction, unbalanced) == approx(1, abs=3e-4)
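# The helpers `_compute_di` and `_compute_inv_cv` used by the two tests above are not
# shown in this section. A plausible reading, hedged as an assumption, is that the
# former is the disparate-impact ratio (smaller over larger positive rate across the
# sensitive groups) and the latter is one minus the CV score, i.e. the absolute gap in
# positive rates. Both sketches reuse only functions already used above.
def _compute_di(preds: Prediction, actual: DataTuple) -> float:
    """Disparate impact: ratio of the smaller to the larger positive rate (illustrative)."""
    per_sens = em.metric_per_sensitive_attribute(preds, actual, ProbPos())
    rates = list(per_sens.values())
    return min(rates) / max(rates)


def _compute_inv_cv(preds: Prediction, actual: DataTuple) -> float:
    """Inverse CV score: absolute difference in positive rates between groups (illustrative)."""
    per_sens = em.metric_per_sensitive_attribute(preds, actual, ProbPos())
    rates = list(per_sens.values())
    return abs(rates[0] - rates[1])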
def compute_metrics(
    predictions: em.Prediction,
    actual: em.DataTuple,
    s_dim: int,
) -> dict[str, float]:
    """Compute accuracy and fairness metrics and log them.

    Args:
        predictions: predictions in a format that is compatible with EthicML
        actual: labels for the predictions
        s_dim: dimension of s

    Returns:
        dictionary with the computed metrics
    """
    predictions._info = {}
    metrics = em.run_metrics(
        predictions,
        actual,
        metrics=[em.Accuracy(), em.TPR(), em.TNR(), em.RenyiCorrelation()],
        per_sens_metrics=[em.Accuracy(), em.ProbPos(), em.TPR(), em.TNR()],
        diffs_and_ratios=s_dim < 4,  # this just gets too much with higher s dim
    )
    # replace the slash; it's causing problems
    metrics = {k.replace("/", "÷"): v for k, v in metrics.items()}
    print_metrics(metrics)
    return metrics
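# A minimal usage sketch for compute_metrics, assuming a single binary sensitive
# attribute (s_dim=1) and an EthicML train/test pair like the ones in the tests above.
# `_example_scoring` is hypothetical and not part of the module; LR is assumed to be
# importable as in those tests.
def _example_scoring(train: em.DataTuple, test: em.DataTuple) -> dict[str, float]:
    """Run a logistic-regression baseline and score it (illustrative only)."""
    preds = LR().run(train, test)
    return compute_metrics(predictions=preds, actual=test, s_dim=1)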
def compute_metrics(
    cfg: BaseArgs,
    predictions: em.Prediction,
    actual: em.DataTuple,
    exp_name: str,
    model_name: str,
    step: int,
    save_to_csv: Optional[Path] = None,
    results_csv: str = "",
    use_wandb: bool = False,
    additional_entries: Optional[Mapping[str, float]] = None,
) -> Dict[str, float]:
    """Compute accuracy and fairness metrics and log them.

    Args:
        cfg: configuration object
        predictions: predictions in a format that is compatible with EthicML
        actual: labels for the predictions
        exp_name: name of the experiment
        model_name: name of the model used
        step: step of training (needed for logging to W&B)
        save_to_csv: if a path is given, the results are saved to a CSV file
        results_csv: name of the CSV file
        use_wandb: whether to log the metrics to W&B
        additional_entries: extra entries to include in the CSV and the W&B summary

    Returns:
        dictionary with the computed metrics
    """
    predictions._info = {}
    metrics = em.run_metrics(
        predictions,
        actual,
        metrics=[em.Accuracy(), em.TPR(), em.TNR(), em.RenyiCorrelation()],
        per_sens_metrics=[em.Accuracy(), em.ProbPos(), em.TPR(), em.TNR()],
        diffs_and_ratios=cfg.misc._s_dim < 4,  # this just gets too much with higher s dim
    )
    # replace the slash; it's causing problems
    metrics = {k.replace("/", "÷"): v for k, v in metrics.items()}

    if use_wandb:
        wandb_log(cfg.misc, {f"{k} ({model_name})": v for k, v in metrics.items()}, step=step)

    if save_to_csv is not None:
        # full_name = f"{args.dataset}_{exp_name}"
        # exp_name += "_s" if pred_s else "_y"
        # if hasattr(args, "eval_on_recon"):
        #     exp_name += "_on_recons" if args.eval_on_recon else "_on_encodings"

        manual_entries = {
            "seed": str(getattr(cfg.misc, "seed", cfg.misc.data_split_seed)),
            "data": exp_name,
            "method": f'"{model_name}"',
            "wandb_url": str(wandb.run.get_url()) if use_wandb and cfg.misc.use_wandb else "(None)",
        }

        external = additional_entries or {}

        if results_csv:
            assert isinstance(save_to_csv, Path)
            save_to_csv.mkdir(exist_ok=True, parents=True)
            results = {**metrics, **external}

            results_path = save_to_csv / f"{cfg.data.dataset.name}_{model_name}_{results_csv}"
            values = ",".join(list(manual_entries.values()) + [str(v) for v in results.values()])
            if not results_path.is_file():
                with results_path.open("w") as f:
                    # ========= header =========
                    f.write(",".join(list(manual_entries) + [str(k) for k in results]) + "\n")
                    f.write(values + "\n")
            else:  # append to existing file
                with results_path.open("a") as f:
                    f.write(values + "\n")
            log.info(f"Results have been written to {results_path.resolve()}")

        if use_wandb:
            for metric_name, value in metrics.items():
                wandb.run.summary[f"{model_name}_{metric_name}"] = value
            # external metrics are without prefix
            for metric_name, value in external.items():
                wandb.run.summary[metric_name] = value

    log.info(f"Results for {exp_name} ({model_name}):")
    print_metrics({f"{k} ({model_name})": v for k, v in metrics.items()})
    log.info("")  # empty line
    return metrics
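# A hypothetical call of the CSV-writing variant above. Every literal here (paths,
# experiment and model names, step) is a placeholder, and `cfg` is assumed to be a
# fully populated BaseArgs instance.
def _example_csv_logging(cfg: BaseArgs, preds: em.Prediction, test: em.DataTuple) -> None:
    """Write metrics for one evaluation to a results CSV (illustrative only)."""
    compute_metrics(
        cfg,
        preds,
        test,
        exp_name="adult_test",  # placeholder experiment name
        model_name="lr",  # placeholder model name
        step=0,
        save_to_csv=Path("results"),
        results_csv="baseline.csv",
        use_wandb=False,  # skip W&B logging in this sketch
    )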
def test_predictions_info_loaded_bad(temp_dir) -> None:
    """Test that saving fails when an info array does not match the number of predictions."""
    preds = Prediction(hard=pd.Series([1]), info={"sample": np.array([1, 2, 3])})  # type: ignore
    with pytest.raises(AssertionError):
        preds.to_npz(temp_dir / NPZ)
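# A hypothetical counterpart to the test above, assuming the AssertionError is raised
# only because of the length mismatch: an info array with one entry per prediction
# should then save without error. This test is illustrative, not part of the original suite.
def test_predictions_info_saved_ok(temp_dir) -> None:
    """Saving should succeed when the info array matches the number of predictions (illustrative)."""
    preds = Prediction(hard=pd.Series([1]), info={"sample": np.array([0.5])})
    preds.to_npz(temp_dir / NPZ)  # expected not to raise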