def inference_epoch_end(self, outputs: EPOCH_OUTPUT, stage: Stage) -> Dict[str, float]:
    targets_all = aggregate_over_epoch(outputs=outputs, metric="targets")
    subgroup_inf_all = aggregate_over_epoch(outputs=outputs, metric="subgroup_inf")
    logits_y_all = aggregate_over_epoch(outputs=outputs, metric="logits_y")
    preds_y_all = hard_prediction(logits_y_all)

    dt = em.DataTuple(
        x=pd.DataFrame(
            torch.rand_like(subgroup_inf_all).detach().cpu().numpy(),
            columns=["x0"],
        ),
        s=pd.DataFrame(subgroup_inf_all.detach().cpu().numpy(), columns=["s"]),
        y=pd.DataFrame(targets_all.detach().cpu().numpy(), columns=["y"]),
    )

    return em.run_metrics(
        predictions=em.Prediction(hard=pd.Series(preds_y_all.detach().cpu().numpy())),
        actual=dt,
        metrics=[em.Accuracy(), em.RenyiCorrelation(), em.Yanovich()],
        per_sens_metrics=[em.Accuracy(), em.ProbPos(), em.TPR()],
    )
def _inference_epoch_end(self, output_results: List[Dict[str, Tensor]], stage: str) -> None:
    all_y = torch.cat([_r["y"] for _r in output_results], 0)
    all_s = torch.cat([_r["s"] for _r in output_results], 0)
    all_preds = torch.cat([_r["preds"] for _r in output_results], 0)

    dt = em.DataTuple(
        x=pd.DataFrame(
            torch.rand_like(all_s, dtype=float).detach().cpu().numpy(), columns=["x0"]
        ),
        s=pd.DataFrame(all_s.detach().cpu().numpy(), columns=["s"]),
        y=pd.DataFrame(all_y.detach().cpu().numpy(), columns=["y"]),
    )

    results = em.run_metrics(
        predictions=em.Prediction(hard=pd.Series(all_preds.detach().cpu().numpy())),
        actual=dt,
        metrics=[em.Accuracy(), em.RenyiCorrelation(), em.Yanovich()],
        per_sens_metrics=[em.Accuracy(), em.ProbPos(), em.TPR()],
    )

    tm_acc = self.val_acc if stage == "val" else self.test_acc
    acc = tm_acc.compute().item()
    results_dict = {f"{stage}/acc": acc}
    results_dict.update({f"{stage}/{self.target}_{k}": v for k, v in results.items()})

    self.log_dict(results_dict)
def inference_epoch_end(self, outputs: EPOCH_OUTPUT, stage: Stage) -> Dict[str, float]:
    targets_all = aggregate_over_epoch(outputs=outputs, metric="targets")
    subgroup_inf_all = aggregate_over_epoch(outputs=outputs, metric="subgroup_inf")
    preds_all = aggregate_over_epoch(outputs=outputs, metric="preds")

    mean_preds = preds_all.mean(-1)
    mean_preds_s0 = preds_all[subgroup_inf_all == 0].mean(-1)
    mean_preds_s1 = preds_all[subgroup_inf_all == 1].mean(-1)

    dt = em.DataTuple(
        x=pd.DataFrame(
            torch.rand_like(subgroup_inf_all, dtype=torch.float).detach().cpu().numpy(),
            columns=["x0"],
        ),
        s=pd.DataFrame(subgroup_inf_all.detach().cpu().numpy(), columns=["s"]),
        y=pd.DataFrame(targets_all.detach().cpu().numpy(), columns=["y"]),
    )

    results_dict = em.run_metrics(
        predictions=em.Prediction(hard=pd.Series((preds_all > 0).detach().cpu().numpy())),
        actual=dt,
        metrics=[em.Accuracy(), em.RenyiCorrelation(), em.Yanovich()],
        per_sens_metrics=[em.Accuracy(), em.ProbPos(), em.TPR()],
    )
    results_dict.update(
        {
            "DP_Gap": float((mean_preds_s0 - mean_preds_s1).abs().item()),
            "mean_pred": float(mean_preds.item()),
        }
    )
    return results_dict
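# Minimal sketch of the two helpers used by the variants above. It assumes that
# `aggregate_over_epoch` simply concatenates the per-batch tensors stored under the
# given key (as the explicit `torch.cat` variant does) and that `hard_prediction`
# thresholds logits at zero; the actual implementations may differ.
from typing import Dict, List

import torch
from torch import Tensor


def aggregate_over_epoch(outputs: List[Dict[str, Tensor]], metric: str) -> Tensor:
    """Concatenate the per-batch tensors stored under ``metric`` along dim 0."""
    return torch.cat([step_output[metric] for step_output in outputs], dim=0)


def hard_prediction(logits: Tensor) -> Tensor:
    """Convert raw logits to hard binary predictions by thresholding at zero."""
    return (logits > 0).long()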
def test_run_alg_suite():
    """Test run alg suite."""
    dataset = em.adult(split="Race-Binary")
    datasets: List[em.Dataset] = [dataset, em.toy()]
    preprocess_models: List[em.PreAlgorithm] = [em.Upsampler()]
    inprocess_models: List[em.InAlgorithm] = [em.LR(), em.SVM(kernel="linear")]
    postprocess_models: List[em.PostAlgorithm] = []
    metrics: List[em.Metric] = [em.Accuracy(), em.CV()]
    per_sens_metrics: List[em.Metric] = [em.Accuracy(), em.TPR()]

    parallel_results = em.evaluate_models_async(
        datasets=datasets,
        preprocess_models=preprocess_models,
        inprocess_models=inprocess_models,
        postprocess_models=postprocess_models,
        metrics=metrics,
        per_sens_metrics=per_sens_metrics,
        repeats=1,
        test_mode=True,
        topic="pytest",
    )
    results = em.evaluate_models(
        datasets,
        preprocess_models,
        inprocess_models,
        postprocess_models,
        metrics,
        per_sens_metrics,
        repeats=1,
        test_mode=True,
        delete_prev=True,
        topic="pytest",
    )
    pd.testing.assert_frame_equal(parallel_results, results, check_like=True)

    files = os.listdir(Path(".") / "results")
    file_names = [
        "pytest_Adult Race-Binary_Upsample uniform.csv",
        "pytest_Adult Race-Binary_no_transform.csv",
        "pytest_Toy_Upsample uniform.csv",
        "pytest_Toy_no_transform.csv",
    ]
    assert len(files) == 4
    assert sorted(files) == file_names

    for file in file_names:
        written_file = pd.read_csv(Path(f"./results/{file}"))
        assert (written_file["seed"][0], written_file["seed"][1]) == (0, 0)
        assert written_file.shape == (2, 16)

    reloaded = em.load_results("Adult Race-Binary", "Upsample uniform", "pytest")
    assert reloaded is not None
    read = pd.read_csv(Path(".") / "results" / "pytest_Adult Race-Binary_Upsample uniform.csv")
    read = read.set_index(["dataset", "scaler", "transform", "model", "split_id"])
    pd.testing.assert_frame_equal(reloaded, read)
def test_run_alg_suite_wrong_metrics():
    """Test run alg suite wrong metrics."""
    datasets: List[em.Dataset] = [em.toy(), em.adult()]
    preprocess_models: List[em.PreAlgorithm] = [em.Upsampler()]
    inprocess_models: List[em.InAlgorithm] = [em.SVM(kernel="linear"), em.LR()]
    postprocess_models: List[em.PostAlgorithm] = []
    metrics: List[em.Metric] = [em.Accuracy(), em.CV()]
    per_sens_metrics: List[em.Metric] = [em.Accuracy(), em.TPR(), em.CV()]

    with pytest.raises(em.MetricNotApplicable):
        em.evaluate_models(
            datasets,
            preprocess_models,
            inprocess_models,
            postprocess_models,
            metrics,
            per_sens_metrics,
            repeats=1,
            test_mode=True,
        )
def compute_metrics(
    predictions: em.Prediction,
    actual: em.DataTuple,
    s_dim: int,
) -> dict[str, float]:
    """Compute accuracy and fairness metrics and print them.

    Args:
        predictions: predictions in a format that is compatible with EthicML
        actual: labels for the predictions
        s_dim: dimension of s

    Returns:
        dictionary with the computed metrics
    """
    predictions._info = {}
    metrics = em.run_metrics(
        predictions,
        actual,
        metrics=[em.Accuracy(), em.TPR(), em.TNR(), em.RenyiCorrelation()],
        per_sens_metrics=[em.Accuracy(), em.ProbPos(), em.TPR(), em.TNR()],
        diffs_and_ratios=s_dim < 4,  # this just gets too much with higher s dim
    )
    # replace the slash; it's causing problems
    metrics = {k.replace("/", "÷"): v for k, v in metrics.items()}
    print_metrics(metrics)
    return metrics
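# Hypothetical usage of `compute_metrics` above, assuming EthicML's built-in Toy
# dataset (a single binary sensitive attribute, so s_dim=1) and a logistic-regression
# baseline; everything beyond the EthicML calls shown is illustrative only.
import ethicml as em

train, test = em.train_test_split(em.toy().load())
preds = em.LR().run(train, test)
metrics = compute_metrics(predictions=preds, actual=test, s_dim=1)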
def test_run_alg_suite_scaler():
    """Test run alg suite with a scaler."""
    dataset = em.adult(split="Race-Binary")
    datasets: List[em.Dataset] = [dataset, em.toy()]
    preprocess_models: List[em.PreAlgorithm] = [em.Upsampler()]
    inprocess_models: List[em.InAlgorithm] = [em.LR(), em.SVM(kernel="linear")]
    postprocess_models: List[em.PostAlgorithm] = []
    metrics: List[em.Metric] = [em.Accuracy(), em.CV()]
    per_sens_metrics: List[em.Metric] = [em.Accuracy(), em.TPR()]

    results_no_scaler = em.evaluate_models(
        datasets,
        preprocess_models,
        inprocess_models,
        postprocess_models,
        metrics,
        per_sens_metrics,
        repeats=1,
        test_mode=True,
        delete_prev=True,
        topic="pytest",
    )
    results_scaler = em.evaluate_models(
        datasets,
        preprocess_models,
        inprocess_models,
        postprocess_models,
        metrics,
        per_sens_metrics,
        scaler=StandardScaler(),
        repeats=1,
        test_mode=True,
        delete_prev=True,
        topic="pytest",
    )
    with pytest.raises(AssertionError):
        pd.testing.assert_frame_equal(results_scaler, results_no_scaler, check_like=True)
def compute_metrics(
    cfg: BaseArgs,
    predictions: em.Prediction,
    actual: em.DataTuple,
    exp_name: str,
    model_name: str,
    step: int,
    save_to_csv: Optional[Path] = None,
    results_csv: str = "",
    use_wandb: bool = False,
    additional_entries: Optional[Mapping[str, float]] = None,
) -> Dict[str, float]:
    """Compute accuracy and fairness metrics and log them.

    Args:
        cfg: config object
        predictions: predictions in a format that is compatible with EthicML
        actual: labels for the predictions
        exp_name: name of the experiment
        model_name: name of the model used
        step: step of training (needed for logging to W&B)
        save_to_csv: if a path is given, the results are saved to a CSV file
        results_csv: name of the CSV file
        use_wandb: whether to log the metrics to W&B
        additional_entries: entries that should be added to the W&B summary and the CSV file

    Returns:
        dictionary with the computed metrics
    """
    predictions._info = {}
    metrics = em.run_metrics(
        predictions,
        actual,
        metrics=[em.Accuracy(), em.TPR(), em.TNR(), em.RenyiCorrelation()],
        per_sens_metrics=[em.Accuracy(), em.ProbPos(), em.TPR(), em.TNR()],
        diffs_and_ratios=cfg.misc._s_dim < 4,  # this just gets too much with higher s dim
    )
    # replace the slash; it's causing problems
    metrics = {k.replace("/", "÷"): v for k, v in metrics.items()}

    if use_wandb:
        wandb_log(cfg.misc, {f"{k} ({model_name})": v for k, v in metrics.items()}, step=step)

    if save_to_csv is not None:
        # full_name = f"{args.dataset}_{exp_name}"
        # exp_name += "_s" if pred_s else "_y"
        # if hasattr(args, "eval_on_recon"):
        #     exp_name += "_on_recons" if args.eval_on_recon else "_on_encodings"

        manual_entries = {
            "seed": str(getattr(cfg.misc, "seed", cfg.misc.data_split_seed)),
            "data": exp_name,
            "method": f'"{model_name}"',
            "wandb_url": str(wandb.run.get_url())
            if use_wandb and cfg.misc.use_wandb
            else "(None)",
        }
        external = additional_entries or {}

        if results_csv:
            assert isinstance(save_to_csv, Path)
            save_to_csv.mkdir(exist_ok=True, parents=True)
            results = {**metrics, **external}
            results_path = save_to_csv / f"{cfg.data.dataset.name}_{model_name}_{results_csv}"
            values = ",".join(list(manual_entries.values()) + [str(v) for v in results.values()])
            if not results_path.is_file():
                with results_path.open("w") as f:
                    # ========= header =========
                    f.write(",".join(list(manual_entries) + [str(k) for k in results]) + "\n")
                    f.write(values + "\n")
            else:  # append to existing file
                with results_path.open("a") as f:
                    f.write(values + "\n")
            log.info(f"Results have been written to {results_path.resolve()}")

        if use_wandb:
            for metric_name, value in metrics.items():
                wandb.run.summary[f"{model_name}_{metric_name}"] = value
            # external metrics are without prefix
            for metric_name, value in external.items():
                wandb.run.summary[metric_name] = value

    log.info(f"Results for {exp_name} ({model_name}):")
    print_metrics({f"{k} ({model_name})": v for k, v in metrics.items()})
    log.info("")  # empty line
    return metrics
def test_run_alg_suite_no_pipeline():
    """Run alg suite while avoiding the 'fair pipeline'."""
    datasets: List[em.Dataset] = [em.toy(), em.adult()]
    preprocess_models: List[em.PreAlgorithm] = [em.Upsampler()]
    inprocess_models: List[em.InAlgorithm] = [em.Kamiran(classifier="LR"), em.LR()]
    postprocess_models: List[em.PostAlgorithm] = []
    metrics: List[em.Metric] = [em.Accuracy(), em.CV()]
    per_sens_metrics: List[em.Metric] = [em.Accuracy(), em.TPR()]

    parallel_results = em.evaluate_models_async(
        datasets=datasets,
        preprocess_models=preprocess_models,
        inprocess_models=inprocess_models,
        postprocess_models=postprocess_models,
        metrics=metrics,
        per_sens_metrics=per_sens_metrics,
        repeats=1,
        test_mode=True,
        topic="pytest",
        fair_pipeline=False,
    )
    results = em.evaluate_models(
        datasets,
        preprocess_models,
        inprocess_models,
        postprocess_models,
        metrics,
        per_sens_metrics,
        repeats=1,
        test_mode=True,
        topic="pytest",
        fair_pipeline=False,
        delete_prev=True,
    )
    pd.testing.assert_frame_equal(parallel_results, results, check_like=True)

    num_datasets = 2
    num_preprocess = 1
    num_fair_inprocess = 1
    num_unfair_inprocess = 1
    expected_num = num_datasets * (num_fair_inprocess + (num_preprocess + 1) * num_unfair_inprocess)
    assert len(results) == expected_num

    kc_name = "Kamiran & Calders LR"

    assert len(em.filter_results(results, [kc_name])) == 2  # result for Toy and Adult
    assert (
        len(em.filter_results(results, ["Toy"], index="dataset")) == 3
    )  # Kamiran, LR and Upsampler

    different_name = em.filter_and_map_results(results, {kc_name: "Kamiran & Calders"})
    assert len(em.filter_results(different_name, [kc_name])) == 0
    assert len(em.filter_results(different_name, ["Kamiran & Calders"])) == 2
    pd.testing.assert_frame_equal(
        em.filter_results(results, [kc_name]),
        results.query(f"model == '{kc_name}'"),
    )