def test_run_parallel(toy_train_test: em.TrainTestPair):
    """Test run parallel."""
    from ethicml.evaluators import parallelism  # this import requires ray, so do it only on demand

    data0 = toy_train_test
    data1 = toy_train_test
    result = parallelism.run_in_parallel(
        [em.LR(), em.SVM(), em.Majority()],
        [em.TrainTestPair(*data0), em.TrainTestPair(*data1)],
        num_cpus=2,
    )
    # LR
    assert np.count_nonzero(result[0][0].hard.values == 1) == 44
    assert np.count_nonzero(result[0][0].hard.values == 0) == 36
    assert np.count_nonzero(result[0][1].hard.values == 1) == 44
    assert np.count_nonzero(result[0][1].hard.values == 0) == 36
    # SVM
    assert np.count_nonzero(result[1][0].hard.values == 1) == 45
    assert np.count_nonzero(result[1][0].hard.values == 0) == 35
    assert np.count_nonzero(result[1][1].hard.values == 1) == 45
    assert np.count_nonzero(result[1][1].hard.values == 0) == 35
    # Majority
    assert np.count_nonzero(result[2][0].hard.values == 1) == 80
    assert np.count_nonzero(result[2][0].hard.values == 0) == 0
    assert np.count_nonzero(result[2][1].hard.values == 1) == 80
    assert np.count_nonzero(result[2][1].hard.values == 0) == 0
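# For context, a minimal sketch of how the `toy_train_test` fixture might be defined in
# conftest.py. This is an assumption for illustration (the fixture scope and split fraction
# are not taken from the original file); it relies on `em.toy()`, `em.train_test_split` and
# `em.TrainTestPair` from EthicML, plus the module-level `import ethicml as em` and
# `import pytest` that a test file like this already needs.
@pytest.fixture(scope="session")
def toy_train_test() -> em.TrainTestPair:
    """Illustrative sketch: load the toy dataset and split it into a train/test pair."""
    data: em.DataTuple = em.toy().load()
    train, test = em.train_test_split(data, train_percentage=0.8)
    return em.TrainTestPair(train, test)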
def baseline_metrics(cfg: Config, data: DatasetTriplet, save_to_csv: Optional[Path]) -> None:
    if cfg.data.dataset not in (DS.cmnist, DS.celeba, DS.genfaces):
        log.info("Baselines...")
        train_data = data.train
        test_data = data.test
        if not isinstance(train_data, em.DataTuple):
            train_data, test_data = get_data_tuples(train_data, test_data)
        train_data, test_data = make_tuple_from_data(train_data, test_data, pred_s=False)
        for clf in [
            em.LR(),
            em.Majority(),
            em.Kamiran(classifier="LR"),
            em.LRCV(),
            em.SVM(),
        ]:
            preds = clf.run(train_data, test_data)
            compute_metrics(
                cfg=cfg,
                predictions=preds,
                actual=test_data,
                exp_name="original_data",
                model_name=clf.name,
                step=0,
                save_to_csv=save_to_csv,
                results_csv=cfg.misc.results_csv,
                use_wandb=False,
            )
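# The loop above wraps the plain EthicML baseline pattern: fit each in-process algorithm on
# an `em.DataTuple` and score its predictions. A minimal, self-contained sketch of that
# pattern follows; the dataset choice, metric list and the `em.run_metrics` call are
# illustrative assumptions, not code from this repo.
def baseline_metrics_sketch() -> None:
    """Illustrative only: run the same baseline classifiers on the Adult dataset."""
    train, test = em.train_test_split(em.adult().load(), train_percentage=0.8)
    for clf in [em.LR(), em.Majority(), em.Kamiran(classifier="LR")]:
        preds = clf.run(train, test)
        scores = em.run_metrics(preds, test, metrics=[em.Accuracy()], per_sens_metrics=[em.TPR()])
        print(clf.name, scores)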
def test_run_alg_suite():
    """Test run alg suite."""
    dataset = em.adult(split="Race-Binary")
    datasets: List[em.Dataset] = [dataset, em.toy()]
    preprocess_models: List[em.PreAlgorithm] = [em.Upsampler()]
    inprocess_models: List[em.InAlgorithm] = [em.LR(), em.SVM(kernel="linear")]
    postprocess_models: List[em.PostAlgorithm] = []
    metrics: List[em.Metric] = [em.Accuracy(), em.CV()]
    per_sens_metrics: List[em.Metric] = [em.Accuracy(), em.TPR()]

    parallel_results = em.evaluate_models_async(
        datasets=datasets,
        preprocess_models=preprocess_models,
        inprocess_models=inprocess_models,
        postprocess_models=postprocess_models,
        metrics=metrics,
        per_sens_metrics=per_sens_metrics,
        repeats=1,
        test_mode=True,
        topic="pytest",
    )
    results = em.evaluate_models(
        datasets,
        preprocess_models,
        inprocess_models,
        postprocess_models,
        metrics,
        per_sens_metrics,
        repeats=1,
        test_mode=True,
        delete_prev=True,
        topic="pytest",
    )
    pd.testing.assert_frame_equal(parallel_results, results, check_like=True)

    files = os.listdir(Path(".") / "results")
    file_names = [
        "pytest_Adult Race-Binary_Upsample uniform.csv",
        "pytest_Adult Race-Binary_no_transform.csv",
        "pytest_Toy_Upsample uniform.csv",
        "pytest_Toy_no_transform.csv",
    ]
    assert len(files) == 4
    assert sorted(files) == file_names

    for file in file_names:
        written_file = pd.read_csv(Path(f"./results/{file}"))
        assert (written_file["seed"][0], written_file["seed"][1]) == (0, 0)
        assert written_file.shape == (2, 16)

    reloaded = em.load_results("Adult Race-Binary", "Upsample uniform", "pytest")
    assert reloaded is not None
    read = pd.read_csv(Path(".") / "results" / "pytest_Adult Race-Binary_Upsample uniform.csv")
    read = read.set_index(["dataset", "scaler", "transform", "model", "split_id"])
    pd.testing.assert_frame_equal(reloaded, read)
def test_run_alg_suite_wrong_metrics():
    """Test that the alg suite raises an error for inapplicable per-sens metrics."""
    datasets: List[em.Dataset] = [em.toy(), em.adult()]
    preprocess_models: List[em.PreAlgorithm] = [em.Upsampler()]
    inprocess_models: List[em.InAlgorithm] = [em.SVM(kernel="linear"), em.LR()]
    postprocess_models: List[em.PostAlgorithm] = []
    metrics: List[em.Metric] = [em.Accuracy(), em.CV()]
    # CV compares groups of the sensitive attribute, so it cannot be computed per-sens group
    per_sens_metrics: List[em.Metric] = [em.Accuracy(), em.TPR(), em.CV()]
    with pytest.raises(em.MetricNotApplicable):
        em.evaluate_models(
            datasets,
            preprocess_models,
            inprocess_models,
            postprocess_models,
            metrics,
            per_sens_metrics,
            repeats=1,
            test_mode=True,
        )
def test_run_alg_suite_scaler():
    """Test that the alg suite gives different results with and without a scaler."""
    dataset = em.adult(split="Race-Binary")
    datasets: List[em.Dataset] = [dataset, em.toy()]
    preprocess_models: List[em.PreAlgorithm] = [em.Upsampler()]
    inprocess_models: List[em.InAlgorithm] = [em.LR(), em.SVM(kernel="linear")]
    postprocess_models: List[em.PostAlgorithm] = []
    metrics: List[em.Metric] = [em.Accuracy(), em.CV()]
    per_sens_metrics: List[em.Metric] = [em.Accuracy(), em.TPR()]

    results_no_scaler = em.evaluate_models(
        datasets,
        preprocess_models,
        inprocess_models,
        postprocess_models,
        metrics,
        per_sens_metrics,
        repeats=1,
        test_mode=True,
        delete_prev=True,
        topic="pytest",
    )
    results_scaler = em.evaluate_models(
        datasets,
        preprocess_models,
        inprocess_models,
        postprocess_models,
        metrics,
        per_sens_metrics,
        scaler=StandardScaler(),
        repeats=1,
        test_mode=True,
        delete_prev=True,
        topic="pytest",
    )
    # the scaled and unscaled runs should not produce identical results
    with pytest.raises(AssertionError):
        pd.testing.assert_frame_equal(results_scaler, results_no_scaler, check_like=True)
def test_run_alg_suite_no_pipeline():
    """Run alg suite while avoiding the 'fair pipeline'."""
    datasets: List[em.Dataset] = [em.toy(), em.adult()]
    preprocess_models: List[em.PreAlgorithm] = [em.Upsampler()]
    inprocess_models: List[em.InAlgorithm] = [em.Kamiran(classifier="LR"), em.LR()]
    postprocess_models: List[em.PostAlgorithm] = []
    metrics: List[em.Metric] = [em.Accuracy(), em.CV()]
    per_sens_metrics: List[em.Metric] = [em.Accuracy(), em.TPR()]

    parallel_results = em.evaluate_models_async(
        datasets=datasets,
        preprocess_models=preprocess_models,
        inprocess_models=inprocess_models,
        postprocess_models=postprocess_models,
        metrics=metrics,
        per_sens_metrics=per_sens_metrics,
        repeats=1,
        test_mode=True,
        topic="pytest",
        fair_pipeline=False,
    )
    results = em.evaluate_models(
        datasets,
        preprocess_models,
        inprocess_models,
        postprocess_models,
        metrics,
        per_sens_metrics,
        repeats=1,
        test_mode=True,
        topic="pytest",
        fair_pipeline=False,
        delete_prev=True,
    )
    pd.testing.assert_frame_equal(parallel_results, results, check_like=True)

    num_datasets = 2
    num_preprocess = 1
    num_fair_inprocess = 1
    num_unfair_inprocess = 1
    expected_num = num_datasets * (num_fair_inprocess + (num_preprocess + 1) * num_unfair_inprocess)
    assert len(results) == expected_num

    kc_name = "Kamiran & Calders LR"
    assert len(em.filter_results(results, [kc_name])) == 2  # result for Toy and Adult
    assert (
        len(em.filter_results(results, ["Toy"], index="dataset")) == 3
    )  # Kamiran, LR and Upsampler

    different_name = em.filter_and_map_results(results, {kc_name: "Kamiran & Calders"})
    assert len(em.filter_results(different_name, [kc_name])) == 0
    assert len(em.filter_results(different_name, ["Kamiran & Calders"])) == 2
    pd.testing.assert_frame_equal(
        em.filter_results(results, [kc_name]),
        results.query(f"model == '{kc_name}'"),
    )
def evaluate(
    cfg: Config,
    step: int,
    train_data: "Dataset[Tuple[Tensor, Tensor, Tensor]]",
    test_data: "Dataset[Tuple[Tensor, Tensor, Tensor]]",
    name: str,
    eval_on_recon: bool = True,
    pred_s: bool = False,
    save_to_csv: Optional[Path] = None,
    cluster_test_metrics: Optional[Dict[str, float]] = None,
    cluster_context_metrics: Optional[Dict[str, float]] = None,
):
    input_shape = next(iter(train_data))[0].shape
    additional_entries = {}
    if cluster_test_metrics is not None:
        additional_entries.update({f"Clust/Test {k}": v for k, v in cluster_test_metrics.items()})
    if cluster_context_metrics is not None:
        additional_entries.update(
            {f"Clust/Context {k}": v for k, v in cluster_context_metrics.items()}
        )

    if cfg.data.dataset in (DS.cmnist, DS.celeba, DS.genfaces):
        train_loader = DataLoader(
            train_data, batch_size=cfg.fdm.batch_size, shuffle=True, pin_memory=True
        )
        test_loader = DataLoader(
            test_data, batch_size=cfg.fdm.test_batch_size, shuffle=False, pin_memory=True
        )
        clf: Classifier = fit_classifier(
            cfg,
            input_shape,
            train_data=train_loader,
            train_on_recon=eval_on_recon,
            pred_s=pred_s,
            test_data=test_loader,
        )
        preds, labels, sens = clf.predict_dataset(
            test_loader, device=torch.device(cfg.misc._device)
        )
        preds = em.Prediction(hard=pd.Series(preds))
        if cfg.data.dataset == DS.cmnist:
            sens_name = "colour"
        elif cfg.data.dataset == DS.celeba:
            sens_name = cfg.data.celeba_sens_attr
        else:
            sens_name = "sens_Label"
        sens_pd = pd.DataFrame(sens.numpy().astype(np.float32), columns=[sens_name])
        labels_pd = pd.DataFrame(labels, columns=["labels"])
        actual = em.DataTuple(x=sens_pd, s=sens_pd, y=sens_pd if pred_s else labels_pd)
        compute_metrics(
            cfg,
            preds,
            actual,
            name,
            "pytorch_classifier",
            step=step,
            save_to_csv=save_to_csv,
            results_csv=cfg.misc.results_csv,
            use_wandb=cfg.misc.use_wandb,
            additional_entries=additional_entries,
        )
    else:
        if not isinstance(train_data, em.DataTuple):
            train_data, test_data = get_data_tuples(train_data, test_data)
        train_data, test_data = make_tuple_from_data(train_data, test_data, pred_s=pred_s)
        for eth_clf in [em.LR(), em.LRCV()]:  # , em.LRCV(), em.SVM(kernel="linear")]:
            preds = eth_clf.run(train_data, test_data)
            compute_metrics(
                cfg,
                preds,
                test_data,
                name,
                eth_clf.name,
                step=step,
                save_to_csv=save_to_csv,
                results_csv=cfg.misc.results_csv,
                use_wandb=cfg.misc.use_wandb,
                additional_entries=additional_entries,
            )
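# `compute_metrics` is defined elsewhere in this repo; the sketch below is only an assumed
# outline of its core scoring step, shown to illustrate how the `em.Prediction` and
# `em.DataTuple` built above are consumed. The metric choices and the `em.run_metrics`
# call are illustrative assumptions, not code from the original file.
def compute_metrics_sketch(preds: em.Prediction, actual: em.DataTuple) -> Dict[str, float]:
    """Illustrative only: score predictions against the actual data with EthicML metrics."""
    return em.run_metrics(
        preds,
        actual,
        metrics=[em.Accuracy(), em.ProbPos()],
        per_sens_metrics=[em.Accuracy(), em.TPR()],
    )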