@pytest.fixture(scope="session")
def toy_train_val() -> TrainTestPair:
    """By making this a fixture, pytest can cache the result."""
    data: DataTuple = em.toy().load()
    train: DataTuple
    test: DataTuple
    train, test = em.train_test_split(data)
    return TrainTestPair(train, test)
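# Hypothetical usage sketch (not part of the original suite): a test consuming the fixture
# above could look like this, assuming `TrainTestPair` exposes the `train` and `test`
# attributes it is constructed with.
def test_toy_train_val_shapes(toy_train_val: TrainTestPair) -> None:
    """Sanity-check the cached fixture (illustrative only)."""
    train, test = toy_train_val.train, toy_train_val.test
    # the default split puts more samples in train than in test
    assert len(train) > len(test)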
def test_empty_evaluate():
    """Test empty evaluate."""
    empty_result = em.evaluate_models([em.toy()], repeats=3, delete_prev=True)
    expected_result = pd.DataFrame(
        [], columns=["dataset", "scaler", "transform", "model", "split_id"]
    )
    expected_result = expected_result.set_index(
        ["dataset", "scaler", "transform", "model", "split_id"]
    )
    pd.testing.assert_frame_equal(empty_result, expected_result)
def test_run_alg_suite():
    """Test run alg suite."""
    dataset = em.adult(split="Race-Binary")
    datasets: List[em.Dataset] = [dataset, em.toy()]
    preprocess_models: List[em.PreAlgorithm] = [em.Upsampler()]
    inprocess_models: List[em.InAlgorithm] = [em.LR(), em.SVM(kernel="linear")]
    postprocess_models: List[em.PostAlgorithm] = []
    metrics: List[em.Metric] = [em.Accuracy(), em.CV()]
    per_sens_metrics: List[em.Metric] = [em.Accuracy(), em.TPR()]

    parallel_results = em.evaluate_models_async(
        datasets=datasets,
        preprocess_models=preprocess_models,
        inprocess_models=inprocess_models,
        postprocess_models=postprocess_models,
        metrics=metrics,
        per_sens_metrics=per_sens_metrics,
        repeats=1,
        test_mode=True,
        topic="pytest",
    )
    results = em.evaluate_models(
        datasets,
        preprocess_models,
        inprocess_models,
        postprocess_models,
        metrics,
        per_sens_metrics,
        repeats=1,
        test_mode=True,
        delete_prev=True,
        topic="pytest",
    )
    pd.testing.assert_frame_equal(parallel_results, results, check_like=True)

    files = os.listdir(Path(".") / "results")
    file_names = [
        "pytest_Adult Race-Binary_Upsample uniform.csv",
        "pytest_Adult Race-Binary_no_transform.csv",
        "pytest_Toy_Upsample uniform.csv",
        "pytest_Toy_no_transform.csv",
    ]
    assert len(files) == 4
    assert sorted(files) == file_names

    for file in file_names:
        written_file = pd.read_csv(Path(f"./results/{file}"))
        assert (written_file["seed"][0], written_file["seed"][1]) == (0, 0)
        assert written_file.shape == (2, 16)

    reloaded = em.load_results("Adult Race-Binary", "Upsample uniform", "pytest")
    assert reloaded is not None
    read = pd.read_csv(Path(".") / "results" / "pytest_Adult Race-Binary_Upsample uniform.csv")
    read = read.set_index(["dataset", "scaler", "transform", "model", "split_id"])
    pd.testing.assert_frame_equal(reloaded, read)
def test_sequential_split():
    """Test sequential split."""
    data: DataTuple = em.load_data(em.toy())
    train: DataTuple
    test: DataTuple
    train, test, _ = em.SequentialSplit(train_percentage=0.8)(data)
    assert all(data.x.iloc[0] == train.x.iloc[0])
    assert all(data.x.iloc[-1] == test.x.iloc[-1])
    assert len(train) == 320
    assert len(test) == 80
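# Conceptual sketch (an assumption, not EthicML's implementation): a sequential split
# simply cuts the data in order, which is what the first-row/last-row asserts above rely on.
def _sketch_sequential_split(df: pd.DataFrame, train_percentage: float = 0.8):
    """Return an ordered (train, test) cut of `df` at `train_percentage` (illustrative only)."""
    cut = round(len(df) * train_percentage)
    return df.iloc[:cut], df.iloc[cut:]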
def test_train_test_split():
    """Test train test split."""
    data: DataTuple = em.load_data(em.toy())
    train_test: Tuple[DataTuple, DataTuple] = em.train_test_split(data)
    train, test = train_test
    assert train is not None
    assert test is not None
    assert train.x.shape[0] > test.x.shape[0]
    assert train.x["a1"].values[0] == 0.2365572108691669
    assert train.x["a2"].values[0] == approx(0.008603090240657633, abs=1e-6)
    assert train.x.shape[0] == train.s.shape[0]
    assert train.s.shape[0] == train.y.shape[0]

    num_samples = len(data)

    len_default = math.floor((num_samples / 100) * 80)
    assert train.s.shape[0] == len_default
    assert test.s.shape[0] == num_samples - len_default

    len_0_9 = math.floor((num_samples / 100) * 90)
    train, test = em.train_test_split(data, train_percentage=0.9)
    assert train.s.shape[0] == len_0_9
    assert test.s.shape[0] == num_samples - len_0_9

    len_0_7 = math.floor((num_samples / 100) * 70)
    train, test = em.train_test_split(data, train_percentage=0.7)
    assert train.s.shape[0] == len_0_7
    assert test.s.shape[0] == num_samples - len_0_7

    len_0_5 = math.floor((num_samples / 100) * 50)
    train, test = em.train_test_split(data, train_percentage=0.5)
    assert train.s.shape[0] == len_0_5
    assert test.s.shape[0] == num_samples - len_0_5

    len_0_3 = math.floor((num_samples / 100) * 30)
    train, test = em.train_test_split(data, train_percentage=0.3)
    assert train.s.shape[0] == len_0_3
    assert test.s.shape[0] == num_samples - len_0_3

    len_0_1 = math.floor((num_samples / 100) * 10)
    train, test = em.train_test_split(data, train_percentage=0.1)
    assert train.s.shape[0] == len_0_1
    assert test.s.shape[0] == num_samples - len_0_1

    len_0_0 = math.floor((num_samples / 100) * 0)
    train, test = em.train_test_split(data, train_percentage=0.0)
    assert train.s.shape[0] == len_0_0
    assert train.name == "Toy - Train"
    assert test.s.shape[0] == num_samples - len_0_0
    assert test.name == "Toy - Test"
def test_random_seed():
    """Test random seed."""
    data: DataTuple = em.load_data(em.toy())
    train_test_0: Tuple[DataTuple, DataTuple] = em.train_test_split(data)
    train_0, test_0 = train_test_0
    assert train_0 is not None
    assert test_0 is not None
    assert train_0.x.shape[0] > test_0.x.shape[0]
    assert train_0.x["a1"].values[0] == 0.2365572108691669
    assert train_0.x["a2"].values[0] == approx(0.008603090240657633, abs=1e-6)
    assert train_0.x.shape[0] == train_0.s.shape[0]
    assert train_0.s.shape[0] == train_0.y.shape[0]

    train_test_1: Tuple[DataTuple, DataTuple] = em.train_test_split(data, random_seed=1)
    train_1, test_1 = train_test_1
    assert train_1 is not None
    assert test_1 is not None
    assert train_1.x.shape[0] > test_1.x.shape[0]
    assert train_1.x["a1"].values[0] == 1.3736566330173798
    assert train_1.x["a2"].values[0] == approx(0.21742296144957174, abs=1e-6)
    assert train_1.x.shape[0] == train_1.s.shape[0]
    assert train_1.s.shape[0] == train_1.y.shape[0]

    train_test_2: Tuple[DataTuple, DataTuple] = em.train_test_split(data, random_seed=2)
    train_2, test_2 = train_test_2
    assert train_2 is not None
    assert test_2 is not None
    assert train_2.x.shape[0] > test_2.x.shape[0]
    assert train_2.x["a1"].values[0] == 1.2255705960148289
    assert train_2.x["a2"].values[0] == -1.208089015454192
    assert train_2.x.shape[0] == train_2.s.shape[0]
    assert train_2.s.shape[0] == train_2.y.shape[0]

    train_test_3: Tuple[DataTuple, DataTuple] = em.train_test_split(data, random_seed=3)
    train_3, test_3 = train_test_3
    assert train_3 is not None
    assert test_3 is not None
    assert train_3.x.shape[0] > test_3.x.shape[0]
    assert train_3.x["a1"].values[0] == approx(0.21165963748018515, abs=1e-6)
    assert train_3.x["a2"].values[0] == -2.425137404779957
    assert train_3.x.shape[0] == train_3.s.shape[0]
    assert train_3.s.shape[0] == train_3.y.shape[0]
def test_threaded_agarwal():
    """Test threaded agarwal."""
    models: List[InAlgorithmAsync] = [Agarwal(dir="/tmp", classifier="SVM", fairness="EqOd")]

    class AssertResult(Metric):
        _name = "assert_result"

        def score(self, prediction, actual) -> float:
            return (
                np.count_nonzero(prediction.hard.values == 1) == 45
                and np.count_nonzero(prediction.hard.values == 0) == 35
            )

    results = evaluate_models_async(
        datasets=[toy()], inprocess_models=models, metrics=[AssertResult()], delete_prev=True
    )
    assert results["assert_result"].iloc[0]
def test_run_alg_suite_wrong_metrics():
    """Test run alg suite wrong metrics."""
    datasets: List[em.Dataset] = [em.toy(), em.adult()]
    preprocess_models: List[em.PreAlgorithm] = [em.Upsampler()]
    inprocess_models: List[em.InAlgorithm] = [em.SVM(kernel="linear"), em.LR()]
    postprocess_models: List[em.PostAlgorithm] = []
    metrics: List[em.Metric] = [em.Accuracy(), em.CV()]
    per_sens_metrics: List[em.Metric] = [em.Accuracy(), em.TPR(), em.CV()]
    with pytest.raises(em.MetricNotApplicable):
        em.evaluate_models(
            datasets,
            preprocess_models,
            inprocess_models,
            postprocess_models,
            metrics,
            per_sens_metrics,
            repeats=1,
            test_mode=True,
        )
def test_plot_evals():
    """Test plot evals."""
    results: Results = evaluate_models(
        datasets=[adult(), toy()],
        preprocess_models=[Upsampler(strategy="preferential")],
        inprocess_models=[LR(), SVM(kernel="linear"), Kamiran()],
        metrics=[Accuracy(), CV()],
        per_sens_metrics=[TPR(), ProbPos()],
        repeats=3,
        test_mode=True,
        delete_prev=True,
    )
    assert results["seed"][0] == results["seed"][1] == results["seed"][2] == 0
    assert results["seed"][3] == results["seed"][4] == results["seed"][5] == 2410
    assert results["seed"][6] == results["seed"][7] == results["seed"][8] == 4820

    figs_and_plots: List[Tuple[plt.Figure, plt.Axes]]  # type: ignore[name-defined]

    # plot with metrics
    figs_and_plots = plot_results(results, Accuracy(), ProbPos())
    # num(datasets) * num(preprocess) * num(accuracy combinations) * num(prob_pos combinations)
    # TODO: the +4 should be fixed; it comes from the column name containing a hyphen
    # being matched as a DIFF metric.
    assert len(figs_and_plots) == 2 * 2 * 1 * 2 + 4

    # plot with column names
    figs_and_plots = plot_results(results, "Accuracy", "prob_pos_sensitive-attr_0")
    assert len(figs_and_plots) == 1 * 2 * 1 * 1

    with pytest.raises(
        ValueError, match='No matching columns found for Metric "NMI preds and s".'
    ):
        plot_results(results, Accuracy(), NMI())

    with pytest.raises(ValueError, match='No column named "unknown metric".'):
        plot_results(results, "unknown metric", Accuracy())
def test_run_alg_suite_scaler():
    """Test run alg suite with a scaler."""
    dataset = em.adult(split="Race-Binary")
    datasets: List[em.Dataset] = [dataset, em.toy()]
    preprocess_models: List[em.PreAlgorithm] = [em.Upsampler()]
    inprocess_models: List[em.InAlgorithm] = [em.LR(), em.SVM(kernel="linear")]
    postprocess_models: List[em.PostAlgorithm] = []
    metrics: List[em.Metric] = [em.Accuracy(), em.CV()]
    per_sens_metrics: List[em.Metric] = [em.Accuracy(), em.TPR()]

    results_no_scaler = em.evaluate_models(
        datasets,
        preprocess_models,
        inprocess_models,
        postprocess_models,
        metrics,
        per_sens_metrics,
        repeats=1,
        test_mode=True,
        delete_prev=True,
        topic="pytest",
    )
    results_scaler = em.evaluate_models(
        datasets,
        preprocess_models,
        inprocess_models,
        postprocess_models,
        metrics,
        per_sens_metrics,
        scaler=StandardScaler(),
        repeats=1,
        test_mode=True,
        delete_prev=True,
        topic="pytest",
    )
    with pytest.raises(AssertionError):
        pd.testing.assert_frame_equal(results_scaler, results_no_scaler, check_like=True)
def test_prop_train_test_split():
    """Test prop train test split."""
    data: DataTuple = em.load_data(em.toy())
    train: DataTuple
    test: DataTuple
    train, test, _ = ProportionalSplit(train_percentage=0.8)(data, split_id=0)
    assert train is not None
    assert test is not None
    assert train.x.shape[0] > test.x.shape[0]
    assert train.x["a1"].values[0] == -0.7135614558562237
    assert train.x["a2"].values[0] == 1.1211390799513148
    assert train.x.shape[0] == train.s.shape[0]
    assert train.s.shape[0] == train.y.shape[0]

    NUM_SAMPLES = len(data)

    len_default = math.floor((NUM_SAMPLES / 100) * 80)
    assert train.s.shape[0] == len_default
    assert test.s.shape[0] == NUM_SAMPLES - len_default

    # assert that the proportion of s=0 to s=1 has remained the same (and also test for y=0/y=1)
    assert np.count_nonzero(train.s.to_numpy() == 0) == round(
        0.8 * np.count_nonzero(data.s.to_numpy() == 0)
    )
    assert np.count_nonzero(train.y.to_numpy() == 0) == round(
        0.8 * np.count_nonzero(data.y.to_numpy() == 0)
    )

    len_0_9 = math.floor((NUM_SAMPLES / 100) * 90)
    train, test, _ = ProportionalSplit(train_percentage=0.9)(data, split_id=0)
    assert train.s.shape[0] == len_0_9
    assert test.s.shape[0] == NUM_SAMPLES - len_0_9
    assert np.count_nonzero(train.s.to_numpy() == 0) == approx(
        round(0.9 * np.count_nonzero(data.s.to_numpy() == 0)), abs=1
    )
    assert np.count_nonzero(train.y.to_numpy() == 0) == approx(
        round(0.9 * np.count_nonzero(data.y.to_numpy() == 0)), abs=1
    )

    len_0_7 = math.floor((NUM_SAMPLES / 100) * 70)
    train, test, _ = ProportionalSplit(train_percentage=0.7)(data, split_id=0)
    assert train.s.shape[0] == len_0_7
    assert test.s.shape[0] == NUM_SAMPLES - len_0_7
    assert np.count_nonzero(train.s.to_numpy() == 0) == approx(
        round(0.7 * np.count_nonzero(data.s.to_numpy() == 0)), abs=1
    )
    assert np.count_nonzero(train.y.to_numpy() == 0) == approx(
        round(0.7 * np.count_nonzero(data.y.to_numpy() == 0)), abs=1
    )

    len_0_5 = math.floor((NUM_SAMPLES / 100) * 50)
    train, test, _ = ProportionalSplit(train_percentage=0.5)(data, split_id=0)
    assert train.s.shape[0] == len_0_5
    assert test.s.shape[0] == NUM_SAMPLES - len_0_5

    len_0_3 = math.floor((NUM_SAMPLES / 100) * 30)
    train, test, _ = ProportionalSplit(train_percentage=0.3)(data, split_id=0)
    assert train.s.shape[0] == len_0_3
    assert test.s.shape[0] == NUM_SAMPLES - len_0_3

    len_0_1 = math.floor((NUM_SAMPLES / 100) * 10)
    train, test, _ = ProportionalSplit(train_percentage=0.1)(data, split_id=0)
    assert train.s.shape[0] == len_0_1
    assert test.s.shape[0] == NUM_SAMPLES - len_0_1

    len_0_0 = math.floor((NUM_SAMPLES / 100) * 0)
    train, test, _ = ProportionalSplit(train_percentage=0.0)(data, split_id=0)
    assert train.s.shape[0] == len_0_0
    assert train.name == "Toy - Train"
    assert test.s.shape[0] == NUM_SAMPLES - len_0_0
    assert test.name == "Toy - Test"
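# Conceptual sketch (an assumption, not EthicML's implementation): ProportionalSplit behaves
# like a stratified split, e.g. scikit-learn's train_test_split with `stratify` set to the
# sensitive attribute (and, per the asserts above, the label too), so that group ratios are
# approximately preserved in both halves.
def _sketch_stratified_indices(s: np.ndarray, train_percentage: float = 0.8):
    """Return (train_idx, test_idx) stratified on `s` (illustrative only)."""
    from sklearn.model_selection import train_test_split as sk_split

    return sk_split(
        np.arange(len(s)), train_size=train_percentage, stratify=s, random_state=0
    )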
s = pd.DataFrame(np.random.randn(100), columns=["s"])
y = pd.DataFrame(np.random.randint(0, 5, 100), columns=["y"])
data = DataTuple(x=x, s=s, y=y)
train_test: Tuple[DataTuple, DataTuple] = train_test_split(data)
train, test = train_test
model: InAlgorithm = LR()
predictions: Prediction = model.run(train, test)
acc_per_sens = metric_per_sensitive_attribute(
    predictions, test, TPR(pos_class=1, labels=list(range(y.nunique()[0])))
)
print(acc_per_sens)


PER_SENS = [
    PerSensMetricTest(
        dataset=toy(),
        classifier=SVM(),
        metric=Accuracy(),
        expected_values={"sensitive-attr_0": 0.921, "sensitive-attr_1": 0.929},
    ),
    PerSensMetricTest(
        dataset=toy(),
        classifier=SVM(),
        metric=ProbPos(),
        expected_values={"sensitive-attr_0": 0.368, "sensitive-attr_1": 0.738},
    ),
    DT(
        dataset=em.sqf(split="Race-Sex"),
        samples=12_347,
        x_features=144,
        discrete_features=138,
        s_features=1,
        num_sens=4,
        y_features=1,
        num_labels=2,
        name="SQF Race-Sex",
        sum_s=24_336,
        sum_y=1_289,
    ),
    DT(
        dataset=em.toy(),
        samples=400,
        x_features=10,
        discrete_features=8,
        s_features=1,
        num_sens=2,
        y_features=1,
        num_labels=2,
        name="Toy",
        sum_s=200,
        sum_y=231,
    ),
    DT(
        dataset=em.acs_income(root=Path("~/Data"), year="2018", horizon=1, states=["AL"]),
        samples=22_268,
def test_run_alg_suite_no_pipeline():
    """Run alg suite while avoiding the 'fair pipeline'."""
    datasets: List[em.Dataset] = [em.toy(), em.adult()]
    preprocess_models: List[em.PreAlgorithm] = [em.Upsampler()]
    inprocess_models: List[em.InAlgorithm] = [em.Kamiran(classifier="LR"), em.LR()]
    postprocess_models: List[em.PostAlgorithm] = []
    metrics: List[em.Metric] = [em.Accuracy(), em.CV()]
    per_sens_metrics: List[em.Metric] = [em.Accuracy(), em.TPR()]

    parallel_results = em.evaluate_models_async(
        datasets=datasets,
        preprocess_models=preprocess_models,
        inprocess_models=inprocess_models,
        postprocess_models=postprocess_models,
        metrics=metrics,
        per_sens_metrics=per_sens_metrics,
        repeats=1,
        test_mode=True,
        topic="pytest",
        fair_pipeline=False,
    )
    results = em.evaluate_models(
        datasets,
        preprocess_models,
        inprocess_models,
        postprocess_models,
        metrics,
        per_sens_metrics,
        repeats=1,
        test_mode=True,
        topic="pytest",
        fair_pipeline=False,
        delete_prev=True,
    )
    pd.testing.assert_frame_equal(parallel_results, results, check_like=True)

    num_datasets = 2
    num_preprocess = 1
    num_fair_inprocess = 1
    num_unfair_inprocess = 1
    expected_num = num_datasets * (
        num_fair_inprocess + (num_preprocess + 1) * num_unfair_inprocess
    )
    assert len(results) == expected_num

    kc_name = "Kamiran & Calders LR"

    assert len(em.filter_results(results, [kc_name])) == 2  # result for Toy and Adult
    # Kamiran, LR and Upsampler
    assert len(em.filter_results(results, ["Toy"], index="dataset")) == 3

    different_name = em.filter_and_map_results(results, {kc_name: "Kamiran & Calders"})
    assert len(em.filter_results(different_name, [kc_name])) == 0
    assert len(em.filter_results(different_name, ["Kamiran & Calders"])) == 2
    pd.testing.assert_frame_equal(
        em.filter_results(results, [kc_name]),
        results.query(f"model == '{kc_name}'"),
    )