def test_init_w_auto_detection(self):
    """Check the feature lists produced when auto detection is enabled.

    Rebuilds ``self.preprocessor`` with ``auto_detect_variable=True`` and no
    explicit feature lists, then verifies the stored data and the three
    detected feature groups.
    """
    expected_numeric = ["age", "capital-gain"]
    expected_categoric = [
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "race",
    ]
    expected_binary = ["sex", "Salary"]

    self.preprocessor = StandardPreprocessor(
        data=self.data,
        auto_detect_variable=True,
    )
    detected = self.preprocessor

    assert detected.data.equals(self.data)
    assert detected.numeric_features == expected_numeric
    assert detected.categoric_features == expected_categoric
    assert detected.binary_features == expected_binary
def setup(self):
    """Load the raw fixture data and build the preprocessor under test.

    Stores the raw frame on ``self.data`` and a ``StandardPreprocessor``
    configured with explicit numeric, categoric and binary feature lists
    on ``self.preprocessor``.
    """
    raw_data_path = (
        "tests/test_files/standard_preprocessor/all_estimators_run/raw_data.csv"
    )
    numeric_columns = ["age", "capital-gain"]
    categoric_columns = [
        "workclass",
        "education",
        "marital-status",
        "occupation",
        "race",
    ]
    binary_columns = ["sex", "Salary"]

    self.data = pd.read_csv(raw_data_path)
    self.preprocessor = StandardPreprocessor(
        data=self.data,
        numeric_features=numeric_columns,
        categoric_features=categoric_columns,
        binary_features=binary_columns,
    )
def test_standard_preprocessor_bad_init(
    data,
    numeric_features,
    categoric_features,
    binary_features,
    auto_detect_variable,
):
    """Check that StandardPreprocessor.__init__ raises TypeError for bad arguments.

    The five arguments are forwarded positionally, in the order received.
    """
    init_args = (
        data,
        numeric_features,
        categoric_features,
        binary_features,
        auto_detect_variable,
    )
    with pytest.raises(TypeError):
        StandardPreprocessor(*init_args)
def test_init_w_non_data_columns_feature(self):
    """Check that __init__ raises ValueError for the feature name "non-feature"."""
    unknown_columns = ["non-feature"]
    with pytest.raises(ValueError):
        StandardPreprocessor(
            data=self.data,
            numeric_features=unknown_columns,
        )
def test_init_w_empty_list_features(self):
    """Check that __init__ raises ValueError when only ``data`` is supplied."""
    init_kwargs = {"data": self.data}
    with pytest.raises(ValueError):
        StandardPreprocessor(**init_kwargs)
def test_init_w_non_list_features(self):
    """Check that __init__ raises TypeError when numeric_features is a string."""
    not_a_list = "features"
    with pytest.raises(TypeError):
        StandardPreprocessor(
            data=self.data,
            numeric_features=not_a_list,
        )
class TestStandardPreprocessor:
    """Test StandardPreprocessor."""

    def setup_method(self):
        """Set generic variables for each test case.

        Named ``setup_method`` (pytest's xunit-style per-test hook) rather
        than the nose-style ``setup``, which pytest deprecated in 7.2 and no
        longer calls from 8.0 onwards.
        """
        self.data = pd.read_csv(
            "tests/test_files/standard_preprocessor/all_estimators_run/raw_data.csv"
        )
        self.preprocessor = StandardPreprocessor(
            data=self.data,
            numeric_features=["age", "capital-gain"],
            categoric_features=[
                "workclass",
                "education",
                "marital-status",
                "occupation",
                "race",
            ],
            binary_features=["sex", "Salary"],
        )

    def test_init(self):
        """Test __init__ with explicitly supplied feature lists."""
        assert self.preprocessor.data.equals(self.data)
        assert self.preprocessor.numeric_features == ["age", "capital-gain"]
        assert self.preprocessor.categoric_features == [
            "workclass",
            "education",
            "marital-status",
            "occupation",
            "race",
        ]
        assert self.preprocessor.binary_features == ["sex", "Salary"]

    def test_init_w_non_list_features(self):
        """Test __init__ raises TypeError for a non-list feature argument."""
        with pytest.raises(TypeError):
            StandardPreprocessor(data=self.data, numeric_features="features")

    def test_init_w_empty_list_features(self):
        """Test __init__ raises ValueError when only data is supplied."""
        with pytest.raises(ValueError):
            StandardPreprocessor(data=self.data)

    def test_init_w_non_data_columns_feature(self):
        """Test __init__ raises ValueError for the feature name "non-feature"."""
        with pytest.raises(ValueError):
            StandardPreprocessor(data=self.data, numeric_features=["non-feature"])

    def test_init_w_auto_detection(self):
        """Test __init__ with auto detection."""
        self.preprocessor = StandardPreprocessor(
            data=self.data, auto_detect_variable=True
        )
        assert self.preprocessor.data.equals(self.data)
        assert self.preprocessor.numeric_features == ["age", "capital-gain"]
        assert self.preprocessor.categoric_features == [
            "workclass",
            "education",
            "marital-status",
            "occupation",
            "race",
        ]
        assert self.preprocessor.binary_features == ["sex", "Salary"]

    def test_process(self):
        """Test process against the stored all-estimators fixtures."""
        x_train, x_test, y_train, y_test = self.preprocessor.process(
            "Salary", remove_nans=True, random_state=2020
        )
        x_train_expected = pd.read_csv(
            "tests/test_files/standard_preprocessor/all_estimators_run/x_train.csv"
        )
        x_test_expected = pd.read_csv(
            "tests/test_files/standard_preprocessor/all_estimators_run/x_test.csv"
        )
        # ``read_csv(squeeze=True)`` was removed in pandas 2.0; squeezing the
        # column axis after the read is the documented replacement.
        y_train_expected = pd.read_csv(
            "tests/test_files/standard_preprocessor/all_estimators_run/y_train.csv"
        ).squeeze("columns")
        y_test_expected = pd.read_csv(
            "tests/test_files/standard_preprocessor/all_estimators_run/y_test.csv"
        ).squeeze("columns")
        assert_frame_equal(
            x_train.reset_index(drop=True).sort_index(axis=1),
            x_train_expected.sort_index(axis=1),
            check_dtype=False,
        )
        assert_frame_equal(
            x_test.reset_index(drop=True).sort_index(axis=1),
            x_test_expected.sort_index(axis=1),
            check_dtype=False,
        )
        assert_series_equal(
            y_train.reset_index(drop=True), y_train_expected, check_dtype=False
        )
        assert_series_equal(
            y_test.reset_index(drop=True), y_test_expected, check_dtype=False
        )

    def test_process_with_invalid_target(self):
        """Test process with target not in features_to_check."""
        with pytest.raises(ValueError):
            self.preprocessor.process("Fake target")

    def test_process_no_estimators(self):
        """Test process with no estimators given.

        The result is compared against the stored fixtures only. The test
        must not write to those files: regenerating them from the output
        under test would make the comparison circular.
        """
        x, y = self.preprocessor.process(
            "Salary",
            train_test_splitter=None,
            imputer=None,
            scaler=None,
            label_encoder=None,
            one_hot_encode=False,
            remove_nans=True,
            random_state=2020,
        )
        x_expected = pd.read_csv(
            "tests/test_files/standard_preprocessor/no_estimators_run/x.csv"
        )
        # ``read_csv(squeeze=True)`` was removed in pandas 2.0.
        y_expected = pd.read_csv(
            "tests/test_files/standard_preprocessor/no_estimators_run/y.csv"
        ).squeeze("columns")
        assert_frame_equal(
            x.reset_index(drop=True).sort_index(axis=1),
            x_expected.sort_index(axis=1),
            check_dtype=False,
        )
        assert_series_equal(y.reset_index(drop=True), y_expected, check_dtype=False)

    def test_check_columns_for_nans_no_remove_nans(self):
        """Test _check_columns_for_nans with remove_nans set to False."""
        self.preprocessor.processed_data = self.data
        with pytest.raises(ValueError):
            self.preprocessor._check_columns_for_nans(remove_nans=False)

    def test_check_columns_for_nans_no_nans(self):
        """Test _check_columns_for_nans with no nans in dataset."""
        save_data = self.data.dropna()
        # Hand over a copy so the frame compared below is independent of
        # anything the call does to ``processed_data``.
        self.preprocessor.processed_data = deepcopy(save_data)
        self.preprocessor._check_columns_for_nans(remove_nans=False)
        assert_frame_equal(save_data, self.preprocessor.processed_data)

    def test_impute_no_numeric_features(self):
        """Test _impute with no numeric_features set."""
        self.preprocessor.numeric_features = []
        self.preprocessor._impute("", "")

    def test_label_encode_no_binary_features(self):
        """Test _label_encode with no binary_features set."""
        self.preprocessor.binary_features = []
        self.preprocessor._label_encode("")

    def test_manually_set_column_types(self):
        """Test _manually_set_column_types with non list."""
        with pytest.raises(TypeError):
            self.preprocessor._manually_set_column_types("not-list", [], [])

    def test_manually_set_column_with_not_subset(self):
        """Test _manually_set_column_types with a feature list not being a subset."""
        with pytest.raises(ValueError):
            self.preprocessor._manually_set_column_types(
                ["not-subset-feature"], [], []
            )

    def test_scale_data_with_scale_target_as_false(self):
        """Test _scale_data with 'scale_target' as False."""
        self.preprocessor = StandardPreprocessor(
            data=self.data, binary_features=["Salary"]
        )
        self.preprocessor._scale_data("", target="Salary", scale_target=False)

    def test_features_to_check(self):
        """Test features_to_check property."""
        assert self.preprocessor.features_to_check == [
            "age",
            "capital-gain",
            "workclass",
            "education",
            "marital-status",
            "occupation",
            "race",
            "sex",
            "Salary",
        ]
def test_scale_data_with_scale_target_as_false(self):
    """Exercise _scale_data with ``scale_target=False`` on the "Salary" target.

    Replaces ``self.preprocessor`` with one whose only declared feature is
    the binary column "Salary"; the call is expected to finish without
    raising.
    """
    target_column = "Salary"
    self.preprocessor = StandardPreprocessor(
        data=self.data,
        binary_features=[target_column],
    )
    self.preprocessor._scale_data(
        "",
        target=target_column,
        scale_target=False,
    )