import logging
from typing import Optional, Union

import numpy as np
import pandas as pd
import pytest
from pandas.api.types import is_numeric_dtype
from scipy import sparse
from sklearn.utils.multiclass import type_of_target

# NOTE: these import paths assume the autoPyTorch package layout
from autoPyTorch.data.tabular_feature_validator import TabularFeatureValidator
from autoPyTorch.data.tabular_target_validator import TabularTargetValidator
from autoPyTorch.utils.logging_ import PicklableClientLogger, get_named_client_logger


def test_targetvalidator_inversetransform():
    """
    Test that the encoding/decoding works in 1D
    """
    validator = TabularTargetValidator(is_classification=True)
    validator.fit(
        pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'),
    )
    y = validator.transform(
        pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'),
    )
    np.testing.assert_array_almost_equal(np.array([0, 0, 1, 2, 0]), y)

    y_decoded = validator.inverse_transform(y)
    assert ['a', 'a', 'b', 'c', 'a'] == y_decoded.tolist()
    assert validator.classes_.tolist() == ['a', 'b', 'c']

    validator = TabularTargetValidator(is_classification=True)
    multi_label = pd.DataFrame(
        np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]),
        dtype=bool,
    )
    validator.fit(multi_label)
    y = validator.transform(multi_label)
    y_decoded = validator.inverse_transform(y)
    np.testing.assert_array_almost_equal(y, y_decoded)
    # Multi-label targets are not encoded; for this reason the
    # classes_ attribute does not contain any classes
    np.testing.assert_array_almost_equal(validator.classes_, np.array([]))

def test_is_single_column_target():
    validator = TabularTargetValidator(is_classification=True)
    validator.fit(np.array([1, 2, 3, 4]))
    assert validator.is_single_column_target()

    validator = TabularTargetValidator(is_classification=True)
    validator.fit(np.array([[1, 0, 1, 0], [1, 1, 1, 1]]))
    assert not validator.is_single_column_target()

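# NOTE: the original parametrization of 'input_data_targettest' is not part
# of this excerpt. A minimal sketch that satisfies the test's assumption
# (a continuous multi-output target) would be:
@pytest.mark.parametrize(
    'input_data_targettest',
    [np.random.default_rng(seed=42).random((10, 3))],
)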
def test_targetvalidator_continuous_multioutput(input_data_targettest):
    assert type_of_target(input_data_targettest) == 'continuous-multioutput'
    validator = TabularTargetValidator(is_classification=False)
    # Pass the same data as y_test to also exercise the test-target checks
    validator.fit(input_data_targettest, input_data_targettest)
    transformed_y = validator.transform(input_data_targettest)
    assert type_of_target(transformed_y) == 'continuous-multioutput'

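# NOTE: as above, the original parametrization is not shown here. A minimal
# sketch producing a 'multilabel-indicator' target would be:
@pytest.mark.parametrize(
    'input_data_targettest',
    [np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]])],
)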
def test_targetvalidator_multilabel(input_data_targettest):
    assert type_of_target(input_data_targettest) == 'multilabel-indicator'
    validator = TabularTargetValidator(is_classification=True)
    # Pass the same data as y_test to also exercise the test-target checks
    validator.fit(input_data_targettest, input_data_targettest)
    transformed_y = validator.transform(input_data_targettest)
    assert type_of_target(transformed_y) == 'multilabel-indicator'

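# NOTE: hypothetical parametrization sketch covering the container types this
# test branches on (numpy, pandas, and list regression targets); the values
# actually used upstream may differ:
@pytest.mark.parametrize(
    'input_data_targettest',
    [
        np.array([1.0, 2.0, 3.0]),
        np.array([[1.0], [2.0], [3.0]]),
        pd.Series([1.0, 2.0, 3.0]),
        pd.DataFrame([1.0, 2.0, 3.0]),
        [1.0, 2.0, 3.0],
    ],
)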
def test_targetvalidator_supported_types_noclassification(
        input_data_targettest):
    validator = TabularTargetValidator(is_classification=False)
    validator.fit(input_data_targettest)
    transformed_y = validator.transform(input_data_targettest)
    if sparse.issparse(input_data_targettest):
        assert sparse.issparse(transformed_y)
    else:
        assert isinstance(transformed_y, np.ndarray)

    expected_shape = np.shape(input_data_targettest)
    if len(expected_shape) > 1 and expected_shape[1] == 1:
        # The target should have (N,) dimensionality instead of (N, 1)
        expected_shape = (expected_shape[0], )
    assert expected_shape == np.shape(transformed_y)
    assert np.issubdtype(transformed_y.dtype, np.number)
    assert validator._is_fitted

    # Because this is not classification, we do not expect an encoder
    assert validator.encoder is None

    if hasattr(input_data_targettest, "iloc"):
        np.testing.assert_array_equal(
            np.ravel(input_data_targettest.to_numpy()),
            np.ravel(transformed_y),
        )
    elif sparse.issparse(input_data_targettest):
        np.testing.assert_array_equal(
            np.ravel(input_data_targettest.todense()),
            np.ravel(transformed_y.todense()),
        )
    else:
        np.testing.assert_array_equal(
            np.ravel(np.array(input_data_targettest)),
            np.ravel(transformed_y),
        )

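# NOTE: the original parametrization is not part of this excerpt. A
# hypothetical target in the legacy sequence-of-sequences multi-label
# format, which sklearn's type_of_target rejects with the message matched
# below (recent numpy versions may reject the ragged list even earlier):
@pytest.mark.parametrize(
    'input_data_targettest',
    [[[1, 2], [3]]],
)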
def test_type_of_target_unsupported(input_data_targettest):
    """
    Makes sure we raise a proper error message to the user when
    an unsupported data input is provided
    """
    validator = TabularTargetValidator()
    with pytest.raises(ValueError, match=r"legacy multi-.* data representation."):
        validator.fit(input_data_targettest)

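# NOTE: hypothetical parametrization sketch with classification targets of
# the container types this test branches on; the upstream values may differ:
@pytest.mark.parametrize(
    'input_data_targettest',
    [
        np.array([0, 1, 2, 1]),
        pd.Series([0, 1, 2, 1]),
        pd.DataFrame([0, 1, 2, 1]),
        [0, 1, 2, 1],
    ],
)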
def test_targetvalidator_supported_types_classification(input_data_targettest):
    validator = TabularTargetValidator(is_classification=True)
    validator.fit(input_data_targettest)
    transformed_y = validator.transform(input_data_targettest)
    if sparse.issparse(input_data_targettest):
        assert sparse.issparse(transformed_y)
    else:
        assert isinstance(transformed_y, np.ndarray)

    expected_shape = np.shape(input_data_targettest)
    if len(expected_shape) > 1 and expected_shape[1] == 1:
        # The target should have (N,) dimensionality instead of (N, 1)
        expected_shape = (expected_shape[0], )
    assert expected_shape == np.shape(transformed_y)
    assert np.issubdtype(transformed_y.dtype, np.number)
    assert validator._is_fitted

    # Sparse targets are not encoded; for every other input type we expect
    # a classification encoder
    if not sparse.issparse(input_data_targettest):
        assert validator.encoder is not None

        # The encoding should be per column
        if len(transformed_y.shape) == 1:
            assert np.min(transformed_y) == 0
            assert np.max(transformed_y) == len(np.unique(transformed_y)) - 1
        else:
            for col in range(transformed_y.shape[1]):
                assert np.min(transformed_y[:, col]) == 0
                assert np.max(transformed_y[:, col]) == len(
                    np.unique(transformed_y[:, col])) - 1

        # Make sure we can perform inverse transform
        y_inverse = validator.inverse_transform(transformed_y)
        if hasattr(input_data_targettest, 'dtype'):
            # In case of numeric input, make sure the dtype is preserved
            if is_numeric_dtype(input_data_targettest.dtype):
                assert y_inverse.dtype == input_data_targettest.dtype
            # Then make sure every value is properly inverse-transformed
            np.testing.assert_array_equal(np.array(y_inverse),
                                          np.array(input_data_targettest))
        elif hasattr(input_data_targettest, 'dtypes'):
            if is_numeric_dtype(input_data_targettest.dtypes[0]):
                assert y_inverse.dtype == input_data_targettest.dtypes[0]
            # Then make sure every value is properly inverse-transformed
            np.testing.assert_array_equal(
                np.array(y_inverse),
                # pandas DataFrames are (N, 1) but targets are ravel()-ed
                input_data_targettest.to_numpy().reshape(-1),
            )
    else:
        # Sparse targets are not encoded, mainly because sparse data is
        # expected to be numpy of numerical type -- which currently does
        # not require encoding
        np.testing.assert_array_equal(
            np.ravel(input_data_targettest.todense()),
            np.ravel(transformed_y.todense()),
        )

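# NOTE: hypothetical sketch; any integer classification target works here,
# since the test below replaces (or appends) a previously unseen class label:
@pytest.mark.parametrize(
    'input_data_targettest',
    [
        np.array([1, 2, 3, 4]),
        pd.Series([1, 2, 3, 4]),
        [1, 2, 3, 4],
    ],
)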
def test_unknown_categories_in_targets(input_data_targettest):
    validator = TabularTargetValidator(is_classification=True)
    validator.fit(input_data_targettest)

    # Add an extra category
    if isinstance(input_data_targettest, list):
        input_data_targettest.append(input_data_targettest[-1] + 5000)
    elif isinstance(input_data_targettest, (pd.DataFrame, pd.Series)):
        input_data_targettest.iloc[-1] = 5000
    elif isinstance(input_data_targettest, np.ndarray):
        input_data_targettest[-1] = 5000

    x_t = validator.transform(input_data_targettest)
    assert x_t[-1].item(0) == -1

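# NOTE: hypothetical sketch; the test below only needs one fit-able target
# per supported container type:
@pytest.mark.parametrize(
    'input_data_targettest',
    [
        np.array([1, 2, 3, 4]),
        pd.Series([1, 2, 3, 4]),
        pd.DataFrame([1, 2, 3, 4]),
        [1, 2, 3, 4],
    ],
)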
def test_targetvalidator_fitontypeA_transformtypeB(input_data_targettest):
    """
    Check that we can fit on a given type (numpy) and still transform
    if the user later changes the type (to pandas, for example).
    This is problematic only in the case we create an encoder.
    """
    validator = TabularTargetValidator(is_classification=True)
    validator.fit(input_data_targettest)
    if isinstance(input_data_targettest, pd.DataFrame):
        complementary_type = input_data_targettest.to_numpy()
    elif isinstance(input_data_targettest, (pd.Series, np.ndarray, list)):
        complementary_type = pd.DataFrame(input_data_targettest)
    validator.transform(complementary_type)

def __init__(
    self,
    is_classification: bool = False,
    logger_port: Optional[int] = None,
) -> None:
    self.is_classification = is_classification
    self.logger_port = logger_port
    if self.logger_port is not None:
        self.logger: Union[
            logging.Logger,
            PicklableClientLogger,
        ] = get_named_client_logger(
            name='Validation',
            port=self.logger_port,
        )
    else:
        self.logger = logging.getLogger('Validation')

    self.feature_validator = TabularFeatureValidator(logger=self.logger)
    self.target_validator = TabularTargetValidator(
        is_classification=self.is_classification,
        logger=self.logger,
    )
    self._is_fitted = False

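# NOTE: a minimal usage sketch, assuming the __init__ above belongs to
# autoPyTorch's TabularInputValidator (the enclosing class is not shown in
# this excerpt):
#
#     validator = TabularInputValidator(is_classification=True)
#     validator.fit(X_train=X_train, y_train=y_train)
#     X_transformed, y_transformed = validator.transform(X_train, y_train)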
def test_target_unsupported():
    """
    Makes sure we raise a proper error message to the user when
    an unsupported data input is provided
    """
    validator = TabularTargetValidator(is_classification=True)
    with pytest.raises(ValueError,
                       match=r"The dimensionality of the train and test targets"):
        validator.fit(
            np.array([[0, 1, 0], [0, 1, 1]]),
            np.array([[0, 1, 0, 0], [0, 1, 1, 1]]),
        )
    with pytest.raises(ValueError,
                       match=r"Train and test targets must both have the same dtypes"):
        validator.fit(
            pd.DataFrame({'a': [1, 2, 3]}),
            pd.DataFrame({'a': [True, False, False]}),
        )
    with pytest.raises(ValueError,
                       match=r"Provided targets are not supported.*"):
        validator.fit(
            np.array([[0, 1, 2], [0, 3, 4]]),
            np.array([[0, 1, 2, 5], [0, 3, 4, 6]]),
        )
    with pytest.raises(ValueError,
                       match="Train and test targets must both have the same"):
        validator.fit(
            pd.DataFrame({'string': ['foo']}),
            pd.DataFrame({'int': [1]}),
        )
    with pytest.raises(ValueError,
                       match=r"AutoPyTorch only supports Numpy arrays, .*"):
        validator.fit({'input1': 1, 'input2': 2})
    # The leading letter is dropped from the pattern, presumably so it
    # matches both 'Target' and 'target'
    with pytest.raises(ValueError,
                       match=r"arget values cannot contain missing/NaN values"):
        validator.fit(np.array([np.nan, 1, 2]))
    with pytest.raises(ValueError,
                       match=r"arget values cannot contain missing/NaN values"):
        validator.fit(sparse.csr_matrix(np.array([1, 2, np.nan])))
    with pytest.raises(ValueError,
                       match=r"Cannot call transform on a validator that is not fit"):
        validator.transform(np.array([1, 2, 3]))
    with pytest.raises(ValueError,
                       match=r"Cannot call inverse_transform on a validator that is"):
        validator.inverse_transform(np.array([1, 2, 3]))
    with pytest.raises(ValueError,
                       match=r"Multi-dimensional classification is not yet supported"):
        validator._fit(np.array([[1, 2, 3], [1, 5, 6]]))

    # DIA/DOK matrices are not supported, as type_of_target calls len() on
    # the array, which raises "TypeError: len() of unsized object".
    # Basically, sparse data as multi-label is the only thing that makes
    # sense in this format.
    with pytest.raises(ValueError,
                       match=r"The provided data could not be interpreted by AutoPyTorch"):
        validator.fit(sparse.dia_matrix(np.array([1, 2, 3])))

    validator.fit(np.array([[0, 1, 0], [0, 1, 1]]))
    with pytest.raises(ValueError, match=r"Number of outputs changed from"):
        validator.fit(np.array([0, 1, 0]))