def test_targetvalidator_continuous_multioutput(input_data_targettest):
    assert type_of_target(input_data_targettest) == 'continuous-multioutput'
    validator = TargetValidator(is_classification=False)
    # Test the y_test also!
    validator.fit(input_data_targettest, input_data_targettest)
    transformed_y = validator.transform(input_data_targettest)
    assert type_of_target(transformed_y) == 'continuous-multioutput'
def test_targetvalidator_inversetransform():
    """
    Test that the encoding/decoding works in 1D
    """
    validator = TargetValidator(is_classification=True)
    validator.fit(
        pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'),
    )
    y = validator.transform(
        pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'),
    )
    np.testing.assert_array_almost_equal(np.array([0, 0, 1, 2, 0]), y)

    y_decoded = validator.inverse_transform(y)
    assert ['a', 'a', 'b', 'c', 'a'] == y_decoded.tolist()
    assert validator.classes_.tolist() == ['a', 'b', 'c']

    validator = TargetValidator(is_classification=True)
    multi_label = pd.DataFrame(
        np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]),
        dtype=bool,
    )
    validator.fit(multi_label)
    y = validator.transform(multi_label)
    y_decoded = validator.inverse_transform(y)
    np.testing.assert_array_almost_equal(y, y_decoded)

    # Multilabel classification is not encoded
    # For this reason, classes_ attribute does not contain a class
    np.testing.assert_array_almost_equal(validator.classes_, np.array([]))
def test_targetvalidator_multilabel(input_data_targettest):
    assert type_of_target(input_data_targettest) == 'multilabel-indicator'
    validator = TargetValidator(is_classification=True)
    # Test the y_test also!
    validator.fit(input_data_targettest, input_data_targettest)
    transformed_y = validator.transform(input_data_targettest)
    assert type_of_target(transformed_y) == 'multilabel-indicator'
def test_targetvalidator_supported_types_noclassification(input_data_targettest):
    validator = TargetValidator(is_classification=False)
    validator.fit(input_data_targettest)
    transformed_y = validator.transform(input_data_targettest)
    if sparse.issparse(input_data_targettest):
        assert sparse.issparse(transformed_y)
    else:
        assert isinstance(transformed_y, np.ndarray)

    expected_shape = np.shape(input_data_targettest)
    if len(expected_shape) > 1 and expected_shape[1] == 1:
        # The target should have (N,) dimensionality instead of (N, 1)
        expected_shape = (expected_shape[0], )
    assert expected_shape == np.shape(transformed_y)
    assert np.issubdtype(transformed_y.dtype, np.number)
    assert validator._is_fitted

    # Because this is not a classification task, we do not expect an encoder
    assert validator.encoder is None

    if hasattr(input_data_targettest, "iloc"):
        np.testing.assert_array_equal(
            np.ravel(input_data_targettest.to_numpy()),
            np.ravel(transformed_y)
        )
    elif sparse.issparse(input_data_targettest):
        np.testing.assert_array_equal(
            np.ravel(input_data_targettest.todense()),
            np.ravel(transformed_y.todense())
        )
    else:
        np.testing.assert_array_equal(
            np.ravel(np.array(input_data_targettest)),
            np.ravel(transformed_y)
        )
def test_is_single_column_target():
    validator = TargetValidator(is_classification=True)
    validator.fit(np.array([1, 2, 3, 4]))
    assert validator.is_single_column_target()

    validator = TargetValidator(is_classification=True)
    validator.fit(np.array([[1, 0, 1, 0], [1, 1, 1, 1]]))
    assert not validator.is_single_column_target()
def test_type_of_target_unsupported(input_data_targettest):
    """
    Makes sure we raise a proper message to the user,
    when providing unsupported data input
    """
    validator = TargetValidator()
    with pytest.raises(ValueError, match=r"legacy multi-.* data representation."):
        validator.fit(input_data_targettest)
def test_targetvalidator_supported_types_classification(input_data_targettest):
    validator = TargetValidator(is_classification=True)
    validator.fit(input_data_targettest)
    transformed_y = validator.transform(input_data_targettest)
    if sparse.issparse(input_data_targettest):
        assert sparse.issparse(transformed_y)
    else:
        assert isinstance(transformed_y, np.ndarray)

    expected_shape = np.shape(input_data_targettest)
    if len(expected_shape) > 1 and expected_shape[1] == 1:
        # The target should have (N,) dimensionality instead of (N, 1)
        expected_shape = (expected_shape[0], )
    assert expected_shape == np.shape(transformed_y)
    assert np.issubdtype(transformed_y.dtype, np.number)
    assert validator._is_fitted

    # Because this is a classification task, we expect an encoder for non-sparse data
    if not sparse.issparse(input_data_targettest):
        assert validator.encoder is not None

        # The encoding should be per column
        if len(transformed_y.shape) == 1:
            assert np.min(transformed_y) == 0
            assert np.max(transformed_y) == len(np.unique(transformed_y)) - 1
        else:
            for col in range(transformed_y.shape[1]):
                assert np.min(transformed_y[:, col]) == 0
                assert np.max(transformed_y[:, col]) == len(np.unique(transformed_y[:, col])) - 1

        # Make sure we can perform inverse transform
        y_inverse = validator.inverse_transform(transformed_y)
        if hasattr(input_data_targettest, 'dtype'):
            # In case of numeric, we need to make sure dtype is preserved
            if is_numeric_dtype(input_data_targettest.dtype):
                assert y_inverse.dtype == input_data_targettest.dtype
            # Then make sure every value is properly inverse-transformed
            np.testing.assert_array_equal(np.array(y_inverse),
                                          np.array(input_data_targettest))
        elif hasattr(input_data_targettest, 'dtypes'):
            if is_numeric_dtype(input_data_targettest.dtypes[0]):
                assert y_inverse.dtype == input_data_targettest.dtypes[0]
            # Then make sure every value is properly inverse-transformed
            np.testing.assert_array_equal(np.array(y_inverse),
                                          # pandas is always (N, 1) but targets are ravel()
                                          input_data_targettest.to_numpy().reshape(-1))
    else:
        # Sparse targets are not encoded, mainly because sparse data is expected
        # to be numpy of numerical type -- which currently does not require encoding
        np.testing.assert_array_equal(
            np.ravel(input_data_targettest.todense()),
            np.ravel(transformed_y.todense())
        )
def test_unknown_categories_in_targets(input_data_targettest):
    validator = TargetValidator(is_classification=True)
    validator.fit(input_data_targettest)

    # Add an extra category
    if isinstance(input_data_targettest, list):
        input_data_targettest.append(input_data_targettest[-1] + 5000)
    elif isinstance(input_data_targettest, (pd.DataFrame, pd.Series)):
        input_data_targettest.iloc[-1] = 5000
    elif isinstance(input_data_targettest, np.ndarray):
        input_data_targettest[-1] = 5000

    x_t = validator.transform(input_data_targettest)
    assert x_t[-1].item(0) == -1
def test_targetvalidator_fitontypeA_transformtypeB(input_data_targettest):
    """
    Check if we can fit on a given type (numpy) yet transform
    if the user changes the type (pandas then)

    This is problematic only in the case we create an encoder
    """
    validator = TargetValidator(is_classification=True)
    validator.fit(input_data_targettest)
    if isinstance(input_data_targettest, pd.DataFrame):
        complementary_type = input_data_targettest.to_numpy()
    elif isinstance(input_data_targettest, pd.Series):
        complementary_type = pd.DataFrame(input_data_targettest)
    elif isinstance(input_data_targettest, np.ndarray):
        complementary_type = pd.DataFrame(input_data_targettest)
    elif isinstance(input_data_targettest, list):
        complementary_type = pd.DataFrame(input_data_targettest)
    validator.transform(complementary_type)
class InputValidator(BaseEstimator):
    """
    Makes sure the input data complies with Auto-sklearn requirements.
    Categorical inputs are encoded via a Label Encoder, if the input
    is a dataframe.

    This class also performs checks for data integrity and flags the user
    via informative errors.

    Attributes
    ----------
    feat_type: typing.Optional[typing.List[str]]
        In case the dataset is not a pandas DataFrame:
            + If provided, this list indicates which columns should be treated as
              categorical. It is internally transformed into a dictionary that
              indicates a mapping from column index to categorical/numerical.
            + If not provided, by default all columns are treated as numerical.
        If the input dataset is of type pandas dataframe, this argument must be
        None, as the column type will be inferred from the pandas dtypes.
    is_classification: bool
        For a classification task, this flag indicates that the target data
        should be encoded
    feature_validator: FeatureValidator
        A FeatureValidator instance used to validate and encode feature columns
        to match sklearn expectations on the data
    target_validator: TargetValidator
        A TargetValidator instance used to validate and encode (in case of
        classification) the target values
    """
    def __init__(
        self,
        feat_type: typing.Optional[typing.List[str]] = None,
        is_classification: bool = False,
        logger_port: typing.Optional[int] = None,
    ) -> None:
        self.feat_type = feat_type
        self.is_classification = is_classification
        self.logger_port = logger_port
        if self.logger_port is not None:
            self.logger = get_named_client_logger(
                name='Validation',
                port=self.logger_port,
            )
        else:
            self.logger = logging.getLogger('Validation')

        self.feature_validator = FeatureValidator(feat_type=self.feat_type,
                                                  logger=self.logger)
        self.target_validator = TargetValidator(
            is_classification=self.is_classification,
            logger=self.logger)
        self._is_fitted = False

    def fit(
        self,
        X_train: SUPPORTED_FEAT_TYPES,
        y_train: SUPPORTED_TARGET_TYPES,
        X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None,
        y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
    ) -> BaseEstimator:
        """
        Validates and fits a categorical encoder (if needed) to the features, and
        an encoder for targets in the case of classification. Specifically:

        For features:
            + Valid data types are enforced (List, np.ndarray, pd.DataFrame,
              pd.Series, scipy sparse) as well as dimensionality checks
            + If the provided data is a pandas DataFrame with categorical/boolean/int
              columns, such columns will be encoded using an Ordinal Encoder
        For targets:
            + Checks for dimensionality as well as missing values are performed.
            + If performing a classification task, the data is going to be encoded

        Parameters
        ----------
        X_train: SUPPORTED_FEAT_TYPES
            A set of features that are going to be validated (type and dimensionality
            checks). If this data contains categorical columns, an encoder is going to
            be instantiated and trained with this data.
        y_train: SUPPORTED_TARGET_TYPES
            A set of targets that are going to be encoded if the task is for
            classification
        X_test: typing.Optional[SUPPORTED_FEAT_TYPES]
            A hold out set of features used for checking
        y_test: SUPPORTED_TARGET_TYPES
            A hold out set of targets used for checking. Additionally, if the current
            task is a classification task, these y_test categories are also going to be
            used to fit a pre-processing encoding (to prevent errors on unseen classes).
        Returns
        -------
        self
        """
        # Check that the data is valid
        if np.shape(X_train)[0] != np.shape(y_train)[0]:
            raise ValueError(
                "Inconsistent number of train datapoints for features and targets,"
                " {} for features and {} for targets".format(
                    np.shape(X_train)[0],
                    np.shape(y_train)[0],
                ))
        if X_test is not None and np.shape(X_test)[0] != np.shape(y_test)[0]:
            raise ValueError(
                "Inconsistent number of test datapoints for features and targets,"
                " {} for features and {} for targets".format(
                    np.shape(X_test)[0],
                    np.shape(y_test)[0],
                ))

        self.feature_validator.fit(X_train, X_test)
        self.target_validator.fit(y_train, y_test)
        self._is_fitted = True

        return self

    def transform(
        self,
        X: SUPPORTED_FEAT_TYPES,
        y: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
    ) -> typing.Tuple[np.ndarray, typing.Optional[np.ndarray]]:
        """
        Transform the given target or features to a numpy array

        Parameters
        ----------
        X: SUPPORTED_FEAT_TYPES
            A set of features to transform
        y: typing.Optional[SUPPORTED_TARGET_TYPES]
            A set of targets to transform

        Returns
        -------
        np.ndarray:
            The transformed features array
        np.ndarray:
            The transformed targets array
        """
        if not self._is_fitted:
            raise NotFittedError(
                "Cannot call transform on a validator that is not fitted")
        X_transformed = self.feature_validator.transform(X)
        if y is not None:
            return X_transformed, self.target_validator.transform(y)
        else:
            return X_transformed, y
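
# The function below is a minimal, illustrative sketch of the fit/transform flow
# documented in the InputValidator docstrings above. The toy DataFrame/Series data
# and the helper name `_example_input_validation` are assumptions made purely for
# illustration; they are not part of auto-sklearn and the exact encoded values may
# differ from what is shown in the comments.
def _example_input_validation() -> None:
    X_train = pd.DataFrame({
        'numerical': [1.0, 2.0, 3.0],
        'categorical': pd.Series(['a', 'b', 'a'], dtype='category'),
    })
    y_train = pd.Series(['yes', 'no', 'yes'], dtype='category')

    # Fit validates both features and targets; because is_classification=True,
    # the string targets are label-encoded.
    validator = InputValidator(is_classification=True)
    validator.fit(X_train, y_train)

    # Both returned objects follow the documented return types: the categorical
    # feature column is ordinally encoded and the targets become integer labels.
    X_np, y_np = validator.transform(X_train, y_train)
    assert isinstance(X_np, np.ndarray)
    assert isinstance(y_np, np.ndarray)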
def test_target_unsupported():
    """
    Makes sure we raise a proper message to the user,
    when providing unsupported data input
    """
    validator = TargetValidator(is_classification=True)
    with pytest.raises(ValueError, match=r"The dimensionality of the train and test targets"):
        validator.fit(
            np.array([[0, 1, 0], [0, 1, 1]]),
            np.array([[0, 1, 0, 0], [0, 1, 1, 1]]),
        )
    with pytest.raises(ValueError, match=r"Train and test targets must both have the same dtypes"):
        validator.fit(
            pd.DataFrame({'a': [1, 2, 3]}),
            pd.DataFrame({'a': [True, False, False]}),
        )
    with pytest.raises(ValueError, match=r"Provided targets are not supported.*"):
        validator.fit(
            np.array([[0, 1, 2], [0, 3, 4]]),
            np.array([[0, 1, 2, 5], [0, 3, 4, 6]]),
        )
    with pytest.raises(ValueError, match="Train and test targets must both have the same"):
        validator.fit(
            pd.DataFrame({'string': ['foo']}),
            pd.DataFrame({'int': [1]}),
        )
    with pytest.raises(ValueError, match=r"Auto-sklearn only supports Numpy arrays, .*"):
        validator.fit({'input1': 1, 'input2': 2})
    with pytest.raises(ValueError, match=r"arget values cannot contain missing/NaN values"):
        validator.fit(np.array([np.nan, 1, 2]))
    with pytest.raises(ValueError, match=r"arget values cannot contain missing/NaN values"):
        validator.fit(sparse.csr_matrix(np.array([1, 2, np.nan])))
    with pytest.raises(ValueError, match=r"Cannot call transform on a validator that is not fit"):
        validator.transform(np.array([1, 2, 3]))
    with pytest.raises(ValueError, match=r"Cannot call inverse_transform on a validator that is"):
        validator.inverse_transform(np.array([1, 2, 3]))
    with pytest.raises(ValueError, match=r"Multi-dimensional classification is not yet supported"):
        validator._fit(np.array([[1, 2, 3], [1, 5, 6]]))

    # DIA/DOK are not supported, as type_of_target calls len() on the array,
    # which causes "TypeError: len() of unsized object". Basically, sparse data as
    # multi-label is the only thing that makes sense in this format.
    with pytest.raises(ValueError, match=r"The provided data could not be interpreted by Sklearn"):
        validator.fit(sparse.dia_matrix(np.array([1, 2, 3])))

    validator.fit(np.array([[0, 1, 0], [0, 1, 1]]))
    with pytest.raises(ValueError, match=r"Number of outputs changed from"):
        validator.fit(np.array([0, 1, 0]))