Example #1
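The snippets below are excerpted from auto-sklearn's data-validation tests, where input_data_targettest is a pytest fixture that parametrizes the tests over several target representations (lists, numpy arrays, pandas objects, sparse matrices). They share a common preamble, a sketch of which follows; the TargetValidator import path is an assumption based on auto-sklearn's layout and may differ between versions. Example #10 additionally relies on auto-sklearn internals (FeatureValidator, get_named_client_logger, SUPPORTED_FEAT_TYPES, SUPPORTED_TARGET_TYPES) and standard imports (logging, typing, sklearn's BaseEstimator and NotFittedError).

# Assumed common imports for the test snippets below.
import numpy as np
import pandas as pd
import pytest
from pandas.api.types import is_numeric_dtype
from scipy import sparse
from sklearn.utils.multiclass import type_of_target

# Import path assumed; adjust to the installed auto-sklearn version.
from autosklearn.data.target_validator import TargetValidator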
def test_targetvalidator_inversetransform():
    """
    Test that the encoding/decoding works in 1D
    """
    validator = TargetValidator(is_classification=True)
    validator.fit(
        pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'),
    )
    y = validator.transform(
        pd.DataFrame(data=['a', 'a', 'b', 'c', 'a'], dtype='category'),
    )
    np.testing.assert_array_almost_equal(np.array([0, 0, 1, 2, 0]), y)

    y_decoded = validator.inverse_transform(y)
    assert ['a', 'a', 'b', 'c', 'a'] == y_decoded.tolist()

    assert validator.classes_.tolist() == ['a', 'b', 'c']

    validator = TargetValidator(is_classification=True)
    multi_label = pd.DataFrame(
        np.array([[1, 0, 0, 1], [0, 0, 1, 1], [0, 0, 0, 0]]),
        dtype=bool
    )
    validator.fit(multi_label)
    y = validator.transform(multi_label)

    y_decoded = validator.inverse_transform(y)
    np.testing.assert_array_almost_equal(y, y_decoded)

    # Multilabel classification is not encoded
    # For this reason, classes_ attribute does not contain a class
    np.testing.assert_array_almost_equal(validator.classes_, np.array([]))
Example #3
def test_target_unsupported():
    """
    Makes sure we raise a proper message to the user,
    when providing not supported data input
    """
    validator = TargetValidator(is_classification=True)
    with pytest.raises(ValueError, match=r"The dimensionality of the train and test targets"):
        validator.fit(
            np.array([[0, 1, 0], [0, 1, 1]]),
            np.array([[0, 1, 0, 0], [0, 1, 1, 1]]),
        )
    with pytest.raises(ValueError, match=r"Train and test targets must both have the same dtypes"):
        validator.fit(
            pd.DataFrame({'a': [1, 2, 3]}),
            pd.DataFrame({'a': [True, False, False]}),
        )
    with pytest.raises(ValueError, match=r"Provided targets are not supported.*"):
        validator.fit(
            np.array([[0, 1, 2], [0, 3, 4]]),
            np.array([[0, 1, 2, 5], [0, 3, 4, 6]]),
        )
    with pytest.raises(ValueError, match="Train and test targets must both have the same"):
        validator.fit(
            pd.DataFrame({'string': ['foo']}),
            pd.DataFrame({'int': [1]}),
        )
    with pytest.raises(ValueError, match=r"Auto-sklearn only supports Numpy arrays, .*"):
        validator.fit({'input1': 1, 'input2': 2})
    with pytest.raises(ValueError, match=r"arget values cannot contain missing/NaN values"):
        validator.fit(np.array([np.nan, 1, 2]))
    with pytest.raises(ValueError, match=r"arget values cannot contain missing/NaN values"):
        validator.fit(sparse.csr_matrix(np.array([1, 2, np.nan])))
    with pytest.raises(ValueError, match=r"Cannot call transform on a validator that is not fit"):
        validator.transform(np.array([1, 2, 3]))
    with pytest.raises(ValueError, match=r"Cannot call inverse_transform on a validator that is"):
        validator.inverse_transform(np.array([1, 2, 3]))
    with pytest.raises(ValueError, match=r"Multi-dimensional classification is not yet supported"):
        validator._fit(np.array([[1, 2, 3], [1, 5, 6]]))

    # DIA/DOK matrices are not supported as targets because type_of_target
    # calls len() on the array, which raises TypeError: len() of unsized
    # object. Basically, a sparse multi-label indicator is the only target
    # that makes sense in sparse format.
    with pytest.raises(ValueError, match=r"The provided data could not be interpreted by Sklearn"):
        validator.fit(sparse.dia_matrix(np.array([1, 2, 3])))

    validator.fit(np.array([[0, 1, 0], [0, 1, 1]]))
    with pytest.raises(ValueError, match=r"Number of outputs changed from"):
        validator.fit(np.array([0, 1, 0]))
Example #4
def test_targetvalidator_continuous_multioutput(input_data_targettest):
    assert type_of_target(input_data_targettest) == 'continuous-multioutput'
    validator = TargetValidator(is_classification=False)
    # Test the y_test argument also!
    validator.fit(input_data_targettest, input_data_targettest)
    transformed_y = validator.transform(input_data_targettest)
    assert type_of_target(transformed_y) == 'continuous-multioutput'
Example #5
def test_targetvalidator_multilabel(input_data_targettest):
    assert type_of_target(input_data_targettest) == 'multilabel-indicator'
    validator = TargetValidator(is_classification=True)
    # Test the y_test argument also!
    validator.fit(input_data_targettest, input_data_targettest)
    transformed_y = validator.transform(input_data_targettest)
    assert type_of_target(transformed_y) == 'multilabel-indicator'
Example #6
def test_targetvalidator_supported_types_noclassification(input_data_targettest):
    validator = TargetValidator(is_classification=False)
    validator.fit(input_data_targettest)
    transformed_y = validator.transform(input_data_targettest)
    if sparse.issparse(input_data_targettest):
        assert sparse.issparse(transformed_y)
    else:
        assert isinstance(transformed_y, np.ndarray)
    expected_shape = np.shape(input_data_targettest)
    if len(expected_shape) > 1 and expected_shape[1] == 1:
        # The target should have (N,) dimensionality instead of (N, 1)
        expected_shape = (expected_shape[0], )
    assert expected_shape == np.shape(transformed_y)
    assert np.issubdtype(transformed_y.dtype, np.number)
    assert validator._is_fitted

    # Because this is not a classification task, we do not expect an encoder
    assert validator.encoder is None

    if hasattr(input_data_targettest, "iloc"):
        np.testing.assert_array_equal(
            np.ravel(input_data_targettest.to_numpy()),
            np.ravel(transformed_y)
        )
    elif sparse.issparse(input_data_targettest):
        np.testing.assert_array_equal(
            np.ravel(input_data_targettest.todense()),
            np.ravel(transformed_y.todense())
        )
    else:
        np.testing.assert_array_equal(
            np.ravel(np.array(input_data_targettest)),
            np.ravel(transformed_y)
        )
Example #7
def test_targetvalidator_fitontypeA_transformtypeB(input_data_targettest):
    """
    Check if we can fit in a given type (numpy) yet transform
    if the user changes the type (pandas then)

    This is problematic only in the case we create an encoder
    """
    validator = TargetValidator(is_classification=True)
    validator.fit(input_data_targettest)
    if isinstance(input_data_targettest, pd.DataFrame):
        complementary_type = input_data_targettest.to_numpy()
    elif isinstance(input_data_targettest, pd.Series):
        complementary_type = pd.DataFrame(input_data_targettest)
    elif isinstance(input_data_targettest, np.ndarray):
        complementary_type = pd.DataFrame(input_data_targettest)
    elif isinstance(input_data_targettest, list):
        complementary_type = pd.DataFrame(input_data_targettest)
    validator.transform(complementary_type)
Example #8
def test_targetvalidator_supported_types_classification(input_data_targettest):
    validator = TargetValidator(is_classification=True)
    validator.fit(input_data_targettest)
    transformed_y = validator.transform(input_data_targettest)
    if sparse.issparse(input_data_targettest):
        assert sparse.issparse(transformed_y)
    else:
        assert isinstance(transformed_y, np.ndarray)
    expected_shape = np.shape(input_data_targettest)
    if len(expected_shape) > 1 and expected_shape[1] == 1:
        # The target should have (N,) dimensionality instead of (N, 1)
        expected_shape = (expected_shape[0], )
    assert expected_shape == np.shape(transformed_y)
    assert np.issubdtype(transformed_y.dtype, np.number)
    assert validator._is_fitted

    # Because this is a classification task, an encoder is expected for
    # non-sparse data
    if not sparse.issparse(input_data_targettest):
        assert validator.encoder is not None

        # The encoding should be per column
        if len(transformed_y.shape) == 1:
            assert np.min(transformed_y) == 0
            assert np.max(transformed_y) == len(np.unique(transformed_y)) - 1
        else:
            for col in range(transformed_y.shape[1]):
                assert np.min(transformed_y[:, col]) == 0
                assert np.max(transformed_y[:, col]) == len(np.unique(transformed_y[:, col])) - 1

        # Make sure we can perform inverse transform
        y_inverse = validator.inverse_transform(transformed_y)
        if hasattr(input_data_targettest, 'dtype'):
            # In case of numeric, we need to make sure dtype is preserved
            if is_numeric_dtype(input_data_targettest.dtype):
                assert y_inverse.dtype == input_data_targettest.dtype
            # Then make sure every value is properly inverse-transformed
            np.testing.assert_array_equal(np.array(y_inverse), np.array(input_data_targettest))
        elif hasattr(input_data_targettest, 'dtypes'):
            if is_numeric_dtype(input_data_targettest.dtypes[0]):
                assert y_inverse.dtype == input_data_targettest.dtypes[0]
            # Then make sure every value is properly inverse-transformed
            np.testing.assert_array_equal(np.array(y_inverse),
                                          # pandas is always (N, 1) but targets are ravel()
                                          input_data_targettest.to_numpy().reshape(-1))
    else:
        # Sparse targets are not encoded, mainly because sparse data is
        # expected to be numeric already -- which currently does not
        # require encoding
        np.testing.assert_array_equal(
            np.ravel(input_data_targettest.todense()),
            np.ravel(transformed_y.todense())
        )
Example #9
def test_unknown_categories_in_targets(input_data_targettest):
    validator = TargetValidator(is_classification=True)
    validator.fit(input_data_targettest)

    # Add an extra category
    if isinstance(input_data_targettest, list):
        input_data_targettest.append(input_data_targettest[-1] + 5000)
    elif isinstance(input_data_targettest, (pd.DataFrame, pd.Series)):
        input_data_targettest.iloc[-1] = 5000
    elif isinstance(input_data_targettest, np.ndarray):
        input_data_targettest[-1] = 5000

    x_t = validator.transform(input_data_targettest)
    assert x_t[-1].item(0) == -1
Example #10
class InputValidator(BaseEstimator):
    """
    Makes sure the input data complies with Auto-sklearn requirements.
    Categorical inputs are encoded via a Label Encoder, if the input
    is a dataframe.

    This class also performs checks for data integrity and flags the user
    via informative errors.

    Attributes
    ----------
        feat_type: typing.Optional[typing.List[str]]
            In case the dataset is not a pandas DataFrame:
                + If provided, this list indicates which columns should be treated as
                  categorical. It is internally transformed into a dictionary that maps
                  from column index to categorical/numerical.
                + If not provided, by default all columns are treated as numerical
            If the input dataset is a pandas DataFrame, this argument
            must be None, as the column types will be inferred from the pandas dtypes.
        is_classification: bool
            For classification tasks, this flag indicates that the target data
            should be encoded
        feature_validator: FeatureValidator
            A FeatureValidator instance used to validate and encode feature columns to match
            sklearn expectations on the data
        target_validator: TargetValidator
            A TargetValidator instance used to validate and encode (in case of classification)
            the target values
    """
    def __init__(
        self,
        feat_type: typing.Optional[typing.List[str]] = None,
        is_classification: bool = False,
        logger_port: typing.Optional[int] = None,
    ) -> None:
        self.feat_type = feat_type
        self.is_classification = is_classification
        self.logger_port = logger_port
        if self.logger_port is not None:
            self.logger = get_named_client_logger(
                name='Validation',
                port=self.logger_port,
            )
        else:
            self.logger = logging.getLogger('Validation')

        self.feature_validator = FeatureValidator(feat_type=self.feat_type,
                                                  logger=self.logger)
        self.target_validator = TargetValidator(
            is_classification=self.is_classification, logger=self.logger)
        self._is_fitted = False

    def fit(
        self,
        X_train: SUPPORTED_FEAT_TYPES,
        y_train: SUPPORTED_TARGET_TYPES,
        X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None,
        y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
    ) -> BaseEstimator:
        """
        Validates and fits a categorical encoder (if needed) to the features, and
        an encoder for the targets in the case of classification. Specifically:

        For features:
            + Valid data types are enforced (List, np.ndarray, pd.DataFrame, pd.Series, scipy
              sparse) as well as dimensionality checks
            + If the provided data is a pandas DataFrame with categorical/boolean/int columns,
              such columns will be encoded using an Ordinal Encoder
        For targets:
            + Checks for dimensionality as well as missing values are performed.
            + If performing a classification task, the data is going to be encoded

        Parameters
        ----------
            X_train: SUPPORTED_FEAT_TYPES
                A set of features that are going to be validated (type and dimensionality
                checks). If this data contains categorical columns, an encoder is going to
                be instantiated and trained with this data.
            y_train: SUPPORTED_TARGET_TYPES
                A set of targets that are going to be encoded if the task is for classification
            X_test: typing.Optional[SUPPORTED_FEAT_TYPES]
                A hold out set of features used for checking
            y_test: typing.Optional[SUPPORTED_TARGET_TYPES]
                A hold out set of targets used for checking. Additionally, if the current
                task is a classification task, the categories of y_test are also used to
                fit the pre-processing encoding (to prevent errors on unseen classes).
        Returns
        -------
            self
        """
        # Check that the data is valid
        if np.shape(X_train)[0] != np.shape(y_train)[0]:
            raise ValueError(
                "Inconsistent number of train datapoints for features and targets,"
                " {} for features and {} for targets".format(
                    np.shape(X_train)[0],
                    np.shape(y_train)[0],
                ))
        if X_test is not None and np.shape(X_test)[0] != np.shape(y_test)[0]:
            raise ValueError(
                "Inconsistent number of test datapoints for features and targets,"
                " {} for features and {} for targets".format(
                    np.shape(X_test)[0],
                    np.shape(y_test)[0],
                ))

        self.feature_validator.fit(X_train, X_test)
        self.target_validator.fit(y_train, y_test)
        self._is_fitted = True

        return self

    def transform(
        self,
        X: SUPPORTED_FEAT_TYPES,
        y: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
    ) -> typing.Tuple[np.ndarray, typing.Optional[np.ndarray]]:
        """
        Transform the given target or features to a numpy array

        Parameters
        ----------
            X: SUPPORTED_FEAT_TYPES
                A set of features to transform
            y: typing.Optional[SUPPORTED_TARGET_TYPES]
                A set of targets to transform

        Returns
        -------
            np.ndarray:
                The transformed features array
            np.ndarray:
                The transformed targets array
        """
        if not self._is_fitted:
            raise NotFittedError(
                "Cannot call transform on a validator that is not fitted")
        X_transformed = self.feature_validator.transform(X)
        if y is not None:
            return X_transformed, self.target_validator.transform(y)
        else:
            return X_transformed, y
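
A minimal usage sketch of InputValidator follows. The data values are illustrative, and the import path autosklearn.data.validation is an assumption based on auto-sklearn's layout, not confirmed by the excerpts above:

# Hypothetical usage sketch; the import path is an assumption.
from autosklearn.data.validation import InputValidator

X = pd.DataFrame({
    'age': [25, 32, 47],
    'color': pd.Series(['red', 'blue', 'red'], dtype='category'),
})
y = ['cat', 'dog', 'cat']

validator = InputValidator(is_classification=True).fit(X, y)
# Both outputs are numeric arrays suitable for sklearn estimators
X_t, y_t = validator.transform(X, y)
# Classification targets can be mapped back to the original labels
y_orig = validator.target_validator.inverse_transform(y_t)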