def test_featurevalidator_get_columns_to_encode():
    """
    Makes sure that encoded columns are returned by _get_columns_to_encode
    whereas numerical columns are not returned
    """
    validator = FeatureValidator()

    df = pd.DataFrame([
        {
            'int': 1,
            'float': 1.0,
            'category': 'one',
            'bool': True
        },
        {
            'int': 2,
            'float': 2.0,
            'category': 'two',
            'bool': False
        },
    ])

    # Each column is named after its intended dtype, so cast it accordingly
    for col in df.columns:
        df[col] = df[col].astype(col)

    enc_columns, feature_types = validator._get_columns_to_encode(df)

    assert enc_columns == ['category', 'bool']
    assert feature_types == [
        'numerical', 'numerical', 'categorical', 'categorical'
    ]
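
# For reference, the split that the test above expects can be reproduced with
# plain pandas dtype checks. This is a sketch, not FeatureValidator's actual
# implementation; the helper name `split_columns` is ours.
def split_columns(df):
    # Categorical/bool dtypes are slated for encoding; numeric dtypes are not
    enc_columns = [col for col in df.columns
                   if isinstance(df[col].dtype, pd.CategoricalDtype)
                   or pd.api.types.is_bool_dtype(df[col])]
    feature_types = ['categorical' if col in enc_columns else 'numerical'
                     for col in df.columns]
    return enc_columns, feature_types
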
def test_featurevalidator_supported_types(input_data_featuretest):
    validator = FeatureValidator()
    validator.fit(input_data_featuretest, input_data_featuretest)
    transformed_X = validator.transform(input_data_featuretest)
    if sparse.issparse(input_data_featuretest):
        assert sparse.issparse(transformed_X)
    else:
        assert isinstance(transformed_X, np.ndarray)
    assert np.shape(input_data_featuretest) == np.shape(transformed_X)
    assert np.issubdtype(transformed_X.dtype, np.number)
    assert validator._is_fitted
def test_featurevalidator_supported_types(input_data_featuretest):
    validator = FeatureValidator()
    validator.fit(input_data_featuretest, input_data_featuretest)
    transformed_X = validator.transform(input_data_featuretest)
    if sparse.issparse(input_data_featuretest):
        assert sparse.issparse(transformed_X)
    elif isinstance(input_data_featuretest, list):
        assert isinstance(transformed_X, pd.DataFrame)
    else:
        assert isinstance(transformed_X, type(input_data_featuretest))
    assert np.shape(input_data_featuretest) == np.shape(transformed_X)
    assert validator._is_fitted
def test_featurevalidator_new_data_after_fit(openml_id, train_data_type,
                                             test_data_type):

    # Lists are currently not supported, as infer_objects
    # casts list-provided data to object dtype
    if train_data_type == 'list' or test_data_type == 'list':
        pytest.skip()

    validator = FeatureValidator()

    if train_data_type == 'numpy':
        X, y = sklearn.datasets.fetch_openml(data_id=openml_id,
                                             return_X_y=True,
                                             as_frame=False)
    elif train_data_type == 'pandas':
        X, y = sklearn.datasets.fetch_openml(data_id=openml_id,
                                             return_X_y=True,
                                             as_frame=True)
    else:
        X, y = sklearn.datasets.fetch_openml(data_id=openml_id,
                                             return_X_y=True,
                                             as_frame=True)
        X = X.values.tolist()
        y = y.values.tolist()

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, random_state=1)

    validator.fit(X_train)

    transformed_X = validator.transform(X_test)

    # Basic Checking
    if sparse.issparse(X_test):
        assert sparse.issparse(transformed_X)
    else:
        assert isinstance(transformed_X, np.ndarray)
    assert np.shape(X_test) == np.shape(transformed_X)

    # And then check proper error messages
    if train_data_type == 'pandas':
        old_dtypes = copy.deepcopy(validator.dtypes)
        validator.dtypes = ['dummy' for dtype in X_train.dtypes]
        with pytest.raises(
                ValueError,
                match=r"hanging the dtype of the features after fit"):
            transformed_X = validator.transform(X_test)
        validator.dtypes = old_dtypes
        if test_data_type == 'pandas':
            columns = X_test.columns.tolist()
            random.shuffle(columns)
            X_test = X_test[columns]
            with pytest.raises(
                    ValueError,
                    match=r"Changing the column order of the features"):
                transformed_X = validator.transform(X_test)
def test_unknown_encode_value():
    x = pd.DataFrame([
        {
            'a': -41,
            'b': -3,
            'c': 'a',
            'd': -987.2
        },
        {
            'a': -21,
            'b': -3,
            'c': 'a',
            'd': -9.2
        },
        {
            'a': 0,
            'b': -4,
            'c': 'b',
            'd': -97.2
        },
        {
            'a': -51,
            'b': -3,
            'c': 'a',
            'd': 987.2
        },
        {
            'a': 500,
            'b': -3,
            'c': 'a',
            'd': -92
        },
    ])
    x['c'] = x['c'].astype('category')
    validator = FeatureValidator()

    # Make sure the fitted categories are honored, then inject an unseen one
    validator.fit(x)
    x['c'] = x['c'].cat.add_categories(['NA'])
    x.loc[0, 'c'] = 'NA'  # unknown value
    x_t = validator.transform(x)
    # The first row should have a -1 as we added a new categorical there
    expected_row = [-1, -41, -3, -987.2]
    assert expected_row == x_t[0].tolist()
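
# A minimal sketch of the unknown-value behaviour the test above relies on,
# using scikit-learn's OrdinalEncoder directly. That the validator's internal
# encoder maps unseen categories to -1 this way is an assumption inferred from
# the expected row above, not a verbatim excerpt of its implementation.
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
enc.fit(np.array([['a'], ['b']]))
# 'NA' was never seen during fit, so it maps to the sentinel -1
print(enc.transform(np.array([['a'], ['NA']])))  # [[ 0.], [-1.]]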
def test_featurevalidator_fitontypeA_transformtypeB(input_data_featuretest):
    """
    Check if we can fit in a given type (numpy) yet transform
    if the user changes the type (pandas then)
    """
    validator = FeatureValidator()
    validator.fit(input_data_featuretest, input_data_featuretest)
    if isinstance(input_data_featuretest, pd.DataFrame):
        complementary_type = input_data_featuretest.to_numpy()
    elif isinstance(input_data_featuretest, np.ndarray):
        complementary_type = pd.DataFrame(input_data_featuretest)
    elif isinstance(input_data_featuretest, list):
        complementary_type = pd.DataFrame(input_data_featuretest)
    elif sparse.issparse(input_data_featuretest):
        complementary_type = sparse.csr_matrix(input_data_featuretest.todense())
    else:
        raise ValueError(type(input_data_featuretest))
    transformed_X = validator.transform(complementary_type)
    assert np.shape(input_data_featuretest) == np.shape(transformed_X)
    assert validator._is_fitted
def test_featurevalidatorget_feat_type_from_columns():
    """
    Makes sure that encoded columns are returned by get_feat_type_from_columns
    whereas numerical columns are not returned
    """
    validator = FeatureValidator()

    df = pd.DataFrame([
        {'int': 1, 'float': 1.0, 'category': 'one', 'bool': True},
        {'int': 2, 'float': 2.0, 'category': 'two', 'bool': False},
    ])

    # Each column is named after its intended dtype, so cast it accordingly
    for col in df.columns:
        df[col] = df[col].astype(col)

    feature_types = validator.get_feat_type_from_columns(df)

    assert feature_types == {'int': 'numerical',
                             'float': 'numerical',
                             'category': 'categorical',
                             'bool': 'categorical'}
def test_encoder_created(input_data_featuretest):
    """
    This test ensures an encoder is created if categorical data is provided
    """
    validator = FeatureValidator()
    validator.fit(input_data_featuretest)
    transformed_X = validator.transform(input_data_featuretest)
    assert validator.encoder is not None

    # Make sure that the encoded features are actually encoded. Categorical columns
    # come first after transformation; in our fixtures, this ordering also holds
    # prior to encoding
    enc_columns, feature_types = validator._get_columns_to_encode(
        input_data_featuretest)

    # At least one categorical
    assert 'categorical' in validator.feat_type

    # 'numerical' must be present if the original data has any numeric columns
    if np.any([
            pd.api.types.is_numeric_dtype(input_data_featuretest[col])
            for col in input_data_featuretest.columns
    ]):
        assert 'numerical' in validator.feat_type
    for i, feat_type in enumerate(feature_types):
        if 'numerical' in feat_type:
            np.testing.assert_array_equal(
                transformed_X[:, i], input_data_featuretest[
                    input_data_featuretest.columns[i]].to_numpy())
        elif 'categorical' in feat_type:
            np.testing.assert_array_equal(
                transformed_X[:, i],
                # Expect always 0, 1... because we use an ordinal encoder
                np.array([0, 1]))
        else:
            raise ValueError(feat_type)
def test_list_to_dataframe(openml_id):

    X_pandas, y_pandas = sklearn.datasets.fetch_openml(data_id=openml_id,
                                                       return_X_y=True, as_frame=True)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X_pandas, y_pandas, random_state=1)

    X_list = X_train.values.tolist()
    validator = FeatureValidator()
    validator.fit(X_list)
    transformed_X = validator.transform(X_list)
    for i, col in enumerate(X_pandas.columns):
        if is_numeric_dtype(X_pandas[col].dtype):
            # dtype conversion can translate 72.0 to 72. Be robust against this!
            assert is_numeric_dtype(transformed_X[i].dtype)
        else:
            assert X_pandas[col].dtype.name == transformed_X[i].dtype.name, col

    # Also make sure that this works
    # at test time
    transformed_X = validator.transform(X_test.values.tolist())
    for i, col in enumerate(X_pandas.columns):
        if is_numeric_dtype(X_pandas[col].dtype):
            # dtype conversion can translate 72.0 to 72. Be robust against this!
            assert is_numeric_dtype(transformed_X[i].dtype)
        else:
            assert X_pandas[col].dtype.name == transformed_X[i].dtype.name, col
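
# A minimal sketch of the dtype inference the test above exercises: a nested
# list carries no dtype information, so per-column types have to be recovered
# by pandas after building a DataFrame. Illustrative only; the validator's
# actual conversion path is not shown here.
import pandas as pd

rows = [[1, 72.0, 'a'], [2, 3.5, 'b']]
inferred = pd.DataFrame(rows).infer_objects()
print(inferred.dtypes)  # int64, float64, object: numeric columns are recovered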
def test_no_encoder_created(input_data_featuretest):
    """
    Makes sure that for numerical only features, no encoder is created
    """
    validator = FeatureValidator()
    validator.fit(input_data_featuretest)
    validator.transform(input_data_featuretest)
    assert validator.encoder is None
def test_featurevalidator_new_data_after_fit(openml_id,
                                             train_data_type, test_data_type):

    # Lists are currently not supported, as infer_objects
    # casts list-provided data to object dtype
    if train_data_type == 'list' or test_data_type == 'list':
        pytest.skip()

    validator = FeatureValidator()

    if train_data_type == 'numpy':
        X, y = sklearn.datasets.fetch_openml(data_id=openml_id,
                                             return_X_y=True, as_frame=False)
    elif train_data_type == 'pandas':
        X, y = sklearn.datasets.fetch_openml(data_id=openml_id,
                                             return_X_y=True, as_frame=True)
    else:
        X, y = sklearn.datasets.fetch_openml(data_id=openml_id,
                                             return_X_y=True, as_frame=True)
        X = X.values.tolist()
        y = y.values.tolist()

    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, random_state=1)

    validator.fit(X_train)

    transformed_X = validator.transform(X_test)

    # Basic Checking
    if sparse.issparse(X_test):
        assert sparse.issparse(transformed_X)
    elif isinstance(X_test, list):
        assert isinstance(transformed_X, pd.DataFrame)
    else:
        assert isinstance(transformed_X, type(X_train))
    assert np.shape(X_test) == np.shape(transformed_X)
def test_no_new_category_after_fit():
    """
    This test makes sure that we can actually pass new categories to the estimator
    without throwing an error
    """
    # Make sure extra categories introduced after fit are handled gracefully
    x = pd.DataFrame({'A': [1, 2, 3, 4], 'B': [5, 6, 7, 8]}, dtype='category')
    validator = FeatureValidator()
    validator.fit(x)
    x['A'] = x['A'].apply(lambda v: v * v)
    validator.transform(x)
def test_featurevalidator_unsupported_pandas(input_data_featuretest):
    validator = FeatureValidator()
    with pytest.raises(
            ValueError,
            match=r"Categorical features in a dataframe.*missing/NaN"):
        validator.fit(input_data_featuretest)
class InputValidator(BaseEstimator):
    """
    Makes sure the input data complies with Auto-sklearn requirements.
    Categorical inputs are encoded via a Label Encoder, if the input
    is a dataframe.

    This class also performs checks for data integrity and flags problems
    to the user via informative errors.

    Attributes
    ----------
        feat_type: typing.Optional[typing.List[str]]
            In case the dataset is not a pandas DataFrame:
                + If provided, this list indicates which columns should be treated as
                  categorical; it is internally transformed into a dictionary mapping
                  column index to categorical/numerical
                + If not provided, by default all columns are treated as numerical
            If the input dataset is a pandas DataFrame, this argument
            must be None, as the column types will be inferred from the pandas dtypes.
        is_classification: bool
            For classification task, this flag indicates that the target data
            should be encoded
        feature_validator: FeatureValidator
            A FeatureValidator instance used to validate and encode feature columns to match
            sklearn expectations on the data
        target_validator: TargetValidator
            A TargetValidator instance used to validate and encode (in case of classification)
            the target values
    """
    def __init__(
        self,
        feat_type: typing.Optional[typing.List[str]] = None,
        is_classification: bool = False,
        logger_port: typing.Optional[int] = None,
    ) -> None:
        self.feat_type = feat_type
        self.is_classification = is_classification
        self.logger_port = logger_port
        if self.logger_port is not None:
            self.logger = get_named_client_logger(
                name='Validation',
                port=self.logger_port,
            )
        else:
            self.logger = logging.getLogger('Validation')

        self.feature_validator = FeatureValidator(feat_type=self.feat_type,
                                                  logger=self.logger)
        self.target_validator = TargetValidator(
            is_classification=self.is_classification, logger=self.logger)
        self._is_fitted = False

    def fit(
        self,
        X_train: SUPPORTED_FEAT_TYPES,
        y_train: SUPPORTED_TARGET_TYPES,
        X_test: typing.Optional[SUPPORTED_FEAT_TYPES] = None,
        y_test: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
    ) -> BaseEstimator:
        """
        Validates and fit a categorical encoder (if needed) to the features, and
        a encoder for targets in the case of classification. Specifically:

        For features:
            + Valid data types are enforced (List, np.ndarray, pd.DataFrame, pd.Series, scipy
              sparse) as well as dimensionality checks
            + If the provided data is a pandas DataFrame with categorical/boolean/int columns,
              such columns will be encoded using an Ordinal Encoder
        For targets:
            + Checks for dimensionality as well as missing values are performed.
            + If performing a classification task, the data is going to be encoded

        Parameters
        ----------
            X_train: SUPPORTED_FEAT_TYPES
                A set of features that are going to be validated (type and dimensionality
                checks). If this data contains categorical columns, an encoder is going to
                be instantiated and trained with this data.
            y_train: SUPPORTED_TARGET_TYPES
                A set of targets that are going to be encoded if the task is for classification
            X_test: typing.Optional[SUPPORTED_FEAT_TYPES]
                A hold out set of features used for checking
            y_test: typing.Optional[SUPPORTED_TARGET_TYPES]
                A hold out set of targets used for checking. Additionally, if the current
                task is a classification task, the y_test categories are also used to
                fit a pre-processing encoding (to prevent errors on unseen classes).
        Returns
        -------
            self
        """
        # Check that the data is valid
        if np.shape(X_train)[0] != np.shape(y_train)[0]:
            raise ValueError(
                "Inconsistent number of train datapoints for features and targets,"
                " {} for features and {} for targets".format(
                    np.shape(X_train)[0],
                    np.shape(y_train)[0],
                ))
        if X_test is not None and np.shape(X_test)[0] != np.shape(y_test)[0]:
            raise ValueError(
                "Inconsistent number of test datapoints for features and targets,"
                " {} for features and {} for targets".format(
                    np.shape(X_test)[0],
                    np.shape(y_test)[0],
                ))

        self.feature_validator.fit(X_train, X_test)
        self.target_validator.fit(y_train, y_test)
        self._is_fitted = True

        return self

    def transform(
        self,
        X: SUPPORTED_FEAT_TYPES,
        y: typing.Optional[SUPPORTED_TARGET_TYPES] = None,
    ) -> typing.Tuple[np.ndarray, typing.Optional[np.ndarray]]:
        """
        Transform the given target or features to a numpy array

        Parameters
        ----------
            X: SUPPORTED_FEAT_TYPES
                A set of features to transform
            y: typing.Optional[SUPPORTED_TARGET_TYPES]
                A set of targets to transform

        Returns
        -------
            np.ndarray:
                The transformed features array
            np.ndarray:
                The transformed targets array
        """
        if not self._is_fitted:
            raise NotFittedError(
                "Cannot call transform on a validator that is not fitted")
        X_transformed = self.feature_validator.transform(X)
        if y is not None:
            return X_transformed, self.target_validator.transform(y)
        else:
            return X_transformed, y
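
# A minimal usage sketch for InputValidator, assuming numpy/pandas are imported
# as above and that the class is constructed with its defaults. The data below
# is made up for illustration; this is not an excerpt from auto-sklearn's docs.
X_train = pd.DataFrame({'num': [1.0, 2.0, 3.0],
                        'cat': pd.Series(['a', 'b', 'a'], dtype='category')})
y_train = np.array([0, 1, 0])

input_validator = InputValidator(is_classification=True)
input_validator.fit(X_train, y_train)
# Both features and targets come back as numeric arrays ready for sklearn
X_t, y_t = input_validator.transform(X_train, y_train)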
def test_sparse_output_is_csr(input_data_featuretest):
    validator = FeatureValidator()
    validator.fit(input_data_featuretest, input_data_featuretest)
    transformed_X = validator.transform(input_data_featuretest)
    assert sparse.issparse(transformed_X)
    assert isinstance(transformed_X, sparse.csr_matrix)
def test_unsupported_dataframe_sparse():
    df = pd.DataFrame({'A': pd.Series(pd.arrays.SparseArray(np.random.randn(10)))})
    validator = FeatureValidator()
    with pytest.raises(ValueError, match=r"Auto-sklearn does not yet support sparse pandas"):
        validator.fit(df)
def test_features_unsupported_calls_are_raised():
    """
    Makes sure we raise a proper message to the user,
    when providing not supported data input or using the validator in a way that is not
    expected
    """
    validator = FeatureValidator()
    with pytest.raises(ValueError,
                       match=r"Auto-sklearn does not support time"):
        validator.fit(pd.DataFrame({'datetime': [pd.Timestamp('20180310')]}))
    with pytest.raises(ValueError, match="has invalid type object"):
        validator.fit(pd.DataFrame({'string': ['foo']}))
    with pytest.raises(
            ValueError,
            match=r"Auto-sklearn only supports.*yet, the provided input"):
        validator.fit({'input1': 1, 'input2': 2})
    with pytest.raises(ValueError, match=r"has unsupported dtype string"):
        validator.fit(pd.DataFrame([{'A': 1, 'B': 2}], dtype='string'))
    with pytest.raises(
            ValueError,
            match=r"The feature dimensionality of the train and test"):
        validator.fit(
            X_train=np.array([[1, 2, 3], [4, 5, 6]]),
            X_test=np.array([[1, 2, 3, 4], [4, 5, 6, 7]]),
        )
    with pytest.raises(
            ValueError,
            match=r"Cannot call transform on a validator that is not fit"):
        validator.transform(np.array([[1, 2, 3], [4, 5, 6]]))
    validator.feat_type = ['Numerical']
    with pytest.raises(
            ValueError,
            match=r"providing the option feat_type to the fit method is.*"):
        validator.fit(pd.DataFrame([[1, 2, 3], [4, 5, 6]]))
    with pytest.raises(
            ValueError,
            match=r"Array feat_type does not have same number of.*"):
        validator.fit(np.array([[1, 2, 3], [4, 5, 6]]))
    validator.feat_type = [1, 2, 3]
    with pytest.raises(ValueError,
                       match=r"Array feat_type must only contain strings.*"):
        validator.fit(np.array([[1, 2, 3], [4, 5, 6]]))
    validator.feat_type = ['1', '2', '3']
    with pytest.raises(ValueError,
                       match=r"Only `Categorical` and `Numerical` are.*"):
        validator.fit(np.array([[1, 2, 3], [4, 5, 6]]))
def test_featurevalidator_unsupported_list(input_data_featuretest):
    validator = FeatureValidator()
    with pytest.raises(
            ValueError,
            match=r".*has invalid type object. Cast it to a valid dtype.*"):
        validator.fit(input_data_featuretest)
def test_featurevalidator_unsupported_numpy(input_data_featuretest):
    validator = FeatureValidator()
    with pytest.raises(
            ValueError,
            match=r".*When providing a numpy array.*not supported."):
        validator.fit(input_data_featuretest)