Exemple #1
0
def test_is_categorical(is_categorical, msg):
    """Check the ``is_categorical`` constructor argument and property setter.

    Three situations are covered:

    * ``is_categorical is None``: both the public property and the private
      ``_is_categorical`` attribute must be ``None``, whether the encoder is
      built with defaults or ``None`` is assigned explicitly.
    * a value with ``msg is None``: the value must be stored as a boolean
      ``numpy.ndarray`` that compares equal to the input.
    * a value with an error message: both the constructor and the setter
      must raise ``ValueError`` matching ``msg``.

    Parameters
    ----------
    is_categorical : object
        Candidate value for ``Encoder.is_categorical`` (presumably supplied
        by a parametrization that is not visible in this chunk).
    msg : str or None
        Regular expression the ``ValueError`` message must match, or ``None``
        when the value is expected to be accepted.
    """
    if is_categorical is None:
        # Default construction leaves the attribute unset
        encoder = Encoder()
        assert encoder.is_categorical is None
        assert encoder._is_categorical is None
        # Assigning None through the setter keeps it unset
        encoder = Encoder()
        encoder.is_categorical = None
        assert encoder.is_categorical is None
        assert encoder._is_categorical is None
    elif msg is None:
        # Valid value given to the constructor
        encoder = Encoder(is_categorical=is_categorical)
        assert np.all(encoder.is_categorical == is_categorical)
        assert np.all(encoder._is_categorical == is_categorical)
        assert isinstance(encoder._is_categorical, np.ndarray)
        assert encoder._is_categorical.dtype.name == "bool"

        # Same valid value given through the setter
        encoder.is_categorical = is_categorical
        assert np.all(encoder.is_categorical == is_categorical)
        assert np.all(encoder._is_categorical == is_categorical)
        assert isinstance(encoder._is_categorical, np.ndarray)
        assert encoder._is_categorical.dtype.name == "bool"
    else:
        with pytest.raises(ValueError, match=msg):
            _ = Encoder(is_categorical=is_categorical)
        # Build the encoder outside of the `raises` block so that only the
        # setter is allowed to satisfy the expected failure.
        encoder = Encoder()
        with pytest.raises(ValueError, match=msg):
            encoder.is_categorical = is_categorical
Exemple #2
0
def test_encoder_max_bins():
    """Check the default, constructor, setter and validation of ``max_bins``."""
    # Default value
    assert Encoder().max_bins == 256

    # Value passed to the constructor, then overwritten through the setter
    encoder = Encoder(max_bins=127)
    assert encoder.max_bins == 127
    encoder.max_bins = 42
    assert encoder.max_bins == 42

    # Non-integer values are rejected by the constructor
    with pytest.raises(ValueError, match="max_bins must be an integer number"):
        Encoder(max_bins=3.14)

    # Values below 3 are rejected by the setter
    with pytest.raises(ValueError, match="max_bins must be >= 3"):
        encoder.max_bins = 2
Exemple #3
0
def test_encoder_handle_unknown():
    """Check the default, constructor, setter and validation of ``handle_unknown``."""
    # Default value
    assert Encoder().handle_unknown == "error"

    # Value passed to the constructor, then reset through the setter
    encoder = Encoder(handle_unknown="consider_missing")
    assert encoder.handle_unknown == "consider_missing"
    encoder.handle_unknown = "error"
    assert encoder.handle_unknown == "error"

    # Any other value must be rejected with an explicit message
    invalid_value = "truc"
    expected_error = (
        "handle_unknown must be 'error' or 'consider_missing' but "
        "got {0}"
    ).format(invalid_value)
    with pytest.raises(ValueError, match=expected_error):
        Encoder(handle_unknown="truc")
Exemple #4
0
def test_encoder_fit_transform_dataframes(
    df,
    max_bins,
    n_bins_no_missing_values_,
    categories_,
    binning_thresholds_,
    X_binned,
    df_inverse_transform,
    warnings,
):
    """Fit, transform and inverse-transform a dataframe and compare with references.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data given to ``fit``, ``transform`` and ``fit_transform``.
    max_bins : int
        Value forwarded to ``Encoder(max_bins=...)``.
    n_bins_no_missing_values_ : array-like
        Expected ``encoder.n_bins_no_missing_values_``.
    categories_ : dict
        Expected ``encoder.categories_``.
    binning_thresholds_ : dict
        Expected ``encoder.binning_thresholds_``.
    X_binned : array-like
        Expected binned features once the dataset is converted back to an
        array with ``features_bitarray_to_array``.
    df_inverse_transform : pandas.DataFrame
        Expected output of ``encoder.inverse_transform``.
    warnings : sequence of str or None
        Exact messages of the ``UserWarning`` expected during ``fit``, in
        order, or ``None`` when ``fit`` is called without capturing warnings.
    """
    encoder = Encoder(max_bins=max_bins, verbose=True)
    if warnings is None:
        encoder.fit(df)
    else:
        with pytest.warns(UserWarning) as warn_records:
            encoder.fit(df)
        assert len(warn_records) == len(warnings)
        for record, expected_message in zip(warn_records, warnings):
            assert str(record.message) == expected_message

    # Shape of the data seen at fit time
    assert encoder.n_samples_in_ == df.shape[0]
    assert encoder.n_features_in_ == df.shape[1]
    np.testing.assert_array_equal(
        encoder.n_bins_no_missing_values_, n_bins_no_missing_values_
    )

    # Categories learned for each categorical column
    assert encoder.categories_.keys() == categories_.keys()
    category_pairs = zip(encoder.categories_.values(), categories_.values())
    for found, expected in category_pairs:
        np.testing.assert_array_equal(found, expected)

    # Thresholds learned for each binned column
    assert encoder.binning_thresholds_.keys() == binning_thresholds_.keys()
    threshold_pairs = zip(
        encoder.binning_thresholds_.values(), binning_thresholds_.values()
    )
    for found, expected in threshold_pairs:
        np.testing.assert_array_equal(found, expected)

    # Binned dataset produced by transform
    dataset = encoder.transform(df)
    np.testing.assert_array_equal(features_bitarray_to_array(dataset), X_binned)

    # Dataframe rebuilt from the binned dataset
    reconstructed = encoder.inverse_transform(dataset)
    pd.testing.assert_frame_equal(reconstructed, df_inverse_transform)

    # fit_transform on a fresh encoder must lead to the same reconstruction
    encoder = Encoder(max_bins=max_bins)
    dataset = encoder.fit_transform(df)
    reconstructed = encoder.inverse_transform(dataset)
    pd.testing.assert_frame_equal(reconstructed, df_inverse_transform)
Exemple #5
0
def test_encoder_subsample():
    """Check the default, accepted values and validation of ``subsample``."""
    # Default value
    assert Encoder().subsample == int(2e5)

    # None is accepted, and a float assigned through the setter compares
    # equal to the corresponding integer
    encoder = Encoder(subsample=None)
    assert encoder.subsample is None
    encoder.subsample = 100_000.0
    assert encoder.subsample == 100_000

    # Non-numeric, too small and negative values are all rejected with the
    # same message
    msg = "subsample should be None or a number >= 50000"
    for invalid_value in ("truc", 10000, -1):
        with pytest.raises(ValueError, match=msg):
            Encoder(subsample=invalid_value)
Exemple #6
0
def test_encoder_large_all_categorical(n_samples, max_values, dtype):
    """Round-trip a random all-categorical dataframe through the encoder.

    Parameters
    ----------
    n_samples : int
        Number of rows of the random matrix.
    max_values : numpy.ndarray
        One entry per feature; ``max_values + 1`` is passed to
        ``np.random.randint`` as the exclusive upper bound of each column.
    dtype : numpy dtype
        dtype of the Fortran-ordered array wrapped into the dataframe.
    """
    shape = (n_samples, max_values.size)
    random_codes = np.random.randint(max_values + 1, size=shape)
    X_in = np.asfortranarray(random_codes, dtype=dtype)
    df_in = pd.DataFrame(X_in).astype("category")

    encoder = Encoder()
    df_out = encoder.inverse_transform(encoder.fit_transform(df_in))

    # inverse_transform must give back exactly the input dataframe
    assert df_in.equals(df_out)
Exemple #7
0
def test_encoder_detects_unknowns():
    """Check that ``transform`` raises on a category unseen during ``fit``.

    The encoder is built without ``handle_unknown``; transforming ``df2``,
    whose column "D" (index 3) contains the modality "f" absent from ``df``,
    must raise a ``ValueError`` naming the unknown category and the column.
    """
    # Local import: the import block of the file is not visible from here.
    import re

    df = pd.DataFrame(
        {
            "A": [0, 1, 1, 1, 0, 0, 0, 0, 1],
            "B": ["b", "a", "b", "c", "a", "a", "b", "c", "b"],
            "C": [3, 3, 0, -1, 42, 7, 1, 17, 8],
            "D": ["b", "a", "b", "c", "a", "a", "d", "c", "a"],
            "E": [-4, 1, 2, 1, -3, 17, 2, 3.0, -1],
        }, )
    # Same dataframe as df but with less data and less modalities / unknown modalities
    df2 = pd.DataFrame(
        {
            "A": [0, 1, 1, 1, 0, 0],
            # Same column as df["B"] but with less modalities
            "B": ["b", "b", "c", "c", "b", "b"],
            # Same column as df["C"] but with a missing value
            "C": [3, 3, 0, -1, None, 7],
            # Same column with an unknown modality and missing ones
            "D": ["c", "b", "f", "c", "d", "f"],
            "E": [-1, 4, -67, 128, 2, 0],
        }, )

    for colname in ["A", "B", "D"]:
        df[colname] = df[colname].astype("category")
        df2[colname] = df2[colname].astype("category")

    max_bins = 4
    encoder = Encoder(max_bins=max_bins)
    encoder.fit(df)

    msg = "Found unknown categories {0} in column {1} during " "transform".format(
        {"f"}, 3)
    # `match` is interpreted as a regular expression and the message contains
    # braces, so escape it to compare against the literal text.
    with pytest.raises(ValueError, match=re.escape(msg)):
        encoder.transform(df2)

    # A fresh encoder is fitted again on df after the failure.
    # NOTE(review): nothing is asserted afterwards -- this looks like the start
    # of an unfinished check; confirm whether more assertions were intended.
    max_bins = 4
    encoder = Encoder(max_bins=max_bins)
    encoder.fit(df)
Exemple #8
0
def test_encoder_cat_min_categories(cat_min_categories, msg,
                                    test_cat_min_categories_):
    """Check ``cat_min_categories`` handling and the fitted ``cat_min_categories_``.

    Parameters
    ----------
    cat_min_categories : object
        Candidate value; ``None`` means "use the default", which must be
        ``"log"``.
    msg : str or None
        Regular expression the ``ValueError`` message must match, or ``None``
        when the value is expected to be accepted.
    test_cat_min_categories_ : bool
        When true, also fit on small random single-column dataframes and
        check the resulting ``cat_min_categories_`` attribute.
    """
    if cat_min_categories is None:
        assert Encoder().cat_min_categories == "log"
    elif msg is None:
        # Accepted value: through the constructor...
        encoder = Encoder(cat_min_categories=cat_min_categories)
        assert encoder.cat_min_categories == cat_min_categories
        assert encoder._cat_min_categories == cat_min_categories
        # ... and through the setter
        encoder = Encoder()
        encoder.cat_min_categories = cat_min_categories
        assert encoder.cat_min_categories == cat_min_categories
        assert encoder._cat_min_categories == cat_min_categories
    else:
        # Rejected value: both entry points must raise
        with pytest.raises(ValueError, match=msg):
            _ = Encoder(cat_min_categories=cat_min_categories)
        encoder = Encoder()
        with pytest.raises(ValueError, match=msg):
            encoder.cat_min_categories = cat_min_categories

    if not test_cat_min_categories_:
        return

    def fit_on_random_column(n_rows):
        # Fit a fresh encoder on a single column of n_rows gaussian samples
        frame = pd.DataFrame({"col": np.random.randn(n_rows)})
        fitted = Encoder(cat_min_categories=cat_min_categories)
        fitted.fit(frame)
        return fitted

    # With 13 samples the value follows the requested rule
    n_samples = 13
    encoder = fit_on_random_column(n_samples)
    if cat_min_categories == "log":
        assert encoder.cat_min_categories_ == floor(log(n_samples))
    elif cat_min_categories == "sqrt":
        assert encoder.cat_min_categories_ == floor(sqrt(n_samples))
    elif isinstance(cat_min_categories, int):
        assert encoder.cat_min_categories_ == cat_min_categories

    # With 2 samples both the "log" and "sqrt" rules are expected to give 2
    n_samples = 2
    encoder = fit_on_random_column(n_samples)
    if cat_min_categories == "log":
        assert encoder.cat_min_categories_ == 2
    elif cat_min_categories == "sqrt":
        assert encoder.cat_min_categories_ == 2
    elif isinstance(cat_min_categories, int):
        assert encoder.cat_min_categories_ == cat_min_categories
Exemple #9
0
def test_encoder_deals_with_unknown():
    """Fit an encoder with ``handle_unknown="consider_missing"`` and check its shape attributes.

    Only the fitted attributes ``n_samples_in_``, ``n_features_in_`` and
    ``n_bins_no_missing_values_`` are verified here; ``df2`` is built and cast
    but not transformed in this test.
    """
    categorical_columns = ["A", "B", "D"]

    train_columns = {
        "A": [0, 1, 1, 1, 0, 0, 0, 0, 1],
        "B": ["b", "a", "b", "c", "a", "a", "b", "c", "b"],
        "C": [3, 3, 0, -1, 42, 7, 1, 17, 8],
        "D": ["b", "a", "b", "c", "a", "a", "d", "c", "a"],
        "E": [-4, 1, 2, 1, -3, 17, 2, 3.0, -1],
    }
    df = pd.DataFrame(train_columns)

    # Same dataframe as df but with less data and less modalities / unknown modalities
    other_columns = {
        "A": [0, 1, 1, 1, 0, 0],
        # Same column as df["B"] but with less modalities
        "B": ["b", "b", "c", "c", "b", "b"],
        # Same column as df["C"] but with a missing value
        "C": [3, 3, 0, -1, None, 7],
        # Same column with an unknown modality and missing ones
        "D": ["c", "b", "f", "c", "d", "f"],
        "E": [-1, 4, -67, 128, 2, 0],
    }
    df2 = pd.DataFrame(other_columns)

    for name in categorical_columns:
        df[name] = df[name].astype("category")
        df2[name] = df2[name].astype("category")

    encoder = Encoder(max_bins=4, handle_unknown="consider_missing")
    encoder.fit(df)

    n_rows, n_cols = df.shape
    assert encoder.n_samples_in_ == n_rows
    assert encoder.n_features_in_ == n_cols

    expected_n_bins = np.array([2, 3, 4, 4, 4])
    np.testing.assert_array_equal(encoder.n_bins_no_missing_values_,
                                  expected_n_bins)
Exemple #10
0
def test_check_is_category_matches_X():
    """Build an encoder from a three-element all-``False`` boolean mask.

    NOTE(review): nothing is asserted; the test only checks that the
    constructor accepts such a mask without raising.
    """
    all_numerical_mask = np.zeros(3, dtype=np.bool_)
    _ = Encoder(is_categorical=all_numerical_mask)
Exemple #11
0
def test_encoder_deals_with_unknown_again():
    """End-to-end check of ``handle_unknown="consider_missing"``.

    The encoder is fitted on ``df`` and then applied to ``df2``, which holds
    missing values and, in column "D", the modality "f" unseen at fit time.
    The fitted attributes, the binned matrix and the dataframe returned by
    ``inverse_transform`` are compared with hard-coded references.
    """
    categorical_columns = ["A", "B", "D"]

    df = pd.DataFrame({
        "A": [0, 1, 1, 1, 0, 0, 0, 0, 1],
        "B": ["b", "a", "b", "c", "a", "a", "b", "c", "b"],
        "C": [3, 3, 0, -1, 42, 7, 1, 17, 8],
        "D": ["b", "a", "b", "c", "a", "a", "d", "c", "a"],
        "E": [-4, 1, 2, 1, -3, 17, 2, 3.0, -1],
    })
    # Same dataframe as df but with less data and less modalities / unknown modalities
    df2 = pd.DataFrame({
        "A": [0, 1, 1, 1, None, 0],
        "B": ["b", "b", "c", "c", "b", "b"],
        "C": [4, 2, -4, -1, None, 7],
        "D": ["c", "b", "f", "c", None, "f"],
        "E": [-1, 4, -67, 128, 2, 0],
    })
    for name in categorical_columns:
        df[name] = df[name].astype("category")
        df2[name] = df2[name].astype("category")

    encoder = Encoder(max_bins=4, handle_unknown="consider_missing")
    encoder.fit(df)

    # Shape-related fitted attributes
    assert encoder.n_samples_in_ == df.shape[0]
    assert encoder.n_features_in_ == df.shape[1]
    np.testing.assert_array_equal(encoder.n_bins_no_missing_values_,
                                  np.array([2, 3, 4, 4, 4]))

    # Categories learned for the categorical columns (keyed by column index)
    expected_categories = {
        0: np.array([0, 1]),
        1: np.array(["a", "b", "c"]),
        3: np.array(["a", "b", "c", "d"]),
    }
    assert encoder.categories_.keys() == expected_categories.keys()
    for found, expected in zip(encoder.categories_.values(),
                               expected_categories.values()):
        np.testing.assert_array_equal(found, expected)

    # Thresholds learned for the remaining columns (keyed by column index)
    expected_thresholds = {
        2: np.array([1.0, 3.0, 8.0]),
        4: np.array([-1.0, 1.0, 2.0]),
    }
    assert encoder.binning_thresholds_.keys() == expected_thresholds.keys()
    for found, expected in zip(encoder.binning_thresholds_.values(),
                               expected_thresholds.values()):
        np.testing.assert_array_equal(found, expected)

    np.testing.assert_array_equal(
        encoder.is_categorical_,
        np.array([True, True, False, True, False]),
    )

    # Binned representation of df2
    dataset = encoder.transform(df2)
    expected_binned = np.array([
        [0, 1, 2, 2, 0],
        [1, 1, 1, 1, 3],
        [1, 2, 0, 4, 0],
        [1, 2, 0, 2, 3],
        [2, 1, 4, 4, 2],
        [0, 1, 2, 4, 1],
    ])
    np.testing.assert_array_equal(features_bitarray_to_array(dataset),
                                  expected_binned)

    # Expected reconstruction, filled column by column on an empty frame
    expected_df = pd.DataFrame()
    expected_df[0] = pd.Categorical([0, 1, 1, 1, None, 0])
    expected_df[1] = pd.Categorical(["b", "b", "c", "c", "b", "b"])
    expected_df[2] = pd.arrays.IntervalArray.from_tuples([
        (3.0, 8.0),
        (1.0, 3.0),
        (-np.inf, 1.0),
        (-np.inf, 1.0),
        np.nan,
        (3.0, 8.0),
    ])
    expected_df[3] = pd.Categorical(
        ["c", "b", np.nan, "c", np.nan, np.nan],
        categories=["a", "b", "c"],
    )
    expected_df[4] = pd.arrays.IntervalArray.from_tuples([
        (-np.inf, -1.0),
        (2.0, np.inf),
        (-np.inf, -1.0),
        (2.0, np.inf),
        (1.0, 2.0),
        (-1.0, 1.0),
    ])
    pd.testing.assert_frame_equal(encoder.inverse_transform(dataset),
                                  expected_df)
Exemple #12
0
def test_encoder_errors():
    """Check the ``ValueError`` raised by ``Encoder`` on unsupported inputs.

    Covered cases, in order: a dataframe column with a datetime dtype, a
    column declared numerical that cannot be converted to float, a
    non-two-dimensional ndarray, an ndarray with a structured dtype, a change
    in the number of features between ``fit`` and ``transform``, and an
    ``is_categorical`` whose size differs from the number of features.
    """
    # Local import: the import block of the file is not visible from here.
    import re

    # unsupported dtype in DataFrame raises an error
    df = pd.DataFrame({"col": pd.to_datetime(["2011-10-01", "2009-08-17"])})
    with pytest.raises(ValueError) as err_info:
        _ = Encoder().fit(df)
    assert err_info.type is ValueError
    assert (
        err_info.value.args[0] ==
        "Column col has dtype datetime64[ns] which is not supported by WildWood."
    )

    # object dtype cannot be converted to float when column is declared numerical
    X = np.array([
        ["a", 0.1],
        ["a", 0.2],
        ["b", 0.1],
        ["a", 0.3],
        ["a", 0.0],
        ["c", -1.2],
        ["b", 0.1],
    ])
    is_categorical = [False, False]
    # `match` is a regular expression: escape the expected text so that it is
    # compared literally.
    with pytest.raises(
            ValueError,
            match=re.escape(
                "Column 0 is declared as numerical, but it cannot be converted to float"
            ),
    ):
        _ = Encoder(is_categorical=is_categorical).fit(X)

    # non-2d ndarray are not supported
    X = np.random.randn(3, 2, 3)
    # The expected text contains dots, which would otherwise match any
    # character.
    with pytest.raises(
            ValueError,
            match=re.escape(
                "X is must be a `pandas.DataFrame` or a two-dimensional `numpy.ndarray`."
            ),
    ):
        _ = Encoder().fit(X)

    # ndarray with weird dtype are not supported
    X = np.empty((3, 2), dtype=np.dtype([("a", np.float64)]))
    with pytest.raises(ValueError) as err_info:
        _ = Encoder().fit(X)
    assert err_info.type is ValueError
    assert (err_info.value.args[0] ==
            "The dtype of X [('a', '<f8')] is not supported by WildWood")

    # the number of features cannot change between fit and transform
    X1 = np.random.randn(9, 4)
    X2 = np.random.randn(9, 3)
    encoder = Encoder().fit(X1)
    with pytest.raises(ValueError) as err_info:
        _ = encoder.transform(X2)
    assert err_info.type is ValueError
    assert (err_info.value.args[0] ==
            "The number of features in X is different from the "
            "number of features of the fitted data. The fitted "
            "data had 4 features and the X has 3 features.")

    # the number of features must match the size of is_categorical
    is_categorical = [True, False]
    X = np.random.randn(7, 3)
    with pytest.raises(ValueError) as err_info:
        _ = Encoder(is_categorical=is_categorical).fit(X)
    assert err_info.type is ValueError
    assert (
        err_info.value.args[0] ==
        "The number of features in X differs from the size of is_categorical. X has "
        "shape (7, 3) while is_categorical has shape (2,)")
Exemple #13
0
def test_encoder_fit_transform_ndarray(
    X,
    max_bins,
    is_categorical,
    n_bins_no_missing_values_,
    categories_,
    binning_thresholds_,
    X_binned,
    df_inverse_transform,
    warnings,
):
    """Fit, transform and inverse-transform an ndarray and compare with references.

    Parameters
    ----------
    X : numpy.ndarray
        Input data given to ``fit``, ``transform`` and ``fit_transform``.
    max_bins : int
        Value forwarded to ``Encoder(max_bins=...)``.
    is_categorical : object
        Value forwarded to ``Encoder(is_categorical=...)``.
    n_bins_no_missing_values_ : array-like
        Expected ``encoder.n_bins_no_missing_values_``.
    categories_ : dict
        Expected ``encoder.categories_``; numeric arrays are compared
        approximately, everything else exactly.
    binning_thresholds_ : dict
        Expected ``encoder.binning_thresholds_`` (compared approximately).
    X_binned : array-like
        Expected binned features once the dataset is converted back to an
        array with ``features_bitarray_to_array``.
    df_inverse_transform : pandas.DataFrame
        Expected output of ``encoder.inverse_transform``.
    warnings : sequence of str or None
        Exact messages of the ``UserWarning`` expected during ``fit``, in
        order, or ``None`` when ``fit`` is called without capturing warnings.
    """
    encoder = Encoder(max_bins=max_bins,
                      is_categorical=is_categorical,
                      verbose=True)
    if warnings is None:
        encoder.fit(X)
    else:
        with pytest.warns(UserWarning) as warn_records:
            encoder.fit(X)
        assert len(warn_records) == len(warnings)
        for record, expected_message in zip(warn_records, warnings):
            assert str(record.message) == expected_message

    assert encoder.n_samples_in_ == X.shape[0]
    assert encoder.n_features_in_ == X.shape[1]
    np.testing.assert_array_equal(encoder.n_bins_no_missing_values_,
                                  n_bins_no_missing_values_)

    # Categories: approximate comparison for numeric ndarrays, exact otherwise
    assert encoder.categories_.keys() == categories_.keys()
    for found, expected in zip(encoder.categories_.values(),
                               categories_.values()):
        is_numeric_array = (isinstance(expected, np.ndarray)
                            and expected.dtype.kind in "uif")
        if is_numeric_array:
            assert found == pytest.approx(expected, rel=1e-8, abs=1e-8)
        else:
            np.testing.assert_array_equal(found, expected)

    # Binning thresholds are always compared approximately
    assert encoder.binning_thresholds_.keys() == binning_thresholds_.keys()
    for found, expected in zip(encoder.binning_thresholds_.values(),
                               binning_thresholds_.values()):
        assert found == pytest.approx(expected, rel=1e-8, abs=1e-8)

    # Binned dataset produced by transform
    dataset = encoder.transform(X)
    np.testing.assert_array_equal(features_bitarray_to_array(dataset),
                                  X_binned)

    # If we have categorical dtype with float values, exact comparison of dtype will
    # fail.
    has_float_categorical = any(
        column.dtype.name == "category"
        and column.dtype.categories.values.dtype.kind == "f"
        for _, column in df_inverse_transform.items())

    # Keyword arguments shared by the tolerant frame comparisons below
    tolerance = dict(check_exact=False, atol=1e-7, rtol=1e-3)

    # Dataframe rebuilt from the binned dataset
    reconstructed = encoder.inverse_transform(dataset)
    if has_float_categorical:
        assert_frames_equal(reconstructed,
                            df_inverse_transform,
                            check_categorical=False,
                            **tolerance)
    else:
        assert_frames_equal(reconstructed, df_inverse_transform, **tolerance)

    # Test also fit_transform
    encoder = Encoder(max_bins=max_bins, is_categorical=is_categorical)
    dataset = encoder.fit_transform(X)
    reconstructed = encoder.inverse_transform(dataset)
    if has_float_categorical:
        pd.testing.assert_frame_equal(reconstructed,
                                      df_inverse_transform,
                                      check_categorical=False)
    else:
        assert_frames_equal(reconstructed, df_inverse_transform, **tolerance)