def test_ignore_variable_format_with_frequency(df_vartypes):
    """With ignore_format=True and variables=None, every column is encoded."""
    transformer = CountFrequencyEncoder(
        encoding_method="frequency", variables=None, ignore_format=True
    )
    X = transformer.fit_transform(df_vartypes)

    # each value occurs once in 4 rows -> frequency 0.25 everywhere
    cols = ["Name", "City", "Age", "Marks", "dob"]
    expected = pd.DataFrame({col: [0.25] * 4 for col in cols})

    # init params
    assert transformer.encoding_method == "frequency"
    assert transformer.variables is None
    # fit params
    assert transformer.variables_ == cols
    assert transformer.n_features_in_ == 5
    # transform params
    pd.testing.assert_frame_equal(X, expected)
def test_column_names_are_numbers(df_numeric_columns):
    """Integer column labels are supported when ignore_format=True."""
    transformer = CountFrequencyEncoder(
        encoding_method="frequency", variables=[0, 1, 2, 3], ignore_format=True
    )
    X = transformer.fit_transform(df_numeric_columns)

    # columns 0-3 are frequency-encoded; column 4 (datetime) is untouched
    expected = pd.DataFrame(
        {
            **{col: [0.25] * 4 for col in range(4)},
            4: pd.date_range("2020-02-24", periods=4, freq="T"),
        }
    )

    # init params
    assert transformer.encoding_method == "frequency"
    assert transformer.variables == [0, 1, 2, 3]
    # fit params
    assert transformer.variables_ == [0, 1, 2, 3]
    assert transformer.n_features_in_ == 5
    # transform params
    pd.testing.assert_frame_equal(X, expected)
# Esempio n. 3 (Example no. 3) — snippet-aggregator separator; the stray
# "0" below it was the snippet's score. Commented out: not valid Python.
def clean_data(X):
    """Clean the raw dataframe and return it with the target re-attached.

    Steps: drop rows with a missing target, drop the ID column, convert
    `v22` via `az_to_int`, impute and frequency-encode categoricals,
    winsorize outliers, undersample the majority class, then drop
    quasi-constant, duplicated and highly correlated features.

    NOTE: mutates the input dataframe (dropna/pop/drop act in place) and
    fits all transformers on the full data passed in.
    """
    X.dropna(subset=['target'], inplace=True)
    y = X.pop('target')
    X.drop(columns='ID', inplace=True)
    # presumably v22 holds alphabetic codes that az_to_int maps to integers
    # — confirm against az_to_int's definition
    X['v22'] = X['v22'].apply(az_to_int)
    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    con_cols = X.select_dtypes(include=['number']).columns.tolist()
    # impute missing values, group rare labels, then frequency-encode
    num_missing_imputer = SimpleImputer(strategy='median')
    cat_missing_imputer = CategoricalImputer(fill_value='__MISS__')
    rare_label_encoder = RareLabelEncoder(tol=0.01, n_categories=10, replace_with='__OTHER__')
    cat_freq_encoder = CountFrequencyEncoder(encoding_method="frequency")
    X[con_cols] = num_missing_imputer.fit_transform(X[con_cols])
    X[cat_cols] = cat_missing_imputer.fit_transform(X[cat_cols])
    X[cat_cols] = rare_label_encoder.fit_transform(X[cat_cols])
    X[cat_cols] = cat_freq_encoder.fit_transform(X[cat_cols])
    # cap outliers at the 0.5% / 99.5% quantiles
    trimmer = Winsorizer(capping_method='quantiles', tail='both', fold=0.005)
    X = trimmer.fit_transform(X)
    # balance classes to a 0.7 minority/majority ratio
    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=1234)
    X, y_resampled = undersampler.fit_resample(X, y)
    # drop quasi-constant features (same value in >= 99.8% of rows)
    quasi_constant = DropConstantFeatures(tol=0.998)
    X = quasi_constant.fit_transform(X)
    print(f"Quasi Features to drop {quasi_constant.features_to_drop_}")
    # remove duplicated features
    duplicates = DropDuplicateFeatures()
    X = duplicates.fit_transform(X)
    print(f"Duplicate feature sets {duplicates.duplicated_feature_sets_}")
    print(f"Dropping duplicate features {duplicates.features_to_drop_}")
    # remove highly correlated features
    drop_corr = DropCorrelatedFeatures(method="pearson", threshold=0.95, missing_values="ignore")
    X = drop_corr.fit_transform(X)
    print(f"Drop correlated feature sets {drop_corr.correlated_feature_sets_}")
    # typo fixed in the message below: "correlared" -> "correlated"
    print(f"Dropping correlated features {drop_corr.features_to_drop_}")
    X['target'] = y_resampled
    return X
def test_nan_encoding_for_new_categories_if_errors_is_ignore():
    """errors='ignore': categories unseen during fit are encoded as NaN."""
    train = pd.DataFrame(
        {"col1": ["a", "a", "b", "a", "c"], "col2": ["1", "2", "3", "1", "2"]}
    )
    unseen = pd.DataFrame(
        {"col1": ["a", "d", "b", "a", "c"], "col2": ["1", "2", "3", "1", "4"]}
    )
    result = CountFrequencyEncoder(errors="ignore").fit(train).transform(unseen)

    # exactly one unseen category per column -> two NaNs in total
    assert pd.isnull(result).sum().sum() == 2

    # seen categories keep their fit-time counts; unseen become NaN
    expected = pd.DataFrame(
        {"col1": [3, nan, 1, 3, 1], "col2": [2, 2, 1, 2, nan]}
    )
    pd.testing.assert_frame_equal(result, expected)
def test_encode_1_variable_with_counts(df_enc):
    """Encode a single selected variable with category counts."""
    encoder = CountFrequencyEncoder(encoding_method="count", variables=["var_A"])
    X = encoder.fit_transform(df_enc)

    # A appears 6 times, B 10 times, C 4 times
    expected = df_enc.copy()
    expected["var_A"] = [6] * 6 + [10] * 10 + [4] * 4

    # init params
    assert encoder.encoding_method == "count"
    assert encoder.variables == ["var_A"]
    # fit params
    assert encoder.variables_ == ["var_A"]
    assert encoder.encoder_dict_ == {"var_A": {"A": 6, "B": 10, "C": 4}}
    assert encoder.n_features_in_ == 3
    # transform params
    pd.testing.assert_frame_equal(X, expected)
def test_variables_cast_as_category(df_enc_category_dtypes):
    """Category-dtype columns are encoded just like object columns."""
    # counts produce integer codes
    encoder = CountFrequencyEncoder(encoding_method="count", variables=["var_A"])
    X = encoder.fit_transform(df_enc_category_dtypes)

    expected = df_enc_category_dtypes.copy()
    expected["var_A"] = [6] * 6 + [10] * 10 + [4] * 4

    pd.testing.assert_frame_equal(X, expected, check_dtype=False)
    assert X["var_A"].dtypes == int

    # frequencies produce floats
    encoder = CountFrequencyEncoder(encoding_method="frequency", variables=["var_A"])
    X = encoder.fit_transform(df_enc_category_dtypes)
    assert X["var_A"].dtypes == float
def test_zero_encoding_for_unseen_categories_if_errors_is_encode():
    """errors='encode': unseen categories map to 0 counts / 0 frequency."""
    train = pd.DataFrame(
        {"col1": ["a", "a", "b", "a", "c"], "col2": ["1", "2", "3", "1", "2"]}
    )
    unseen = pd.DataFrame(
        {"col1": ["a", "d", "b", "a", "c"], "col2": ["1", "2", "3", "1", "4"]}
    )

    # count encoding: unseen -> 0, no NaNs introduced
    result = CountFrequencyEncoder(errors="encode").fit(train).transform(unseen)
    assert pd.isnull(result).sum().sum() == 0
    pd.testing.assert_frame_equal(
        result, pd.DataFrame({"col1": [3, 0, 1, 3, 1], "col2": [2, 2, 1, 2, 0]})
    )

    # frequency encoding: unseen -> 0, no NaNs introduced
    result = (
        CountFrequencyEncoder(encoding_method="frequency", errors="encode")
        .fit(train)
        .transform(unseen)
    )
    assert pd.isnull(result).sum().sum() == 0
    pd.testing.assert_frame_equal(
        result,
        pd.DataFrame(
            {"col1": [0.6, 0, 0.2, 0.6, 0.2], "col2": [0.4, 0.4, 0.2, 0.4, 0]}
        ),
    )
def test_error_if_input_df_contains_categories_not_present_in_fit_df(
        df_enc, df_enc_rare):
    """Transforming data with categories unseen at fit emits a UserWarning."""
    with pytest.warns(UserWarning):
        transformer = CountFrequencyEncoder()
        transformer.fit(df_enc)
        transformer.transform(df_enc_rare)
def test_no_error_triggered_when_df_contains_unseen_categories_and_errors_is_encode(
    df_enc, df_enc_rare
):
    """errors='encode': transforming unseen categories raises no error/warning.

    Bug fix: the original called warnings.simplefilter("error") BEFORE
    entering warnings.catch_warnings(), permanently altering the global
    warning filters for every test that runs afterwards. The escalation
    must happen inside the context so the previous filter state is
    restored on exit.
    """
    encoder = CountFrequencyEncoder(errors="encode")
    encoder.fit(df_enc)
    with warnings.catch_warnings():
        # escalate warnings to errors only within this context; if
        # transform warned, this line would make the test fail
        warnings.simplefilter("error")
        encoder.transform(df_enc_rare)
def test_warning_when_df_contains_unseen_categories(df_enc, df_enc_rare):
    """errors='ignore': unseen categories trigger exactly one UserWarning."""
    expected_msg = (
        "During the encoding, NaN values were introduced in the feature(s) var_A."
    )

    encoder = CountFrequencyEncoder(errors="ignore")
    encoder.fit(df_enc)
    with pytest.warns(UserWarning) as record:
        encoder.transform(df_enc_rare)

    # a single warning was raised, with the expected message
    assert len(record) == 1
    assert record[0].message.args[0] == expected_msg
def test_transform_raises_error_if_df_contains_na(df_enc, df_enc_na):
    """NaN in the data passed to transform raises a ValueError."""
    with pytest.raises(ValueError):
        transformer = CountFrequencyEncoder()
        transformer.fit(df_enc)
        transformer.transform(df_enc_na)
def test_error_if_encoding_method_not_permitted_value():
    """An unsupported encoding_method is rejected at construction time."""
    with pytest.raises(ValueError):
        CountFrequencyEncoder(encoding_method="arbitrary")
def test_fit_raises_error_if_df_contains_na(errors, df_enc_na):
    """NaN in the training data raises a ValueError in fit."""
    transformer = CountFrequencyEncoder(errors=errors)
    with pytest.raises(ValueError):
        transformer.fit(df_enc_na)
def test_exception_if_errors_gets_not_permitted_value(errors):
    """An unsupported `errors` value is rejected at construction time."""
    with pytest.raises(ValueError):
        CountFrequencyEncoder(errors=errors)
# Esempio n. 15 (Example no. 15) — snippet-aggregator separator; the stray
# "0" below it was the snippet's score. Commented out: not valid Python.
# NOTE(review): fragment of a scikit-learn Pipeline steps list — the opening
# `Pipeline(steps=` call is outside this chunk (hence the unmatched `)` on
# the final line). Left untouched apart from comments.
[
    # median imputation for continuous features
    ('numeric_impute', MeanMedianImputer(imputation_method='median', variables=config.CONTINUOUS_FEATURES)),
    
    # fill missing categoricals/discretes with a 'missing' label
    ('categorical_impute', CategoricalImputer(imputation_method='missing', 
                                              variables=config.CATEGORICAL_FEATURES+
                                              config.DISCRETE_SET1_FEATURES+config.DISCRETE_SET2_FEATURES+
                                              config.DISCRETE_SET3_FEATURES)),
    
    # group labels rarer than 2% into a single 'Rare' category
    ('rare_label_encode', RareLabelEncoder(tol=0.02, n_categories=10,
                                           variables=config.CATEGORICAL_FEATURES+
                                              config.DISCRETE_SET1_FEATURES+config.DISCRETE_SET2_FEATURES+
                                              config.DISCRETE_SET3_FEATURES,
                                            replace_with='Rare')),
    
    # arbitrary integer codes for one feature group
    ('categorical_encode1', OrdinalEncoder(encoding_method='arbitrary', 
                                          variables=config.CATEGORICAL_FEATURES+config.DISCRETE_SET2_FEATURES)),
    
    # target-ordered integer codes for another group
    ('categorical_encode2', OrdinalEncoder(encoding_method='ordered', 
                                          variables=config.DISCRETE_SET1_FEATURES)),
    
    # category counts for the third group
    ('categorical_encode3', CountFrequencyEncoder(encoding_method='count',
                                          variables=config.DISCRETE_SET3_FEATURES)),
    
    # 20-quantile binning, kept as object so it can be ordinally encoded next
    ('continuous_discretization', EqualFrequencyDiscretiser(q=20, variables=config.CONTINUOUS_FEATURES, return_object=True)),
    
    ('continuous_encoding', OrdinalEncoder(encoding_method='ordered', variables=config.CONTINUOUS_FEATURES)),
    
    ('scaling', StandardScaler()),
        
    ('clf', RandomForestClassifier(criterion='gini', max_depth=10, min_samples_split=10, random_state=0))    
])
def test_raises_non_fitted_error(df_enc):
    """Calling transform before fit must raise NotFittedError."""
    unfitted = CountFrequencyEncoder()
    with pytest.raises(NotFittedError):
        unfitted.transform(df_enc)
# Run the scikit-learn estimator API compliance checks against each
# imputation transformer. The list order is preserved: reordering it
# would change the generated pytest test ids.
@parametrize_with_checks([
    MeanMedianImputer(),
    ArbitraryNumberImputer(),
    CategoricalImputer(fill_value=0, ignore_format=True),
    EndTailImputer(),
    AddMissingIndicator(),
    RandomSampleImputer(),
    DropMissingData(),
])
def test_sklearn_compatible_imputer(estimator, check):
    """Each (estimator, check) pair supplied by the decorator must pass."""
    check(estimator)


# encoding
# Run the scikit-learn estimator API compliance checks against each encoder.
# ignore_format=True lets the encoders accept sklearn's numeric check data;
# the extreme tol / n_categories on RareLabelEncoder effectively disable
# rare-label grouping so the checks see a pass-through encoding.
@parametrize_with_checks([
    CountFrequencyEncoder(ignore_format=True),
    DecisionTreeEncoder(regression=False, ignore_format=True),
    MeanEncoder(ignore_format=True),
    OneHotEncoder(ignore_format=True),
    OrdinalEncoder(ignore_format=True),
    RareLabelEncoder(
        tol=0.00000000001,
        n_categories=100000000000,
        replace_with=10,
        ignore_format=True,
    ),
    WoEEncoder(ignore_format=True),
    PRatioEncoder(ignore_format=True),
])
def test_sklearn_compatible_encoder(estimator, check):
    """Each (estimator, check) pair supplied by the decorator must pass."""
    check(estimator)
def test_error_when_df_contains_unseen_categories(df_enc, df_enc_rare):
    """errors='raise' fails on unseen categories; errors='encode' is silent."""
    expected_msg = (
        "During the encoding, NaN values were introduced in the feature(s) var_A."
    )

    encoder = CountFrequencyEncoder(errors="raise")
    encoder.fit(df_enc)

    # errors='raise' -> ValueError carrying the expected message
    with pytest.raises(ValueError) as record:
        encoder.transform(df_enc_rare)
    assert str(record.value) == expected_msg

    # errors='encode' -> neither error nor warning (warnings escalated
    # to errors only inside this context)
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        encoder = CountFrequencyEncoder(errors="encode")
        encoder.fit(df_enc)
        encoder.transform(df_enc_rare)
def test_automatically_select_variables_encode_with_frequency(df_enc):
    """variables=None: all categorical variables are found and encoded."""
    encoder = CountFrequencyEncoder(encoding_method="frequency", variables=None)
    X = encoder.fit_transform(df_enc)

    # var_A: A x6, B x10, C x4 of 20 rows; var_B: A x10, B x6, C x4
    expected = df_enc.copy()
    expected["var_A"] = [0.3] * 6 + [0.5] * 10 + [0.2] * 4
    expected["var_B"] = [0.5] * 10 + [0.3] * 6 + [0.2] * 4

    # init params
    assert encoder.encoding_method == "frequency"
    assert encoder.variables == ["var_A", "var_B"]
    # fit params
    assert encoder.encoder_dict_ == {
        "var_A": {"A": 0.3, "B": 0.5, "C": 0.2},
        "var_B": {"A": 0.5, "B": 0.3, "C": 0.2},
    }
    assert encoder.input_shape_ == (20, 3)
    # transform params
    pd.testing.assert_frame_equal(X, expected)
# Esempio n. 20 (Example no. 20) — snippet-aggregator separator; the stray
# "0" below it was the snippet's score. Commented out: not valid Python.
def convertFromField36ToField40WithCountFrequencyEncoder(array):
    """Frequency-encode columns 44:49 of `array` in place and return it.

    NOTE(review): the name says fields 36-40 but the slice is 44:49 —
    confirm the intended column range against the caller.

    Improvements over the original: fit + transform collapsed into
    fit_transform, and the (in-place mutated) array is now also returned
    for call-site convenience — backward compatible, since the original
    implicitly returned None and callers relied on the mutation.
    """
    columns = pd.DataFrame(array[:, 44:49])
    encoder = CountFrequencyEncoder(encoding_method='frequency')
    encoded = encoder.fit_transform(columns)
    array[:, 44:49] = encoded
    return array