Exemple #1
0
def test_RareLabelEncoder():
    """Check RareLabelCategoricalEncoder output at two tolerance levels.

    Both scenarios use the same 63-row input with ten distinct labels in
    'category' and a constant 'target' column; only ``tol`` differs.
    """
    n_rows = 63
    # Label runs in the order they appear in the 'category' column.
    leading_labels = ['A'] * 10 + ['B'] * 10 + ['C'] * 10 + ['D'] * 10
    two_row_labels = ['E'] * 2 + ['F'] * 2 + ['G'] * 2 + ['H'] * 2
    trailing_labels = ['I'] * 10 + ['K'] * 5
    raw_labels = leading_labels + two_row_labels + trailing_labels

    def make_frame(labels):
        # Fresh frame per call so input and expectation never share data.
        return pd.DataFrame({'category': list(labels),
                             'target': [1] * n_rows})

    # Scenario 1: tol=0.05 -- the 2-row labels are expected to become 'Rare'.
    data = make_frame(raw_labels)
    expected = make_frame(
        leading_labels + ['Rare'] * len(two_row_labels) + trailing_labels)

    rare_grouper = RareLabelCategoricalEncoder(variables=['category'],
                                               n_categories=9,
                                               tol=0.05)
    rare_grouper.fit(data)
    result = rare_grouper.transform(data)

    pd.testing.assert_frame_equal(result, expected)
    assert rare_grouper.variables == ['category']
    assert rare_grouper.input_shape_ == (63, 2)

    # Scenario 2: tol=0.01 -- the expected output is identical to the input.
    data = make_frame(raw_labels)
    expected = make_frame(raw_labels)

    rare_grouper = RareLabelCategoricalEncoder(variables=['category'],
                                               n_categories=9,
                                               tol=0.01)
    rare_grouper.fit(data)
    result = rare_grouper.transform(data)

    pd.testing.assert_frame_equal(result, expected)
    assert rare_grouper.variables == ['category']
    assert rare_grouper.input_shape_ == (63, 2)
Exemple #2
0
---# Transforming						   
# Run the rare_encoding helper once per listed column, feeding the
# train/test frames returned by one call into the next (last argument 0.05).
for variable in ('Neighborhood', 'Exterior1st', 'Exterior2nd'):
    X_train, X_test = rare_encoding(
        X_train,
        X_test,
        variable,
        0.05,
    )
	
---With Feature-Engine

from feature_engine.categorical_encoders import RareLabelCategoricalEncoder						   
# Rare value encoder
columns_to_regroup = [
    'Neighborhood', 'Exterior1st', 'Exterior2nd',
    'MasVnrType', 'ExterQual', 'BsmtCond',
]
rare_encoder = RareLabelCategoricalEncoder(
    variables=columns_to_regroup,  # variables to re-group
    n_categories=4,  # minimal number of categories the variable should have to re-group rare categories
    tol=0.05,  # minimal percentage to be considered non-rare
)

# The encoder is fitted on the training frame, then applied to both frames.
rare_encoder.fit(X_train)
X_train, X_test = (
    rare_encoder.transform(X_train),
    rare_encoder.transform(X_test),
)

# Notebook-style inspection of the fitted encoder.
rare_encoder.variables
# the encoder_dict_ is a dictionary of variable: frequent labels pair
rare_encoder.encoder_dict_


----------DISCRETISATION---------
----Equal width discretisation
# with Scikit Learn
from sklearn.preprocessing import KBinsDiscretizer
# 10 bins, ordinal bin codes, 'uniform' strategy; fitted on two columns of
# the training frame.
binned_columns = ['age', 'fare']
disc = KBinsDiscretizer(
    strategy='uniform',
    encode='ordinal',
    n_bins=10,
)
disc.fit(X_train[binned_columns])
Exemple #3
0
def test_RareLabelEncoder(dataframe_enc_big, dataframe_enc_big_na):
    """Test RareLabelCategoricalEncoder grouping, validation, warnings and NA checks.

    Args:
        dataframe_enc_big: fixture frame; the assertions below expect it to
            have shape (40, 3) with columns var_A, var_B and var_C.
        dataframe_enc_big_na: fixture frame for which ``fit`` and
            ``transform`` are expected to raise ``ValueError`` (per the test
            case comments, it contains NA).

    Each ``pytest.raises`` / ``pytest.warns`` block wraps only the single
    call expected to raise or warn, so an error coming from a setup step
    cannot satisfy the expectation by accident.
    """
    # test case 1: default-style params, automatically select variables
    encoder = RareLabelCategoricalEncoder(tol=0.06,
                                          n_categories=5,
                                          variables=None,
                                          replace_with='Rare')
    X = encoder.fit_transform(dataframe_enc_big)

    df = {
        'var_A': ['A'] * 6 + ['B'] * 10 + ['C'] * 4 + ['D'] * 10 +
        ['Rare'] * 4 + ['G'] * 6,
        'var_B': ['A'] * 10 + ['B'] * 6 + ['C'] * 4 + ['D'] * 10 +
        ['Rare'] * 4 + ['G'] * 6,
        'var_C': ['A'] * 4 + ['B'] * 6 + ['C'] * 10 + ['D'] * 10 +
        ['Rare'] * 4 + ['G'] * 6,
    }
    df = pd.DataFrame(df)

    # init params
    assert encoder.tol == 0.06
    assert encoder.n_categories == 5
    assert encoder.replace_with == 'Rare'
    assert encoder.variables == ['var_A', 'var_B', 'var_C']
    # fit params
    assert encoder.input_shape_ == (40, 3)
    # transform params
    pd.testing.assert_frame_equal(X, df)

    # test case 2: user provides alternative grouping value and variable list
    encoder = RareLabelCategoricalEncoder(tol=0.15,
                                          n_categories=5,
                                          variables=['var_A', 'var_B'],
                                          replace_with='Other')
    X = encoder.fit_transform(dataframe_enc_big)

    df = {
        'var_A': ['A'] * 6 + ['B'] * 10 + ['Other'] * 4 + ['D'] * 10 +
        ['Other'] * 4 + ['G'] * 6,
        'var_B': ['A'] * 10 + ['B'] * 6 + ['Other'] * 4 + ['D'] * 10 +
        ['Other'] * 4 + ['G'] * 6,
        'var_C': ['A'] * 4 + ['B'] * 6 + ['C'] * 10 + ['D'] * 10 + ['E'] * 2 +
        ['F'] * 2 + ['G'] * 6
    }
    df = pd.DataFrame(df)

    # init params
    assert encoder.tol == 0.15
    assert encoder.n_categories == 5
    assert encoder.replace_with == 'Other'
    assert encoder.variables == ['var_A', 'var_B']
    # fit params
    assert encoder.input_shape_ == (40, 3)
    # transform params
    pd.testing.assert_frame_equal(X, df)

    # invalid init params: the constructor itself must raise, so the result
    # is not bound to a name
    with pytest.raises(ValueError):
        RareLabelCategoricalEncoder(tol=5)

    with pytest.raises(ValueError):
        RareLabelCategoricalEncoder(n_categories=0.5)

    with pytest.raises(ValueError):
        RareLabelCategoricalEncoder(replace_with=0.5)

    # test case 3: when the variable has low cardinality
    # NOTE(review): assumes the UserWarning is emitted by fit(), not by the
    # constructor -- confirm against the encoder implementation.
    encoder = RareLabelCategoricalEncoder(n_categories=10)
    with pytest.warns(UserWarning):
        encoder.fit(dataframe_enc_big)

    # test case 4: when dataset contains na, fit method
    encoder = RareLabelCategoricalEncoder(n_categories=4)
    with pytest.raises(ValueError):
        encoder.fit(dataframe_enc_big_na)

    # test case 5: when dataset contains na, transform method
    # fit runs outside the block: only transform is allowed to raise here
    encoder = RareLabelCategoricalEncoder(n_categories=4)
    encoder.fit(dataframe_enc_big)
    with pytest.raises(ValueError):
        encoder.transform(dataframe_enc_big_na)
## remove category 'l' from variable A4 and category 'gg' from variable A5
# Both conditions go into one boolean mask so the frame is filtered once;
# a row is kept only when it passes both checks.
read_data = read_data[(read_data.A4 != 'l') & (read_data.A5 != 'gg')]

## encode rare categories in dataset
rare_encoder = RareLabelCategoricalEncoder(
    variables=[  # variables to re-group
        'A1', 'A4', 'A5', 'A6',
        'A7', 'A9', 'A10', 'A12',
    ],
    n_categories=2,  # minimal number of categories the variable should have to re-group rare categories
    tol=0.05,  # minimal percentage to be considered non-rare
)
rare_encoder.fit(read_data)
rare_encoder.variables  # notebook-style inspection; value is not used
read_data = rare_encoder.transform(read_data)


# Separate encoder for A13, configured with a higher tolerance (0.1).
rare_encoder_A13 = RareLabelCategoricalEncoder(
    variables=['A13'],  # variables to re-group
    n_categories=2,  # minimal number of categories the variable should have to re-group rare categories
    tol=0.1,  # minimal percentage to be considered non-rare
)
rare_encoder_A13.fit(read_data)
rare_encoder_A13.variables  # notebook-style inspection; value is not used
read_data = rare_encoder_A13.transform(read_data)

## standardize numerical variables
# A fresh StandardScaler is fitted on the numeric columns and its output is
# written back into those same columns.
numeric_columns = ['A2', 'A3', 'A8', 'A11', 'A14', 'A15']
scaled_values = StandardScaler().fit_transform(read_data[numeric_columns])
read_data[numeric_columns] = scaled_values