Example #1
0
def test_MeanCategoricalEncoder():
    """Mean-encode a single categorical column and check the learned state.

    The input has 20 rows: 10 of category 'A', 6 of 'B' and 4 of 'C', each
    group containing exactly two positive targets. The expected encodings are
    therefore 2/10, 2/6 and 2/4 respectively.
    """
    # input data: every category group starts with two 1s followed by 0s
    labels = ['A'] * 10 + ['B'] * 6 + ['C'] * 4
    outcomes = [1, 1] + [0] * 8 + [1, 1] + [0] * 4 + [1, 1] + [0] * 2
    df = pd.DataFrame({'category': labels, 'target': outcomes})

    # expected output: each label replaced by the target mean of its group
    expected = pd.DataFrame({
        'category': [0.200000] * 10 + [0.333333] * 6 + [0.500000] * 4,
        'target': list(outcomes),
    })

    encoder = MeanCategoricalEncoder(variables=['category'])
    encoder.fit(df[['category']], df['target'])
    X = encoder.transform(df[['category']])

    # transformed values
    pd.testing.assert_frame_equal(X, expected[['category']])

    # init parameter is kept as given
    assert encoder.variables == ['category']

    # learned parameters
    learned_means = {'A': 0.20000000000000001,
                     'B': 0.33333333333333331,
                     'C': 0.5}
    assert encoder.encoder_dict_ == {'category': learned_means}
    assert encoder.input_shape_ == (20, 1)
Example #2
0
# Remove the 'ParcelNumber' and 'Address' columns from `homes` in place.
homes.drop(['ParcelNumber', 'Address'], axis=1, inplace=True)

# Save the first dataframe that we will need for the application
# (written without the index column).
homes.to_csv('Data/houses.csv', index=False)

# Work on a copy so that `homes` itself is not modified by the encoding below.
neighborhoods = homes.copy()

# The encoder below overwrites 'Neighborhood' with the mean target value of
# each neighborhood, so first keep the original neighborhood name in a
# separate column.
neighborhoods['neighborhood_name'] = neighborhoods['Neighborhood']

# Only the 'Neighborhood' column is configured for encoding.
mean_enc = MeanCategoricalEncoder(variables=['Neighborhood'])

# Fit the encoder, using 'SalePrice' as the target.
# NOTE(review): the full frame (including 'SalePrice') is passed as X — confirm
# this is intended.
mean_enc.fit(neighborhoods, neighborhoods['SalePrice'])

# Transform the neighborhoods dataframe and rebind the name to the result.
neighborhoods = mean_enc.transform(neighborhoods)

# Load a dataframe that has the geocoordinates of each neighborhood.
# `gpd` is presumably geopandas — confirm against the file's imports.
hoods = gpd.read_file('Los Angeles Neighborhood Map.geojson')
# Rename 'name' so the column matches `neighborhoods['neighborhood_name']`.
hoods = hoods.rename(columns={'name': 'neighborhood_name'})
# Cast the coordinate columns to float.
hoods[['longitude', 'latitude']] = hoods[['longitude',
                                          'latitude']].astype(float)

# Create a dataframe that averages the selected attributes by neighborhood
# name. 'Neighborhood' here is the column the encoder transformed above.
# NOTE(review): 'EnviornmentalHazards' is spelled this way in the code;
# presumably it matches the column name in the data — verify before changing.
avg = neighborhoods.groupby('neighborhood_name')[[
    'HomeSize', 'LotSize', 'Bedrooms', 'Bathrooms', 'SexOffenders',
    'EnviornmentalHazards', 'Age', 'SalePrice', 'Neighborhood', 'YearBuilt'
]].mean()
Example #3
0
def test_MeanCategoricalEncoder(dataframe_enc, dataframe_enc_rare,
                                dataframe_enc_na):
    """Check MeanCategoricalEncoder's encodings, learned state and errors.

    Parameters (pytest fixtures defined outside this file):
        dataframe_enc: frame with 'var_A', 'var_B' and 'target' columns; the
            assertions below expect 20 rows.
        dataframe_enc_rare: frame used to trigger the unseen-category warning
            (presumably contains categories absent from ``dataframe_enc``).
        dataframe_enc_na: frame used to trigger the missing-value errors
            (presumably contains NaN in the encoded columns).

    In the error and warning cases only the call under test sits inside the
    ``pytest.raises`` / ``pytest.warns`` block, so an exception or warning
    coming from the setup calls cannot satisfy the check by accident.
    """
    # test case 1: 1 variable
    encoder = MeanCategoricalEncoder(variables=['var_A'])
    encoder.fit(dataframe_enc[['var_A', 'var_B']], dataframe_enc['target'])
    X = encoder.transform(dataframe_enc[['var_A', 'var_B']])

    # transformed dataframe
    transf_df = dataframe_enc.copy()
    transf_df['var_A'] = [
        0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
        0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.2, 0.2,
        0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.5, 0.5, 0.5, 0.5
    ]

    # init params
    assert encoder.variables == ['var_A']
    # fit params
    assert encoder.encoder_dict_ == {
        'var_A': {
            'A': 0.3333333333333333,
            'B': 0.2,
            'C': 0.5
        }
    }
    assert encoder.input_shape_ == (20, 2)
    # transform params
    pd.testing.assert_frame_equal(X, transf_df[['var_A', 'var_B']])

    # test case 2: automatically select variables
    encoder = MeanCategoricalEncoder(variables=None)
    encoder.fit(dataframe_enc[['var_A', 'var_B']], dataframe_enc['target'])
    X = encoder.transform(dataframe_enc[['var_A', 'var_B']])

    # transformed dataframe
    transf_df['var_A'] = [
        0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
        0.3333333333333333, 0.3333333333333333, 0.3333333333333333, 0.2, 0.2,
        0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.5, 0.5, 0.5, 0.5
    ]
    transf_df['var_B'] = [
        0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.3333333333333333,
        0.3333333333333333, 0.3333333333333333, 0.3333333333333333,
        0.3333333333333333, 0.3333333333333333, 0.5, 0.5, 0.5, 0.5
    ]

    # init params
    assert encoder.variables == ['var_A', 'var_B']
    # fit params
    assert encoder.encoder_dict_ == {
        'var_A': {
            'A': 0.3333333333333333,
            'B': 0.2,
            'C': 0.5
        },
        'var_B': {
            'A': 0.2,
            'B': 0.3333333333333333,
            'C': 0.5
        }
    }
    assert encoder.input_shape_ == (20, 2)
    # transform params
    pd.testing.assert_frame_equal(X, transf_df[['var_A', 'var_B']])

    # test case 3: raises error if target is not passed
    encoder = MeanCategoricalEncoder()
    with pytest.raises(TypeError):
        encoder.fit(dataframe_enc)

    # test case 4: when dataset to be transformed contains categories not
    # present in training dataset
    encoder = MeanCategoricalEncoder()
    encoder.fit(dataframe_enc[['var_A', 'var_B']], dataframe_enc['target'])
    with pytest.warns(UserWarning):
        encoder.transform(dataframe_enc_rare[['var_A', 'var_B']])

    # test case 5: when dataset contains na, fit method
    encoder = MeanCategoricalEncoder()
    with pytest.raises(ValueError):
        encoder.fit(dataframe_enc_na[['var_A', 'var_B']],
                    dataframe_enc_na['target'])

    # test case 6: when dataset contains na, transform method
    encoder = MeanCategoricalEncoder()
    encoder.fit(dataframe_enc[['var_A', 'var_B']], dataframe_enc['target'])
    with pytest.raises(ValueError):
        encoder.transform(dataframe_enc_na)

    # test case 7: transform before fit raises NotFittedError. This exercises
    # the encoder under test rather than OrdinalCategoricalEncoder.
    encoder = MeanCategoricalEncoder()
    with pytest.raises(NotFittedError):
        encoder.transform(dataframe_enc)
Example #4
0
    return df.groupby([variable])[target].mean().to_dict()

def integer_encode(train, test, variable, ordinal_mapping):
    """Replace the categories of one column in both data sets, in place.

    Args:
        train: training data; ``train[variable]`` must support ``.map`` and
            item assignment (assumes a pandas DataFrame).
        test: test data with the same column, treated the same way.
        variable: name of the column to encode.
        ordinal_mapping: mapping from category to its replacement value,
            passed straight to ``.map``.

    Returns:
        None. Both ``train`` and ``test`` are modified in place.
    """
    # Operate on the arguments, not on the module-level X_train / X_test:
    # the previous body ignored its parameters and failed with NameError
    # whenever those globals were not defined.
    train[variable] = train[variable].map(ordinal_mapping)
    test[variable] = test[variable].map(ordinal_mapping)
	
# Encode 'sex' and 'embarked' one column at a time.
for variable in ['sex', 'embarked']:
    # Build the category -> value mapping from the training set only, with
    # 'survived' as the target column (find_category_mappings is defined
    # elsewhere; presumably it returns the per-category target mean — confirm).
    mappings = find_category_mappings(X_train, variable, 'survived')
    # Apply the same mapping to both the train and the test set.
    integer_encode(X_train, X_test, variable, mappings)

	
-- #With Feature-Engine
# NOTE(review): newer feature-engine releases appear to expose this encoder as
# feature_engine.encoding.MeanEncoder — confirm against the installed version.
from feature_engine.categorical_encoders import MeanCategoricalEncoder
# Encode three categorical columns in one step.
mean_enc = MeanCategoricalEncoder(
    variables=['cabin', 'sex', 'embarked'])
# Fit on the training data only, then apply the fitted encoder to both sets.
mean_enc.fit(X_train, y_train)
X_train = mean_enc.transform(X_train)
X_test = mean_enc.transform(X_test)

# Bare expressions: these only display a value in an interactive session
# (notebook / REPL) and have no effect when run as a script.
mean_enc.encoder_dict_
mean_enc.variables


					      -
---------Probability Ratio Encoding
# Only for binary classification problems
# Replacing categorical labels with this encoding generates missing values
# for categories that are present in the test set but were not seen in the training set.
# It is therefore extremely important to handle rare labels beforehand.
			   
ratio_enc = WoERatioCategoricalEncoder(