def test_ignore_variable_format_with_frequency(df_vartypes):
    """With ignore_format=True, every column is frequency-encoded."""
    transformer = CountFrequencyEncoder(
        encoding_method="frequency", variables=None, ignore_format=True
    )
    output = transformer.fit_transform(df_vartypes)

    # Each of the four rows holds a distinct value, so every frequency is 1/4.
    expected = pd.DataFrame(
        {
            col: [0.25, 0.25, 0.25, 0.25]
            for col in ["Name", "City", "Age", "Marks", "dob"]
        }
    )

    # init params
    assert transformer.encoding_method == "frequency"
    assert transformer.variables is None
    # fit params
    assert transformer.variables_ == ["Name", "City", "Age", "Marks", "dob"]
    assert transformer.n_features_in_ == 5
    # transform output
    pd.testing.assert_frame_equal(output, expected)
def test_column_names_are_numbers(df_numeric_columns):
    """Integer column names are handled when ignore_format=True."""
    transformer = CountFrequencyEncoder(
        encoding_method="frequency", variables=[0, 1, 2, 3], ignore_format=True
    )
    output = transformer.fit_transform(df_numeric_columns)

    # Columns 0-3 each hold four distinct values -> frequency 0.25; column 4
    # (datetime) is not listed in `variables` and passes through untouched.
    expected = pd.DataFrame(
        {
            0: [0.25] * 4,
            1: [0.25] * 4,
            2: [0.25] * 4,
            3: [0.25] * 4,
            4: pd.date_range("2020-02-24", periods=4, freq="T"),
        }
    )

    # init params
    assert transformer.encoding_method == "frequency"
    assert transformer.variables == [0, 1, 2, 3]
    # fit params
    assert transformer.variables_ == [0, 1, 2, 3]
    assert transformer.n_features_in_ == 5
    # transform output
    pd.testing.assert_frame_equal(output, expected)
def clean_data(X):
    """Clean, encode and resample the raw dataset into a model-ready frame.

    NOTE(review): this function mutates the caller's DataFrame in place
    (the row drops and column pops happen on ``X`` itself) — confirm no
    caller reuses the original frame afterwards.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw data containing a 'target' column, an 'ID' column and the
        alphanumeric 'v22' feature.

    Returns
    -------
    pandas.DataFrame
        Cleaned, encoded and undersampled data with 'target' re-attached.
    """
    # Rows without a label are useless for supervised learning.
    X.dropna(subset=['target'], inplace=True)
    y = X.pop('target')
    X.drop(columns='ID', inplace=True)

    # 'v22' is alphanumeric; convert it to an integer via the helper.
    X['v22'] = X['v22'].apply(az_to_int)

    cat_cols = X.select_dtypes(include=['object']).columns.tolist()
    con_cols = X.select_dtypes(include=['number']).columns.tolist()

    # Impute missing values, collapse rare labels, then frequency-encode
    # the categorical columns.
    num_missing_imputer = SimpleImputer(strategy='median')
    cat_missing_imputer = CategoricalImputer(fill_value='__MISS__')
    rare_label_encoder = RareLabelEncoder(
        tol=0.01, n_categories=10, replace_with='__OTHER__'
    )
    cat_freq_encoder = CountFrequencyEncoder(encoding_method="frequency")
    X[con_cols] = num_missing_imputer.fit_transform(X[con_cols])
    X[cat_cols] = cat_missing_imputer.fit_transform(X[cat_cols])
    X[cat_cols] = rare_label_encoder.fit_transform(X[cat_cols])
    X[cat_cols] = cat_freq_encoder.fit_transform(X[cat_cols])

    # Cap extreme values at the 0.5% / 99.5% quantiles.
    trimmer = Winsorizer(capping_method='quantiles', tail='both', fold=0.005)
    X = trimmer.fit_transform(X)

    # Undersample the majority class to a 0.7 sampling ratio.
    undersampler = RandomUnderSampler(sampling_strategy=0.7, random_state=1234)
    X, y_resampled = undersampler.fit_resample(X, y)

    # Drop quasi-constant features (same value in >= 99.8% of rows).
    quasi_constant = DropConstantFeatures(tol=0.998)
    X = quasi_constant.fit_transform(X)
    print(f"Quasi Features to drop {quasi_constant.features_to_drop_}")

    # Remove duplicated features.
    duplicates = DropDuplicateFeatures()
    X = duplicates.fit_transform(X)
    print(f"Duplicate feature sets {duplicates.duplicated_feature_sets_}")
    print(f"Dropping duplicate features {duplicates.features_to_drop_}")

    # Remove highly correlated features (Pearson correlation > 0.95).
    drop_corr = DropCorrelatedFeatures(
        method="pearson", threshold=0.95, missing_values="ignore"
    )
    X = drop_corr.fit_transform(X)
    print(f"Drop correlated feature sets {drop_corr.correlated_feature_sets_}")
    # BUG FIX: message previously read "correlared".
    print(f"Dropping correlated features {drop_corr.features_to_drop_}")

    # Re-attach the (resampled) target so the caller gets one frame back.
    X['target'] = y_resampled
    return X
def test_nan_encoding_for_new_categories_if_errors_is_ignore():
    """Unseen categories become NaN when errors='ignore'."""
    train = pd.DataFrame(
        {"col1": ["a", "a", "b", "a", "c"], "col2": ["1", "2", "3", "1", "2"]}
    )
    test = pd.DataFrame(
        {"col1": ["a", "d", "b", "a", "c"], "col2": ["1", "2", "3", "1", "4"]}
    )

    result = CountFrequencyEncoder(errors="ignore").fit(train).transform(test)

    # one unseen category per column ('d' and '4') -> exactly two NaNs
    assert pd.isnull(result).sum().sum() == 2

    # seen categories keep their training counts; unseen ones are NaN
    expected = pd.DataFrame(
        {"col1": [3, nan, 1, 3, 1], "col2": [2, 2, 1, 2, nan]}
    )
    pd.testing.assert_frame_equal(result, expected)
def test_encode_1_variable_with_counts(df_enc):
    """Count-encode a single explicitly listed variable."""
    transformer = CountFrequencyEncoder(encoding_method="count", variables=["var_A"])
    output = transformer.fit_transform(df_enc)

    # var_A holds 6 x 'A', 10 x 'B' and 4 x 'C' in the fixture
    expected = df_enc.copy()
    expected["var_A"] = [6] * 6 + [10] * 10 + [4] * 4

    # init params
    assert transformer.encoding_method == "count"
    assert transformer.variables == ["var_A"]
    # fit params
    assert transformer.variables_ == ["var_A"]
    assert transformer.encoder_dict_ == {"var_A": {"A": 6, "B": 10, "C": 4}}
    assert transformer.n_features_in_ == 3
    # transform output
    pd.testing.assert_frame_equal(output, expected)
def test_variables_cast_as_category(df_enc_category_dtypes):
    """Encoding works when the variable has pandas 'category' dtype."""
    transformer = CountFrequencyEncoder(encoding_method="count", variables=["var_A"])
    output = transformer.fit_transform(df_enc_category_dtypes)

    expected = df_enc_category_dtypes.copy()
    expected["var_A"] = [6] * 6 + [10] * 10 + [4] * 4

    pd.testing.assert_frame_equal(output, expected, check_dtype=False)
    # counts come out as integers ...
    assert output["var_A"].dtypes == int

    transformer = CountFrequencyEncoder(encoding_method="frequency", variables=["var_A"])
    output = transformer.fit_transform(df_enc_category_dtypes)
    # ... while frequencies come out as floats
    assert output["var_A"].dtypes == float
def test_zero_encoding_for_unseen_categories_if_errors_is_encode():
    """Unseen categories are encoded as 0 when errors='encode'."""
    train = pd.DataFrame(
        {"col1": ["a", "a", "b", "a", "c"], "col2": ["1", "2", "3", "1", "2"]}
    )
    test = pd.DataFrame(
        {"col1": ["a", "d", "b", "a", "c"], "col2": ["1", "2", "3", "1", "4"]}
    )

    # count encoding: unseen 'd' and '4' map to 0, no NaNs introduced
    result = CountFrequencyEncoder(errors="encode").fit(train).transform(test)
    assert pd.isnull(result).sum().sum() == 0
    expected = pd.DataFrame({"col1": [3, 0, 1, 3, 1], "col2": [2, 2, 1, 2, 0]})
    pd.testing.assert_frame_equal(result, expected)

    # frequency encoding: unseen categories likewise map to 0
    result = (
        CountFrequencyEncoder(encoding_method="frequency", errors="encode")
        .fit(train)
        .transform(test)
    )
    assert pd.isnull(result).sum().sum() == 0
    expected = pd.DataFrame(
        {"col1": [0.6, 0, 0.2, 0.6, 0.2], "col2": [0.4, 0.4, 0.2, 0.4, 0]}
    )
    pd.testing.assert_frame_equal(result, expected)
def test_error_if_input_df_contains_categories_not_present_in_fit_df(
    df_enc, df_enc_rare
):
    """Transforming data with unseen categories emits a UserWarning.

    NOTE(review): despite the "error" in the name, the default behaviour
    here is a warning — presumably the name predates the current API.
    """
    with pytest.warns(UserWarning):
        transformer = CountFrequencyEncoder()
        transformer.fit(df_enc)
        transformer.transform(df_enc_rare)
def test_no_error_triggered_when_df_contains_unseen_categories_and_errors_is_encode(
    df_enc, df_enc_rare
):
    """No warning and no error when errors='encode' meets unseen categories.

    The dataset to transform contains categories not present during fit
    (unseen categories). With errors='encode' the transformer should
    handle them silently, so warnings are escalated to errors and a clean
    run is expected.
    """
    # BUG FIX: simplefilter("error") was previously called OUTSIDE the
    # catch_warnings() context, so the escalated filter leaked into every
    # subsequent test in the session. Setting it inside the context (as
    # test_error_when_df_contains_unseen_categories does) restores the
    # original filters on exit.
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        encoder = CountFrequencyEncoder(errors="encode")
        encoder.fit(df_enc)
        encoder.transform(df_enc_rare)
def test_warning_when_df_contains_unseen_categories(df_enc, df_enc_rare):
    """errors='ignore' emits exactly one UserWarning with a precise message."""
    expected_msg = (
        "During the encoding, NaN values were introduced in the feature(s) var_A."
    )

    encoder = CountFrequencyEncoder(errors="ignore")
    encoder.fit(df_enc)

    with pytest.warns(UserWarning) as record:
        encoder.transform(df_enc_rare)

    # exactly one warning was raised, carrying the expected message
    assert len(record) == 1
    assert record[0].message.args[0] == expected_msg
def test_transform_raises_error_if_df_contains_na(df_enc, df_enc_na):
    """transform() refuses data that contains missing values."""
    with pytest.raises(ValueError):
        transformer = CountFrequencyEncoder()
        transformer.fit(df_enc)
        transformer.transform(df_enc_na)
def test_error_if_encoding_method_not_permitted_value():
    """An unrecognised encoding_method raises ValueError at init time."""
    with pytest.raises(ValueError):
        CountFrequencyEncoder(encoding_method="arbitrary")
def test_fit_raises_error_if_df_contains_na(errors, df_enc_na):
    """fit() refuses data with missing values for every `errors` setting."""
    transformer = CountFrequencyEncoder(errors=errors)
    with pytest.raises(ValueError):
        transformer.fit(df_enc_na)
def test_exception_if_errors_gets_not_permitted_value(errors):
    """An invalid `errors` value raises ValueError at init time."""
    with pytest.raises(ValueError):
        CountFrequencyEncoder(errors=errors)
[
    # Pipeline steps (fragment: the enclosing call opens before this view
    # and is closed by the trailing parenthesis below).
    # Impute continuous features with their median.
    ('numeric_impute', MeanMedianImputer(imputation_method='median',
                                         variables=config.CONTINUOUS_FEATURES)),
    # Fill missing categorical/discrete values with a 'missing' label.
    ('categorical_impute', CategoricalImputer(imputation_method='missing',
                                              variables=config.CATEGORICAL_FEATURES+
                                              config.DISCRETE_SET1_FEATURES+config.DISCRETE_SET2_FEATURES+
                                              config.DISCRETE_SET3_FEATURES)),
    # Group labels seen in < 2% of rows under a single 'Rare' label.
    ('rare_label_encode', RareLabelEncoder(tol=0.02, n_categories=10,
                                           variables=config.CATEGORICAL_FEATURES+
                                           config.DISCRETE_SET1_FEATURES+config.DISCRETE_SET2_FEATURES+
                                           config.DISCRETE_SET3_FEATURES,
                                           replace_with='Rare')),
    # Three ordinal/count encoders, one per feature group.
    ('categorical_encode1', OrdinalEncoder(encoding_method='arbitrary',
                                           variables=config.CATEGORICAL_FEATURES+config.DISCRETE_SET2_FEATURES)),
    ('categorical_encode2', OrdinalEncoder(encoding_method='ordered',
                                           variables=config.DISCRETE_SET1_FEATURES)),
    ('categorical_encode3', CountFrequencyEncoder(encoding_method='count',
                                                  variables=config.DISCRETE_SET3_FEATURES)),
    # Discretise continuous features into 20 equal-frequency bins, then
    # encode the resulting (object-typed) bins by target order.
    ('continuous_discretization', EqualFrequencyDiscretiser(q=20,
                                                            variables=config.CONTINUOUS_FEATURES,
                                                            return_object=True)),
    ('continuous_encoding', OrdinalEncoder(encoding_method='ordered',
                                           variables=config.CONTINUOUS_FEATURES)),
    # Scale everything, then fit the classifier.
    ('scaling', StandardScaler()),
    ('clf', RandomForestClassifier(criterion='gini', max_depth=10,
                                   min_samples_split=10, random_state=0))
])
def test_raises_non_fitted_error(df_enc):
    """Calling transform() before fit() raises NotFittedError."""
    with pytest.raises(NotFittedError):
        transformer = CountFrequencyEncoder()
        transformer.transform(df_enc)
# Run every imputation transformer through scikit-learn's estimator
# API-compliance checks.
@parametrize_with_checks([
    MeanMedianImputer(),
    ArbitraryNumberImputer(),
    CategoricalImputer(fill_value=0, ignore_format=True),
    EndTailImputer(),
    AddMissingIndicator(),
    RandomSampleImputer(),
    DropMissingData(),
])
def test_sklearn_compatible_imputer(estimator, check):
    # `check` is one sklearn compliance check, parametrized per estimator.
    check(estimator)


# encoding
# Run every categorical encoder through the same compliance checks.
@parametrize_with_checks([
    CountFrequencyEncoder(ignore_format=True),
    DecisionTreeEncoder(regression=False, ignore_format=True),
    MeanEncoder(ignore_format=True),
    OneHotEncoder(ignore_format=True),
    OrdinalEncoder(ignore_format=True),
    # NOTE(review): extreme tol/n_categories presumably keep any label
    # from being grouped as rare during the checks — confirm.
    RareLabelEncoder(
        tol=0.00000000001,
        n_categories=100000000000,
        replace_with=10,
        ignore_format=True,
    ),
    WoEEncoder(ignore_format=True),
    PRatioEncoder(ignore_format=True),
])
def test_sklearn_compatible_encoder(estimator, check):
    # `check` is one sklearn compliance check, parametrized per estimator.
    check(estimator)
def test_error_when_df_contains_unseen_categories(df_enc, df_enc_rare):
    """errors='raise' fails on unseen categories; errors='encode' does not."""
    expected_msg = (
        "During the encoding, NaN values were introduced in the feature(s) var_A."
    )

    transformer = CountFrequencyEncoder(errors="raise")
    transformer.fit(df_enc)

    # errors='raise': unseen categories abort the transform with this message
    with pytest.raises(ValueError) as record:
        transformer.transform(df_enc_rare)
    assert str(record.value) == expected_msg

    # errors='encode': the same data passes with neither error nor warning
    with warnings.catch_warnings():
        warnings.simplefilter("error")
        transformer = CountFrequencyEncoder(errors="encode")
        transformer.fit(df_enc)
        transformer.transform(df_enc_rare)
def test_automatically_select_variables_encode_with_frequency(df_enc):
    """With variables=None, all categorical columns are frequency-encoded."""
    transformer = CountFrequencyEncoder(encoding_method="frequency", variables=None)
    output = transformer.fit_transform(df_enc)

    expected = df_enc.copy()
    # var_A: 6/20 'A', 10/20 'B', 4/20 'C'
    expected["var_A"] = [0.3] * 6 + [0.5] * 10 + [0.2] * 4
    # var_B: 10/20 'A', 6/20 'B', 4/20 'C'
    expected["var_B"] = [0.5] * 10 + [0.3] * 6 + [0.2] * 4

    # init params
    assert transformer.encoding_method == "frequency"
    assert transformer.variables == ["var_A", "var_B"]
    # fit params
    assert transformer.encoder_dict_ == {
        "var_A": {"A": 0.3, "B": 0.5, "C": 0.2},
        "var_B": {"A": 0.5, "B": 0.3, "C": 0.2},
    }
    # NOTE(review): `input_shape_` is asserted here while sibling tests use
    # `n_features_in_` — confirm against the installed feature_engine version.
    assert transformer.input_shape_ == (20, 3)
    # transform output
    pd.testing.assert_frame_equal(output, expected)
def convertFromField36ToField40WithCountFrequencyEncoder(array):
    """Frequency-encode columns 44-48 of ``array`` in place.

    The five columns (fields 36-40) are fitted and transformed with a
    frequency ``CountFrequencyEncoder`` and written back into the array.
    Returns None; the caller's array is modified directly.
    """
    fields = pd.DataFrame(array[:, 44:49])
    frequency_encoder = CountFrequencyEncoder(encoding_method='frequency')
    frequency_encoder.fit(fields)
    array[:, 44:49] = frequency_encoder.transform(fields)