def test_log_base_e_plus_automatically_find_variables(df_vartypes):
    """Natural-log transform when no variables are passed at init.

    With ``variables=None`` the fitted transformer is expected to report
    ``["Age", "Marks"]`` in ``variables_``; the test also checks that
    ``inverse_transform`` recovers the input frame after rounding.
    """
    log_tr = LogTransformer(base="e", variables=None)
    transformed = log_tr.fit_transform(df_vartypes)

    # Expected frame: only Age and Marks are replaced by their natural logs.
    expected = df_vartypes.copy()
    expected["Age"] = [2.99573, 3.04452, 2.94444, 2.89037]
    expected["Marks"] = [-0.105361, -0.223144, -0.356675, -0.510826]

    # Init parameters are stored untouched.
    assert log_tr.base == "e"
    assert log_tr.variables is None

    # Attributes learned during fit.
    assert log_tr.variables_ == ["Age", "Marks"]
    assert log_tr.n_features_in_ == 5

    # Transformed output.
    pd.testing.assert_frame_equal(transformed, expected)

    # Round trip: undo float noise (and the int -> float cast on Age)
    # before comparing with the input frame.
    restored = log_tr.inverse_transform(transformed)
    restored["Age"] = restored["Age"].round().astype("int64")
    restored["Marks"] = restored["Marks"].round(1)
    pd.testing.assert_frame_equal(restored, df_vartypes)
def test_log_base_10_plus_user_passes_var_list(df_vartypes):
    """Base-10 log transform restricted to a user-selected variable.

    ``variables`` is passed as the plain string ``"Age"``; the test expects
    the init parameter to stay a string while ``variables_`` becomes a list.
    """
    log_tr = LogTransformer(base="10", variables="Age")
    transformed = log_tr.fit_transform(df_vartypes)

    # Expected frame: only Age is replaced by its base-10 log.
    expected = df_vartypes.assign(Age=[1.30103, 1.32222, 1.27875, 1.25527])

    # Init parameters are stored untouched.
    assert log_tr.base == "10"
    assert log_tr.variables == "Age"

    # Attributes learned during fit.
    assert log_tr.variables_ == ["Age"]
    assert log_tr.n_features_in_ == 5

    # Transformed output.
    pd.testing.assert_frame_equal(transformed, expected)

    # Round trip: Age comes back as float, so round and cast before comparing.
    restored = log_tr.inverse_transform(transformed)
    restored["Age"] = restored["Age"].round().astype("int64")
    pd.testing.assert_frame_equal(restored, df_vartypes)
def test_error_if_df_contains_negative_values(df_vartypes):
    """A negative value in a variable must raise ValueError in fit and transform.

    Only the call under test sits inside each ``pytest.raises`` block, so an
    unexpected ValueError from construction or from the set-up ``fit`` fails
    the test instead of being mistaken for the expected error.
    """
    # Copy of the fixture with a single negative entry in Age.
    df_neg = df_vartypes.copy()
    df_neg.loc[1, "Age"] = -1

    # test case 5: when variable contains negative value, fit
    transformer = LogTransformer()
    with pytest.raises(ValueError):
        transformer.fit(df_neg)

    # test case 6: when variable contains negative value, transform
    transformer = LogTransformer()
    transformer.fit(df_vartypes)  # fitting on the clean frame must succeed
    with pytest.raises(ValueError):
        transformer.transform(df_neg)
def test_log_base_10_plus_user_passes_var_list(df_vartypes):
    """Base-10 log transform on a user-selected variable.

    This variant expects ``variables`` to be exposed as a list and checks the
    ``input_shape_`` attribute recorded during fit.
    """
    log_tr = LogTransformer(base="10", variables="Age")
    transformed = log_tr.fit_transform(df_vartypes)

    # Expected frame: only Age is replaced by its base-10 log.
    expected = df_vartypes.assign(Age=[1.30103, 1.32222, 1.27875, 1.25527])

    # Init parameters.
    assert log_tr.base == "10"
    assert log_tr.variables == ["Age"]

    # Attribute learned during fit.
    assert log_tr.input_shape_ == (4, 5)

    # Transformed output.
    pd.testing.assert_frame_equal(transformed, expected)
def test_log_base_e_plus_automatically_find_variables(df_vartypes):
    """Natural-log transform when no variables are passed at init.

    This variant expects ``variables`` to hold ``["Age", "Marks"]`` after
    fitting and checks the ``input_shape_`` attribute recorded during fit.
    """
    log_tr = LogTransformer(base="e", variables=None)
    transformed = log_tr.fit_transform(df_vartypes)

    # Expected frame: only Age and Marks are replaced by their natural logs.
    expected = df_vartypes.assign(
        Age=[2.99573, 3.04452, 2.94444, 2.89037],
        Marks=[-0.105361, -0.223144, -0.356675, -0.510826],
    )

    # Init parameters.
    assert log_tr.base == "e"
    assert log_tr.variables == ["Age", "Marks"]

    # Attribute learned during fit.
    assert log_tr.input_shape_ == (4, 5)

    # Transformed output.
    pd.testing.assert_frame_equal(transformed, expected)
def test_inverse_e_plus_user_passes_var_list(df_vartypes):
    """Round trip (transform then inverse_transform) with the default base.

    test case 7: the user passes ``variables="Age"`` and the frame obtained
    after the inverse transform must equal the input once Age is rounded and
    cast back to int64.
    """
    log_tr = LogTransformer(variables="Age")
    logged = log_tr.fit_transform(df_vartypes)
    restored = log_tr.inverse_transform(logged)

    # The round trip yields floats; restore the integer dtype of Age.
    restored["Age"] = restored["Age"].round().astype("int64")

    # Init parameters (base is not passed, the test expects "e").
    assert log_tr.base == "e"
    assert log_tr.variables == "Age"

    # Attributes learned during fit.
    assert log_tr.variables_ == ["Age"]
    assert log_tr.n_features_in_ == 5

    # Round-trip output.
    pd.testing.assert_frame_equal(restored, df_vartypes)
import pytest
from sklearn.utils.estimator_checks import check_estimator

from feature_engine.transformation import (
    BoxCoxTransformer,
    LogTransformer,
    PowerTransformer,
    ReciprocalTransformer,
    YeoJohnsonTransformer,
)


@pytest.mark.parametrize(
    "Estimator",
    [
        BoxCoxTransformer(),
        LogTransformer(),
        PowerTransformer(),
        ReciprocalTransformer(),
        YeoJohnsonTransformer(),
    ],
)
def test_all_transformers(Estimator):
    """Run scikit-learn's estimator checks on each transformer instance.

    ``check_estimator`` signals a failure by raising, so its return value is
    deliberately not returned: pytest flags test functions that return a
    non-None value.
    """
    check_estimator(Estimator)
def test_non_fitted_error(df_vartypes):
    """Calling transform before fit must raise NotFittedError.

    The transformer is built outside the ``pytest.raises`` block so that only
    the ``transform`` call is allowed to raise the expected error.
    """
    transformer = LogTransformer()
    with pytest.raises(NotFittedError):
        transformer.transform(df_vartypes)
def test_transform_raises_error_if_na_in_df(df_vartypes, df_na):
    """transform must raise ValueError when the input frame contains NA.

    test case 4: the transformer is fitted on the clean frame first. Set-up
    runs outside the ``pytest.raises`` block so that a ValueError raised by
    construction or ``fit`` fails the test instead of satisfying it.
    """
    transformer = LogTransformer()
    transformer.fit(df_vartypes)

    # Restrict df_na to these five columns before transforming
    # (presumably the columns of df_vartypes -- confirm against the fixtures).
    with pytest.raises(ValueError):
        transformer.transform(df_na[["Name", "City", "Age", "Marks", "dob"]])
def test_fit_raises_error_if_na_in_df(df_na):
    """fit must raise ValueError when the training frame contains NA.

    test case 3: the transformer is constructed outside the ``pytest.raises``
    block, so a ValueError from the constructor cannot be mistaken for the
    error expected from ``fit``.
    """
    transformer = LogTransformer()
    with pytest.raises(ValueError):
        transformer.fit(df_na)
def test_error_if_base_value_not_allowed():
    """Constructing the transformer with an unsupported base raises ValueError."""
    invalid_base = "other"
    with pytest.raises(ValueError):
        LogTransformer(base=invalid_base)
# Show the per-variable values stored by the fitted imputer.
print(mean_imputer.imputer_dict_)

X_train = mean_imputer.transform(X_train)
X_test = mean_imputer.transform(X_test)

# %% Verify whether there are missing values.
X_train[cat_vars_with_na].isnull().sum()

[var for var in cat_vars_with_na if X_test[var].isnull().sum() > 0]

# %% Temporal variables.


def elapsed_years(df, var):
    """Replace column ``var`` with ``df['YrSold'] - df[var]``.

    The frame is modified in place and also returned, so the call can be
    used in an assignment.
    """
    df[var] = df['YrSold'] - df[var]
    return df


for var in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
    X_train = elapsed_years(X_train, var)
    X_test = elapsed_years(X_test, var)

# now we drop YrSold.
# Use the DropFeatures instance itself: the previous code re-fitted
# mean_imputer here, so YrSold was never actually removed.
drop_features = DropFeatures(features_to_drop=['YrSold'])
X_train = drop_features.fit_transform(X_train)
X_test = drop_features.transform(X_test)

# %% Numerical variable -- transformation.
log_transformer = LogTransformer(
    variables=["LotFrontage", "1stFlrSF", "GrLivArea"],
)

X_train = log_transformer.fit_transform(X_train)
X_test = log_transformer.transform(X_test)

# check that test set does not contain null values in the engineered variables
[
    var
    for var in ["LotFrontage", "1stFlrSF", "GrLivArea"]
    if X_test[var].isnull().sum() > 0
]
imputation_method="mean", variables=config.model_config.numerical_vars_with_na, ), ), # == TEMPORAL VARIABLES ==== ( "elapsed_time", pp.TemporalVariableTransformer( variables=config.model_config.temporal_vars, reference_variable=config.model_config.ref_var, ), ), ("drop_features", DropFeatures(features_to_drop=[config.model_config.ref_var])), # ==== VARIABLE TRANSFORMATION ===== ("log", LogTransformer(variables=config.model_config.numericals_log_vars)), ( "binarizer", SklearnTransformerWrapper( transformer=Binarizer(threshold=0), variables=config.model_config.binarize_vars, ), ), # === mappers === ( "mapper_qual", pp.Mapper( variables=config.model_config.qual_vars, mappings=config.model_config.qual_mappings, ), ),