def test_sklearn_ohe_with_crossvalidation():
    """
    Created 2022-02-14 to test fix to issue # 368
    """
    # Set up test pipeline with wrapped OneHotEncoder, with simple regression model
    # to be able to run cross-validation; use sklearn CA housing data
    df = fetch_california_housing(as_frame=True).frame
    y = df["MedHouseVal"]
    X = (
        df[["HouseAge", "AveBedrms"]]
        .assign(
            AveBedrms_cat=lambda x: pd.cut(
                x.AveBedrms, [0, 1, 2, 3, 4, np.inf]
            ).astype(str)
        )
        .drop(columns="AveBedrms")
    )

    pipeline: Pipeline = Pipeline(
        steps=[
            (
                "encode_cat",
                SklearnTransformerWrapper(
                    transformer=OneHotEncoder(drop="first", sparse=False),
                    variables=["AveBedrms_cat"],
                ),
            ),
            ("cleanup", DropFeatures(["AveBedrms_cat"])),
            ("model", Lasso()),
        ]
    )

    # Run cross-validation
    results: np.ndarray = cross_val_score(
        pipeline, X, y, scoring="neg_mean_squared_error", cv=3
    )
    assert not any(np.isnan(i) for i in results)
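# Quick illustration (not part of the test above) of the pd.cut binning used to
# build the toy categorical column: continuous values are bucketed into interval
# labels, which are then cast to strings. Output shown is approximate and may
# vary slightly with the pandas version.
# import numpy as np
# import pandas as pd
# pd.cut(pd.Series([0.5, 1.5, 2.5, 7.0]), [0, 1, 2, 3, 4, np.inf]).astype(str).tolist()
# -> ['(0.0, 1.0]', '(1.0, 2.0]', '(2.0, 3.0]', '(4.0, inf]']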
def test_drop_2_variables_integer_colnames(df_numeric_columns):
    transformer = DropFeatures(features_to_drop=[0, 1])
    X = transformer.fit_transform(df_numeric_columns)

    # expected result
    df = pd.DataFrame(
        {
            2: [20, 21, 19, 18],
            3: [0.9, 0.8, 0.7, 0.6],
            4: pd.date_range("2020-02-24", periods=4, freq="T"),
        }
    )

    # init params
    assert transformer.features_to_drop == [0, 1]

    # transform params
    pd.testing.assert_frame_equal(X, df)
def test_drop_2_variables(df_vartypes):
    transformer = DropFeatures(features_to_drop=["City", "dob"])
    X = transformer.fit_transform(df_vartypes)

    # expected result
    df = pd.DataFrame(
        {
            "Name": ["tom", "nick", "krish", "jack"],
            "Age": [20, 21, 19, 18],
            "Marks": [0.9, 0.8, 0.7, 0.6],
        }
    )

    # init params
    assert transformer.features_to_drop == ["City", "dob"]

    # transform params
    assert X.shape == (4, 3)
    assert type(X) == pd.DataFrame
    pd.testing.assert_frame_equal(X, df)
def test_drop_1_variable(df_vartypes):
    transformer = DropFeatures(features_to_drop="City")
    X = transformer.fit_transform(df_vartypes)

    # expected result
    df = pd.DataFrame(
        {
            "Name": ["tom", "nick", "krish", "jack"],
            "Age": [20, 21, 19, 18],
            "Marks": [0.9, 0.8, 0.7, 0.6],
            "dob": pd.date_range("2020-02-24", periods=4, freq="T"),
        }
    )

    # init params
    assert transformer.features_to_drop == "City"

    # transform params
    assert X.shape == (4, 4)
    assert type(X) == pd.DataFrame
    pd.testing.assert_frame_equal(X, df)
def test_error_if_empty_list(df_vartypes):
    # test case 6: passing an empty list
    with pytest.raises(ValueError):
        transformer = DropFeatures(features_to_drop=[])
        transformer.fit_transform(df_vartypes)
def test_error_when_returning_empty_dataframe(df_vartypes):
    # test case 5: dropping all columns raises an error
    with pytest.raises(ValueError):
        transformer = DropFeatures(features_to_drop=list(df_vartypes.columns))
        transformer.fit_transform(df_vartypes)
def test_error_if_non_existing_variables(df_vartypes):
    # test case 2: passing variables that don't exist in the dataframe
    with pytest.raises(KeyError):
        transformer = DropFeatures(features_to_drop=["last_name"])
        transformer.fit_transform(df_vartypes)
    DropCorrelatedFeatures,
    DropDuplicateFeatures,
    DropFeatures,
    DropHighPSIFeatures,
    RecursiveFeatureAddition,
    RecursiveFeatureElimination,
    SelectByShuffling,
    SelectBySingleFeaturePerformance,
    SelectByTargetMeanPerformance,
    SmartCorrelatedSelection,
)

_logreg = LogisticRegression(C=0.0001, max_iter=2, random_state=1)

_estimators = [
    DropFeatures(features_to_drop=["0"]),
    DropConstantFeatures(missing_values="ignore"),
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    DropHighPSIFeatures(bins=5),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),
    SelectByTargetMeanPerformance(bins=3, regression=False),
    SelectBySingleFeaturePerformance(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100),
]

_multivariate_estimators = [
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
print(mean_imputer.imputer_dict_)

X_train = mean_imputer.transform(X_train)
X_test = mean_imputer.transform(X_test)

# %% Verify whether there are missing values left.
X_train[cat_vars_with_na].isnull().sum()
[var for var in cat_vars_with_na if X_test[var].isnull().sum() > 0]

# %% Temporal variables.
def elapsed_years(df, var):
    # capture elapsed time between the year variable and the year the house was sold
    df[var] = df['YrSold'] - df[var]
    return df

for var in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
    X_train = elapsed_years(X_train, var)
    X_test = elapsed_years(X_test, var)

# now we drop YrSold
drop_features = DropFeatures(features_to_drop=['YrSold'])
X_train = drop_features.fit_transform(X_train)
X_test = drop_features.transform(X_test)

# %% Numerical variable transformation.
log_transformer = LogTransformer(
    variables=["LotFrontage", "1stFlrSF", "GrLivArea"],
)
X_train = log_transformer.fit_transform(X_train)
X_test = log_transformer.transform(X_test)

# check that the test set does not contain null values in the engineered variables
[var for var in ["LotFrontage", "1stFlrSF", "GrLivArea"] if X_test[var].isnull().sum() > 0]
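# %% The same steps can also be chained into a single scikit-learn Pipeline.
# A minimal, self-contained sketch with toy data (column names and values are
# illustrative only, not the project's actual pipeline); it assumes feature_engine
# is installed. The temporal feature is computed up front, as elapsed_years does above.
import pandas as pd
from sklearn.pipeline import Pipeline
from feature_engine.selection import DropFeatures
from feature_engine.transformation import LogTransformer

toy = pd.DataFrame(
    {
        "YrSold": [2006, 2007, 2008],
        "YearBuilt": [1990, 1980, 2000],
        "GrLivArea": [1500.0, 1200.0, 1800.0],
    }
)
toy["YearBuilt"] = toy["YrSold"] - toy["YearBuilt"]  # elapsed years

toy_pipe = Pipeline(
    steps=[
        ("drop_yrsold", DropFeatures(features_to_drop=["YrSold"])),
        ("log", LogTransformer(variables=["GrLivArea"])),
    ]
)
print(toy_pipe.fit_transform(toy))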
# transformers
@parametrize_with_checks(
    [
        BoxCoxTransformer(),
        LogTransformer(),
        PowerTransformer(),
        ReciprocalTransformer(),
        YeoJohnsonTransformer(),
    ]
)
def test_sklearn_compatible_transformer(estimator, check):
    check(estimator)


# selectors
@parametrize_with_checks(
    [
        DropFeatures(features_to_drop=["0"]),
        DropConstantFeatures(missing_values="ignore"),
        DropDuplicateFeatures(),
        DropCorrelatedFeatures(),
        SmartCorrelatedSelection(),
        DropHighPSIFeatures(bins=5),
        SelectByShuffling(
            LogisticRegression(max_iter=2, random_state=1), scoring="accuracy"
        ),
        SelectBySingleFeaturePerformance(
            LogisticRegression(max_iter=2, random_state=1), scoring="accuracy"
        ),
        RecursiveFeatureAddition(
            LogisticRegression(max_iter=2, random_state=1), scoring="accuracy"
        ),
        RecursiveFeatureElimination(
            LogisticRegression(max_iter=2, random_state=1),
            scoring="accuracy",
def test_error_if_fit_input_not_dataframe():
    # test case 3: passing an input that is not a dataframe
    with pytest.raises(TypeError):
        transformer = DropFeatures(features_to_drop=["Name"])
        transformer.fit({"Name": ["Karthik"]})
"mean_imputation", MeanMedianImputer( imputation_method="mean", variables=config.model_config.numerical_vars_with_na, ), ), # == TEMPORAL VARIABLES ==== ( "elapsed_time", pp.TemporalVariableTransformer( variables=config.model_config.temporal_vars, reference_variable=config.model_config.ref_var, ), ), ("drop_features", DropFeatures(features_to_drop=[config.model_config.ref_var])), # ==== VARIABLE TRANSFORMATION ===== ("log", LogTransformer(variables=config.model_config.numericals_log_vars)), ( "binarizer", SklearnTransformerWrapper( transformer=Binarizer(threshold=0), variables=config.model_config.binarize_vars, ), ), # === mappers === ( "mapper_qual", pp.Mapper( variables=config.model_config.qual_vars, mappings=config.model_config.qual_mappings,
def test_non_fitted_error(df_numeric_columns):
    # test case 8: when fit is not called prior to transform
    with pytest.raises(NotFittedError):
        transformer = DropFeatures(features_to_drop=[0, 1])
        transformer.transform(df_numeric_columns)