def test_sklearn_ohe_with_crossvalidation():
    """
    Check that a pipeline using a wrapped OneHotEncoder yields no NaN scores
    under cross-validation.

    Created 2022-02-14 to test fix to issue # 368
    """

    # Set up test pipeline with wrapped OneHotEncoder, with simple regression model
    # to be able to run cross-validation; use sklearn CA housing data
    df = fetch_california_housing(as_frame=True).frame
    y = df["MedHouseVal"]

    # Bin AveBedrms into string-labelled intervals and keep only the binned
    # column, so the encoder has a categorical variable to work on.
    X = (
        df[["HouseAge", "AveBedrms"]]
        .assign(
            AveBedrms_cat=lambda x: pd.cut(
                x.AveBedrms, [0, 1, 2, 3, 4, np.inf]
            ).astype(str)
        )
        .drop(columns="AveBedrms")
    )

    pipeline: Pipeline = Pipeline(steps=[
        (
            "encode_cat",
            SklearnTransformerWrapper(
                # NOTE(review): newer scikit-learn releases rename `sparse` to
                # `sparse_output` - confirm against the pinned version.
                transformer=OneHotEncoder(drop="first", sparse=False),
                variables=["AveBedrms_cat"],
            ),
        ),
        ("cleanup", DropFeatures(["AveBedrms_cat"])),
        ("model", Lasso()),
    ])

    # Run cross-validation
    results: np.ndarray = cross_val_score(
        pipeline,
        X,
        y,
        scoring="neg_mean_squared_error",
        cv=3,
    )

    # Vectorised check: no fold may have produced a NaN score.
    assert not np.isnan(results).any()
Ejemplo n.º 2
0
def test_drop_2_variables_integer_colnames(df_numeric_columns):
    """Drop two columns identified by integer labels and compare the result.

    Args:
        df_numeric_columns: fixture defined outside this view; assumed to be a
            DataFrame with integer column labels 0 to 4 - confirm against the
            fixture definition.
    """
    transformer = DropFeatures(features_to_drop=[0, 1])
    X = transformer.fit_transform(df_numeric_columns)

    # expected result
    df = pd.DataFrame(
        {
            2: [20, 21, 19, 18],
            3: [0.9, 0.8, 0.7, 0.6],
            # "min" is the non-deprecated spelling of the minutely alias "T".
            4: pd.date_range("2020-02-24", periods=4, freq="min"),
        }
    )

    # init params
    assert transformer.features_to_drop == [0, 1]
    # transform params
    pd.testing.assert_frame_equal(X, df)
Ejemplo n.º 3
0
def test_drop_2_variables(df_vartypes):
    """Drop the "City" and "dob" columns and compare the remaining frame.

    Args:
        df_vartypes: fixture defined outside this view; assumed to be a
            four-row DataFrame with Name, City, Age, Marks and dob columns -
            confirm against the fixture definition.
    """
    transformer = DropFeatures(features_to_drop=["City", "dob"])
    X = transformer.fit_transform(df_vartypes)

    # expected result
    df = pd.DataFrame(
        {
            "Name": ["tom", "nick", "krish", "jack"],
            "Age": [20, 21, 19, 18],
            "Marks": [0.9, 0.8, 0.7, 0.6],
        }
    )

    # init params
    assert transformer.features_to_drop == ["City", "dob"]
    # transform params
    assert X.shape == (4, 3)
    assert isinstance(X, pd.DataFrame)
    pd.testing.assert_frame_equal(X, df)
Ejemplo n.º 4
0
def test_drop_1_variable(df_vartypes):
    """Drop a single column given as a plain string and compare the result.

    Args:
        df_vartypes: fixture defined outside this view; assumed to be a
            four-row DataFrame with Name, City, Age, Marks and dob columns -
            confirm against the fixture definition.
    """
    transformer = DropFeatures(features_to_drop="City")
    X = transformer.fit_transform(df_vartypes)

    # expected result
    df = pd.DataFrame(
        {
            "Name": ["tom", "nick", "krish", "jack"],
            "Age": [20, 21, 19, 18],
            "Marks": [0.9, 0.8, 0.7, 0.6],
            # "min" is the non-deprecated spelling of the minutely alias "T".
            "dob": pd.date_range("2020-02-24", periods=4, freq="min"),
        }
    )

    # init params: the string is stored as given, not wrapped in a list
    assert transformer.features_to_drop == "City"

    # transform params
    assert X.shape == (4, 4)
    assert isinstance(X, pd.DataFrame)
    pd.testing.assert_frame_equal(X, df)
Ejemplo n.º 5
0
def test_error_if_empty_list(df_vartypes):
    """An empty ``features_to_drop`` list is expected to raise ``ValueError``."""
    # test case 6: passing an empty list
    with pytest.raises(ValueError):
        DropFeatures(features_to_drop=[]).fit_transform(df_vartypes)
Ejemplo n.º 6
0
def test_error_when_returning_empty_dataframe(df_vartypes):
    """Dropping every column of the input is expected to raise ``ValueError``."""
    # test case 5: asking to drop all columns must raise an error
    with pytest.raises(ValueError):
        every_column = list(df_vartypes.columns)
        DropFeatures(features_to_drop=every_column).fit_transform(df_vartypes)
Ejemplo n.º 7
0
def test_error_if_non_existing_variables(df_vartypes):
    """Dropping a column named "last_name" is expected to raise ``KeyError``."""
    # test case 2: passing variables that do not exist
    with pytest.raises(KeyError):
        DropFeatures(features_to_drop=["last_name"]).fit_transform(df_vartypes)
Ejemplo n.º 8
0
    DropCorrelatedFeatures,
    DropDuplicateFeatures,
    DropFeatures,
    DropHighPSIFeatures,
    RecursiveFeatureAddition,
    RecursiveFeatureElimination,
    SelectByShuffling,
    SelectBySingleFeaturePerformance,
    SelectByTargetMeanPerformance,
    SmartCorrelatedSelection,
)

# Single LogisticRegression instance shared by every selector below that takes
# an estimator.
# NOTE(review): C=0.0001 and max_iter=2 presumably keep fitting cheap rather
# than accurate - confirm.
_logreg = LogisticRegression(C=0.0001, max_iter=2, random_state=1)

# One configured instance of each selector class.
# NOTE(review): presumably consumed by parametrized checks outside this view.
_estimators = [
    DropFeatures(features_to_drop=["0"]),
    DropConstantFeatures(missing_values="ignore"),
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    DropHighPSIFeatures(bins=5),
    SmartCorrelatedSelection(),
    SelectByShuffling(estimator=_logreg, scoring="accuracy"),
    SelectByTargetMeanPerformance(bins=3, regression=False),
    SelectBySingleFeaturePerformance(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureAddition(estimator=_logreg, scoring="accuracy"),
    RecursiveFeatureElimination(estimator=_logreg, scoring="accuracy", threshold=-100),
]

_multivariate_estimators = [
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
Ejemplo n.º 9
0
# Show what the imputer stored in `imputer_dict_`
# (presumably the per-variable imputation values - confirm).
print(mean_imputer.imputer_dict_)

# Apply the already-fitted imputer to both splits.
X_train = mean_imputer.transform(X_train)
X_test = mean_imputer.transform(X_test)

# %% Verify whether there are missing values.
# These are bare expressions: their values are only displayed when the cell is
# run interactively.
# NOTE(review): this checks `cat_vars_with_na` right after the mean imputer -
# confirm that the categorical list is the intended one here.
X_train[cat_vars_with_na].isnull().sum()
[var for var in cat_vars_with_na if X_test[var].isnull().sum() > 0]

#%% Temporal variables.
def elapsed_years(df, var):
    """Overwrite column `var` with `YrSold - var` and return the same frame.

    The frame is modified in place; the returned object is `df` itself.
    """
    years_before_sale = df['YrSold'] - df[var]
    df[var] = years_before_sale
    return df
# Replace each year column with its difference from YrSold.
for var in ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt']:
    X_train = elapsed_years(X_train, var)
    X_test = elapsed_years(X_test, var)

# now we drop YrSold.
# Apply the dropper itself so that YrSold is actually removed; it is fitted on
# the training set and then reused on the test set.
drop_features = DropFeatures(features_to_drop=['YrSold'])
X_train = drop_features.fit_transform(X_train)
X_test = drop_features.transform(X_test)

# %% Numerical variable -- transformation.
# Fit the log transformer on the training set only, then reuse the fitted
# transformer on the test set.
log_transformer = LogTransformer(
    variables=["LotFrontage", "1stFlrSF", "GrLivArea"],
)
X_train = log_transformer.fit_transform(X_train)
X_test = log_transformer.transform(X_test)

# check that test set does not contain null values in the engineered variables
# (bare expression: the resulting list is only displayed when run interactively)
[var for var in ["LotFrontage", "1stFlrSF", "GrLivArea"] if X_test[var].isnull().sum() > 0]
# transformers
# NOTE(review): presumably scikit-learn's `parametrize_with_checks`, which
# supplies the (estimator, check) pairs - confirm against the imports.
@parametrize_with_checks([
    BoxCoxTransformer(),
    LogTransformer(),
    PowerTransformer(),
    ReciprocalTransformer(),
    YeoJohnsonTransformer(),
])
def test_sklearn_compatible_transformer(estimator, check):
    """Run the supplied `check` callable against the supplied `estimator`."""
    check(estimator)


# selectors
@parametrize_with_checks([
    DropFeatures(features_to_drop=["0"]),
    DropConstantFeatures(missing_values="ignore"),
    DropDuplicateFeatures(),
    DropCorrelatedFeatures(),
    SmartCorrelatedSelection(),
    DropHighPSIFeatures(bins=5),
    SelectByShuffling(LogisticRegression(max_iter=2, random_state=1),
                      scoring="accuracy"),
    SelectBySingleFeaturePerformance(LogisticRegression(max_iter=2,
                                                        random_state=1),
                                     scoring="accuracy"),
    RecursiveFeatureAddition(LogisticRegression(max_iter=2, random_state=1),
                             scoring="accuracy"),
    RecursiveFeatureElimination(
        LogisticRegression(max_iter=2, random_state=1),
        scoring="accuracy",
Ejemplo n.º 11
0
def test_error_if_fit_input_not_dataframe():
    """Fitting on a plain dict instead of a DataFrame is expected to raise ``TypeError``."""
    # test case 3: passing a different input than dataframe
    with pytest.raises(TypeError):
        DropFeatures(features_to_drop=["Name"]).fit({"Name": ["Karthik"]})
Ejemplo n.º 12
0
     "mean_imputation",
     MeanMedianImputer(
         imputation_method="mean",
         variables=config.model_config.numerical_vars_with_na,
     ),
 ),
 # == TEMPORAL VARIABLES ====
 (
     "elapsed_time",
     pp.TemporalVariableTransformer(
         variables=config.model_config.temporal_vars,
         reference_variable=config.model_config.ref_var,
     ),
 ),
 ("drop_features",
  DropFeatures(features_to_drop=[config.model_config.ref_var])),
 # ==== VARIABLE TRANSFORMATION =====
 ("log", LogTransformer(variables=config.model_config.numericals_log_vars)),
 (
     "binarizer",
     SklearnTransformerWrapper(
         transformer=Binarizer(threshold=0),
         variables=config.model_config.binarize_vars,
     ),
 ),
 # === mappers ===
 (
     "mapper_qual",
     pp.Mapper(
         variables=config.model_config.qual_vars,
         mappings=config.model_config.qual_mappings,
Ejemplo n.º 13
0
def test_non_fitted_error(df_numeric_columns):
    """Calling ``transform`` without a prior ``fit`` is expected to raise ``NotFittedError``."""
    # test case 8: when fit is not called prior to transform
    with pytest.raises(NotFittedError):
        DropFeatures(features_to_drop=[0, 1]).transform(df_numeric_columns)