def test_features_to_extract_param():
    """``features_to_extract`` is stored on the instance exactly as passed."""
    unset = DatetimeFeatures(features_to_extract=None)
    assert unset.features_to_extract is None

    single_feature = DatetimeFeatures(features_to_extract=["year"])
    assert single_feature.features_to_extract == ["year"]

    every_feature = DatetimeFeatures(features_to_extract="all")
    assert every_feature.features_to_extract == "all"
def test_extract_datetime_features_with_default_options(
    df_datetime, df_datetime_transformed
):
    """A default transformer yields the default features for each var in ``vars_dt``."""
    result = DatetimeFeatures().fit_transform(df_datetime)

    # Columns in ``vars_non_dt`` come first, then one column per (variable, feature).
    expected_columns = vars_non_dt + [
        var + feat for var in vars_dt for feat in feat_names_default
    ]
    pd.testing.assert_frame_equal(result, df_datetime_transformed[expected_columns])
def test_raises_error_when_variables_not_datetime(df_datetime):
    """``fit`` raises when the selected or available variables are not datetime."""
    # asking for not datetime variable(s)
    for requested in (["Age"], ["Name", "Age", "date_obj1"], "index"):
        with pytest.raises(TypeError):
            DatetimeFeatures(variables=requested).fit(df_datetime)

    # passing a df that contains no datetime variables
    with pytest.raises(ValueError):
        DatetimeFeatures().fit(df_datetime[["Name", "Age"]])
def test_extract_datetime_features_from_specified_variables(
    df_datetime, df_datetime_transformed
):
    """Extract default features from explicitly selected variables or the index."""

    def default_feature_columns(variables):
        # One output column per (variable, default feature suffix), variable-major.
        return [var + feat for var in variables for feat in feat_names_default]

    # single datetime variable
    result = DatetimeFeatures(variables="date_obj1").fit_transform(df_datetime)
    expected_columns = (
        vars_non_dt
        + ["datetime_range", "date_obj2", "time_obj"]
        + default_feature_columns(["date_obj1"])
    )
    pd.testing.assert_frame_equal(result, df_datetime_transformed[expected_columns])

    # multiple datetime variables
    result = DatetimeFeatures(
        variables=["datetime_range", "date_obj2"]
    ).fit_transform(df_datetime)
    expected_columns = (
        vars_non_dt
        + ["date_obj1", "time_obj"]
        + default_feature_columns(["datetime_range", "date_obj2"])
    )
    pd.testing.assert_frame_equal(result, df_datetime_transformed[expected_columns])

    # multiple datetime variables in different order than they appear in the df
    result = DatetimeFeatures(
        variables=["date_obj2", "date_obj1"]
    ).fit_transform(df_datetime)
    expected_columns = (
        vars_non_dt
        + ["datetime_range", "time_obj"]
        + default_feature_columns(["date_obj2", "date_obj1"])
    )
    pd.testing.assert_frame_equal(result, df_datetime_transformed[expected_columns])

    # datetime variable is index
    result = DatetimeFeatures(
        variables="index", features_to_extract=["month", "day_of_month"]
    ).fit_transform(dates_idx_dt)
    index_features = pd.DataFrame(
        [[2, 27], [2, 28], [3, 1], [3, 2]],
        index=dates_idx_dt.index,
        columns=["month", "day_of_month"],
    )
    pd.testing.assert_frame_equal(
        result,
        pd.concat([dates_idx_dt, index_features], axis=1),
    )
def test_raises_error_when_wrong_input_params(
    _features_to_extract, _variables, _other_params
):
    """Invalid init arguments make the ``DatetimeFeatures`` constructor raise.

    Each constructor call is a plain statement rather than the operand of an
    ``assert``: ``assert`` statements are removed when Python runs with ``-O``,
    which would skip the call entirely and make ``pytest.raises`` fail for the
    wrong reason.
    """
    with pytest.raises(ValueError):
        DatetimeFeatures(features_to_extract=_features_to_extract)
    with pytest.raises(ValueError):
        DatetimeFeatures(variables=_variables)
    with pytest.raises(ValueError):
        DatetimeFeatures(missing_values=_other_params)
    with pytest.raises(ValueError):
        DatetimeFeatures(drop_original=_other_params)
    with pytest.raises(ValueError):
        DatetimeFeatures(utc=_other_params)
def test_extract_features_with_different_datetime_parsing_options(df_datetime):
    """``dayfirst`` and ``yearfirst`` change the values extracted from ``date_obj2``."""
    day_first = DatetimeFeatures(features_to_extract=["day_of_month"], dayfirst=True)
    result = day_first.fit_transform(df_datetime[["date_obj2"]])
    expected = pd.DataFrame({"date_obj2_day_of_month": [10, 31, 30, 17]})
    pd.testing.assert_frame_equal(result, expected)

    year_first = DatetimeFeatures(features_to_extract=["year"], yearfirst=True)
    result = year_first.fit_transform(df_datetime[["date_obj2"]])
    expected = pd.DataFrame({"date_obj2_year": [2010, 2009, 1995, 2004]})
    pd.testing.assert_frame_equal(result, expected)
def test_extract_features_from_categorical_variable(
    df_datetime, df_datetime_transformed
):
    """Default features are extracted from a column cast to ``category`` dtype."""
    as_category = df_datetime["date_obj1"].astype("category")
    cat_date = pd.DataFrame({"date_obj1": as_category})

    result = DatetimeFeatures(variables="date_obj1").fit_transform(cat_date)

    expected_columns = ["date_obj1" + feat for feat in feat_names_default]
    pd.testing.assert_frame_equal(result, df_datetime_transformed[expected_columns])
def test_default_params():
    """A transformer built without arguments exposes the expected init defaults."""
    dtf = DatetimeFeatures()
    assert isinstance(dtf, DatetimeFeatures)

    assert dtf.variables is None
    assert dtf.features_to_extract is None
    assert dtf.drop_original  # only truthiness is checked, not identity with True
    assert dtf.utc is None
    assert dtf.dayfirst is False
    assert dtf.yearfirst is False
    assert dtf.missing_values == "raise"
def test_get_feature_names_out_from_pipeline(df_datetime, df_datetime_transformed):
    """``get_feature_names_out`` works when the transformer sits in a Pipeline."""
    pipe = Pipeline([("transformer", DatetimeFeatures())])
    result = pipe.fit_transform(df_datetime)

    # Without input_features the names match the transformed frame's columns.
    assert list(result.columns) == pipe.get_feature_names_out()

    names_for_all = pipe.get_feature_names_out(input_features=vars_dt)
    assert names_for_all == [
        var + feat for var in vars_dt for feat in feat_names_default
    ]

    names_for_one = pipe.get_feature_names_out(input_features=["date_obj1"])
    assert names_for_one == ["date_obj1" + feat for feat in feat_names_default]
def test_raises_error_when_transform_df_with_different_n_variables(df_datetime):
    """``transform`` raises ValueError for a frame restricted to ``vars_dt``."""
    fitted = DatetimeFeatures()
    fitted.fit(df_datetime)

    # different number of columns than the df used to fit
    with pytest.raises(ValueError):
        fitted.transform(df_datetime[vars_dt])
def test_extract_features_from_variables_containing_nans():
    """With ``missing_values="ignore"`` the expected output carries NaN entries."""
    year_extractor = DatetimeFeatures(
        features_to_extract=["year"], missing_values="ignore"
    )
    result = year_extractor.fit_transform(dates_nan)
    expected = pd.DataFrame({"dates_na_year": [2010, np.nan, 1922, np.nan]})
    pd.testing.assert_frame_equal(result, expected)

    # dt variable is index
    month_extractor = DatetimeFeatures(
        variables="index", features_to_extract=["month"], missing_values="ignore"
    )
    result = month_extractor.fit_transform(dates_idx_nan)
    month_column = pd.DataFrame(
        {"month": [2, np.nan, 6, np.nan]}, index=dates_idx_nan.index
    )
    pd.testing.assert_frame_equal(
        result,
        pd.concat([dates_idx_nan, month_column], axis=1),
    )
def test_extract_specified_datetime_features(df_datetime, df_datetime_transformed):
    """Only the requested features are produced, in the requested order."""
    requested_sets = (
        ["semester", "week"],
        # different order than they appear in the glossary
        ["hour", "day_of_week"],
    )
    for requested in requested_sets:
        result = DatetimeFeatures(features_to_extract=requested).fit_transform(
            df_datetime
        )
        expected_columns = vars_non_dt + [
            var + "_" + feat for var in vars_dt for feat in requested
        ]
        pd.testing.assert_frame_equal(
            result, df_datetime_transformed[expected_columns]
        )
def test_extract_features_from_different_timezones(
    df_datetime, df_datetime_transformed
):
    """Hours are extracted from mixed-offset strings with ``utc=True`` only.

    With ``utc=True`` the expected hour is the ``time_obj_hour`` column of the
    transformed fixture minus each row's offset. With ``utc=False`` the
    transformer is expected to raise ``ValueError`` with a specific message.
    """
    time_zones = [4, -1, 9, -7]
    # Append an offset suffix to each time string so every row has its own zone.
    tz_df = pd.DataFrame(
        {"time_obj": df_datetime["time_obj"].add(["+4", "-1", "+9", "-7"])}
    )
    transformer = DatetimeFeatures(
        variables="time_obj", features_to_extract=["hour"], utc=True
    )
    X = transformer.fit_transform(tz_df)
    pd.testing.assert_frame_equal(
        X,
        df_datetime_transformed[["time_obj_hour"]].apply(
            lambda x: x.subtract(time_zones)
        ),
    )

    exp_err_msg = (
        "ValueError: variable(s) time_obj "
        "could not be converted to datetime. Try setting utc=True"
    )
    with pytest.raises(ValueError) as errinfo:
        # Deliberately NOT wrapped in ``assert``: truth-testing a returned
        # DataFrame raises ValueError by itself, which would satisfy
        # ``pytest.raises`` even if the transformer never raised; the call
        # would also be stripped entirely under ``python -O``.
        DatetimeFeatures(
            variables="time_obj", features_to_extract=["hour"], utc=False
        ).fit_transform(tz_df)
    # Checked after the ``with`` block; inside it this line would be unreachable.
    assert str(errinfo.value) == exp_err_msg
def test_extract_features_from_localized_tz_variables():
    """Hour extraction from a tz-localized column, with ``utc`` unset and True."""
    timestamps = [
        "2018-10-28 01:30:00",
        "2018-10-28 02:00:00",
        "2018-10-28 02:30:00",
        "2018-10-28 02:00:00",
        "2018-10-28 02:30:00",
        "2018-10-28 03:00:00",
        "2018-10-28 03:30:00",
    ]
    tz_df = pd.DataFrame({"date_var": timestamps})
    tz_df["date_var"] = pd.to_datetime(tz_df["date_var"]).dt.tz_localize(
        tz="US/Eastern"
    )

    def check_hour_extraction(transformer, expected_utc, expected_hours):
        # init params
        assert transformer.variables is None
        assert transformer.utc is expected_utc
        assert transformer.features_to_extract == ["hour"]
        # fit attr
        assert transformer.variables_ == ["date_var"]
        assert transformer.features_to_extract_ == ["hour"]
        assert transformer.n_features_in_ == 1
        # transform
        X = transformer.transform(tz_df)
        df_expected = pd.DataFrame({"date_var_hour": expected_hours})
        pd.testing.assert_frame_equal(X, df_expected)

    # when utc is None
    check_hour_extraction(
        DatetimeFeatures(features_to_extract=["hour"]).fit(tz_df),
        None,
        [1, 2, 2, 2, 2, 3, 3],
    )

    # when utc is True
    check_hour_extraction(
        DatetimeFeatures(features_to_extract=["hour"], utc=True).fit(tz_df),
        True,
        [5, 6, 6, 6, 6, 7, 7],
    )
def test_extract_features_without_dropping_original_variables(
    df_datetime, df_datetime_transformed
):
    """With ``drop_original=False`` the source columns stay in the output."""
    transformer = DatetimeFeatures(
        variables=["datetime_range", "date_obj2"],
        features_to_extract=["week", "quarter"],
        drop_original=False,
    )
    result = transformer.fit_transform(df_datetime)

    new_feature_names = [
        var + "_" + feat
        for var in ["datetime_range", "date_obj2"]
        for feat in ["week", "quarter"]
    ]
    # Expected layout: vars_non_dt columns, the vars_dt columns taken from the
    # input frame, then the newly extracted features.
    expected_parts = (
        [df_datetime_transformed[column] for column in vars_non_dt]
        + [df_datetime[var] for var in vars_dt]
        + [df_datetime_transformed[name] for name in new_feature_names]
    )
    pd.testing.assert_frame_equal(result, pd.concat(expected_parts, axis=1))
def test_raises_error_when_fitting_not_a_df(_not_a_df):
    """``fit`` raises TypeError for the ``_not_a_df`` input."""
    unfitted = DatetimeFeatures()

    # trying to fit not a df
    with pytest.raises(TypeError):
        unfitted.fit(_not_a_df)
def test_raises_error_when_transforming_not_a_df(_not_a_df, df_datetime):
    """A fitted transformer raises TypeError when transforming ``_not_a_df``."""
    fitted = DatetimeFeatures()
    fitted.fit(df_datetime)

    # trying to transform not a df
    with pytest.raises(TypeError):
        fitted.transform(_not_a_df)
def test_variables_params(_variables):
    """The ``variables`` init argument is stored on the instance unchanged."""
    transformer = DatetimeFeatures(variables=_variables)
    assert transformer.variables == _variables
def test_raises_error_when_nan_in_transform_df(df_datetime):
    """``transform`` raises ValueError for the NaN-containing datasets."""
    column_based = DatetimeFeatures()
    column_based.fit(df_datetime)
    # dataset containing nans
    # NOTE(review): dates_nan is defined outside this view; if its columns differ
    # from df_datetime, the ValueError may come from a column mismatch rather
    # than from NaN detection -- confirm.
    with pytest.raises(ValueError):
        column_based.transform(dates_nan)

    index_based = DatetimeFeatures(variables="index")
    index_based.fit(dates_idx_dt)
    with pytest.raises(ValueError):
        index_based.transform(dates_idx_nan)
def test_raises_non_fitted_error(df_datetime):
    """Calling ``transform`` on an unfitted transformer raises NotFittedError."""
    unfitted = DatetimeFeatures()

    # trying to transform before fitting
    with pytest.raises(NotFittedError):
        unfitted.transform(df_datetime)
def test_get_feature_names_out(df_datetime, df_datetime_transformed):
    """``get_feature_names_out`` across several transformer configurations."""

    def suffixed(variables, suffixes):
        # Variable-major cross product of variable names and feature suffixes.
        return [var + suffix for var in variables for suffix in suffixes]

    # default features from all variables
    transformer = DatetimeFeatures()
    result = transformer.fit_transform(df_datetime)
    assert list(result.columns) == transformer.get_feature_names_out()
    assert transformer.get_feature_names_out(input_features=vars_dt) == suffixed(
        vars_dt, feat_names_default
    )
    assert transformer.get_feature_names_out(
        input_features=["date_obj1"]
    ) == suffixed(["date_obj1"], feat_names_default)

    # default features from 1 variable
    transformer = DatetimeFeatures(variables="date_obj1")
    result = transformer.fit_transform(df_datetime)
    assert list(result.columns) == transformer.get_feature_names_out()
    assert transformer.get_feature_names_out(
        input_features=["date_obj1"]
    ) == suffixed(["date_obj1"], feat_names_default)

    # all features
    transformer = DatetimeFeatures(features_to_extract="all")
    result = transformer.fit_transform(df_datetime)
    assert list(result.columns) == transformer.get_feature_names_out()
    feat_names_all = [FEATURES_SUFFIXES[feat] for feat in FEATURES_SUPPORTED]
    assert transformer.get_feature_names_out(input_features=vars_dt) == suffixed(
        vars_dt, feat_names_all
    )
    assert transformer.get_feature_names_out(
        input_features=["date_obj1"]
    ) == suffixed(["date_obj1"], feat_names_all)

    # specified features
    transformer = DatetimeFeatures(features_to_extract=["semester", "week"])
    result = transformer.fit_transform(df_datetime)
    assert list(result.columns) == transformer.get_feature_names_out()
    specified_suffixes = ["_" + feat for feat in ["semester", "week"]]
    assert transformer.get_feature_names_out(input_features=vars_dt) == suffixed(
        vars_dt, specified_suffixes
    )
    assert transformer.get_feature_names_out(
        input_features=["date_obj1"]
    ) == suffixed(["date_obj1"], specified_suffixes)

    # features were extracted from index
    transformer = DatetimeFeatures(
        variables="index", features_to_extract=["semester", "week"]
    )
    result = transformer.fit_transform(dates_idx_dt)
    # user passes something other than index as input_features
    with pytest.raises(ValueError):
        transformer.get_feature_names_out(input_features="not_index")
    with pytest.raises(ValueError):
        transformer.get_feature_names_out(input_features=["still", "not", "index"])
    # input_features is None
    assert list(result.columns) == transformer.get_feature_names_out()
    # input_features is index
    assert ["semester", "week"] == transformer.get_feature_names_out(
        input_features="index"
    )

    # when drop original is False
    transformer = DatetimeFeatures(drop_original=False)
    result = transformer.fit_transform(df_datetime)
    assert list(result.columns) == transformer.get_feature_names_out()
    assert transformer.get_feature_names_out(input_features=vars_dt) == suffixed(
        vars_dt, feat_names_default
    )
    assert transformer.get_feature_names_out(
        input_features=["date_obj1"]
    ) == suffixed(["date_obj1"], feat_names_default)

    with pytest.raises(ValueError):
        # error expected when the user passes a string instead of a list
        transformer.get_feature_names_out(input_features="date_obj1")

    with pytest.raises(ValueError):
        # error expected for a name that features were not extracted from
        transformer.get_feature_names_out(input_features=["color"])
def test_extract_all_datetime_features(df_datetime, df_datetime_transformed):
    """``features_to_extract="all"`` matches the fixture minus ``vars_dt``."""
    result = DatetimeFeatures(features_to_extract="all").fit_transform(df_datetime)
    expected = df_datetime_transformed.drop(vars_dt, axis=1)
    pd.testing.assert_frame_equal(result, expected)
def test_attributes_upon_fitting(df_datetime):
    """``fit`` sets ``variables_``, ``features_to_extract_`` and ``n_features_in_``."""
    # default configuration
    default_dtf = DatetimeFeatures()
    default_dtf.fit(df_datetime)
    assert default_dtf.variables_ == vars_dt
    assert default_dtf.features_to_extract_ == FEATURES_DEFAULT
    assert default_dtf.n_features_in_ == df_datetime.shape[1]

    # one variable given as a string, all supported features
    all_features_dtf = DatetimeFeatures(
        variables="date_obj1", features_to_extract="all"
    )
    all_features_dtf.fit(df_datetime)
    assert all_features_dtf.variables_ == ["date_obj1"]
    assert all_features_dtf.features_to_extract_ == FEATURES_SUPPORTED

    # explicit variable and feature lists
    explicit_dtf = DatetimeFeatures(
        variables=["date_obj1", "time_obj"],
        features_to_extract=["year", "quarter_end", "second"],
    )
    explicit_dtf.fit(df_datetime)
    assert explicit_dtf.variables_ == ["date_obj1", "time_obj"]
    assert explicit_dtf.features_to_extract_ == ["year", "quarter_end", "second"]
import pytest

from feature_engine.datetime import DatetimeFeatures
from tests.estimator_checks.estimator_checks import check_feature_engine_estimator

_estimators = [DatetimeFeatures()]


@pytest.mark.parametrize("estimator", _estimators)
def test_check_estimator_from_feature_engine(estimator):
    """Run the shared feature_engine estimator checks on each listed estimator."""
    # No ``return`` here: pytest expects test functions to return None and emits
    # PytestReturnNotNoneWarning otherwise. Failures surface through exceptions
    # raised inside the check itself.
    check_feature_engine_estimator(estimator)
def test_raises_error_when_df_has_nan():
    """``fit`` raises ValueError for the NaN-containing datasets."""
    # dataset containing nans
    column_based = DatetimeFeatures()
    with pytest.raises(ValueError):
        column_based.fit(dates_nan)

    index_based = DatetimeFeatures(variables="index")
    with pytest.raises(ValueError):
        index_based.fit(dates_idx_nan)