def test_datetime_featurizer_woodwork_custom_overrides_returned_by_components(with_datetime_col, encode_as_categories, X_df):
    """Check that Woodwork logical-type overrides on column ``0`` survive the featurizer.

    Column ``0`` of ``X_df`` is overridden with each candidate logical type in
    turn. When the override is ``Datetime`` the column is expected to be
    expanded into ``0_year``/``0_month``/``0_day_of_week``/``0_hour``;
    otherwise its logical type must come back unchanged. When
    ``with_datetime_col`` is set, an additional real datetime column is added
    and its expanded columns are checked as well.

    NOTE(review): ``with_datetime_col``, ``encode_as_categories`` and ``X_df``
    are presumably supplied by pytest parametrization/fixtures defined outside
    this view - confirm.
    """
    # Month and day-of-week are the only extracted features whose expected
    # logical type depends on encode_as_categories; year and hour stay Integer.
    encoded_type = Categorical if encode_as_categories else Integer
    expected_datetime_col_types = {
        'datetime col_year': Integer,
        'datetime col_month': encoded_type,
        'datetime col_day_of_week': encoded_type,
        'datetime col_hour': Integer,
    }
    expected_overridden_col_types = {
        '0_year': Integer,
        '0_month': encoded_type,
        '0_day_of_week': encoded_type,
        '0_hour': Integer,
    }

    if with_datetime_col:
        X_df['datetime col'] = pd.to_datetime(['20200101', '20200519', '20190607'], format='%Y%m%d')

    for logical_type in [Integer, Double, Categorical, NaturalLanguage, Datetime]:
        try:
            X = ww.DataTable(X_df.copy(), logical_types={0: logical_type})
        except TypeError:
            # Overrides the DataTable constructor rejects are skipped, not failed.
            continue

        featurizer = DateTimeFeaturizer(encode_as_categories=encode_as_categories)
        featurizer.fit(X)
        transformed = featurizer.transform(X)
        assert isinstance(transformed, ww.DataTable)

        if with_datetime_col:
            assert all(item in transformed.logical_types.items() for item in expected_datetime_col_types.items())

        if logical_type == Datetime:
            assert all(item in transformed.logical_types.items() for item in expected_overridden_col_types.items())
        else:
            assert transformed.logical_types[0] == logical_type
def test_datetime_featurizer_fit_transform():
    """Check ``fit_transform`` when only the ``year`` feature is requested.

    The two datetime columns must be replaced by ``<name>_year`` columns
    placed after the untouched numerical columns, and no categorical feature
    names should be recorded.
    """
    n_rows = 20
    featurizer = DateTimeFeaturizer(features_to_extract=["year"])
    frame = pd.DataFrame({
        'Numerical 1': range(n_rows),
        'Date Col 1': pd.date_range('2020-05-19', periods=n_rows, freq='D'),
        'Date Col 2': pd.date_range('2020-02-03', periods=n_rows, freq='W'),
        'Numerical 2': [0] * n_rows,
    })

    output = featurizer.fit_transform(frame)

    expected_columns = ['Numerical 1', 'Numerical 2', 'Date Col 1_year', 'Date Col 2_year']
    assert list(output.columns) == expected_columns
    # Every generated date in both ranges falls in 2020.
    for year_column in ("Date Col 1_year", "Date Col 2_year"):
        assert output[year_column].equals(pd.Series([2020] * n_rows))
    assert featurizer.get_feature_names() == {}
def test_datetime_featurizer_init():
    """Check the default and custom ``parameters`` and rejection of unknown features."""
    default_featurizer = DateTimeFeaturizer()
    expected_defaults = {
        "features_to_extract": ["year", "month", "day_of_week", "hour"],
        "encode_as_categories": False,
    }
    assert default_featurizer.parameters == expected_defaults

    custom_featurizer = DateTimeFeaturizer(features_to_extract=["year", "month"], encode_as_categories=True)
    expected_custom = {
        "features_to_extract": ["year", "month"],
        "encode_as_categories": True,
    }
    assert custom_featurizer.parameters == expected_custom

    # Unknown feature names must be rejected at construction time.
    with pytest.raises(ValueError, match="not valid options for features_to_extract"):
        DateTimeFeaturizer(features_to_extract=["invalid", "parameters"])
def test_datetime_featurizer_numpy_array_input():
    """Check that a ``datetime64`` numpy array is accepted and featurized as column ``0``."""
    featurizer = DateTimeFeaturizer()
    dates = np.array([['2007-02-03'], ['2016-06-07'], ['2020-05-19']], dtype='datetime64')

    featurizer.fit(dates)
    output_columns = list(featurizer.transform(dates).columns)
    assert output_columns == ["0_year", "0_month", "0_day_of_week", "0_hour"]

    expected_feature_names = {
        '0_month': {'February': 1, 'June': 5, 'May': 4},
        '0_day_of_week': {'Saturday': 6, 'Tuesday': 2},
    }
    assert featurizer.get_feature_names() == expected_feature_names
def test_datetime_featurizer_custom_features_to_extract():
    """Check that output columns follow the order given in ``features_to_extract``."""
    featurizer = DateTimeFeaturizer(features_to_extract=["month", "year"])
    dates = pd.date_range('2020-02-24', periods=20, freq='D')
    frame = pd.DataFrame({"date col": dates, "numerical": [0] * len(dates)})

    featurizer.fit(frame)
    output_columns = list(featurizer.transform(frame).columns)
    assert output_columns == ["numerical", "date col_month", "date col_year"]

    # Only the month feature is expected to carry a name-to-code mapping here.
    expected_feature_names = {"date col_month": {"February": 1, "March": 2}}
    assert featurizer.get_feature_names() == expected_feature_names
def test_datetime_featurizer_no_features_to_extract():
    """Check that an empty ``features_to_extract`` leaves the input unchanged.

    NOTE(review): a function with this exact name is defined again later in
    this file. Python keeps only the last binding, so this earlier definition
    is shadowed and will not be collected by pytest - confirm which variant
    is intended and remove or rename the other.
    """
    featurizer = DateTimeFeaturizer(features_to_extract=[])
    dates = pd.date_range('2020-02-24', periods=20, freq='D')
    frame = pd.DataFrame({"date col": dates, "numerical": [0] * len(dates)})

    featurizer.fit(frame)
    output = featurizer.transform(frame)

    assert output.equals(frame)
    assert featurizer.get_feature_names() == {}
def test_datetime_featurizer_no_datetime_cols():
    """Check that input without datetime columns passes through as ``Int64`` data.

    NOTE(review): a function with this exact name is defined again later in
    this file. Python keeps only the last binding, so this earlier definition
    is shadowed and will not be collected by pytest - confirm which variant
    is intended and remove or rename the other.
    """
    featurizer = DateTimeFeaturizer(features_to_extract=["year", "month"])
    frame = pd.DataFrame([[1, 3, 4], [2, 5, 2]])
    # The expected frame holds the same values cast to the nullable Int64 dtype.
    expected_frame = frame.astype("Int64")

    featurizer.fit(frame)
    output_frame = featurizer.transform(frame).to_dataframe()

    assert_frame_equal(expected_frame, output_frame)
    assert featurizer.get_feature_names() == {}
def test_datetime_featurizer_no_col_names():
    """Check featurization of a frame whose only column has the default label ``0``."""
    featurizer = DateTimeFeaturizer()
    dates = pd.Series(pd.date_range('2020-02-24', periods=10, freq='D'))
    frame = pd.DataFrame(dates)

    featurizer.fit(frame)
    output_columns = list(featurizer.transform(frame).columns)
    assert output_columns == ['0_year', '0_month', '0_day_of_week', '0_hour']

    expected_feature_names = {
        '0_month': {'February': 1, 'March': 2},
        '0_day_of_week': {
            'Monday': 1,
            'Tuesday': 2,
            'Wednesday': 3,
            'Thursday': 4,
            'Friday': 5,
            'Saturday': 6,
            'Sunday': 0,
        },
    }
    assert featurizer.get_feature_names() == expected_feature_names
def test_datetime_featurizer_no_features_to_extract():
    """Check that an empty ``features_to_extract`` adds no columns.

    The transformed output is converted with ``to_dataframe()`` and compared
    to the input with its ``numerical`` column cast to ``Int64``.

    NOTE(review): an earlier function in this file has the same name; this
    later definition shadows it, so only this variant is collected by
    pytest - confirm the earlier one can be removed or renamed.
    """
    featurizer = DateTimeFeaturizer(features_to_extract=[])
    dates = pd.date_range('2020-02-24', periods=20, freq='D')
    frame = pd.DataFrame({"date col": dates, "numerical": [0] * len(dates)})
    # Only the integer column changes dtype; the datetime column is kept as-is.
    expected_frame = frame.astype({"numerical": "Int64"})

    featurizer.fit(frame)
    output_frame = featurizer.transform(frame).to_dataframe()

    assert_frame_equal(expected_frame, output_frame)
    assert featurizer.get_feature_names() == {}
def test_describe_component():
    """Check ``describe(return_dict=True)`` for every transformer and estimator.

    Each case pairs a constructed component with the display name and the
    parameter dict it is expected to report. All components of a group are
    constructed before any of them is checked. Components backed by optional
    third-party libraries are checked inside ``try``/``except ImportError``
    so that a missing dependency skips them instead of failing the test.
    """
    def assert_descriptions(cases):
        # Compare each component's description against its expected name/parameters.
        for component, expected_name, expected_parameters in cases:
            assert component.describe(return_dict=True) == {'name': expected_name, 'parameters': expected_parameters}

    select_from_model_parameters = {'number_features': 5, 'n_estimators': 10, 'max_depth': None, 'percent_features': 0.3, 'threshold': -np.inf, 'n_jobs': -1}
    transformer_cases = [
        (OneHotEncoder(), 'One Hot Encoder',
         {'top_n': 10, 'features_to_encode': None, 'categories': None, 'drop': 'if_binary', 'handle_unknown': 'ignore', 'handle_missing': 'error'}),
        (Imputer(), 'Imputer',
         {'categorical_impute_strategy': "most_frequent", 'categorical_fill_value': None, 'numeric_impute_strategy': "mean", 'numeric_fill_value': None}),
        (SimpleImputer("mean"), 'Simple Imputer',
         {'impute_strategy': 'mean', 'fill_value': None}),
        (PerColumnImputer({"a": "mean", "b": ("constant", 100)}), 'Per Column Imputer',
         {'impute_strategies': {'a': 'mean', 'b': ('constant', 100)}, 'default_impute_strategy': 'most_frequent'}),
        (StandardScaler(), 'Standard Scaler', {}),
        (RFClassifierSelectFromModel(n_estimators=10, number_features=5, percent_features=0.3, threshold=-np.inf),
         'RF Classifier Select From Model', dict(select_from_model_parameters)),
        (RFRegressorSelectFromModel(n_estimators=10, number_features=5, percent_features=0.3, threshold=-np.inf),
         'RF Regressor Select From Model', dict(select_from_model_parameters)),
        (DropColumns(columns=['col_one', 'col_two']), 'Drop Columns Transformer',
         {'columns': ['col_one', 'col_two']}),
        (DropNullColumns(), 'Drop Null Columns Transformer',
         {'pct_null_threshold': 1.0}),
        (DateTimeFeaturizer(), 'DateTime Featurization Component',
         {'features_to_extract': ['year', 'month', 'day_of_week', 'hour'], 'encode_as_categories': False}),
        (TextFeaturizer(), 'Text Featurization Component', {}),
        (LSA(), 'LSA Transformer', {}),
        (PCA(), 'PCA Transformer',
         {'n_components': None, 'variance': 0.95}),
        (LinearDiscriminantAnalysis(), 'Linear Discriminant Analysis Transformer',
         {'n_components': None}),
        (DFSTransformer(), 'DFS Transformer',
         {"index": "index"}),
        (Undersampler(), 'Undersampler',
         {"balanced_ratio": 4, "min_samples": 100, "min_percentage": 0.1}),
    ]
    assert_descriptions(transformer_cases)

    # testing estimators
    extra_trees_parameters = {'n_estimators': 10, 'max_features': 'auto', 'max_depth': 6, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_jobs': -1}
    random_forest_parameters = {'n_estimators': 10, 'max_depth': 3, 'n_jobs': -1}
    estimator_cases = [
        (BaselineClassifier(), 'Baseline Classifier',
         {'strategy': 'mode'}),
        (BaselineRegressor(), 'Baseline Regressor',
         {'strategy': 'mean'}),
        (LogisticRegressionClassifier(), 'Logistic Regression Classifier',
         {'penalty': 'l2', 'C': 1.0, 'n_jobs': -1, 'multi_class': 'auto', 'solver': 'lbfgs'}),
        (ElasticNetClassifier(), 'Elastic Net Classifier',
         {'alpha': 0.5, 'l1_ratio': 0.5, 'n_jobs': -1, 'max_iter': 1000, "loss": 'log', 'penalty': 'elasticnet'}),
        (ElasticNetRegressor(), 'Elastic Net Regressor',
         {'alpha': 0.5, 'l1_ratio': 0.5, 'max_iter': 1000, 'normalize': False}),
        (ExtraTreesClassifier(n_estimators=10, max_features="auto"), 'Extra Trees Classifier',
         dict(extra_trees_parameters)),
        (ExtraTreesRegressor(n_estimators=10, max_features="auto"), 'Extra Trees Regressor',
         dict(extra_trees_parameters)),
        (RandomForestClassifier(n_estimators=10, max_depth=3), 'Random Forest Classifier',
         dict(random_forest_parameters)),
        (RandomForestRegressor(n_estimators=10, max_depth=3), 'Random Forest Regressor',
         dict(random_forest_parameters)),
        (LinearRegressor(), 'Linear Regressor',
         {'fit_intercept': True, 'normalize': False, 'n_jobs': -1}),
        (SVMClassifier(), 'SVM Classifier',
         {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale', 'probability': True}),
        (SVMRegressor(), 'SVM Regressor',
         {'C': 1.0, 'kernel': 'rbf', 'gamma': 'scale'}),
    ]
    assert_descriptions(estimator_cases)

    try:
        xgboost_parameters = {'eta': 0.1, 'max_depth': 3, 'min_child_weight': 1, 'n_estimators': 75}
        xgboost_cases = [
            (XGBoostClassifier(eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75), 'XGBoost Classifier',
             dict(xgboost_parameters)),
            (XGBoostRegressor(eta=0.1, min_child_weight=1, max_depth=3, n_estimators=75), 'XGBoost Regressor',
             dict(xgboost_parameters)),
        ]
        assert_descriptions(xgboost_cases)
    except ImportError:
        pass

    try:
        catboost_cases = [
            (CatBoostClassifier(), 'CatBoost Classifier',
             {'allow_writing_files': False, 'n_estimators': 10, 'eta': 0.03, 'max_depth': 6, 'bootstrap_type': None, 'silent': True}),
            (CatBoostRegressor(), 'CatBoost Regressor',
             {'allow_writing_files': False, 'n_estimators': 10, 'eta': 0.03, 'max_depth': 6, 'bootstrap_type': None, 'silent': False}),
        ]
        assert_descriptions(catboost_cases)
    except ImportError:
        pass

    try:
        lightgbm_cases = [
            (LightGBMClassifier(), 'LightGBM Classifier',
             {'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 100, 'max_depth': 0, 'num_leaves': 31, 'min_child_samples': 20, 'n_jobs': -1, 'bagging_fraction': 0.9, 'bagging_freq': 0}),
            (LightGBMRegressor(), 'LightGBM Regressor',
             {'boosting_type': 'gbdt', 'learning_rate': 0.1, 'n_estimators': 20, 'max_depth': 0, 'num_leaves': 31, 'min_child_samples': 20, 'n_jobs': -1, 'bagging_fraction': 0.9, 'bagging_freq': 0}),
        ]
        assert_descriptions(lightgbm_cases)
    except ImportError:
        pass
def test_datetime_featurizer_encodes_as_ints():
    """Check the integer encoding of extracted features and the reported name mappings.

    Covers four scenarios in sequence: the default featurizer, the same data
    with ``encode_as_categories=True``, re-fitting the first featurizer on new
    data, and a featurizer that extracts only ``year`` and ``hour``.
    """
    timestamps = pd.DataFrame({"date": ["2016-04-10 16:10:09", "2017-03-15 13:32:05", "2018-07-10 07:15:10",
                                        "2019-08-19 20:20:20", "2020-01-03 06:45:12"]})
    int_featurizer = DateTimeFeaturizer()
    output = int_featurizer.fit_transform(timestamps)
    expected = pd.DataFrame({
        "date_year": [2016, 2017, 2018, 2019, 2020],
        "date_month": [3, 2, 6, 7, 0],
        "date_day_of_week": [0, 3, 2, 1, 5],
        "date_hour": [16, 13, 7, 20, 6],
    })
    expected_names = {
        'date_month': {'April': 3, 'March': 2, 'July': 6, 'August': 7, 'January': 0},
        'date_day_of_week': {'Sunday': 0, 'Wednesday': 3, 'Tuesday': 2, 'Monday': 1, 'Friday': 5},
    }
    pd.testing.assert_frame_equal(output, expected)
    assert int_featurizer.get_feature_names() == expected_names

    # Test that changing encode_as_categories to True only changes the dtypes but not the values
    category_featurizer = DateTimeFeaturizer(encode_as_categories=True)
    output = category_featurizer.fit_transform(timestamps)
    expected = expected.astype({"date_day_of_week": "category", "date_month": "category"})
    pd.testing.assert_frame_equal(output, expected)
    assert category_featurizer.get_feature_names() == expected_names

    # Test that sequential calls to the same DateTimeFeaturizer work as expected by using the first dt we defined
    dates_only = pd.DataFrame({"date": ["2020-04-10", "2017-03-15", "2019-08-19"]})
    output = int_featurizer.fit_transform(dates_only)
    expected = pd.DataFrame({
        "date_year": [2020, 2017, 2019],
        "date_month": [3, 2, 7],
        "date_day_of_week": [5, 3, 1],
        "date_hour": [0, 0, 0],
    })
    pd.testing.assert_frame_equal(output, expected)
    expected_refit_names = {
        'date_month': {'April': 3, 'March': 2, 'August': 7},
        'date_day_of_week': {'Friday': 5, 'Wednesday': 3, 'Monday': 1},
    }
    assert int_featurizer.get_feature_names() == expected_refit_names

    # With only year and hour extracted, no name mapping is expected.
    year_hour_featurizer = DateTimeFeaturizer(features_to_extract=["year", "hour"])
    year_hour_featurizer.fit_transform(dates_only)
    assert year_hour_featurizer.get_feature_names() == {}
def test_datetime_featurizer_no_datetime_cols():
    """Check that input without datetime columns is returned unchanged.

    NOTE(review): an earlier function in this file has the same name; this
    later definition shadows it, so only this variant is collected by
    pytest - confirm the earlier one can be removed or renamed.
    """
    featurizer = DateTimeFeaturizer(features_to_extract=["year", "month"])
    frame = pd.DataFrame([[1, 3, 4], [2, 5, 2]])

    featurizer.fit(frame)
    output = featurizer.transform(frame)

    assert output.equals(frame)
    assert featurizer.get_feature_names() == {}