Example #1
0
def test_datetime_featurizer_fit_transform():
    """Check fit_transform output when only the "year" feature is extracted.

    The two datetime columns are expected to be replaced by ``<name>_year``
    columns placed after the untouched numerical columns, and no feature-name
    mapping is expected from the featurizer.
    """
    n_rows = 20
    X = pd.DataFrame({
        'Numerical 1': range(n_rows),
        'Date Col 1': pd.date_range('2020-05-19', periods=n_rows, freq='D'),
        'Date Col 2': pd.date_range('2020-02-03', periods=n_rows, freq='W'),
        'Numerical 2': [0] * n_rows,
    })
    featurizer = DateTimeFeaturizer(features_to_extract=["year"])

    transformed = featurizer.fit_transform(X)

    expected_columns = ['Numerical 1', 'Numerical 2', 'Date Col 1_year', 'Date Col 2_year']
    assert list(transformed.columns) == expected_columns

    # Every generated date falls inside 2020, so both year columns are constant.
    expected_years = pd.Series([2020] * n_rows)
    for year_column in ("Date Col 1_year", "Date Col 2_year"):
        assert transformed[year_column].equals(expected_years)

    assert featurizer.get_feature_names() == {}
def test_datetime_featurizer_encodes_as_ints():
    """Check the integer encoding of extracted datetime features.

    Covers four scenarios: the default featurizer, the same data with
    ``encode_as_categories=True``, re-fitting an already fitted featurizer on
    new data, and a featurizer restricted to "year" and "hour".
    """
    def nullable_ints(values):
        # Shorthand for the nullable-integer columns of the expected frames.
        return pd.Series(values, dtype="Int64")

    timestamps = [
        "2016-04-10 16:10:09",
        "2017-03-15 13:32:05",
        "2018-07-10 07:15:10",
        "2019-08-19 20:20:20",
        "2020-01-03 06:45:12",
    ]
    X = pd.DataFrame({"date": timestamps})
    int_featurizer = DateTimeFeaturizer()
    transformed = int_featurizer.fit_transform(X)

    expected = pd.DataFrame({
        "date_year": nullable_ints([2016, 2017, 2018, 2019, 2020]),
        "date_month": nullable_ints([3, 2, 6, 7, 0]),
        "date_day_of_week": nullable_ints([0, 3, 2, 1, 5]),
        "date_hour": nullable_ints([16, 13, 7, 20, 6]),
    })
    expected_names = {
        'date_month': {'April': 3, 'March': 2, 'July': 6, 'August': 7, 'January': 0},
        'date_day_of_week': {'Sunday': 0, 'Wednesday': 3, 'Tuesday': 2, 'Monday': 1, 'Friday': 5},
    }
    assert_frame_equal(expected, transformed.to_dataframe())
    assert int_featurizer.get_feature_names() == expected_names

    # With encode_as_categories=True the month / day-of-week columns become
    # categorical; the encoded values and the name mapping must stay the same.
    categorical_featurizer = DateTimeFeaturizer(encode_as_categories=True)
    transformed = categorical_featurizer.fit_transform(X)
    expected["date_month"] = pd.Categorical([3, 2, 6, 7, 0])
    expected["date_day_of_week"] = pd.Categorical([0, 3, 2, 1, 5])

    assert_frame_equal(expected, transformed.to_dataframe())
    assert categorical_featurizer.get_feature_names() == expected_names

    # Re-fit the first featurizer on different data: its output and name
    # mapping must reflect only the new data.
    X = pd.DataFrame({"date": ["2020-04-10", "2017-03-15", "2019-08-19"]})
    transformed = int_featurizer.fit_transform(X)
    expected = pd.DataFrame({
        "date_year": nullable_ints([2020, 2017, 2019]),
        "date_month": nullable_ints([3, 2, 7]),
        "date_day_of_week": nullable_ints([5, 3, 1]),
        "date_hour": nullable_ints([0, 0, 0]),
    })
    assert_frame_equal(expected, transformed.to_dataframe())
    refit_names = {
        'date_month': {'April': 3, 'March': 2, 'August': 7},
        'date_day_of_week': {'Friday': 5, 'Wednesday': 3, 'Monday': 1},
    }
    assert int_featurizer.get_feature_names() == refit_names

    # Extracting only "year" and "hour" is expected to yield no name mapping.
    year_hour_featurizer = DateTimeFeaturizer(features_to_extract=["year", "hour"])
    year_hour_featurizer.fit_transform(X)
    assert year_hour_featurizer.get_feature_names() == {}