Esempio n. 1
0
def test_add_time_features_df():
    """Checks ``add_time_features_df`` on two different inputs.

    1. A frame of 100 hourly timestamps that are used both as the time
       column and as the index.
    2. The ``train_df`` returned by ``generate_df_with_reg_for_tests``,
       restricted to the time column and three regressor columns.

    In both cases the ``year`` value looked up with key ``0`` and the row
    count of the output are verified; the second case also checks the first
    three ``dow_hr`` values.
    """
    origin_year = 2018

    # Case 1: timestamps serve as both column and index.
    first_stamp = datetime.datetime(2019, 1, 1)
    stamps = pd.date_range(start=first_stamp, periods=100, freq="H").tolist()
    indexed_input = pd.DataFrame({TIME_COL: stamps}, index=stamps)

    featured = add_time_features_df(
        df=indexed_input,
        time_col=TIME_COL,
        conti_year_origin=origin_year)
    assert featured["year"][0] == 2019
    assert len(featured) == len(indexed_input)

    # Case 2: generated hourly data that also carries regressors.
    generated = generate_df_with_reg_for_tests(
        freq="H",
        periods=24 * 500,
        train_start_date=datetime.datetime(2018, 7, 1),
        conti_year_origin=origin_year)
    selected_cols = [
        TIME_COL,
        "regressor1",
        "regressor_bool",
        "regressor_categ",
    ]
    train_df = generated["train_df"]
    featured = add_time_features_df(
        df=train_df[selected_cols],
        time_col=TIME_COL,
        conti_year_origin=origin_year)
    assert featured["year"][0] == 2018
    leading_dow_hr = featured["dow_hr"][:3]
    assert (leading_dow_hr == ["7_00", "7_01", "7_02"]).all()
    assert len(featured) == len(train_df)
Esempio n. 2
0
def test_add_daily_events():
    """Checks that ``add_daily_events`` adds per-country event columns.

    Also confirms that the second column name of every holiday frame in the
    events dictionary is the same before and after the call, i.e. the input
    is not modified.
    """
    country_list = ["US", "India", "UK"]
    holidays_by_country = get_holidays(
        country_list,
        year_start=2015,
        year_end=2025)

    def second_column_names():
        # Second column name of each country's event frame, in country order.
        names = []
        for country in country_list:
            names.append(holidays_by_country[country].columns[1])
        return names

    names_before = second_column_names()

    # Temporal data: 100 hourly timestamps starting at 2019-01-01.
    hourly_stamps = pd.date_range(
        start=dt(2019, 1, 1),
        periods=100,
        freq="H").tolist()
    raw_df = pd.DataFrame({"ts": hourly_stamps})
    featured_df = add_time_features_df(
        raw_df,
        time_col="ts",
        conti_year_origin=2018)

    # NOTE(review): "date" is presumably a column produced by
    # add_time_features_df -- confirm against that function.
    merged = add_daily_events(
        df=featured_df,
        event_df_dict=holidays_by_country,
        date_col="date")

    india_events = merged[f"{EVENT_PREFIX}_India"].values
    assert india_events[0] == "New Year's Day"
    us_events = merged[f"{EVENT_PREFIX}_US"].values
    assert us_events[25] == ""

    # The events dictionary must come out of the call unchanged.
    assert names_before == second_column_names()
Esempio n. 3
0
def test_get_custom_changepoints():
    """Tests get_custom_changepoints_values and get_changepoint_features.

    Scenarios, in order:

    * daily data, requested dates given as timestamps;
    * daily data, requested dates given as strings matched on ``"date"``;
    * a continuous time column (``"ct1"``) different from the time column;
    * changepoint features generated from the ``"ct1"`` changepoints;
    * a request with no matching date (result is ``None``);
    * hourly data with changepoints requested at daily level.
    """
    time_col = "custom_time_col"
    requested = ("2018-01-01", "2019-01-02-16", "2019-01-03", "2019-02-01")

    # ---------------- daily data ----------------
    daily_stamps = pd.date_range(
        start=dt(2019, 1, 1),
        periods=20,
        freq="D").tolist()
    daily_input = pd.DataFrame({time_col: daily_stamps})
    daily_df = add_time_features_df(
        daily_input,
        time_col=time_col,
        conti_year_origin=2018)
    expected_daily = pd.to_datetime(["2019-01-01", "2019-01-03"])

    # Requested dates as datetime.
    found_from_timestamps = get_custom_changepoints_values(
        df=daily_df,
        changepoint_dates=pd.to_datetime(list(requested)),
        time_col=time_col,  # pd.Timestamp type
        continuous_time_col=time_col  # makes checking the result easier
    )
    # 2018-01-01 is mapped to 2019-01-01. Duplicates mapped to "2019-01-03"
    # are merged. The last requested changepoint is not found.
    assert np.all(found_from_timestamps == expected_daily)

    # Requested dates as strings.
    string_dates = list(requested)
    found_from_strings = get_custom_changepoints_values(
        df=daily_df,
        changepoint_dates=string_dates,
        time_col="date",  # datetime.date type
        continuous_time_col=time_col)
    assert np.all(found_from_strings == expected_daily)

    # continuous_time_col different from time_col;
    # checked against the timestamps found in the previous call.
    ct1_changepoints = get_custom_changepoints_values(
        df=daily_df,
        changepoint_dates=string_dates,
        time_col=time_col,
        continuous_time_col="ct1")
    matched_rows = daily_df[time_col].isin(found_from_strings)
    expected_ct1 = daily_df[matched_rows]["ct1"].values
    assert np.all(ct1_changepoints == expected_ct1)

    # Generated features, using the "ct1" changepoints from above.
    feature_df = get_changepoint_features(
        daily_df,
        ct1_changepoints,
        continuous_time_col="ct1",
        growth_func=lambda x: x,
    )
    assert feature_df.shape == (daily_df.shape[0], len(found_from_strings))
    assert round(feature_df.iloc[15, 0], 3) == 0.041
    assert round(feature_df.iloc[16, 1], 3) == 0.038

    # No matching dates.
    unmatched = get_custom_changepoints_values(
        df=daily_df,
        changepoint_dates=["2019-02-01"],
        time_col=time_col,
        continuous_time_col=time_col)
    assert unmatched is None

    # ------- hourly data, changepoints provided at daily level -------
    hourly_stamps = pd.date_range(
        start=dt(2019, 1, 1),
        periods=20 * 24,
        freq="H").tolist()
    hourly_input = pd.DataFrame({time_col: hourly_stamps})
    hourly_df = add_time_features_df(
        hourly_input,
        time_col=time_col,
        conti_year_origin=2018)

    # Requested dates as datetime.
    found_hourly = get_custom_changepoints_values(
        df=hourly_df,
        changepoint_dates=pd.to_datetime(list(requested)),
        time_col=time_col,  # pd.Timestamp type
        continuous_time_col=time_col  # makes checking the result easier
    )
    # 2018-01-01 is mapped to 2019-01-01-00. Mapped to -00 if no hour is
    # provided. The last requested changepoint is not found.
    expected_hourly = pd.to_datetime(
        ["2019-01-01-00", "2019-01-02-16", "2019-01-03-00"])
    assert np.all(found_hourly == expected_hourly)
Esempio n. 4
0
def test_get_evenly_spaced_changepoint_values():
    """Tests get_evenly_spaced_changepoints_values with get_changepoint_features.

    Three scenarios are covered:

    * default growth (``growth_func=None``) on a 10-row yearly frame, compared
      against the expected linear feature columns;
    * quadratic growth (``x**2``) on the same frame;
    * an hourly frame built via ``add_time_features_df``, where the shape and
      two individual feature values are checked.
    """
    df = pd.DataFrame({
        "time_col":
        np.arange(1, 11),
        # "YS" is the year-start alias; the equivalent "AS" spelling is
        # deprecated in recent pandas releases.
        "ts":
        pd.date_range(start="2020-01-01", periods=10, freq="YS")
    })
    changepoints = get_evenly_spaced_changepoints_values(df,
                                                         "time_col",
                                                         n_changepoints=3)
    changepoint_dates = get_changepoint_dates_from_changepoints_dict(
        changepoints_dict={
            "method": "uniform",
            "n_changepoints": 3
        },
        df=df,
        time_col="ts")

    # linear growth
    changepoint_df = get_changepoint_features(
        df,
        changepoints,
        continuous_time_col="time_col",
        growth_func=None,
        changepoint_dates=changepoint_dates)
    expected = pd.DataFrame({
        "changepoint0_2022_01_01_00": [0, 0, 0, 1, 2, 3, 4, 5, 6, 7],
        "changepoint1_2025_01_01_00": [0, 0, 0, 0, 0, 0, 1, 2, 3, 4],
        "changepoint2_2027_01_01_00": [0, 0, 0, 0, 0, 0, 0, 0, 1, 2]
    })
    assert changepoint_df.equals(expected)

    # quadratic growth
    changepoint_df = get_changepoint_features(
        df,
        changepoints,
        continuous_time_col="time_col",
        growth_func=lambda x: x**2,
        changepoint_dates=changepoint_dates)
    expected = pd.DataFrame({
        "changepoint0_2022_01_01_00": [0, 0, 0, 1, 4, 9, 16, 25, 36, 49],
        "changepoint1_2025_01_01_00": [0, 0, 0, 0, 0, 0, 1, 4, 9, 16],
        "changepoint2_2027_01_01_00": [0, 0, 0, 0, 0, 0, 0, 0, 1, 4]
    })
    assert changepoint_df.equals(expected)

    # real example
    n_changepoints = 3
    date_list = pd.date_range(start=dt(2019, 1, 1), periods=20,
                              freq="H").tolist()

    df0 = pd.DataFrame({"ts": date_list})
    df = add_time_features_df(df0, time_col="ts", conti_year_origin=2018)
    changepoints = get_evenly_spaced_changepoints_values(
        df, "ct1", n_changepoints=n_changepoints)
    changepoint_dates = get_changepoint_dates_from_changepoints_dict(
        changepoints_dict={
            "method": "uniform",
            "n_changepoints": n_changepoints
        },
        df=df,
        time_col="ts")
    changepoint_df = get_changepoint_features(
        df,
        changepoints,
        continuous_time_col="ct1",
        growth_func=lambda x: x,
        changepoint_dates=changepoint_dates)
    assert changepoint_df.shape == (df.shape[0], n_changepoints)
    assert changepoint_df.iloc[15, 2] == 0.0
    # Compare the non-trivial float with a tight relative tolerance rather
    # than bit-for-bit: exact equality on a computed float can break when the
    # order of floating-point operations changes (library version, platform).
    assert np.isclose(
        changepoint_df.iloc[16, 2],
        0.00011415525113989133,
        rtol=1e-9,
        atol=0.0)