def cols_interact(static_col, fs_name, fs_order, fs_seas_name=None):
    """Builds the interaction terms between static_col and a fourier series.

    For every order k = 1, ..., fs_order two terms are produced, the sin
    interaction followed by the cos interaction, each formatted as
    "<static_col>:<fourier column name>".

    :param static_col:
        column to interact with fourier series. can be an arbitrary patsy model term
        e.g. "ct1", "C(woy)", "is_weekend:Q('events_Christmas Day')"
    :param fs_name:
        column the fourier series is generated from, same as col_name in fourier_series_fcn
    :param fs_order: int
        generate interactions up to this order. must be <= order in fourier_series_fcn
    :param fs_seas_name: str
        same as seas_name in fourier_series_fcn
    :return: list[str]
        interaction terms to include in patsy model formula,
        ordered as sin1, cos1, sin2, cos2, ...
    """
    interaction_columns = []
    for order in range(1, fs_order + 1):
        # sin first, then cos, for each order
        for function_name in ("sin", "cos"):
            fourier_col = get_fourier_col_name(order,
                                               fs_name,
                                               function_name=function_name,
                                               seas_name=fs_seas_name)
            interaction_columns.append(f"{static_col}:{fourier_col}")
    return interaction_columns
Esempio n. 2
0
def generate_df_with_holidays(freq, periods):
    """Generates a test dataset whose values include holiday effects.

    Starts from the output of generate_df_for_tests (keeping the extra
    columns), labels US and India holidays, and adds holiday-by-time-of-day
    fourier terms to VALUE_COL. The result is split at 2019-08-01.

    :param freq:
        passed through to generate_df_for_tests
    :param periods:
        passed through to generate_df_for_tests
    :return: dict
        "df": dataframe with TIME_COL and VALUE_COL only
        "train_df": rows with TIME_COL <= 2019-08-01
        "test_df": rows with TIME_COL > 2019-08-01
        "fut_time_num": number of rows in test_df
    """
    # base data, including the time feature and fourier columns
    df = generate_df_for_tests(freq, periods, remove_extra_cols=False)["df"]

    # holiday tables, one per country, each labelled "<country>_holiday"
    countries = ["US", "India"]
    event_df_dict = get_holidays(countries, year_start=2015, year_end=2025)
    for country in countries:
        event_df_dict[country][EVENT_DF_LABEL_COL] = country + "_holiday"

    df = add_daily_events(
        df=df,
        event_df_dict=event_df_dict,
        date_col=TIME_COL,
        regular_day_label="")

    # holiday indicators and the first-order time-of-day fourier terms
    is_us_holiday = df["events_US"] == "US_holiday"
    sin_tod = df[get_fourier_col_name(1, "tod", function_name="sin")]
    us_cos_tod = df[get_fourier_col_name(1, "tod", function_name="cos")]
    is_india_holiday = df["events_India"] == "India_holiday"
    india_cos_tod = df[get_fourier_col_name(1, "tod", function_name="cos")]

    # terms are added in this fixed order to keep the float arithmetic stable
    df[VALUE_COL] = (
        df[VALUE_COL]
        + 2 * is_us_holiday * sin_tod
        + 3 * is_us_holiday * us_cos_tod
        + 4 * is_india_holiday * india_cos_tod)

    df = df[[TIME_COL, VALUE_COL]]

    split_time = datetime.datetime(2019, 8, 1)
    train_df = df[df[TIME_COL] <= split_time]
    test_df = df[df[TIME_COL] > split_time]

    return {
        "df": df,
        "train_df": train_df,
        "test_df": test_df,
        "fut_time_num": test_df.shape[0],
    }
Esempio n. 3
0
def test_get_fourier_col_name():
    """Tests get_fourier_col_name with and without a seasonality name"""
    # (positional args, keyword args, expected column name)
    cases = [
        ((2, "dow"), {"function_name": "sin"}, "sin2_dow"),
        ((4.0, "dow"), {"function_name": "cos"}, "cos4_dow"),
        ((2, "dow"),
         {"function_name": "sin", "seas_name": "weekly"},
         "sin2_dow_weekly"),
        ((4.0, "dow"),
         {"function_name": "cos", "seas_name": "weekly"},
         "cos4_dow_weekly"),
    ]
    for args, kwargs, expected in cases:
        assert get_fourier_col_name(*args, **kwargs) == expected
Esempio n. 4
0
def test_fourier_series_multi_fcn():
    """Tests fourier_series_multi_fcn"""
    input_df = pd.DataFrame({
        "x": np.linspace(2.0, 3.0, num=100),
        "y": np.linspace(3.0, 4.0, num=100),
    })

    multi_fs = fourier_series_multi_fcn(
        col_names=["x", "x", "y"],
        periods=[1.0, 2.0, 0.5],
        orders=[3, 4, 2],
        seas_names=["cat_period", "double_cat_period", "dog_period"])
    fs_df = multi_fs(input_df)["df"]

    # Visualization for debugging:
    #   col1 = get_fourier_col_name(1, "x", function_name="cos", seas_name="cat_period")
    #   col2 = get_fourier_col_name(3, "x", function_name="sin", seas_name="cat_period")
    #   plt.plot(fs_df["x"], fs_df[col1])
    #   plt.plot(fs_df["x"], fs_df[col2])

    # first row (x = 2.0) of the order-1 sin term for "cat_period"
    sin1_col = get_fourier_col_name(
        1,
        "x",
        function_name="sin",
        seas_name="cat_period")
    assert fs_df[sin1_col][0].round(1) == 0.0

    # orders has one more element than periods, which must be rejected
    mismatched_kwargs = {
        "col_names": ["tod", "tow"],
        "periods": [24.0, 7.0],
        "orders": [3, 3, 4],
        "seas_names": ["daily", "weekly"],
    }
    with pytest.raises(
            ValueError,
            match="periods and orders must have the same length."):
        fourier_series_multi_fcn(**mismatched_kwargs)
Esempio n. 5
0
def test_fourier_series_fcn():
    """Tests fourier_series_fcn on a single column with period 1.0 and order 2"""
    grid = np.linspace(2.0, 3.0, num=100)
    input_df = pd.DataFrame({"x": grid})

    single_fs = fourier_series_fcn(col_name="x", period=1.0, order=2)
    fs_df = single_fs(input_df)["df"]

    # Visualization for debugging:
    #   col1 = get_fourier_col_name(1, "x", function_name="sin")
    #   col2 = get_fourier_col_name(1, "x", function_name="cos")
    #   col3 = get_fourier_col_name(2, "x", function_name="sin")
    #   col4 = get_fourier_col_name(2, "x", function_name="cos")
    #   plt.plot(grid, fs_df[col1], label=col1)
    #   plt.plot(grid, fs_df[col2], label=col2)
    #   plt.plot(grid, fs_df[col3], label=col3)
    #   plt.plot(grid, fs_df[col4], label=col4)
    #   plt.legend()

    # first row (x = 2.0) of the order-1 sin term
    sin1_col = get_fourier_col_name(1, "x", function_name="sin")
    assert fs_df[sin1_col][0].round(1) == 0.0
Esempio n. 6
0
def generate_df_for_tests(freq,
                          periods,
                          train_start_date=datetime.datetime(2018, 7, 1),
                          train_end_date=None,
                          train_frac=0.8,
                          conti_year_origin=None,
                          noise_std=2.0,
                          remove_extra_cols=True,
                          autoreg_coefs=None,
                          fs_coefs=(-1, 3, 4),
                          growth_coef=3.0,
                          growth_pow=1.1,
                          intercept=0.0):
    """Generates dataset for unit tests.

    The value column is the sum of an intercept, a power-law growth term in
    "ct1", three first-order sin fourier terms ("tod", "tow", "toy") and
    gaussian noise. The numpy random seed is fixed to 123 inside the function,
    so repeated calls with the same arguments draw the same noise.

    :param freq: str
        pd.date_range freq parameter, e.g. H or D
    :param periods: int
        number of periods to generate
    :param train_start_date: datetime.datetime
        train start date
    :param train_end_date: Optional[datetime.datetime]
        train end date
    :param train_frac: Optional[float]
        fraction of data to use for training
        only used if train_end_date isn't provided.
        The row at position floor(train_frac * nrows) marks the (inclusive)
        end of the training set; the position is capped at the last row, so
        train_frac=1.0 puts every row in the training set.
    :param noise_std: float
        standard deviation of gaussian noise
    :param conti_year_origin: float
        the time origin for continuous time variables.
        If None, it is obtained from get_default_origin_for_time_vars.
    :param remove_extra_cols: bool
        whether to remove extra columns besides TIME_COL, VALUE_COL
    :param autoreg_coefs: Optional[List[int]]
        Coefficients used to mix shifted copies of the series into itself.
        If provided, the generated series denoted mathematically by Y(t) is
        converted as follows:
        Y(t) -> Y(t) + c1 Y(t) + c2 Y(t+1) + c3 Y(t+2) + ...
        where autoreg_coefs = [c1, c2, c3, ...], i.e. the i-th coefficient
        (0-based) multiplies the original series shifted by -i.
        Missing values are back-filled after each term is added.
    :param fs_coefs: Sequence[float]
        The fourier series coefficients used, in the order
        (sin1 of "tod", sin1 of "tow", sin1 of "toy").
    :param growth_coef: float
        Multiplier for growth
    :param growth_pow: float
        Power for growth, as function of continuous time
    :param intercept: float
        Constant term added to Y(t)

    :return: Dict[str, any]
        "df": full dataframe
        "train_df": rows with TIME_COL <= train_end_date, index reset
        "test_df": rows with TIME_COL > train_end_date, index reset
        "fut_time_num": number of rows in test_df
    """
    np.random.seed(123)

    date_list = pd.date_range(start=train_start_date,
                              periods=periods,
                              freq=freq).tolist()

    df0 = pd.DataFrame({TIME_COL: date_list})
    if conti_year_origin is None:
        conti_year_origin = get_default_origin_for_time_vars(df0, TIME_COL)
    time_df = build_time_features_df(dt=df0[TIME_COL],
                                     conti_year_origin=conti_year_origin)
    df = pd.concat([df0, time_df], axis=1)
    df["growth"] = growth_coef * (df["ct1"]**growth_pow)

    func = fourier_series_multi_fcn(col_names=["toy", "tow", "tod"],
                                    periods=[1.0, 7.0, 24.0],
                                    orders=[1, 1, 1],
                                    seas_names=None)

    res = func(df)
    df_seas = res["df"]
    df = pd.concat([df, df_seas], axis=1)

    df[VALUE_COL] = (
        intercept + df["growth"] +
        fs_coefs[0] * df[get_fourier_col_name(1, "tod", function_name="sin")] +
        fs_coefs[1] * df[get_fourier_col_name(1, "tow", function_name="sin")] +
        fs_coefs[2] * df[get_fourier_col_name(1, "toy", function_name="sin")] +
        noise_std * np.random.normal(size=df.shape[0]))

    if autoreg_coefs is not None:
        # Every term is built from the original VALUE_COL, which is only
        # overwritten once the loop has finished.
        # NOTE(review): shift(-i) leaves NaN in the last i rows and bfill has
        # no later value to fill them with, so the tail of VALUE_COL appears
        # to stay NaN when len(autoreg_coefs) > 1. A lagged form (positive
        # shift) may have been intended -- confirm before changing, since
        # existing test expectations may depend on the current values.
        df["temporary_new_value"] = df[VALUE_COL]
        for i, coef in enumerate(autoreg_coefs):
            df["temporary_new_value"] = (
                df["temporary_new_value"] +
                coef * df[VALUE_COL].shift(-i)).bfill()
        df[VALUE_COL] = df["temporary_new_value"]
        del df["temporary_new_value"]

    if train_end_date is None:
        train_rows = int(np.floor(train_frac * df.shape[0]))
        # cap at the last row so that train_frac=1.0 does not index past the end
        train_rows = min(train_rows, df.shape[0] - 1)
        train_end_date = df[TIME_COL][train_rows]

    if remove_extra_cols:
        df = df[[TIME_COL, VALUE_COL]]
    train_df = df.loc[df[TIME_COL] <= train_end_date]
    test_df = df.loc[df[TIME_COL] > train_end_date]
    fut_time_num = test_df.shape[0]

    return {
        "df": df,
        "train_df": train_df.reset_index(drop=True),
        "test_df": test_df.reset_index(drop=True),
        "fut_time_num": fut_time_num,
    }
Esempio n. 7
0
def generate_df_with_reg_for_tests(freq,
                                   periods,
                                   train_start_date=datetime.datetime(
                                       2018, 7, 1),
                                   train_end_date=None,
                                   train_frac=0.8,
                                   conti_year_origin=None,
                                   noise_std=2.0,
                                   remove_extra_cols=True,
                                   mask_test_actuals=False):
    """Generates dataset for unit tests that includes regressor columns

    Builds on the full output of generate_df_for_tests and adds three numeric
    regressors (growth plus first-order sin fourier terms plus gaussian
    noise), one boolean regressor and one categorical regressor.

    :param freq: str
        pd.date_range freq parameter, e.g. H or D
    :param periods: int
        number of periods to generate
    :param train_start_date: datetime.datetime
        train start date
    :param train_end_date: Optional[datetime.datetime]
        train end date
    :param train_frac: Optional[float]
        fraction of data to use for training
        only used if train_end_date isn't provided.
        The row at position floor(train_frac * nrows) marks the (inclusive)
        end of the training set; the position is capped at the last row.
    :param noise_std: float
        standard deviation of gaussian noise
    :param conti_year_origin: float
        the time origin for continuous time variables
    :param remove_extra_cols: bool
        whether to remove extra columns besides TIME_COL, VALUE_COL
        and the regressor columns
    :param mask_test_actuals: bool
        whether to set y values to np.nan in the test set.
    :return: Dict[str, any]
        "df": full dataframe
        "train_df": rows with TIME_COL <= train_end_date
        "test_df": rows with TIME_COL > train_end_date
        "fut_time_num": number of rows in test_df
        "regressor_cols": names of the regressor columns
    """
    np.random.seed(123)

    result_list = generate_df_for_tests(freq,
                                        periods,
                                        train_start_date=train_start_date,
                                        train_end_date=train_end_date,
                                        train_frac=train_frac,
                                        conti_year_origin=conti_year_origin,
                                        noise_std=noise_std,
                                        remove_extra_cols=False)

    df = result_list["df"]
    df["regressor1"] = (
        df["growth"] +
        4 * df[get_fourier_col_name(1, "tow", function_name="sin")] -
        3 * df[get_fourier_col_name(1, "tod", function_name="sin")] +
        7 * df[get_fourier_col_name(1, "toy", function_name="sin")] -
        noise_std * np.random.normal(size=df.shape[0]))

    df["regressor2"] = (
        df["growth"] +
        1 * df[get_fourier_col_name(1, "tow", function_name="sin")] -
        2 * df[get_fourier_col_name(1, "tod", function_name="sin")] +
        3 * df[get_fourier_col_name(1, "toy", function_name="sin")] +
        noise_std * np.random.normal(size=df.shape[0]))

    df["regressor3"] = (
        df["growth"] +
        9 * df[get_fourier_col_name(1, "tow", function_name="sin")] -
        8 * df[get_fourier_col_name(1, "tod", function_name="sin")] +
        5 * df[get_fourier_col_name(1, "toy", function_name="sin")] +
        noise_std * np.random.normal(size=df.shape[0]))

    df["regressor_bool"] = np.random.rand(df.shape[0]) > 0.3
    df["regressor_categ"] = np.random.choice(a=["c1", "c2", "c3"],
                                             size=df.shape[0],
                                             p=[0.1, 0.2, 0.7])

    if train_end_date is None:
        train_rows = int(np.floor(train_frac * df.shape[0]))
        # cap at the last row so that train_frac=1.0 does not index past the end
        train_rows = min(train_rows, df.shape[0] - 1)
        train_end_date = df[TIME_COL][train_rows]

    regressor_cols = [
        "regressor1", "regressor2", "regressor3", "regressor_bool",
        "regressor_categ"
    ]
    if remove_extra_cols:
        # Take an explicit copy: the masking step below assigns into this
        # frame, and assigning into a column subset of another frame is what
        # triggers pandas' SettingWithCopy warning.
        df = df[[TIME_COL, VALUE_COL] + regressor_cols].copy()

    if mask_test_actuals:
        # np.nan rather than the np.NaN alias, which NumPy 2.0 removed
        df.loc[df[TIME_COL] > train_end_date, VALUE_COL] = np.nan

    train_df = df.loc[df[TIME_COL] <= train_end_date]
    test_df = df.loc[df[TIME_COL] > train_end_date]
    fut_time_num = test_df.shape[0]

    return {
        "df": df,
        "train_df": train_df,
        "test_df": test_df,
        "fut_time_num": fut_time_num,
        "regressor_cols": regressor_cols,
    }