def test_gcd_irregular():
    """Checks sort and fill missing dates"""
    # Unsorted input with gaps between the timestamps.
    timestamps = [
        datetime.datetime(2018, 1, 1, 0, 0, second)
        for second in (1, 2, 10, 4)  # 10 is intentionally out of order
    ]
    input_df = pd.DataFrame({TIME_COL: timestamps, VALUE_COL: [1, 2, 3, 4]})

    # After canonicalization the rows are sorted and every second from
    # :01 through :10 is present, with NaN for the added timepoints.
    full_index = pd.date_range(
        start=datetime.datetime(2018, 1, 1, 0, 0, 1),
        end=datetime.datetime(2018, 1, 1, 0, 0, 10),
        freq="S")
    expected = pd.DataFrame({
        TIME_COL: full_index,
        VALUE_COL: [1, 2, np.nan, 4, np.nan, np.nan, np.nan, np.nan, np.nan, 3]
    })
    expected.index = expected[TIME_COL]
    expected.index.name = None

    # The frequency should be provided when there are gaps.
    canonical_data_dict = get_canonical_data(
        df=input_df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="S")
    assert canonical_data_dict["time_stats"]["added_timepoints"] == 6
    assert canonical_data_dict["time_stats"]["dropped_timepoints"] == 0
    assert_equal(canonical_data_dict["df"], expected)
    assert_equal(
        canonical_data_dict["time_stats"]["gaps"],
        find_missing_dates(input_df[TIME_COL]))
def test_load_data_anomaly():
    """Checks anomaly_info parameter"""
    data = DataLoaderTS().load_beijing_pm()
    value_col = "pm"

    # Without anomaly info, no pre-adjustment copy is kept.
    ts = UnivariateTimeSeries()
    ts.load_data(df=data, value_col=value_col)
    assert ts.df_before_adjustment is None

    # Adjusts two columns.
    dim_one = "one"
    dim_two = "two"
    anomaly_df = pd.DataFrame({
        START_DATE_COL: ["2011-04-04-10", "2011-10-10-00", "2012-12-20-10"],
        END_DATE_COL: ["2011-04-05-20", "2011-10-11-23", "2012-12-20-13"],
        ADJUSTMENT_DELTA_COL: [np.nan, 100.0, -100.0],
        METRIC_COL: [dim_one, dim_one, dim_two]
    })
    anomaly_info = [
        {
            "value_col": value_col,
            "anomaly_df": anomaly_df,
            "start_date_col": START_DATE_COL,
            "end_date_col": END_DATE_COL,
            "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
            "filter_by_dict": {METRIC_COL: dim_one},
            "adjustment_method": "add"
        },
        {
            "value_col": "pres",
            "anomaly_df": anomaly_df,
            "start_date_col": START_DATE_COL,
            "end_date_col": END_DATE_COL,
            "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
            "filter_by_dict": {METRIC_COL: dim_two},
            "adjustment_method": "subtract"
        },
    ]
    ts = UnivariateTimeSeries()
    ts.load_data(df=data, value_col=value_col, anomaly_info=anomaly_info)

    # The time series adjustment must agree with `get_canonical_data`.
    canonical = get_canonical_data(
        df=data,
        value_col=value_col,
        anomaly_info=anomaly_info)
    assert_equal(ts.df, canonical["df"])
    assert_equal(ts.df_before_adjustment, canonical["df_before_adjustment"])
def test_gcd_freq():
    """Checks warning for frequency not matching inferred frequency"""
    # Daily data, but hourly frequency is requested.
    daily_df = pd.DataFrame({
        TIME_COL: [datetime.datetime(2018, 1, day, 0, 0, 0) for day in (1, 2, 3)],
        VALUE_COL: [1, 2, 3]
    })
    with pytest.warns(UserWarning) as record:
        canonical_data_dict = get_canonical_data(
            df=daily_df,
            time_col=TIME_COL,
            value_col=VALUE_COL,
            freq="H")
        assert "does not match inferred frequency" in record[0].message.args[0]
    # Two full days are filled at hourly resolution; the two interior
    # daily points already existed, hence 24 * 2 - 2 added timepoints.
    assert canonical_data_dict["time_stats"]["added_timepoints"] == 24 * 2 - 2
    assert canonical_data_dict["time_stats"]["dropped_timepoints"] == 0
def test_gcd_err():
    """Checks errors raised by ``get_canonical_data`` on invalid input"""
    input_df = pd.DataFrame({
        TIME_COL: [datetime.datetime(2018, 1, 1, 0, 0, sec) for sec in (3, 2, 1)],
        VALUE_COL: [1, 2, 3]
    })
    # Too few rows.
    with pytest.raises(ValueError, match="Time series has < 3 observations"):
        get_canonical_data(df=input_df.iloc[:2, ])
    # Time column name does not exist in the data.
    with pytest.raises(ValueError, match="column is not in input data"):
        get_canonical_data(df=input_df, time_col="time")
    # Value column name does not exist in the data.
    with pytest.raises(ValueError, match="column is not in input data"):
        get_canonical_data(df=input_df, value_col="value")
def test_gcd_train_end_date_regressor():
    """Tests train_end_date for data with regressors"""
    # 30 daily points starting 2018-01-01 (last point is 2018-01-30).
    data = generate_df_with_reg_for_tests(
        freq="D",
        periods=30,
        train_end_date=None,
        train_start_date=datetime.datetime(2018, 1, 1),
        remove_extra_cols=True,
        mask_test_actuals=True) if False else generate_df_with_reg_for_tests(
        freq="D",
        periods=30,
        train_start_date=datetime.datetime(2018, 1, 1),
        remove_extra_cols=True,
        mask_test_actuals=True)
    regressor_cols = ["regressor1", "regressor2", "regressor_categ"]
    keep_cols = [TIME_COL, VALUE_COL] + regressor_cols
    df = data["df"][keep_cols].copy()
    # Setting NaN values at the end:
    # regressor1 is null from Jan 29, regressor2 from Jan 27,
    # regressor_categ from Jan 25, and the value column from Jan 23.
    df.loc[df.tail(2).index, "regressor1"] = np.nan
    df.loc[df.tail(4).index, "regressor2"] = np.nan
    df.loc[df.tail(6).index, "regressor_categ"] = np.nan
    df.loc[df.tail(8).index, VALUE_COL] = np.nan

    # last date with a value
    result_train_end_date = datetime.datetime(2018, 1, 22)

    # default train_end_date, default regressor_cols
    with pytest.warns(UserWarning) as record:
        canonical_data_dict = get_canonical_data(
            df=df,
            train_end_date=None,
            regressor_cols=None)
        assert f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({result_train_end_date})." \
               in record[0].message.args[0]
    assert canonical_data_dict["df"].shape == df.shape
    assert canonical_data_dict["fit_df"].shape == (22, 2)
    assert canonical_data_dict["regressor_cols"] == []
    assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL]
    assert canonical_data_dict["train_end_date"] == result_train_end_date
    assert canonical_data_dict["last_date_for_val"] == result_train_end_date
    assert canonical_data_dict["last_date_for_reg"] is None

    # train_end_date later than last date in df, all available regressor_cols
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 2, 10)
        canonical_data_dict = get_canonical_data(
            df=df,
            train_end_date=train_end_date,
            regressor_cols=regressor_cols)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({result_train_end_date})." \
               in record[0].message.args[0]
    assert canonical_data_dict["fit_df"].shape == (22, 5)
    assert canonical_data_dict["regressor_cols"] == regressor_cols
    assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL] + regressor_cols
    assert canonical_data_dict["train_end_date"] == result_train_end_date
    assert canonical_data_dict["last_date_for_val"] == datetime.datetime(2018, 1, 22)
    # Jan 28 is the last date where all requested regressors are non-null
    # (regressor1 becomes null on Jan 29).
    assert canonical_data_dict["last_date_for_reg"] == datetime.datetime(2018, 1, 28)

    # train_end_date in between last date in df and last date before null
    # user passes no regressor_cols
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 1, 25)
        canonical_data_dict = get_canonical_data(
            df=df,
            train_end_date=train_end_date,
            regressor_cols=None)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({result_train_end_date})." \
               in record[0].message.args[0]
    assert canonical_data_dict["fit_df"].shape == (22, 2)
    assert canonical_data_dict["regressor_cols"] == []
    assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL]
    assert canonical_data_dict["train_end_date"] == datetime.datetime(2018, 1, 22)
    assert canonical_data_dict["last_date_for_val"] == datetime.datetime(2018, 1, 22)
    assert canonical_data_dict["last_date_for_reg"] is None

    # train end date equal to last date before null
    # user requests a subset of the regressor_cols
    train_end_date = datetime.datetime(2018, 1, 22)
    regressor_cols = ["regressor2"]
    canonical_data_dict = get_canonical_data(
        df=df,
        train_end_date=train_end_date,
        regressor_cols=regressor_cols)
    assert canonical_data_dict["fit_df"].shape == (22, 3)
    assert canonical_data_dict["regressor_cols"] == regressor_cols
    assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL] + regressor_cols
    assert canonical_data_dict["train_end_date"] == datetime.datetime(2018, 1, 22)
    assert canonical_data_dict["last_date_for_val"] == datetime.datetime(2018, 1, 22)
    # regressor2 is null from Jan 27, so its last usable date is Jan 26.
    assert canonical_data_dict["last_date_for_reg"] == datetime.datetime(2018, 1, 26)

    # train_end_date smaller than last date before null
    # user requests regressor_cols that does not exist in df
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 1, 20)
        regressor_cols = ["regressor1", "regressor4", "regressor5"]
        canonical_data_dict = get_canonical_data(
            df=df,
            train_end_date=train_end_date,
            regressor_cols=regressor_cols)
        # Only regressor1 exists in df; the unknown columns are dropped.
        assert canonical_data_dict["fit_df"].shape == (20, 3)
        assert canonical_data_dict["regressor_cols"] == ["regressor1"]
        assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL, "regressor1"]
        assert canonical_data_dict["train_end_date"] == datetime.datetime(2018, 1, 20)
        assert canonical_data_dict["last_date_for_val"] == datetime.datetime(2018, 1, 22)
        assert canonical_data_dict["last_date_for_reg"] == datetime.datetime(2018, 1, 28)
        assert (f"The following columns are not available to use as "
                f"regressors: ['regressor4', 'regressor5']") in record[0].message.args[0]
def test_gcd_train_end_date():
    """Tests train_end_date for data without regressors"""
    # Hourly data 03:00-07:00; value is null at 04:00, 06:00, 07:00,
    # so the last timestamp with a non-null value is 05:00.
    df = pd.DataFrame({
        TIME_COL: [
            datetime.datetime(2018, 1, 1, 3, 0, 0),
            datetime.datetime(2018, 1, 1, 4, 0, 0),
            datetime.datetime(2018, 1, 1, 5, 0, 0),
            datetime.datetime(2018, 1, 1, 6, 0, 0),
            datetime.datetime(2018, 1, 1, 7, 0, 0)
        ],
        VALUE_COL: [1, np.nan, 3, np.nan, np.nan],
    })
    # no train_end_date
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 1, 1, 5, 0, 0)
        get_canonical_data(df=df)
        assert f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({train_end_date})." in record[0].message.args[0]
    # train_end_date later than last date in df
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 1, 1, 8, 0, 0)
        result_train_end_date = datetime.datetime(2018, 1, 1, 5, 0, 0)
        get_canonical_data(df, train_end_date=train_end_date)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({result_train_end_date})." in record[0].message.args[0]
    # train_end_date in between last date in df and last date before
    with pytest.warns(UserWarning) as record:
        train_end_date = datetime.datetime(2018, 1, 1, 6, 0, 0)
        result_train_end_date = datetime.datetime(2018, 1, 1, 5, 0, 0)
        get_canonical_data(df, train_end_date=train_end_date)
        assert f"Input timestamp for the parameter 'train_end_date' " \
               f"({train_end_date}) either exceeds the last available timestamp or" \
               f"{VALUE_COL} column of the provided TimeSeries contains null " \
               f"values at the end. Setting 'train_end_date' to the last timestamp with a " \
               f"non-null value ({result_train_end_date})." \
               in record[0].message.args[0]
    # train end date equal to last date before null
    canonical_data_dict = get_canonical_data(
        df,
        train_end_date=datetime.datetime(2018, 1, 1, 5, 0, 0))
    assert canonical_data_dict["train_end_date"] == datetime.datetime(2018, 1, 1, 5, 0, 0)
    # train_end_date smaller than last date before null
    canonical_data_dict = get_canonical_data(
        df,
        train_end_date=datetime.datetime(2018, 1, 1, 4, 0, 0))
    # fit_df contains only the rows up to 04:00 (the first two rows).
    assert_equal(canonical_data_dict["fit_df"], canonical_data_dict["df"].iloc[:2])
    assert canonical_data_dict["regressor_cols"] == []
    assert canonical_data_dict["fit_cols"] == [TIME_COL, VALUE_COL]
    assert canonical_data_dict["train_end_date"] == datetime.datetime(2018, 1, 1, 4, 0, 0)
    assert canonical_data_dict["last_date_for_val"] == datetime.datetime(2018, 1, 1, 5, 0, 0)
    assert canonical_data_dict["last_date_for_reg"] is None
def test_gcd_load_data_anomaly():
    """Checks anomaly_info parameter"""
    dl = DataLoader()
    df = dl.load_beijing_pm()
    value_col = "pm"
    # no anomaly adjustment
    canonical_data_dict = get_canonical_data(
        df=df,
        time_col=TIME_COL,
        value_col=value_col)
    assert canonical_data_dict["df_before_adjustment"] is None
    dim_one = "one"
    dim_two = "two"
    # Three anomalies: the first masks values (NaN delta),
    # the others carry additive deltas of +100 and -100.
    anomaly_df = pd.DataFrame({
        START_DATE_COL: ["2011-04-04-10", "2011-10-10-00", "2012-12-20-10"],
        END_DATE_COL: ["2011-04-05-20", "2011-10-11-23", "2012-12-20-13"],
        ADJUSTMENT_DELTA_COL: [np.nan, 100.0, -100.0],
        METRIC_COL: [dim_one, dim_one, dim_two]  # used to filter rows in this df
    })
    # Adjusts one column (value_col)
    anomaly_info = {
        "value_col": value_col,
        "anomaly_df": anomaly_df,
        "start_date_col": START_DATE_COL,
        "end_date_col": END_DATE_COL,
        "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
        "filter_by_dict": {METRIC_COL: dim_one},
        "adjustment_method": "add"
    }
    canonical_data_dict2 = get_canonical_data(
        df=df,
        time_col=TIME_COL,
        value_col=value_col,
        anomaly_info=anomaly_info)
    # The unadjusted data is preserved in "df_before_adjustment".
    assert_equal(canonical_data_dict2["df_before_adjustment"], canonical_data_dict["df"])
    expected_df = canonical_data_dict["df"].copy()
    # first anomaly
    idx = ((expected_df[TIME_COL] >= anomaly_df[START_DATE_COL][0])
           & (expected_df[TIME_COL] <= anomaly_df[END_DATE_COL][0]))
    expected_df.loc[idx, VALUE_COL] = np.nan
    # second anomaly
    idx = ((expected_df[TIME_COL] >= anomaly_df[START_DATE_COL][1])
           & (expected_df[TIME_COL] <= anomaly_df[END_DATE_COL][1]))
    expected_df.loc[idx, VALUE_COL] += 100.0
    assert_equal(canonical_data_dict2["df"], expected_df)
    # Adjusts two columns
    value_col_two = "pres"  # second column to adjust
    anomaly_info = [
        anomaly_info, {
            "value_col": value_col_two,
            "anomaly_df": anomaly_df,
            "start_date_col": START_DATE_COL,
            "end_date_col": END_DATE_COL,
            "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
            "filter_by_dict": {METRIC_COL: dim_two},
            "adjustment_method": "subtract"
        }
    ]
    canonical_data_dict3 = get_canonical_data(
        df=df,
        time_col=TIME_COL,
        value_col=value_col,
        anomaly_info=anomaly_info)
    # third anomaly. The value is subtracted, according to `adjustment_method`.
    # Subtracting the -100.0 delta adds 100 to the pressure column.
    idx = ((expected_df[TIME_COL] >= anomaly_df[START_DATE_COL][2])
           & (expected_df[TIME_COL] <= anomaly_df[END_DATE_COL][2]))
    expected_df.loc[idx, value_col_two] -= -100.0
    assert_equal(canonical_data_dict3["df_before_adjustment"], canonical_data_dict["df"])
    assert_equal(canonical_data_dict3["df"], expected_df)
def test_gcd_dates():
    """Checks if regular data can be properly loaded. Checks time column stats"""
    # Regular secondly data with custom column names.
    input_df = pd.DataFrame({
        "time": [datetime.datetime(2018, 1, 1, 0, 0, sec) for sec in (1, 2, 3)],
        "val": [1, 2, 3]
    })
    result = get_canonical_data(df=input_df, time_col="time", value_col="val")
    assert_equal(result["time_stats"]["gaps"], find_missing_dates(input_df["time"]))
    assert result["time_stats"]["added_timepoints"] == 0
    assert result["time_stats"]["dropped_timepoints"] == 0
    assert result["freq"] == "S"
    assert_equal(result["df"][VALUE_COL].values, input_df["val"].values)
    assert result["df"].index.name is None

    # String timestamps parsed with an explicit date format.
    date_format = "%Y-%m-%d"
    input_df = pd.DataFrame({
        TIME_COL: ["2018-01-01", "2018-01-05", "2018-01-09"],
        VALUE_COL: [1, 2, 3]
    })
    result = get_canonical_data(
        df=input_df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        date_format=date_format,
        freq="4D")
    expected = pd.DataFrame({
        TIME_COL: pd.Series(
            [datetime.datetime(2018, 1, day, 0, 0, 0) for day in (1, 5, 9)],
            name=TIME_COL),
        VALUE_COL: input_df[VALUE_COL]
    })
    expected.index = expected[TIME_COL]
    expected.index.name = None
    assert result["time_stats"]["added_timepoints"] == 0
    assert result["time_stats"]["dropped_timepoints"] == 0
    assert result["freq"] == "4D"
    assert_equal(result["df"], expected)

    # String timestamps with inferred date format.
    result = get_canonical_data(
        df=input_df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="4D")
    assert result["time_stats"]["added_timepoints"] == 0
    assert result["time_stats"]["dropped_timepoints"] == 0
    assert_equal(result["df"], expected)

    # Time zone localization.
    result = get_canonical_data(
        df=input_df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq="4D",
        tz="US/Pacific")
    expected = expected.tz_localize("US/Pacific")
    assert result["time_stats"]["added_timepoints"] == 0
    assert result["time_stats"]["dropped_timepoints"] == 0
    assert_equal(result["df"], expected)
def get_forecast_time_properties(
        df,
        time_col=TIME_COL,
        value_col=VALUE_COL,
        freq=None,
        regressor_cols=None,
        lagged_regressor_cols=None,
        train_end_date=None,
        forecast_horizon=None):
    """Returns the number of training points in `df`, the start year,
    and prediction end year

    Parameters
    ----------
    df : `pandas.DataFrame` with columns [``time_col``, ``value_col``]
        Univariate timeseries data to forecast
    time_col : `str`, default ``TIME_COL`` in constants.py
        Name of timestamp column in df
    value_col : `str`, default ``VALUE_COL`` in constants.py
        Name of value column in df (the values to forecast)
    freq : `str` or None, default None
        Frequency of input data. Used to generate future dates for prediction.
        Frequency strings can have multiples, e.g. '5H'. See
        https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#offset-aliases
        for a list of frequency aliases.
        If None, inferred by pd.infer_freq.
        Provide this parameter if ``df`` has missing timepoints.
    regressor_cols : `list` [`str`] or None, optional, default None
        A list of regressor columns used in the training and prediction DataFrames.
        If None, no regressor columns are used.
        Regressor columns that are unavailable in ``df`` are dropped.
    lagged_regressor_cols : `list` [`str`] or None, optional, default None
        A list of lagged regressor columns used in the training and prediction DataFrames.
        If None, no lagged regressor columns are used.
        Lagged regressor columns that are unavailable in ``df`` are dropped.
    train_end_date : `datetime.datetime`, optional, default None
        Last date to use for fitting the model. Forecasts are generated after this date.
        If None, it is set to the last date with a non-null value in
        ``value_col`` of ``df``.
    forecast_horizon : `int` or None, default None
        Number of periods to forecast into the future. Must be > 0
        If None, default is determined from input data frequency

    Returns
    -------
    time_properties : `dict` [`str`, `any`]
        Time properties dictionary with keys:

        ``"period"`` : `int`
            Period of each observation (i.e. minimum time between observations, in seconds).
        ``"simple_freq"`` : `SimpleTimeFrequencyEnum`
            ``SimpleTimeFrequencyEnum`` member corresponding to data frequency.
        ``"num_training_points"`` : `int`
            Number of observations for training.
        ``"num_training_days"`` : `int`
            Number of days for training.
        ``"days_per_observation"``: `float`
            The time frequency in day units.
        ``"forecast_horizon"``: `int`
            The number of time intervals for which forecast is needed.
        ``"forecast_horizon_in_timedelta"``: `datetime.timedelta`
            The forecast horizon length in timedelta units.
        ``"forecast_horizon_in_days"``: `float`
            The forecast horizon length in day units.
        ``"start_year"`` : `int`
            Start year of the training period.
        ``"end_year"`` : `int`
            End year of the forecast period.
        ``"origin_for_time_vars"`` : `float`
            Continuous time representation of the first date in ``df``.
    """
    # Canonicalizes the input and extracts ``fit_df``, the data available
    # for fitting the model (time column in `datetime.datetime` format).
    canonical_data_dict = get_canonical_data(
        df=df,
        time_col=time_col,
        value_col=value_col,
        freq=freq,
        train_end_date=train_end_date,
        regressor_cols=[] if regressor_cols is None else regressor_cols,
        lagged_regressor_cols=lagged_regressor_cols)
    fit_df = canonical_data_dict["fit_df"]

    # Basic time properties of the training data.
    train_start = fit_df[TIME_COL].min()
    train_end = fit_df[TIME_COL].max()
    origin_for_time_vars = get_default_origin_for_time_vars(fit_df, TIME_COL)
    period = min_gap_in_seconds(df=fit_df, time_col=TIME_COL)
    simple_freq = get_simple_time_frequency_from_period(period)
    num_training_points = fit_df.shape[0]
    days_per_observation = period / TimeEnum.ONE_DAY_IN_SECONDS.value

    # Number of (fractional) days spanned by the training set.
    # One extra period is counted so the last observation's duration is included.
    time_delta = train_end - train_start
    num_training_days = time_delta.days + (
        time_delta.seconds + period) / TimeEnum.ONE_DAY_IN_SECONDS.value

    # Forecast horizon as a number of periods.
    if forecast_horizon is None:
        # expected to be kept in sync with default value set in ``get_default_time_parameters``
        forecast_horizon = get_default_horizon_from_period(
            period=period,
            num_observations=num_training_points)
    forecast_horizon_in_days = forecast_horizon * days_per_observation
    forecast_horizon_in_timedelta = timedelta(days=forecast_horizon_in_days)

    # End year of the forecast period: training end plus the horizon,
    # rounded up to whole days.
    future_end = train_end + timedelta(
        days=math.ceil(forecast_horizon * days_per_observation))

    return {
        "period": period,
        "simple_freq": simple_freq,
        "num_training_points": num_training_points,
        "num_training_days": num_training_days,
        "days_per_observation": days_per_observation,
        "forecast_horizon": forecast_horizon,
        "forecast_horizon_in_timedelta": forecast_horizon_in_timedelta,
        "forecast_horizon_in_days": forecast_horizon_in_days,
        "start_year": train_start.year,
        "end_year": future_end.year,
        "origin_for_time_vars": origin_for_time_vars
    }