def test_get_df():
    """Tests ``DataLoader.get_df`` on a daily dataset, an hourly dataset and a missing file.

    For ``daily_peyton_manning`` and ``hourly_parking`` the column names and
    shape of the returned dataframe are checked. Requesting a data name with
    no matching file is expected to raise ``ValueError`` with a message that
    names the missing path, the data directory and the available datasets.
    """
    import re  # local import, only needed to escape the expected message

    dl = DataLoader()
    # Daily data
    data_path = dl.get_data_home(data_dir=None, data_sub_dir="daily")
    df = dl.get_df(data_path=data_path, data_name="daily_peyton_manning")
    assert list(df.columns) == ["ts", "y"]
    assert df.shape == (2905, 2)

    # Hourly data
    data_path = dl.get_data_home(data_dir=None, data_sub_dir="hourly")
    df = dl.get_df(data_path=data_path, data_name="hourly_parking")
    assert list(df.columns) == [
        "SystemCodeNumber", "Capacity", "Occupancy", "LastUpdated"
    ]
    assert df.shape == (35717, 4)

    # Error due to wrong file name
    data_path = dl.get_data_home(data_dir=None, data_sub_dir="daily")
    file_path = os.path.join(data_path, "parking.csv")
    file_names = dl.get_data_names(data_path=data_path)
    expected_message = (
        f"Given file path '{file_path}' is not found. Available datasets "
        f"in data directory '{data_path}' are {file_names}.")
    # ``match`` is treated as a regular expression. The message contains
    # paths, dots and brackets, so it is escaped to be compared literally
    # instead of relying on hand-placed backslashes.
    with pytest.raises(ValueError, match=re.escape(expected_message)):
        dl.get_df(data_path=data_path, data_name="parking")
def test_benchmark_silverkite_template_with_real_data():
    """Runs ``benchmark_silverkite_template`` on the daily female births data.

    Checks the fields of the first returned result against the inputs.
    """
    # real data
    loader = DataLoader()
    data_name = "daily_female_births"
    births_df = loader.get_df(
        data_path=loader.get_data_home(data_sub_dir="daily"),
        data_name="daily_female_births")
    metadata = MetadataParam(time_col="Date", value_col="Births", freq="D")

    # every list below has a single item to speed up the test case
    selection_metric = EvaluationMetricEnum.MeanSquaredError
    benchmark_results = benchmark_silverkite_template(
        data_name=data_name,
        df=births_df,
        metadata=metadata,
        evaluation_metric=EvaluationMetricParam(
            cv_selection_metric=selection_metric.name),
        forecast_horizons=[30],
        fit_algorithms=["linear"],
        max_cvs=[3])

    first_result = benchmark_results[0]
    assert first_result["data_name"] == data_name
    assert first_result["forecast_model_name"] == "silverkite_linear"
    assert first_result["train_period"] == births_df.shape[0]
    assert first_result["forecast_horizon"] == 30
    assert first_result["cv_folds"] == 3
Exemple #3
0
def test_estimator_get_coef_summary_from_forecaster():
    """Tests ``get_coef_summary`` on the summary of a silverkite model fitted through ``Forecaster``."""
    peyton_df = DataLoader().load_peyton_manning()
    config_dict = dict(
        model_template=ModelTemplateEnum.SILVERKITE.name,
        forecast_horizon=10,
        metadata_param=dict(time_col="ts", value_col="y", freq="D"),
        model_components_param=dict(
            custom={"fit_algorithm_dict": {"fit_algorithm": "linear"}}))
    config = ForecastConfig().from_dict(config_dict)
    result = Forecaster().run_forecast_config(
        df=peyton_df[:365],  # shortens df to speed up
        config=config)
    summary = result.model[-1].summary()

    # Exactly one intercept row is expected
    intercept_df = summary.get_coef_summary(is_intercept=True, return_df=True)
    assert intercept_df.shape[0] == 1

    # These filters are only called; their output is not inspected
    for flag in ("is_time_feature", "is_event", "is_trend", "is_interaction"):
        summary.get_coef_summary(**{flag: True})

    # Filtering on lag terms returns None here
    assert summary.get_coef_summary(is_lag=True) is None

    # Trend terms only: no interaction (":") and no seasonality columns
    trend_df = summary.get_coef_summary(
        is_trend=True,
        is_seasonality=False,
        is_interaction=False,
        return_df=True)
    pred_cols = trend_df["Pred_col"].tolist()
    assert all(":" not in col for col in pred_cols)
    assert "ct1" in pred_cols
    assert "sin1_ct1_yearly" not in pred_cols

    # Without filters the full coefficient table is returned
    full_df = summary.get_coef_summary(return_df=True)
    assert full_df.shape[0] == summary.info_dict["coef_summary_df"].shape[0]
def test_load_hourly_beijing_pm():
    """Checks the columns and shape returned by ``DataLoader.load_beijing_pm``."""
    expected_columns = [
        "ts", "year", "month", "day", "hour", "pm", "dewp", "temp", "pres",
        "cbwd", "iws", "is", "ir"
    ]
    beijing_df = DataLoader().load_beijing_pm()
    assert list(beijing_df.columns) == expected_columns
    assert beijing_df.shape == (43824, 13)
def test_load_hourly_parking():
    """Checks ``DataLoader.load_parking`` without and with a ``system_code_number``."""
    loader = DataLoader()

    # No system code given
    all_systems_df = loader.load_parking(system_code_number=None)
    expected_columns = ["LastUpdated", "Capacity", "Occupancy", "OccupancyRatio"]
    assert list(all_systems_df.columns) == expected_columns
    assert all_systems_df.shape == (1328, 4)

    # A specific system code
    single_system_df = loader.load_parking(system_code_number="NIA South")
    expected_columns = [
        "SystemCodeNumber", "Capacity", "Occupancy", "LastUpdated", "OccupancyRatio"
    ]
    assert list(single_system_df.columns) == expected_columns
    assert single_system_df.shape == (1204, 5)
def test_get_data_inventory():
    """Checks that ``DataLoader.get_data_inventory`` lists exactly the expected dataset names."""
    expected_names = {
        "online_retail", "minute_energy_appliance", "minute_household_power",
        "minute_yosemite_temps", "hourly_parking", "hourly_traffic_volume",
        "hourly_bikesharing", "hourly_beijing_pm",
        "daily_temperature_australia", "daily_demand_order",
        "daily_female_births", "daily_istanbul_stock", "daily_peyton_manning",
        "monthly_shampoo", "monthly_sunspot"
    }
    inventory = DataLoader().get_data_inventory()
    # Compared as a set, so ordering is not checked
    assert set(inventory) == expected_names
def test_load_data():
    """Tests that ``DataLoader.load_data`` dispatches to the dataset-specific loaders.

    The result for ``daily_peyton_manning`` must equal ``load_peyton_manning()``
    and the result for ``hourly_parking`` (with an extra keyword argument)
    must equal ``load_parking(...)``. An unknown data name is expected to
    raise ``ValueError`` listing the data inventory.
    """
    import re  # local import, only needed to escape the expected message

    dl = DataLoader()
    df = dl.load_data(data_name="daily_peyton_manning")
    expected_df = dl.load_peyton_manning()
    assert_equal(df, expected_df)

    df = dl.load_data(data_name="hourly_parking", system_code_number="Shopping")
    expected_df = dl.load_parking(system_code_number="Shopping")
    assert_equal(df, expected_df)

    # Error due to unavailable data name
    data_name = "dummy"
    data_inventory = dl.get_data_inventory()
    expected_message = (
        f"Input data name '{data_name}' is not recognized. "
        f"Must be one of {data_inventory}.")
    # ``match`` is a regular expression; escape the message so that the dots
    # and the list brackets are compared literally.
    with pytest.raises(ValueError, match=re.escape(expected_message)):
        dl.load_data(data_name=data_name)
Exemple #8
0
def test_estimator_plot_components_from_forecaster():
    """Checks ``plot_components`` on the estimator obtained from a ``Forecaster`` run."""
    # Real data (Female-births) via model template
    loader = DataLoader()
    births_df = loader.get_df(
        data_path=loader.get_data_home(data_sub_dir="daily"),
        data_name="daily_female_births")
    metadata = MetadataParam(time_col="Date", value_col="Births", freq="D")
    seasonality = {
        "yearly_seasonality": True,
        "quarterly_seasonality": True,
        "weekly_seasonality": True,
        "daily_seasonality": False
    }
    config = ForecastConfig(
        model_template=ModelTemplateEnum.SILVERKITE.name,
        forecast_horizon=30,  # forecast 1 month
        coverage=0.95,  # 95% prediction intervals
        metadata_param=metadata,
        model_components_param=ModelComponentsParam(seasonality=seasonality))
    result = Forecaster().run_forecast_config(df=births_df, config=config)

    # The estimator is the last element of the last pipeline step
    final_step = result.model.steps[-1]
    estimator = final_step[-1]
    assert estimator.plot_components()
def test_get_data_names():
    """Checks ``DataLoader.get_data_names`` on the data home and on its ``daily`` sub directory."""
    loader = DataLoader()

    # An empty list is expected for the top-level data folder
    # (no .csv file directly in 'data')
    top_level_names = loader.get_data_names(data_path=loader.get_data_home())
    assert top_level_names == []

    # The 'daily' sub directory holds the daily datasets
    daily_path = loader.get_data_home(data_sub_dir="daily")
    daily_names = loader.get_data_names(data_path=daily_path)
    expected_daily = {
        "daily_temperature_australia", "daily_demand_order",
        "daily_female_births", "daily_istanbul_stock", "daily_peyton_manning"
    }
    assert set(daily_names) == expected_daily
def test_get_data_home():
    """Tests ``DataLoader.get_data_home`` with defaults, a sub directory and a missing directory.

    The default call must resolve to a directory named ``data``, the call
    with ``data_sub_dir="daily"`` to a directory named ``daily``, and a
    non-existing ``data_dir`` is expected to raise ``ValueError``.
    """
    import re  # local import, only needed to escape the expected message

    dl = DataLoader()
    # Default parameters
    data_home = dl.get_data_home()
    assert os.path.basename(os.path.normpath(data_home)) == "data"

    # With subdirectory
    data_home = dl.get_data_home(data_sub_dir="daily")
    assert os.path.basename(os.path.normpath(data_home)) == "daily"

    # Error due to non existing folder
    data_dir = "/home/data"
    expected_message = f"Requested data directory '{data_dir}' does not exist."
    # ``match`` is a regular expression; escape the message so that the path
    # and the trailing dot are compared literally.
    with pytest.raises(ValueError, match=re.escape(expected_message)):
        dl.get_data_home(data_dir=data_dir)
def test_load_hourly_bikesharing():
    """Checks ``DataLoader.load_bikesharing`` at the original and at aggregated frequencies."""
    loader = DataLoader()

    # Data as loaded, without aggregation
    raw_df = loader.load_bikesharing()
    assert list(raw_df.columns) == ["date", TIME_COL, "count", "tmin", "tmax", "pn"]
    assert raw_df.shape == (78421, 6)

    # Aggregated data: one column per aggregated field plus the time column
    agg_func = {"count": "sum", "tmin": "min", "tmax": "max", "pn": "mean"}
    expected_rows = (("daily", 3269), ("weekly", 468), ("monthly", 109))
    for agg_freq, n_rows in expected_rows:
        agg_df = loader.load_bikesharing(agg_freq=agg_freq, agg_func=agg_func)
        assert TIME_COL in agg_df.columns
        assert agg_df.shape == (n_rows, len(agg_func) + 1)
def test_get_aggregated_data():
    """Checks ``DataLoader.get_aggregated_data`` for daily, weekly and monthly aggregation.

    The input is hourly data over 2020 with constant columns, so the
    aggregated values are known. A dataframe without ``TIME_COL`` must raise
    ``ValueError``.
    """
    loader = DataLoader()
    hourly_df = pd.DataFrame({
        TIME_COL: pd.date_range("2020-01-01 00:00", "2020-12-31 23:00", freq="1H"),
        "col1": 1,
        "col2": 2,
        "col3": 3,
        "col4": 4,
        "col5": 5,
    })
    agg_func = {"col1": "sum", "col2": "mean", "col3": "median", "col4": "min", "col5": "max"}
    n_expected_cols = len(agg_func) + 1  # aggregated columns plus `TIME_COL`

    def assert_constant_columns_unchanged(agg_df):
        """Asserts that mean/median/min/max of the constant columns keep their value."""
        for col, value in (("col2", 2), ("col3", 3), ("col4", 4), ("col5", 5)):
            assert (agg_df[col] != value).sum() == 0

    # For each frequency,
    # (1) make sure the `TIME_COL` column is correctly included
    # (2) verify the aggregation part works correctly

    # Daily aggregation: every day sums 24 hourly ones
    daily_df = loader.get_aggregated_data(hourly_df, agg_freq="daily", agg_func=agg_func)
    assert daily_df.shape == (366, n_expected_cols)
    assert (daily_df["col1"] != 24).sum() == 0
    assert_constant_columns_unchanged(daily_df)

    # Weekly aggregation: exactly two weeks are expected to be incomplete
    weekly_df = loader.get_aggregated_data(hourly_df, agg_freq="weekly", agg_func=agg_func)
    assert weekly_df.shape == (53, n_expected_cols)
    assert (weekly_df["col1"] != 24*7).sum() == 2
    assert_constant_columns_unchanged(weekly_df)

    # Monthly aggregation: every month has 29, 30 or 31 days
    monthly_df = loader.get_aggregated_data(hourly_df, agg_freq="monthly", agg_func=agg_func)
    assert monthly_df.shape == (12, n_expected_cols)
    assert (monthly_df["col1"].isin([24*29, 24*30, 24*31])).sum() == 12
    assert_constant_columns_unchanged(monthly_df)

    # Missing time column
    no_time_df = hourly_df.drop(columns=[TIME_COL])
    with pytest.raises(ValueError, match=f"{TIME_COL}"):
        loader.get_aggregated_data(no_time_df, agg_freq="monthly", agg_func=agg_func)
def test_load_hourly_beijing_pm():
    """Checks ``DataLoader.load_beijing_pm`` at the original and at aggregated frequencies."""
    loader = DataLoader()

    # Data as loaded, without aggregation
    raw_df = loader.load_beijing_pm()
    expected_columns = [
        TIME_COL, "year", "month", "day", "hour", "pm", "dewp",
        "temp", "pres", "cbwd", "iws", "is", "ir"]
    assert list(raw_df.columns) == expected_columns
    assert raw_df.shape == (43824, 13)

    # Aggregated data: one column per aggregated field plus the time column
    agg_func = {"pm": "mean", "dewp": "mean", "temp": "max", "pres": "mean", "iws": "sum", "is": "sum", "ir": "sum"}
    expected_rows = (("daily", 1826), ("weekly", 262), ("monthly", 60))
    for agg_freq, n_rows in expected_rows:
        agg_df = loader.load_beijing_pm(agg_freq=agg_freq, agg_func=agg_func)
        assert TIME_COL in agg_df.columns
        assert agg_df.shape == (n_rows, len(agg_func) + 1)
def test_load_hourly_bikesharing():
    """Checks the columns and shape returned by ``DataLoader.load_bikesharing``."""
    expected_columns = ["date", "ts", "count", "tmin", "tmax", "pn"]
    bikesharing_df = DataLoader().load_bikesharing()
    assert list(bikesharing_df.columns) == expected_columns
    assert bikesharing_df.shape == (78421, 6)
def test_load_peyton_manning():
    """Checks the columns and shape returned by ``DataLoader.load_peyton_manning``."""
    peyton_df = DataLoader().load_peyton_manning()
    n_rows, n_cols = 2905, 2
    assert list(peyton_df.columns) == [TIME_COL, "y"]
    assert peyton_df.shape == (n_rows, n_cols)
def test_init():
    """Checks that a new ``DataLoader`` exposes the data inventory as ``available_datasets``."""
    loader = DataLoader()
    datasets_on_instance = loader.available_datasets
    assert datasets_on_instance == loader.get_data_inventory()
def test_gcd_load_data_anomaly():
    """Checks the ``anomaly_info`` parameter of ``get_canonical_data``.

    Runs without adjustment, with one adjusted column, and with two adjusted
    columns, comparing each result against a manually adjusted dataframe.
    """
    value_col = "pm"
    pm_df = DataLoader().load_beijing_pm()

    # no anomaly adjustment
    unadjusted = get_canonical_data(
        df=pm_df,
        time_col=TIME_COL,
        value_col=value_col)
    assert unadjusted["df_before_adjustment"] is None

    dim_one, dim_two = "one", "two"
    anomaly_df = pd.DataFrame({
        START_DATE_COL: ["2011-04-04-10", "2011-10-10-00", "2012-12-20-10"],
        END_DATE_COL: ["2011-04-05-20", "2011-10-11-23", "2012-12-20-13"],
        ADJUSTMENT_DELTA_COL: [np.nan, 100.0, -100.0],
        METRIC_COL: [dim_one, dim_one,
                     dim_two]  # used to filter rows in this df
    })

    def in_anomaly_window(frame, row):
        """Returns a mask of ``frame`` rows within the (inclusive) dates of anomaly ``row``."""
        return ((frame[TIME_COL] >= anomaly_df[START_DATE_COL][row])
                & (frame[TIME_COL] <= anomaly_df[END_DATE_COL][row]))

    # Adjusts one column (value_col)
    first_info = {
        "value_col": value_col,
        "anomaly_df": anomaly_df,
        "start_date_col": START_DATE_COL,
        "end_date_col": END_DATE_COL,
        "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
        "filter_by_dict": {
            METRIC_COL: dim_one
        },
        "adjustment_method": "add"
    }
    adjusted_one = get_canonical_data(
        df=pm_df,
        time_col=TIME_COL,
        value_col=value_col,
        anomaly_info=first_info)
    assert_equal(adjusted_one["df_before_adjustment"], unadjusted["df"])
    expected_df = unadjusted["df"].copy()
    # first anomaly: the delta is NaN
    expected_df.loc[in_anomaly_window(expected_df, 0), VALUE_COL] = np.nan
    # second anomaly: the delta is added
    expected_df.loc[in_anomaly_window(expected_df, 1), VALUE_COL] += 100.0
    assert_equal(adjusted_one["df"], expected_df)

    # Adjusts two columns
    value_col_two = "pres"  # second column to adjust
    second_info = {
        "value_col": value_col_two,
        "anomaly_df": anomaly_df,
        "start_date_col": START_DATE_COL,
        "end_date_col": END_DATE_COL,
        "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
        "filter_by_dict": {
            METRIC_COL: dim_two
        },
        "adjustment_method": "subtract"
    }
    adjusted_two = get_canonical_data(
        df=pm_df,
        time_col=TIME_COL,
        value_col=value_col,
        anomaly_info=[first_info, second_info])
    # third anomaly. The value is subtracted, according to `adjustment_method`.
    expected_df.loc[in_anomaly_window(expected_df, 2), value_col_two] -= -100.0
    assert_equal(adjusted_two["df_before_adjustment"], unadjusted["df"])
    assert_equal(adjusted_two["df"], expected_df)
def test_get_changepoints_dict():
    """Checks ``get_changepoints_dict`` for each ``method`` and for ``None``.

    Covers "auto" (converted to "custom" with detected dates), properties of
    the detected change points, ``None`` input, "custom" and "uniform"
    (returned unchanged), and the warning for unrecognized keys.
    """
    df_pt = DataLoader().load_peyton_manning()

    def run(changepoints_dict):
        """Calls ``get_changepoints_dict`` on the Peyton Manning data."""
        return get_changepoints_dict(
            df=df_pt,
            time_col="ts",
            value_col="y",
            changepoints_dict=changepoints_dict)

    # "auto" is turned into "custom" with detected dates
    auto_dict = {
        "method": "auto",
        "yearly_seasonality_order": 8,
        "resample_freq": "D",
        "trend_estimator": "ridge",
        "adaptive_lasso_initial_estimator": "ridge",
        "regularization_strength": None,
        "actual_changepoint_min_distance": "30D",
        "potential_changepoint_distance": None,
        "potential_changepoint_n": 100,
        "no_changepoint_distance_from_end": None,
        "no_changepoint_proportion_from_end": 0.0,
        "continuous_time_col": "ct1"
    }
    returned_dict, detector = run(auto_dict)
    assert returned_dict["method"] == "custom"
    assert len(returned_dict["dates"]) > 0
    assert returned_dict["continuous_time_col"] == "ct1"
    assert detector.trend_changepoints is not None

    # tests change point properties
    auto_dict_with_dates = {
        "method": "auto",
        "yearly_seasonality_order": 8,
        "resample_freq": "D",
        "trend_estimator": "ridge",
        "adaptive_lasso_initial_estimator": "ridge",
        "regularization_strength": None,
        "actual_changepoint_min_distance": "100D",
        "potential_changepoint_distance": "50D",
        "potential_changepoint_n": 100,
        "no_changepoint_distance_from_end": None,
        "no_changepoint_proportion_from_end": 0.3,
        "continuous_time_col": "ct1",
        "dates": ["2001-01-01", "2010-01-01"]
    }
    returned_dict, detector = run(auto_dict_with_dates)
    changepoint_dates = returned_dict["dates"]
    # checks no change points at the end
    first_ts = pd.to_datetime(df_pt["ts"].iloc[0])
    assert (changepoint_dates[-1] - first_ts) / \
           (pd.to_datetime(df_pt["ts"].iloc[-1]) - pd.to_datetime(df_pt["ts"].iloc[0])) <= 0.7
    # checks change point distance is good
    min_cp_dist = min(
        changepoint_dates[i] - changepoint_dates[i - 1]
        for i in range(1, len(changepoint_dates)))
    assert min_cp_dist >= timedelta(days=100)
    assert detector.trend_changepoints is not None
    # checks additional custom changepoints are added
    assert pd.to_datetime("2001-01-01") not in changepoint_dates  # out of range
    assert pd.to_datetime("2010-01-01") in changepoint_dates

    # tests for None
    returned_dict, detector = run(None)
    assert returned_dict is None
    assert detector is None

    # tests for "custom" and for uniform: the dict is returned as given
    # and no detector is created
    custom_dict = {
        "method": "custom",
        "dates": ["2020-01-01"]
    }
    uniform_dict = {
        "method": "uniform",
        "n_changepoints": 100
    }
    for passthrough_dict in (custom_dict, uniform_dict):
        returned_dict, detector = run(passthrough_dict)
        assert returned_dict == passthrough_dict
        assert detector is None

    # tests unused keys
    dict_with_unused_key = {
        "method": "auto",
        "unused_key": "value"
    }
    with pytest.warns(UserWarning) as record:
        run(dict_with_unused_key)
        assert (f"The following keys in ``changepoints_dict`` are not recognized\n"
                f"{['unused_key']}") in record[0].message.args[0]
Exemple #19
0
# ``2007-12-10`` and ``2016-01-20``.

# necessary imports
from datetime import datetime

import numpy as np
import plotly

from greykite.framework.input.univariate_time_series import UnivariateTimeSeries
from greykite.framework.constants import MEAN_COL_GROUP, OVERLAY_COL_GROUP
from greykite.common.constants import TIME_COL
from greykite.common.data_loader import DataLoader
from greykite.common.viz.timeseries_plotting import add_groupby_column, plot_multivariate, plot_univariate

# Loads the Peyton Manning dataset into a pandas DataFrame and gives the
# value column a more informative name
dl = DataLoader()
df = dl.load_peyton_manning()
df = df.rename(columns={"y": "log(pageviews)"})

# Wraps the data in a UnivariateTimeSeries and shows its plot
ts = UnivariateTimeSeries()
ts.load_data(
    df=df,
    time_col="ts",
    value_col="log(pageviews)",
    freq="D")
fig = ts.plot()
plotly.io.show(fig)

# %%
# Yearly seasonality
# ------------------
# Because the observations are at daily frequency,
# it is possible to see yearly, quarterly, monthly, and weekly seasonality.
def test_find_trend_changepoints(hourly_data):
    """Tests ``ChangepointDetector.find_trend_changepoints``.

    Covers, in order: attributes are ``None`` before fitting; fitting with
    default and with explicit arguments; the effect of
    ``regularization_strength``; ``potential_changepoint_distance``;
    excluding change points near the beginning/end of the data; argument
    validation errors and warnings; too-small input; input with missing
    dates; and ``yearly_seasonality_change_freq``.

    Parameters
    ----------
    hourly_data : `dict`
        Test input of which only the ``"df"`` entry is used here.
        Presumably a pytest fixture defined elsewhere -- TODO confirm.
    """
    df = hourly_data["df"]
    dl = DataLoader()
    df_pt = dl.load_peyton_manning()

    model = ChangepointDetector()
    # test class variables are initialized as None
    assert model.trend_model is None
    assert model.trend_coef is None
    assert model.trend_intercept is None
    assert model.trend_changepoints is None
    assert model.trend_potential_changepoint_n is None
    assert model.trend_df is None
    assert model.y is None
    assert model.original_df is None
    assert model.value_col is None
    assert model.time_col is None
    assert model.adaptive_lasso_coef is None
    # model training with default values
    model.find_trend_changepoints(
        df=df,
        time_col="ts",
        value_col="y"
    )
    assert isinstance(model.trend_model, RegressorMixin)
    # NOTE(review): the expected width looks like
    # (potential change points) + 1 + 2 * (yearly seasonality order),
    # i.e. 100 + 1 + 8 * 2 with the defaults -- confirm against the detector.
    assert model.trend_model.coef_.shape[0] == 100 + 1 + 8 * 2
    assert model.trend_coef.shape[0] == 100 + 1 + 8 * 2
    assert model.trend_intercept is not None
    assert model.trend_changepoints is not None
    assert model.trend_potential_changepoint_n == 100
    assert model.trend_df.shape[1] == 100 + 1 + 8 * 2
    assert model.original_df.shape == df.shape
    assert model.time_col is not None
    assert model.value_col is not None
    assert model.adaptive_lasso_coef[1].shape[0] == 100 + 1 + 8 * 2
    assert model.y.index[0] not in model.trend_changepoints
    # model training with given values
    model = ChangepointDetector()
    model.find_trend_changepoints(
        df=df,
        time_col="ts",
        value_col="y",
        potential_changepoint_n=50,
        yearly_seasonality_order=6,
        resample_freq="2D",
        trend_estimator="lasso",
        adaptive_lasso_initial_estimator="ols"
    )
    assert isinstance(model.trend_model, RegressorMixin)
    assert model.trend_model.coef_.shape[0] == 50 + 1 + 6 * 2
    assert model.trend_coef.shape[0] == 50 + 1 + 6 * 2
    assert model.trend_intercept is not None
    assert model.trend_changepoints is not None
    assert model.trend_potential_changepoint_n == 50
    assert model.trend_df.shape[1] == 50 + 1 + 6 * 2
    assert model.original_df.shape == df.shape
    assert model.time_col is not None
    assert model.value_col is not None
    assert model.adaptive_lasso_coef[1].shape[0] == 50 + 1 + 6 * 2
    assert model.y.index[0] not in model.trend_changepoints
    # test a given ``regularization_strength``
    model = ChangepointDetector()
    model.find_trend_changepoints(
        df=df,
        time_col="ts",
        value_col="y",
        regularization_strength=1.0
    )
    assert isinstance(model.trend_model, RegressorMixin)
    assert model.trend_model.coef_.shape[0] == 100 + 1 + 8 * 2
    assert model.trend_coef.shape[0] == 100 + 1 + 8 * 2
    assert model.trend_intercept is not None
    assert model.trend_changepoints is not None
    assert model.trend_potential_changepoint_n == 100
    assert model.trend_df.shape[1] == 100 + 1 + 8 * 2
    assert model.original_df.shape == df.shape
    assert model.time_col is not None
    assert model.value_col is not None
    assert model.adaptive_lasso_coef[1].shape[0] == 100 + 1 + 8 * 2
    assert model.y.index[0] not in model.trend_changepoints
    # ``regularization_strength`` == 1.0 indicates no change point
    assert model.trend_changepoints == []
    # the same ``model`` instance is reused for the next two calls
    model.find_trend_changepoints(
        df=df,
        time_col="ts",
        value_col="y",
        regularization_strength=0.5
    )
    # ``regularization_strength`` between 0 and 1 indicates at least one change point
    assert len(model.trend_changepoints) > 0
    model.find_trend_changepoints(
        df=df,
        time_col="ts",
        value_col="y",
        actual_changepoint_min_distance="D",
        regularization_strength=0.0
    )
    # ``regularization_strength`` == 0.0 indicates all potential change points are present
    assert len(model.trend_changepoints) == 100
    # test `potential_changepoint_distance`
    model = ChangepointDetector()
    model.find_trend_changepoints(
        df=df,
        time_col="ts",
        value_col="y",
        potential_changepoint_distance="100D"
    )
    # test override `potential_changepoint_n`
    # df has length 500 days, with distance "100D", only 4 change points are placed.
    assert model.trend_potential_changepoint_n == 4
    # a unit larger than 'D' ("2M") is rejected
    with pytest.raises(ValueError,
                       match="In potential_changepoint_distance, the maximal unit is 'D', "
                             "i.e., you may use units no more than 'D' such as"
                             "'10D', '5H', '100T', '200S'. The reason is that 'W', 'M' "
                             "or higher has either cycles or indefinite number of days, "
                             "thus is not parsable by pandas as timedelta."):
        model.find_trend_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            potential_changepoint_distance="2M"
        )
    # test `no_changepoint_distance_from_end` with the Peyton Manning data
    # (`no_changepoint_distance_from_begin` is set to the same distance)
    model = ChangepointDetector()
    res = model.find_trend_changepoints(
        df=df_pt,
        time_col="ts",
        value_col="y",
        no_changepoint_distance_from_begin="730D",
        no_changepoint_distance_from_end="730D",
        regularization_strength=0
    )
    changepoints = res["trend_changepoints"]
    # test override `no_changepoint_proportion_from_end` and no change points in the last piece
    # 730 days expressed as a proportion of the total time span of the data
    no_changepoint_proportion_from_end = timedelta(days=730) / (
            pd.to_datetime(df_pt["ts"].iloc[-1]) - pd.to_datetime(df_pt["ts"].iloc[0]))
    last_date_to_have_changepoint = pd.to_datetime(df_pt["ts"].iloc[int(
        df_pt.shape[0] * (1 - no_changepoint_proportion_from_end))])
    first_date_to_have_changepoint = pd.to_datetime(df_pt["ts"].iloc[int(
        df_pt.shape[0] * no_changepoint_proportion_from_end)])
    assert changepoints[-1] <= last_date_to_have_changepoint
    assert changepoints[0] >= first_date_to_have_changepoint
    # test value error
    with pytest.raises(ValueError,
                       match="In no_changepoint_distance_from_end, the maximal unit is 'D', "
                             "i.e., you may use units no more than 'D' such as"
                             "'10D', '5H', '100T', '200S'. The reason is that 'W', 'M' "
                             "or higher has either cycles or indefinite number of days, "
                             "thus is not parsable by pandas as timedelta."):
        model.find_trend_changepoints(
            df=df_pt,
            time_col="ts",
            value_col="y",
            no_changepoint_distance_from_end="2M"
        )
    # test `no_changepoint_proportion_from_end` and `actual_changepoint_min_distance`
    # generates a df with trend change points, ensuring we detect change points
    df_trend = generate_test_changepoint_df()
    model = ChangepointDetector()
    res = model.find_trend_changepoints(
        df=df_trend,
        time_col="ts",
        value_col="y",
        potential_changepoint_n=50,
        yearly_seasonality_order=0,
        adaptive_lasso_initial_estimator='lasso',
        no_changepoint_proportion_from_end=0.3,
        actual_changepoint_min_distance="10D"
    )
    changepoints = res["trend_changepoints"]
    # last changepoint in first 70% data
    assert changepoints[-1] <= df_trend["ts"][int(df_trend.shape[0] * 0.7)]
    # consecutive change points are at least "10D" apart
    assert all((changepoints[i + 1] - changepoints[i] >= to_offset("10D")) for i in range(len(changepoints) - 1))
    # test the asserts above are violated when not specifying `no_changepoint_proportion_from_end`
    model = ChangepointDetector()
    res = model.find_trend_changepoints(
        df=df_trend,
        time_col="ts",
        value_col="y",
        potential_changepoint_n=50,
        yearly_seasonality_order=0,
        adaptive_lasso_initial_estimator='ridge',
        no_changepoint_proportion_from_end=0.0,
        actual_changepoint_min_distance="1D"
    )
    changepoints = res["trend_changepoints"]
    # last changepoint after first 70% data
    assert changepoints[-1] > df_trend["ts"][int(df_trend.shape[0] * 0.7)]
    # negative potential_changepoint_n
    model = ChangepointDetector()
    with pytest.raises(ValueError, match="potential_changepoint_n can not be negative. "
                                         "A large number such as 100 is recommended"):
        model.find_trend_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            potential_changepoint_n=-1
        )
    # negative year_seasonality_order
    model = ChangepointDetector()
    with pytest.raises(ValueError, match="year_seasonality_order can not be negative. "
                                         "A number less than or equal to 10 is recommended"):
        model.find_trend_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            yearly_seasonality_order=-1
        )
    # negative regularization_strength
    with pytest.raises(ValueError, match="regularization_strength must be between 0.0 and 1.0."):
        model.find_trend_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            regularization_strength=-1
        )
    # estimator parameter combination not valid warning
    # (in the three blocks below the message assert runs inside the ``with``
    # block, after the call, and inspects the first recorded warning)
    with pytest.warns(UserWarning) as record:
        model = ChangepointDetector()
        model.find_trend_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            trend_estimator="something"
        )
        assert "trend_estimator not in ['ridge', 'lasso', 'ols'], " \
               "estimating using ridge" in record[0].message.args[0]
    with pytest.warns(UserWarning) as record:
        model = ChangepointDetector()
        model.find_trend_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            trend_estimator="ols",
            yearly_seasonality_order=8
        )
        assert "trend_estimator = 'ols' with year_seasonality_order > 0 may create " \
               "over-fitting, trend_estimator has been set to 'ridge'." in record[0].message.args[0]
    with pytest.warns(UserWarning) as record:
        model = ChangepointDetector()
        model.find_trend_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            adaptive_lasso_initial_estimator="something"
        )
        assert "adaptive_lasso_initial_estimator not in ['ridge', 'lasso', 'ols'], " \
               "estimating with ridge" in record[0].message.args[0]
    # df sample size too small
    df = pd.DataFrame(
        data={
            "ts": pd.date_range(start='2020-1-1', end='2020-1-3', freq='D'),
            "y": [1, 2, 3]
        }
    )
    model = ChangepointDetector()
    with pytest.raises(ValueError, match="Change point detector does not work for less than "
                                         "5 observations. Please increase sample size."):
        model.find_trend_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
        )
    # test when training data has missing dates, the model drops na from resample
    # (five observations, one every second day)
    df = pd.DataFrame(
        data={
            "ts": pd.date_range(start='2020-1-1', end='2020-1-9', freq='2D'),
            "y": [1, 2, 3, 4, 5]
        }
    )
    model = ChangepointDetector()
    model.find_trend_changepoints(
        df=df,
        time_col="ts",
        value_col="y"
    )
    assert model.y.isnull().sum().sum() == 0
    assert model.y.shape[0] == 5
    # tests varying yearly seasonality effect
    model = ChangepointDetector()
    model.find_trend_changepoints(
        df=df_pt,
        time_col="ts",
        value_col="y",
        yearly_seasonality_change_freq="365D"
    )
    assert model.trend_df.shape[1] > 100 + 1 + 8 * 2  # checks extra columns are created for varying yearly seasonality
def test_find_seasonality_changepoints(hourly_data):
    """Tests ``ChangepointDetector.find_seasonality_changepoints``.

    Scenarios exercised, in order:

        - user-provided ``seasonality_components_df`` with ``resample_freq="2D"``
        - ``regularization_strength`` of 1.0 and of 0.1
        - ``no_changepoint_distance_from_end``
        - daily input data
        - re-running on a detector that already holds a trend estimation
        - invalid ``potential_changepoint_n`` / ``regularization_strength``
        - ``regularization_strength=None`` warning
        - too few observations
        - user-provided ``trend_changepoints``

    Parameters
    ----------
    hourly_data : `dict`
        Fixture. Only its ``"df"`` entry is read; it is passed on with
        ``time_col="ts"`` and ``value_col="y"``.
    """
    def warned(record, text):
        """Returns whether any recorded warning message contains ``text``.

        Searching every recorded warning (instead of only ``record[0]``) keeps
        the check valid when an unrelated warning happens to be recorded first.
        """
        return any(text in str(w.message) for w in record)

    # Message expected whenever a detector that already has a trend estimation
    # is asked for seasonality change points again.
    trend_reuse_warning = (
        "Trend changepoints are already identified, using past trend estimation. "
        "If you would like to run trend change point detection again, "
        "please call ``find_trend_changepoints`` with desired parameters "
        "before calling ``find_seasonality_changepoints``.")

    df = hourly_data["df"]
    df_pt = DataLoader().load_peyton_manning()

    # model training with given values
    model = ChangepointDetector()
    model.find_seasonality_changepoints(
        df=df,
        time_col="ts",
        value_col="y",
        potential_changepoint_n=80,
        resample_freq="2D",
        seasonality_components_df=pd.DataFrame({
            "name": ["tod", "tow", "conti_year"],
            "period": [24.0, 7.0, 1.0],
            "order": [3, 4, 5],
            "seas_names": ["daily", "weekly", "yearly"]})
    )
    # resample frequency is "2D", daily component is automatically removed from
    # seasonality_components_df
    # presumably 18 = 2 * (4 + 5) Fourier terms and 81 = 80 change points + 1 -- TODO confirm
    assert model.seasonality_df.shape[1] == 18 * 81
    assert model.seasonality_changepoints is not None
    assert model.seasonality_estimation is not None
    assert model.seasonality_estimation.shape[0] == df.shape[0]

    # test a given ``regularization_strength``
    model = ChangepointDetector()
    model.find_seasonality_changepoints(
        df=df,
        time_col="ts",
        value_col="y",
        regularization_strength=1.0
    )
    # ``regularization_strength`` == 1.0 indicates no change point
    assert all(cps == [] for cps in model.seasonality_changepoints.values())
    # the same detector instance is deliberately reused here with another df
    model.find_seasonality_changepoints(
        df=df_pt,
        time_col="ts",
        value_col="y",
        regularization_strength=0.1
    )
    # ``regularization_strength`` between 0 and 1 indicates at least one change point
    assert any(cps != [] for cps in model.seasonality_changepoints.values())

    # test `no_changepoint_distance_from_end` with the Peyton Manning data
    model = ChangepointDetector()
    res = model.find_seasonality_changepoints(
        df=df_pt,
        time_col="ts",
        value_col="y",
        no_changepoint_distance_from_end="730D",
        regularization_strength=0.1
    )
    # pools the change points of every seasonality component
    changepoints = [
        cp for cps in res["seasonality_changepoints"].values() for cp in cps]
    # test override `no_changepoint_proportion_from_end` and no change points in the last piece
    first_ts = pd.to_datetime(df_pt["ts"].iloc[0])
    last_ts = pd.to_datetime(df_pt["ts"].iloc[-1])
    no_changepoint_proportion_from_end = timedelta(days=730) / (last_ts - first_ts)
    cutoff_index = int(df_pt.shape[0] * (1 - no_changepoint_proportion_from_end))
    last_date_to_have_changepoint = pd.to_datetime(df_pt["ts"].iloc[cutoff_index])
    # The pooled list is grouped by component, not sorted, so the latest change
    # point is ``max(changepoints)`` rather than ``changepoints[-1]``.
    assert changepoints
    assert max(changepoints) <= last_date_to_have_changepoint

    # test daily data automatically drops daily seasonality components
    cd = ChangepointDetector()
    res = cd.find_seasonality_changepoints(
        df=df_pt,
        time_col="ts",
        value_col="y"
    )
    assert "daily" not in res["seasonality_changepoints"]

    # test feeding the same df with different column names will not rerun trend estimation
    df2 = df_pt.copy().rename(columns={"ts": "ts2", "y": "y2"})
    cd = ChangepointDetector()
    cd.find_seasonality_changepoints(
        df=df_pt,
        time_col="ts",
        value_col="y"
    )
    with pytest.warns(UserWarning) as record:
        cd.find_seasonality_changepoints(
            df=df2,
            time_col="ts2",
            value_col="y2"
        )
    assert warned(record, trend_reuse_warning)
    # column names from the first call are kept
    assert cd.time_col == "ts"
    assert cd.value_col == "y"

    # negative potential_changepoint_n
    model = ChangepointDetector()
    with pytest.raises(ValueError, match="potential_changepoint_n can not be negative. "
                                         "A large number such as 50 is recommended"):
        model.find_seasonality_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            potential_changepoint_n=-1
        )
    # negative regularization_strength
    with pytest.raises(ValueError, match="regularization_strength must be between 0.0 and 1.0."):
        model.find_seasonality_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            regularization_strength=-1
        )
    # test regularization_strength == None warning
    with pytest.warns(UserWarning) as record:
        model.find_seasonality_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            regularization_strength=None
        )
    assert warned(
        record,
        "regularization_strength is set to None. This will trigger cross-validation to "
        "select the tuning parameter which might result in too many change points. "
        "Keep the default value or tuning around it is recommended.")

    # test existing trend estimation warning
    model = ChangepointDetector()
    model.find_trend_changepoints(
        df=df,
        time_col="ts",
        value_col="y"
    )
    with pytest.warns(UserWarning) as record:
        model.find_seasonality_changepoints(
            df=df,
            time_col="ts",
            value_col="y"
        )
    assert warned(record, trend_reuse_warning)

    # df sample size too small
    df_small = pd.DataFrame(
        data={
            "ts": pd.date_range(start='2020-1-1', end='2020-1-3', freq='D'),
            "y": [1, 2, 3]
        }
    )
    model = ChangepointDetector()
    with pytest.raises(ValueError, match="Change point detector does not work for less than "
                                         "5 observations. Please increase sample size."):
        model.find_seasonality_changepoints(
            df=df_small,
            time_col="ts",
            value_col="y",
        )

    # tests given trend changepoints
    given_trend_changepoints = list(pd.to_datetime(["2016-01-01", "2017-02-05"]))
    cd = ChangepointDetector()
    cd.find_seasonality_changepoints(
        df=df_pt,
        time_col="ts",
        value_col="y",
        trend_changepoints=list(pd.to_datetime(["2016-01-01", "2017-02-05"]))
    )
    assert cd.trend_changepoints == given_trend_changepoints
    assert cd.original_df is not None
    assert cd.trend_estimation is not None
    assert cd.y is not None
    assert cd.time_col == "ts"
    assert cd.value_col == "y"
Exemple #22
0
`here <https://facebook.github.io/prophet/docs/quick_start.html>`_.
"""

import warnings

# Ignores every warning from this point on. The call is placed before the
# greykite imports, so warnings emitted while importing them are ignored too.
warnings.filterwarnings("ignore")

from greykite.common.data_loader import DataLoader
from greykite.framework.templates.autogen.forecast_config import ForecastConfig
from greykite.framework.templates.autogen.forecast_config import MetadataParam
from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.framework.templates.forecaster import Forecaster

# Loads the Peyton Manning dataset into a pandas DataFrame
dl = DataLoader()
df = dl.load_peyton_manning()

# %%
# Then we create a forecast model with ``SILVERKITE`` template.
# For a simple example of creating a forecast model, see
# `Simple Forecast <./0100_simple_forecast.html>`_.
# For a detailed tuning tutorial, see
# `Forecast Model Tuning <../tutorials/0100_forecast_tutorial.html>`_.

# Specifies dataset information: the time column, the value column
# and the frequency of the data
metadata = MetadataParam(
    time_col="ts",  # name of the time column
    value_col="y",  # name of the value column
    freq="D"  # "H" for hourly, "D" for daily, "W" for weekly, etc.
)
Exemple #23
0
def df_pt():
    """Loads and returns the Peyton Manning pageview data via ``DataLoader``."""
    return DataLoader().load_peyton_manning()