def test_get_df():
    """Checks `DataLoader.get_df` on daily and hourly data, and the error for an unknown file."""
    loader = DataLoader()

    # Daily data
    daily_path = loader.get_data_home(data_dir=None, data_sub_dir="daily")
    daily_df = loader.get_df(data_path=daily_path, data_name="daily_peyton_manning")
    assert list(daily_df.columns) == ["ts", "y"]
    assert daily_df.shape == (2905, 2)

    # Hourly data
    hourly_path = loader.get_data_home(data_dir=None, data_sub_dir="hourly")
    hourly_df = loader.get_df(data_path=hourly_path, data_name="hourly_parking")
    expected_columns = ["SystemCodeNumber", "Capacity", "Occupancy", "LastUpdated"]
    assert list(hourly_df.columns) == expected_columns
    assert hourly_df.shape == (35717, 4)

    # Error due to wrong file name: "parking" is not a file in the daily folder
    data_path = loader.get_data_home(data_dir=None, data_sub_dir="daily")
    file_path = os.path.join(data_path, "parking.csv")
    file_names = loader.get_data_names(data_path=data_path)
    expected_message = (
        fr"Given file path '{file_path}' is not found. Available datasets "
        fr"in data directory '{data_path}' are \{file_names}\.")
    with pytest.raises(ValueError, match=expected_message):
        loader.get_df(data_path=data_path, data_name="parking")
def test_benchmark_silverkite_template_with_real_data():
    """Runs `benchmark_silverkite_template` on the daily female births data
    and checks the fields of the first returned result."""
    # every list holds a single item to speed up the test case
    metric = EvaluationMetricEnum.MeanSquaredError
    evaluation_metric = EvaluationMetricParam(cv_selection_metric=metric.name)

    # real data
    loader = DataLoader()
    data_name = "daily_female_births"
    births_df = loader.get_df(
        data_path=loader.get_data_home(data_sub_dir="daily"),
        data_name="daily_female_births")
    metadata = MetadataParam(time_col="Date", value_col="Births", freq="D")

    results = benchmark_silverkite_template(
        data_name=data_name,
        df=births_df,
        metadata=metadata,
        evaluation_metric=evaluation_metric,
        forecast_horizons=[30],
        fit_algorithms=["linear"],
        max_cvs=[3])

    # only the first benchmark result is inspected
    first_result = results[0]
    assert first_result["data_name"] == data_name
    assert first_result["forecast_model_name"] == "silverkite_linear"
    assert first_result["train_period"] == births_df.shape[0]
    assert first_result["forecast_horizon"] == 30
    assert first_result["cv_folds"] == 3
def test_estimator_get_coef_summary_from_forecaster():
    """Checks the coefficient summary filters of a silverkite model
    after everything is set up by `Forecaster`."""
    peyton_df = DataLoader().load_peyton_manning()
    config_dict = {
        "model_template": ModelTemplateEnum.SILVERKITE.name,
        "forecast_horizon": 10,
        "metadata_param": {"time_col": "ts", "value_col": "y", "freq": "D"},
        "model_components_param": {
            "custom": {"fit_algorithm_dict": {"fit_algorithm": "linear"}}
        },
    }
    config = ForecastConfig().from_dict(config_dict)
    result = Forecaster().run_forecast_config(
        df=peyton_df[:365],  # shortens df to speed up
        config=config)
    summary = result.model[-1].summary()

    intercept_df = summary.get_coef_summary(is_intercept=True, return_df=True)
    assert intercept_df.shape[0] == 1

    # These filters are only called; their output is not checked
    for flag in ("is_time_feature", "is_event", "is_trend", "is_interaction"):
        summary.get_coef_summary(**{flag: True})

    lag_summary = summary.get_coef_summary(is_lag=True)
    assert lag_summary is None

    trend_df = summary.get_coef_summary(
        is_trend=True,
        is_seasonality=False,
        is_interaction=False,
        return_df=True)
    pred_cols = trend_df["Pred_col"].tolist()
    # no predictor name may contain ":"
    assert not any(":" in col for col in pred_cols)
    assert "ct1" in pred_cols
    assert "sin1_ct1_yearly" not in pred_cols

    full_df = summary.get_coef_summary(return_df=True)
    assert full_df.shape[0] == summary.info_dict["coef_summary_df"].shape[0]
def test_load_hourly_beijing_pm():
    """Checks column names and shape of the hourly Beijing PM data."""
    # NOTE(review): a function with this same name is defined again further
    # down; if both live in one module the later one replaces this one - confirm.
    pm_df = DataLoader().load_beijing_pm()
    expected_columns = [
        "ts", "year", "month", "day", "hour",
        "pm", "dewp", "temp", "pres", "cbwd",
        "iws", "is", "ir",
    ]
    assert list(pm_df.columns) == expected_columns
    assert pm_df.shape == (43824, 13)
def test_load_hourly_parking():
    """Checks `load_parking` with and without a `system_code_number`."""
    loader = DataLoader()

    # No system code
    all_df = loader.load_parking(system_code_number=None)
    assert list(all_df.columns) == [
        "LastUpdated", "Capacity", "Occupancy", "OccupancyRatio"]
    assert all_df.shape == (1328, 4)

    # A single system code
    single_df = loader.load_parking(system_code_number="NIA South")
    assert list(single_df.columns) == [
        "SystemCodeNumber", "Capacity", "Occupancy", "LastUpdated", "OccupancyRatio"]
    assert single_df.shape == (1204, 5)
def test_get_data_inventory():
    """Checks that `get_data_inventory` returns exactly the expected dataset names."""
    expected_names = {
        # other
        "online_retail",
        # minute-level
        "minute_energy_appliance",
        "minute_household_power",
        "minute_yosemite_temps",
        # hourly
        "hourly_parking",
        "hourly_traffic_volume",
        "hourly_bikesharing",
        "hourly_beijing_pm",
        # daily
        "daily_temperature_australia",
        "daily_demand_order",
        "daily_female_births",
        "daily_istanbul_stock",
        "daily_peyton_manning",
        # monthly
        "monthly_shampoo",
        "monthly_sunspot",
    }
    inventory = DataLoader().get_data_inventory()
    # order is ignored, only membership is compared
    assert set(inventory) == expected_names
def test_load_data():
    """Checks that `load_data` matches the dataset-specific loaders,
    and raises on an unknown data name."""
    loader = DataLoader()

    # Dispatch by name, no extra arguments
    assert_equal(
        loader.load_data(data_name="daily_peyton_manning"),
        loader.load_peyton_manning())

    # Dispatch by name, extra keyword argument passed to `load_data`
    assert_equal(
        loader.load_data(data_name="hourly_parking", system_code_number="Shopping"),
        loader.load_parking(system_code_number="Shopping"))

    # Error due to unavailable data name
    data_name = "dummy"
    data_inventory = loader.get_data_inventory()
    expected_message = (
        fr"Input data name '{data_name}' is not recognized. "
        fr"Must be one of \{data_inventory}\.")
    with pytest.raises(ValueError, match=expected_message):
        loader.load_data(data_name=data_name)
def test_estimator_plot_components_from_forecaster():
    """Checks the estimator's `plot_components` after `Forecaster`
    has set everything up at the top most level."""
    # Real data (female births) fit via the SILVERKITE model template
    loader = DataLoader()
    births_df = loader.get_df(
        data_path=loader.get_data_home(data_sub_dir="daily"),
        data_name="daily_female_births")

    seasonality = {
        "yearly_seasonality": True,
        "quarterly_seasonality": True,
        "weekly_seasonality": True,
        "daily_seasonality": False
    }
    config = ForecastConfig(
        model_template=ModelTemplateEnum.SILVERKITE.name,
        forecast_horizon=30,  # forecast 1 month
        coverage=0.95,  # 95% prediction intervals
        metadata_param=MetadataParam(time_col="Date", value_col="Births", freq="D"),
        model_components_param=ModelComponentsParam(seasonality=seasonality))
    result = Forecaster().run_forecast_config(df=births_df, config=config)

    # last element of the last pipeline step
    final_step = result.model.steps[-1]
    estimator = final_step[-1]
    assert estimator.plot_components()
def test_get_data_names():
    """Checks `get_data_names` on the top-level data folder and on the daily sub-folder."""
    loader = DataLoader()

    # The top-level data folder is expected to yield an empty list of names
    root_names = loader.get_data_names(data_path=loader.get_data_home())
    assert root_names == []

    # The daily sub-folder holds the daily datasets (order is ignored)
    daily_path = loader.get_data_home(data_sub_dir="daily")
    daily_names = loader.get_data_names(data_path=daily_path)
    expected_daily = {
        "daily_temperature_australia",
        "daily_demand_order",
        "daily_female_births",
        "daily_istanbul_stock",
        "daily_peyton_manning",
    }
    assert set(daily_names) == expected_daily
def test_get_data_home():
    """Checks the folder returned by `get_data_home` and its error for a missing folder."""
    loader = DataLoader()

    def last_folder(path):
        """Returns the final component of the normalized ``path``."""
        return os.path.basename(os.path.normpath(path))

    # Default parameters
    assert last_folder(loader.get_data_home()) == "data"

    # With subdirectory
    assert last_folder(loader.get_data_home(data_sub_dir="daily")) == "daily"

    # Error due to non existing folder
    data_dir = "/home/data"
    expected_message = f"Requested data directory '{data_dir}' does not exist."
    with pytest.raises(ValueError, match=expected_message):
        loader.get_data_home(data_dir=data_dir)
def test_load_hourly_bikesharing():
    """Checks `load_bikesharing` without aggregation and at daily, weekly
    and monthly aggregation."""
    loader = DataLoader()

    # No aggregation
    raw_df = loader.load_bikesharing()
    assert list(raw_df.columns) == ["date", TIME_COL, "count", "tmin", "tmax", "pn"]
    assert raw_df.shape == (78421, 6)

    # Aggregated: one column per aggregated field plus `TIME_COL`
    agg_func = {"count": "sum", "tmin": "min", "tmax": "max", "pn": "mean"}
    expected_rows = (("daily", 3269), ("weekly", 468), ("monthly", 109))
    for agg_freq, n_rows in expected_rows:
        agg_df = loader.load_bikesharing(agg_freq=agg_freq, agg_func=agg_func)
        assert TIME_COL in agg_df.columns
        assert agg_df.shape == (n_rows, len(agg_func) + 1)
def test_get_aggregated_data():
    """Checks `get_aggregated_data` at daily, weekly and monthly frequency,
    and its error when the `TIME_COL` column is missing."""
    loader = DataLoader()
    # Hourly rows with constant value columns
    hourly_index = pd.date_range("2020-01-01 00:00", "2020-12-31 23:00", freq="1H")
    test_df = pd.DataFrame({
        TIME_COL: hourly_index,
        "col1": 1,
        "col2": 2,
        "col3": 3,
        "col4": 4,
        "col5": 5,
    })
    agg_func = {"col1": "sum", "col2": "mean", "col3": "median", "col4": "min", "col5": "max"}
    n_cols = len(agg_func) + 1  # aggregated columns plus `TIME_COL`
    unchanged_values = {"col2": 2, "col3": 3, "col4": 4, "col5": 5}

    def assert_constants_preserved(agg_df):
        """mean / median / min / max of a constant column must equal that constant."""
        for col, value in unchanged_values.items():
            assert (agg_df[col] != value).sum() == 0

    # For each frequency, the shape check covers the `TIME_COL` column
    # and the value checks cover the aggregation itself.

    # Daily aggregation: every day sums 24 hourly ones
    daily_df = loader.get_aggregated_data(test_df, agg_freq="daily", agg_func=agg_func)
    assert daily_df.shape == (366, n_cols)
    assert (daily_df["col1"] != 24).sum() == 0
    assert_constants_preserved(daily_df)

    # Weekly aggregation: exactly two weeks are expected to be incomplete
    weekly_df = loader.get_aggregated_data(test_df, agg_freq="weekly", agg_func=agg_func)
    assert weekly_df.shape == (53, n_cols)
    assert (weekly_df["col1"] != 24*7).sum() == 2
    assert_constants_preserved(weekly_df)

    # Monthly aggregation: months have 29, 30 or 31 days
    monthly_df = loader.get_aggregated_data(test_df, agg_freq="monthly", agg_func=agg_func)
    assert monthly_df.shape == (12, n_cols)
    assert (monthly_df["col1"].isin([24*29, 24*30, 24*31])).sum() == 12
    assert_constants_preserved(monthly_df)

    # Missing time column raises an error that mentions `TIME_COL`
    no_time_df = test_df.drop(columns=[TIME_COL])
    with pytest.raises(ValueError, match=f"{TIME_COL}"):
        loader.get_aggregated_data(no_time_df, agg_freq="monthly", agg_func=agg_func)
def test_load_hourly_beijing_pm():
    """Checks `load_beijing_pm` without aggregation and at daily, weekly
    and monthly aggregation."""
    loader = DataLoader()

    # No aggregation
    raw_df = loader.load_beijing_pm()
    expected_columns = [
        TIME_COL, "year", "month", "day", "hour",
        "pm", "dewp", "temp", "pres", "cbwd",
        "iws", "is", "ir"]
    assert list(raw_df.columns) == expected_columns
    assert raw_df.shape == (43824, 13)

    # Aggregated: one column per aggregated field plus `TIME_COL`
    agg_func = {
        "pm": "mean",
        "dewp": "mean",
        "temp": "max",
        "pres": "mean",
        "iws": "sum",
        "is": "sum",
        "ir": "sum",
    }
    expected_rows = (("daily", 1826), ("weekly", 262), ("monthly", 60))
    for agg_freq, n_rows in expected_rows:
        agg_df = loader.load_beijing_pm(agg_freq=agg_freq, agg_func=agg_func)
        assert TIME_COL in agg_df.columns
        assert agg_df.shape == (n_rows, len(agg_func) + 1)
def test_load_hourly_bikesharing():
    """Checks column names and shape of the hourly bikesharing data."""
    # NOTE(review): a function with this same name is defined earlier as well;
    # if both live in one module this definition replaces the earlier one - confirm.
    bike_df = DataLoader().load_bikesharing()
    expected_columns = ["date", "ts", "count", "tmin", "tmax", "pn"]
    assert list(bike_df.columns) == expected_columns
    assert bike_df.shape == (78421, 6)
def test_load_peyton_manning():
    """Checks column names and shape of the Peyton Manning data."""
    peyton_df = DataLoader().load_peyton_manning()
    expected_columns = [TIME_COL, "y"]
    assert list(peyton_df.columns) == expected_columns
    assert peyton_df.shape == (2905, 2)
def test_init():
    """Checks that `available_datasets` equals the data inventory after construction."""
    loader = DataLoader()
    inventory = loader.get_data_inventory()
    assert loader.available_datasets == inventory
def test_gcd_load_data_anomaly():
    """Checks anomaly_info parameter

    Runs `get_canonical_data` on the Beijing PM data three times:
    without `anomaly_info`, with a single `anomaly_info` dict, and with
    a list of two `anomaly_info` dicts. The adjusted output is compared
    against an expected frame built by hand from ``anomaly_df``.
    """
    dl = DataLoader()
    df = dl.load_beijing_pm()
    value_col = "pm"

    # no anomaly adjustment
    canonical_data_dict = get_canonical_data(
        df=df,
        time_col=TIME_COL,
        value_col=value_col)
    assert canonical_data_dict["df_before_adjustment"] is None

    dim_one = "one"
    dim_two = "two"
    # Three anomalies; the first two are tagged `dim_one`, the third `dim_two`
    anomaly_df = pd.DataFrame({
        START_DATE_COL: ["2011-04-04-10", "2011-10-10-00", "2012-12-20-10"],
        END_DATE_COL: ["2011-04-05-20", "2011-10-11-23", "2012-12-20-13"],
        ADJUSTMENT_DELTA_COL: [np.nan, 100.0, -100.0],
        METRIC_COL: [dim_one, dim_one, dim_two]  # used to filter rows in this df
    })

    # Adjusts one column (value_col)
    anomaly_info = {
        "value_col": value_col,
        "anomaly_df": anomaly_df,
        "start_date_col": START_DATE_COL,
        "end_date_col": END_DATE_COL,
        "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
        "filter_by_dict": {
            METRIC_COL: dim_one
        },
        "adjustment_method": "add"
    }
    canonical_data_dict2 = get_canonical_data(
        df=df,
        time_col=TIME_COL,
        value_col=value_col,
        anomaly_info=anomaly_info)
    # The pre-adjustment frame must equal the unadjusted result from above
    assert_equal(canonical_data_dict2["df_before_adjustment"], canonical_data_dict["df"])
    expected_df = canonical_data_dict["df"].copy()
    # first anomaly
    # (its delta is NaN, so the expected values in that window are NaN)
    idx = ((expected_df[TIME_COL] >= anomaly_df[START_DATE_COL][0])
           & (expected_df[TIME_COL] <= anomaly_df[END_DATE_COL][0]))
    expected_df.loc[idx, VALUE_COL] = np.nan
    # second anomaly
    # (delta 100.0 with method "add")
    idx = ((expected_df[TIME_COL] >= anomaly_df[START_DATE_COL][1])
           & (expected_df[TIME_COL] <= anomaly_df[END_DATE_COL][1]))
    expected_df.loc[idx, VALUE_COL] += 100.0
    assert_equal(canonical_data_dict2["df"], expected_df)

    # Adjusts two columns
    value_col_two = "pres"  # second column to adjust
    # The first entry reuses the dict above; the second filters on `dim_two`
    anomaly_info = [
        anomaly_info,
        {
            "value_col": value_col_two,
            "anomaly_df": anomaly_df,
            "start_date_col": START_DATE_COL,
            "end_date_col": END_DATE_COL,
            "adjustment_delta_col": ADJUSTMENT_DELTA_COL,
            "filter_by_dict": {
                METRIC_COL: dim_two
            },
            "adjustment_method": "subtract"
        }
    ]
    canonical_data_dict3 = get_canonical_data(
        df=df,
        time_col=TIME_COL,
        value_col=value_col,
        anomaly_info=anomaly_info)
    # third anomaly. The value is subtracted, according to `adjustment_method`.
    # (the delta is -100.0, so subtracting it raises the expected value by 100.0)
    idx = ((expected_df[TIME_COL] >= anomaly_df[START_DATE_COL][2])
           & (expected_df[TIME_COL] <= anomaly_df[END_DATE_COL][2]))
    expected_df.loc[idx, value_col_two] -= -100.0
    assert_equal(canonical_data_dict3["df_before_adjustment"], canonical_data_dict["df"])
    assert_equal(canonical_data_dict3["df"], expected_df)
def test_get_changepoints_dict():
    """Checks `get_changepoints_dict` on the Peyton Manning data.

    Covers the "auto" method (with and without extra custom dates),
    a ``None`` input, the "custom" and "uniform" methods, and the
    warning for unrecognized keys.
    """
    dl = DataLoader()
    df_pt = dl.load_peyton_manning()

    # "auto" method: the returned dict is expected to use method "custom"
    # with a non-empty list of dates, and a detector is returned alongside
    changepoints_dict = {
        "method": "auto",
        "yearly_seasonality_order": 8,
        "resample_freq": "D",
        "trend_estimator": "ridge",
        "adaptive_lasso_initial_estimator": "ridge",
        "regularization_strength": None,
        "actual_changepoint_min_distance": "30D",
        "potential_changepoint_distance": None,
        "potential_changepoint_n": 100,
        "no_changepoint_distance_from_end": None,
        "no_changepoint_proportion_from_end": 0.0,
        "continuous_time_col": "ct1"
    }
    new_changepoints_dict, changepoint_detector = get_changepoints_dict(
        df=df_pt,
        time_col="ts",
        value_col="y",
        changepoints_dict=changepoints_dict
    )
    assert new_changepoints_dict["method"] == "custom"
    assert len(new_changepoints_dict["dates"]) > 0
    assert new_changepoints_dict["continuous_time_col"] == "ct1"
    assert changepoint_detector.trend_changepoints is not None

    # tests change point properties
    changepoints_dict = {
        "method": "auto",
        "yearly_seasonality_order": 8,
        "resample_freq": "D",
        "trend_estimator": "ridge",
        "adaptive_lasso_initial_estimator": "ridge",
        "regularization_strength": None,
        "actual_changepoint_min_distance": "100D",
        "potential_changepoint_distance": "50D",
        "potential_changepoint_n": 100,
        "no_changepoint_distance_from_end": None,
        "no_changepoint_proportion_from_end": 0.3,
        "continuous_time_col": "ct1",
        "dates": ["2001-01-01", "2010-01-01"]
    }
    new_changepoints_dict, changepoint_detector = get_changepoints_dict(
        df=df_pt,
        time_col="ts",
        value_col="y",
        changepoints_dict=changepoints_dict
    )
    changepoint_dates = new_changepoints_dict["dates"]
    # checks no change points at the end
    # (last changepoint must lie within the first 70% of the time span,
    # matching `no_changepoint_proportion_from_end` of 0.3)
    assert (changepoint_dates[-1] - pd.to_datetime(df_pt["ts"].iloc[0])) / \
           (pd.to_datetime(df_pt["ts"].iloc[-1]) - pd.to_datetime(df_pt["ts"].iloc[0])) <= 0.7
    # checks change point distance is good
    min_cp_dist = min([changepoint_dates[i] - changepoint_dates[i - 1] for i in range(1, len(changepoint_dates))])
    assert min_cp_dist >= timedelta(days=100)
    assert changepoint_detector.trend_changepoints is not None
    # checks additional custom changepoints are added
    assert pd.to_datetime("2001-01-01") not in changepoint_dates  # out of range
    assert pd.to_datetime("2010-01-01") in changepoint_dates

    # tests for None
    new_changepoints_dict, changepoint_detector = get_changepoints_dict(
        df=df_pt,
        time_col="ts",
        value_col="y",
        changepoints_dict=None
    )
    assert new_changepoints_dict is None
    assert changepoint_detector is None

    # tests for "custom"
    # (the dict is expected back unchanged, with no detector)
    changepoints_dict = {
        "method": "custom",
        "dates": ["2020-01-01"]
    }
    new_changepoints_dict, changepoint_detector = get_changepoints_dict(
        df=df_pt,
        time_col="ts",
        value_col="y",
        changepoints_dict=changepoints_dict
    )
    assert new_changepoints_dict == changepoints_dict
    assert changepoint_detector is None

    # tests for uniform
    # (the dict is expected back unchanged, with no detector)
    changepoints_dict = {
        "method": "uniform",
        "n_changepoints": 100
    }
    new_changepoints_dict, changepoint_detector = get_changepoints_dict(
        df=df_pt,
        time_col="ts",
        value_col="y",
        changepoints_dict=changepoints_dict
    )
    assert new_changepoints_dict == changepoints_dict
    assert changepoint_detector is None

    # tests unused keys
    changepoints_dict = {
        "method": "auto",
        "unused_key": "value"
    }
    with pytest.warns(UserWarning) as record:
        get_changepoints_dict(
            df=df_pt,
            time_col="ts",
            value_col="y",
            changepoints_dict=changepoints_dict
        )
        # the first recorded warning must name the unrecognized key
        assert (f"The following keys in ``changepoints_dict`` are not recognized\n"
                f"{['unused_key']}") in record[0].message.args[0]
# ``2007-12-10`` and ``2016-01-20``. # necessary imports from datetime import datetime import numpy as np import plotly from greykite.framework.input.univariate_time_series import UnivariateTimeSeries from greykite.framework.constants import MEAN_COL_GROUP, OVERLAY_COL_GROUP from greykite.common.constants import TIME_COL from greykite.common.data_loader import DataLoader from greykite.common.viz.timeseries_plotting import add_groupby_column, plot_multivariate, plot_univariate # Loads dataset into pandas DataFrame dl = DataLoader() df = dl.load_peyton_manning() df.rename(columns={"y": "log(pageviews)"}, inplace=True) # uses a more informative name # plots dataset ts = UnivariateTimeSeries() ts.load_data(df=df, time_col="ts", value_col="log(pageviews)", freq="D") fig = ts.plot() plotly.io.show(fig) # %% # Yearly seasonality # ------------------ # Because the observations are at daily frequency, # it is possible to see yearly, quarterly, monthly, and weekly seasonality.
def test_find_trend_changepoints(hourly_data):
    """Checks `ChangepointDetector.find_trend_changepoints`.

    Covers, in order: attributes before fitting, fitting with default and
    with given parameters, the effect of ``regularization_strength``,
    ``potential_changepoint_distance``, the no-changepoint distance and
    proportion constraints, ``actual_changepoint_min_distance``, input
    validation errors and warnings, too-small samples, data with missing
    dates, and varying yearly seasonality.

    ``hourly_data`` is a dict whose ``"df"`` entry is the data frame used
    for most of the checks; the Peyton Manning data is used for the rest.
    """
    df = hourly_data["df"]
    dl = DataLoader()
    df_pt = dl.load_peyton_manning()
    model = ChangepointDetector()
    # test class variables are initialized as None
    assert model.trend_model is None
    assert model.trend_coef is None
    assert model.trend_intercept is None
    assert model.trend_changepoints is None
    assert model.trend_potential_changepoint_n is None
    assert model.trend_df is None
    assert model.y is None
    assert model.original_df is None
    assert model.value_col is None
    assert model.time_col is None
    assert model.adaptive_lasso_coef is None
    # model training with default values
    model.find_trend_changepoints(
        df=df,
        time_col="ts",
        value_col="y"
    )
    # NOTE(review): 100 + 1 + 8 * 2 presumably is 100 potential changepoints
    # + 1 base trend term + 2 terms per yearly seasonality order (8) - confirm
    # against the detector's defaults.
    assert isinstance(model.trend_model, RegressorMixin)
    assert model.trend_model.coef_.shape[0] == 100 + 1 + 8 * 2
    assert model.trend_coef.shape[0] == 100 + 1 + 8 * 2
    assert model.trend_intercept is not None
    assert model.trend_changepoints is not None
    assert model.trend_potential_changepoint_n == 100
    assert model.trend_df.shape[1] == 100 + 1 + 8 * 2
    assert model.original_df.shape == df.shape
    assert model.time_col is not None
    assert model.value_col is not None
    assert model.adaptive_lasso_coef[1].shape[0] == 100 + 1 + 8 * 2
    assert model.y.index[0] not in model.trend_changepoints
    # model training with given values
    model = ChangepointDetector()
    model.find_trend_changepoints(
        df=df,
        time_col="ts",
        value_col="y",
        potential_changepoint_n=50,
        yearly_seasonality_order=6,
        resample_freq="2D",
        trend_estimator="lasso",
        adaptive_lasso_initial_estimator="ols"
    )
    assert isinstance(model.trend_model, RegressorMixin)
    assert model.trend_model.coef_.shape[0] == 50 + 1 + 6 * 2
    assert model.trend_coef.shape[0] == 50 + 1 + 6 * 2
    assert model.trend_intercept is not None
    assert model.trend_changepoints is not None
    assert model.trend_potential_changepoint_n == 50
    assert model.trend_df.shape[1] == 50 + 1 + 6 * 2
    assert model.original_df.shape == df.shape
    assert model.time_col is not None
    assert model.value_col is not None
    assert model.adaptive_lasso_coef[1].shape[0] == 50 + 1 + 6 * 2
    assert model.y.index[0] not in model.trend_changepoints
    # test a given ``regularization_strength``
    model = ChangepointDetector()
    model.find_trend_changepoints(
        df=df,
        time_col="ts",
        value_col="y",
        regularization_strength=1.0
    )
    assert isinstance(model.trend_model, RegressorMixin)
    assert model.trend_model.coef_.shape[0] == 100 + 1 + 8 * 2
    assert model.trend_coef.shape[0] == 100 + 1 + 8 * 2
    assert model.trend_intercept is not None
    assert model.trend_changepoints is not None
    assert model.trend_potential_changepoint_n == 100
    assert model.trend_df.shape[1] == 100 + 1 + 8 * 2
    assert model.original_df.shape == df.shape
    assert model.time_col is not None
    assert model.value_col is not None
    assert model.adaptive_lasso_coef[1].shape[0] == 100 + 1 + 8 * 2
    assert model.y.index[0] not in model.trend_changepoints
    # ``regularization_strength`` == 1.0 indicates no change point
    assert model.trend_changepoints == []
    # the same detector instance is reused for the next two fits
    model.find_trend_changepoints(
        df=df,
        time_col="ts",
        value_col="y",
        regularization_strength=0.5
    )
    # ``regularization_strength`` between 0 and 1 indicates at least one change point
    assert len(model.trend_changepoints) > 0
    model.find_trend_changepoints(
        df=df,
        time_col="ts",
        value_col="y",
        actual_changepoint_min_distance="D",
        regularization_strength=0.0
    )
    # ``regularization_strength`` == 0.0 indicates all potential change points are present
    assert len(model.trend_changepoints) == 100
    # test `potential_changepoint_distance`
    model = ChangepointDetector()
    model.find_trend_changepoints(
        df=df,
        time_col="ts",
        value_col="y",
        potential_changepoint_distance="100D"
    )
    # test override `potential_changepoint_n`
    # df has length 500 days, with distance "100D", only 4 change points are placed.
    assert model.trend_potential_changepoint_n == 4
    # units above 'D' are rejected for `potential_changepoint_distance`
    with pytest.raises(ValueError,
                       match="In potential_changepoint_distance, the maximal unit is 'D', "
                             "i.e., you may use units no more than 'D' such as"
                             "'10D', '5H', '100T', '200S'. The reason is that 'W', 'M' "
                             "or higher has either cycles or indefinite number of days, "
                             "thus is not parsable by pandas as timedelta."):
        model.find_trend_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            potential_changepoint_distance="2M"
        )
    # test `no_changepoint_distance_from_end` with the Peyton Manning data
    model = ChangepointDetector()
    res = model.find_trend_changepoints(
        df=df_pt,
        time_col="ts",
        value_col="y",
        no_changepoint_distance_from_begin="730D",
        no_changepoint_distance_from_end="730D",
        regularization_strength=0
    )
    changepoints = res["trend_changepoints"]
    # test override `no_changepoint_proportion_from_end` and no change points in the last piece
    # (730 days expressed as a proportion of the full time span, then mapped to row positions)
    no_changepoint_proportion_from_end = timedelta(days=730) / (
            pd.to_datetime(df_pt["ts"].iloc[-1]) - pd.to_datetime(df_pt["ts"].iloc[0]))
    last_date_to_have_changepoint = pd.to_datetime(df_pt["ts"].iloc[int(
        df_pt.shape[0] * (1 - no_changepoint_proportion_from_end))])
    first_date_to_have_changepoint = pd.to_datetime(df_pt["ts"].iloc[int(
        df_pt.shape[0] * no_changepoint_proportion_from_end)])
    assert changepoints[-1] <= last_date_to_have_changepoint
    assert changepoints[0] >= first_date_to_have_changepoint
    # test value error
    with pytest.raises(ValueError,
                       match="In no_changepoint_distance_from_end, the maximal unit is 'D', "
                             "i.e., you may use units no more than 'D' such as"
                             "'10D', '5H', '100T', '200S'. The reason is that 'W', 'M' "
                             "or higher has either cycles or indefinite number of days, "
                             "thus is not parsable by pandas as timedelta."):
        model.find_trend_changepoints(
            df=df_pt,
            time_col="ts",
            value_col="y",
            no_changepoint_distance_from_end="2M"
        )
    # test `no_changepoint_proportion_from_end` and `actual_changepoint_min_distance`
    # generates a df with trend change points, ensuring we detect change points
    df_trend = generate_test_changepoint_df()
    model = ChangepointDetector()
    res = model.find_trend_changepoints(
        df=df_trend,
        time_col="ts",
        value_col="y",
        potential_changepoint_n=50,
        yearly_seasonality_order=0,
        adaptive_lasso_initial_estimator='lasso',
        no_changepoint_proportion_from_end=0.3,
        actual_changepoint_min_distance="10D"
    )
    changepoints = res["trend_changepoints"]
    # last changepoint in first 70% data
    assert changepoints[-1] <= df_trend["ts"][int(df_trend.shape[0] * 0.7)]
    # consecutive changepoints must be at least "10D" apart
    assert all((changepoints[i + 1] - changepoints[i] >= to_offset("10D")) for i in range(len(changepoints) - 1))
    # test the asserts above are violated when not specifying `no_changepoint_proportion_from_end`
    model = ChangepointDetector()
    res = model.find_trend_changepoints(
        df=df_trend,
        time_col="ts",
        value_col="y",
        potential_changepoint_n=50,
        yearly_seasonality_order=0,
        adaptive_lasso_initial_estimator='ridge',
        no_changepoint_proportion_from_end=0.0,
        actual_changepoint_min_distance="1D"
    )
    changepoints = res["trend_changepoints"]
    # last changepoint after first 70% data
    assert changepoints[-1] > df_trend["ts"][int(df_trend.shape[0] * 0.7)]
    # negative potential_changepoint_n
    model = ChangepointDetector()
    with pytest.raises(ValueError,
                       match="potential_changepoint_n can not be negative. "
                             "A large number such as 100 is recommended"):
        model.find_trend_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            potential_changepoint_n=-1
        )
    # negative year_seasonality_order
    model = ChangepointDetector()
    with pytest.raises(ValueError,
                       match="year_seasonality_order can not be negative. "
                             "A number less than or equal to 10 is recommended"):
        model.find_trend_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            yearly_seasonality_order=-1
        )
    # negative regularization_strength
    with pytest.raises(ValueError, match="regularization_strength must be between 0.0 and 1.0."):
        model.find_trend_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            regularization_strength=-1
        )
    # estimator parameter combination not valid warning
    # (each case checks the first recorded warning message)
    with pytest.warns(UserWarning) as record:
        model = ChangepointDetector()
        model.find_trend_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            trend_estimator="something"
        )
        assert "trend_estimator not in ['ridge', 'lasso', 'ols'], " \
               "estimating using ridge" in record[0].message.args[0]
    with pytest.warns(UserWarning) as record:
        model = ChangepointDetector()
        model.find_trend_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            trend_estimator="ols",
            yearly_seasonality_order=8
        )
        assert "trend_estimator = 'ols' with year_seasonality_order > 0 may create " \
               "over-fitting, trend_estimator has been set to 'ridge'." in record[0].message.args[0]
    with pytest.warns(UserWarning) as record:
        model = ChangepointDetector()
        model.find_trend_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            adaptive_lasso_initial_estimator="something"
        )
        assert "adaptive_lasso_initial_estimator not in ['ridge', 'lasso', 'ols'], " \
               "estimating with ridge" in record[0].message.args[0]
    # df sample size too small
    # (`df` is rebound here; the fixture frame is not used below this point)
    df = pd.DataFrame(
        data={
            "ts": pd.date_range(start='2020-1-1', end='2020-1-3', freq='D'),
            "y": [1, 2, 3]
        }
    )
    model = ChangepointDetector()
    with pytest.raises(ValueError,
                       match="Change point detector does not work for less than "
                             "5 observations. Please increase sample size."):
        model.find_trend_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
        )
    # test when training data has missing dates, the model drops na from resample
    # (five observations spaced two days apart)
    df = pd.DataFrame(
        data={
            "ts": pd.date_range(start='2020-1-1', end='2020-1-9', freq='2D'),
            "y": [1, 2, 3, 4, 5]
        }
    )
    model = ChangepointDetector()
    model.find_trend_changepoints(
        df=df,
        time_col="ts",
        value_col="y"
    )
    assert model.y.isnull().sum().sum() == 0
    assert model.y.shape[0] == 5
    # tests varying yearly seasonality effect
    model = ChangepointDetector()
    model.find_trend_changepoints(
        df=df_pt,
        time_col="ts",
        value_col="y",
        yearly_seasonality_change_freq="365D"
    )
    assert model.trend_df.shape[1] > 100 + 1 + 8 * 2  # checks extra columns are created for varying yearly seasonality
def test_find_seasonality_changepoints(hourly_data):
    """Checks `ChangepointDetector.find_seasonality_changepoints`.

    Covers, in order: fitting with given parameters, the effect of
    ``regularization_strength``, ``no_changepoint_distance_from_end``,
    dropping of the daily component for daily data, reuse of an existing
    trend estimation, input validation errors and warnings, too-small
    samples, and user-supplied trend changepoints.

    ``hourly_data`` is a dict whose ``"df"`` entry is the data frame used
    for most of the checks; the Peyton Manning data is used for the rest.
    """
    df = hourly_data["df"]
    dl = DataLoader()
    df_pt = dl.load_peyton_manning()
    # model training with given values
    model = ChangepointDetector()
    model.find_seasonality_changepoints(
        df=df,
        time_col="ts",
        value_col="y",
        potential_changepoint_n=80,
        resample_freq="2D",
        seasonality_components_df=pd.DataFrame({
            "name": ["tod", "tow", "conti_year"],
            "period": [24.0, 7.0, 1.0],
            "order": [3, 4, 5],
            "seas_names": ["daily", "weekly", "yearly"]})
    )
    # resample frequency is "2D", daily component is automatically removed from
    # seasonality_components_df
    # NOTE(review): 18 * 81 presumably is 2 * (4 + 5) seasonality terms for the
    # remaining weekly and yearly components, times (80 potential changepoints + 1) - confirm.
    assert model.seasonality_df.shape[1] == 18 * 81
    assert model.seasonality_changepoints is not None
    assert model.seasonality_estimation is not None
    assert model.seasonality_estimation.shape[0] == df.shape[0]
    # test a given ``regularization_strength``
    model = ChangepointDetector()
    model.find_seasonality_changepoints(
        df=df,
        time_col="ts",
        value_col="y",
        regularization_strength=1.0
    )
    # ``regularization_strength`` == 1.0 indicates no change point
    assert all([model.seasonality_changepoints[key] == [] for key in model.seasonality_changepoints.keys()])
    # the same detector instance is reused, now with the Peyton Manning data
    model.find_seasonality_changepoints(
        df=df_pt,
        time_col="ts",
        value_col="y",
        regularization_strength=0.1
    )
    # ``regularization_strength`` between 0 and 1 indicates at least one change point
    assert any([model.seasonality_changepoints[key] != [] for key in model.seasonality_changepoints.keys()])
    # test `no_changepoint_distance_from_end` with the Peyton Manning data
    model = ChangepointDetector()
    res = model.find_seasonality_changepoints(
        df=df_pt,
        time_col="ts",
        value_col="y",
        no_changepoint_distance_from_end="730D",
        regularization_strength=0.1
    )
    changepoints_dict = res["seasonality_changepoints"]
    # flattens the per-component changepoint lists into one list
    changepoints = []
    for key in changepoints_dict.keys():
        changepoints += changepoints_dict[key]
    # test override `no_changepoint_proportion_from_end` and no change points in the last piece
    # (730 days expressed as a proportion of the full time span, then mapped to a row position)
    no_changepoint_proportion_from_end = timedelta(days=730) / (
            pd.to_datetime(df_pt["ts"].iloc[-1]) - pd.to_datetime(df_pt["ts"].iloc[0]))
    last_date_to_have_changepoint = pd.to_datetime(df_pt["ts"].iloc[int(
        df_pt.shape[0] * (1 - no_changepoint_proportion_from_end))])
    assert changepoints[-1] <= last_date_to_have_changepoint
    # test daily data automatically drops daily seasonality components
    cd = ChangepointDetector()
    res = cd.find_seasonality_changepoints(
        df=df_pt,
        time_col="ts",
        value_col="y"
    )
    assert "daily" not in res["seasonality_changepoints"].keys()
    # test feeding the same df with different column names will not rerun trend estimation
    df2 = df_pt.copy().rename({"ts": "ts2", "y": "y2"}, axis=1)
    cd = ChangepointDetector()
    cd.find_seasonality_changepoints(
        df=df_pt,
        time_col="ts",
        value_col="y"
    )
    with pytest.warns(UserWarning) as record:
        cd.find_seasonality_changepoints(
            df=df2,
            time_col="ts2",
            value_col="y2"
        )
        assert ("Trend changepoints are already identified, using past trend estimation. "
                "If you would like to run trend change point detection again, "
                "please call ``find_trend_changepoints`` with desired parameters "
                "before calling ``find_seasonality_changepoints``.") in record[0].message.args[0]
        # the column names from the first call are kept
        assert cd.time_col == "ts"
        assert cd.value_col == "y"
    # negative potential_changepoint_n
    model = ChangepointDetector()
    with pytest.raises(ValueError,
                       match="potential_changepoint_n can not be negative. "
                             "A large number such as 50 is recommended"):
        model.find_seasonality_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            potential_changepoint_n=-1
        )
    # negative regularization_strength
    with pytest.raises(ValueError, match="regularization_strength must be between 0.0 and 1.0."):
        model.find_seasonality_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            regularization_strength=-1
        )
    # test regularization_strength == None warning
    with pytest.warns(UserWarning) as record:
        model.find_seasonality_changepoints(
            df=df,
            time_col="ts",
            value_col="y",
            regularization_strength=None
        )
        assert ("regularization_strength is set to None. This will trigger cross-validation to "
                "select the tuning parameter which might result in too many change points. "
                "Keep the default value or tuning around it is recommended.") in record[0].message.args[0]
    # test existing trend estimation warning
    model = ChangepointDetector()
    model.find_trend_changepoints(
        df=df,
        time_col="ts",
        value_col="y"
    )
    with pytest.warns(UserWarning) as record:
        model.find_seasonality_changepoints(
            df=df,
            time_col="ts",
            value_col="y"
        )
        assert ("Trend changepoints are already identified, using past trend estimation. "
                "If you would like to run trend change point detection again, "
                "please call ``find_trend_changepoints`` with desired parameters "
                "before calling ``find_seasonality_changepoints``.") in record[0].message.args[0]
    # df sample size too small
    df_small = pd.DataFrame(
        data={
            "ts": pd.date_range(start='2020-1-1', end='2020-1-3', freq='D'),
            "y": [1, 2, 3]
        }
    )
    model = ChangepointDetector()
    with pytest.raises(ValueError,
                       match="Change point detector does not work for less than "
                             "5 observations. Please increase sample size."):
        model.find_seasonality_changepoints(
            df=df_small,
            time_col="ts",
            value_col="y",
        )
    # tests given trend changepoints
    # (the supplied dates are expected to be stored as-is on the detector)
    cd = ChangepointDetector()
    cd.find_seasonality_changepoints(
        df=df_pt,
        time_col="ts",
        value_col="y",
        trend_changepoints=list(pd.to_datetime(["2016-01-01", "2017-02-05"]))
    )
    assert cd.trend_changepoints == list(pd.to_datetime(["2016-01-01", "2017-02-05"]))
    assert cd.original_df is not None
    assert cd.trend_estimation is not None
    assert cd.y is not None
    assert cd.time_col == "ts"
    assert cd.value_col == "y"
`here <https://facebook.github.io/prophet/docs/quick_start.html>`_.
"""

import warnings

# Suppresses all warnings for the rest of the script
warnings.filterwarnings("ignore")

from greykite.common.data_loader import DataLoader
from greykite.framework.templates.autogen.forecast_config import ForecastConfig
from greykite.framework.templates.autogen.forecast_config import MetadataParam
from greykite.framework.templates.autogen.forecast_config import ModelComponentsParam
from greykite.framework.templates.model_templates import ModelTemplateEnum
from greykite.framework.templates.forecaster import Forecaster

# Loads dataset into pandas DataFrame
dl = DataLoader()
df = dl.load_peyton_manning()

# %%
# Then we create a forecast model with ``SILVERKITE`` template.
# For a simple example of creating a forecast model, see
# `Simple Forecast <./0100_simple_forecast.html>`_.
# For a detailed tuning tutorial, see
# `Forecast Model Tuning <../tutorials/0100_forecast_tutorial.html>`_.

# Specifies dataset information
metadata = MetadataParam(
    time_col="ts",  # name of the time column
    value_col="y",  # name of the value column
    freq="D"  # "H" for hourly, "D" for daily, "W" for weekly, etc.
)
def df_pt():
    """Returns the Peyton Manning pageview data loaded through `DataLoader`."""
    return DataLoader().load_peyton_manning()