def test_model_raises_if_not_fit():
    model = GroupedProphet()

    with pytest.raises(
        DivinerException,
        match="The model has not been fit. Please fit the model first.",
    ):
        model.forecast(30, "days")
def test_model_raises_if_already_fit():
    train = data_generator.generate_test_data(2, 1, 1000, "2020-01-01", 1)
    model = GroupedProphet().fit(train.df, train.key_columns)

    with pytest.raises(
        DivinerException,
        match="The model has already been fit. Create a new instance to fit the model again.",
    ):
        model.fit(train.df, train.key_columns)
def test_prophet_extract_params():
    train = data_generator.generate_test_data(4, 6, 1000, "2020-01-01", 1)
    model = GroupedProphet(uncertainty_samples=0).fit(train.df, train.key_columns)
    params = model.extract_model_params()

    assert len(params) == 6
def grouped_prophet_example(locations, start_dt, artifact_path):
    print("Generating data...\n")
    data = generate_data(location_data=locations, start_dt=start_dt)

    grouping_keys = ["country", "city"]

    print("Data Generated.\nBuilding GroupedProphet Model...")

    model = GroupedProphet(n_changepoints=96, uncertainty_samples=0).fit(
        df=data, group_key_columns=grouping_keys, y_col="watts", datetime_col="datetime"
    )

    print("GroupedProphet model built.\n")

    params = model.extract_model_params()

    print(f"Params: \n{params.to_string()}")
    print("Running Cross Validation on all groups...\n")

    metrics = model.cross_validate_and_score(
        horizon="120 hours",
        period="480 hours",
        initial="960 hours",
        parallel="threads",
        rolling_window=0.05,
        monthly=False,
    )

    print(f"Cross Validation Metrics: \n{metrics.to_string()}")

    mlflow.diviner.log_model(diviner_model=model, artifact_path=artifact_path)

    # As an alternative to logging the metrics and params directly with `log_dict()`,
    # the DataFrames can be serialized locally as .csv files and logged as artifacts,
    # without requiring any column or object manipulation. A temporary directory with a
    # context wrapper is used so the local files are cleaned up after artifact logging
    # completes:
    with tempfile.TemporaryDirectory() as tmpdir:
        params.to_csv(f"{tmpdir}/params.csv", index=False, header=True)
        metrics.to_csv(f"{tmpdir}/metrics.csv", index=False, header=True)
        mlflow.log_artifacts(tmpdir, artifact_path="run_data")

    # Saving the parameters and metrics as JSON without serializing to the local file system.
    # NOTE: this requires casting fields that cannot be serialized to JSON.
    # NOTE: Do not use both of these methods. They are shown as an either/or alternative based
    # on how you would choose to consume, view, or analyze the per-group metrics and parameters.

    # NB: There are object references present in the Prophet model parameters. Coerce them to
    # string if using a JSON serialization approach with ``mlflow.log_dict()``.
    params = params.astype(dtype=str, errors="ignore")

    mlflow.log_dict(params.to_dict(), "params.json")
    mlflow.log_dict(metrics.to_dict(), "metrics.json")

    return mlflow.get_artifact_uri(artifact_path=artifact_path)
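# A minimal sketch of how the example above might be invoked. The location values, start date,
# and artifact path are hypothetical placeholders (the exact structure expected by
# ``generate_data`` is assumed here to be (country, city) pairs), and wrapping the call in
# ``mlflow.start_run()`` is an assumption about how the run would typically be managed.
if __name__ == "__main__":
    with mlflow.start_run():
        artifact_uri = grouped_prophet_example(
            locations=[("US", "NYC"), ("CA", "Toronto")],  # hypothetical location pairs
            start_dt="2022-02-01",  # hypothetical series start date
            artifact_path="diviner_prophet_model",
        )
        print(f"Model artifacts logged to: {artifact_uri}")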
def test_prophet_forecast_correct_start():
    train = data_generator.generate_test_data(2, 5, 1000, "2020-01-01", 1)
    expected_start_of_forecast = max(train.df["ds"]) + timedelta(days=1)
    model = GroupedProphet().fit(train.df, train.key_columns)
    forecasted_data = model.forecast(10, "D")

    # Check that the first date in the forecasted df for the first model is 1 day after the
    # last training date.
    min_forecast = min(forecasted_data["ds"])

    assert expected_start_of_forecast == min_forecast
    assert len(forecasted_data) == 50
def test_prophet_save_and_load():
    # Tests serialization, deserialization, and utilization of the forecasting API from a
    # loaded model
    save_path = os.path.join("/tmp/grouped_prophet_test", "model")
    train = data_generator.generate_test_data(2, 2, 1000, "2020-01-01", 1)
    grouped_model = GroupedProphet().fit(train.df, train.key_columns)
    grouped_model.save(save_path)
    loaded_model = GroupedProphet.load(save_path)
    forecasts = loaded_model.forecast(25, "D")
    shutil.rmtree(os.path.dirname(save_path))

    assert len(forecasts) == 50
def test_prophet_df_naming_overrides():
    train = data_generator.generate_test_data(2, 1, 1000, "2020-01-01", 1)
    train_df = train.df
    train_df.rename(columns={"ds": "datetime", "y": "sales"}, inplace=True)

    assert {"datetime", "sales"}.issubset(set(train_df.columns))

    model = GroupedProphet().fit(train_df, train.key_columns, "sales", "datetime")
    params = model.extract_model_params()

    assert len(params) == 1
def test_prophet_cross_validation_extract():
    train = data_generator.generate_test_data(4, 6, 1000, "2020-01-01", 1)
    model = GroupedProphet(uncertainty_samples=0).fit(train.df, train.key_columns)
    scores = model.cross_validate_and_score(
        initial="100 days", period="90 days", horizon="15 days", parallel=None
    )

    assert all(scores["rmse"] > 0)
    assert len(scores) == 6
    assert "coverage" not in scores
def test_prophet_manual_predict():
    train = data_generator.generate_test_data(2, 1, 1000, "2020-01-01", 1)
    train_df = train.df
    predict_df = train_df[["key1", "key0", "ds"]][-10:]
    model = GroupedProphet().fit(train_df, train.key_columns)
    prediction = model.predict(predict_df)

    assert len(prediction) == 10
    for _, row in prediction.iterrows():
        assert row["yhat"] > 0
def test_prophet_group_subset_predict_raises_and_warns():
    _rows_to_generate = 30
    train = data_generator.generate_test_data(2, 1, 1000, "2020-01-01", 1)
    train_df = train.df
    model = GroupedProphet().fit(train_df, train.key_columns)
    key_entries = []
    for v in train_df[["key1", "key0"]].iloc[[0]].to_dict().values():
        key_entries.append(list(v.values())[0])
    groups = [(key_entries[0], key_entries[1]), ("missing", "key")]

    with pytest.raises(
        DivinerException, match="Cannot perform predictions due to submitted"
    ):
        model.predict_groups(groups, _rows_to_generate, "D")

    with pytest.warns(
        UserWarning, match="Specified groups are unable to be predicted due to "
    ):
        model.predict_groups(groups, _rows_to_generate, "D", on_error="warn")

    with pytest.raises(
        DivinerException, match="Groups specified for subset forecasting are not"
    ):
        model.predict_groups(
            ("invalid", "invalid"), _rows_to_generate, "D", on_error="ignore"
        )
def test_prophet_cross_validation_extract_custom_scores():
    train = data_generator.generate_test_data(4, 2, 1000, "2020-01-01", 1)
    model = GroupedProphet(uncertainty_samples=0).fit(train.df, train.key_columns)
    scores = model.cross_validate_and_score(
        initial="100 days",
        period="90 days",
        horizon="15 days",
        parallel=None,
        metrics=["rmse", "mape"],
        disable_tqdm=False,
        monthly=True,
    )

    assert all(scores["rmse"] > 0)
    assert len(scores) == 2
    assert "coverage" not in scores
def test_prophet_default_fit():
    train = data_generator.generate_test_data(4, 2, 1000, "2020-01-01", 1)
    model = GroupedProphet().fit(train.df, train.key_columns)
    first_model = _get_individual_model(model, 0)

    assert len(first_model.history) > 0
    assert (
        len(first_model.params["trend"][0]) == 1000
    )  # fit value for each value in series
    assert len(list(model.model.keys())) == 2
def test_prophet_group_subset_predict():
    _rows_to_generate = 30
    train = data_generator.generate_test_data(2, 1, 1000, "2020-01-01", 1)
    train_df = train.df
    model = GroupedProphet().fit(train_df, train.key_columns)
    key_entries = []
    for v in train_df[["key1", "key0"]].iloc[[0]].to_dict().values():
        key_entries.append(list(v.values())[0])
    groups = [tuple(key_entries)]

    group_prediction = model.predict_groups(groups, _rows_to_generate, "D")

    assert len(group_prediction) == _rows_to_generate
    _key1 = group_prediction["key1"].unique()
    assert len(_key1) == 1
    assert _key1[0] == groups[0][0]
    _key0 = group_prediction["key0"].unique()
    assert len(_key0) == 1
    assert _key0[0] == groups[0][1]
def test_prophet_execution_with_kwargs_override_for_pystan():
    train = data_generator.generate_test_data(4, 6, 1000, "2020-01-01", 1)
    default_prophet_uncertainty_samples = Prophet().uncertainty_samples
    model = GroupedProphet(uncertainty_samples=0).fit(
        train.df, train.key_columns, algorithm="LBFGS"
    )
    last_model = _get_individual_model(model, 5)

    assert last_model.uncertainty_samples == 0
    assert default_prophet_uncertainty_samples != last_model.uncertainty_samples
def test_prophet_with_bad_group_data():
    train = data_generator.generate_test_data(2, 1, 1000, "2020-01-01", 1)
    train_df = train.df
    bad_data = pd.DataFrame(
        {
            "ds": datetime.strptime("2021-01-01", "%Y-%m-%d"),
            "y": -500.3,
            "key1": "bad",
            "key0": "data",
        },
        index=[1000],
    )
    train_df_add = pd.concat([train_df, bad_data])

    with pytest.warns(RuntimeWarning, match="An error occurred while fitting group"):
        model = GroupedProphet().fit(train_df_add, train.key_columns)

    assert ("bad", "data") not in model.model.keys()
def test_prophet_save_load_override_object():
    """Test to ensure that deserialization updates the object properly for all attributes"""
    train1 = data_generator.generate_test_data(3, 2, 1000, "2020-01-01", 1)
    train2 = data_generator.generate_test_data(2, 2, 500, "2021-01-01", 1)
    model1 = GroupedProphet().fit(train1.df, train1.key_columns)
    model2 = GroupedProphet().fit(train2.df, train2.key_columns)
    model1_group_keys = deepcopy(model1._group_key_columns)
    model1_model = deepcopy(model1.model)

    # save model 2
    save_path = os.path.join("/tmp/group_prophet_test", "model2serdetest.gpm")
    model2.save(save_path)

    # use the model1 object to load model2
    reloaded = model1.load(save_path)

    assert set(reloaded._group_key_columns) != set(model1_group_keys)
    assert reloaded.model.keys() == model2.model.keys()
    assert reloaded.model.keys() != model1_model.keys()
def grouped_prophet(diviner_data):
    return GroupedProphet(n_changepoints=20, uncertainty_samples=0).fit(
        df=diviner_data.df,
        group_key_columns=diviner_data.key_columns,
        y_col="y",
        datetime_col="ds",
    )
def execute_grouped_prophet():
    """
    This function call will generate synthetic grouped time series data in a normalized format.
    The structure will be of:

    ============ ====== =========== =========== ===========
    ds           y      group_key_1 group_key_2 group_key_3
    ============ ====== =========== =========== ===========
    "2016-02-01" 1234.5 A           B           C
    ============ ====== =========== =========== ===========

    With the grouping key values that are generated per ``ds`` and ``y`` values assigned in a
    non-deterministic fashion.

    For utilization of this API, the normalized representation of the data is required, such
    that a particular target variable's data ``'y'`` and the associated indexed datetime values
    in ``'ds'`` are 'stacked' (unioned) from a more traditional denormalized data storage
    paradigm. For guidance on this data transposition from denormalized representations, see:
    https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.melt.html
    """
    generated_data = generate_example_data(
        column_count=3,
        series_count=10,
        series_size=365 * 5,
        start_dt="2016-02-01",
        days_period=1,
    )

    # Extract the normalized grouped datetime series data
    training_data = generated_data.df

    # Extract the names of the grouping columns that define the unique series data
    grouping_key_columns = generated_data.key_columns

    # Create a GroupedProphet model instance
    grouped_model = GroupedProphet(n_changepoints=20, uncertainty_samples=0).fit(
        training_data, grouping_key_columns
    )

    # Save the model to the local file system
    save_path = "/tmp/grouped_prophet.gpm"
    grouped_model.save(path=save_path)

    # Load the model from the local storage location
    retrieved_model = GroupedProphet.load(save_path)

    # Score the model and print the results
    model_scores = retrieved_model.cross_validate_and_score(
        horizon="30 days",
        period="180 days",
        initial="365 days",
        parallel="threads",
        rolling_window=0.05,
        monthly=False,
    )

    print(f"Model scores:\n{model_scores.to_string()}")

    # Run a forecast for each group
    forecasts = retrieved_model.forecast(horizon=20, frequency="D")

    print(f"Forecasted data:\n{forecasts[:50].to_string()}")

    # Extract the parameters from each model for logging
    params = retrieved_model.extract_model_params()

    print(f"Model parameters:\n{params.to_string()}")
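# The docstring above references converting denormalized (wide) data into the normalized
# (stacked) representation that the GroupedProphet API expects. The sketch below is a minimal,
# hypothetical illustration of that transposition using ``pandas.melt``; the column names
# ("store_1", "store_2", "store") and the helper name ``melt_example`` are assumptions for
# demonstration only and are not part of this example script.
def melt_example():
    import pandas as pd

    # Denormalized layout: one column per series, indexed by a shared datetime column
    wide = pd.DataFrame(
        {
            "ds": pd.date_range("2016-02-01", periods=3, freq="D"),
            "store_1": [1234.5, 1240.1, 1250.9],
            "store_2": [987.0, 991.3, 1002.4],
        }
    )

    # Stack the per-series columns into a single 'y' column with a grouping key column
    normalized = wide.melt(id_vars=["ds"], var_name="store", value_name="y")
    return normalized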
        column_count=3,
        series_count=10,
        series_size=365 * 5,
        start_dt="2016-02-01",
        days_period=1,
    )

    # Extract the normalized grouped datetime series data
    training_data = generated_data.df

    # Extract the names of the grouping columns that define the unique series data
    group_key_columns = generated_data.key_columns

    # Create a GroupedProphet model instance
    grouped_model = GroupedProphet(n_changepoints=20, uncertainty_samples=0).fit(
        training_data, group_key_columns
    )

    # Get a subset of group keys to generate forecasts for
    group_df = training_data.copy()
    group_df["groups"] = list(zip(*[group_df[c] for c in group_key_columns]))
    distinct_groups = group_df["groups"].unique()
    groups_to_predict = list(distinct_groups[:3])

    print("-" * 65)
    print(f"\nUnique groups that have been modeled: \n{distinct_groups}\n")
    print(f"Subset of groups to generate predictions for: \n{groups_to_predict}\n")
    print("-" * 65)

    forecasts = grouped_model.predict_groups(
        groups=groups_to_predict,