def test_lgt_map_fit(make_weekly_data, seasonality, estimator):
    """Fit LGT on the weekly fixture and check init values, prediction and posteriors.

    With a truthy ``seasonality`` the model's init-value callable must be an
    ``LGTInitializer`` with ``s == 52`` that yields an ``init_sea`` of shape
    ``(51,)``; otherwise no init values are expected.  The prediction must be
    a 51-row frame with the ``week`` and ``prediction`` columns.
    """
    train_df, test_df, _ = make_weekly_data

    model = LGT(
        response_col='response',
        date_col='week',
        seasonality=seasonality,
        verbose=False,
        estimator=estimator,
    )
    model.fit(train_df)

    initializer = model._model.get_init_values()
    if not seasonality:
        assert not initializer
    else:
        assert isinstance(initializer, LGTInitializer)
        assert initializer.s == 52
        assert initializer()['init_sea'].shape == (51, )

    predict_df = model.predict(test_df)

    expected_columns = ['week', 'prediction']
    # two extra posterior entries are expected when seasonality is 52
    expected_num_parameters = 12 if seasonality == 52 else 10

    assert predict_df.shape == (51, len(expected_columns))
    assert predict_df.columns.tolist() == expected_columns
    assert len(model._posterior_samples) == expected_num_parameters
def test_plot_predicted_data(iclaims_training_data, plot_components):
    """Smoke-test the plotting helpers on a decomposed LGT prediction.

    The last 52 rows of the claims data are held out, an LGT model with three
    regressors is fitted on the rest, and both ``plot_predicted_data`` and
    ``plot_predicted_components`` are called on the decomposed prediction.
    The test only checks that the calls complete; nothing is asserted about
    the produced figures.
    """
    # Work on a copy so the log transform is not written into the fixture
    # object itself.
    df = iclaims_training_data.copy()
    df['claims'] = np.log(df['claims'])

    regressor_col = ['trend.unemploy', 'trend.filling', 'trend.job']
    test_size = 52
    train_df = df[:-test_size]
    test_df = df[-test_size:]

    lgt = LGT(
        response_col='claims',
        date_col='week',
        regressor_col=regressor_col,
        estimator='stan-map',
        seasonality=52,
        seed=8888,
    )
    lgt.fit(train_df)
    predicted_df = lgt.predict(df=test_df, decompose=True)

    # test plotting
    _ = plot_predicted_data(
        training_actual_df=train_df,
        predicted_df=predicted_df,
        date_col='week',
        actual_col='claims',
        test_actual_df=test_df,
    )
    _ = plot_predicted_components(
        predicted_df=predicted_df,
        date_col='week',
        plot_components=plot_components,
    )
def test_backtester_test_metrics(iclaims_training_data, metrics):
    """Check that ``BackTester.score`` reports exactly the requested metrics.

    ``metrics`` may be ``None`` (the backtester's default metrics are
    expected), a list of callables, or a single callable.
    """
    model = LGT(
        response_col='claims',
        date_col='week',
        seasonality=1,
        verbose=False,
        estimator='stan-map',
    )
    backtester = BackTester(
        model=model,
        df=iclaims_training_data,
        forecast_len=3,
        n_splits=1,
    )
    backtester.fit_predict()
    scores = backtester.score(metrics=metrics)
    reported_names = set(scores['metric_name'].tolist())

    # normalise the three accepted forms of `metrics` into a list of callables
    if metrics is None:
        metric_callables = backtester._default_metrics
    elif isinstance(metrics, list):
        metric_callables = metrics
    else:
        metric_callables = [metrics]

    expected_names = {fn.__name__ for fn in metric_callables}
    assert expected_names == reported_names
def test_lgt_grid_tuning(make_weekly_data, param_grid):
    """Run ``grid_search_orbit`` over ``param_grid`` and check the result layout.

    The best-parameter entry must expose the same keys as the grid, and the
    tuning frame must hold one column per grid key plus ``metrics`` with a
    shape of ``(9, 3)``.
    """
    train_df, _, _ = make_weekly_data

    lgt = LGT(
        response_col='response',
        date_col='week',
        seasonality=52,
        estimator='stan-map',
    )

    best_params, tuned_df = grid_search_orbit(
        param_grid,
        model=lgt,
        df=train_df,
        min_train_len=80,
        incremental_len=20,
        forecast_len=20,
        metrics=None,
        criteria=None,
        verbose=True,
    )

    expected_columns = {*param_grid.keys(), 'metrics'}

    assert best_params[0].keys() == param_grid.keys()
    assert set(tuned_df.columns.to_list()) == expected_columns
    assert tuned_df.shape == (9, 3)
def test_base_lgt_init(estimator):
    """Check the state of an ``LGT`` instance right after construction."""
    model = LGT(estimator=estimator)

    # model is not yet fitted
    assert not model.is_fitted()
    # training data input should only be initialized, not populated
    assert not model.get_training_data_input()
    # model param names should already be set
    assert model._model.get_model_param_names()
    # no init-values callable is available at this point
    assert not model._model.get_init_values()
def test_lgt_is_fitted(iclaims_training_data, estimator, keep_samples, point_method):
    """Check that ``is_fitted()`` is true after ``fit`` for every parameter combination.

    Arguments common to all estimators are collected once; estimator-specific
    sampling arguments are added for ``'stan-mcmc'`` and ``'pyro-svi'``.  Any
    other estimator is constructed with the common arguments only, instead of
    leaving the model variable unbound.
    """
    # Work on a copy so the log transform is not written into the fixture
    # object itself.
    df = iclaims_training_data.copy()
    df['claims'] = np.log(df['claims'])

    args = {
        'response_col': 'claims',
        'date_col': 'week',
        'regressor_col': ['trend.unemploy'],
        'seasonality': 52,
        'seed': 8888,
        'verbose': False,
        'estimator': estimator,
    }
    if estimator == 'stan-mcmc':
        args.update({'num_warmup': 50, 'num_sample': 50})
    elif estimator == 'pyro-svi':
        args.update({'num_steps': 10})

    lgt = LGT(**args)
    lgt.fit(df, keep_samples=keep_samples, point_method=point_method)

    # still True when keep_samples is False
    assert lgt.is_fitted()
def test_lgt_fixed_sm_input(make_weekly_data, level_sm_input, seasonality_sm_input,
                            slope_sm_input):
    """Fit LGT with caller-supplied smoothing inputs and check the output shapes.

    Every column of the training frame after the first two is used as a
    regressor.  The prediction must be a 51-row ``week``/``prediction`` frame
    and the regression coefficient table must have shape ``(6, 3)`` with one
    row per regressor.
    """
    train_df, test_df, _ = make_weekly_data

    model = LGT(
        response_col='response',
        date_col='week',
        regressor_col=train_df.columns.tolist()[2:],
        level_sm_input=level_sm_input,
        seasonality_sm_input=seasonality_sm_input,
        slope_sm_input=slope_sm_input,
        estimator='stan-map',
        seasonality=52,
        verbose=False,
    )
    model.fit(train_df)

    predict_df = model.predict(test_df)
    coefs_df = model.get_regression_coefs()

    expected_columns = ['week', 'prediction']

    assert predict_df.shape == (51, len(expected_columns))
    assert predict_df.columns.tolist() == expected_columns
    assert coefs_df.shape == (6, 3)
    assert coefs_df.shape[0] == len(train_df.columns.tolist()[2:])
def test_lgt_aggregated_fit(make_weekly_data, seasonality, estimator, point_method):
    """Fit LGT with a ``point_method`` and check init values, prediction and posteriors.

    Sampling arguments are shortened for ``'stan-mcmc'`` and ``'pyro-svi'``.
    The prediction is expected to contain only ``week`` and ``prediction``
    even though ``prediction_percentiles`` is set.
    """
    train_df, test_df, _ = make_weekly_data

    if estimator == 'stan-mcmc':
        sampling_args = {'num_warmup': 50, 'num_sample': 50}
    elif estimator == 'pyro-svi':
        sampling_args = {'num_steps': 10, 'num_sample': 50}
    else:
        sampling_args = {}

    # two extra posterior entries are expected when seasonality is 52
    expected_num_parameters = 12 if seasonality == 52 else 10

    model = LGT(
        response_col='response',
        date_col='week',
        prediction_percentiles=[5, 95],
        seasonality=seasonality,
        verbose=False,
        estimator=estimator,
        **sampling_args,
    )
    model.fit(train_df, point_method=point_method)

    initializer = model._model.get_init_values()
    if not seasonality:
        assert not initializer
    else:
        assert isinstance(initializer, LGTInitializer)
        assert initializer.s == 52
        assert initializer()['init_sea'].shape == (51, )

    predict_df = model.predict(test_df)
    expected_columns = ['week', 'prediction']

    assert predict_df.shape == (51, len(expected_columns))
    assert predict_df.columns.tolist() == expected_columns
    assert len(model._posterior_samples) == expected_num_parameters
def test_lgt_map_single_regressor(iclaims_training_data):
    """Fit and predict LGT (``'stan-map'``) with a single regressor column.

    The in-sample prediction must have one row per input row, only the
    ``week`` and ``prediction`` columns, and the model must hold 13 posterior
    entries.
    """
    # Work on a copy so the log transform is not written into the fixture
    # object itself.
    df = iclaims_training_data.copy()
    df['claims'] = np.log(df['claims'])

    lgt = LGT(
        response_col='claims',
        date_col='week',
        regressor_col=['trend.unemploy'],
        estimator='stan-map',
        seasonality=52,
        seed=8888,
    )
    lgt.fit(df)
    predicted_df = lgt.predict(df)

    expected_num_parameters = 13
    expected_columns = ['week', 'prediction']

    assert predicted_df.shape[0] == df.shape[0]
    assert predicted_df.columns.tolist() == expected_columns
    assert len(lgt._posterior_samples) == expected_num_parameters
def test_lgt_aggregated_with_regression(make_weekly_data, estimator, regressor_signs,
                                        point_method):
    """Fit LGT with signed regressors and a ``point_method``, then check the outputs.

    Only ``'stan-mcmc'`` and ``'pyro-svi'`` are exercised; for any other
    estimator the test returns immediately without asserting anything.
    """
    train_df, test_df, _ = make_weekly_data

    if estimator == 'stan-mcmc':
        sampling_args = {'num_warmup': 50, 'num_sample': 50}
    elif estimator == 'pyro-svi':
        sampling_args = {'num_steps': 10}
    else:
        return None

    model = LGT(
        response_col='response',
        date_col='week',
        regressor_col=train_df.columns.tolist()[2:],
        regressor_sign=regressor_signs,
        seasonality=52,
        verbose=False,
        estimator=estimator,
        **sampling_args,
    )
    model.fit(train_df, point_method=point_method)

    predict_df = model.predict(test_df)
    coefs_df = model.get_regression_coefs()

    expected_columns = ['week', 'prediction']

    assert predict_df.shape == (51, len(expected_columns))
    assert predict_df.columns.tolist() == expected_columns
    assert coefs_df.shape == (6, 3)
    assert coefs_df.shape[0] == len(train_df.columns.tolist()[2:])

    # the decomposed prediction must carry a non-trivial regression component
    decomposed_df = model.predict(test_df, decompose=True)
    assert any(decomposed_df['regression'].values)
def test_lgt_predict_seed(make_weekly_data, estimator, random_seed):
    """Check that two ``predict`` calls with the same seed give identical predictions."""
    train_df, test_df, _ = make_weekly_data

    if estimator == 'stan-mcmc':
        sampling_args = {'num_warmup': 50, 'num_sample': 100}
    elif estimator == 'pyro-svi':
        sampling_args = {'num_steps': 10}
    else:
        sampling_args = {}

    model = LGT(
        response_col='response',
        date_col='week',
        seasonality=52,
        n_bootstrap_draws=100,
        verbose=False,
        estimator=estimator,
        **sampling_args,
    )
    model.fit(train_df)

    first = model.predict(test_df, seed=random_seed)['prediction'].values
    second = model.predict(test_df, seed=random_seed)['prediction'].values

    # exact, element-wise equality is required here
    assert (first == second).all()
def test_lgt_missing(iclaims_training_data, estimator):
    """Fit and predict LGT on a response column that contains missing values.

    Several responses (including the last row) are set to NaN.  The
    prediction must contain no NaN, have one row per input row, and expose
    the columns expected for the estimator.

    Raises:
        ValueError: if ``estimator`` has no expected column layout defined
            here (only ``'stan-map'`` and ``'stan-mcmc'`` are covered).
    """
    # Resolve the expected layout first so an unsupported estimator fails with
    # a clear message rather than an unbound-name error after the fit.
    if estimator == 'stan-map':
        expected_columns = ['week', 'prediction']
    elif estimator == 'stan-mcmc':
        expected_columns = [
            'week', 'prediction_5', 'prediction', 'prediction_95'
        ]
    else:
        raise ValueError(
            'no expected prediction columns defined for estimator {!r}'.format(estimator))

    # Work on a copy so the injected NaNs are not written into the fixture
    # object itself.
    df = iclaims_training_data.copy()
    missing_idx = np.array([10, 20, 30, 40, 41, 42, 43, 44, df.shape[0] - 1])
    df.loc[missing_idx, 'claims'] = np.nan

    lgt = LGT(
        response_col='claims',
        date_col='week',
        seasonality=52,
        verbose=False,
        estimator=estimator,
    )
    lgt.fit(df)
    predicted_df = lgt.predict(df)

    assert all(~np.isnan(predicted_df['prediction']))
    assert predicted_df.columns.tolist() == expected_columns
    assert predicted_df.shape[0] == df.shape[0]
def test_lgt_map_reproducibility(make_weekly_data, seasonality):
    """Check that two independent ``'stan-map'`` fits give matching results.

    Both the ``'map'`` point posteriors and the predictions are compared with
    ``np.allclose``.
    """
    train_df, test_df, _ = make_weekly_data

    def fit_and_predict():
        # note a new instance must be created to reset the seed
        # note both fit and predict contain random generation processes
        model = LGT(
            response_col='response',
            date_col='week',
            prediction_percentiles=[5, 95],
            seasonality=seasonality,
            estimator='stan-map',
        )
        model.fit(train_df)
        posteriors = copy(model._point_posteriors['map'])
        prediction = model.predict(test_df)
        return posteriors, prediction

    posteriors1, prediction1 = fit_and_predict()
    posteriors2, prediction2 = fit_and_predict()

    # assert same posterior keys
    assert set(posteriors1.keys()) == set(posteriors2.keys())

    # assert posterior draws are reproducible
    for key, first_value in posteriors1.items():
        assert np.allclose(first_value, posteriors2[key])

    # assert prediction is reproducible
    assert np.allclose(prediction1['prediction'].values,
                       prediction2['prediction'].values)
def test_backtester_with_training_data(iclaims_training_data):
    """Check ``BackTester.score(include_training_metrics=True)`` with default metrics.

    Test metrics must cover every default metric.  Training metrics must
    cover only those default metrics whose callable signature is exactly
    ``{'actual', 'prediction'}``.  All reported values must be non-NaN.
    """
    model = LGT(
        response_col='claims',
        date_col='week',
        seasonality=1,
        verbose=False,
        estimator='stan-map',
    )
    backtester = BackTester(
        model=model,
        df=iclaims_training_data,
        min_train_len=100,
        incremental_len=100,
        forecast_len=20,
    )
    backtester.fit_predict()
    eval_out = backtester.score(include_training_metrics=True)

    reported_test_names = set(
        eval_out.loc[~eval_out['is_training_metric'], 'metric_name'].tolist())
    reported_train_names = set(
        eval_out.loc[eval_out['is_training_metric'], 'metric_name'].tolist())

    expected_test_names = {fn.__name__ for fn in backtester._default_metrics}
    # only metrics that need nothing beyond actual/prediction count as training metrics
    expected_train_names = {
        fn.__name__
        for fn in backtester._default_metrics
        if backtester._get_metric_callable_signature(fn) == {'actual', 'prediction'}
    }

    assert expected_test_names == reported_test_names
    assert expected_train_names == reported_train_names

    # default metric has 6 values where rmsse is only used in test metric
    train_values = eval_out.loc[eval_out['is_training_metric'], 'metric_values'].values
    test_values = eval_out.loc[~eval_out['is_training_metric'], 'metric_values'].values

    assert len(train_values) == 5
    assert len(test_values) == 6
    assert np.all(~np.isnan(train_values))
    assert np.all(~np.isnan(test_values))
def test_backtester_sceduler_args(iclaims_training_data, scheduler_args):
    """Run ``BackTester`` with the given scheduler kwargs and check ``smape`` is positive."""
    model = LGT(
        response_col='claims',
        date_col='week',
        seasonality=1,
        verbose=False,
        estimator='stan-map',
    )
    backtester = BackTester(model=model, df=iclaims_training_data, **scheduler_args)
    backtester.fit_predict()

    scores = backtester.score(metrics=[smape])
    metric_values = scores['metric_values'].values

    assert np.all(metric_values > 0)
def test_lgt_prediction_percentiles(iclaims_training_data, prediction_percentiles):
    """Check the prediction column names produced for the requested percentiles.

    A falsy ``prediction_percentiles`` is expected to yield the ``_5``/``_95``
    bounds around the unsuffixed column; any other value is expected to yield
    ``_5``, ``_10`` and ``_95``.  With ``decompose=True`` the same suffixes
    must appear for the prediction, trend, seasonality and regression columns.
    """
    df = iclaims_training_data

    model = LGT(
        response_col='claims',
        date_col='week',
        seasonality=52,
        num_warmup=50,
        num_sample=50,
        seed=8888,
        prediction_percentiles=prediction_percentiles,
        estimator='stan-mcmc',
    )

    # the empty suffix stands for the unsuffixed column
    suffixes = ['_5', '_10', '', '_95'] if prediction_percentiles else ['_5', '', '_95']

    model.fit(df)

    # plain prediction
    predicted_df = model.predict(df)
    expected_columns = ['week'] + ['prediction' + suffix for suffix in suffixes]
    assert predicted_df.columns.tolist() == expected_columns
    assert predicted_df.shape[0] == df.shape[0]

    # decomposed prediction: every component carries every suffix
    predicted_df = model.predict(df, decompose=True)
    components = [
        'prediction',
        PredictionKeys.TREND.value,
        PredictionKeys.SEASONALITY.value,
        PredictionKeys.REGRESSION.value,
    ]
    expected_columns = ['week'] + [
        component + suffix for component in components for suffix in suffixes
    ]
    assert predicted_df.columns.tolist() == expected_columns
    assert predicted_df.shape[0] == df.shape[0]
def test_lgt_full_reproducibility(make_weekly_data, estimator, regressor_signs, seasonality):
    """Check that two independent fits with identical settings give matching results.

    Posterior samples are compared with ``np.allclose``; the regression
    coefficient table and the prediction frame are compared with
    ``DataFrame.equals``.
    """
    train_df, test_df, _ = make_weekly_data

    def fit_and_collect():
        # note a new instance must be created to reset the seed
        # note both fit and predict contain random generation processes
        model = LGT(
            response_col='response',
            date_col='week',
            regressor_col=train_df.columns.tolist()[2:],
            regressor_sign=regressor_signs,
            prediction_percentiles=[5, 95],
            seasonality=seasonality,
            num_warmup=50,
            num_sample=50,
            verbose=False,
            estimator=estimator,
        )
        model.fit(train_df)
        posteriors = copy(model._posterior_samples)
        prediction = model.predict(test_df)
        coefs = model.get_regression_coefs()
        return posteriors, prediction, coefs

    posteriors_first, predict_df_first, regression_out_first = fit_and_collect()
    posteriors_second, predict_df_second, regression_out_second = fit_and_collect()

    # assert same posterior keys
    assert set(posteriors_first.keys()) == set(posteriors_second.keys())

    # assert posterior draws are reproducible
    for key, first_value in posteriors_first.items():
        assert np.allclose(first_value, posteriors_second[key])

    # assert identical regression columns
    # this is also checked in posterior samples, but an extra layer just in case
    # since this one is very commonly retrieved by end users
    assert regression_out_first.equals(regression_out_second)

    # assert prediction is reproducible
    assert predict_df_first.equals(predict_df_second)
def test_lgt_mixed_signs_and_order(iclaims_training_data, regressor_signs):
    """Check that predictions do not depend on regressor or column ordering.

    Two models are fitted: one with the regressors (and their signs) in the
    original order, one with both reversed.  Each predicts on the original
    frame and on a frame whose regressor columns are reordered.  All four
    predictions must agree within ``atol=1e-2``.
    """
    # Work on a copy so the log transform is not written into the fixture
    # object itself.
    df = iclaims_training_data.copy()
    df['claims'] = np.log(df['claims'])

    raw_regressor_col = ['trend.unemploy', 'trend.filling', 'trend.job']
    reordered_idx = [2, 1, 0]
    new_regressor_col = [raw_regressor_col[idx] for idx in reordered_idx]
    new_regressor_signs = [regressor_signs[idx] for idx in reordered_idx]

    # mixing ordering of cols in df of prediction
    new_df = df[['claims', 'week'] + new_regressor_col]

    lgt = LGT(
        response_col='claims',
        date_col='week',
        regressor_col=raw_regressor_col,
        regressor_sign=regressor_signs,
        estimator='stan-map',
        seasonality=52,
        seed=8888,
    )
    lgt.fit(df)
    predicted_df_v1 = lgt.predict(df)
    predicted_df_v2 = lgt.predict(new_df)

    # mixing ordering of signs
    lgt_new = LGT(
        response_col='claims',
        date_col='week',
        regressor_col=new_regressor_col,
        regressor_sign=new_regressor_signs,
        estimator='stan-map',
        seasonality=52,
        seed=8888,
    )
    lgt_new.fit(df)
    predicted_df_v3 = lgt_new.predict(df)
    predicted_df_v4 = lgt_new.predict(new_df)

    pred_v1 = predicted_df_v1['prediction'].values
    pred_v2 = predicted_df_v2['prediction'].values
    pred_v3 = predicted_df_v3['prediction'].values
    pred_v4 = predicted_df_v4['prediction'].values

    # they should all be identical; the ordering of signs or of columns in the
    # prediction frame should not matter
    assert np.allclose(pred_v1, pred_v2, atol=1e-2)
    assert np.allclose(pred_v1, pred_v3, atol=1e-2)
    assert np.allclose(pred_v1, pred_v4, atol=1e-2)