def run_test(env: TrainEnv, predictor: Predictor, test_dataset: Dataset) -> None:
    len_original = len(test_dataset)
    test_dataset = TransformedDataset(
        base_dataset=test_dataset,
        transformations=[
            FilterTransformation(
                lambda x: x["target"].shape[-1] > predictor.prediction_length
            )
        ],
    )
    len_filtered = len(test_dataset)
    if len_original > len_filtered:
        logger.warning(
            f"Not all time-series in the test-channel have "
            f"enough data to be used for evaluation. Proceeding with "
            f"{len_filtered}/{len_original} "
            f"(~{int(len_filtered / len_original * 100)}%) items."
        )

    forecast_it, ts_it = backtest.make_evaluation_predictions(
        dataset=test_dataset, predictor=predictor, num_samples=100
    )

    agg_metrics, _item_metrics = Evaluator()(
        ts_iterator=ts_it,
        fcst_iterator=forecast_it,
        num_series=len(test_dataset),
    )

    # we only log aggregate metrics for now as item metrics may be very large
    for name, score in agg_metrics.items():
        logger.info(f"#test_score ({env.current_host}, {name}): {score}")
def run_test(
    env: TrainEnv, predictor: Predictor, test_dataset: Dataset
) -> None:
    len_original = maybe_len(test_dataset)

    test_dataset = TransformedDataset(
        test_dataset,
        FilterTransformation(
            lambda x: x["target"].shape[-1] > predictor.prediction_length
        ),
    )

    len_filtered = len(test_dataset)

    if len_original is not None and len_original > len_filtered:
        logger.warning(
            f"Not all time-series in the test-channel have "
            f"enough data to be used for evaluation. Proceeding with "
            f"{len_filtered}/{len_original} "
            f"(~{int(len_filtered / len_original * 100)}%) items."
        )

    forecast_it, ts_it = backtest.make_evaluation_predictions(
        dataset=test_dataset, predictor=predictor, num_samples=100
    )

    if isinstance(predictor, RepresentableBlockPredictor) and isinstance(
        predictor.forecast_generator, QuantileForecastGenerator
    ):
        quantiles = predictor.forecast_generator.quantiles
        logger.info(f"Using quantiles `{quantiles}` for evaluation.")
        evaluator = Evaluator(quantiles=quantiles)
    else:
        evaluator = Evaluator()

    agg_metrics, item_metrics = evaluator(
        ts_iterator=ts_it,
        fcst_iterator=forecast_it,
        num_series=len(test_dataset),
    )

    # we only log aggregate metrics for now as item metrics may be very large
    for name, score in agg_metrics.items():
        logger.info(f"#test_score ({env.current_host}, {name}): {score}")

    # store metrics
    with open(env.path.model / "agg_metrics.json", "w") as agg_metric_file:
        json.dump(agg_metrics, agg_metric_file)
    with open(env.path.model / "item_metrics.csv", "w") as item_metrics_file:
        item_metrics.to_csv(item_metrics_file, index=False)
def deep_state(seed=42, data="m4_quarterly", epochs=100, batches=50):
    mx.random.seed(seed)
    np.random.seed(seed)

    dataset = get_dataset(data)

    trainer = Trainer(
        ctx=mx.cpu(0),  # ctx=mx.gpu(0),
        epochs=epochs,
        num_batches_per_epoch=batches,
        learning_rate=1e-3,
    )

    cardinality = int(dataset.metadata.feat_static_cat[0].cardinality)

    estimator = DeepStateEstimator(
        trainer=trainer,
        cardinality=[cardinality],
        prediction_length=dataset.metadata.prediction_length,
        freq=dataset.metadata.freq,
        use_feat_static_cat=True,
    )

    predictor = estimator.train(dataset.train)
    # predictor = estimator.train(training_data=dataset.train,
    #                             validation_data=dataset.test)

    forecast_it, ts_it = make_evaluation_predictions(
        dataset.test, predictor=predictor, num_samples=100
    )

    agg_metrics, item_metrics = Evaluator()(
        ts_it, forecast_it, num_series=len(dataset.test)
    )

    metrics = [
        "MASE", "sMAPE", "MSIS", "wQuantileLoss[0.5]", "wQuantileLoss[0.9]"
    ]
    output = {
        key: round(value, 8)
        for key, value in agg_metrics.items()
        if key in metrics
    }
    output["epochs"] = epochs
    output["seed"] = seed

    df = pd.DataFrame([output])
    return df
def test_dynamic_integration(
    train_length: int,
    test_length: int,
    prediction_length: int,
    target_start: str,
    rolling_start: str,
    num_dynamic_feat: int,
):
    """
    Trains an estimator on a rolled dataset with dynamic features.
    Tests https://github.com/awslabs/gluon-ts/issues/1390
    """
    train_ds = create_dynamic_dataset(
        target_start, train_length, num_dynamic_feat
    )
    rolled_ds = generate_rolling_dataset(
        dataset=create_dynamic_dataset(
            target_start, test_length, num_dynamic_feat
        ),
        strategy=StepStrategy(prediction_length=prediction_length),
        start_time=pd.Timestamp(rolling_start),
    )
    estimator = DeepAREstimator(
        freq="D",
        prediction_length=prediction_length,
        context_length=2 * prediction_length,
        use_feat_dynamic_real=True,
        trainer=Trainer(epochs=1),
    )
    predictor = estimator.train(training_data=train_ds)
    forecast_it, ts_it = make_evaluation_predictions(
        rolled_ds, predictor=predictor, num_samples=100
    )
    training_agg_metrics, _ = Evaluator(num_workers=0)(ts_it, forecast_it)
    # it should have failed by this point if the dynamic features were wrong
    assert training_agg_metrics
def test_training_with_implicit_quantile_output():
    dataset = get_dataset("constant")
    metadata = dataset.metadata

    deepar_estimator = DeepAREstimator(
        distr_output=ImplicitQuantileOutput(output_domain="Real"),
        freq=metadata.freq,
        prediction_length=metadata.prediction_length,
        trainer=Trainer(
            device="cpu",
            epochs=5,
            learning_rate=1e-3,
            num_batches_per_epoch=3,
            batch_size=256,
        ),
        input_size=15,
    )
    deepar_predictor = deepar_estimator.train(dataset.train, num_workers=1)

    forecast_it, ts_it = make_evaluation_predictions(
        dataset=dataset.test,  # test dataset
        predictor=deepar_predictor,  # predictor
        num_samples=100,  # number of sample paths we want for evaluation
    )
    forecasts = list(forecast_it)
    tss = list(ts_it)

    evaluator = Evaluator(num_workers=0)
    agg_metrics, item_metrics = evaluator(
        iter(tss), iter(forecasts), num_series=len(dataset.test)
    )

    assert agg_metrics["MSE"] > 0
def test_listing_1():
    """
    Test GluonTS paper examples from arxiv paper:
    https://arxiv.org/abs/1906.05264

    Listing 1
    """
    from gluonts.dataset.repository.datasets import get_dataset
    from gluonts.model.deepar import DeepAREstimator
    from gluonts.trainer import Trainer
    from gluonts.evaluation import Evaluator
    from gluonts.evaluation.backtest import backtest_metrics

    # We use electricity in the paper but that would take too long to run in
    # the unit test
    dataset_info, train_ds, test_ds = constant_dataset()

    meta = dataset_info.metadata
    estimator = DeepAREstimator(
        freq=meta.time_granularity,
        prediction_length=1,
        trainer=Trainer(epochs=1, batch_size=32),
    )
    predictor = estimator.train(train_ds)

    evaluator = Evaluator(quantiles=(0.1, 0.5, 0.9))
    agg_metrics, item_metrics = backtest_metrics(
        train_dataset=train_ds,
        test_dataset=test_ds,
        forecaster=predictor,
        evaluator=evaluator,
    )
def evaluate(dataset_name, estimator):
    dataset = get_dataset(dataset_name)
    estimator = estimator(
        prediction_length=dataset.metadata.prediction_length,
        freq=dataset.metadata.time_granularity,
    )

    print(f"evaluating {estimator} on {dataset}")

    predictor = estimator.train(dataset.train)

    forecast_it, ts_it = make_evaluation_predictions(
        dataset.test, predictor=predictor, num_eval_samples=100
    )

    agg_metrics, item_metrics = Evaluator()(
        ts_it, forecast_it, num_series=len(dataset.test)
    )

    pprint.pprint(agg_metrics)

    eval_dict = agg_metrics
    eval_dict["dataset"] = dataset_name
    eval_dict["estimator"] = type(estimator).__name__
    return eval_dict
def evaluate(dataset_name, estimator):
    dataset = get_dataset(dataset_name)
    estimator = estimator(
        prediction_length=dataset.metadata.prediction_length,
        freq=dataset.metadata.freq,
        use_feat_static_cat=True,
        cardinality=[
            feat_static_cat.cardinality
            for feat_static_cat in dataset.metadata.feat_static_cat
        ],
    )

    print(f"evaluating {estimator} on {dataset}")

    predictor = estimator.train(dataset.train)

    forecast_it, ts_it = make_evaluation_predictions(
        dataset.test, predictor=predictor, num_samples=100
    )

    agg_metrics, item_metrics = Evaluator()(
        ts_it, forecast_it, num_series=len(dataset.test)
    )

    pprint.pprint(agg_metrics)

    eval_dict = agg_metrics
    eval_dict["dataset"] = dataset_name
    eval_dict["estimator"] = type(estimator).__name__
    return eval_dict
def test_forecasts(method_name):
    if method_name == "mlp":
        # https://stackoverflow.com/questions/56254321/error-in-ifncol-matrix-rep-argument-is-of-length-zero
        # https://cran.r-project.org/web/packages/neuralnet/index.html
        #   published before the bug fix: https://github.com/bips-hb/neuralnet/pull/21
        # The issue is still open on the nnfor package: https://github.com/trnnick/nnfor/issues/8
        # TODO: look for a workaround.
        pytest.xfail(
            "MLP currently does not work because "
            "the `neuralnet` package is not yet updated with a known bug fix "
            "in `bips-hb/neuralnet`"
        )

    dataset = datasets.get_dataset("constant")

    (train_dataset, test_dataset, metadata) = (
        dataset.train,
        dataset.test,
        dataset.metadata,
    )

    freq = metadata.freq
    prediction_length = metadata.prediction_length

    params = dict(
        freq=freq, prediction_length=prediction_length, method_name=method_name
    )

    predictor = RForecastPredictor(**params)
    predictions = list(predictor.predict(train_dataset))

    forecast_type = (
        QuantileForecast
        if method_name in QUANTILE_FORECAST_METHODS
        else SampleForecast
    )
    assert all(
        isinstance(prediction, forecast_type) for prediction in predictions
    )

    assert all(prediction.freq == freq for prediction in predictions)

    assert all(
        prediction.prediction_length == prediction_length
        for prediction in predictions
    )

    assert all(
        prediction.start_date == forecast_start(data)
        for data, prediction in zip(train_dataset, predictions)
    )

    evaluator = Evaluator()
    agg_metrics, item_metrics = backtest_metrics(
        test_dataset=test_dataset,
        predictor=predictor,
        evaluator=evaluator,
    )

    assert agg_metrics["mean_wQuantileLoss"] < TOLERANCE
    assert agg_metrics["NRMSE"] < TOLERANCE
    assert agg_metrics["RMSE"] < TOLERANCE
def test_MASE_sMAPE_M4(timeseries, res):
    ts_datastructure = pd.Series
    evaluator = Evaluator(quantiles=QUANTILES)
    agg_df, item_df = calculate_metrics(
        timeseries, evaluator, ts_datastructure
    )

    assert abs((agg_df["MASE"] - res["MASE"]) / res["MASE"]) < 0.001, (
        "Scores for the metric MASE do not match: "
        "\nexpected: {} \nobtained: {}".format(res["MASE"], agg_df["MASE"])
    )
    assert abs((agg_df["MAPE"] - res["MAPE"]) / res["MAPE"]) < 0.001, (
        "Scores for the metric MAPE do not match: \nexpected: {} "
        "\nobtained: {}".format(res["MAPE"], agg_df["MAPE"])
    )
    assert abs((agg_df["sMAPE"] - res["sMAPE"]) / res["sMAPE"]) < 0.001, (
        "Scores for the metric sMAPE do not match: \nexpected: {} "
        "\nobtained: {}".format(res["sMAPE"], agg_df["sMAPE"])
    )
    assert (
        sum(abs(item_df["seasonal_error"].values - res["seasonal_error"]))
        < 0.001
    ), (
        "Scores for the metric seasonal_error do not match: \nexpected: {} "
        "\nobtained: {}".format(
            res["seasonal_error"], item_df["seasonal_error"].values
        )
    )
def testDeepRenewal(type, hybridize, freq, num_feat_dynamic_real, cardinality):
    prediction_length = 3
    if type == "synthetic":
        train_ds, test_ds = make_dummy_datasets_with_features(
            prediction_length=prediction_length,
            freq=freq,
            num_feat_dynamic_real=num_feat_dynamic_real,
            cardinality=cardinality,
        )
    else:
        train_ds = make_constant_dataset(train_length=15, freq=freq)
        test_ds = train_ds

    trainer = Trainer(
        ctx="cpu", epochs=1, hybridize=hybridize
    )  # hybridize false for development

    estimator = DeepRenewalEstimator(
        prediction_length=prediction_length,
        freq=freq,
        trainer=trainer,
    )
    predictor = estimator.train(training_data=train_ds)

    forecast_it, ts_it = make_evaluation_predictions(
        dataset=test_ds, predictor=predictor, num_samples=100
    )
    evaluator = Evaluator(calculate_owa=False, num_workers=0)
    agg_metrics, item_metrics = evaluator(
        ts_it, forecast_it, num_series=len(test_ds)
    )

    if type == "synthetic":
        accuracy = 1.5
    else:
        accuracy = 1.3

    assert agg_metrics["ND"] <= accuracy
def simple_main():
    import mxnet as mx
    from pprint import pprint

    dataset = get_dataset("electricity", regenerate=False)

    trainer = Trainer(
        ctx=mx.cpu(0),
        epochs=10,
        num_batches_per_epoch=200,
        learning_rate=1e-3,
        hybridize=False,
    )

    cardinality = int(dataset.metadata.feat_static_cat[0].cardinality)

    estimator = DeepFactorEstimator(
        trainer=trainer,
        context_length=168,
        cardinality=[cardinality],
        prediction_length=dataset.metadata.prediction_length,
        freq=dataset.metadata.freq,
    )

    predictor = estimator.train(dataset.train)

    forecast_it, ts_it = make_evaluation_predictions(
        dataset.test, predictor=predictor, num_eval_samples=100
    )

    agg_metrics, item_metrics = Evaluator()(
        ts_it, forecast_it, num_series=len(dataset.test)
    )

    pprint(agg_metrics)
def test_forecast_parser():
    # verify that what is logged for estimators, datasets and metrics can be
    # recovered from their string representation
    dataset_info, train_ds, test_ds = constant_dataset()

    estimator = make_estimator(
        dataset_info.metadata.freq, dataset_info.prediction_length
    )
    assert repr(estimator) == repr(load_code(repr(estimator)))

    predictor = estimator.train(training_data=train_ds)

    stats = calculate_dataset_statistics(train_ds)
    assert stats == eval(
        repr(stats), globals(), {"gluonts": gluonts}
    )  # TODO: use load

    evaluator = Evaluator(quantiles=[0.1, 0.5, 0.9])
    agg_metrics, _ = backtest_metrics(test_ds, predictor, evaluator)

    # reset infinite metrics to 0 (otherwise the assertion below fails)
    for key, val in agg_metrics.items():
        if not math.isfinite(val):
            agg_metrics[key] = 0.0

    assert agg_metrics == load_code(dump_code(agg_metrics))
def _make_evaluation_predictions(self, predictor, test_list_dataset):
    """Evaluate predictor and generate sample forecasts.

    Args:
        predictor (gluonts.model.predictor.Predictor): Trained object used to make forecasts.
        test_list_dataset (gluonts.dataset.common.ListDataset): ListDataset created with the GluonDataset class.

    Returns:
        Dictionary of aggregated metrics over all timeseries.
        DataFrame of metrics for each timeseries (i.e., each target column).
        List of gluonts.model.forecast.Forecast (objects storing the predicted distributions as samples).
    """
    try:
        forecast_it, ts_it = make_evaluation_predictions(
            dataset=test_list_dataset, predictor=predictor, num_samples=100
        )
        forecasts = list(forecast_it)
    except Exception as err:
        raise ModelPredictionError(
            f"GluonTS '{self.model_name}' model crashed when making predictions. Full error: {err}"
        )

    evaluator = Evaluator(num_workers=min(2, multiprocessing.cpu_count()))
    agg_metrics, item_metrics = evaluator(
        ts_it, forecasts, num_series=len(test_list_dataset)
    )
    return agg_metrics, item_metrics, forecasts
def test_custom_eval_fn(
    timeseries,
    res,
    has_nans,
    input_type,
    eval_name,
    eval_fn,
    agg_str,
    fcst_type,
):
    ts_datastructure = pd.Series
    evaluator = Evaluator(
        quantiles=QUANTILES,
        custom_eval_fn={eval_name: [eval_fn, agg_str, fcst_type]},
    )
    agg_metrics, item_metrics = calculate_metrics(
        timeseries,
        evaluator,
        ts_datastructure,
        forecaster=naive_forecaster,
        has_nans=has_nans,
        input_type=input_type,
    )

    assert eval_name in agg_metrics.keys()
    assert eval_name in item_metrics.keys()

    for metric, score in agg_metrics.items():
        if metric in res.keys():
            assert np.isclose(score, res[metric], equal_nan=True), (
                "Scores for the metric {} do not match: \nexpected: {} "
                "\nobtained: {}".format(metric, res[metric], score)
            )
def gluonts_evaluation(self, tss, preds, load_path, test_ds):
    evaluator = Evaluator(quantiles=[0.1, 0.3, 0.5, 0.7, 0.9])
    agg_metrics, item_metrics = evaluator(
        iter(tss), iter(preds), num_series=len(test_ds)
    )
    item_metrics.to_csv(
        os.path.join(load_path, 'models_evaluation_metric.csv')
    )
    return item_metrics
def test_simple_model():
    dsinfo, training_data, test_data = default_synthetic()

    freq = dsinfo.metadata.freq
    prediction_length = dsinfo.prediction_length
    context_length = 2 * prediction_length
    hidden_dimensions = [10, 10]

    net = LightningFeedForwardNetwork(
        freq=freq,
        prediction_length=prediction_length,
        context_length=context_length,
        hidden_dimensions=hidden_dimensions,
        distr_output=NormalOutput(),
        batch_norm=True,
        scaling=mean_abs_scaling,
    )

    transformation = Chain([
        AddObservedValuesIndicator(
            target_field=FieldName.TARGET,
            output_field=FieldName.OBSERVED_VALUES,
        ),
        InstanceSplitter(
            target_field=FieldName.TARGET,
            is_pad_field=FieldName.IS_PAD,
            start_field=FieldName.START,
            forecast_start_field=FieldName.FORECAST_START,
            train_sampler=ExpectedNumInstanceSampler(num_instances=1),
            past_length=context_length,
            future_length=prediction_length,
            time_series_fields=[FieldName.OBSERVED_VALUES],
        ),
    ])

    data_loader = TrainDataLoader(
        training_data,
        batch_size=8,
        stack_fn=batchify,
        transform=transformation,
        num_batches_per_epoch=5,
    )

    trainer = pl.Trainer(max_epochs=3, callbacks=[], weights_summary=None)
    trainer.fit(net, train_dataloader=data_loader)

    predictor = net.get_predictor(transformation)

    forecast_it, ts_it = make_evaluation_predictions(
        dataset=test_data,
        predictor=predictor,
        num_samples=100,
    )

    evaluator = Evaluator(quantiles=[0.5, 0.9], num_workers=None)
    agg_metrics, _ = evaluator(ts_it, forecast_it)
def test_accuracy(predictor_cls, parameters, accuracy):
    predictor = predictor_cls(freq=CONSTANT_DATASET_FREQ, **parameters)
    agg_metrics, item_metrics = backtest_metrics(
        test_dataset=constant_test_ds,
        predictor=predictor,
        evaluator=Evaluator(calculate_owa=True),
    )

    assert agg_metrics["ND"] <= accuracy
def test_accuracy(Estimator, hyperparameters, accuracy):
    estimator = Estimator.from_hyperparameters(freq=freq, **hyperparameters)
    agg_metrics, item_metrics = backtest_metrics(
        train_dataset=train_ds,
        test_dataset=test_ds,
        forecaster=estimator,
        evaluator=Evaluator(calculate_owa=True),
    )

    assert agg_metrics["ND"] <= accuracy
def forecast_metrics(tss, forecasts, quantiles=[0.1, 0.5, 0.9], show=True, dir_save=None):
    from gluonts.evaluation import Evaluator

    evaluator = Evaluator(quantiles=quantiles)
    agg_metrics, item_metrics = evaluator(
        iter(tss), iter(forecasts), num_series=len(forecasts)
    )

    if show:
        log(json.dumps(agg_metrics, indent=4))

    if dir_save:
        # json.dump needs a file handle, which the original call was missing;
        # the output file name "agg_metrics.json" is an assumption.
        with open(os.path.join(dir_save, "agg_metrics.json"), "w") as f:
            json.dump(agg_metrics, f, indent=4)

    return agg_metrics, item_metrics
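# Usage sketch for forecast_metrics above (not part of the original snippet):
# `predictor` and `test_ds` stand for an already trained GluonTS predictor and
# a test dataset, both assumed to exist in the calling code.
from gluonts.evaluation.backtest import make_evaluation_predictions

forecast_it, ts_it = make_evaluation_predictions(
    dataset=test_ds, predictor=predictor, num_samples=100
)
forecasts = list(forecast_it)
tss = list(ts_it)

agg_metrics, item_metrics = forecast_metrics(
    tss, forecasts, quantiles=[0.1, 0.5, 0.9], show=True
)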
def run_test(env: TrainEnv, predictor: Predictor, test_dataset: Dataset) -> None:
    forecast_it, ts_it = backtest.make_evaluation_predictions(
        test_dataset, predictor=predictor, num_eval_samples=100
    )

    agg_metrics, _item_metrics = Evaluator()(
        ts_it, forecast_it, num_series=len(test_dataset)
    )

    # we only log aggregate metrics for now as item metrics may be
    # very large
    log_metrics(env, agg_metrics)
def mse(net, test):
    # relies on `estimator` and `transformation` from the enclosing scope
    predictor = estimator.create_predictor(transformation, net)
    forecast_it, ts_it = make_evaluation_predictions(
        dataset=test,  # test dataset
        predictor=predictor,  # predictor
        num_samples=100,  # number of sample paths we want for evaluation
    )
    forecasts = list(forecast_it)
    tss = list(ts_it)
    evaluator = Evaluator(quantiles=[0.1, 0.5, 0.9])
    agg_metrics, item_metrics = evaluator(
        iter(tss), iter(forecasts), num_series=len(test)
    )
    return agg_metrics['MSE']
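# Hypothetical usage sketch for mse() above. The import paths, the
# train_model()/TrainOutput API, and the names `train_ds`/`test_ds` are
# assumptions that depend on the (MXNet-based) GluonTS version in use;
# the point is only that `estimator` and `transformation` must be in scope
# before mse() is called.
from gluonts.model.deepar import DeepAREstimator
from gluonts.mx.trainer import Trainer

estimator = DeepAREstimator(
    freq="H", prediction_length=24, trainer=Trainer(epochs=1)
)
train_output = estimator.train_model(train_ds)   # assumed to return a TrainOutput
transformation = train_output.transformation     # used implicitly by mse()

test_mse = mse(train_output.trained_net, test_ds)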
def train(epochs, prediction_length, num_layers, dropout_rate):
    # create train dataset
    df = pd.read_csv(
        filepath_or_buffer=os.environ['SM_CHANNEL_TRAIN'] + "/train.csv",
        header=0,
        index_col=0,
    )
    training_data = ListDataset(
        [{"start": df.index[0], "target": df.value[:]}], freq="5min"
    )

    # define DeepAR estimator
    deepar_estimator = DeepAREstimator(
        freq="5min",
        prediction_length=prediction_length,
        dropout_rate=dropout_rate,
        num_layers=num_layers,
        trainer=Trainer(epochs=epochs),
    )

    # train the model
    deepar_predictor = deepar_estimator.train(training_data=training_data)

    # create test dataset
    df = pd.read_csv(
        filepath_or_buffer=os.environ['SM_CHANNEL_TEST'] + "/test.csv",
        header=0,
        index_col=0,
    )
    test_data = ListDataset(
        [{"start": df.index[0], "target": df.value[:]}], freq="5min"
    )

    # evaluate trained model on test data
    forecast_it, ts_it = make_evaluation_predictions(
        test_data, deepar_predictor, num_samples=100
    )
    forecasts = list(forecast_it)
    tss = list(ts_it)
    evaluator = Evaluator(quantiles=[0.1, 0.5, 0.9])
    agg_metrics, item_metrics = evaluator(
        iter(tss), iter(forecasts), num_series=len(test_data)
    )
    print("MSE:", agg_metrics["MSE"])

    # save the model
    deepar_predictor.serialize(pathlib.Path(os.environ['SM_MODEL_DIR']))

    return deepar_predictor
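# Hypothetical entry point for the SageMaker-style train() function above;
# the hyperparameter names mirror the function arguments, but the argparse
# wiring and defaults are assumptions, not part of the original script.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--prediction_length", type=int, default=12)
    parser.add_argument("--num_layers", type=int, default=2)
    parser.add_argument("--dropout_rate", type=float, default=0.1)
    args, _ = parser.parse_known_args()

    train(args.epochs, args.prediction_length, args.num_layers, args.dropout_rate)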
def run_test(self, dataset, estimator, predictor):
    test_dataset = TransformedDataset(
        dataset,
        transformations=[
            FilterTransformation(
                lambda el: el['target'].shape[-1] > predictor.prediction_length
            )
        ],
    )

    len_orig = len(dataset)
    len_filtered = len(test_dataset)
    if len_orig > len_filtered:
        logging.warning(
            'Not all time-series in the test-channel have '
            'enough data to be used for evaluation. Proceeding with '
            f'{len_filtered}/{len_orig} '
            f'(~{int(len_filtered/len_orig*100)}%) items.'
        )

    try:
        log.metric('test_dataset_stats', test_dataset.calc_stats())
    except GluonTSDataError as error:
        logging.error(
            f"Failure whilst calculating stats for test dataset: {error}"
        )
        return

    if isinstance(estimator, GluonEstimator) and isinstance(
        predictor, GluonPredictor
    ):
        inference_data_loader = InferenceDataLoader(
            dataset=test_dataset,
            transform=predictor.input_transform,
            batch_size=estimator.trainer.batch_size,
            ctx=estimator.trainer.ctx,
            float_type=estimator.float_type,
        )

        if estimator.trainer.hybridize:
            predictor.hybridize(batch=next(iter(inference_data_loader)))

        if self.hyperparameters.get('use_symbol_block_predictor'):
            predictor = predictor.as_symbol_block_predictor(
                batch=next(iter(inference_data_loader))
            )

    num_eval_samples = self.hyperparameters.get('num_eval_samples', 100)
    quantiles = self.hyperparameters.get(
        'quantiles', (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    )

    # we only log aggregate metrics for now as item metrics may be
    # very large
    predictions, input_timeseries = backtest.make_evaluation_predictions(
        test_dataset, predictor, num_eval_samples
    )
    agg_metrics, _item_metrics = Evaluator(quantiles=quantiles)(
        input_timeseries, predictions, num_series=len_filtered
    )
    log.metric("agg_metrics", agg_metrics)
def train(arguments):
    """
    Generic train method that trains a specified estimator on a specified
    dataset.
    """
    logger.info("Downloading estimator config.")
    estimator_config = Path(arguments.estimator) / "estimator.json"
    with estimator_config.open() as config_file:
        estimator = serde.load_json(config_file.read())

    logger.info("Downloading dataset.")
    if arguments.s3_dataset is None:
        # load built in dataset
        dataset = datasets.get_dataset(arguments.dataset)
    else:
        # load custom dataset
        s3_dataset_dir = Path(arguments.s3_dataset)
        dataset = common.load_datasets(
            metadata=s3_dataset_dir,
            train=s3_dataset_dir / "train",
            test=s3_dataset_dir / "test",
        )

    logger.info("Starting model training.")
    predictor = estimator.train(dataset.train)
    forecast_it, ts_it = backtest.make_evaluation_predictions(
        dataset=dataset.test,
        predictor=predictor,
        num_samples=int(arguments.num_samples),
    )

    logger.info("Starting model evaluation.")
    evaluator = Evaluator(quantiles=eval(arguments.quantiles))
    agg_metrics, item_metrics = evaluator(
        ts_it, forecast_it, num_series=len(list(dataset.test))
    )

    # required for metric tracking.
    for name, value in agg_metrics.items():
        logger.info(f"gluonts[metric-{name}]: {value}")

    # save the evaluation results
    metrics_output_dir = Path(arguments.output_data_dir)
    with open(metrics_output_dir / "agg_metrics.json", "w") as f:
        json.dump(agg_metrics, f)
    with open(metrics_output_dir / "item_metrics.csv", "w") as f:
        item_metrics.to_csv(f, index=False)

    # save the model
    model_output_dir = Path(arguments.model_dir)
    predictor.serialize(model_output_dir)
def run_test(forecaster, test_dataset):
    agg_metrics, _item_metrics = backtest.backtest_metrics(
        train_dataset=None,
        test_dataset=test_dataset,
        forecaster=forecaster,
        evaluator=Evaluator(
            quantiles=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
        ),
        num_eval_samples=100,
    )

    # we only log aggregate metrics for now as item metrics may be
    # very large
    log.metric("agg_metrics", agg_metrics)
def test_accuracy(Estimator, hyperparameters, accuracy):
    estimator = from_hyperparameters(Estimator, hyperparameters, dsinfo)
    predictor = estimator.train(training_data=dsinfo.train_ds)
    agg_metrics, item_metrics = backtest_metrics(
        test_dataset=dsinfo.test_ds,
        predictor=predictor,
        evaluator=Evaluator(calculate_owa=statsmodels is not None),
    )

    if dsinfo.name == "synthetic":
        accuracy = 10.0

    assert agg_metrics["ND"] <= accuracy
def test_training_external_features(self):
    prediction_length = 2
    frequency = "3M"
    gluon_dataset = ListDataset(self.timeseries, freq=frequency)

    estimator = AutoARIMAEstimator(
        prediction_length=prediction_length,
        freq=frequency,
        season_length=4,
        use_feat_dynamic_real=True,
    )
    predictor = estimator.train(gluon_dataset)

    forecast_it, ts_it = make_evaluation_predictions(
        dataset=gluon_dataset, predictor=predictor, num_samples=100
    )
    timeseries = list(ts_it)
    forecasts = list(forecast_it)
    assert forecasts[1].samples.shape == (100, 2)

    evaluator = Evaluator()
    agg_metrics, item_metrics = evaluator(
        iter(timeseries), iter(forecasts), num_series=len(gluon_dataset)
    )
    assert agg_metrics["MAPE"] is not None
def __init__(
    self,
    var_results: List[VARResultsWrapper],
    train_datasets: List[np.ndarray],
    original_datasets: List[np.ndarray],
    initial_log_values: List[np.ndarray],
    horizon: int = 6,
    freq: str = 'M',
    var_diff: bool = False,
) -> None:
    self.var_results = var_results
    self.lag_orders = [results.k_ar for results in var_results]
    self.horizon = horizon
    self.train_datasets = train_datasets
    self.original_datasets = [
        np.ma.masked_invalid(original_dataset)
        for original_dataset in original_datasets
    ]
    self.initial_log_values = initial_log_values
    self.evaluator = Evaluator()
    self.freq = freq
    self.var_diff = var_diff
def save_item_metrics(dataset, forecasts, tss, model, metric):
    evaluator = Evaluator(quantiles=[0.005, 0.1, 0.5, 0.9, 0.995])
    agg_metrics, item_metrics = evaluator(
        iter(tss), iter(forecasts), num_series=len(dataset.test_ds)
    )

    if metric == "Coverage":
        low_coverage = item_metrics[["Coverage[0.005]"]].to_numpy()
        high_coverage = item_metrics[["Coverage[0.995]"]].to_numpy()
        low_score = 0.005 - low_coverage
        high_score = high_coverage - 0.995
        item_metric = high_score + low_score
    else:
        item_metric = item_metrics[[metric]].to_numpy()

    np.save("item_metrics/" + metric + '_' + model + '.npy', item_metric)