def run_test(env: TrainEnv, predictor: Predictor, test_dataset: Dataset) -> None:
    len_original = len(test_dataset)

    test_dataset = TransformedDataset(
        base_dataset=test_dataset,
        transformations=[
            FilterTransformation(
                lambda x: x["target"].shape[-1] > predictor.prediction_length
            )
        ],
    )

    len_filtered = len(test_dataset)

    if len_original > len_filtered:
        logger.warning(
            f"Not all time-series in the test-channel have "
            f"enough data to be used for evaluation. Proceeding with "
            f"{len_filtered}/{len_original} "
            f"(~{int(len_filtered / len_original * 100)}%) items."
        )

    forecast_it, ts_it = backtest.make_evaluation_predictions(
        dataset=test_dataset, predictor=predictor, num_samples=100
    )

    agg_metrics, _item_metrics = Evaluator()(
        ts_iterator=ts_it,
        fcst_iterator=forecast_it,
        num_series=len(test_dataset),
    )

    # we only log aggregate metrics for now as item metrics may be very large
    for name, score in agg_metrics.items():
        logger.info(f"#test_score ({env.current_host}, {name}): {score}")
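
# The "#test_score (host, metric): value" lines emitted above are plain log output.
# A minimal sketch, assuming one wants to scrape them back into (metric, value)
# pairs with a regex; the pattern and helper below are ours, not part of gluonts.
import re

TEST_SCORE_PATTERN = re.compile(r"#test_score \(([^,]+), ([^)]+)\): (\S+)")

def parse_test_score(line):
    """Return (metric_name, value) for a #test_score log line, or None otherwise."""
    match = TEST_SCORE_PATTERN.search(line)
    if match is None:
        return None
    return match.group(2), float(match.group(3))

# parse_test_score("#test_score (algo-1, MASE): 1.23") -> ("MASE", 1.23)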

def run_test(self, dataset, estimator, predictor):
    test_dataset = TransformedDataset(
        dataset,
        transformations=[
            FilterTransformation(
                lambda el: el['target'].shape[-1] > predictor.prediction_length
            )
        ],
    )

    len_orig = len(dataset)
    len_filtered = len(test_dataset)
    if len_orig > len_filtered:
        logging.warning(
            'Not all time-series in the test-channel have '
            'enough data to be used for evaluation. Proceeding with '
            f'{len_filtered}/{len_orig} '
            f'(~{int(len_filtered / len_orig * 100)}%) items.'
        )

    try:
        log.metric('test_dataset_stats', test_dataset.calc_stats())
    except GluonTSDataError as error:
        logging.error(
            f"Failure whilst calculating stats for test dataset: {error}"
        )
        return

    if isinstance(estimator, GluonEstimator) and isinstance(
        predictor, GluonPredictor
    ):
        inference_data_loader = InferenceDataLoader(
            dataset=test_dataset,
            transform=predictor.input_transform,
            batch_size=estimator.trainer.batch_size,
            ctx=estimator.trainer.ctx,
            float_type=estimator.float_type,
        )

        if estimator.trainer.hybridize:
            predictor.hybridize(batch=next(iter(inference_data_loader)))

        if self.hyperparameters.get('use_symbol_block_predictor'):
            predictor = predictor.as_symbol_block_predictor(
                batch=next(iter(inference_data_loader))
            )

    num_eval_samples = self.hyperparameters.get('num_eval_samples', 100)
    quantiles = self.hyperparameters.get(
        'quantiles', (0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    )

    # we only log aggregate metrics for now as item metrics may be
    # very large
    predictions, input_timeseries = backtest.make_evaluation_predictions(
        test_dataset, predictor, num_eval_samples
    )
    agg_metrics, _item_metrics = Evaluator(quantiles=quantiles)(
        input_timeseries, predictions, num_series=len_filtered
    )
    log.metric("agg_metrics", agg_metrics)
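
# A hypothetical hyperparameters dict exercising the keys read above; only the
# key names ('use_symbol_block_predictor', 'num_eval_samples', 'quantiles') come
# from the code, the values chosen here are assumptions for illustration.
example_hyperparameters = {
    'use_symbol_block_predictor': True,  # export the predictor as a symbol block before evaluating
    'num_eval_samples': 200,             # sample paths drawn per forecast (fallback above: 100)
    'quantiles': (0.1, 0.5, 0.9),        # quantiles handed to Evaluator (fallback above: the deciles)
}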

def run_test(
    env: TrainEnv, predictor: Predictor, test_dataset: Dataset
) -> None:
    len_original = maybe_len(test_dataset)

    test_dataset = TransformedDataset(
        test_dataset,
        FilterTransformation(
            lambda x: x["target"].shape[-1] > predictor.prediction_length
        ),
    )

    len_filtered = len(test_dataset)

    if len_original is not None and len_original > len_filtered:
        logger.warning(
            f"Not all time-series in the test-channel have "
            f"enough data to be used for evaluation. Proceeding with "
            f"{len_filtered}/{len_original} "
            f"(~{int(len_filtered / len_original * 100)}%) items."
        )

    forecast_it, ts_it = backtest.make_evaluation_predictions(
        dataset=test_dataset, predictor=predictor, num_samples=100
    )

    if isinstance(predictor, RepresentableBlockPredictor) and isinstance(
        predictor.forecast_generator, QuantileForecastGenerator
    ):
        quantiles = predictor.forecast_generator.quantiles
        logger.info(f"Using quantiles `{quantiles}` for evaluation.")
        evaluator = Evaluator(quantiles=quantiles)
    else:
        evaluator = Evaluator()

    agg_metrics, item_metrics = evaluator(
        ts_iterator=ts_it,
        fcst_iterator=forecast_it,
        num_series=len(test_dataset),
    )

    # we only log aggregate metrics for now as item metrics may be very large
    for name, score in agg_metrics.items():
        logger.info(f"#test_score ({env.current_host}, {name}): {score}")

    # store metrics
    with open(env.path.model / "agg_metrics.json", "w") as agg_metric_file:
        json.dump(agg_metrics, agg_metric_file)
    with open(env.path.model / "item_metrics.csv", "w") as item_metrics_file:
        item_metrics.to_csv(item_metrics_file, index=False)
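
# A minimal sketch of reading the stored artifacts back; the file names come from
# run_test above, while the model directory path is a hypothetical stand-in for
# env.path.model.
import json
from pathlib import Path

import pandas as pd

model_dir = Path("/opt/ml/model")  # hypothetical; use whatever env.path.model pointed to

with open(model_dir / "agg_metrics.json") as agg_metrics_file:
    agg_metrics = json.load(agg_metrics_file)  # dict: aggregate metric name -> value

item_metrics = pd.read_csv(model_dir / "item_metrics.csv")  # one row of metrics per test series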

def prepare_test_dataset(dataset: Dataset, prediction_length: int) -> Dataset:
    test_dataset = TransformedDataset(
        dataset,
        transformations=[
            FilterTransformation(
                lambda el: el['target'].shape[-1] > prediction_length
            )
        ],
    )

    len_orig = len(dataset)
    len_filtered = len(test_dataset)
    if len_orig > len_filtered:
        log.logger.warning(
            'Not all time-series in the test-channel have '
            'enough data to be used for evaluation. Proceeding with '
            f'{len_filtered}/{len_orig} '
            f'(~{int(len_filtered / len_orig * 100)}%) items.'
        )
    return test_dataset
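
# A minimal sketch of the filtering predicate used above, applied by hand to plain
# dicts with numpy targets; the toy data and helper name are hypothetical, only
# the condition itself mirrors the FilterTransformation.
import numpy as np

def keep_entry(entry: dict, prediction_length: int) -> bool:
    # same condition as the lambda passed to FilterTransformation above
    return entry["target"].shape[-1] > prediction_length

toy_dataset = [
    {"target": np.arange(30.0)},  # long enough for prediction_length=24, kept
    {"target": np.arange(10.0)},  # too short, dropped
]
kept = [entry for entry in toy_dataset if keep_entry(entry, prediction_length=24)]
assert len(kept) == 1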

def create_transformation(self) -> Transformation:
    # the condition always holds, so this filter is an identity transformation
    # that passes every entry through unchanged
    return FilterTransformation(lambda x: True)

def run_test(
    env: TrainEnv,
    predictor: Predictor,
    test_dataset: Dataset,
    hyperparameters: dict,
) -> None:
    len_original = maybe_len(test_dataset)

    test_dataset = FilterTransformation(
        lambda x: x["target"].shape[-1] > predictor.prediction_length
    ).apply(test_dataset)

    len_filtered = len(test_dataset)

    if len_original is not None and len_original > len_filtered:
        logger.warning(
            f"Not all time-series in the test-channel have "
            f"enough data to be used for evaluation. Proceeding with "
            f"{len_filtered}/{len_original} "
            f"(~{int(len_filtered / len_original * 100)}%) items."
        )

    forecast_it, ts_it = backtest.make_evaluation_predictions(
        dataset=test_dataset, predictor=predictor, num_samples=100
    )

    test_quantiles = (
        [
            Quantile.parse(quantile).name
            for quantile in hyperparameters["test_quantiles"]
        ]
        if "test_quantiles" in hyperparameters
        else None
    )

    forecast_generator = getattr(predictor, "forecast_generator", None)
    if isinstance(forecast_generator, QuantileForecastGenerator):
        predictor_quantiles = forecast_generator.quantiles
        if test_quantiles is None:
            test_quantiles = predictor_quantiles
        elif not set(test_quantiles).issubset(predictor_quantiles):
            logger.warning(
                f"Some of the evaluation quantiles `{test_quantiles}` are "
                f"not in the computed quantile forecasts `{predictor_quantiles}`."
            )
            test_quantiles = predictor_quantiles

    if test_quantiles is not None:
        logger.info(f"Using quantiles `{test_quantiles}` for evaluation.")
        evaluator = Evaluator(quantiles=test_quantiles)
    else:
        evaluator = Evaluator()

    agg_metrics, item_metrics = evaluator(
        ts_iterator=ts_it,
        fcst_iterator=forecast_it,
        num_series=len(test_dataset),
    )

    # we only log aggregate metrics for now as item metrics may be very large
    for name, score in agg_metrics.items():
        logger.info(f"#test_score ({env.current_host}, {name}): {score}")

    # store metrics
    with open(env.path.model / "agg_metrics.json", "w") as agg_metric_file:
        json.dump(agg_metrics, agg_metric_file)
    with open(env.path.model / "item_metrics.csv", "w") as item_metrics_file:
        item_metrics.to_csv(item_metrics_file, index=False)
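
# A minimal sketch of the quantile-reconciliation rule above, restated with plain
# lists; the function name is hypothetical, only the fall-back behaviour mirrors
# run_test.
def reconcile_quantiles(requested, available):
    """Use the requested quantiles only if the predictor actually computed them."""
    if requested is None:
        return available
    if not set(requested).issubset(available):
        # as above: warn and fall back to the quantiles the predictor provides
        return available
    return requested

assert reconcile_quantiles(None, ["0.1", "0.5", "0.9"]) == ["0.1", "0.5", "0.9"]
assert reconcile_quantiles(["0.1", "0.9"], ["0.1", "0.5", "0.9"]) == ["0.1", "0.9"]
assert reconcile_quantiles(["0.2"], ["0.1", "0.5", "0.9"]) == ["0.1", "0.5", "0.9"]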