def train_loader(
    dataset: ListDataset,
    prediction_interval_length: float,
    context_interval_length: float,
    is_train: bool = True,
    override_args: Optional[dict] = None,
) -> Iterable[DataBatch]:
    """Build a data loader over ``dataset`` for training or inference.

    Parameters
    ----------
    dataset
        The dataset to iterate over.
    prediction_interval_length
        Length of the (continuous-time) prediction interval.
    context_interval_length
        Length of the (continuous-time) context interval.
    is_train
        If True, return a (sliced) ``TrainDataLoader`` with a uniform
        instance sampler; otherwise return an ``InferenceDataLoader``
        with a prediction sampler.
    override_args
        Optional keyword overrides merged into the loader constructor
        arguments (e.g. to change ``batch_size``).

    Returns
    -------
    Iterable[DataBatch]
        Batches of variable-length, float32-stacked instances.
    """
    # NOTE: annotation fixed from `dict = None` to `Optional[dict]`;
    # the default behavior (empty overrides) is unchanged.
    if override_args is None:
        override_args = {}

    if is_train:
        sampler = ContinuousTimeUniformSampler(
            num_instances=10,
            min_past=context_interval_length,
            min_future=prediction_interval_length,
        )
    else:
        sampler = ContinuousTimePredictionSampler(
            min_past=context_interval_length
        )

    splitter = ContinuousTimeInstanceSplitter(
        future_interval_length=prediction_interval_length,
        past_interval_length=context_interval_length,
        instance_sampler=sampler,
        freq=dataset.freq,
    )

    kwargs = dict(
        dataset=dataset,
        transform=splitter,
        batch_size=10,
        stack_fn=partial(batchify, dtype=np.float32, variable_length=True),
    )
    kwargs.update(override_args)

    if is_train:
        # Limit the (otherwise infinite) training stream to NUM_BATCHES.
        return itertools.islice(
            TrainDataLoader(num_workers=None, **kwargs), NUM_BATCHES
        )
    return InferenceDataLoader(**kwargs)
def predict(self, dataset: Dataset, num_samples: Optional[int] = None) -> Iterator[Forecast]:
    """Yield one ``Forecast`` per entry of ``dataset``.

    Batches are produced by an ``InferenceDataLoader`` over the
    predictor's input transform, and forecasts are generated lazily by
    ``self.forecast_generator``.
    """
    data_loader = InferenceDataLoader(
        dataset,
        transform=self.input_transform,
        batch_size=self.batch_size,
        stack_fn=lambda data: batchify(data, self.device),
    )
    # Inference mode: freeze dropout/batch-norm and disable autograd.
    self.prediction_net.eval()
    with torch.no_grad():
        yield from self.forecast_generator(
            inference_data_loader=data_loader,
            prediction_net=self.prediction_net,
            input_names=self.input_names,
            freq=self.freq,
            output_transform=self.output_transform,
            num_samples=num_samples,
        )
def test_inference_data_loader(dataset_context):
    """Check that ``InferenceDataLoader`` yields every dataset entry
    exactly once, and always in inference (non-training) mode.

    Fixes: removed two dead assignments from the original —
    ``dataset_length`` was never read, and the ``defaultdict`` counter
    was immediately overwritten by ``count_item_ids(batches)``.
    """
    with dataset_context as dataset:
        dl = InferenceDataLoader(
            dataset=dataset,
            transform=default_transformation(),
            batch_size=4,
            stack_fn=partial(batchify, ctx=current_context()),
        )
        batches = list(dl)

        # Every instance in every batch must be flagged as inference data.
        for batch in batches:
            assert all(x is False for x in batch["is_train"])

        # Each item id must appear exactly once across all batches.
        counter = count_item_ids(batches)
        for entry in dataset:
            assert counter[entry[FieldName.ITEM_ID]] == 1
def predict(
    self,
    dataset: Dataset,
    num_samples: Optional[int] = None,
    num_workers: Optional[int] = None,
    num_prefetch: Optional[int] = None,
    **kwargs,
) -> Iterator[Forecast]:
    """Yield one ``Forecast`` per entry of ``dataset``.

    NOTE(review): ``num_workers``, ``num_prefetch`` and ``**kwargs`` are
    accepted but not forwarded to the loader here — presumably kept for
    interface compatibility; confirm against callers before relying on
    them.
    """
    data_loader = InferenceDataLoader(
        dataset,
        transform=self.input_transform,
        batch_size=self.batch_size,
        stack_fn=partial(batchify, ctx=self.ctx, dtype=self.dtype),
    )
    # Run the network under the predictor's MXNet context.
    with mx.Context(self.ctx):
        yield from self.forecast_generator(
            inference_data_loader=data_loader,
            prediction_net=self.prediction_net,
            input_names=self.input_names,
            freq=self.freq,
            output_transform=self.output_transform,
            num_samples=num_samples,
        )
def predict(self, dataset: Dataset, **kwargs) -> Iterator[Forecast]:
    """Yield one forecast object per time series in ``dataset``.

    Runs the prediction network batch by batch, optionally applies the
    configured output transform, and wraps each output row in
    ``self._forecast_cls``.
    """
    loader = InferenceDataLoader(
        dataset,
        self.input_transform,
        self.batch_size,
        ctx=self.ctx,
        float_type=self.float_type,
    )
    for batch in loader:
        net_inputs = [batch[name] for name in self.input_names]
        outputs = self.prediction_net(*net_inputs).asnumpy()
        if self.output_transform is not None:
            outputs = self.output_transform(batch, outputs)
        # One output row is expected per forecast start in the batch.
        assert len(batch['forecast_start']) == len(outputs)
        for idx, output in enumerate(outputs):
            yield self._forecast_cls(
                output,
                start_date=batch['forecast_start'][idx],
                freq=self.freq,
                item_id=batch['item_id'][idx] if 'item_id' in batch else None,
                info=batch['info'][idx] if 'info' in batch else None,
                **self.forecast_kwargs,
            )
def backtest_metrics(
    train_dataset: Optional[Dataset],
    test_dataset: Dataset,
    forecaster: Union[Estimator, Predictor],
    evaluator=Evaluator(
        quantiles=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    ),
    num_eval_samples: int = 100,
    logging_file: Optional[str] = None,
    use_symbol_block_predictor: bool = False,
):
    """
    Parameters
    ----------
    train_dataset
        Dataset to use for training. Required (non-None) when
        `forecaster` is an `Estimator`.
    test_dataset
        Dataset to use for testing.
    forecaster
        An estimator or a predictor to use for generating predictions.
    evaluator
        Evaluator to use.
    num_eval_samples
        Number of samples to use when generating sample-based forecasts.
    logging_file
        If specified, information of the backtest is redirected to this file.
    use_symbol_block_predictor
        Use a :class:`SymbolBlockPredictor` during testing.

    Returns
    -------
    tuple
        A tuple of aggregate metrics and per-time-series metrics obtained
        by training `forecaster` on `train_dataset` and evaluating the
        resulting `evaluator` provided on the `test_dataset`.
    """
    if logging_file is not None:
        log_formatter = logging.Formatter(
            "[%(asctime)s %(levelname)s %(thread)d] %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
        )
        logger = logging.getLogger(__name__)
        handler = logging.FileHandler(logging_file)
        handler.setFormatter(log_formatter)
        logger.addHandler(handler)
    else:
        logger = logging.getLogger(__name__)

    if train_dataset is not None:
        train_statistics = calculate_dataset_statistics(train_dataset)
        serialize_message(logger, train_dataset_stats_key, train_statistics)

    test_statistics = calculate_dataset_statistics(test_dataset)
    serialize_message(logger, test_dataset_stats_key, test_statistics)

    if isinstance(forecaster, Estimator):
        serialize_message(logger, estimator_key, forecaster)
        # Fix: training an estimator requires a training set; fail loudly
        # here instead of deep inside `train` (matches the newer variant
        # of this function).
        assert train_dataset is not None
        predictor = forecaster.train(train_dataset)

        if isinstance(forecaster, GluonEstimator) and isinstance(
            predictor, GluonPredictor
        ):
            inference_data_loader = InferenceDataLoader(
                dataset=test_dataset,
                transform=predictor.input_transform,
                batch_size=forecaster.trainer.batch_size,
                ctx=forecaster.trainer.ctx,
                float_type=forecaster.float_type,
            )

            if forecaster.trainer.hybridize:
                predictor.hybridize(batch=next(iter(inference_data_loader)))

            if use_symbol_block_predictor:
                predictor = predictor.as_symbol_block_predictor(
                    batch=next(iter(inference_data_loader))
                )
    else:
        predictor = forecaster

    forecast_it, ts_it = make_evaluation_predictions(
        test_dataset, predictor=predictor, num_eval_samples=num_eval_samples
    )

    agg_metrics, item_metrics = evaluator(
        ts_it, forecast_it, num_series=len(test_dataset)
    )

    # we only log aggregate metrics for now as item metrics may be very large
    for name, value in agg_metrics.items():
        serialize_message(logger, f"metric-{name}", value)

    if logging_file is not None:
        # Close the file handler to avoid letting the file open.
        # https://stackoverflow.com/questions/24816456/python-logging-wont-shutdown
        logger.removeHandler(handler)
        del logger, handler

    return agg_metrics, item_metrics
def backtest_metrics(
    train_dataset: Optional[Dataset],
    test_dataset: Dataset,
    forecaster: Union[Estimator, Predictor],
    evaluator=Evaluator(
        quantiles=(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9)
    ),
    num_samples: int = 100,
    logging_file: Optional[str] = None,
    use_symbol_block_predictor: bool = False,
    num_workers: Optional[int] = None,
    num_prefetch: Optional[int] = None,
    **kwargs,
):
    """
    Parameters
    ----------
    train_dataset
        Dataset to use for training. Required (non-None) when
        `forecaster` is an `Estimator`.
    test_dataset
        Dataset to use for testing.
    forecaster
        An estimator or a predictor to use for generating predictions.
    evaluator
        Evaluator to use.
    num_samples
        Number of samples to use when generating sample-based forecasts.
    logging_file
        If specified, information of the backtest is redirected to this file.
    use_symbol_block_predictor
        Use a :class:`SymbolBlockPredictor` during testing.
    num_workers
        The number of multiprocessing workers to use for data preprocessing.
        By default None, in which case the loader's default applies and no
        multiprocessing will be utilized.
    num_prefetch
        The number of prefetching batches only works if `num_workers` > 0.
        If `prefetch` > 0, it allow worker process to prefetch certain batches
        before acquiring data from iterators.
        Note that using large prefetching batch will provide smoother
        bootstrapping performance, but will consume more shared_memory.
        Using smaller number may forfeit the purpose of using multiple
        worker processes, try reduce `num_workers` in this case.
        By default it defaults to `num_workers * 2`.

    Returns
    -------
    tuple
        A tuple of aggregate metrics and per-time-series metrics obtained
        by training `forecaster` on `train_dataset` and evaluating the
        resulting `evaluator` provided on the `test_dataset`.
    """
    # Fix: annotation corrected from `Optional[bool] = False` to
    # `bool = False` — None was never a meaningful value here.
    if logging_file is not None:
        log_formatter = logging.Formatter(
            "[%(asctime)s %(levelname)s %(thread)d] %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
        )
        logger = logging.getLogger(__name__)
        handler = logging.FileHandler(logging_file)
        handler.setFormatter(log_formatter)
        logger.addHandler(handler)
    else:
        logger = logging.getLogger(__name__)

    if train_dataset is not None:
        train_statistics = calculate_dataset_statistics(train_dataset)
        serialize_message(logger, train_dataset_stats_key, train_statistics)

    test_statistics = calculate_dataset_statistics(test_dataset)
    serialize_message(logger, test_dataset_stats_key, test_statistics)

    if isinstance(forecaster, Estimator):
        serialize_message(logger, estimator_key, forecaster)
        # Training an estimator requires a training dataset.
        assert train_dataset is not None
        predictor = forecaster.train(train_dataset)

        if isinstance(forecaster, GluonEstimator) and isinstance(
            predictor, GluonPredictor
        ):
            inference_data_loader = InferenceDataLoader(
                dataset=test_dataset,
                transform=predictor.input_transform,
                batch_size=forecaster.trainer.batch_size,
                ctx=forecaster.trainer.ctx,
                dtype=forecaster.dtype,
                num_workers=num_workers,
                num_prefetch=num_prefetch,
                **kwargs,
            )

            if forecaster.trainer.hybridize:
                predictor.hybridize(batch=next(iter(inference_data_loader)))

            if use_symbol_block_predictor:
                predictor = predictor.as_symbol_block_predictor(
                    batch=next(iter(inference_data_loader))
                )
    else:
        predictor = forecaster

    forecast_it, ts_it = make_evaluation_predictions(
        test_dataset, predictor=predictor, num_samples=num_samples
    )

    agg_metrics, item_metrics = evaluator(
        ts_it, forecast_it, num_series=maybe_len(test_dataset)
    )

    # we only log aggregate metrics for now as item metrics may be very large
    for name, value in agg_metrics.items():
        serialize_message(logger, f"metric-{name}", value)

    if logging_file is not None:
        # Close the file handler to avoid letting the file open.
        # https://stackoverflow.com/questions/24816456/python-logging-wont-shutdown
        logger.removeHandler(handler)
        del logger, handler

    return agg_metrics, item_metrics
def create_loaders(
    dataset,
    batch_sizes,
    past_length,
    prediction_length_full,
    prediction_length_rolling,
    num_batches_per_epoch=50,
    num_workers=0,
    extract_tail_chunks_for_train: bool = False,
    val_full_length=True,
):
    """Create train/val/test_full/test_rolling loaders plus their transforms.

    The past_length and prediction_length is seriously unintuitive in
    gluonTS. Here is a little summary to make sure it is used correctly:

    - loader does NOT provide data[-past_length-prediction_length: -prediction_length].
      --> Train set may not include test range. prediction_length does not cut it out.
    - loader instead provides data[-past_length:] and adds prediction_length
      time_features.
      --> AFTER this loader, we must AGAIN MANUALLY cut out
      targets[-prediction_length:].

    I had as follows:
    train: past_length=past_length, prediction_length=0
    test: past_length=past_length+n_steps_forecast, prediction_length=0
    && cut out y[-prediction_length:] from batch, but use it for eval.
    Train does not need to forecast.
    Test gives the whole thing for features and we cut out the final part
    for targets.

    Now want to do as they do in the repo. for both do:
    past_length=past_length, prediction_length=prediction_length
    train: does not matter, prediction_length not used
    test: this does not make sense. It still gives you y[-past_length:] ...
    """
    # Validation context is extended by the full (or rolling) horizon so the
    # evaluation targets can be sliced off afterwards.
    val_past_length = past_length + (
        prediction_length_full if val_full_length else prediction_length_rolling
    )

    # Declarative per-split configuration replaces the original if/elif
    # ladder: name -> (is_train, prediction_length, past_length).
    split_config = {
        "train": (True, 0, past_length),
        "val": (False, 0, val_past_length),
        "test_full": (False, prediction_length_full, past_length),
        "test_rolling": (False, prediction_length_rolling, past_length),
    }

    input_transforms = {}
    for name, (is_train, prediction_length, past_length_) in split_config.items():
        input_transforms[name] = create_input_transform(
            is_train=is_train,
            prediction_length=prediction_length,
            past_length=past_length_,
            use_feat_static_cat=True,
            # bool(...) replaces `True if ... else False`.
            use_feat_dynamic_real=bool(dataset.metadata.feat_dynamic_real),
            freq=dataset.metadata.freq,
            time_features=None,
            extract_tail_chunks_for_train=extract_tail_chunks_for_train,
        )

    train_loader = TrainDataLoader(
        dataset=dataset.train,
        transform=input_transforms["train"],
        num_batches_per_epoch=num_batches_per_epoch,
        batch_size=batch_sizes["train"],
        num_workers=num_workers,
        ctx=None,
        dtype=np.float32,
    )
    val_loader = ValidationDataLoader(
        dataset=dataset.train,
        transform=input_transforms["val"],
        batch_size=batch_sizes["val"],
        num_workers=num_workers,
        ctx=None,
        dtype=np.float32,
    )
    test_full_loader = InferenceDataLoader(
        dataset=dataset.test,
        transform=input_transforms["test_full"],
        batch_size=batch_sizes["test_full"],
        num_workers=num_workers,
        ctx=None,
        dtype=np.float32,
    )
    test_rolling_loader = InferenceDataLoader(
        dataset=dataset.test,
        transform=input_transforms["test_rolling"],
        batch_size=batch_sizes["test_rolling"],
        num_workers=num_workers,
        ctx=None,
        dtype=np.float32,
    )
    return (
        train_loader,
        val_loader,
        test_full_loader,
        test_rolling_loader,
        input_transforms,
    )