def _forecast(self):
    """make forecasts for all series contained in data"""
    all_forecasts = []
    with tqdm(
        self.predictor.predict(self.data),
        total=len(self.data),
        desc="Making Predictions",
    ) as forecasts, np.errstate(invalid="ignore"):
        for forecast in forecasts:
            if self.mean:
                point_estimate = np.mean(forecast.samples, axis=0)
            else:
                point_estimate = np.median(forecast.samples, axis=0)
            all_forecasts.append(point_estimate)

    all_forecasts = np.array(all_forecasts)

    if self.interpretable and len(self.data) > 0:
        trends = []
        for predictor in self.predictor.predictors:
            trends.append(predictor.prediction_net.get_trend_forecast())
            predictor.prediction_net.clear_trend_forecast()
        trends = np.stack(trends)
        trends = np.mean(trends, axis=0)
        trends = np.expand_dims(trends, axis=1)
        seasonalities = all_forecasts - trends
        all_forecasts = np.concatenate(
            (all_forecasts, trends, seasonalities), axis=1
        )

    return all_forecasts  # Batch/Series, Components, Prediction Length
def save_datasets(
    path: Path,
    data: List[Dict],
    train_offset: int,
    default_start_timestamp: Optional[str] = None,
):
    train = path / "train"
    test = path / "test"

    train.mkdir(exist_ok=True)
    test.mkdir(exist_ok=True)

    with open(train / "data.json", "w") as train_fp, open(
        test / "data.json", "w"
    ) as test_fp:
        for i, data_entry in tqdm(
            enumerate(data), total=len(data), desc="creating json files"
        ):
            # Convert the data to a GluonTS dataset...
            # - `default_start_timestamp` is required for some datasets which
            #   are not listed here since some datasets do not define start
            #   timestamps
            # - `item_id` is added for all datasets ... many datasets provide
            #   the "series_name"
            dic = to_dict(
                target_values=data_entry["target"],
                start=str(
                    data_entry.get("start_timestamp", default_start_timestamp)
                ),
                item_id=data_entry.get("series_name", i),
            )

            jsonl.dump([dic], test_fp)

            dic["target"] = dic["target"][:-train_offset]
            jsonl.dump([dic], train_fp)
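# A minimal usage sketch (not from the original source), assuming `to_dict` and `jsonl`
# behave as used above and that each raw record carries a "target" array plus optional
# "start_timestamp"/"series_name" fields.
raw_data = [
    {"target": list(range(30)), "start_timestamp": "2020-01-01", "series_name": "A"},
    {"target": list(range(50))},  # falls back to default_start_timestamp and index id
]
dataset_root = Path("datasets/example")
dataset_root.mkdir(parents=True, exist_ok=True)
save_datasets(
    path=dataset_root,
    data=raw_data,
    train_offset=7,  # hold out the last 7 observations from the training split
    default_start_timestamp="2020-01-01",
)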
def __call__(
    self,
    ts_iterator: Iterable[Union[pd.DataFrame, pd.Series]],
    fcst_iterator: Iterable[Forecast],
    num_series: Optional[int] = None,
) -> Tuple[Dict[str, float], pd.DataFrame]:
    """
    Compute accuracy metrics by comparing actual data to the forecasts.

    Parameters
    ----------
    ts_iterator
        iterator containing true target on the predicted range
    fcst_iterator
        iterator of forecasts on the predicted range
    num_series
        number of series of the iterator
        (optional, only used for displaying progress)

    Returns
    -------
    dict
        Dictionary of aggregated metrics
    pd.DataFrame
        DataFrame containing per-time-series metrics
    """
    ts_iterator = iter(ts_iterator)
    fcst_iterator = iter(fcst_iterator)

    rows = []

    with tqdm(
        zip(ts_iterator, fcst_iterator),
        total=num_series,
        desc="Running evaluation",
    ) as it, np.errstate(invalid="ignore"):
        for ts, forecast in it:
            rows.append(self.get_metrics_per_ts(ts, forecast))

    assert not any(
        True for _ in ts_iterator
    ), "ts_iterator has more elements than fcst_iterator"

    assert not any(
        True for _ in fcst_iterator
    ), "fcst_iterator has more elements than ts_iterator"

    if num_series is not None:
        assert (
            len(rows) == num_series
        ), f"num_series={num_series} did not match number of elements={len(rows)}"

    # If all entries of a target array are NaNs, the resulting metric will have
    # value "masked". Pandas does not handle masked values correctly. Thus we
    # set dtype=np.float64 to convert masked values back to NaNs which are
    # handled correctly by pandas DataFrames during aggregation.
    metrics_per_ts = pd.DataFrame(rows, dtype=np.float64)
    return self.get_aggregate_metrics(metrics_per_ts)
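# A hedged usage sketch (not from the original source): the two iterators consumed by
# __call__ above are typically produced by GluonTS' make_evaluation_predictions helper.
# `predictor` and `dataset` are placeholders; the num_samples parameter name has varied
# across GluonTS versions.
from gluonts.evaluation import Evaluator
from gluonts.evaluation.backtest import make_evaluation_predictions

forecast_it, ts_it = make_evaluation_predictions(
    dataset=dataset.test, predictor=predictor, num_samples=100
)
evaluator = Evaluator(quantiles=[0.1, 0.5, 0.9])
agg_metrics, item_metrics = evaluator(ts_it, forecast_it, num_series=len(dataset.test))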
def train(self):
    # Materialize one pass over the training data loader to obtain `inputs`
    # (after the loop, `inputs` holds the tensors of the last batch).
    with tqdm(self.training_data_loader) as it:
        for batch_no, data_entry in enumerate(it, start=1):
            inputs = [data_entry[k] for k in input_names]

    @ag.args()
    def train_finetune(args, reporter):
        estimator = SimpleFeedForwardEstimator(
            num_hidden_dimensions=[10],
            prediction_length=dataset.metadata.prediction_length,
            context_length=100,
            freq=dataset.metadata.freq,
            trainer=Trainer(
                ctx="cpu",
                epochs=5,
                learning_rate=args.learning_rate,
                num_batches_per_epoch=100,
            ),
        )
        net = estimator.create_training_network()
        net.initialize(ctx=None, init="xavier")
        lr_scheduler = lrs.MetricAttentiveScheduler(
            objective="min",
            patience=estimator.trainer.patience,
            decay_factor=estimator.trainer.learning_rate_decay_factor,
            min_lr=estimator.trainer.minimum_learning_rate,
        )
        optimizer = mx.optimizer.Adam(
            learning_rate=estimator.trainer.learning_rate,
            lr_scheduler=lr_scheduler,
            wd=estimator.trainer.weight_decay,
            clip_gradient=estimator.trainer.clip_gradient,
        )
        trainer = mx.gluon.Trainer(
            net.collect_params(),
            optimizer=optimizer,
            kvstore="device",  # FIXME: initialize properly
        )
        for epoch in range(args.epochs):
            mse = newloop(epoch, net, trainer, inputs)
            print("MSE:", mse)
            reporter(epoch=epoch + 1, accuracy=-mse)

    train_finetune.register_args(**self.dictionary_of_hyperparameters)
    self.scheduler = ag.scheduler.FIFOScheduler(
        train_finetune,
        resource={"num_cpus": 4, "num_gpus": 0},
        num_trials=5,
        time_attr="epoch",
        reward_attr="accuracy",
    )
    self.scheduler.run()
    self.scheduler.join_jobs()
def download(self, path: Path):
    file_path = path / self.file_name
    with tqdm(
        [],
        unit="B",
        unit_scale=True,
        unit_divisor=1024,
        miniters=5,
        desc=f"Download {self.file_name}:",
    ) as _tqdm:
        request.urlretrieve(
            self.url,
            filename=file_path,
            reporthook=request_retrieve_hook(_tqdm),
        )
    return file_path
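# A hedged sketch (not from the original source) of what `request_retrieve_hook` could
# look like: urllib's urlretrieve calls the hook as (block_count, block_size,
# total_size), which is translated here into tqdm progress updates.
def request_retrieve_hook(pbar):
    def update(count, block_size, total_size):
        if total_size > 0:
            pbar.total = total_size
        # advance by the number of bytes received since the last call
        pbar.update(count * block_size - pbar.n)

    return update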
def create_data(self, dataset):
    input_names = ["past_target", "future_target"]

    training_data_loader = TrainDataLoader(
        dataset=dataset.train,
        transform=transformation,
        batch_size=trainer.batch_size,
        num_batches_per_epoch=trainer.num_batches_per_epoch,
        ctx=trainer.ctx,
        dtype=dtype,
        num_workers=num_workers,
        num_prefetch=num_prefetch,
    )

    # Iterate once over the loader; `inputs` ends up holding the last batch.
    with tqdm(training_data_loader) as it:
        for batch_no, data_entry in enumerate(it, start=1):
            inputs = [data_entry[k] for k in input_names]
def _forecast(self):
    """make forecasts for all series contained in data"""
    all_forecasts = []
    with tqdm(
        self.predictor.predict(self.data, num_samples=self.num_samples),
        total=len(self.data),
        desc="Making Predictions",
    ) as forecasts, np.errstate(invalid="ignore"):
        for forecast in forecasts:
            point_estimate = (
                forecast.mean if self.mean else forecast.quantile(0.5)
            )
            quantiles = np.vstack(
                [point_estimate]
                + [forecast.quantile(q) for q in self.quantiles]
            )
            all_forecasts.append(quantiles)
    return np.array(all_forecasts)  # Batch/Series, Quantiles, Prediction Length
def check_dataset(dataset_path: Path, length: int, sheet_name):
    # check that things are correct
    from gluonts.dataset.common import load_datasets

    ds = load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )
    assert ds.test is not None
    assert len(list(ds.train)) == length
    assert len(list(ds.test)) == length
    assert ds.metadata.prediction_length is not None

    for ts_train, ts_test in tqdm(
        zip(ds.train, ds.test), total=length, desc="checking consistency"
    ):
        train_target = ts_train["target"]
        test_target = ts_test["target"]
        assert (
            len(train_target)
            == len(test_target) - ds.metadata.prediction_length
        )
        assert np.all(train_target == test_target[: len(train_target)])
        assert ts_train["start"] == ts_test["start"]
        start = ts_train["start"]
        regex = r"^(\d{4})-(\d{2})-(\d{2})( 00:00(:00)?)?$"
        m = re.match(regex, str(start))
        assert m
        month, day = m.group(2), m.group(3)
        if sheet_name in ["M3Quart", "Other"]:
            assert f"{month}-{day}" in [
                "03-31",
                "06-30",
                "09-30",
                "12-31",
            ], f"Invalid time stamp `{month}-{day}`"
        elif sheet_name == "M3Year":
            assert (
                f"{month}-{day}" == "12-31"
            ), f"Invalid time stamp {month}-{day}"
def test_mixture_inference() -> None:
    mdo = MixtureDistributionOutput([GaussianOutput(), GaussianOutput()])

    args_proj = mdo.get_args_proj()
    args_proj.initialize()
    args_proj.hybridize()

    input = mx.nd.ones((BATCH_SIZE, 1))

    distr_args = args_proj(input)
    d = mdo.distribution(distr_args)

    # plot_samples(d.sample())

    trainer = mx.gluon.Trainer(
        args_proj.collect_params(), "sgd", {"learning_rate": 0.02}
    )

    mixture_samples = mx.nd.array(np_samples)

    N = 1000
    t = tqdm(list(range(N)))
    for i in t:
        with mx.autograd.record():
            distr_args = args_proj(input)
            d = mdo.distribution(distr_args)
            loss = d.loss(mixture_samples)
        loss.backward()
        loss_value = loss.mean().asnumpy()
        t.set_postfix({"loss": loss_value})
        trainer.step(BATCH_SIZE)

    distr_args = args_proj(input)
    d = mdo.distribution(distr_args)

    obtained_hist = histogram(d.sample().asnumpy())

    # uncomment to see histograms
    # pl.plot(obtained_hist)
    # pl.plot(EXPECTED_HIST)
    # pl.show()
    assert diff(obtained_hist, EXPECTED_HIST) < 0.5
def test_nanmixture_gaussian_inference() -> None:
    nmdo = NanMixtureOutput(GaussianOutput())

    args_proj = nmdo.get_args_proj()
    args_proj.initialize()
    args_proj.hybridize()

    input = mx.nd.ones((NUM_SAMPLES))

    trainer = mx.gluon.Trainer(
        args_proj.collect_params(), "sgd", {"learning_rate": 0.00001}
    )

    mixture_samples = mx.nd.array(np_samples)

    N = 1000
    t = tqdm(list(range(N)))
    for _ in t:
        with mx.autograd.record():
            distr_args = args_proj(input)
            d = nmdo.distribution(distr_args)
            loss = d.loss(mixture_samples)
        loss.backward()
        loss_value = loss.mean().asnumpy()
        t.set_postfix({"loss": loss_value})
        trainer.step(NUM_SAMPLES)

    mu_hat = d.distribution.mu.asnumpy()
    sigma_hat = d.distribution.sigma.asnumpy()
    nan_prob_hat = d.nan_prob.asnumpy()

    assert (
        np.abs(mu - mu_hat) < TOL
    ), f"mu did not match: mu = {mu}, mu_hat = {mu_hat}"
    assert (
        np.abs(sigma - sigma_hat) < TOL
    ), f"sigma did not match: sigma = {sigma}, sigma_hat = {sigma_hat}"
    assert (
        np.abs(nan_prob - nan_prob_hat) < TOL
    ), f"nan_prob did not match: nan_prob = {nan_prob}, nan_prob_hat = {nan_prob_hat}"
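# A hedged sketch (not from the original test file) of how the fixtures used above
# (mu, sigma, nan_prob, np_samples, NUM_SAMPLES, TOL) could be built: Gaussian samples
# in which a fraction `nan_prob` of the entries is replaced by NaN.
mu, sigma, nan_prob = 1.0, 0.5, 0.2
NUM_SAMPLES, TOL = 5000, 0.1
samples = np.random.normal(mu, sigma, size=NUM_SAMPLES)
nan_mask = np.random.uniform(size=NUM_SAMPLES) < nan_prob
np_samples = np.where(nan_mask, np.nan, samples)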
def save_datasets(path: Path, data: List[Dict], train_offset: int):
    train = path / "train"
    test = path / "test"

    train.mkdir(exist_ok=True)
    test.mkdir(exist_ok=True)

    with open(train / "data.json", "w") as train_fp, open(
        test / "data.json", "w"
    ) as test_fp:
        for data_entry in tqdm(
            data, total=len(data), desc="creating json files"
        ):
            dic = to_dict(
                target_values=data_entry["target"],
                start=str(data_entry["start_timestamp"]),
            )

            jsonl.dump([dic], test_fp)

            dic["target"] = dic["target"][:-train_offset]
            jsonl.dump([dic], train_fp)
def __call__(
    self,
    ts_iterator: Iterable[Union[pd.DataFrame, pd.Series]],
    fcst_iterator: Iterable[Forecast],
    num_series: Optional[int] = None,
) -> List[Dict[str, float]]:
    """
    Compute accuracy metrics by comparing actual data to the forecasts.

    Parameters
    ----------
    ts_iterator
        iterator containing true target on the predicted range
    fcst_iterator
        iterator of forecasts on the predicted range
    num_series
        number of series of the iterator
        (optional, only used for displaying progress)

    Returns
    -------
    list
        List of per-time-series metric dictionaries, one entry per
        (target, forecast) pair.
    """
    ts_iterator = iter(ts_iterator)
    fcst_iterator = iter(fcst_iterator)

    rows = []

    with tqdm(
        zip(ts_iterator, fcst_iterator),
        total=num_series,
        desc="Running evaluation",
    ) as it, np.errstate(invalid="ignore"):
        for ts, forecast in it:
            rows.append(self.get_metrics_per_ts(ts, forecast))

    return rows
def test_nanmixture_categorical_inference() -> None:
    nmdo = NanMixtureOutput(CategoricalOutput(3))

    args_proj = nmdo.get_args_proj()
    args_proj.initialize()
    args_proj.hybridize()

    input = mx.nd.ones((NUM_SAMPLES))

    trainer = mx.gluon.Trainer(
        args_proj.collect_params(), "sgd", {"learning_rate": 0.000002}
    )

    mixture_samples = mx.nd.array(cat_samples)

    N = 3000
    t = tqdm(list(range(N)))
    for _ in t:
        with mx.autograd.record():
            distr_args = args_proj(input)
            d = nmdo.distribution(distr_args)
            loss = d.loss(mixture_samples)
        loss.backward()
        loss_value = loss.mean().asnumpy()
        t.set_postfix({"loss": loss_value})
        trainer.step(NUM_SAMPLES)

    distr_args = args_proj(input)
    d = nmdo.distribution(distr_args)

    cat_probs_hat = d.distribution.probs.asnumpy()
    nan_prob_hat = d.nan_prob.asnumpy()

    assert np.allclose(
        cat_probs, cat_probs_hat, atol=TOL
    ), f"categorical dist: cat_probs did not match: cat_probs = {cat_probs}, cat_probs_hat = {cat_probs_hat}"
    assert (
        np.abs(nan_prob - nan_prob_hat) < TOL
    ), f"categorical dist: nan_prob did not match: nan_prob = {nan_prob}, nan_prob_hat = {nan_prob_hat}"
def check_dataset(dataset_path: Path, length: int):
    # check that things are correct
    from gluonts.dataset.common import load_datasets

    ds = load_datasets(
        metadata=dataset_path,
        train=dataset_path / "train",
        test=dataset_path / "test",
    )
    assert ds.test is not None
    assert len(list(ds.train)) == length
    assert len(list(ds.test)) == length
    assert ds.metadata.prediction_length is not None

    for ts_train, ts_test in tqdm(
        zip(ds.train, ds.test), total=length, desc="checking consistency"
    ):
        train_target = ts_train["target"]
        test_target = ts_test["target"]
        assert (
            len(train_target)
            == len(test_target) - ds.metadata.prediction_length
        )
        assert np.all(train_target == test_target[: len(train_target)])
def __call__( self, net: nn.HybridBlock, input_names: List[str], train_iter: TrainDataLoader, ) -> None: # TODO: we may want to return some training information here self.halt = False with tempfile.TemporaryDirectory( prefix="gluonts-trainer-temp-") as gluonts_temp: def base_path() -> str: return os.path.join( gluonts_temp, "{}_{}".format(STATE_ARTIFACT_FILE_NAME, uuid.uuid4()), ) logging.info("Start model training") net.initialize(ctx=self.ctx, init=self.init) with HybridContext( net=net, hybridize=self.hybridize, static_alloc=True, static_shape=True, ): batch_size = train_iter.batch_size epoch_loss = mx.metric.Loss() best_epoch_info = BestEpochInfo( params_path="%s-%s.params" % (base_path(), "init"), epoch_no=-1, metric_value=np.Inf, ) lr_scheduler = lrs.MetricAttentiveScheduler( objective="min", patience=self.patience, decay_factor=self.learning_rate_decay_factor, min_lr=self.minimum_learning_rate, ) optimizer = mx.optimizer.Adam( learning_rate=self.learning_rate, lr_scheduler=lr_scheduler, wd=self.weight_decay, clip_gradient=self.clip_gradient, ) trainer = mx.gluon.Trainer( net.collect_params(), optimizer=optimizer, kvstore="device", # FIXME: initialize properly ) for epoch_no in range(self.epochs): if self.halt: logging.info( f"Epoch[{epoch_no}] Interrupting training") break curr_lr = trainer.learning_rate logging.info( f"Epoch[{epoch_no}] Learning rate is {curr_lr}") # mark epoch start time tic = time.time() epoch_loss.reset() with tqdm(train_iter) as it: for batch_no, data_entry in enumerate(it, start=1): if self.halt: break inputs = [data_entry[k] for k in input_names] with mx.autograd.record(): output = net(*inputs) # network can returns several outputs, the first being always the loss # when having multiple outputs, the forward returns a list in the case of hybrid and a # tuple otherwise # we may wrap network outputs in the future to avoid this type check if isinstance(output, (list, tuple)): loss = output[0] else: loss = output loss.backward() trainer.step(batch_size) epoch_loss.update(None, preds=loss) it.set_postfix( ordered_dict={ "avg_epoch_loss": loss_value(epoch_loss) }, refresh=False, ) # print out parameters of the network at the first pass if batch_no == 1 and epoch_no == 0: net_name = type(net).__name__ num_model_param = self.count_model_params(net) logging.info( f"Number of parameters in {net_name}: {num_model_param}" ) # mark epoch end time and log time cost of current epoch toc = time.time() logging.info( "Epoch[%d] Elapsed time %.3f seconds", epoch_no, (toc - tic), ) # check and log epoch loss check_loss_finite(loss_value(epoch_loss)) logging.info( "Epoch[%d] Evaluation metric '%s'=%f", epoch_no, "epoch_loss", loss_value(epoch_loss), ) lr_scheduler.step(loss_value(epoch_loss)) if loss_value(epoch_loss) < best_epoch_info.metric_value: best_epoch_info = BestEpochInfo( params_path="%s-%04d.params" % (base_path(), epoch_no), epoch_no=epoch_no, metric_value=loss_value(epoch_loss), ) net.save_parameters( best_epoch_info.params_path ) # TODO: handle possible exception if not trainer.learning_rate == curr_lr: logging.info(f"Loading parameters from best epoch " f"({best_epoch_info.epoch_no})") net.load_parameters(best_epoch_info.params_path, self.ctx) logging.info(f"Loading parameters from best epoch " f"({best_epoch_info.epoch_no})") net.load_parameters(best_epoch_info.params_path, self.ctx) logging.info(f"Final loss: {best_epoch_info.metric_value} " f"(occurred at epoch {best_epoch_info.epoch_no})") # save net parameters net.save_parameters(best_epoch_info.params_path) 
logging.getLogger().info("End model training")
def fit_mixture_distribution(  # function name assumed; the original `def` line is missing
    x: Tensor,
    mdo: MixtureDistributionOutput,
    variate_dimensionality: int = 1,
    epochs: int = 1_000,
):
    args_proj = mdo.get_args_proj()
    args_proj.initialize()
    args_proj.hybridize()

    input = mx.nd.ones((variate_dimensionality, 1))

    trainer = mx.gluon.Trainer(
        args_proj.collect_params(), "sgd", {"learning_rate": 0.02}
    )

    t = tqdm(list(range(epochs)))
    for _ in t:
        with mx.autograd.record():
            distr_args = args_proj(input)
            d = mdo.distribution(distr_args)
            loss = d.loss(x).mean()
        loss.backward()
        loss_value = loss.asnumpy()
        t.set_postfix({"loss": loss_value})
        trainer.step(1)

    distr_args = args_proj(input)
    d = mdo.distribution(distr_args)
    return d
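# A hedged usage sketch (not from the original source; the helper name above is an
# assumption): fit a two-component Gaussian mixture to synthetic univariate samples.
samples = mx.nd.array(
    np.concatenate(
        [np.random.normal(-2.0, 0.5, 1000), np.random.normal(3.0, 1.0, 1000)]
    ).reshape(1, -1)
)
mdo = MixtureDistributionOutput([GaussianOutput(), GaussianOutput()])
fitted = fit_mixture_distribution(samples, mdo, variate_dimensionality=1, epochs=500)
print(fitted)  # fitted MixtureDistribution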
def loop(epoch_no, batch_iter, is_training: bool = True) -> mx.metric.Loss: nonlocal first_forward tic = time.time() epoch_loss = mx.metric.Loss() # use averaged model for validation if not is_training and isinstance( self.avg_strategy, IterationAveragingStrategy): self.avg_strategy.load_averaged_model(net) with tqdm(batch_iter) as it: for batch_no, data_entry in enumerate(it, start=1): if self.halt: break inputs = [data_entry[k] for k in input_names] if first_forward: first_forward = False _ = net(*inputs) if self.post_initialize_cb: self.post_initialize_cb(net) with mx.autograd.record(): output = net(*inputs) # network can returns several outputs, the first being always the loss # when having multiple outputs, the forward returns a list in the case of hybrid and a # tuple otherwise # we may wrap network outputs in the future to avoid this type check if isinstance(output, (list, tuple)): loss = output[0] else: loss = output if is_training: loss.backward() trainer.step(batch_size) # iteration averaging in training if isinstance( self.avg_strategy, IterationAveragingStrategy, ): self.avg_strategy.apply(net) epoch_loss.update(None, preds=loss) lv = loss_value(epoch_loss) if not np.isfinite(lv): logger.warning("Epoch[%d] gave nan loss", epoch_no) return epoch_loss it.set_postfix( ordered_dict={ "epoch": f"{epoch_no + 1}/{self.epochs}", ("" if is_training else "validation_") + "avg_epoch_loss": lv, }, refresh=False, ) # print out parameters of the network at the first pass if batch_no == 1 and epoch_no == 0: net_name = type(net).__name__ num_model_param = self.count_model_params(net) logger.info( f"Number of parameters in {net_name}: {num_model_param}" ) # mark epoch end time and log time cost of current epoch toc = time.time() logger.info( "Epoch[%d] Elapsed time %.3f seconds", epoch_no, (toc - tic), ) logger.info( "Epoch[%d] Evaluation metric '%s'=%f", epoch_no, ("" if is_training else "validation_") + "epoch_loss", lv, ) if not is_training and isinstance( self.avg_strategy, IterationAveragingStrategy): # bring back the cached model self.avg_strategy.load_cached_model(net) return epoch_loss
def loop(epoch_no, batch_iter, is_training: bool = True) -> mx.metric.Loss: tic = time.time() epoch_loss = mx.metric.Loss() with tqdm(batch_iter) as it: for batch_no, data_entry in enumerate(it, start=1): if self.halt: break inputs = [data_entry[k] for k in input_names] with mx.autograd.record(): output = net(*inputs) # network can returns several outputs, the first being always the loss # when having multiple outputs, the forward returns a list in the case of hybrid and a # tuple otherwise # we may wrap network outputs in the future to avoid this type check if isinstance(output, (list, tuple)): loss = output[0] else: loss = output if is_training: loss.backward() trainer.step(batch_size) epoch_loss.update(None, preds=loss) it.set_postfix( ordered_dict={ ("" if is_training else "validation_") + "avg_epoch_loss": loss_value(epoch_loss) }, refresh=False, ) # print out parameters of the network at the first pass if batch_no == 1 and epoch_no == 0: net_name = type(net).__name__ num_model_param = self.count_model_params(net) logging.info( f"Number of parameters in {net_name}: {num_model_param}" ) # mark epoch end time and log time cost of current epoch toc = time.time() if (epoch_no % 16 == 0): logging.info( "Epoch[%d] Elapsed time %.3f seconds", epoch_no, (toc - tic), ) # check and log epoch loss check_loss_finite(loss_value(epoch_loss)) if (epoch_no % 16 == 0): logging.info( "Epoch[%d] Evaluation metric '%s'=%f", epoch_no, ("" if is_training else "validation_") + "epoch_loss", loss_value(epoch_loss), ) return epoch_loss
def __call__( self, ts_iterator: Iterable[Union[pd.DataFrame, pd.Series]], fcst_iterator: Iterable[Forecast], num_series: Optional[int] = None, ) -> Tuple[Dict[str, float], pd.DataFrame]: """ Compute accuracy metrics by comparing actual data to the forecasts. Parameters ---------- ts_iterator iterator containing true target on the predicted range fcst_iterator iterator of forecasts on the predicted range num_series number of series of the iterator (optional, only used for displaying progress) Returns ------- dict Dictionary of aggregated metrics pd.DataFrame DataFrame containing per-time-series metrics """ ts_iterator = iter(ts_iterator) fcst_iterator = iter(fcst_iterator) rows = [] with tqdm( zip(ts_iterator, fcst_iterator), total=num_series, desc="Running evaluation", ) as it, np.errstate(invalid="ignore"): if self.num_workers and not sys.platform == "win32": mp_pool = multiprocessing.Pool(initializer=None, processes=self.num_workers) rows = mp_pool.map( func=partial(worker_function, self), iterable=iter(it), chunksize=self.chunk_size, ) mp_pool.close() mp_pool.join() else: for ts, forecast in it: rows.append(self.get_metrics_per_ts(ts, forecast)) assert not any(True for _ in ts_iterator ), "ts_iterator has more elements than fcst_iterator" assert not any(True for _ in fcst_iterator ), "fcst_iterator has more elements than ts_iterator" if num_series is not None: assert len(rows) == num_series, ( f"num_series={num_series} did not match number of" f" elements={len(rows)}") # If all entries of a target array are NaNs, the resulting metric will # have value "masked". Pandas does not handle masked values correctly. # Thus we set dtype=np.float64 to convert masked values back to NaNs # which are handled correctly by pandas Dataframes during # aggregation. metrics_per_ts = pd.DataFrame(rows, dtype=np.float64) return self.get_aggregate_metrics(metrics_per_ts)
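# A hedged sketch (not from the original source) of the module-level helper assumed by
# the multiprocessing branch above; it must be picklable, so it lives at module scope
# and simply unpacks one (target, forecast) pair.
def worker_function(evaluator: "Evaluator", inp: tuple):
    ts, forecast = inp
    return evaluator.get_metrics_per_ts(ts, forecast)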
from gluonts.gluonts_tqdm import tqdm
from gluonts.model.simple_feedforward._estimator import SimpleFeedForwardEstimator
from gluonts.trainer import Trainer
from gluonts.trainer import learning_rate_scheduler as lrs
from mxnet import gluon, init
import autogluon as ag
from autogluon.utils.mxutils import get_data_rec

from asset import optimizer
from dataset import dataset
from estimator import estimator
from loop import loop
from newloop import newloop
from testloop import training_data_loader

input_names = ["past_target", "future_target"]

# Iterate once over the loader; `inputs` ends up holding the last batch.
with tqdm(training_data_loader) as it:
    for batch_no, data_entry in enumerate(it, start=1):
        inputs = [data_entry[k] for k in input_names]


class AutoEstimator:
    def __init__(self, search_space):
        # default search-space entries, used only when the caller did not set them
        search_config = {}
        search_config["learning_rate"] = ag.Real(1e-3, 1e-2, log=True)
        search_config["epochs"] = ag.Choice(40, 80)
        for config in search_config.keys():
            if config not in search_space.keys():
                search_space[config] = search_config[config]
def calculate_dataset_statistics(ts_dataset: Any) -> DatasetStatistics:
    """
    Computes the statistics of a given Dataset.

    Parameters
    ----------
    ts_dataset
        Dataset of which to compute the statistics.

    Returns
    -------
    DatasetStatistics
        NamedTuple containing the statistics.
    """
    num_time_observations = 0
    num_time_series = 0
    min_target = 1e20
    max_target = -1e20
    sum_target = 0.0
    sum_abs_target = 0.0
    integer_dataset = True
    observed_cats: Optional[List[Set[int]]] = None
    num_cats: Optional[int] = None
    num_dynamic_feat: Optional[int] = None
    num_missing_values = 0

    scale_histogram = ScaleHistogram()

    with tqdm(enumerate(ts_dataset, start=1), total=len(ts_dataset)) as it:
        for num_time_series, ts in it:
            target = ts['target']
            observed_target = target[~np.isnan(target)]
            cat = ts['cat'] if 'cat' in ts else []  # FIXME
            num_observations = len(observed_target)
            scale_histogram.add(observed_target)

            if num_observations > 0:
                num_time_observations += num_observations
                # TODO: this code does not handle missing value: min_target would
                # TODO: be NaN if any missing value is present
                min_target = float(min(min_target, observed_target.min()))
                max_target = float(max(max_target, observed_target.max()))
                num_missing_values += int(np.isnan(target).sum())

                assert_data_error(
                    np.all(np.isfinite(observed_target)),
                    'Target values have to be finite (e.g., not "inf", "-inf", '
                    '"nan", or null) and cannot exceed single precision floating '
                    'point range.',
                )
                sum_target += float(observed_target.sum())
                sum_abs_target += float(np.abs(observed_target).sum())
                integer_dataset = integer_dataset and bool(
                    np.all(np.mod(observed_target, 1) == 0)
                )

            if num_cats is None:
                num_cats = len(cat)
                observed_cats = [set() for _ in range(num_cats)]

            # needed to type check
            assert num_cats is not None
            assert observed_cats is not None

            assert_data_error(
                num_cats == len(cat),
                'Not all cat vectors have the same length {} != {}.',
                num_cats,
                len(cat),
            )
            for i, c in enumerate(cat):
                observed_cats[i].add(c)

            dynamic_feat = ts['dynamic_feat'] if 'dynamic_feat' in ts else None

            if dynamic_feat is None:
                # dynamic_feat not found, check it was the first ts we
                # encounter or that dynamic_feat were seen before
                assert_data_error(
                    num_dynamic_feat is None or num_dynamic_feat == 0,
                    'dynamic_feat was found for some instances but not others.',
                )
                num_dynamic_feat = 0
            else:
                if num_dynamic_feat is None:
                    # first dynamic_feat found
                    num_dynamic_feat = dynamic_feat.shape[0]
                else:
                    assert_data_error(
                        num_dynamic_feat == dynamic_feat.shape[0],
                        'Found instances with different number of features in '
                        'dynamic_feat, found one with {} and another with {}.',
                        num_dynamic_feat,
                        dynamic_feat.shape[0],
                    )

                assert_data_error(
                    np.all(np.isfinite(dynamic_feat)),
                    'Features values have to be finite and cannot exceed single '
                    'precision floating point range.',
                )
                num_dynamic_feat_time_steps = dynamic_feat.shape[1]
                assert_data_error(
                    num_dynamic_feat_time_steps == len(target),
                    'Each feature in dynamic_feat has to have the same length as '
                    'the target. Found an instance with dynamic_feat of length {} '
                    'and a target of length {}.',
                    num_dynamic_feat_time_steps,
                    len(target),
                )

    assert_data_error(num_time_series > 0, 'Time series dataset is empty!')
    assert_data_error(
        num_time_observations > 0,
        'Only empty time series found in the dataset!',
    )

    # note this require the above assumption to avoid a division by zero
    # runtime error
    mean_target_length = num_time_observations / num_time_series

    # note this require the above assumption to avoid a division by zero
    # runtime error
    mean_target = sum_target / num_time_observations
    mean_abs_target = sum_abs_target / num_time_observations
    integer_dataset = integer_dataset and min_target >= 0.0

    assert len(scale_histogram) == num_time_series

    return DatasetStatistics(
        cats=observed_cats if observed_cats is not None else [],
        integer_dataset=integer_dataset,
        max_target=max_target,
        mean_abs_target=mean_abs_target,
        mean_target=mean_target,
        mean_target_length=mean_target_length,
        min_target=min_target,
        num_missing_values=num_missing_values,
        num_dynamic_feat=num_dynamic_feat if num_dynamic_feat else 0,
        num_time_observations=num_time_observations,
        num_time_series=num_time_series,
        scale_histogram=scale_histogram,
    )
def loop( # todo call run epoch epoch_no, batch_iter, num_batches_to_use: Optional[int] = None, is_training: bool = True, ) -> mx.metric.Loss: nonlocal first_forward tic = time.time() epoch_loss = mx.metric.Loss() if is_training: # We should not call this method if we haven't compiled the # network yet. Instead, this callback is called after # network initialization. if not first_forward: self.callbacks.on_train_epoch_start( training_network=net) else: self.callbacks.on_validation_epoch_start( training_network=net) batch_iter = itertools.islice(batch_iter, num_batches_to_use) it = tqdm(batch_iter, total=num_batches_to_use) for batch_no, batch in enumerate(it, start=1): # `batch` here is expected to be a dictionary whose fields # should correspond 1-to-1 with the network inputs # see below how `batch.values()` is fed into the network if self.halt: break if first_forward: first_forward = False _ = net(*batch.values()) self.callbacks.on_network_initializing_end( training_network=net) # Call the batch start callback as the model was not # compiled before self.callbacks.on_train_epoch_start( training_network=net) with mx.autograd.record(): # we set the mode explicitly as by default mxnet assumes # predict mode and hence dropout layers are not used if # the mode is not explicitly set to training mode = (autograd.train_mode if is_training else autograd.predict_mode) with mode(): output = net(*batch.values()) # network can returns several outputs, the first being # always the loss when having multiple outputs, the # forward returns a list in the case of hybrid and a # tuple otherwise we may wrap network outputs in the # future to avoid this type check if isinstance(output, (list, tuple)): loss = output[0] else: loss = output batch_size = loss.shape[0] if not np.isfinite(ndarray.sum(loss).asscalar()): logger.warning( "Batch [%d] of Epoch[%d] gave NaN loss and it will be ignored", batch_no, epoch_no, ) else: if is_training: loss.backward() trainer.step(batch_size) self.callbacks.on_train_batch_end( training_network=net) else: self.callbacks.on_validation_batch_end( training_network=net) epoch_loss.update(None, preds=loss) lv = loss_value(epoch_loss) it.set_postfix( ordered_dict={ "epoch": f"{epoch_no + 1}/{self.epochs}", ("" if is_training else "validation_") + "avg_epoch_loss": lv, }, refresh=False, ) # print out parameters of the network at the first pass if batch_no == 1 and epoch_no == 0: net_name = type(net).__name__ num_model_param = self.count_model_params(net) logger.info( f"Number of parameters in {net_name}: {num_model_param}" ) it.close() # mark epoch end time and log time cost of current epoch toc = time.time() logger.info( "Epoch[%d] Elapsed time %.3f seconds", epoch_no, (toc - tic), ) logger.info( "Epoch[%d] Evaluation metric '%s'=%f", epoch_no, ("" if is_training else "validation_") + "epoch_loss", lv, ) return epoch_loss
def loop( batch_iter: DataLoader, num_batches_to_use: Optional[int] = None, is_training: bool = True, ) -> mx.metric.Loss: nonlocal first_forward, time_elapsed, validation_idx tic = time.time() subtic = 0 epoch_loss = mx.metric.Loss() batch_iter = itertools.islice(batch_iter, num_batches_to_use) it = tqdm(batch_iter, total=num_batches_to_use) for batch_no, batch in enumerate(it, start=1): # `batch` here is expected to be a dictionary whose fields # should correspond 1-to-1 with the network inputs # see below how `batch.values()` is fed into the network if first_forward: tictic = time.time() first_forward = False _ = net(*batch.values()) self.callbacks.on_network_initialization_end(net) subtic += time.time() - tictic with mx.autograd.record(): # type: ignore # we set the mode explicitly as by default mxnet assumes # predict mode and hence dropout layers are not used if # the mode is not explicitly set to training mode = (autograd.train_mode if is_training else autograd.predict_mode) with mode(): output = net(*batch.values()) # network can returns several outputs, the first being # always the loss when having multiple outputs, the # forward returns a list in the case of hybrid and a # tuple otherwise we may wrap network outputs in the # future to avoid this type check if isinstance(output, (list, tuple)): loss = output[0] else: loss = output batch_size = loss.shape[0] # pylint: disable=no-member if not np.isfinite( ndarray.sum(loss).asscalar()): # type: ignore logger.warning( "Batch [%d] gave NaN loss and it will be ignored", batch_no, ) else: if is_training: loss.backward() trainer.step(batch_size) epoch_loss.update(None, preds=loss) if is_training: total_time_elapsed = (time_elapsed + time.time() - tic - subtic) orig_lr = trainer.learning_rate tictic = time.time() self.callbacks.on_train_batch_end(net, total_time_elapsed) subtic += time.time() - tictic if trainer.learning_rate != orig_lr: logger.info( "Trainer learning rate set to %f", trainer.learning_rate, ) lv = _loss_value(epoch_loss) it.set_postfix( ordered_dict={ ("" if is_training else "validation_") + "avg_epoch_loss": lv, }, refresh=False, ) # Check if should finish if is_training: if total_time_elapsed > self.training_time: # type: ignore time_elapsed = total_time_elapsed # type: ignore break if len(self.validation_milestones) > validation_idx and ( total_time_elapsed # type: ignore > self.validation_milestones[validation_idx]): time_elapsed = total_time_elapsed # type: ignore validation_idx += 1 break # If validating, call the callback with the loss else: self.callbacks.on_validation_epoch_end(lv) # mark epoch end time and log time cost of current epoch toc = time.time() logger.info("Elapsed time %.3f seconds", toc - tic) logger.info( "Evaluation metric '%s'=%f", ("" if is_training else "validation_") + "epoch_loss", lv, # type: ignore ) return epoch_loss
def generate_retail_dataset(dataset_path: Path, split: str = "2011-11-24"): retail_dataset_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00352/Online%20Retail.xlsx" df = pd.read_excel(retail_dataset_url) combination = ["StockCode", "Country"] df = _preprocess_retail_data(df, combination) df.to_pickle("tmp/temp.pkl") # df = pd.read_pickle("temp.pkl") idx = pd.IndexSlice[:, :, :split] train_df = df.loc[idx, :].reset_index() idx = pd.IndexSlice[:, :, split:] test_df = df.loc[idx, :].reset_index() full_df = df.reset_index() single_prediction_length = len(test_df["InvoiceDate"].unique()) feat_static_cat = combination feat_dynamic_real = ['UnitPrice'] target = 'Quantity' date_col = 'InvoiceDate' os.makedirs(dataset_path, exist_ok=True) uniq_combs = train_df[combination].drop_duplicates().apply(tuple, axis=1) dynamic_real_train_l = [] dynamic_real_test_l = [] stat_cat_l = [] start_l = [] train_target_l = [] test_target_l = [] for stock_code, country in tqdm(uniq_combs): df = train_df[ (train_df.StockCode == stock_code) & (train_df.Country == country) ] _df = full_df[(full_df.StockCode == stock_code) & (full_df.Country == country)] train_ts = _df[target].values.ravel() if (train_ts>0).sum() > (single_prediction_length+13): test_feat_dyn_array = _df.loc[:, feat_dynamic_real].values.T train_feat_dyn_array = test_feat_dyn_array[:, :-single_prediction_length] test_ts = train_ts.copy() train_ts = train_ts[:-single_prediction_length] dynamic_real_train_l.append(train_feat_dyn_array) dynamic_real_test_l.append(test_feat_dyn_array) start_l.append(df[date_col].min()) train_target_l.append(train_ts) test_target_l.append(test_ts) stat_cat_l.append( np.squeeze(df.loc[:, feat_static_cat].drop_duplicates().values) ) stat_cat_cardinalities = [ len(full_df[col].unique()) for col in feat_static_cat ] with open(dataset_path / "metadata.json", "w") as f: f.write( json.dumps( metadata( cardinality=stat_cat_cardinalities, freq="1D", prediction_length=single_prediction_length, ) ) ) train_file = dataset_path / "train" / "data.json" test_file = dataset_path / "test" / "data.json" train_ds = [ { FieldName.ITEM_ID: "|".join(map(str,uniq_comb)), FieldName.TARGET: target.tolist(), FieldName.START: str(start), FieldName.FEAT_STATIC_CAT: fsc.tolist(), FieldName.FEAT_DYNAMIC_REAL: fdr.tolist(), } for uniq_comb, target, start, fdr, fsc in zip( uniq_combs, train_target_l, start_l, dynamic_real_train_l, stat_cat_l, ) ] save_to_file(train_file, train_ds) test_ds = [ { FieldName.ITEM_ID: "|".join(map(str,uniq_comb)), FieldName.TARGET: target.tolist(), FieldName.START: str(start), FieldName.FEAT_STATIC_CAT: fsc.tolist(), FieldName.FEAT_DYNAMIC_REAL: fdr.tolist(), } for uniq_comb, target, start, fdr, fsc in zip( uniq_combs, test_target_l, start_l, dynamic_real_test_l, stat_cat_l, ) ] save_to_file(test_file, test_ds)
def calculate_dataset_statistics(ts_dataset: Any) -> DatasetStatistics:
    """
    Computes the statistics of a given Dataset.

    Parameters
    ----------
    ts_dataset
        Dataset of which to compute the statistics.

    Returns
    -------
    DatasetStatistics
        NamedTuple containing the statistics.
    """
    num_time_observations = 0
    num_time_series = 0
    min_target = 1e20
    max_target = -1e20
    sum_target = 0.0
    sum_abs_target = 0.0
    integer_dataset = True
    observed_feat_static_cat: Optional[List[Set[int]]] = None
    observed_feat_static_real: Optional[List[Set[float]]] = None
    num_feat_static_real: Optional[int] = None
    num_feat_static_cat: Optional[int] = None
    num_past_feat_dynamic_real: Optional[int] = None
    num_feat_dynamic_real: Optional[int] = None
    num_feat_dynamic_cat: Optional[int] = None
    num_missing_values = 0

    scale_histogram = ScaleHistogram()

    with tqdm(enumerate(ts_dataset, start=1), total=len(ts_dataset)) as it:
        max_target_length = 0
        for num_time_series, ts in it:
            # TARGET
            target = ts[FieldName.TARGET]
            observed_target = target[~np.isnan(target)]
            num_observations = len(observed_target)

            if num_observations > 0:
                # 'nan' is handled in observed_target definition
                assert_data_error(
                    np.all(np.isfinite(observed_target)),
                    "Target values have to be finite (e.g., not inf, -inf, "
                    "or None) and cannot exceed single precision floating "
                    "point range.",
                )

                num_time_observations += num_observations
                max_target_length = max(num_observations, max_target_length)
                min_target = float(min(min_target, observed_target.min()))
                max_target = float(max(max_target, observed_target.max()))
                num_missing_values += int(np.isnan(target).sum())
                sum_target += float(observed_target.sum())
                sum_abs_target += float(np.abs(observed_target).sum())
                integer_dataset = integer_dataset and bool(
                    np.all(np.mod(observed_target, 1) == 0)
                )

                scale_histogram.add(
                    observed_target
                )  # after checks for inf and None

            # FEAT_STATIC_CAT
            feat_static_cat = (
                ts[FieldName.FEAT_STATIC_CAT]
                if FieldName.FEAT_STATIC_CAT in ts
                else []
            )

            if num_feat_static_cat is None:
                num_feat_static_cat = len(feat_static_cat)
                observed_feat_static_cat = [
                    set() for _ in range(num_feat_static_cat)
                ]

            # needed to type check
            assert num_feat_static_cat is not None
            assert observed_feat_static_cat is not None

            assert_data_error(
                num_feat_static_cat == len(feat_static_cat),
                "Not all feat_static_cat vectors have the same length {} != {}.",
                num_feat_static_cat,
                len(feat_static_cat),
            )
            for i, c in enumerate(feat_static_cat):
                observed_feat_static_cat[i].add(c)

            # FEAT_STATIC_REAL
            feat_static_real = (
                ts[FieldName.FEAT_STATIC_REAL]
                if FieldName.FEAT_STATIC_REAL in ts
                else []
            )

            if num_feat_static_real is None:
                num_feat_static_real = len(feat_static_real)
                observed_feat_static_real = [
                    set() for _ in range(num_feat_static_real)
                ]

            # needed to type check
            assert num_feat_static_real is not None
            assert observed_feat_static_real is not None

            assert_data_error(
                num_feat_static_real == len(feat_static_real),
                "Not all feat_static_real vectors have the same length {} != {}.",
                num_feat_static_real,
                len(feat_static_real),
            )
            for i, c in enumerate(feat_static_real):
                observed_feat_static_real[i].add(c)

            # FEAT_DYNAMIC_CAT
            feat_dynamic_cat = (
                ts[FieldName.FEAT_DYNAMIC_CAT]
                if FieldName.FEAT_DYNAMIC_CAT in ts
                else None
            )

            if feat_dynamic_cat is None:
                # feat_dynamic_cat not found, check it was the first ts we
                # encounter or that feat_dynamic_cat were seen before
                assert_data_error(
                    num_feat_dynamic_cat is None or num_feat_dynamic_cat == 0,
                    "feat_dynamic_cat was found for some instances but not others.",
                )
                num_feat_dynamic_cat = 0
            else:
                if num_feat_dynamic_cat is None:
                    # first num_feat_dynamic_cat found
                    num_feat_dynamic_cat = len(feat_dynamic_cat)
                else:
                    assert_data_error(
                        num_feat_dynamic_cat == len(feat_dynamic_cat),
                        "Found instances with different number of features in "
                        "feat_dynamic_cat, found one with {} and another with {}.",
                        num_feat_dynamic_cat,
                        len(feat_dynamic_cat),
                    )

                assert_data_error(
                    np.all(np.isfinite(feat_dynamic_cat)),
                    "Features values have to be finite and cannot exceed single "
                    "precision floating point range.",
                )

                num_feat_dynamic_cat_time_steps = len(feat_dynamic_cat[0])
                assert_data_error(
                    num_feat_dynamic_cat_time_steps == len(target),
                    "Each feature in feat_dynamic_cat has to have the same length as "
                    "the target. Found an instance with feat_dynamic_cat of length {} "
                    "and a target of length {}.",
                    num_feat_dynamic_cat_time_steps,
                    len(target),
                )

            # FEAT_DYNAMIC_REAL
            feat_dynamic_real = None
            if FieldName.FEAT_DYNAMIC_REAL in ts:
                feat_dynamic_real = ts[FieldName.FEAT_DYNAMIC_REAL]
            elif FieldName.FEAT_DYNAMIC_REAL_LEGACY in ts:
                feat_dynamic_real = ts[FieldName.FEAT_DYNAMIC_REAL_LEGACY]

            if feat_dynamic_real is None:
                # feat_dynamic_real not found, check it was the first ts we
                # encounter or that feat_dynamic_real were seen before
                assert_data_error(
                    num_feat_dynamic_real is None or num_feat_dynamic_real == 0,
                    "feat_dynamic_real was found for some instances but not others.",
                )
                num_feat_dynamic_real = 0
            else:
                if num_feat_dynamic_real is None:
                    # first num_feat_dynamic_real found
                    num_feat_dynamic_real = len(feat_dynamic_real)
                else:
                    assert_data_error(
                        num_feat_dynamic_real == len(feat_dynamic_real),
                        "Found instances with different number of features in "
                        "feat_dynamic_real, found one with {} and another with {}.",
                        num_feat_dynamic_real,
                        len(feat_dynamic_real),
                    )

                assert_data_error(
                    np.all(np.isfinite(feat_dynamic_real)),
                    "Features values have to be finite and cannot exceed single "
                    "precision floating point range.",
                )

                num_feat_dynamic_real_time_steps = len(feat_dynamic_real[0])
                assert_data_error(
                    num_feat_dynamic_real_time_steps == len(target),
                    "Each feature in feat_dynamic_real has to have the same length as "
                    "the target. Found an instance with feat_dynamic_real of length {} "
                    "and a target of length {}.",
                    num_feat_dynamic_real_time_steps,
                    len(target),
                )

            # PAST_FEAT_DYNAMIC_REAL
            past_feat_dynamic_real = None
            if FieldName.PAST_FEAT_DYNAMIC_REAL in ts:
                past_feat_dynamic_real = ts[FieldName.PAST_FEAT_DYNAMIC_REAL]

            if past_feat_dynamic_real is None:
                # past_feat_dynamic_real not found, check it was the first ts
                # we encounter or that past_feat_dynamic_real were seen before
                assert_data_error(
                    num_past_feat_dynamic_real is None
                    or num_past_feat_dynamic_real == 0,
                    "past_feat_dynamic_real was found for some instances but not others.",
                )
                num_past_feat_dynamic_real = 0
            else:
                if num_past_feat_dynamic_real is None:
                    # first num_past_feat_dynamic_real found
                    num_past_feat_dynamic_real = len(past_feat_dynamic_real)
                else:
                    assert_data_error(
                        num_past_feat_dynamic_real
                        == len(past_feat_dynamic_real),
                        "Found instances with different number of features in "
                        "past_feat_dynamic_real, found one with {} and another with {}.",
                        num_past_feat_dynamic_real,
                        len(past_feat_dynamic_real),
                    )

                assert_data_error(
                    np.all(np.isfinite(past_feat_dynamic_real)),
                    "Features values have to be finite and cannot exceed single "
                    "precision floating point range.",
                )

    assert_data_error(num_time_series > 0, "Time series dataset is empty!")
    assert_data_error(
        num_time_observations > 0,
        "Only empty time series found in the dataset!",
    )

    # note this require the above assumption to avoid a division by zero
    # runtime error
    mean_target_length = num_time_observations / num_time_series

    # note this require the above assumption to avoid a division by zero
    # runtime error
    mean_target = sum_target / num_time_observations
    mean_abs_target = sum_abs_target / num_time_observations
    integer_dataset = integer_dataset and min_target >= 0.0

    assert len(scale_histogram) == num_time_series

    return DatasetStatistics(
        integer_dataset=integer_dataset,
        max_target=max_target,
        mean_abs_target=mean_abs_target,
        mean_target=mean_target,
        mean_target_length=mean_target_length,
        max_target_length=max_target_length,
        min_target=min_target,
        num_missing_values=num_missing_values,
        feat_static_real=observed_feat_static_real
        if observed_feat_static_real
        else [],
        feat_static_cat=observed_feat_static_cat
        if observed_feat_static_cat
        else [],
        num_past_feat_dynamic_real=num_past_feat_dynamic_real,
        num_feat_dynamic_real=num_feat_dynamic_real,
        num_feat_dynamic_cat=num_feat_dynamic_cat,
        num_time_observations=num_time_observations,
        num_time_series=num_time_series,
        scale_histogram=scale_histogram,
    )
def evaluate_deepar( predictor: GluonPredictor, train_data: ListDataset, test_data: ListDataset, hierarchy_dict: Dict[int, List[int]], output_file: str = None, output_mean: bool = True, output_residuals: bool = True, ) -> Dict[Union[int, str], Dict[str, float]]: """ aggregates error metrics for each level of the hierarchy, optionally writes predictions/in-sample residuals to output file Arguments: predictor {GluonPredictor} -- predictor train_data {ListDataset} -- train dataset test_data {ListDataset} -- test dataset hierarchy_dict {Dict[int, List[int]]} -- mapping from hierachy level to series prediction idxs included in that level of hierarchy Keyword Arguments: output_file {str} -- output_file to save predictions (default: {None}) output_mean {bool} -- whether to output the mean (or median) predictions (default: {False}) output_residuals {bool} -- whether to output the residuals of in-sample predictions. If True, the in-sample residuals will be prepended to the out-of-sample predictions. Thus, if the in-sample data contains 24 timeteps, and the out-of-sample data contains 6 timesteps, the output data frame will contain 30 rows (timesteps) (default: {True}) Returns: Dict[Union[int, str], Dict[str, float]] -- mapping of hierarchy level (0-indexed) to dictionaries of aggregated metrics for that level of the hierarchy """ eval_forecasts = [] output_forecasts = [] with tqdm(predictor.predict(train_data), total=len(train_data), desc="Making Predictions") as it, np.errstate(invalid='ignore'): for forecast in it: output_forecasts.append( forecast.mean if output_mean else forecast.quantile(0.5)) eval_forecasts.append(forecast) preds = np.array(output_forecasts) if output_file: if output_residuals: preds = np.concatenate( (predictor.prediction_net.residuals.asnumpy(), preds), axis=1) out_df = pd.DataFrame(preds).T out_df.to_csv(output_file, index=False) eval_forecasts = np.array(eval_forecasts) evaluator = Evaluator(quantiles=[0.5]) evaluations = { level: evaluator([ to_pandas(series) for series in np.array(list(test_data))[np.array(idxs)] ], eval_forecasts[np.array(idxs)])[0] for level, idxs in hierarchy_dict.items() } evaluations['all'] = evaluator( [to_pandas(series) for series in np.array(list(test_data))], eval_forecasts)[0] return evaluations
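# A hedged usage sketch (not from the original source): evaluate a trained predictor on
# a three-level hierarchy where series 0 is the total, series 1-2 are intermediate
# aggregates, and series 3-6 are bottom-level series; `predictor`, `train_ds` and
# `test_ds` are placeholders for objects built elsewhere.
hierarchy_dict = {
    0: [0],           # top level
    1: [1, 2],        # intermediate level
    2: [3, 4, 5, 6],  # bottom level
}
evaluations = evaluate_deepar(
    predictor,
    train_ds,
    test_ds,
    hierarchy_dict,
    output_file="predictions.csv",
)
print(evaluations["all"]["MASE"])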
def loop( epoch_no, batch_iter, num_batches_to_use: Optional[int] = None, is_training: bool = True, ) -> mx.metric.Loss: nonlocal first_forward tic = time.time() epoch_loss = mx.metric.Loss() # use averaged model for validation if not is_training and isinstance( self.avg_strategy, IterationAveragingStrategy ): self.avg_strategy.load_averaged_model(net) batch_iter = itertools.islice( batch_iter, num_batches_to_use ) with tqdm(batch_iter, total=num_batches_to_use) as it: for batch_no, batch in enumerate(it, start=1): # `batch` here is expected to be a dictionary whose fields # should correspond 1-to-1 with the network inputs # see below how `batch.values()` is fed into the network if first_forward: first_forward = False _ = net(*batch.values()) if self.post_initialize_cb: self.post_initialize_cb(net) with mx.autograd.record(): # we set the mode explicitly as by default mxnet assumes predict mode and hence # dropout layers are not used if the mode is not explicitly set to training mode = ( autograd.train_mode if is_training else autograd.predict_mode ) with mode(): output = net(*batch.values()) # network can returns several outputs, the first being always the loss # when having multiple outputs, the forward returns a list in the case of hybrid and a # tuple otherwise # we may wrap network outputs in the future to avoid this type check if isinstance(output, (list, tuple)): loss = output[0] else: loss = output batch_size = loss.shape[0] if not np.isfinite(ndarray.sum(loss).asscalar()): logger.warning( "Batch [%d] of Epoch[%d] gave NaN loss and it will be ignored", batch_no, epoch_no, ) else: if is_training: loss.backward() trainer.step(batch_size) # iteration averaging in training if isinstance( self.avg_strategy, IterationAveragingStrategy, ): self.avg_strategy.apply(net) epoch_loss.update(None, preds=loss) lv = loss_value(epoch_loss) it.set_postfix( ordered_dict={ "epoch": f"{epoch_no + 1}/{self.epochs}", ("" if is_training else "validation_") + "avg_epoch_loss": lv, }, refresh=False, ) # print out parameters of the network at the first pass if batch_no == 1 and epoch_no == 0: net_name = type(net).__name__ num_model_param = self.count_model_params(net) logger.info( f"Number of parameters in {net_name}: {num_model_param}" ) # mark epoch end time and log time cost of current epoch toc = time.time() logger.info( "Epoch[%d] Elapsed time %.3f seconds", epoch_no, (toc - tic), ) logger.info( "Epoch[%d] Evaluation metric '%s'=%f", epoch_no, ("" if is_training else "validation_") + "epoch_loss", lv, ) if not is_training and isinstance( self.avg_strategy, IterationAveragingStrategy ): # bring back the cached model self.avg_strategy.load_cached_model(net) return epoch_loss