def hybrid_forward(
    self,
    F,
    past_target: Tensor,
    future_target: Tensor,
    past_feat_dynamic: Tensor,
    future_feat_dynamic: Tensor,
    feat_static_cat: Tensor,
    past_observed_values: Tensor,
    future_observed_values: Tensor,
) -> Tensor:
    """
    Compute the training loss of the decoder output against
    ``future_target`` and average it over axis 1, weighted by
    ``future_observed_values``.

    Parameters
    ----------
    F: mx.symbol or mx.ndarray
        Gluon function space
    past_target: Tensor
        shape (batch_size, encoder_length, 1)
    future_target: Tensor
        shape (batch_size, encoder_length, decoder_length)
    past_feat_dynamic
        shape (batch_size, encoder_length, num_past_feature_dynamic)
    future_feat_dynamic
        shape (batch_size, encoder_length, decoder_length,
        num_feature_dynamic)
    feat_static_cat
        shape (batch_size, encoder_length, num_feature_static_cat)
    past_observed_values: Tensor
        shape (batch_size, encoder_length, 1)
    future_observed_values: Tensor
        shape (batch_size, encoder_length, decoder_length)

    Returns
    -------
    loss with shape (batch_size, prediction_length)
    """
    decoder_out, scale = self.get_decoder_network_output(
        F,
        past_target,
        past_feat_dynamic,
        future_feat_dynamic,
        feat_static_cat,
        past_observed_values,
    )

    if self.quantile_output is None:
        # Distribution head: project to distribution arguments and use the
        # distribution's own loss.
        assert self.distr_output is not None
        projected_args = self.distr_args_proj(decoder_out)
        distribution = self.distr_output.distribution(
            projected_args, scale=scale
        )
        raw_loss = distribution.loss(future_target)
    else:
        # Quantile head: project the decoder output and score it with
        # self.loss.
        quantile_preds = self.quantile_proj(decoder_out)
        raw_loss = self.loss(future_target, quantile_preds)

    # mask the loss based on observed indicator
    return weighted_average(
        F=F, x=raw_loss, weights=future_observed_values, axis=1
    )
def hybrid_forward(
    self,
    F,
    past_target: Tensor,
    past_observed_values: Tensor,
    past_is_pad: Tensor,
    future_target: Tensor,
    future_observed_values: Tensor,
    past_feat_dynamic_real: Tensor,
    past_feat_dynamic_cat: Tensor,
    future_feat_dynamic_real: Tensor,
    future_feat_dynamic_cat: Tensor,
    feat_static_real: Tensor,
    feat_static_cat: Tensor,
) -> Tensor:
    """
    Compute the scalar training loss.

    The inputs go through ``self._preprocess``; past and future parts are
    joined along axis 1 with the final step dropped, fed to
    ``self._forward_step``, and the post-processed predictions are scored
    with ``self.loss`` against the rescaled future target.

    Returns
    -------
    Tensor
        ``.mean()`` of the loss after weighting by
        ``future_observed_values``.
    """
    (
        prep_past_target,
        prep_past_covars,
        prep_past_observed,
        prep_future_target,
        prep_future_covars,
        offset,
        scale,
    ) = self._preprocess(
        F,
        past_target,
        past_observed_values,
        past_is_pad,
        past_feat_dynamic_real,
        past_feat_dynamic_cat,
        future_target,
        future_feat_dynamic_real,
        future_feat_dynamic_cat,
        feat_static_real,
        feat_static_cat,
    )

    def _join_without_last_step(past_part, future_part):
        # Concatenate along the time axis and cut off the final step.
        joined = F.concat(past_part, future_part, dim=1)
        return F.slice_axis(joined, axis=1, begin=0, end=-1)

    target_seq = _join_without_last_step(prep_past_target, prep_future_target)
    covar_seq = _join_without_last_step(prep_past_covars, prep_future_covars)
    # NOTE: the future half of the mask is the raw input argument, since
    # _preprocess does not return one.
    observed_seq = _join_without_last_step(
        prep_past_observed, future_observed_values
    )

    raw_preds = self._forward_step(
        F, self.prediction_length, target_seq, covar_seq, observed_seq
    )
    preds = self._postprocess(F, raw_preds, offset, scale)

    # presumably undoes the scaling applied by _preprocess so the target is
    # comparable with the post-processed predictions -- TODO confirm
    rescaled_future = (
        prep_future_target * (scale + self.normalizer_eps) + offset
    )

    step_loss = self.loss(rescaled_future, preds)
    averaged = weighted_average(F, step_loss, future_observed_values)
    return averaged.mean()
def hybrid_forward(
    self,
    F,
    past_target: Tensor,
    past_observed_values: Tensor,
    future_target: Tensor,
    future_observed_values: Tensor,
    past_feat_dynamic_real: Tensor,
    past_feat_dynamic_cat: Tensor,
    feat_dynamic_real: Tensor,
    feat_dynamic_cat: Tensor,
    feat_static_real: Tensor,
    feat_static_cat: Tensor,
) -> Tensor:
    """
    Compute the scalar training loss.

    Covariates, offset and scale come from ``self._preprocess``; the
    predictions from ``self._forward`` are passed through
    ``self._postprocess`` and scored with ``self.loss`` against
    ``future_target``.

    Returns
    -------
    Tensor
        ``.mean()`` of the loss after weighting by
        ``future_observed_values``.
    """
    preprocessed = self._preprocess(
        F,
        past_target,
        past_observed_values,
        past_feat_dynamic_real,
        past_feat_dynamic_cat,
        feat_dynamic_real,
        feat_dynamic_cat,
        feat_static_real,
        feat_static_cat,
    )
    past_covars, future_covars, static_covars, offset, scale = preprocessed

    raw_preds = self._forward(
        F,
        past_observed_values,
        past_covars,
        future_covars,
        static_covars,
    )
    preds = self._postprocess(F, raw_preds, offset, scale)

    step_loss = self.loss(future_target, preds)
    averaged = weighted_average(F, step_loss, future_observed_values)
    return averaged.mean()
def hybrid_forward(
    self,
    F,
    past_target: Tensor,
    future_target: Tensor,
    future_observed_values: Tensor,
) -> Tensor:
    """
    Build a probability distribution for the future from the past
    observations and return its loss on the actual future observations.

    Parameters
    ----------
    F
    past_target
        Tensor with past observations.
        Shape: (batch_size, context_length, target_dim).
    future_target
        Tensor with future observations.
        Shape: (batch_size, prediction_length, target_dim).
    future_observed_values
        Tensor indicating which values in the target are observed, and
        which ones are imputed instead.

    Returns
    -------
    Tensor
        Loss tensor. Shape: (batch_size, ).
    """
    distr_args, loc, scale = self.get_distr_args(F, past_target)
    future_distr = self.distr_output.distribution(
        distr_args, loc=loc, scale=scale
    )

    # (batch_size, prediction_length, target_dim)
    per_step_loss = future_distr.loss(future_target)

    # (batch_size, )
    return weighted_average(
        F=F, x=per_step_loss, weights=future_observed_values, axis=1
    )
def hybrid_forward(
    self,
    F,
    feat_static_cat: Tensor,
    past_observed_values: Tensor,
    past_seasonal_indicators: Tensor,
    past_time_feat: Tensor,
    past_target: Tensor,
) -> Tensor:
    """
    Return the negative log-likelihood of the last ``self.past_length``
    steps of ``past_target`` under the LDS from ``self.compute_lds``,
    averaged over axis 1 with the observed-value indicator as weights.
    """
    window = self.past_length

    def _last_window(tensor):
        # Keep only the trailing `window` steps along axis 1.
        return tensor.slice_axis(axis=1, begin=-window, end=None)

    lds, _ = self.compute_lds(
        F,
        feat_static_cat=feat_static_cat,
        seasonal_indicators=_last_window(past_seasonal_indicators),
        time_feat=_last_window(past_time_feat),
        length=window,
    )

    # The scaler is fed the full, unsliced past.
    _, scale = self.scaler(past_target, past_observed_values)

    observed_context = _last_window(past_observed_values)

    log_lik, _, _ = lds.log_prob(
        x=_last_window(past_target),
        observed=observed_context.min(axis=-1, keepdims=False),
        scale=scale,
    )

    return weighted_average(
        F=F,
        x=-log_lik,
        axis=1,
        weights=observed_context.squeeze(axis=-1),
    )
def hybrid_forward(
    self,
    F,
    feat_static_cat: Tensor,
    feat_static_real: Tensor,
    past_time_feat: Tensor,
    past_target: Tensor,
    past_observed_values: Tensor,
    future_time_feat: Tensor,
    future_target: Tensor,
    future_observed_values: Tensor,
) -> Tensor:
    """
    Computes the loss for training DeepRenewalProcess, all inputs tensors
    representing time series have NTC layout.

    Parameters
    ----------
    F
    feat_static_cat : (batch_size, num_features)
    feat_static_real : (batch_size, num_features)
    past_time_feat : (batch_size, history_length, num_features)
    past_target : (batch_size, history_length, *target_shape)
    past_observed_values : (batch_size, history_length, *target_shape)
    future_time_feat : (batch_size, prediction_length, num_features)
    future_target : (batch_size, prediction_length, *target_shape)
    future_observed_values : (batch_size, prediction_length, *target_shape)

    Returns
    -------
    weighted_loss
        Sum of the two distribution losses, averaged over axis 1 with the
        observed-value weights.
    loss
        Per-step summed loss, set to zero wherever the weight is zero.

    Notes
    -----
    The signature is annotated ``-> Tensor`` but a pair is returned.
    """
    distr_m, distr_q = self.distribution(
        feat_static_cat=feat_static_cat,
        feat_static_real=feat_static_real,
        past_time_feat=past_time_feat,
        past_target=past_target,
        past_observed_values=past_observed_values,
        future_time_feat=future_time_feat,
        future_target=future_target,
        future_observed_values=future_observed_values,
    )

    # put together target sequence
    # (batch_size, seq_len, *target_shape)
    target = F.concat(
        past_target.slice_axis(
            axis=1,
            begin=self.history_length - self.context_length,
            end=None,
        ),
        future_target,
        dim=1,
    )

    # (batch_size, seq_len)
    # Squeeze only the sliced axis: a bare squeeze() would also remove the
    # batch axis (or the time axis) when its size is 1, which breaks the
    # axis=1 averaging below.
    loss_m = distr_m.loss(
        target.slice_axis(axis=2, begin=0, end=1).squeeze(axis=2)
    )  # target[:,:,0]
    loss_q = distr_q.loss(
        target.slice_axis(axis=2, begin=1, end=2).squeeze(axis=2)
    )  # target[:,:,1]
    loss = loss_m + loss_q

    # (batch_size, seq_len, *target_shape)
    observed_values = F.concat(
        past_observed_values.slice_axis(
            axis=1,
            begin=self.history_length - self.context_length,
            end=self.history_length,
        ),
        future_observed_values,
        dim=1,
    )

    # mask the loss at one time step iff one or more observations is
    # missing in the target dimensions
    # (batch_size, seq_len)
    loss_weights = (
        observed_values
        if (len(self.target_shape) == 0)
        else observed_values.min(axis=-1, keepdims=False)
    )

    weighted_loss = weighted_average(
        F=F, x=loss, weights=loss_weights, axis=1
    )

    # need to mask possible nans and -inf
    loss = F.where(condition=loss_weights, x=loss, y=F.zeros_like(loss))

    return weighted_loss, loss
def train_hybrid_forward(
    self,
    F,
    target_dimension_indicator: Tensor,
    past_time_feat: Tensor,
    past_target_cdf: Tensor,
    past_observed_values: Tensor,
    past_is_pad: Tensor,
    future_time_feat: Tensor,
    future_target_cdf: Tensor,
    future_observed_values: Tensor,
) -> Tuple[Tensor, ...]:
    """
    Computes the loss for training DeepVAR, all inputs tensors representing
    time series have NTC layout.

    Parameters
    ----------
    F
    target_dimension_indicator
        Indices of the target dimension (batch_size, target_dim)
    past_time_feat
        Dynamic features of past time series (batch_size, history_length,
        num_features)
    past_target_cdf
        Past marginal CDF transformed target values (batch_size,
        history_length, target_dim)
    past_observed_values
        Indicator whether or not the values were observed (batch_size,
        history_length, target_dim)
    past_is_pad
        Indicator whether the past target values have been padded
        (batch_size, history_length)
    future_time_feat
        Future time features (batch_size, prediction_length, num_features)
    future_target_cdf
        Future marginal CDF transformed target values (batch_size,
        prediction_length, target_dim)
    future_observed_values
        Indicator whether or not the future values were observed
        (batch_size, prediction_length, target_dim)

    Returns
    -------
    loss
        Negative log-probabilities averaged over axis 1 with the
        observed-value weights; last axis has size 1
    likelihoods
        Negative log-probability for each time step
        (batch_size, context + prediction_length, 1)
    distr_args
        Distribution arguments, appended to the returned tuple

    Notes
    -----
    As a side effect the distribution is stored on ``self.distribution``.
    """
    ctx_len = self.context_length
    seq_len = ctx_len + self.prediction_length

    def _context_tail(tensor):
        # Trailing `context_length` steps along the time axis.
        return tensor.slice_axis(axis=1, begin=-ctx_len, end=None)

    # unroll the decoder in "training mode", i.e. by providing future data
    # as well
    rnn_outputs, _, scale, lags_scaled, inputs = self.unroll_encoder(
        F=F,
        past_time_feat=past_time_feat,
        past_target_cdf=past_target_cdf,
        past_observed_values=past_observed_values,
        past_is_pad=past_is_pad,
        future_time_feat=future_time_feat,
        future_target_cdf=future_target_cdf,
        target_dimension_indicator=target_dimension_indicator,
    )

    # put together target sequence
    # (batch_size, seq_len, target_dim)
    target = F.concat(
        _context_tail(past_target_cdf), future_target_cdf, dim=1
    )

    distr, distr_args = self.distr(
        time_features=inputs,
        rnn_outputs=rnn_outputs,
        scale=scale,
        lags_scaled=lags_scaled,
        target_dimension_indicator=target_dimension_indicator,
        seq_len=seq_len,
    )

    # add a trailing axis so all likelihoods share the same shape
    # (batch_size, subseq_length, 1)
    neg_log_prob = -distr.log_prob(target)
    likelihoods = neg_log_prob.expand_dims(axis=-1)
    assert_shape(likelihoods, (-1, seq_len, 1))

    # a padded step counts as unobserved
    not_padded = 1 - past_is_pad.expand_dims(axis=-1)
    past_observed = F.broadcast_minimum(past_observed_values, not_padded)

    # (batch_size, subseq_length, target_dim)
    observed_values = F.concat(
        _context_tail(past_observed), future_observed_values, dim=1
    )

    # mask the loss at one time step if one or more observations is missing
    # in the target dimensions (batch_size, subseq_length, 1)
    loss_weights = observed_values.min(axis=-1, keepdims=True)
    assert_shape(loss_weights, (-1, seq_len, 1))

    loss = weighted_average(
        F=F, x=likelihoods, weights=loss_weights, axis=1
    )
    assert_shape(loss, (-1, -1, 1))

    self.distribution = distr

    return (loss, likelihoods) + distr_args
def hybrid_forward(
    self,
    F,
    feat_static_cat: Tensor,
    past_time_feat: Tensor,
    past_target: Tensor,
    past_observed_values: Tensor,
    future_time_feat: Tensor,
    future_target: Tensor,
    future_observed_values: Tensor,
) -> Tensor:
    """
    Computes the loss for training DeepAR, all inputs tensors representing
    time series have NTC layout.

    Parameters
    ----------
    F
    feat_static_cat : (batch_size, num_features)
    past_time_feat : (batch_size, history_length, num_features)
    past_target : (batch_size, history_length, *target_shape)
    past_observed_values : (batch_size, history_length, *target_shape)
    future_time_feat : (batch_size, prediction_length, num_features)
    future_target : (batch_size, prediction_length, *target_shape)
    future_observed_values : (batch_size, prediction_length, *target_shape)

    Returns
    -------
    weighted_loss
        Loss averaged over axis 1 with the observed-value weights.
    loss
        Unweighted loss from the distribution, one value per time step.
    """
    context_start = self.history_length - self.context_length

    distr = self.distribution(
        feat_static_cat=feat_static_cat,
        past_time_feat=past_time_feat,
        past_target=past_target,
        past_observed_values=past_observed_values,
        future_time_feat=future_time_feat,
        future_target=future_target,
        future_observed_values=future_observed_values,
    )

    # put together target sequence
    # (batch_size, seq_len, *target_shape)
    past_context = past_target.slice_axis(
        axis=1, begin=context_start, end=None
    )
    target = F.concat(past_context, future_target, dim=1)

    # (batch_size, seq_len)
    loss = distr.loss(target)

    # (batch_size, seq_len, *target_shape)
    past_observed_context = past_observed_values.slice_axis(
        axis=1, begin=context_start, end=self.history_length
    )
    observed_values = F.concat(
        past_observed_context, future_observed_values, dim=1
    )

    # mask the loss at one time step iff one or more observations is
    # missing in the target dimensions
    # (batch_size, seq_len)
    if len(self.target_shape) == 0:
        loss_weights = observed_values
    else:
        loss_weights = observed_values.min(axis=-1, keepdims=False)

    weighted_loss = weighted_average(
        F=F, x=loss, weights=loss_weights, axis=1
    )

    return weighted_loss, loss
def hybrid_forward(
    self,
    F,
    feat_static_cat: Tensor,
    feat_static_real: Tensor,
    past_time_feat: Tensor,
    past_target: Tensor,
    past_observed_values: Tensor,
    future_time_feat: Tensor,
    future_target: Tensor,
    future_observed_values: Tensor,
) -> Tensor:
    """
    Computes the loss for training DeepAR, all inputs tensors representing
    time series have NTC layout. Optional regularisation terms computed
    from the RNN outputs are added to the weighted loss.

    Parameters
    ----------
    F
    feat_static_cat : (batch_size, num_features)
    feat_static_real : (batch_size, num_features)
    past_time_feat : (batch_size, history_length, num_features)
    past_target : (batch_size, history_length, *target_shape)
    past_observed_values : (batch_size, history_length, *target_shape)
    future_time_feat : (batch_size, prediction_length, num_features)
    future_target : (batch_size, prediction_length, *target_shape)
    future_observed_values : (batch_size, prediction_length, *target_shape)

    Returns
    -------
    weighted_loss
        Loss averaged over axis 1 with the observed-value weights, plus
        ``self.ar_loss`` / ``self.tar_loss`` terms when ``self.alpha`` /
        ``self.beta`` are truthy.
    loss
        Per-step loss, set to zero wherever the weight is zero.
    """
    context_start = self.history_length - self.context_length

    outputs = self.distribution(
        feat_static_cat=feat_static_cat,
        feat_static_real=feat_static_real,
        past_time_feat=past_time_feat,
        past_target=past_target,
        past_observed_values=past_observed_values,
        future_time_feat=future_time_feat,
        future_target=future_target,
        future_observed_values=future_observed_values,
        return_rnn_outputs=True,
    )
    # since return_rnn_outputs=True, assert:
    assert isinstance(outputs, tuple)
    distr, rnn_outputs = outputs

    # put together target sequence
    # (batch_size, seq_len, *target_shape)
    past_context = past_target.slice_axis(
        axis=1, begin=context_start, end=None
    )
    target = F.concat(past_context, future_target, dim=1)

    # (batch_size, seq_len)
    loss = distr.loss(target)

    # (batch_size, seq_len, *target_shape)
    past_observed_context = past_observed_values.slice_axis(
        axis=1, begin=context_start, end=self.history_length
    )
    observed_values = F.concat(
        past_observed_context, future_observed_values, dim=1
    )

    # mask the loss at one time step iff one or more observations is
    # missing in the target dimensions
    # (batch_size, seq_len)
    if len(self.target_shape) == 0:
        loss_weights = observed_values
    else:
        loss_weights = observed_values.min(axis=-1, keepdims=False)

    weighted_loss = weighted_average(
        F=F, x=loss, weights=loss_weights, axis=1
    )

    # need to mask possible nans and -inf
    loss = F.where(condition=loss_weights, x=loss, y=F.zeros_like(loss))

    # rnn_outputs is already merged into a single tensor
    assert not isinstance(rnn_outputs, list)

    # it seems that the trainer only uses the first return value for
    # backward, so we only add regularization to weighted_loss
    if self.alpha:
        weighted_loss = weighted_loss + self.ar_loss(rnn_outputs)
    if self.beta:
        weighted_loss = weighted_loss + self.tar_loss(rnn_outputs)

    return weighted_loss, loss
def hybrid_forward(
    self,
    F,
    past_target: Tensor,
    future_target: Tensor,
    past_feat_dynamic: Tensor,
    future_feat_dynamic: Tensor,
    feat_static_cat: Tensor,
    past_observed_values: Tensor,
    future_observed_values: Tensor,
) -> Tensor:
    """
    Compute the quantile training loss on the represented targets and
    average it over axis 1, weighted by ``future_observed_values``.

    Both targets are passed through ``self.input_repr`` /
    ``self.output_repr`` with an all-ones tensor as second argument before
    the decoder is run.

    Parameters
    ----------
    F: mx.symbol or mx.ndarray
        Gluon function space
    past_target: Tensor
        shape (batch_size, encoder_length, 1)
    future_target: Tensor
        shape (batch_size, encoder_length, decoder_length)
    past_feat_dynamic
        shape (batch_size, encoder_length, num_past_feature_dynamic)
    future_feat_dynamic
        shape (batch_size, encoder_length, decoder_length,
        num_feature_dynamic)
    feat_static_cat
        shape (batch_size, encoder_length, num_feature_static_cat)
    past_observed_values: Tensor
        shape (batch_size, encoder_length, 1)
    future_observed_values: Tensor
        shape (batch_size, encoder_length, decoder_length)

    Returns
    -------
    loss with shape (batch_size, prediction_length)
    """
    # Only the first element of each representation result is used here.
    past_repr, _, _ = self.input_repr(
        past_target, F.ones_like(past_target), None, []
    )
    future_repr, _, _ = self.output_repr(
        future_target, F.ones_like(future_target), None, []
    )

    decoder_out = self.get_decoder_network_output(
        F,
        past_repr,
        past_feat_dynamic,
        future_feat_dynamic,
        feat_static_cat,
        past_observed_values,
    )

    quantile_preds = self.quantile_proj(decoder_out)
    raw_loss = self.loss(future_repr, quantile_preds)

    # mask the loss based on observed indicator
    return weighted_average(
        F=F, x=raw_loss, weights=future_observed_values, axis=1
    )
def hybrid_forward(
    self,
    F,
    feat_static_cat: Tensor,
    feat_static_real: Tensor,
    past_time_feat: Tensor,
    past_target: Tensor,
    past_observed_values: Tensor,
    future_time_feat: Tensor,
    future_target: Tensor,
    future_observed_values: Tensor,
) -> Tensor:
    """
    Compute the forecasting loss plus optional reconciliation penalties.

    When ``self.ignore_future_targets`` is set, the distribution is built
    without future features/targets and scored on the context window of
    ``past_target`` only; otherwise it is scored on the context window
    followed by ``future_target``. A self-supervised penalty (on
    ``distr.mean``) and an embedding penalty (on ``self.embedder``
    outputs) are added when their coefficients are positive, both driven
    by ``self.hierarchy_agg_dict``.

    Returns
    -------
    total_loss
        Batch-averaged weighted loss plus the enabled penalty terms.
    loss
        Unweighted per-step loss from the distribution.

    Notes
    -----
    The signature is annotated ``-> Tensor`` but a pair is returned.
    The body reads ``.shape`` and may call ``.asscalar()``, so it
    presumably only works in imperative (non-hybridized) mode -- TODO
    confirm.
    """
    if self.ignore_future_targets:
        distr = self.distribution(
            feat_static_cat=feat_static_cat,
            feat_static_real=feat_static_real,
            past_time_feat=past_time_feat,
            past_target=past_target,
            past_observed_values=past_observed_values,
            future_time_feat=None,
            future_target=None,
            future_observed_values=future_observed_values,
        )

        loss = distr.loss(
            past_target.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            )
        )

        # (batch_size, seq_len, *target_shape)
        observed_values = past_observed_values.slice_axis(
            axis=1,
            begin=self.history_length - self.context_length,
            end=self.history_length,
        )
    else:
        distr = self.distribution(
            feat_static_cat=feat_static_cat,
            feat_static_real=feat_static_real,
            past_time_feat=past_time_feat,
            past_target=past_target,
            past_observed_values=past_observed_values,
            future_time_feat=future_time_feat,
            future_target=future_target,
            future_observed_values=future_observed_values,
        )

        # put together target sequence
        # (batch_size, seq_len, *target_shape)
        target = F.concat(
            past_target.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            ),
            future_target,
            dim=1,
        )

        # (batch_size, seq_len)
        loss = distr.loss(target)

        # (batch_size, seq_len, *target_shape)
        observed_values = F.concat(
            past_observed_values.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=self.history_length,
            ),
            future_observed_values,
            dim=1,
        )

    # mask the loss at one time step iff one or more observations is
    # missing in the target dimensions
    # (batch_size, seq_len)
    loss_weights = (
        observed_values
        if (len(self.target_shape) == 0)
        else observed_values.min(axis=-1, keepdims=False)
    )

    weighted_loss = weighted_average(
        F=F, x=loss, weights=loss_weights, axis=1
    )

    total_loss = F.sum(weighted_loss) / weighted_loss.shape[0]
    # Keep a handle on the pure forecasting loss for optional logging.
    # Its scalar value is only fetched when printing is enabled, so the
    # common (non-printing) path does not block on the device.
    forecast_loss = total_loss

    # add self-supervised reconciliation loss
    if self.self_supervised_penalty > 0:
        agg_preds = F.take(
            distr.mean, F.array(list(self.hierarchy_agg_dict.keys()))
        )
        disagg_preds = F.concat(
            *[
                F.sum(F.take(distr.mean, F.array(disagg_idxs)), axis=0)
                for disagg_idxs in self.hierarchy_agg_dict.values()
            ],
            dim=0,
        ).reshape(agg_preds.shape)
        # NOTE(review): disagg_preds is already summed per aggregate above
        # and is summed over axis 0 again here -- confirm the second
        # reduction is intended.
        f_loss = F.sum(
            F.square(agg_preds - F.sum(disagg_preds, axis=0))
        )

    # add embedding reconciliation loss
    if self.embedding_agg_penalty > 0:
        embedded = self.embedder(
            F.expand_dims(
                F.array(list(range(self.cardinality[0]))), axis=1
            )
        )

        agg_embeds = F.take(
            embedded, F.array(list(self.hierarchy_agg_dict.keys()))
        )
        # Detached copies are used only as padding rows below.
        agg_copies = agg_embeds.copy().detach()

        disagg_embeds = [
            F.take(embedded, F.array(disagg_idxs))
            for disagg_idxs in self.hierarchy_agg_dict.values()
        ]
        disagg_lens = [len(disagg) for disagg in disagg_embeds]
        max_len = max(disagg_lens) + 1
        dim = embedded.shape[1]

        # Pad every group to max_len rows by appending repeats of its own
        # (detached) aggregate embedding, then stack the groups on a new
        # leading axis.
        disagg_embeds = [
            F.concat(
                *[
                    disagg,
                    F.tile(agg, max_len - disagg.shape[0]).reshape(-1, dim),
                ],
                dim=0,
            ).reshape(-1, dim).expand_dims(axis=0)
            for agg, disagg in zip(agg_copies, disagg_embeds)
        ]
        disagg_embeds = F.concat(*disagg_embeds, dim=0)

        if self.embedding_dist_metric == 'cosine':
            agg_embeds = F.L2Normalization(agg_embeds).expand_dims(axis=2)
            disagg_embeds = F.L2Normalization(disagg_embeds, mode='spatial')
            e_loss = 1 - F.batch_dot(disagg_embeds, agg_embeds)
        else:
            agg_embeds = agg_embeds.expand_dims(axis=1)
            stability_constant = 1e-7
            e_loss = F.norm(
                agg_embeds - disagg_embeds + stability_constant, axis=2
            )
            e_loss = F.square(e_loss)

    if self.self_supervised_penalty > 0:
        total_f_loss = (
            F.sum(f_loss)
            / weighted_loss.shape[0]
            / len(self.hierarchy_agg_dict)
        )
        total_loss = total_loss + total_f_loss * F.array(
            [self.self_supervised_penalty]
        )

    if self.embedding_agg_penalty > 0:
        total_e_loss = F.sum(e_loss) / len(self.hierarchy_agg_dict)
        total_loss = total_loss + total_e_loss * F.array(
            [self.embedding_agg_penalty]
        )

    # print forecasting/reconciliation loss at each step
    if self.print_rec_penalty:
        print_string = f'Forecasting loss: {forecast_loss.asscalar()}'
        if self.self_supervised_penalty > 0:
            print_string = print_string + f', Self-supervised Loss: {total_f_loss.asscalar()}'
        if self.embedding_agg_penalty > 0:
            print_string = print_string + f', Embedding agg Loss: {total_e_loss.asscalar()}'
        print(print_string)

    return total_loss, loss