    def hybrid_forward(
        self,
        F,
        past_target: Tensor,
        future_target: Tensor,
        past_feat_dynamic: Tensor,
        future_feat_dynamic: Tensor,
        feat_static_cat: Tensor,
        past_observed_values: Tensor,
        future_observed_values: Tensor,
    ) -> Tensor:
        """
        Parameters
        ----------
        F: mx.symbol or mx.ndarray
            Gluon function space
        past_target: Tensor
            shape (batch_size, encoder_length, 1)
        future_target: Tensor
            shape (batch_size, encoder_length, decoder_length)
        past_feat_dynamic
            shape (batch_size, encoder_length, num_past_feature_dynamic)
        future_feat_dynamic
            shape (batch_size, encoder_length, decoder_length, num_feature_dynamic)
        feat_static_cat
            shape (batch_size, encoder_length, num_feature_static_cat)
        past_observed_values: Tensor
            shape (batch_size, encoder_length, 1)
        future_observed_values: Tensor
            shape (batch_size, encoder_length, decoder_length)

        Returns
        -------
        loss with shape (batch_size, prediction_length)
        """
        dec_output, scale = self.get_decoder_network_output(
            F,
            past_target,
            past_feat_dynamic,
            future_feat_dynamic,
            feat_static_cat,
            past_observed_values,
        )

        if self.quantile_output is not None:
            dec_dist_output = self.quantile_proj(dec_output)
            loss = self.loss(future_target, dec_dist_output)
        else:
            assert self.distr_output is not None
            distr_args = self.distr_args_proj(dec_output)
            distr = self.distr_output.distribution(distr_args, scale=scale)
            loss = distr.loss(future_target)

        # mask the loss based on observed indicator
        weighted_loss = weighted_average(F=F,
                                         x=loss,
                                         weights=future_observed_values,
                                         axis=1)

        return weighted_loss
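
A note on the masking step above: weighted_average reduces the per-step loss along the time axis while giving zero weight to entries whose observed indicator is zero, so imputed values do not drive training. A minimal NumPy sketch of that behavior (the function name and the epsilon guard are illustrative assumptions, not the GluonTS implementation):

import numpy as np

def masked_time_average(loss, observed, axis=1, eps=1e-10):
    # weight each loss entry by its 0/1 observed indicator
    weighted = loss * observed
    # average only over observed positions; guard against an all-missing series
    denom = np.maximum(observed.sum(axis=axis), eps)
    return weighted.sum(axis=axis) / denom

# loss of shape (batch_size, prediction_length) and a mask of the same shape
loss = np.array([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
mask = np.array([[1.0, 0.0, 1.0], [1.0, 1.0, 1.0]])
print(masked_time_average(loss, mask))  # [2. 5.]
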
Example #2
    def hybrid_forward(
        self,
        F,
        past_target: Tensor,
        past_observed_values: Tensor,
        past_is_pad: Tensor,
        future_target: Tensor,
        future_observed_values: Tensor,
        past_feat_dynamic_real: Tensor,
        past_feat_dynamic_cat: Tensor,
        future_feat_dynamic_real: Tensor,
        future_feat_dynamic_cat: Tensor,
        feat_static_real: Tensor,
        feat_static_cat: Tensor,
    ) -> Tensor:
        (
            past_target,
            past_covariates,
            past_observed_values,
            future_target,
            future_covariates,
            offset,
            scale,
        ) = self._preprocess(
            F,
            past_target,
            past_observed_values,
            past_is_pad,
            past_feat_dynamic_real,
            past_feat_dynamic_cat,
            future_target,
            future_feat_dynamic_real,
            future_feat_dynamic_cat,
            feat_static_real,
            feat_static_cat,
        )

        target = F.concat(past_target, future_target, dim=1)
        covars = F.concat(past_covariates, future_covariates, dim=1)
        observed_values = F.concat(
            past_observed_values, future_observed_values, dim=1
        )

        target = F.slice_axis(target, axis=1, begin=0, end=-1)
        covars = F.slice_axis(covars, axis=1, begin=0, end=-1)
        observed_values = F.slice_axis(
            observed_values, axis=1, begin=0, end=-1
        )

        preds = self._forward_step(
            F, self.prediction_length, target, covars, observed_values
        )
        preds = self._postprocess(F, preds, offset, scale)
        future_target = future_target * (scale + self.normalizer_eps) + offset
        loss = self.loss(future_target, preds)
        loss = weighted_average(F, loss, future_observed_values)
        return loss.mean()
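
In Example #2 the targets are normalized by a per-series offset and scale in _preprocess, and both the predictions and the future target are mapped back to the original scale before the loss is computed (hence future_target * (scale + normalizer_eps) + offset above). A sketch of such an affine normalization and its inverse; the use of mean and standard deviation here is an assumption for illustration, not necessarily what _preprocess does:

import numpy as np

def normalize(target, eps=1e-8):
    # per-series affine normalization: x -> (x - offset) / (scale + eps)
    offset = target.mean(axis=1, keepdims=True)
    scale = target.std(axis=1, keepdims=True)
    return (target - offset) / (scale + eps), offset, scale

def denormalize(normed, offset, scale, eps=1e-8):
    # inverse transform, matching x * (scale + eps) + offset in the snippet above
    return normed * (scale + eps) + offset

x = np.array([[10.0, 12.0, 14.0, 16.0]])
normed, offset, scale = normalize(x)
assert np.allclose(denormalize(normed, offset, scale), x)
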
Example #3
    def hybrid_forward(
        self,
        F,
        past_target: Tensor,
        past_observed_values: Tensor,
        future_target: Tensor,
        future_observed_values: Tensor,
        past_feat_dynamic_real: Tensor,
        past_feat_dynamic_cat: Tensor,
        feat_dynamic_real: Tensor,
        feat_dynamic_cat: Tensor,
        feat_static_real: Tensor,
        feat_static_cat: Tensor,
    ) -> Tensor:
        (
            past_covariates,
            future_covariates,
            static_covariates,
            offset,
            scale,
        ) = self._preprocess(
            F,
            past_target,
            past_observed_values,
            past_feat_dynamic_real,
            past_feat_dynamic_cat,
            feat_dynamic_real,
            feat_dynamic_cat,
            feat_static_real,
            feat_static_cat,
        )

        preds = self._forward(
            F,
            past_observed_values,
            past_covariates,
            future_covariates,
            static_covariates,
        )

        preds = self._postprocess(F, preds, offset, scale)

        loss = self.loss(future_target, preds)
        loss = weighted_average(F, loss, future_observed_values)
        return loss.mean()
Example #4
    def hybrid_forward(
        self,
        F,
        past_target: Tensor,
        future_target: Tensor,
        future_observed_values: Tensor,
    ) -> Tensor:
        """
        Computes a probability distribution for future data given the past,
        and returns the loss associated with the actual future observations.

        Parameters
        ----------
        F
        past_target
            Tensor with past observations.
            Shape: (batch_size, context_length, target_dim).
        future_target
            Tensor with future observations.
            Shape: (batch_size, prediction_length, target_dim).
        future_observed_values
            Tensor indicating which values in the target are observed, and
            which ones are imputed instead.

        Returns
        -------
        Tensor
            Loss tensor. Shape: (batch_size, ).
        """
        distr_args, loc, scale = self.get_distr_args(F, past_target)
        distr = self.distr_output.distribution(distr_args,
                                               loc=loc,
                                               scale=scale)

        # (batch_size, prediction_length, target_dim)
        loss = distr.loss(future_target)

        weighted_loss = weighted_average(F=F,
                                         x=loss,
                                         weights=future_observed_values,
                                         axis=1)

        # (batch_size, )
        return weighted_loss
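
Example #4 scores the future window with a parametric output distribution: distr_output.distribution(...) builds a distribution from the projected arguments and distr.loss(...) is, for most outputs, the negative log-likelihood per time step. A self-contained sketch of that pattern with a Gaussian (the closed-form NLL is standard; the helper name is illustrative):

import numpy as np

def gaussian_nll(target, mu, sigma):
    # element-wise negative log-likelihood of a Gaussian
    return 0.5 * np.log(2 * np.pi * sigma ** 2) + (target - mu) ** 2 / (2 * sigma ** 2)

target = np.array([[[0.0], [1.0]]])      # (batch=1, prediction_length=2, target_dim=1)
mu = np.zeros_like(target)
sigma = np.ones_like(target)
loss = gaussian_nll(target, mu, sigma)   # per-step loss, same shape as target
print(loss.mean(axis=1))                 # reduce over the prediction window
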
Example #5
    def hybrid_forward(
        self,
        F,
        feat_static_cat: Tensor,
        past_observed_values: Tensor,
        past_seasonal_indicators: Tensor,
        past_time_feat: Tensor,
        past_target: Tensor,
    ) -> Tensor:
        lds, _ = self.compute_lds(
            F,
            feat_static_cat=feat_static_cat,
            seasonal_indicators=past_seasonal_indicators.slice_axis(
                axis=1, begin=-self.past_length, end=None
            ),
            time_feat=past_time_feat.slice_axis(
                axis=1, begin=-self.past_length, end=None
            ),
            length=self.past_length,
        )

        _, scale = self.scaler(past_target, past_observed_values)

        observed_context = past_observed_values.slice_axis(
            axis=1, begin=-self.past_length, end=None
        )

        ll, _, _ = lds.log_prob(
            x=past_target.slice_axis(
                axis=1, begin=-self.past_length, end=None
            ),
            observed=observed_context.min(axis=-1, keepdims=False),
            scale=scale,
        )

        return weighted_average(
            F=F, x=-ll, axis=1, weights=observed_context.squeeze(axis=-1)
        )
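
Example #5 masks the LDS log-likelihood per time step: a step only counts when every target dimension is observed, which is what observed_context.min(axis=-1) computes. A small NumPy illustration of that reduction:

import numpy as np

# observed indicator of shape (batch, time, target_dim); 1 = observed, 0 = missing
observed = np.array([[[1.0, 1.0], [1.0, 0.0], [1.0, 1.0]]])

# a time step gets weight 1 only if all target dimensions are observed
step_weight = observed.min(axis=-1)
print(step_weight)  # [[1. 0. 1.]]
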
Example #6
    def hybrid_forward(
        self,
        F,
        feat_static_cat: Tensor,
        feat_static_real: Tensor,
        past_time_feat: Tensor,
        past_target: Tensor,
        past_observed_values: Tensor,
        future_time_feat: Tensor,
        future_target: Tensor,
        future_observed_values: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """
        Computes the loss for training DeepRenewalProcess; all input tensors
        representing time series have NTC layout.

        Parameters
        ----------
        F
        feat_static_cat : (batch_size, num_features)
        feat_static_real : (batch_size, num_features)
        past_time_feat : (batch_size, history_length, num_features)
        past_target : (batch_size, history_length, *target_shape)
        past_observed_values : (batch_size, history_length, *target_shape)
        future_time_feat : (batch_size, prediction_length, num_features)
        future_target : (batch_size, prediction_length, *target_shape)
        future_observed_values : (batch_size, prediction_length, *target_shape)

        Returns
        -------
        weighted_loss
            Loss averaged over the observed time steps, shape (batch_size,)
        loss
            Loss per time step, shape (batch_size, context_length + prediction_length)

        """
        distr_m, distr_q = self.distribution(
            feat_static_cat=feat_static_cat,
            feat_static_real=feat_static_real,
            past_time_feat=past_time_feat,
            past_target=past_target,
            past_observed_values=past_observed_values,
            future_time_feat=future_time_feat,
            future_target=future_target,
            future_observed_values=future_observed_values,
        )

        # put together target sequence
        # (batch_size, seq_len, *target_shape)
        target = F.concat(
            past_target.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            ),
            future_target,
            dim=1,
        )

        # (batch_size, seq_len)
        loss_m = distr_m.loss(
            target.slice_axis(axis=2, begin=0,
                              end=1).squeeze())  # target[:,:,0]
        loss_q = distr_q.loss(
            target.slice_axis(axis=2, begin=1,
                              end=2).squeeze())  # target[:,:,1]
        loss = loss_m + loss_q

        # (batch_size, seq_len, *target_shape)
        observed_values = F.concat(
            past_observed_values.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=self.history_length,
            ),
            future_observed_values,
            dim=1,
        )

        # mask the loss at one time step iff one or more observations is missing in the target dimensions
        # (batch_size, seq_len)
        loss_weights = (observed_values if (len(self.target_shape) == 0) else
                        observed_values.min(axis=-1, keepdims=False))

        weighted_loss = weighted_average(F=F,
                                         x=loss,
                                         weights=loss_weights,
                                         axis=1)

        # need to mask possible nans and -inf
        loss = F.where(condition=loss_weights, x=loss, y=F.zeros_like(loss))

        return weighted_loss, loss
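
Example #6 treats the target as two channels stacked on the last axis (for DeepRenewalProcess, interarrival times and demand sizes), slices them apart, scores each with its own distribution, and sums the two per-step losses. A NumPy sketch of that split; the placeholder losses below only stand in for distr_m.loss and distr_q.loss:

import numpy as np

# target with two channels on the last axis: (batch, seq_len, 2)
target = np.array([[[2.0, 5.0], [1.0, 3.0], [4.0, 7.0]]])

m = target[:, :, 0]  # first channel, shape (batch, seq_len)   -- target[:, :, 0]
q = target[:, :, 1]  # second channel, shape (batch, seq_len)  -- target[:, :, 1]

# placeholder per-step losses standing in for distr_m.loss(m) and distr_q.loss(q)
loss_m = np.abs(m - m.mean())
loss_q = np.abs(q - q.mean())
loss = loss_m + loss_q
print(loss.shape)  # (1, 3)
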
Example #7
    def train_hybrid_forward(
        self,
        F,
        target_dimension_indicator: Tensor,
        past_time_feat: Tensor,
        past_target_cdf: Tensor,
        past_observed_values: Tensor,
        past_is_pad: Tensor,
        future_time_feat: Tensor,
        future_target_cdf: Tensor,
        future_observed_values: Tensor,
    ) -> Tuple[Tensor, ...]:
        """
        Computes the loss for training DeepVAR; all input tensors representing
        time series have NTC layout.

        Parameters
        ----------
        F
        target_dimension_indicator
            Indices of the target dimension (batch_size, target_dim)
        past_time_feat
            Dynamic features of past time series (batch_size, history_length,
            num_features)
        past_target_cdf
            Past marginal CDF transformed target values (batch_size,
            history_length, target_dim)
        past_observed_values
            Indicator whether or not the values were observed (batch_size,
            history_length, target_dim)
        past_is_pad
            Indicator whether the past target values have been padded
            (batch_size, history_length)
        future_time_feat
            Future time features (batch_size, prediction_length, num_features)
        future_target_cdf
            Future marginal CDF transformed target values (batch_size,
            prediction_length, target_dim)
        future_observed_values
            Indicator whether or not the future values were observed
            (batch_size, prediction_length, target_dim)

        Returns
        -------
        loss
            Loss with shape (batch_size, 1)
        likelihoods
            Likelihoods for each time step
            (batch_size, context + prediction_length, 1)
        distr_args
            Distribution arguments (context + prediction_length,
            number_of_arguments)
        """

        seq_len = self.context_length + self.prediction_length

        # unroll the decoder in "training mode", i.e. by providing future data
        # as well
        rnn_outputs, _, scale, lags_scaled, inputs = self.unroll_encoder(
            F=F,
            past_time_feat=past_time_feat,
            past_target_cdf=past_target_cdf,
            past_observed_values=past_observed_values,
            past_is_pad=past_is_pad,
            future_time_feat=future_time_feat,
            future_target_cdf=future_target_cdf,
            target_dimension_indicator=target_dimension_indicator,
        )

        # put together target sequence
        # (batch_size, seq_len, target_dim)
        target = F.concat(
            past_target_cdf.slice_axis(axis=1,
                                       begin=-self.context_length,
                                       end=None),
            future_target_cdf,
            dim=1,
        )

        # assert_shape(target, (-1, seq_len, self.target_dim))

        distr, distr_args = self.distr(
            time_features=inputs,
            rnn_outputs=rnn_outputs,
            scale=scale,
            lags_scaled=lags_scaled,
            target_dimension_indicator=target_dimension_indicator,
            seq_len=self.context_length + self.prediction_length,
        )

        # we sum the last axis to have the same shape for all likelihoods
        # (batch_size, subseq_length, 1)
        likelihoods = -distr.log_prob(target).expand_dims(axis=-1)

        assert_shape(likelihoods, (-1, seq_len, 1))

        past_observed_values = F.broadcast_minimum(
            past_observed_values, 1 - past_is_pad.expand_dims(axis=-1))

        # (batch_size, subseq_length, target_dim)
        observed_values = F.concat(
            past_observed_values.slice_axis(axis=1,
                                            begin=-self.context_length,
                                            end=None),
            future_observed_values,
            dim=1,
        )

        # mask the loss at one time step if one or more observations is missing
        # in the target dimensions (batch_size, subseq_length, 1)
        loss_weights = observed_values.min(axis=-1, keepdims=True)

        assert_shape(loss_weights, (-1, seq_len, 1))

        loss = weighted_average(F=F,
                                x=likelihoods,
                                weights=loss_weights,
                                axis=1)

        assert_shape(loss, (-1, -1, 1))

        self.distribution = distr

        return (loss, likelihoods) + distr_args
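
Example #7 combines two indicators before weighting the likelihoods: the per-dimension observed-values mask and the padding mask, via broadcast_minimum(past_observed_values, 1 - past_is_pad), and then requires all target dimensions to be observed at a step. A NumPy sketch of that combination (shapes chosen for illustration):

import numpy as np

# observed indicator: (batch, history_length, target_dim)
past_observed_values = np.array([[[1.0, 1.0], [1.0, 0.0], [1.0, 1.0]]])
# padding indicator: (batch, history_length); 1 marks artificially padded steps
past_is_pad = np.array([[1.0, 0.0, 0.0]])

# a step counts only if it is observed AND not padding
mask = np.minimum(past_observed_values, 1.0 - past_is_pad[..., None])
# all target dimensions must be observed for the step to be weighted
loss_weights = mask.min(axis=-1, keepdims=True)
print(loss_weights[..., 0])  # [[0. 0. 1.]]
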
Example #8
    def hybrid_forward(
        self,
        F,
        feat_static_cat: Tensor,
        past_time_feat: Tensor,
        past_target: Tensor,
        past_observed_values: Tensor,
        future_time_feat: Tensor,
        future_target: Tensor,
        future_observed_values: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """
        Computes the loss for training DeepAR; all input tensors representing time series have NTC layout.

        Parameters
        ----------
        F
        feat_static_cat : (batch_size, num_features)
        past_time_feat : (batch_size, history_length, num_features)
        past_target : (batch_size, history_length, *target_shape)
        past_observed_values : (batch_size, history_length, *target_shape)
        future_time_feat : (batch_size, prediction_length, num_features)
        future_target : (batch_size, prediction_length, *target_shape)
        future_observed_values : (batch_size, prediction_length, *target_shape)

        Returns
        -------
        weighted_loss
            Loss averaged over the observed time steps, shape (batch_size,)
        loss
            Loss per time step, shape (batch_size, context_length + prediction_length)

        """

        distr = self.distribution(
            feat_static_cat=feat_static_cat,
            past_time_feat=past_time_feat,
            past_target=past_target,
            past_observed_values=past_observed_values,
            future_time_feat=future_time_feat,
            future_target=future_target,
            future_observed_values=future_observed_values,
        )

        # put together target sequence
        # (batch_size, seq_len, *target_shape)
        target = F.concat(
            past_target.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            ),
            future_target,
            dim=1,
        )

        # (batch_size, seq_len)
        loss = distr.loss(target)

        # (batch_size, seq_len, *target_shape)
        observed_values = F.concat(
            past_observed_values.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=self.history_length,
            ),
            future_observed_values,
            dim=1,
        )

        # mask the loss at one time step iff one or more observations is missing in the target dimensions
        # (batch_size, seq_len)
        loss_weights = (
            observed_values
            if (len(self.target_shape) == 0)
            else observed_values.min(axis=-1, keepdims=False)
        )

        weighted_loss = weighted_average(
            F=F, x=loss, weights=loss_weights, axis=1
        )

        return weighted_loss, loss
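
Example #8 scores the model over the last context_length steps of the history plus the prediction window, so the target handed to distr.loss is the tail of past_target concatenated with future_target. A NumPy sketch of that sequence assembly (the numbers are arbitrary):

import numpy as np

history_length, context_length, prediction_length = 5, 3, 2
past_target = np.arange(float(history_length)).reshape(1, history_length)  # (batch, history_length)
future_target = np.array([[10.0, 11.0]])                                   # (batch, prediction_length)

# keep only the last context_length steps of the history, then append the future window
context = past_target[:, history_length - context_length:]
target = np.concatenate([context, future_target], axis=1)
print(target)  # [[ 2.  3.  4. 10. 11.]]  shape (batch, context_length + prediction_length)
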
Example #9
    def hybrid_forward(
        self,
        F,
        feat_static_cat: Tensor,
        feat_static_real: Tensor,
        past_time_feat: Tensor,
        past_target: Tensor,
        past_observed_values: Tensor,
        future_time_feat: Tensor,
        future_target: Tensor,
        future_observed_values: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        """
        Computes the loss for training DeepAR; all input tensors representing
        time series have NTC layout.

        Parameters
        ----------
        F
        feat_static_cat : (batch_size, num_features)
        feat_static_real : (batch_size, num_features)
        past_time_feat : (batch_size, history_length, num_features)
        past_target : (batch_size, history_length, *target_shape)
        past_observed_values : (batch_size, history_length, *target_shape)
        future_time_feat : (batch_size, prediction_length, num_features)
        future_target : (batch_size, prediction_length, *target_shape)
        future_observed_values : (batch_size, prediction_length, *target_shape)

        Returns
        -------
        weighted_loss
            Loss averaged over the observed time steps, shape (batch_size,)
        loss
            Loss per time step, shape (batch_size, context_length + prediction_length)

        """

        outputs = self.distribution(
            feat_static_cat=feat_static_cat,
            feat_static_real=feat_static_real,
            past_time_feat=past_time_feat,
            past_target=past_target,
            past_observed_values=past_observed_values,
            future_time_feat=future_time_feat,
            future_target=future_target,
            future_observed_values=future_observed_values,
            return_rnn_outputs=True,
        )
        # since return_rnn_outputs=True, assert:
        assert isinstance(outputs, tuple)
        distr, rnn_outputs = outputs

        # put together target sequence
        # (batch_size, seq_len, *target_shape)
        target = F.concat(
            past_target.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            ),
            future_target,
            dim=1,
        )

        # (batch_size, seq_len)
        loss = distr.loss(target)

        # (batch_size, seq_len, *target_shape)
        observed_values = F.concat(
            past_observed_values.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=self.history_length,
            ),
            future_observed_values,
            dim=1,
        )

        # mask the loss at one time step iff one or more observations is missing in the target dimensions
        # (batch_size, seq_len)
        loss_weights = (
            observed_values
            if (len(self.target_shape) == 0)
            else observed_values.min(axis=-1, keepdims=False)
        )

        weighted_loss = weighted_average(
            F=F, x=loss, weights=loss_weights, axis=1
        )

        # need to mask possible nans and -inf
        loss = F.where(condition=loss_weights, x=loss, y=F.zeros_like(loss))

        # rnn_outputs is already merged into a single tensor
        assert not isinstance(rnn_outputs, list)
        # it seems that the trainer only uses the first return value for backward
        # so we only add regularization to weighted_loss
        if self.alpha:
            ar_loss = self.ar_loss(rnn_outputs)
            weighted_loss = weighted_loss + ar_loss
        if self.beta:
            tar_loss = self.tar_loss(rnn_outputs)
            weighted_loss = weighted_loss + tar_loss

        return weighted_loss, loss
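
Example #9 optionally adds two regularizers computed from the RNN outputs: one scaled by alpha that penalizes large activations, and one scaled by beta that penalizes fast changes between consecutive time steps. The sketch below shows the usual activation-regularization (AR) and temporal-activation-regularization (TAR) forms in the style of Merity et al.; whether ar_loss and tar_loss are defined exactly this way is an assumption:

import numpy as np

def ar_loss(rnn_outputs, alpha):
    # activation regularization: penalize large hidden activations
    return alpha * np.mean(rnn_outputs ** 2)

def tar_loss(rnn_outputs, beta):
    # temporal activation regularization: penalize changes between consecutive steps (axis 1 = time)
    diffs = rnn_outputs[:, 1:, :] - rnn_outputs[:, :-1, :]
    return beta * np.mean(diffs ** 2)

h = np.random.randn(2, 10, 8)  # (batch, seq_len, hidden_dim)
print(ar_loss(h, alpha=0.1), tar_loss(h, beta=0.2))
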
Example #10
    def hybrid_forward(
        self,
        F,
        past_target: Tensor,
        future_target: Tensor,
        past_feat_dynamic: Tensor,
        future_feat_dynamic: Tensor,
        feat_static_cat: Tensor,
        past_observed_values: Tensor,
        future_observed_values: Tensor,
    ) -> Tensor:
        """
        Parameters
        ----------
        F: mx.symbol or mx.ndarray
            Gluon function space
        past_target: Tensor
            shape (batch_size, encoder_length, 1)
        future_target: Tensor
            shape (batch_size, encoder_length, decoder_length)
        past_feat_dynamic
            shape (batch_size, encoder_length, num_past_feature_dynamic)
        future_feat_dynamic
            shape (batch_size, encoder_length, decoder_length, num_feature_dynamic)
        feat_static_cat
            shape (batch_size, encoder_length, num_feature_static_cat)
        past_observed_values: Tensor
            shape (batch_size, encoder_length, 1)
        future_observed_values: Tensor
            shape (batch_size, encoder_length, decoder_length)

        Returns
        -------
        loss with shape (batch_size, prediction_length)
        """

        input_tar_repr, scale, _ = self.input_repr(past_target,
                                                   F.ones_like(past_target),
                                                   None, [])
        output_tar_repr, _, _ = self.output_repr(future_target,
                                                 F.ones_like(future_target),
                                                 None, [])

        dec_output = self.get_decoder_network_output(
            F,
            input_tar_repr,
            past_feat_dynamic,
            future_feat_dynamic,
            feat_static_cat,
            past_observed_values,
        )

        dec_dist_output = self.quantile_proj(dec_output)
        loss = self.loss(output_tar_repr, dec_dist_output)

        # mask the loss based on observed indicator
        weighted_loss = weighted_average(F=F,
                                         x=loss,
                                         weights=future_observed_values,
                                         axis=1)

        return weighted_loss
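
Example #10 projects the decoder output onto a set of quantiles and scores them with a quantile (pinball) loss. A minimal sketch of the pinball loss for a single quantile level; the formula is standard, the function name is illustrative:

import numpy as np

def pinball_loss(y_true, y_pred, q):
    # under-prediction is weighted by q, over-prediction by 1 - q
    diff = y_true - y_pred
    return np.maximum(q * diff, (q - 1) * diff)

y_true = np.array([10.0, 10.0])
y_pred = np.array([8.0, 12.0])
print(pinball_loss(y_true, y_pred, q=0.9))  # [1.8 0.2]
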
Example #11
    def hybrid_forward(
        self,
        F,
        feat_static_cat: Tensor,
        feat_static_real: Tensor,
        past_time_feat: Tensor,
        past_target: Tensor,
        past_observed_values: Tensor,
        future_time_feat: Tensor,
        future_target: Tensor,
        future_observed_values: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        if self.ignore_future_targets:

            distr = self.distribution(
                feat_static_cat=feat_static_cat,
                feat_static_real=feat_static_real,
                past_time_feat=past_time_feat,
                past_target=past_target,
                past_observed_values=past_observed_values,
                future_time_feat=None,
                future_target=None,
                future_observed_values=future_observed_values,
            )

            loss = distr.loss(
                past_target.slice_axis(
                    axis=1,
                    begin=self.history_length - self.context_length,
                    end=None,
                ))

            # (batch_size, seq_len, *target_shape)
            observed_values = past_observed_values.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=self.history_length,
            )

        else:

            distr = self.distribution(
                feat_static_cat=feat_static_cat,
                feat_static_real=feat_static_real,
                past_time_feat=past_time_feat,
                past_target=past_target,
                past_observed_values=past_observed_values,
                future_time_feat=future_time_feat,
                future_target=future_target,
                future_observed_values=future_observed_values,
            )

            # put together target sequence
            # (batch_size, seq_len, *target_shape)
            target = F.concat(
                past_target.slice_axis(
                    axis=1,
                    begin=self.history_length - self.context_length,
                    end=None,
                ),
                future_target,
                dim=1,
            )

            # (batch_size, seq_len)
            loss = distr.loss(target)

            # (batch_size, seq_len, *target_shape)
            observed_values = F.concat(
                past_observed_values.slice_axis(
                    axis=1,
                    begin=self.history_length - self.context_length,
                    end=self.history_length,
                ),
                future_observed_values,
                dim=1,
            )

        # mask the loss at one time step iff one or more observations is missing in the target dimensions
        # (batch_size, seq_len)
        loss_weights = (observed_values if (len(self.target_shape) == 0) else
                        observed_values.min(axis=-1, keepdims=False))

        weighted_loss = weighted_average(F=F,
                                         x=loss,
                                         weights=loss_weights,
                                         axis=1)

        total_loss = F.sum(weighted_loss) / weighted_loss.shape[0]
        print_string = f'Forecasting loss: {total_loss.asscalar()}'

        # add self-supervised reconciliation loss
        if self.self_supervised_penalty > 0:
            agg_preds = F.take(distr.mean,
                               F.array(list(self.hierarchy_agg_dict.keys())))
            disagg_preds = F.concat(*[
                F.sum(F.take(distr.mean, F.array(disagg_idxs)), axis=0)
                for disagg_idxs in self.hierarchy_agg_dict.values()
            ],
                                    dim=0).reshape(agg_preds.shape)
            f_loss = F.sum(F.square(agg_preds - F.sum(disagg_preds, axis=0)))

        # add embedding reconciliation loss
        if self.embedding_agg_penalty > 0:
            embedded = self.embedder(
                F.expand_dims(F.array([i for i in range(self.cardinality[0])]),
                              axis=1))

            agg_embeds = F.take(embedded,
                                F.array(list(self.hierarchy_agg_dict.keys())))
            agg_copies = agg_embeds.copy().detach()

            disagg_embeds = [
                F.take(embedded, F.array(disagg_idxs))
                for disagg_idxs in self.hierarchy_agg_dict.values()
            ]
            disagg_lens = [len(disagg) for disagg in disagg_embeds]
            max_len = max(disagg_lens) + 1
            dim = embedded.shape[1]
            disagg_embeds = [
                F.concat(*[
                    disagg,
                    F.tile(agg, max_len - disagg.shape[0]).reshape(-1, dim)
                ],
                         dim=0).reshape(-1, dim).expand_dims(axis=0)
                for agg, disagg in zip(agg_copies, disagg_embeds)
            ]
            disagg_embeds = F.concat(*disagg_embeds, dim=0)

            if self.embedding_dist_metric == 'cosine':
                agg_embeds = F.L2Normalization(agg_embeds).expand_dims(axis=2)
                disagg_embeds = F.L2Normalization(disagg_embeds,
                                                  mode='spatial')
                e_loss = 1 - F.batch_dot(disagg_embeds, agg_embeds)
            else:
                agg_embeds = agg_embeds.expand_dims(axis=1)
                stability_constant = 1e-7
                e_loss = F.norm(agg_embeds - disagg_embeds +
                                stability_constant,
                                axis=2)
                e_loss = F.square(e_loss)

        if self.self_supervised_penalty > 0:
            total_f_loss = F.sum(f_loss) / weighted_loss.shape[0] / len(
                self.hierarchy_agg_dict)
            total_loss = total_loss + total_f_loss * F.array(
                [self.self_supervised_penalty])

        if self.embedding_agg_penalty > 0:
            total_e_loss = F.sum(e_loss) / len(self.hierarchy_agg_dict)
            total_loss = total_loss + total_e_loss * F.array(
                [self.embedding_agg_penalty])

        # print forecasting/reconciliation loss at each step
        if self.print_rec_penalty:

            if self.self_supervised_penalty > 0:
                print_string = print_string + f', Self-supervised Loss: {total_f_loss.asscalar()}'

            if self.embedding_agg_penalty > 0:
                print_string = print_string + f', Embedding agg Loss: {total_e_loss.asscalar()}'

            print(print_string)

        return total_loss, loss
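
Example #11 adds a self-supervised reconciliation penalty on top of the forecasting loss: for every aggregate series in hierarchy_agg_dict, the aggregate's forecast should match the sum of its children's forecasts, and the squared gap is penalized. A NumPy sketch of that consistency term; the dictionary layout mirrors the snippet above, everything else is illustrative:

import numpy as np

# mean forecasts for each series in the hierarchy: (num_series, prediction_length)
forecasts = np.array([
    [10.0, 12.0],  # series 0: aggregate
    [4.0, 5.0],    # series 1: child of 0
    [5.0, 6.0],    # series 2: child of 0
])
# aggregate index -> indices of its children
hierarchy_agg_dict = {0: [1, 2]}

penalty = 0.0
for agg_idx, child_idxs in hierarchy_agg_dict.items():
    gap = forecasts[agg_idx] - forecasts[child_idxs].sum(axis=0)
    penalty += np.sum(gap ** 2)  # squared mismatch between aggregate and sum of children
print(penalty)  # (10-9)^2 + (12-11)^2 = 2.0
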