Example #1
    def hybrid_forward(
        self,
        F,
        feat_static_cat: Tensor,
        past_target: Tensor,
        past_observed_values: Tensor,
        past_is_pad: Tensor,
        past_time_feat: Tensor,
        future_time_feat: Tensor,
        scale: Tensor,
    ) -> Tensor:

        embedded_cat = self.feature_embedder(feat_static_cat)
        static_feat = F.concat(embedded_cat, F.log(scale + 1.0), dim=1)

        past_target = past_target.astype("int32")

        def blow_up(u):
            """
            Expand to (batch_size x num_samples)
            """
            return F.repeat(u, repeats=self.num_samples, axis=0)

        def is_last_layer(i):
            return i + 1 == len(self.dilations)

        queues = []

        full_time_features = F.concat(past_time_feat, future_time_feat, dim=-1)

        future_observed_values = F.slice_axis(future_time_feat,
                                              begin=0,
                                              end=1,
                                              axis=1).ones_like()

        full_observed = F.concat(
            F.expand_dims(past_observed_values, axis=1),
            future_observed_values,
            dim=-1,
        )

        repeated_static_feat = F.repeat(
            F.expand_dims(static_feat, axis=-1),
            repeats=self.pred_length + self.receptive_field,
            axis=-1,
        )

        full_features = F.concat(full_time_features,
                                 full_observed,
                                 repeated_static_feat,
                                 dim=1)

        feature_slice = F.slice_axis(
            full_features,
            begin=-self.pred_length - self.receptive_field + 1,
            end=None,
            axis=-1,
        )

        tmp = F.slice_axis(past_target,
                           begin=-self.receptive_field,
                           end=None,
                           axis=-1)
        o = self.target_embed(tmp).swapaxes(1, 2)
        o = F.concat(
            o,
            F.slice_axis(feature_slice,
                         begin=-self.receptive_field,
                         end=None,
                         axis=-1),
            dim=1,
        )
        o = self.conv_project(o)

        for i, d in enumerate(self.dilations):
            sz = 1 if d == 2**(self.dilation_depth - 1) else d * 2
            _, o = self.residuals[i](o)
            if not is_last_layer(i):
                o_chunk = F.slice_axis(o, begin=-sz - 1, end=-1, axis=-1)
            else:
                o_chunk = o
            queues.append(blow_up(o_chunk))

        res = F.slice_axis(past_target, begin=-2, end=None, axis=-1)
        res = blow_up(res)

        for n in range(self.pred_length):
            queues_next = []
            o = self.target_embed(
                F.slice_axis(res, begin=-2, end=None, axis=-1)).swapaxes(1, 2)
            b = F.slice_axis(
                full_features,
                begin=self.receptive_field + n - 1,
                end=self.receptive_field + n + 1,
                axis=-1,
            )
            b = blow_up(b)
            o = F.concat(o, b, dim=1)
            o = self.conv_project(o)

            skip_outs = []
            for i, d in enumerate(self.dilations):
                skip, o = self.residuals[i](o)
                skip_outs.append(skip)
                if not is_last_layer(i):
                    q = queues[i]
                    o = F.concat(q, o, num_args=2, dim=-1)
                    queues_next.append(
                        F.slice_axis(o, begin=1, end=None, axis=-1))
            queues = queues_next
            y = sum(skip_outs)
            y = self.output_act(y)
            y = self.conv1(y)
            y = self.output_act(y)
            unnormalized_outputs = self.conv2(y)
            if self.temperature > 0:
                probs = F.softmax(unnormalized_outputs / self.temperature,
                                  axis=1)
                y = F.sample_multinomial(probs.swapaxes(1, 2))
            else:
                y = F.argmax(unnormalized_outputs, axis=1)
            y = y.astype("int32")
            res = F.concat(res, y, num_args=2, dim=-1)
        samples = F.slice_axis(res, begin=-self.pred_length, end=None, axis=-1)
        samples = samples.reshape(shape=(-1, self.num_samples,
                                         self.pred_length))
        samples = self.post_transform(samples)
        samples = F.broadcast_mul(scale.expand_dims(axis=1), samples)
        return samples
Example #2
 def process_static_cat(self, F, feature: Tensor) -> Tensor:
     feature = self.embed_static(feature.astype(self.dtype))
     return F.tile(feature.expand_dims(axis=1), reps=(1, self.T, 1))
Example #3
 def process_static_real(self, F, feature: Tensor) -> Tensor:
     return F.tile(feature.expand_dims(axis=1), reps=(1, self.T, 1))
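
Examples #2 and #3 broadcast a per-series static feature over the T time steps by inserting a time axis with expand_dims and then tiling it. A minimal standalone sketch of that pattern with plain mx.nd and hypothetical sizes:

import mxnet as mx

# Hypothetical sizes: batch_size=2, feature_dim=3, T=4.
feature = mx.nd.ones((2, 3))
tiled = mx.nd.tile(feature.expand_dims(axis=1), reps=(1, 4, 1))
print(tiled.shape)  # (2, 4, 3): the static feature is repeated at every time step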
Example #4
 def s(mu: Tensor, sigma: Tensor) -> Tensor:
     raw_samples = self.F.sample_normal(mu=mu.zeros_like(),
                                        sigma=sigma.ones_like())
     return sigma * raw_samples + mu
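
Example #4 draws standard normal noise and rescales it (the reparameterisation trick), so gradients can propagate through mu and sigma. A small sketch with plain mx.nd and made-up parameter values:

import mxnet as mx

mu = mx.nd.array([0.0, 5.0])      # hypothetical means
sigma = mx.nd.array([1.0, 2.0])   # hypothetical standard deviations
# eps ~ N(0, 1), same shape as mu/sigma
eps = mx.nd.sample_normal(mu=mx.nd.zeros_like(mu), sigma=mx.nd.ones_like(sigma))
samples = sigma * eps + mu        # samples ~ N(mu, sigma^2)
print(samples.shape)              # (2,)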
Example #5
 def quantile(self, level: Tensor) -> Tensor:
     F = self.F
     for _ in range(self.all_dim):
         level = level.expand_dims(axis=-1)
     return F.broadcast_add(F.broadcast_mul(self.high - self.low, level),
                            self.low)
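
Example #5 is the quantile function of a Uniform(low, high) distribution, q(level) = low + (high - low) * level, with the level tensor expanded so it broadcasts against the batch dimensions. A numeric sketch with hypothetical bounds (using a single expand_dims instead of the all_dim loop):

import mxnet as mx

low = mx.nd.array([0.0, 10.0])
high = mx.nd.array([1.0, 20.0])
level = mx.nd.array([0.5]).expand_dims(axis=-1)  # quantile level in (0, 1)
q = mx.nd.broadcast_add(mx.nd.broadcast_mul(high - low, level), low)
print(q)  # expect [[0.5, 15.0]]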
Example #6
def nans_like(x: Tensor) -> Tensor:
    return x.zeros_like() / 0.0
Example #7
def cumsum(F,
           x: Tensor,
           exclusive: bool = False,
           reverse: bool = False) -> Tensor:
    r"""
    Find cumulative sum on the last axis by multiplying with lower triangular
    ones-matrix:

    .. math::

       \operatorname{cumsum}(x) =
       \begin{cases}
         \operatorname{ltr\_ones} \times x
           & \text{for cumulative sum}\\
         x \times \operatorname{ltr\_ones}
           & \text{for cumulative sum in the reverse order}
       \end{cases}

    Also supports `exclusive` flag to start the cumsum with zero.
    For example, if :math:`x = [a, b, c]`, we have

    .. math::

       \operatorname{cumsum}(x) =
       \begin{cases}
         [a, a + b, a + b + c]
           & \text{if }\mathit{reverse = False, exclusive = False}\\
         [0, a, a + b]
           & \text{if }\mathit{reverse = False, exclusive = True}\\
         [a + b + c, b + c, c]
           & \text{if }\mathit{reverse = True, exclusive = False}\\
         [b + c, c, 0]
           & \text{if }\mathit{reverse = True, exclusive = True}\\
       \end{cases}

    Parameters
    ----------
    F
        The function space to use.
    x
        A tensor with shape :math:`(..., n)`.
    exclusive
        If `True`, the cumulative sum starts with zero.
    reverse
        If `True`, the cumulative sum is performed in the opposite direction.

    Returns
    -------
    Tensor:
        A modified tensor with identical shape and cumulative sums in the last
        axis.
    """

    # Create a new axis (for matrix multiplication) either at last location or
    # last-but-one location (for reverse mode)
    exp_dim = -2 if reverse else -1
    # (..., 1, n) if reverse is True and (..., n, 1) otherwise
    x = x.expand_dims(axis=exp_dim)

    # Ones_matrix (..., n, n)
    ones_matrix = F.linalg_gemm2(
        F.ones_like(x),
        F.ones_like(x),
        transpose_a=reverse,
        transpose_b=not reverse,
    )
    cumulative_sum = F.linalg_trmm(ones_matrix, x, rightside=reverse)

    if exclusive:
        cumulative_sum = cumulative_sum - x

    return cumulative_sum.squeeze(axis=exp_dim)
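
A usage sketch for the helper above, assuming `cumsum` as defined here is in scope; the expected outputs follow the four cases listed in the docstring:

import mxnet as mx

x = mx.nd.array([[1.0, 2.0, 3.0]])
print(cumsum(mx.nd, x))                                # expect [[1, 3, 6]]
print(cumsum(mx.nd, x, exclusive=True))                # expect [[0, 1, 3]]
print(cumsum(mx.nd, x, reverse=True))                  # expect [[6, 5, 3]]
print(cumsum(mx.nd, x, reverse=True, exclusive=True))  # expect [[5, 3, 0]]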
Example #8
 def f_inv(self, y: Tensor) -> Tensor:
     return y.clip(-np.inf, 30).exp()
Example #9
 def f(self, x: Tensor) -> Tensor:
     F = getF(x)
     return F.Activation(x.clip(-100.0, np.inf), act_type="softrelu")
Example #10
 def log_abs_det_jac(self, x: Tensor, y: Tensor) -> Tensor:
     return y.clip(1.0e-20, np.inf).log()
Example #11
 def f(self, x: Tensor) -> Tensor:
     return x.clip(1.0e-20, np.inf).log()
Example #12
 def f_inv(self, y: Tensor) -> Tensor:
     return y.clip(1.0e-20, np.inf).log()
Example #13
 def f(self, x: Tensor) -> Tensor:
     return x.clip(-np.inf, 30).exp()
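
Examples #8 through #13 are the forward and inverse maps of log/exp bijections; the clipping keeps exp from overflowing and keeps log away from non-positive inputs. A small sketch of that rationale with plain NDArrays:

import mxnet as mx
import numpy as np

x = mx.nd.array([-1e3, 0.0, 1e3])
print(x.clip(-np.inf, 30).exp())      # the exp argument is capped at 30, so no overflow
y = mx.nd.array([0.0, 1.0, 1e3])
print(y.clip(1.0e-20, np.inf).log())  # log never sees a non-positive value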
Example #14
    def hybrid_forward(
        self,
        F,
        data: Tensor,
        observed_indicator: Tensor,
        scale: Optional[Tensor],
        rep_params: List[Tensor],
        **kwargs,
    ) -> Tuple[Tensor, Tensor, List[Tensor]]:
        data_np = data.asnumpy()
        observed_indicator_np = observed_indicator.astype("int32").asnumpy()

        if scale is None:
            # Even though local binning implicitly scales the data, we still return the scale as an input to the model.
            scale = F.expand_dims(
                F.sum(data * observed_indicator, axis=-1) /
                F.sum(observed_indicator, axis=-1),
                -1,
            )

            bin_centers_hyb = np.ones((len(data), self.num_bins)) * (-1)
            bin_edges_hyb = np.ones((len(data), self.num_bins + 1)) * (-1)

            # Every time series needs to be binned individually
            for i in range(len(data_np)):
                # Identify observed data points.
                data_loc = data_np[i]
                observed_indicator_loc = observed_indicator_np[i]
                data_obs_loc = data_loc[observed_indicator_loc == 1]

                if data_obs_loc.size > 0:
                    # Calculate time series specific bin centers and edges.
                    if self.is_quantile:
                        bin_centers_loc = np.quantile(
                            data_obs_loc, np.linspace(0, 1, self.num_bins))
                    else:
                        bin_centers_loc = np.linspace(
                            np.min(data_obs_loc),
                            np.max(data_obs_loc),
                            self.num_bins,
                        )
                    bin_centers_hyb[i] = ensure_binning_monotonicity(
                        bin_centers_loc)
                    bin_edges_hyb[i] = bin_edges_from_bin_centers(
                        bin_centers_hyb[i])

                    # Bin the time series.
                    data_obs_loc_binned = np.digitize(data_obs_loc,
                                                      bins=bin_edges_hyb[i],
                                                      right=False)
                else:
                    data_obs_loc_binned = []

                # Write the binned time series back into the data array.
                data_loc[observed_indicator_loc == 1] = data_obs_loc_binned
                data_np[i] = data_loc

        else:
            bin_centers_hyb = rep_params[0].asnumpy()
            bin_edges_hyb = rep_params[1].asnumpy()

            bin_edges_hyb = np.repeat(
                bin_edges_hyb,
                len(data_np) // len(bin_edges_hyb),
                axis=0,
            )
            bin_centers_hyb = np.repeat(
                bin_centers_hyb,
                len(data_np) // len(bin_centers_hyb),
                axis=0,
            )

            for i in range(len(data_np)):
                data_loc = data_np[i]
                observed_indicator_loc = observed_indicator_np[i]
                data_obs_loc = data_loc[observed_indicator_loc == 1]

                # Bin the time series based on previously computed bin edges.
                data_obs_loc_binned = np.digitize(data_obs_loc,
                                                  bins=bin_edges_hyb[i],
                                                  right=False)

                data_loc[observed_indicator_loc == 1] = data_obs_loc_binned
                data_np[i] = data_loc

        bin_centers_hyb = F.array(bin_centers_hyb)
        bin_edges_hyb = F.array(bin_edges_hyb)

        data = mx.nd.array(data_np)

        return data, scale, [bin_centers_hyb, bin_edges_hyb]
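
Example #14 relies on gluonts helpers that are not shown here (ensure_binning_monotonicity, bin_edges_from_bin_centers). The following standalone numpy sketch illustrates the same quantile-binning idea on hypothetical data, with simple midpoint edges standing in for the real helpers:

import numpy as np

data_obs = np.array([0.1, 0.4, 0.7, 1.3, 2.0, 5.0])
num_bins = 4
# bin centers at evenly spaced quantiles of the observed values
bin_centers = np.quantile(data_obs, np.linspace(0, 1, num_bins))
# midpoint edges, open-ended on both sides (a simplification, not the gluonts helper)
bin_edges = np.concatenate(
    ([-np.inf], (bin_centers[1:] + bin_centers[:-1]) / 2, [np.inf]))
binned = np.digitize(data_obs, bins=bin_edges, right=False)
print(bin_centers)
print(binned)  # integer bin index for every observed value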
Example #15
    def sampling_decoder(
        self,
        F,
        static_feat: Tensor,
        past_target: Tensor,
        time_feat: Tensor,
        scale: Tensor,
        begin_states: List,
    ) -> Tensor:
        """
        Computes sample paths by unrolling the LSTM, starting from an initial
        input and state.

        Parameters
        ----------
        static_feat : Tensor
            static features. Shape: (batch_size, num_static_features).
        past_target : Tensor
            target history. Shape: (batch_size, history_length).
        time_feat : Tensor
            time features. Shape: (batch_size, prediction_length, num_time_features).
        scale : Tensor
            tensor containing the scale of each element in the batch. Shape: (batch_size, 1, 1).
        begin_states : List
            list of initial states for the LSTM layers.
            the shape of each tensor of the list should be (batch_size, num_cells)
        Returns
        --------
        Tensor
            A tensor containing sampled paths.
            Shape: (batch_size, num_sample_paths, prediction_length).
        """

        # blow up the batch dimension of each tensor to batch_size * self.num_parallel_samples to increase parallelism
        repeated_past_target = past_target.repeat(
            repeats=self.num_parallel_samples, axis=0)
        repeated_time_feat = time_feat.repeat(
            repeats=self.num_parallel_samples, axis=0)
        repeated_static_feat = static_feat.repeat(
            repeats=self.num_parallel_samples, axis=0).expand_dims(axis=1)
        repeated_scale = scale.repeat(repeats=self.num_parallel_samples,
                                      axis=0)
        repeated_states = [
            s.repeat(repeats=self.num_parallel_samples, axis=0)
            for s in begin_states
        ]

        future_samples = []

        # for each future time unit, draw new samples and update the state
        for k in range(self.prediction_length):
            # (batch_size * num_samples, 1, *target_shape, num_lags)
            lags = self.get_lagged_subsequences(
                F=F,
                sequence=repeated_past_target,
                sequence_length=self.history_length + k,
                indices=self.shifted_lags,
                subsequences_length=1,
            )

            # (batch_size * num_samples, 1, *target_shape, num_lags)
            lags_scaled = F.broadcast_div(lags,
                                          repeated_scale.expand_dims(axis=-1))

            # from (batch_size * num_samples, 1, *target_shape, num_lags)
            # to (batch_size * num_samples, 1, prod(target_shape) * num_lags)
            input_lags = F.reshape(
                data=lags_scaled,
                shape=(-1, 1, prod(self.target_shape) * len(self.lags_seq)),
            )

            # (batch_size * num_samples, 1, prod(target_shape) * num_lags + num_time_features + num_static_features)
            decoder_input = F.concat(
                input_lags,
                repeated_time_feat.slice_axis(axis=1, begin=k, end=k + 1),
                repeated_static_feat,
                dim=-1,
            )

            # output shape: (batch_size * num_samples, 1, num_cells)
            # state shape: (batch_size * num_samples, num_cells)
            rnn_outputs, repeated_states = self.rnn.unroll(
                inputs=decoder_input,
                length=1,
                begin_state=repeated_states,
                layout="NTC",
                merge_outputs=True,
            )

            distr_args = self.proj_distr_args(rnn_outputs)

            # compute likelihood of target given the predicted parameters
            distr = self.distr_output.distribution(distr_args,
                                                   scale=repeated_scale)

            # (batch_size * num_samples, 1, *target_shape)
            new_samples = distr.sample()

            # (batch_size * num_samples, seq_len, *target_shape)
            repeated_past_target = F.concat(repeated_past_target,
                                            new_samples,
                                            dim=1)
            future_samples.append(new_samples)

        # (batch_size * num_samples, prediction_length, *target_shape)
        samples = F.concat(*future_samples, dim=1)

        # (batch_size, num_samples, prediction_length, *target_shape)
        return samples.reshape(shape=((-1, self.num_parallel_samples) +
                                      (self.prediction_length, ) +
                                      self.target_shape))
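
The decoder above uses a repeat/reshape convention: every input is repeated num_parallel_samples times along the batch axis, sampling runs on that blown-up batch, and the final reshape groups the paths back per original batch element. A minimal sketch of that convention with hypothetical sizes:

import mxnet as mx

# batch_size=2, num_parallel_samples=3
x = mx.nd.array([[10.0], [20.0]])   # (batch_size, 1)
rep = x.repeat(repeats=3, axis=0)   # (6, 1): three consecutive copies per batch element
back = rep.reshape((-1, 3, 1))      # (2, 3, 1): (batch_size, num_samples, ...)
print(rep.shape, back.shape)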
Example #16
    def hybrid_forward(self, F, past_target: Tensor,
                       past_observed_values: Tensor) -> Tensor:
        """
        Given the tensor `past_target`, first normalize it using
        `past_observed_values`, an indicator tensor with 0 or 1 values, then
        output the result of LSTNet.

        Parameters
        ----------
        F
        past_target
            Tensor of shape (batch_size, num_series, context_length)
        past_observed_values
            Tensor of shape (batch_size, num_series, context_length)

        Returns
        -------
        Tensor
            Shape (batch_size, num_series, 1) if `horizon` was specified,
            or (batch_size, num_series, prediction_length) if
            `prediction_length` was provided.
        """

        scaled_past_target, _ = self.scaler(
            past_target.slice_axis(axis=2,
                                   begin=-self.context_length,
                                   end=None),
            past_observed_values.slice_axis(axis=2,
                                            begin=-self.context_length,
                                            end=None),
        )
        c = self.cnn(scaled_past_target)
        c = self.dropout(c)
        c = F.transpose(c, axes=(0, 2, 1))  # NTC

        if F is mx.ndarray:
            ctx = (c.context
                   if isinstance(c, mx.gluon.tensor_types) else c[0].context)
            with ctx:
                rnn_begin_state = self.rnn.begin_state(func=F.zeros,
                                                       dtype=self.dtype,
                                                       batch_size=c.shape[0])
        else:
            rnn_begin_state = self.rnn.begin_state(func=F.zeros,
                                                   dtype=self.dtype,
                                                   batch_size=0)
        r, _ = self.rnn.unroll(
            inputs=c,
            length=min(self.conv_out, self.context_length),
            layout="NTC",
            merge_outputs=True,
            begin_state=rnn_begin_state,
        )
        r = F.squeeze(F.slice_axis(r, axis=1, begin=-1, end=None),
                      axis=1)  # NC
        s = self._skip_rnn_layer(F, c)
        # make fc broadcastable for output
        fc = self.fc(F.concat(r, s,
                              dim=1)).expand_dims(axis=2)  # N x num_series x 1
        if self.prediction_length:
            fc = F.tile(fc, reps=(
                1, 1,
                self.prediction_length))  # N x num_series x prediction_length
        ar = self._ar_highway(F, past_target)
        out = fc + ar
        if self.output_activation is None:
            return out
        return (F.sigmoid(out)
                if self.output_activation == "sigmoid" else F.tanh(out))
Example #17
def plot_samples(s: Tensor, bins: int = 100) -> None:
    from matplotlib import pyplot as plt

    s = s.asnumpy()
    plt.hist(s, bins=bins)
    plt.show()
Example #18
    def quantile_internal(
        self, x: Tensor, axis: Optional[int] = None
    ) -> Tensor:
        r"""
        Evaluates the quantile function at the quantile levels contained in `x`.

        Parameters
        ----------
        x
            Tensor of shape ``*gamma.shape`` if axis=None, or containing an
            additional axis on the specified position, otherwise.
        axis
            Index of the axis containing the different quantile levels which
            are to be computed.

        Returns
        -------
        Tensor
            Quantiles tensor, of the same shape as x.
        """

        F = self.F

        # shapes of self
        # self.gamma: (*batch_shape)
        # self.knot_positions, self.b: (*batch_shape, num_pieces)

        # axis=None - passed at inference when num_samples is None
        # The shape of x is (*batch_shape).
        # The shapes of the parameters should be:
        # gamma: (*batch_shape), knot_positions, b: (*batch_shape, num_pieces)
        # They match the self. counterparts so no reshaping is needed

        # axis=0 - passed at inference when num_samples is not None
        # The shape of x is (num_samples, *batch_shape).
        # The shapes of the parameters should be:
        # gamma: (num_samples, *batch_shape), knot_positions, b: (num_samples, *batch_shape, num_pieces),
        # They do not match the self. counterparts and we need to expand the axis=0 to all of them.

        # axis=-2 - passed at training when we evaluate quantiles at knot_positions in order to compute a_tilde
        # The shape of x is shape(x) = shape(knot_positions) = (*batch_shape, num_pieces).
        # The shapes of the parameters should be:
        # gamma: (*batch_shape, 1), knot_positions: (*batch_shape, 1, num_pieces), b: (*batch_shape, 1, num_pieces)
        # They do not match the self. counterparts and we need to expand axis=-1 for gamma and axis=-2 for the rest.

        if axis is not None:
            gamma = self.gamma.expand_dims(axis=axis if axis == 0 else -1)
            knot_positions = self.knot_positions.expand_dims(axis=axis)
            b = self.b.expand_dims(axis=axis)
        else:
            gamma, knot_positions, b = self.gamma, self.knot_positions, self.b

        x_minus_knots = F.broadcast_minus(
            x.expand_dims(axis=-1), knot_positions
        )

        quantile = F.broadcast_add(
            gamma, F.sum(F.broadcast_mul(b, F.relu(x_minus_knots)), axis=-1)
        )

        return quantile
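
The body of quantile_internal evaluates a piecewise-linear spline quantile function, q(x) = gamma + sum_l b_l * relu(x - knot_l). A numeric sketch with hypothetical parameters:

import mxnet as mx

F = mx.nd
gamma = mx.nd.array([1.0])                   # quantile at level 0
b = mx.nd.array([[2.0, 3.0]])                # slopes added at each knot
knot_positions = mx.nd.array([[0.0, 0.5]])   # knots on the quantile-level axis
x = mx.nd.array([0.75])                      # level at which to evaluate
x_minus_knots = F.broadcast_minus(x.expand_dims(axis=-1), knot_positions)
q = F.broadcast_add(gamma, F.sum(F.broadcast_mul(b, F.relu(x_minus_knots)), axis=-1))
print(q)  # expect [3.25] = 1.0 + 2.0 * 0.75 + 3.0 * 0.25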
Example #19
 def cdf(self, x: Tensor) -> Tensor:
     F = self.F
     x = x.expand_dims(axis=-1)
     # left_edges = self.bin_edges.slice_axis(axis=-1, begin=0, end=-1)
     mask = F.broadcast_lesser_equal(self.bin_centers, x)
     return F.broadcast_mul(self.bin_probs, mask).sum(axis=-1)
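
The CDF above sums the probabilities of all bins whose centers lie at or below x. A numeric sketch with a hypothetical binned distribution:

import mxnet as mx

bin_centers = mx.nd.array([0.0, 1.0, 2.0, 3.0])
bin_probs = mx.nd.array([0.1, 0.2, 0.3, 0.4])
x = mx.nd.array([1.5]).expand_dims(axis=-1)
mask = mx.nd.broadcast_lesser_equal(bin_centers, x)       # bins with center <= x
print(mx.nd.broadcast_mul(bin_probs, mask).sum(axis=-1))  # expect [0.3] = 0.1 + 0.2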
Example #20
    def hybrid_forward(
        self,
        F,
        feat_static_cat: Tensor,
        feat_static_real: Tensor,
        past_time_feat: Tensor,
        past_target: Tensor,
        past_observed_values: Tensor,
        future_time_feat: Tensor,
        future_target: Tensor,
        future_observed_values: Tensor,
    ) -> Tensor:
        if self.ignore_future_targets:

            distr = self.distribution(
                feat_static_cat=feat_static_cat,
                feat_static_real=feat_static_real,
                past_time_feat=past_time_feat,
                past_target=past_target,
                past_observed_values=past_observed_values,
                future_time_feat=None,
                future_target=None,
                future_observed_values=future_observed_values,
            )

            loss = distr.loss(
                past_target.slice_axis(
                    axis=1,
                    begin=self.history_length - self.context_length,
                    end=None,
                ))

            # (batch_size, seq_len, *target_shape)
            observed_values = past_observed_values.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=self.history_length,
            )

        else:

            distr = self.distribution(
                feat_static_cat=feat_static_cat,
                feat_static_real=feat_static_real,
                past_time_feat=past_time_feat,
                past_target=past_target,
                past_observed_values=past_observed_values,
                future_time_feat=future_time_feat,
                future_target=future_target,
                future_observed_values=future_observed_values,
            )

            # put together target sequence
            # (batch_size, seq_len, *target_shape)
            target = F.concat(
                past_target.slice_axis(
                    axis=1,
                    begin=self.history_length - self.context_length,
                    end=None,
                ),
                future_target,
                dim=1,
            )

            # (batch_size, seq_len)
            loss = distr.loss(target)

            # (batch_size, seq_len, *target_shape)
            observed_values = F.concat(
                past_observed_values.slice_axis(
                    axis=1,
                    begin=self.history_length - self.context_length,
                    end=self.history_length,
                ),
                future_observed_values,
                dim=1,
            )

        # mask the loss at one time step iff one or more observations are missing in the target dimensions
        # (batch_size, seq_len)
        loss_weights = (observed_values if (len(self.target_shape) == 0) else
                        observed_values.min(axis=-1, keepdims=False))

        weighted_loss = weighted_average(F=F,
                                         x=loss,
                                         weights=loss_weights,
                                         axis=1)

        total_loss = F.sum(weighted_loss) / weighted_loss.shape[0]
        print_string = f'Forecasting loss: {total_loss.asscalar()}'

        # add self-supervised reconciliation loss
        if self.self_supervised_penalty > 0:
            agg_preds = F.take(distr.mean,
                               F.array(list(self.hierarchy_agg_dict.keys())))
            disagg_preds = F.concat(*[
                F.sum(F.take(distr.mean, F.array(disagg_idxs)), axis=0)
                for disagg_idxs in self.hierarchy_agg_dict.values()
            ],
                                    dim=0).reshape(agg_preds.shape)
            f_loss = F.sum(F.square(agg_preds - F.sum(disagg_preds, axis=0)))

        # add embedding reconciliation loss
        if self.embedding_agg_penalty > 0:
            embedded = self.embedder(
                F.expand_dims(F.array([i for i in range(self.cardinality[0])]),
                              axis=1))

            agg_embeds = F.take(embedded,
                                F.array(list(self.hierarchy_agg_dict.keys())))
            agg_copies = agg_embeds.copy().detach()

            disagg_embeds = [
                F.take(embedded, F.array(disagg_idxs))
                for disagg_idxs in self.hierarchy_agg_dict.values()
            ]
            disagg_lens = [len(disagg) for disagg in disagg_embeds]
            max_len = max(disagg_lens) + 1
            dim = embedded.shape[1]
            disagg_embeds = [
                F.concat(*[
                    disagg,
                    F.tile(agg, max_len - disagg.shape[0]).reshape(-1, dim)
                ],
                         dim=0).reshape(-1, dim).expand_dims(axis=0)
                for agg, disagg in zip(agg_copies, disagg_embeds)
            ]
            disagg_embeds = F.concat(*disagg_embeds, dim=0)

            if self.embedding_dist_metric == 'cosine':
                agg_embeds = F.L2Normalization(agg_embeds).expand_dims(axis=2)
                disagg_embeds = F.L2Normalization(disagg_embeds,
                                                  mode='spatial')
                e_loss = 1 - F.batch_dot(disagg_embeds, agg_embeds)
            else:
                agg_embeds = agg_embeds.expand_dims(axis=1)
                stability_constant = 1e-7
                e_loss = F.norm(agg_embeds - disagg_embeds +
                                stability_constant,
                                axis=2)
                e_loss = F.square(e_loss)

        if self.self_supervised_penalty > 0:
            total_f_loss = F.sum(f_loss) / weighted_loss.shape[0] / len(
                self.hierarchy_agg_dict)
            total_loss = total_loss + total_f_loss * F.array(
                [self.self_supervised_penalty])

        if self.embedding_agg_penalty > 0:
            total_e_loss = F.sum(e_loss) / len(self.hierarchy_agg_dict)
            total_loss = total_loss + total_e_loss * F.array(
                [self.embedding_agg_penalty])

        # print forecasting/reconciliation loss at each step
        if self.print_rec_penalty:

            if self.self_supervised_penalty > 0:
                print_string = print_string + f', Self-supervised Loss: {total_f_loss.asscalar()}'

            if self.embedding_agg_penalty > 0:
                print_string = print_string + f', Embedding agg Loss: {total_e_loss.asscalar()}'

            print(print_string)

        return total_loss, loss
Example #21
    def exact_inference(
        self, x_train: Tensor, y_train: Tensor, x_test: Tensor
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """
        Parameters
        ----------
        x_train
            Training set of features of shape (batch_size, context_length, num_features).
        y_train
            Training labels of shape (batch_size, context_length).
        x_test
            Test set of features of shape (batch_size, prediction_length, num_features).
        Returns
        -------
        Tuple
            Tensor
                Predictive GP samples of shape (batch_size, prediction_length, num_samples).
            Tensor
                Predictive mean of the GP of shape (batch_size, prediction_length).
            Tensor
                Predictive standard deviation of the GP of shape (batch_size, prediction_length).
        """
        assert (
            self.context_length is not None
        ), "The value of `context_length` must be set."
        assert (
            self.prediction_length is not None
        ), "The value of `prediction_length` must be set."
        # Compute Cholesky factorization of training kernel matrix
        l_train = self._compute_cholesky_gp(
            self.kernel.kernel_matrix(x_train, x_train), self.context_length
        )

        lower_tri_solve = self.F.linalg.trsm(
            l_train, self.kernel.kernel_matrix(x_train, x_test)
        )
        predictive_mean = self.F.linalg.gemm2(
            lower_tri_solve,
            self.F.linalg.trsm(l_train, y_train.expand_dims(axis=-1)),
            transpose_a=True,
        ).squeeze(axis=-1)
        # Can rewrite the second term as
        # :math:`||L^{-1} K(x_train, x_test)||_2^2`
        # and only solve one equation
        predictive_covariance = self.kernel.kernel_matrix(
            x_test, x_test
        ) - self.F.linalg.gemm2(
            lower_tri_solve, lower_tri_solve, transpose_a=True
        )
        # Extract diagonal entries of covariance matrix
        predictive_std = batch_diagonal(
            self.F,
            predictive_covariance,
            self.prediction_length,
            self.ctx,
            self.float_type,
        )
        # If self.sample_noise = True, predictive covariance has sigma^2 on the diagonal
        if self.sample_noise:
            predictive_std = self.F.broadcast_add(
                predictive_std, self.sigma ** 2
            )
        predictive_std = self.F.sqrt(predictive_std).squeeze(axis=-1)
        # Compute sample from GP predictive distribution
        return (
            self.sample(predictive_mean, predictive_covariance),
            predictive_mean,
            predictive_std,
        )
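
For reference, the Cholesky-based computation in exact_inference corresponds to the usual Gaussian process posterior, with L the Cholesky factor of the training kernel matrix:

.. math::

   L L^\top = K(X, X), \qquad
   v = L^{-1} K(X, X_*),

   \mu_* = v^\top \left( L^{-1} y \right), \qquad
   \Sigma_* = K(X_*, X_*) - v^\top v.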
Example #22
def kalman_filter_step(
    F,
    target: Tensor,
    prior_mean: Tensor,
    prior_cov: Tensor,
    emission_coeff: Tensor,
    residual: Tensor,
    noise_std: Tensor,
    latent_dim: int,
    output_dim: int,
):
    """
    One step of the Kalman filter.

    This function computes the filtered state (mean and covariance) given the
    linear system coefficients, the prior state (mean and covariance), as well
    as the observations.

    Parameters
    ----------
    F
    target
        Observations of the system output, shape (batch_size, output_dim)
    prior_mean
        Prior mean of the latent state, shape (batch_size, latent_dim)
    prior_cov
        Prior covariance of the latent state, shape
        (batch_size, latent_dim, latent_dim)
    emission_coeff
        Emission coefficient, shape (batch_size, output_dim, latent_dim)
    residual
        Residual component, shape (batch_size, output_dim)
    noise_std
        Standard deviation of the output noise, shape (batch_size, output_dim)
    latent_dim
        Dimension of the latent state vector
    output_dim
        Dimension of the observed output vector
    Returns
    -------
    Tensor
        Filtered_mean, shape (batch_size, latent_dim)
    Tensor
        Filtered_covariance, shape (batch_size, latent_dim, latent_dim)
    Tensor
        Log probability, shape (batch_size, )
    """
    # output_mean: mean of the target, shape (batch_size, output_dim)
    output_mean = F.linalg_gemm2(
        emission_coeff, prior_mean.expand_dims(axis=-1)
    ).squeeze(axis=-1)

    # noise covariance
    noise_cov = make_nd_diag(F=F, x=noise_std * noise_std, d=output_dim)

    S_hh_x_A_tr = F.linalg_gemm2(prior_cov, emission_coeff, transpose_b=True)

    # covariance of the target
    output_cov = F.linalg_gemm2(emission_coeff, S_hh_x_A_tr) + noise_cov

    # compute the Cholesky decomposition output_cov = LL^T
    L_output_cov = F.linalg_potrf(output_cov)

    # Compute Kalman gain matrix K:
    # K = S_hh X with X = A^T output_cov^{-1}
    # We have X = A^T output_cov^{-1} => X output_cov = A^T => X LL^T = A^T
    # We can thus obtain X by solving two linear systems involving L
    kalman_gain = F.linalg_trsm(
        L_output_cov,
        F.linalg_trsm(
            L_output_cov, S_hh_x_A_tr, rightside=True, transpose=True
        ),
        rightside=True,
    )

    # compute the error
    target_minus_residual = target - residual
    delta = target_minus_residual - output_mean

    # filtered estimates
    filtered_mean = prior_mean.expand_dims(axis=-1) + F.linalg_gemm2(
        kalman_gain, delta.expand_dims(axis=-1)
    )
    filtered_mean = filtered_mean.squeeze(axis=-1)

    # Joseph's symmetrized update for covariance:
    ImKA = F.broadcast_sub(
        F.eye(latent_dim), F.linalg_gemm2(kalman_gain, emission_coeff)
    )

    filtered_cov = F.linalg_gemm2(
        ImKA, F.linalg_gemm2(prior_cov, ImKA, transpose_b=True)
    ) + F.linalg_gemm2(
        kalman_gain, F.linalg_gemm2(noise_cov, kalman_gain, transpose_b=True)
    )

    # likelihood term: (batch_size,)
    log_p = MultivariateGaussian(output_mean, L_output_cov).log_prob(
        target_minus_residual
    )

    return filtered_mean, filtered_cov, log_p
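
Writing m, P for the prior state, A for emission_coeff, r for residual, R for the diagonal observation-noise covariance, and z for the target, the step above is the standard Kalman update with a Joseph-form covariance:

.. math::

   S = A P A^\top + R, \qquad
   K = P A^\top S^{-1},

   m_f = m + K (z - r - A m), \qquad
   P_f = (I - K A) P (I - K A)^\top + K R K^\top,

   \log p = \log \mathcal{N}(z - r \mid A m, S).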
Example #23
    def hybrid_forward(
        self,
        F,
        feat_static_cat: Tensor,
        past_target: Tensor,
        past_observed_values: Tensor,
        past_time_feat: Tensor,
        future_time_feat: Tensor,
        scale: Tensor,
    ) -> Tensor:
        """
        Computes prediction samples for the WaveNet model.

        Parameters
        ----------
        F
        feat_static_cat
            Static categorical features: (batch_size, num_cat_features)
        past_target
            Past target: (batch_size, receptive_field)
        past_observed_values
            Observed value indicator for the past target: (batch_size, receptive_field)
        past_time_feat
            Past time features: (batch_size, num_time_features, receptive_field)
        future_time_feat
            Future time features: (batch_size, num_time_features, pred_length)
        scale
            scale of the time series: (batch_size, 1)

        Returns
        -------
        Tensor
            Prediction samples with shape (batch_size, num_samples, pred_length)
        """

        def blow_up(u):
            """
            Expand to (batch_size x num_samples)
            """
            return F.repeat(u, repeats=self.num_samples, axis=0)

        past_target = past_target.astype("int32")
        full_features = self.get_full_features(
            F,
            feat_static_cat=feat_static_cat,
            past_observed_values=past_observed_values,
            past_time_feat=past_time_feat,
            future_time_feat=future_time_feat,
            future_observed_values=None,
            scale=scale,
        )

        # To compute queues for the first step, we need features from
        # -self.pred_length - self.receptive_field + 1 to -self.pred_length + 1
        features_end_ix = (
            -self.pred_length + 1 if self.pred_length > 1 else None
        )
        queues = self.get_initial_conv_queues(
            F,
            past_target=F.slice_axis(
                past_target, begin=-self.receptive_field, end=None, axis=-1
            ),
            features=F.slice_axis(
                full_features,
                begin=-self.pred_length - self.receptive_field + 1,
                end=features_end_ix,
                axis=-1,
            ),
        )
        queues = [blow_up(queue) for queue in queues]

        res = F.slice_axis(past_target, begin=-2, end=None, axis=-1)
        res = blow_up(res)
        for n in range(self.pred_length):
            # Generate one-step ahead predictions. The input consists of target and features
            # corresponding to the last two time steps.
            current_target = F.slice_axis(res, begin=-2, end=None, axis=-1)
            current_features = F.slice_axis(
                full_features,
                begin=self.receptive_field + n - 1,
                end=self.receptive_field + n + 1,
                axis=-1,
            )
            embedding = self.target_feature_embedding(
                F, target=current_target, features=blow_up(current_features),
            )

            # (batch_size, 1, num_bins) where 1 corresponds to the time axis.
            unnormalized_outputs, queues = self.base_net(
                F, embedding, one_step_prediction=True, queues=queues
            )
            if self.temperature > 0:
                # (batch_size, 1, num_bins) where 1 corresponds to the time axis.
                probs = F.softmax(
                    unnormalized_outputs / self.temperature, axis=-1
                )
                # (batch_size, 1)
                y = F.sample_multinomial(probs)
            else:
                # (batch_size, 1)
                y = F.argmax(unnormalized_outputs, axis=-1)
            y = y.astype("int32")
            res = F.concat(res, y, num_args=2, dim=-1)
        samples = F.slice_axis(res, begin=-self.pred_length, end=None, axis=-1)
        samples = samples.reshape(
            shape=(-1, self.num_samples, self.pred_length)
        )
        samples = self.post_transform(samples)
        samples = F.broadcast_mul(scale.expand_dims(axis=1), samples)
        return samples
Example #24
 def hybrid_forward(self, F, x: Tensor) -> Tuple[Tensor]:
     return (self.value * F.ones_like(x.sum(axis=-1)), )
Example #25
 def s(low: Tensor, high: Tensor) -> Tensor:
     raw_samples = self.F.sample_uniform(low=low.zeros_like(),
                                         high=high.ones_like(),
                                         dtype=dtype)
     return low + raw_samples * (high - low)
Example #26
    def unroll_encoder(
        self,
        F,
        feat_static_cat: Tensor,  # (batch_size, num_features)
        feat_static_real: Tensor,  # (batch_size, num_features)
        past_time_feat: Tensor,  # (batch_size, history_length, num_features)
        past_target: Tensor,  # (batch_size, history_length, *target_shape)
        past_observed_values:
        Tensor,  # (batch_size, history_length, *target_shape)
        future_time_feat: Optional[
            Tensor],  # (batch_size, prediction_length, num_features)
        future_target: Optional[
            Tensor],  # (batch_size, prediction_length, *target_shape)
    ) -> Tuple[Tensor, List, Tensor, Tensor]:
        """
        Unrolls the LSTM encoder over past and, if present, future data.
        Returns outputs and state of the encoder, plus the scale of past_target
        and a vector of static features that was constructed and fed as input
        to the encoder.
        All tensor arguments should have NTC layout.
        """

        if future_time_feat is None or future_target is None:
            time_feat = past_time_feat.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            )
            sequence = past_target
            sequence_length = self.history_length
            subsequences_length = self.context_length
        else:
            time_feat = F.concat(
                past_time_feat.slice_axis(
                    axis=1,
                    begin=self.history_length - self.context_length,
                    end=None,
                ),
                future_time_feat,
                dim=1,
            )
            sequence = F.concat(past_target, future_target, dim=1)
            sequence_length = self.history_length + self.prediction_length
            subsequences_length = self.context_length + self.prediction_length

        # (batch_size, sub_seq_len, *target_shape, num_lags)
        lags = self.get_lagged_subsequences(
            F=F,
            sequence=sequence,
            sequence_length=sequence_length,
            indices=self.lags_seq,
            subsequences_length=subsequences_length,
        )

        # scale is computed on the context length last units of the past target
        # scale shape is (batch_size, 1, *target_shape)
        _, scale = self.scaler(
            past_target.slice_axis(axis=1,
                                   begin=-self.context_length,
                                   end=None),
            past_observed_values.slice_axis(axis=1,
                                            begin=-self.context_length,
                                            end=None),
        )

        # (batch_size, num_features)
        embedded_cat = self.embedder(feat_static_cat)

        # in addition to embedding features, use the log scale as it can help
        # prediction too
        # (batch_size, num_features + prod(target_shape))
        static_feat = F.concat(
            embedded_cat,
            feat_static_real,
            F.log(scale) if len(self.target_shape) == 0 else F.log(
                scale.squeeze(axis=1)),
            dim=1,
        )

        # (batch_size, subsequences_length, num_features + 1)
        repeated_static_feat = static_feat.expand_dims(axis=1).repeat(
            axis=1, repeats=subsequences_length)

        # (batch_size, sub_seq_len, *target_shape, num_lags)
        lags_scaled = F.broadcast_div(lags, scale.expand_dims(axis=-1))

        # from (batch_size, sub_seq_len, *target_shape, num_lags)
        # to (batch_size, sub_seq_len, prod(target_shape) * num_lags)
        input_lags = F.reshape(
            data=lags_scaled,
            shape=(
                -1,
                subsequences_length,
                len(self.lags_seq) * prod(self.target_shape),
            ),
        )

        # (batch_size, sub_seq_len, input_dim)
        inputs = F.concat(input_lags, time_feat, repeated_static_feat, dim=-1)

        # unroll encoder
        outputs, state = self.rnn.unroll(
            inputs=inputs,
            length=subsequences_length,
            layout="NTC",
            merge_outputs=True,
        )

        # outputs: (batch_size, seq_len, num_cells)
        # state: list of (batch_size, num_cells) tensors
        # scale: (batch_size, 1, *target_shape)
        # static_feat: (batch_size, num_features + prod(target_shape))
        return outputs, state, scale, static_feat
Example #27
 def process_dynamic_cat(self, F, feature: Tensor) -> Tensor:
     return self.embed_dynamic(feature.astype(self.dtype))
Example #28
    def hybrid_forward(
        self,
        F,
        feat_static_cat: Tensor,
        feat_static_real: Tensor,
        past_time_feat: Tensor,
        past_target: Tensor,
        past_observed_values: Tensor,
        future_time_feat: Tensor,
        future_target: Tensor,
        future_observed_values: Tensor,
    ) -> Tensor:
        """
        Computes the loss for training DeepAR. All input tensors representing
        time series have NTC layout.

        Parameters
        ----------
        F
        feat_static_cat : (batch_size, num_features)
        feat_static_real : (batch_size, num_features)
        past_time_feat : (batch_size, history_length, num_features)
        past_target : (batch_size, history_length, *target_shape)
        past_observed_values : (batch_size, history_length, *target_shape)
        future_time_feat : (batch_size, prediction_length, num_features)
        future_target : (batch_size, prediction_length, *target_shape)
        future_observed_values : (batch_size, prediction_length, *target_shape)

        Returns
        -------
        Loss with shape (batch_size, context + prediction_length, 1)
        """

        distr = self.distribution(
            feat_static_cat=feat_static_cat,
            feat_static_real=feat_static_real,
            past_time_feat=past_time_feat,
            past_target=past_target,
            past_observed_values=past_observed_values,
            future_time_feat=future_time_feat,
            future_target=future_target,
            future_observed_values=future_observed_values,
        )

        # put together target sequence
        # (batch_size, seq_len, *target_shape)
        target = F.concat(
            past_target.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            ),
            future_target,
            dim=1,
        )

        # (batch_size, seq_len)
        loss = distr.loss(target)

        # (batch_size, seq_len, *target_shape)
        observed_values = F.concat(
            past_observed_values.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=self.history_length,
            ),
            future_observed_values,
            dim=1,
        )

        # mask the loss at one time step iff one or more observations are missing in the target dimensions
        # (batch_size, seq_len)
        loss_weights = (observed_values if (len(self.target_shape) == 0) else
                        observed_values.min(axis=-1, keepdims=False))

        weighted_loss = weighted_average(F=F,
                                         x=loss,
                                         weights=loss_weights,
                                         axis=1)

        return weighted_loss, loss
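
weighted_average is a gluonts utility that is not reproduced here; conceptually it averages the per-step loss over the time axis while ignoring steps whose observation indicator is zero. A minimal sketch of that masked-averaging idea (not the library implementation):

import mxnet as mx

loss = mx.nd.array([[1.0, 2.0, 3.0]])
weights = mx.nd.array([[1.0, 0.0, 1.0]])   # 0 marks a missing observation
masked_mean = (loss * weights).sum(axis=1) / mx.nd.maximum(weights.sum(axis=1), 1.0)
print(masked_mean)  # expect [2.0]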
Example #29
    def kalman_filter(self, targets: Tensor,
                      observed: Tensor) -> Tuple[Tensor, ...]:
        """
        Performs Kalman filtering given observations.


        Parameters
        ----------
        targets
            Observations, shape (batch_size, seq_length, output_dim)
        observed
            Flag tensor indicating which observations are genuine (1.0) and
            which are missing (0.0)

        Returns
        -------
        Tensor
            Log probabilities, shape (batch_size, seq_length)
        Tensor
            Mean of p(l_T | l_{T-1}), where T is seq_length, with shape
            (batch_size, latent_dim)
        Tensor
            Covariance of p(l_T | l_{T-1}), where T is seq_length, with shape
            (batch_size, latent_dim, latent_dim)
        """
        F = self.F
        # targets[t]: (batch_size, obs_dim)
        targets = targets.split(axis=1,
                                num_outputs=self.seq_length,
                                squeeze_axis=True)

        log_p_seq = []

        mean = self.prior_mean
        cov = self.prior_cov

        observed = (observed.split(
            axis=1, num_outputs=self.seq_length, squeeze_axis=True)
                    if observed is not None else None)

        for t in range(self.seq_length):
            # Compute the filtered distribution
            #   p(l_t | z_1, ..., z_t)
            # and the log-probability
            #   log p(z_t | z_1, ..., z_{t - 1})
            filtered_mean, filtered_cov, log_p = kalman_filter_step(
                F,
                target=targets[t],
                prior_mean=mean,
                prior_cov=cov,
                emission_coeff=self.emission_coeff[t],
                residual=self.residuals[t],
                noise_std=self.noise_std[t],
                latent_dim=self.latent_dim,
                output_dim=self.output_dim,
            )

            log_p_seq.append(log_p.expand_dims(axis=1))

            # Mean of p(l_{t+1} | l_t)
            mean = F.linalg_gemm2(
                self.transition_coeff[t],
                (filtered_mean.expand_dims(axis=-1) if observed is None else
                 F.where(observed[t], x=filtered_mean, y=mean).expand_dims(
                     axis=-1)),
            ).squeeze(axis=-1)

            # Covariance of p(l_{t+1} | l_t)
            cov = F.linalg_gemm2(
                self.transition_coeff[t],
                F.linalg_gemm2(
                    (filtered_cov if observed is None else F.where(
                        observed[t], x=filtered_cov, y=cov)),
                    self.transition_coeff[t],
                    transpose_b=True,
                ),
            ) + F.linalg_gemm2(
                self.innovation_coeff[t],
                self.innovation_coeff[t],
                transpose_a=True,
            )

        # Return the sequence of log likelihoods, as well as the
        # final mean and covariance of p(l_T | l_{T-1}), where T is seq_length
        return F.concat(*log_p_seq, dim=1), mean, cov
Example #30
    def create_network_input(
            self,
            F,
            feat_static_cat: Tensor,  # (batch_size, num_features)
            past_time_feat: Tensor,  # (batch_size, history_length, num_features)
            past_target: Tensor,  # (batch_size, history_length, 1)
            past_observed_values: Tensor,  # (batch_size, history_length)
            future_time_feat: Optional[
                Tensor],  # (batch_size, prediction_length, num_features)
            future_target: Optional[Tensor],  # (batch_size, prediction_length)
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """
        Creates inputs for the transformer network.
        All tensor arguments should have NTC layout.
        """

        if future_time_feat is None or future_target is None:
            time_feat = past_time_feat.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            )
            sequence = past_target
            sequence_length = self.history_length
            subsequences_length = self.context_length
        else:
            time_feat = F.concat(
                past_time_feat.slice_axis(
                    axis=1,
                    begin=self.history_length - self.context_length,
                    end=None,
                ),
                future_time_feat,
                dim=1,
            )
            sequence = F.concat(past_target, future_target, dim=1)
            sequence_length = self.history_length + self.prediction_length
            subsequences_length = self.context_length + self.prediction_length

        # (batch_size, sub_seq_len, *target_shape, num_lags)
        lags = self.get_lagged_subsequences(
            F=F,
            sequence=sequence,
            sequence_length=sequence_length,
            indices=self.lags_seq,
            subsequences_length=subsequences_length,
        )

        # scale is computed on the context length last units of the past target
        # scale shape is (batch_size, 1, *target_shape)
        _, scale = self.scaler(
            past_target.slice_axis(axis=1,
                                   begin=-self.context_length,
                                   end=None),
            past_observed_values.slice_axis(axis=1,
                                            begin=-self.context_length,
                                            end=None),
        )
        embedded_cat = self.embedder(feat_static_cat)

        # in addition to embedding features, use the log scale as it can help prediction too
        # (batch_size, num_features + prod(target_shape))
        static_feat = F.concat(
            embedded_cat,
            F.log(scale) if len(self.target_shape) == 0 else F.log(
                scale.squeeze(axis=1)),
            dim=1,
        )

        repeated_static_feat = static_feat.expand_dims(axis=1).repeat(
            axis=1, repeats=subsequences_length)

        # (batch_size, sub_seq_len, *target_shape, num_lags)
        lags_scaled = F.broadcast_div(lags, scale.expand_dims(axis=-1))

        # from (batch_size, sub_seq_len, *target_shape, num_lags)
        # to (batch_size, sub_seq_len, prod(target_shape) * num_lags)
        input_lags = F.reshape(
            data=lags_scaled,
            shape=(
                -1,
                subsequences_length,
                len(self.lags_seq) * prod(self.target_shape),
            ),
        )

        # (batch_size, sub_seq_len, input_dim)
        inputs = F.concat(input_lags, time_feat, repeated_static_feat, dim=-1)

        return inputs, scale, static_feat