Example 1
    def sampling_decoder(
        self,
        F,
        static_feat: Tensor,
        past_target: Tensor,
        time_feat: Tensor,
        scale: Tensor,
        begin_states: List,
    ) -> Tensor:
        """
        Computes sample paths by unrolling the LSTM starting with an initial
        input and state.

        Parameters
        ----------
        static_feat : Tensor
            static features. Shape: (batch_size, num_static_features).
        past_target : Tensor
            target history. Shape: (batch_size, history_length).
        time_feat : Tensor
            time features. Shape: (batch_size, prediction_length, num_time_features).
        scale : Tensor
            tensor containing the scale of each element in the batch. Shape: (batch_size, 1, 1).
        begin_states : List
            list of initial states for the LSTM layers.
            the shape of each tensor of the list should be (batch_size, num_cells)
        Returns
        -------
        Tensor
            A tensor containing sampled paths.
            Shape: (batch_size, num_sample_paths, prediction_length).
        """
        # track gradients w.r.t. the input time features and target history so
        # they can be queried via autograd.grad below
        time_feat.attach_grad()
        past_target.attach_grad()
        with autograd.record():
            # expand each tensor along axis 0 to batch_size * self.num_parallel_samples so that sample paths are drawn in parallel
            repeated_past_target = past_target.repeat(
                repeats=self.num_parallel_samples, axis=0)
            repeated_time_feat = time_feat.repeat(
                repeats=self.num_parallel_samples, axis=0)
            repeated_static_feat = static_feat.repeat(
                repeats=self.num_parallel_samples, axis=0).expand_dims(axis=1)
            repeated_scale = scale.repeat(repeats=self.num_parallel_samples,
                                          axis=0)
            repeated_states = [
                s.repeat(repeats=self.num_parallel_samples, axis=0)
                for s in begin_states
            ]

            future_samples = []

            # for each future time step, draw new samples and update the state
            for k in range(self.prediction_length):
                # (batch_size * num_samples, 1, *target_shape, num_lags)
                lags = self.get_lagged_subsequences(
                    F=F,
                    sequence=repeated_past_target,
                    sequence_length=self.history_length + k,
                    indices=self.shifted_lags,
                    subsequences_length=1,
                )

                # (batch_size * num_samples, 1, *target_shape, num_lags)
                lags_scaled = F.broadcast_div(
                    lags, repeated_scale.expand_dims(axis=-1))

                # from (batch_size * num_samples, 1, *target_shape, num_lags)
                # to (batch_size * num_samples, 1, prod(target_shape) * num_lags)
                input_lags = F.reshape(
                    data=lags_scaled,
                    shape=(-1, 1,
                           prod(self.target_shape) * len(self.lags_seq)),
                )

                # (batch_size * num_samples, 1, prod(target_shape) * num_lags + num_time_features + num_static_features)
                decoder_input = F.concat(
                    input_lags,
                    repeated_time_feat.slice_axis(axis=1, begin=k, end=k + 1),
                    repeated_static_feat,
                    dim=-1,
                )

                # output shape: (batch_size * num_samples, 1, num_cells)
                # state shape: (batch_size * num_samples, num_cells)
                rnn_outputs, repeated_states = self.rnn.unroll(
                    inputs=decoder_input,
                    length=1,
                    begin_state=repeated_states,
                    layout="NTC",
                    merge_outputs=True,
                )
                distr_args = self.proj_distr_args(rnn_outputs)

                # construct the predictive distribution from the projected parameters
                distr = self.distr_output.distribution(distr_args,
                                                       scale=repeated_scale)
                # a Gaussian output exposes mu and sigma; Student's t additionally
                # exposes nu, so the nu gradient below assumes a Student's t output
                gradient_mu_feat = autograd.grad(distr.base_distribution.mu,
                                                 [time_feat],
                                                 create_graph=True)
                gradient_sigma_feat = autograd.grad(
                    distr.base_distribution.sigma, [time_feat],
                    create_graph=True)
                gradient_nu_feat = autograd.grad(distr.base_distribution.nu,
                                                 [time_feat],
                                                 create_graph=True)
                # (batch_size * num_samples, 1, *target_shape)
                new_samples = distr.sample(dtype=self.dtype)
                with open('gradients.npy', 'wb') as f:
                    np.save(f, gradient_mu_feat[0].asnumpy())
                    np.save(f, gradient_nu_feat[0].asnumpy())
                    np.save(f, gradient_sigma_feat[0].asnumpy())

                # (batch_size * num_samples, seq_len, *target_shape)
                repeated_past_target = F.concat(repeated_past_target,
                                                new_samples,
                                                dim=1)
                future_samples.append(new_samples)

            # (batch_size * num_samples, prediction_length, *target_shape)
            samples = F.concat(*future_samples, dim=1)

        # (batch_size, num_samples, prediction_length, *target_shape)
        return samples.reshape(shape=((-1, self.num_parallel_samples) +
                                      (self.prediction_length, ) +
                                      self.target_shape))
Example 2
def test_nan_mixture(
    distr_class,
    p: Tensor,
    x: Tensor,
    distr_params: Dict[str, Tensor],
    distr_params_grad: Dict[str, Tensor],
    serialize_fn,
) -> None:
    # sample from component distributions, and select samples
    distr = distr_class(**distr_params)

    samples = distr.sample(num_samples=NUM_SAMPLES_LARGE)

    rand = mx.nd.random.uniform(shape=(NUM_SAMPLES_LARGE, *p.shape))
    choice = (rand > p.expand_dims(axis=0)).broadcast_like(samples)
    samples_ref = mx.nd.where(choice, samples, samples.zeros_like())

    # construct NanMixture distribution and sample from it
    nan_mixture = NanMixture(nan_prob=p, distribution=distr)

    nan_mixture = serialize_fn(nan_mixture)

    samples_mix = nan_mixture.sample(num_samples=NUM_SAMPLES_LARGE)
    # check that shapes are right

    assert samples.shape == samples_mix.shape == samples_ref.shape

    # TODO check mean and stddev

    # check log_prob
    log_prob = nan_mixture.log_prob(x)

    log_prob_true = mx.nd.log(mx.nd.where(x != x, p, (1 - p) * distr.prob(x)))

    assert np.allclose(log_prob.asnumpy(), log_prob_true.asnumpy())

    for param in distr_params:
        distr_params[param].attach_grad()
    p.attach_grad()

    with mx.autograd.record():
        distr = distr_class(**distr_params)
        nan_mixture = NanMixture(nan_prob=p, distribution=distr)
        nll = -nan_mixture.log_prob(x)
    nll.backward()

    p_grad_true = mx.nd.where(x != x, -1 / p, 1 / (1 - p))
    # the gradient is undefined in these cases, so the reference is set to NaN (0/0):
    p_grad_true = mx.nd.where(
        mx.nd.logical_or(
            mx.nd.logical_and(x != x, p == 0),
            mx.nd.logical_and(x == x, p == 1),
        ),
        0.0 / p_grad_true.zeros_like(),
        p_grad_true,
    )

    assert np.allclose(p.grad.asnumpy(), p_grad_true.asnumpy())

    for param in distr_params:

        assert np.allclose(
            distr_params[param].grad.asnumpy(), distr_params_grad[param]
        )
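
The reference gradient p_grad_true above follows from differentiating the NaN-mixture log-likelihood by hand: the likelihood of a NaN observation is p, that of an observed value x is (1 - p) * f(x), so the negative log-likelihood has derivative -1/p and 1/(1 - p) with respect to p, respectively. The snippet below is a small, self-contained NumPy check of those two derivatives via central finite differences; the density value, the probability p, and the step size h are arbitrary illustrative choices.

import numpy as np

def nll_of_p(p, x_is_nan, density_x=0.3):
    # negative log-likelihood of the NaN mixture, viewed as a function of p only
    return -np.log(p) if x_is_nan else -np.log((1.0 - p) * density_x)

h, p = 1e-6, 0.4
for x_is_nan, expected_grad in [(True, -1.0 / p), (False, 1.0 / (1.0 - p))]:
    numeric_grad = (nll_of_p(p + h, x_is_nan) - nll_of_p(p - h, x_is_nan)) / (2 * h)
    assert np.isclose(numeric_grad, expected_grad, rtol=1e-4)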
Example 3
    def hybrid_forward(
        self,
        F,
        feat_static_cat: Tensor,
        feat_static_real: Tensor,
        past_time_feat: Tensor,
        past_target: Tensor,
        past_observed_values: Tensor,
        future_time_feat: Tensor,
        future_target: Tensor,
        future_observed_values: Tensor,
    ) -> Tensor:
        """
        Computes the loss for training DeepAR; all input tensors representing
        time series have NTC layout.

        Parameters
        ----------
        F
        feat_static_cat : (batch_size, num_features)
        feat_static_real : (batch_size, num_features)
        past_time_feat : (batch_size, history_length, num_features)
        past_target : (batch_size, history_length, *target_shape)
        past_observed_values : (batch_size, history_length, *target_shape)
        future_time_feat : (batch_size, prediction_length, num_features)
        future_target : (batch_size, prediction_length, *target_shape)
        future_observed_values : (batch_size, prediction_length, *target_shape)

        Returns
        -------
        Tensor
            The weighted loss and the per-time-step loss, the latter with shape
            (batch_size, context + prediction_length, 1).
        """

        # track gradients w.r.t. the input time features and target history
        past_time_feat.attach_grad()
        past_target.attach_grad()
        with autograd.record():
            distr = self.distribution(
                feat_static_cat=feat_static_cat,
                feat_static_real=feat_static_real,
                past_time_feat=past_time_feat,
                past_target=past_target,
                past_observed_values=past_observed_values,
                future_time_feat=future_time_feat,
                future_target=future_target,
                future_observed_values=future_observed_values,
            )

            # put together target sequence
            # (batch_size, seq_len, *target_shape)
            target = F.concat(
                past_target.slice_axis(
                    axis=1,
                    begin=self.history_length - self.context_length,
                    end=None,
                ),
                future_target,
                dim=1,
            )

            # (batch_size, seq_len)
            loss = distr.loss(target)

            # (batch_size, seq_len, *target_shape)
            observed_values = F.concat(
                past_observed_values.slice_axis(
                    axis=1,
                    begin=self.history_length - self.context_length,
                    end=self.history_length,
                ),
                future_observed_values,
                dim=1,
            )

            # mask the loss at a time step iff one or more observations are missing in the target dimensions
            # (batch_size, seq_len)
            loss_weights = (observed_values if (len(self.target_shape) == 0)
                            else observed_values.min(axis=-1, keepdims=False))

            weighted_loss = weighted_average(F=F,
                                             x=loss,
                                             weights=loss_weights,
                                             axis=1)

            # need to mask possible nans and -inf
            loss = F.where(condition=loss_weights,
                           x=loss,
                           y=F.zeros_like(loss))

        return weighted_loss, loss
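
The final masking mirrors what the weighted average is expected to do: time steps with missing observations (loss_weights == 0) contribute nothing and are excluded from the average. Assuming weighted_average computes sum(loss * weights) / sum(weights) along the time axis (an assumption about that helper, which is not shown here), a plain NumPy illustration of the same weighting scheme is:

import numpy as np

# toy per-step losses and observation indicators for a batch of one series
loss = np.array([[0.5, 1.2, np.inf, 0.8]])        # (batch_size, seq_len)
loss_weights = np.array([[1.0, 1.0, 0.0, 1.0]])   # 0 where the target is missing

# zero out masked steps (this also removes the inf), then average over observed steps
masked_loss = np.where(loss_weights > 0, loss, 0.0)
weighted_loss = masked_loss.sum(axis=1) / np.maximum(loss_weights.sum(axis=1), 1.0)
print(weighted_loss)  # [0.83333333]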