def sampling_decoder(
    self,
    F,
    static_feat: Tensor,
    past_target: Tensor,
    time_feat: Tensor,
    scale: Tensor,
    begin_states: List,
) -> Tensor:
    """
    Computes sample paths by unrolling the LSTM starting with an initial
    input and state.

    Parameters
    ----------
    static_feat : Tensor
        static features. Shape: (batch_size, num_static_features).
    past_target : Tensor
        target history. Shape: (batch_size, history_length).
    time_feat : Tensor
        time features. Shape: (batch_size, prediction_length, num_time_features).
    scale : Tensor
        tensor containing the scale of each element in the batch.
        Shape: (batch_size, 1, 1).
    begin_states : List
        list of initial states for the LSTM layers.
        The shape of each tensor in the list should be (batch_size, num_cells).

    Returns
    -------
    Tensor
        A tensor containing sampled paths.
        Shape: (batch_size, num_sample_paths, prediction_length).
    """
    time_feat.attach_grad()
    past_target.attach_grad()

    with autograd.record():
        # blow up the dimension of each tensor to
        # batch_size * self.num_parallel_samples for increased parallelism
        repeated_past_target = past_target.repeat(
            repeats=self.num_parallel_samples, axis=0
        )
        repeated_time_feat = time_feat.repeat(
            repeats=self.num_parallel_samples, axis=0
        )
        repeated_static_feat = static_feat.repeat(
            repeats=self.num_parallel_samples, axis=0
        ).expand_dims(axis=1)
        repeated_scale = scale.repeat(
            repeats=self.num_parallel_samples, axis=0
        )
        repeated_states = [
            s.repeat(repeats=self.num_parallel_samples, axis=0)
            for s in begin_states
        ]

        future_samples = []

        # for each future time unit, draw new samples and update the state
        for k in range(self.prediction_length):
            # (batch_size * num_samples, 1, *target_shape, num_lags)
            lags = self.get_lagged_subsequences(
                F=F,
                sequence=repeated_past_target,
                sequence_length=self.history_length + k,
                indices=self.shifted_lags,
                subsequences_length=1,
            )

            # (batch_size * num_samples, 1, *target_shape, num_lags)
            lags_scaled = F.broadcast_div(
                lags, repeated_scale.expand_dims(axis=-1)
            )

            # from (batch_size * num_samples, 1, *target_shape, num_lags)
            # to (batch_size * num_samples, 1, prod(target_shape) * num_lags)
            input_lags = F.reshape(
                data=lags_scaled,
                shape=(-1, 1, prod(self.target_shape) * len(self.lags_seq)),
            )

            # (batch_size * num_samples, 1,
            #  prod(target_shape) * num_lags + num_time_features + num_static_features)
            decoder_input = F.concat(
                input_lags,
                repeated_time_feat.slice_axis(axis=1, begin=k, end=k + 1),
                repeated_static_feat,
                dim=-1,
            )

            # output shape: (batch_size * num_samples, 1, num_cells)
            # state shape: (batch_size * num_samples, num_cells)
            rnn_outputs, repeated_states = self.rnn.unroll(
                inputs=decoder_input,
                length=1,
                begin_state=repeated_states,
                layout="NTC",
                merge_outputs=True,
            )

            distr_args = self.proj_distr_args(rnn_outputs)

            # compute likelihood of target given the predicted parameters
            distr = self.distr_output.distribution(
                distr_args, scale=repeated_scale
            )

            # a Gaussian has mu and sigma; a Student's t has mu, sigma and nu,
            # so the nu gradient below assumes a Student's t output distribution
            gradient_mu_feat = autograd.grad(
                distr.base_distribution.mu, [time_feat], create_graph=True
            )
            gradient_sigma_feat = autograd.grad(
                distr.base_distribution.sigma, [time_feat], create_graph=True
            )
            gradient_nu_feat = autograd.grad(
                distr.base_distribution.nu, [time_feat], create_graph=True
            )

            # (batch_size * num_samples, 1, *target_shape)
            new_samples = distr.sample(dtype=self.dtype)

            # note: the file is rewritten at every decoding step, so only the
            # gradients of the final step remain on disk
            with open("gradients.npy", "wb") as f:
                np.save(f, gradient_mu_feat[0].asnumpy())
                np.save(f, gradient_nu_feat[0].asnumpy())
                np.save(f, gradient_sigma_feat[0].asnumpy())

            # (batch_size * num_samples, seq_len, *target_shape)
            repeated_past_target = F.concat(
                repeated_past_target, new_samples, dim=1
            )
            future_samples.append(new_samples)

    # (batch_size * num_samples, prediction_length, *target_shape)
    samples = F.concat(*future_samples, dim=1)

    # (batch_size, num_samples, prediction_length, *target_shape)
    return samples.reshape(
        shape=(
            (-1, self.num_parallel_samples)
            + (self.prediction_length,)
            + self.target_shape
        )
    )
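# --------------------------------------------------------------------------
# Minimal sketch (not part of the original network code): reading back the
# arrays written by sampling_decoder above. The file name "gradients.npy"
# matches the one hard-coded in the loop; everything else here is
# illustrative. Because the file is reopened in "wb" mode at every step,
# it ends up holding only the last step's three arrays, saved in the order
# mu, nu, sigma; repeated np.load calls on one open handle read them back
# in that same order.

import numpy as np

with open("gradients.npy", "rb") as f:
    grad_mu = np.load(f)     # d mu / d time_feat
    grad_nu = np.load(f)     # d nu / d time_feat
    grad_sigma = np.load(f)  # d sigma / d time_feat

print(grad_mu.shape, grad_nu.shape, grad_sigma.shape)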
def test_nan_mixture(
    distr_class,
    p: Tensor,
    x: Tensor,
    distr_params: Dict[str, Tensor],
    distr_params_grad: Dict[str, Tensor],
    serialize_fn,
) -> None:
    # sample from the component distribution, and select samples
    distr = distr_class(**distr_params)
    samples = distr.sample(num_samples=NUM_SAMPLES_LARGE)
    rand = mx.nd.random.uniform(shape=(NUM_SAMPLES_LARGE, *p.shape))
    choice = (rand > p.expand_dims(axis=0)).broadcast_like(samples)
    samples_ref = mx.nd.where(choice, samples, samples.zeros_like())

    # construct the NanMixture distribution and sample from it
    nan_mixture = NanMixture(nan_prob=p, distribution=distr)
    nan_mixture = serialize_fn(nan_mixture)
    samples_mix = nan_mixture.sample(num_samples=NUM_SAMPLES_LARGE)

    # check that shapes are right
    assert samples.shape == samples_mix.shape == samples_ref.shape

    # TODO check mean and stddev

    # check log_prob
    log_prob = nan_mixture.log_prob(x)
    log_prob_true = mx.nd.log(mx.nd.where(x != x, p, (1 - p) * distr.prob(x)))
    assert np.allclose(log_prob.asnumpy(), log_prob_true.asnumpy())

    for param in distr_params:
        distr_params[param].attach_grad()
    p.attach_grad()

    with mx.autograd.record():
        distr = distr_class(**distr_params)
        nan_mixture = NanMixture(nan_prob=p, distribution=distr)
        nll = -nan_mixture.log_prob(x)
    nll.backward()

    p_grad_true = mx.nd.where(x != x, -1 / p, 1 / (1 - p))
    # the gradient is undefined for these cases:
    p_grad_true = mx.nd.where(
        mx.nd.logical_or(
            mx.nd.logical_and(x != x, p == 0),
            mx.nd.logical_and(x == x, p == 1),
        ),
        0.0 / p_grad_true.zeros_like(),
        p_grad_true,
    )

    assert np.allclose(p.grad.asnumpy(), p_grad_true.asnumpy())

    for param in distr_params:
        assert np.allclose(
            distr_params[param].grad.asnumpy(), distr_params_grad[param]
        )
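# --------------------------------------------------------------------------
# The reference the test checks against is: log p(x) = log(nan_prob) when x
# is NaN, and log((1 - nan_prob) * f(x)) otherwise, where f is the component
# density. A small self-contained numpy sketch of that identity for a
# Gaussian component follows; the helper name and values are made up for
# illustration and do not use the GluonTS API.

import numpy as np


def nan_mixture_log_prob_ref(x, nan_prob, mu, sigma):
    """Reference NaN-mixture log-likelihood with a Gaussian component."""
    gauss_pdf = np.exp(-0.5 * ((x - mu) / sigma) ** 2) / (
        sigma * np.sqrt(2 * np.pi)
    )
    return np.where(
        np.isnan(x), np.log(nan_prob), np.log((1 - nan_prob) * gauss_pdf)
    )


x = np.array([0.5, np.nan, -1.0])
print(nan_mixture_log_prob_ref(x, nan_prob=0.2, mu=0.0, sigma=1.0))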
def hybrid_forward(
    self,
    F,
    feat_static_cat: Tensor,
    feat_static_real: Tensor,
    past_time_feat: Tensor,
    past_target: Tensor,
    past_observed_values: Tensor,
    future_time_feat: Tensor,
    future_target: Tensor,
    future_observed_values: Tensor,
) -> Tuple[Tensor, Tensor]:
    """
    Computes the loss for training DeepAR. All input tensors that represent
    time series have NTC layout.

    Parameters
    ----------
    F
    feat_static_cat : (batch_size, num_features)
    feat_static_real : (batch_size, num_features)
    past_time_feat : (batch_size, history_length, num_features)
    past_target : (batch_size, history_length, *target_shape)
    past_observed_values : (batch_size, history_length, *target_shape)
    future_time_feat : (batch_size, prediction_length, num_features)
    future_target : (batch_size, prediction_length, *target_shape)
    future_observed_values : (batch_size, prediction_length, *target_shape)

    Returns
    -------
    Tuple[Tensor, Tensor]
        weighted loss with shape (batch_size,), and per-time-step loss with
        shape (batch_size, context_length + prediction_length).
    """
    past_time_feat.attach_grad()
    past_target.attach_grad()

    with autograd.record():
        distr = self.distribution(
            feat_static_cat=feat_static_cat,
            feat_static_real=feat_static_real,
            past_time_feat=past_time_feat,
            past_target=past_target,
            past_observed_values=past_observed_values,
            future_time_feat=future_time_feat,
            future_target=future_target,
            future_observed_values=future_observed_values,
        )

        # put together the target sequence
        # (batch_size, seq_len, *target_shape)
        target = F.concat(
            past_target.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            ),
            future_target,
            dim=1,
        )

        # (batch_size, seq_len)
        loss = distr.loss(target)

        # (batch_size, seq_len, *target_shape)
        observed_values = F.concat(
            past_observed_values.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=self.history_length,
            ),
            future_observed_values,
            dim=1,
        )

        # mask the loss at a time step iff one or more observations is
        # missing in the target dimensions
        # (batch_size, seq_len)
        loss_weights = (
            observed_values
            if (len(self.target_shape) == 0)
            else observed_values.min(axis=-1, keepdims=False)
        )

        weighted_loss = weighted_average(
            F=F, x=loss, weights=loss_weights, axis=1
        )

        # need to mask possible NaNs and -inf
        loss = F.where(condition=loss_weights, x=loss, y=F.zeros_like(loss))

    return weighted_loss, loss
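# --------------------------------------------------------------------------
# The masking above keeps a time step only if every target dimension is
# observed (observed_values.min(axis=-1)). A tiny numpy sketch of that rule
# and of the masked averaging it feeds follows; it assumes weighted_average
# zeroes the loss where the weight is 0 and divides the weighted sum by the
# sum of weights (clipped to at least 1). The actual GluonTS helper may
# differ in details; the arrays below are made up for illustration.

import numpy as np

# (batch=1, seq_len=4, target_dim=2): 1 = observed, 0 = missing
observed_values = np.array([[[1, 1], [1, 0], [1, 1], [0, 0]]], dtype=float)

# a time step contributes only if *all* target dimensions are observed
loss_weights = observed_values.min(axis=-1)      # [[1., 0., 1., 0.]]

# per-time-step NLL; masked steps may hold NaN or inf
loss = np.array([[0.3, np.nan, 0.7, np.inf]])

masked_loss = np.where(loss_weights > 0, loss * loss_weights, 0.0)
weighted_loss = masked_loss.sum(axis=1) / np.maximum(
    loss_weights.sum(axis=1), 1.0
)
print(loss_weights, weighted_loss)               # [[1. 0. 1. 0.]] [0.5]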