def hybrid_forward(
    self,
    F,
    feat_static_cat: Tensor,
    past_target: Tensor,
    past_observed_values: Tensor,
    past_is_pad: Tensor,
    past_time_feat: Tensor,
    future_time_feat: Tensor,
    scale: Tensor,
) -> Tensor:
    embedded_cat = self.feature_embedder(feat_static_cat)
    static_feat = F.concat(embedded_cat, F.log(scale + 1.0), dim=1)
    past_target = past_target.astype("int32")

    def blow_up(u):
        """
        Expand to (batch_size x num_samples)
        """
        return F.repeat(u, repeats=self.num_samples, axis=0)

    def is_last_layer(i):
        return i + 1 == len(self.dilations)

    queues = []

    full_time_features = F.concat(past_time_feat, future_time_feat, dim=-1)
    future_observed_values = F.slice_axis(
        future_time_feat, begin=0, end=1, axis=1
    ).ones_like()
    full_observed = F.concat(
        F.expand_dims(past_observed_values, axis=1),
        future_observed_values,
        dim=-1,
    )
    repeated_static_feat = F.repeat(
        F.expand_dims(static_feat, axis=-1),
        repeats=self.pred_length + self.receptive_field,
        axis=-1,
    )
    full_features = F.concat(
        full_time_features, full_observed, repeated_static_feat, dim=1
    )

    feature_slice = F.slice_axis(
        full_features,
        begin=-self.pred_length - self.receptive_field + 1,
        end=None,
        axis=-1,
    )

    tmp = F.slice_axis(
        past_target, begin=-self.receptive_field, end=None, axis=-1
    )
    o = self.target_embed(tmp).swapaxes(1, 2)
    o = F.concat(
        o,
        F.slice_axis(
            feature_slice, begin=-self.receptive_field, end=None, axis=-1
        ),
        dim=1,
    )
    o = self.conv_project(o)

    for i, d in enumerate(self.dilations):
        sz = 1 if d == 2 ** (self.dilation_depth - 1) else d * 2
        _, o = self.residuals[i](o)
        if not is_last_layer(i):
            o_chunk = F.slice_axis(o, begin=-sz - 1, end=-1, axis=-1)
        else:
            o_chunk = o
        queues.append(blow_up(o_chunk))

    res = F.slice_axis(past_target, begin=-2, end=None, axis=-1)
    res = blow_up(res)

    for n in range(self.pred_length):
        queues_next = []
        o = self.target_embed(
            F.slice_axis(res, begin=-2, end=None, axis=-1)
        ).swapaxes(1, 2)
        b = F.slice_axis(
            full_features,
            begin=self.receptive_field + n - 1,
            end=self.receptive_field + n + 1,
            axis=-1,
        )
        b = blow_up(b)
        o = F.concat(o, b, dim=1)
        o = self.conv_project(o)

        skip_outs = []
        for i, d in enumerate(self.dilations):
            skip, o = self.residuals[i](o)
            skip_outs.append(skip)
            if not is_last_layer(i):
                q = queues[i]
                o = F.concat(q, o, num_args=2, dim=-1)
                queues_next.append(
                    F.slice_axis(o, begin=1, end=None, axis=-1)
                )
        queues = queues_next

        y = sum(skip_outs)
        y = self.output_act(y)
        y = self.conv1(y)
        y = self.output_act(y)
        unnormalized_outputs = self.conv2(y)
        if self.temperature > 0:
            probs = F.softmax(
                unnormalized_outputs / self.temperature, axis=1
            )
            y = F.sample_multinomial(probs.swapaxes(1, 2))
        else:
            y = F.argmax(unnormalized_outputs, axis=1)
        y = y.astype("int32")
        res = F.concat(res, y, num_args=2, dim=-1)

    samples = F.slice_axis(res, begin=-self.pred_length, end=None, axis=-1)
    samples = samples.reshape(
        shape=(-1, self.num_samples, self.pred_length)
    )
    samples = self.post_transform(samples)
    samples = F.broadcast_mul(scale.expand_dims(axis=1), samples)
    return samples
def process_static_cat(self, F, feature: Tensor) -> Tensor:
    feature = self.embed_static(feature.astype(self.dtype))
    return F.tile(feature.expand_dims(axis=1), reps=(1, self.T, 1))
def process_static_real(self, F, feature: Tensor) -> Tensor:
    return F.tile(feature.expand_dims(axis=1), reps=(1, self.T, 1))
def s(mu: Tensor, sigma: Tensor) -> Tensor:
    raw_samples = self.F.sample_normal(
        mu=mu.zeros_like(), sigma=sigma.ones_like()
    )
    return sigma * raw_samples + mu
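# The helper above draws standard-normal noise and rescales it, i.e. the
# reparameterization trick. A minimal, self-contained numpy sketch of the same
# idea (illustrative only, not part of the library):
import numpy as np

def sample_gaussian(mu: np.ndarray, sigma: np.ndarray, seed: int = 0) -> np.ndarray:
    # Draw eps ~ N(0, 1) with the same shape as mu, then shift and scale.
    rng = np.random.default_rng(seed)
    eps = rng.standard_normal(mu.shape)
    return sigma * eps + mu

# Example: samples have mean around 1.0 and standard deviation around 0.5.
print(sample_gaussian(np.ones(5), 0.5 * np.ones(5)))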
def quantile(self, level: Tensor) -> Tensor:
    F = self.F
    for _ in range(self.all_dim):
        level = level.expand_dims(axis=-1)
    return F.broadcast_add(
        F.broadcast_mul(self.high - self.low, level), self.low
    )
def nans_like(x: Tensor) -> Tensor:
    return x.zeros_like() / 0.0
def cumsum(
    F, x: Tensor, exclusive: bool = False, reverse: bool = False
) -> Tensor:
    r"""
    Find cumulative sum on the last axis by multiplying with lower triangular
    ones-matrix:

    .. math::

       \operatorname{cumsum}(x) =
       \begin{cases}
         \operatorname{ltr\_ones} \times x
           & \text{for cumulative sum}\\
         x \times \operatorname{ltr\_ones}
           & \text{for cumulative sum in the reverse order}
       \end{cases}

    Also supports `exclusive` flag to start the cumsum with zero.
    For example, if :math:`x = [a, b, c]`, we have

    .. math::

       \operatorname{cumsum}(x) =
       \begin{cases}
         [a, a + b, a + b + c]
           & \text{if }\mathit{reverse = False, exclusive = False}\\
         [0, a, a + b]
           & \text{if }\mathit{reverse = False, exclusive = True}\\
         [a + b + c, b + c, c]
           & \text{if }\mathit{reverse = True, exclusive = False}\\
         [b + c, c, 0]
           & \text{if }\mathit{reverse = True, exclusive = True}\\
       \end{cases}

    Parameters
    ----------
    F
        The function space to use.
    x
        A tensor with shape :math:`(..., n)`.
    exclusive
        If `True`, the cumulative sum starts with zero.
    reverse
        If `True`, the cumulative sum is performed in the opposite direction.

    Returns
    -------
    Tensor:
        A modified tensor with identical shape and cumulative sums in the last
        axis.
    """

    # Create a new axis (for matrix multiplication) either at last location or
    # last-but-one location (for reverse mode)
    exp_dim = -2 if reverse else -1
    # (..., 1, n) if reverse is True and (..., n, 1) otherwise
    x = x.expand_dims(axis=exp_dim)

    # Ones_matrix (..., n, n)
    ones_matrix = F.linalg_gemm2(
        F.ones_like(x),
        F.ones_like(x),
        transpose_a=reverse,
        transpose_b=not reverse,
    )
    cumulative_sum = F.linalg_trmm(ones_matrix, x, rightside=reverse)

    if exclusive:
        cumulative_sum = cumulative_sum - x

    return cumulative_sum.squeeze(axis=exp_dim)
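# The routine above expresses cumsum as a product with a triangular
# ones-matrix so it stays batched and hybridizable. A small numpy sketch
# (illustrative only) that mirrors the four flag combinations documented above:
import numpy as np

def cumsum_via_matmul(x, exclusive=False, reverse=False):
    n = x.shape[-1]
    ones = np.ones((n, n))
    # Lower-triangular for the forward sum, upper-triangular for the reverse.
    tri = np.tril(ones) if not reverse else np.triu(ones)
    out = x @ tri.T
    if exclusive:
        out = out - x
    return out

x = np.array([[1.0, 2.0, 3.0]])
print(cumsum_via_matmul(x))                                # [[1. 3. 6.]]
print(cumsum_via_matmul(x, exclusive=True))                # [[0. 1. 3.]]
print(cumsum_via_matmul(x, reverse=True))                  # [[6. 5. 3.]]
print(cumsum_via_matmul(x, reverse=True, exclusive=True))  # [[5. 3. 0.]]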
def f_inv(self, y: Tensor) -> Tensor:
    return y.clip(-np.inf, 30).exp()
def f(self, x: Tensor) -> Tensor:
    F = getF(x)
    return F.Activation(x.clip(-100.0, np.inf), act_type="softrelu")
def log_abs_det_jac(self, x: Tensor, y: Tensor) -> Tensor:
    return y.clip(1.0e-20, np.inf).log()
def f(self, x: Tensor) -> Tensor:
    return x.clip(1.0e-20, np.inf).log()
def f_inv(self, y: Tensor) -> Tensor:
    return y.clip(1.0e-20, np.inf).log()
def f(self, x: Tensor) -> Tensor:
    return x.clip(-np.inf, 30).exp()
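# The clipped exp/log pair above forms a numerically safe bijection: clipping
# the exponent at 30 avoids overflow and clipping at 1e-20 avoids log(0). A
# tiny numpy sketch (illustrative only) of the round trip:
import numpy as np

def f(x):       # forward map of the exp bijection
    return np.exp(np.clip(x, -np.inf, 30))

def f_inv(y):   # inverse map
    return np.log(np.clip(y, 1.0e-20, np.inf))

x = np.array([-5.0, 0.0, 5.0])
assert np.allclose(f_inv(f(x)), x)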
def hybrid_forward(
    self,
    F,
    data: Tensor,
    observed_indicator: Tensor,
    scale: Optional[Tensor],
    rep_params: List[Tensor],
    **kwargs,
) -> Tuple[Tensor, Tensor, List[Tensor]]:
    data_np = data.asnumpy()
    observed_indicator_np = observed_indicator.astype("int32").asnumpy()

    if scale is None:
        # Even though local binning implicitly scales the data, we still
        # return the scale as an input to the model.
        scale = F.expand_dims(
            F.sum(data * observed_indicator, axis=-1)
            / F.sum(observed_indicator, axis=-1),
            -1,
        )

        bin_centers_hyb = np.ones((len(data), self.num_bins)) * (-1)
        bin_edges_hyb = np.ones((len(data), self.num_bins + 1)) * (-1)

        # Every time series needs to be binned individually
        for i in range(len(data_np)):
            # Identify observed data points.
            data_loc = data_np[i]
            observed_indicator_loc = observed_indicator_np[i]
            data_obs_loc = data_loc[observed_indicator_loc == 1]

            if data_obs_loc.size > 0:
                # Calculate time series specific bin centers and edges.
                if self.is_quantile:
                    bin_centers_loc = np.quantile(
                        data_obs_loc, np.linspace(0, 1, self.num_bins)
                    )
                else:
                    bin_centers_loc = np.linspace(
                        np.min(data_obs_loc),
                        np.max(data_obs_loc),
                        self.num_bins,
                    )
                bin_centers_hyb[i] = ensure_binning_monotonicity(
                    bin_centers_loc
                )
                bin_edges_hyb[i] = bin_edges_from_bin_centers(
                    bin_centers_hyb[i]
                )

                # Bin the time series.
                data_obs_loc_binned = np.digitize(
                    data_obs_loc, bins=bin_edges_hyb[i], right=False
                )
            else:
                data_obs_loc_binned = []

            # Write the binned time series back into the data array.
            data_loc[observed_indicator_loc == 1] = data_obs_loc_binned
            data_np[i] = data_loc
    else:
        bin_centers_hyb = rep_params[0].asnumpy()
        bin_edges_hyb = rep_params[1].asnumpy()

        # np.repeat expects an integer repeat count.
        bin_edges_hyb = np.repeat(
            bin_edges_hyb,
            len(data_np) // len(bin_edges_hyb),
            axis=0,
        )
        bin_centers_hyb = np.repeat(
            bin_centers_hyb,
            len(data_np) // len(bin_centers_hyb),
            axis=0,
        )

        for i in range(len(data_np)):
            data_loc = data_np[i]
            observed_indicator_loc = observed_indicator_np[i]
            data_obs_loc = data_loc[observed_indicator_loc == 1]

            # Bin the time series based on previously computed bin edges.
            data_obs_loc_binned = np.digitize(
                data_obs_loc, bins=bin_edges_hyb[i], right=False
            )

            data_loc[observed_indicator_loc == 1] = data_obs_loc_binned
            data_np[i] = data_loc

    bin_centers_hyb = F.array(bin_centers_hyb)
    bin_edges_hyb = F.array(bin_edges_hyb)

    data = mx.nd.array(data_np)

    return data, scale, [bin_centers_hyb, bin_edges_hyb]
def sampling_decoder(
    self,
    F,
    static_feat: Tensor,
    past_target: Tensor,
    time_feat: Tensor,
    scale: Tensor,
    begin_states: List,
) -> Tensor:
    """
    Computes sample paths by unrolling the LSTM starting with an initial
    input and state.

    Parameters
    ----------
    static_feat : Tensor
        static features. Shape: (batch_size, num_static_features).
    past_target : Tensor
        target history. Shape: (batch_size, history_length).
    time_feat : Tensor
        time features. Shape: (batch_size, prediction_length, num_time_features).
    scale : Tensor
        tensor containing the scale of each element in the batch.
        Shape: (batch_size, 1, 1).
    begin_states : List
        list of initial states for the LSTM layers.
        The shape of each tensor of the list should be (batch_size, num_cells).

    Returns
    -------
    Tensor
        A tensor containing sampled paths.
        Shape: (batch_size, num_sample_paths, prediction_length).
    """

    # blow up the dimension of each tensor to
    # batch_size * self.num_parallel_samples for increasing parallelism
    repeated_past_target = past_target.repeat(
        repeats=self.num_parallel_samples, axis=0
    )
    repeated_time_feat = time_feat.repeat(
        repeats=self.num_parallel_samples, axis=0
    )
    repeated_static_feat = static_feat.repeat(
        repeats=self.num_parallel_samples, axis=0
    ).expand_dims(axis=1)
    repeated_scale = scale.repeat(
        repeats=self.num_parallel_samples, axis=0
    )
    repeated_states = [
        s.repeat(repeats=self.num_parallel_samples, axis=0)
        for s in begin_states
    ]

    future_samples = []

    # for each future time unit we draw new samples for this time unit and
    # update the state
    for k in range(self.prediction_length):
        # (batch_size * num_samples, 1, *target_shape, num_lags)
        lags = self.get_lagged_subsequences(
            F=F,
            sequence=repeated_past_target,
            sequence_length=self.history_length + k,
            indices=self.shifted_lags,
            subsequences_length=1,
        )

        # (batch_size * num_samples, 1, *target_shape, num_lags)
        lags_scaled = F.broadcast_div(
            lags, repeated_scale.expand_dims(axis=-1)
        )

        # from (batch_size * num_samples, 1, *target_shape, num_lags)
        # to (batch_size * num_samples, 1, prod(target_shape) * num_lags)
        input_lags = F.reshape(
            data=lags_scaled,
            shape=(-1, 1, prod(self.target_shape) * len(self.lags_seq)),
        )

        # (batch_size * num_samples, 1, prod(target_shape) * num_lags
        #  + num_time_features + num_static_features)
        decoder_input = F.concat(
            input_lags,
            repeated_time_feat.slice_axis(axis=1, begin=k, end=k + 1),
            repeated_static_feat,
            dim=-1,
        )

        # output shape: (batch_size * num_samples, 1, num_cells)
        # state shape: (batch_size * num_samples, num_cells)
        rnn_outputs, repeated_states = self.rnn.unroll(
            inputs=decoder_input,
            length=1,
            begin_state=repeated_states,
            layout="NTC",
            merge_outputs=True,
        )

        distr_args = self.proj_distr_args(rnn_outputs)

        # compute likelihood of target given the predicted parameters
        distr = self.distr_output.distribution(
            distr_args, scale=repeated_scale
        )

        # (batch_size * num_samples, 1, *target_shape)
        new_samples = distr.sample()

        # (batch_size * num_samples, seq_len, *target_shape)
        repeated_past_target = F.concat(
            repeated_past_target, new_samples, dim=1
        )
        future_samples.append(new_samples)

    # (batch_size * num_samples, prediction_length, *target_shape)
    samples = F.concat(*future_samples, dim=1)

    # (batch_size, num_samples, prediction_length, *target_shape)
    return samples.reshape(
        shape=(
            (-1, self.num_parallel_samples)
            + (self.prediction_length,)
            + self.target_shape
        )
    )
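# sampling_decoder draws num_parallel_samples paths per series by repeating
# every tensor along the batch axis, sampling one step at a time for each
# (series, path) pair, and reshaping back at the end. A minimal numpy sketch
# of that bookkeeping (shapes only, illustrative):
import numpy as np

batch_size, num_parallel_samples, prediction_length = 2, 3, 4
past_target = np.arange(batch_size * 5.0).reshape(batch_size, 5)

# blow up: (batch_size, ...) -> (batch_size * num_parallel_samples, ...)
repeated = np.repeat(past_target, num_parallel_samples, axis=0)
assert repeated.shape == (batch_size * num_parallel_samples, 5)

# pretend we sampled one value per future step for each repeated series
future = np.zeros((batch_size * num_parallel_samples, prediction_length))

# fold the sample axis back out: (batch, num_samples, prediction_length)
samples = future.reshape(-1, num_parallel_samples, prediction_length)
assert samples.shape == (batch_size, num_parallel_samples, prediction_length)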
def hybrid_forward(
    self, F, past_target: Tensor, past_observed_values: Tensor
) -> Tensor:
    """
    Given the tensor `past_target`, first we normalize it by the
    `past_observed_values` which is an indicator tensor with 0 or 1 values.
    Then it outputs the result of LSTNet.

    Parameters
    ----------
    F
    past_target
        Tensor of shape (batch_size, num_series, context_length)
    past_observed_values
        Tensor of shape (batch_size, num_series, context_length)

    Returns
    -------
    Tensor
        Shape (batch_size, num_series, 1) if `horizon` was specified
        and of shape (batch_size, num_series, prediction_length)
        if `prediction_length` was provided
    """
    scaled_past_target, _ = self.scaler(
        past_target.slice_axis(
            axis=2, begin=-self.context_length, end=None
        ),
        past_observed_values.slice_axis(
            axis=2, begin=-self.context_length, end=None
        ),
    )
    c = self.cnn(scaled_past_target)
    c = self.dropout(c)
    c = F.transpose(c, axes=(0, 2, 1))  # NTC

    if F is mx.ndarray:
        ctx = (
            c.context
            if isinstance(c, mx.gluon.tensor_types)
            else c[0].context
        )
        with ctx:
            rnn_begin_state = self.rnn.begin_state(
                func=F.zeros, dtype=self.dtype, batch_size=c.shape[0]
            )
    else:
        rnn_begin_state = self.rnn.begin_state(
            func=F.zeros, dtype=self.dtype, batch_size=0
        )
    r, _ = self.rnn.unroll(
        inputs=c,
        length=min(self.conv_out, self.context_length),
        layout="NTC",
        merge_outputs=True,
        begin_state=rnn_begin_state,
    )

    r = F.squeeze(F.slice_axis(r, axis=1, begin=-1, end=None), axis=1)  # NC
    s = self._skip_rnn_layer(F, c)
    # make fc broadcastable for output
    fc = self.fc(F.concat(r, s, dim=1)).expand_dims(axis=2)  # N x num_series x 1
    if self.prediction_length:
        fc = F.tile(
            fc, reps=(1, 1, self.prediction_length)
        )  # N x num_series x prediction_length
    ar = self._ar_highway(F, past_target)
    out = fc + ar
    if self.output_activation is None:
        return out
    return (
        F.sigmoid(out)
        if self.output_activation == "sigmoid"
        else F.tanh(out)
    )
def plot_samples(s: Tensor, bins: int = 100) -> None:
    from matplotlib import pyplot as plt

    s = s.asnumpy()
    plt.hist(s, bins=bins)
    plt.show()
def quantile_internal(
    self, x: Tensor, axis: Optional[int] = None
) -> Tensor:
    r"""
    Evaluates the quantile function at the quantile levels contained in `x`.

    Parameters
    ----------
    x
        Tensor of shape ``*gamma.shape`` if axis=None, or containing an
        additional axis on the specified position, otherwise.
    axis
        Index of the axis containing the different quantile levels which
        are to be computed.

    Returns
    -------
    Tensor
        Quantiles tensor, of the same shape as x.
    """
    F = self.F

    # shapes of self
    # self.gamma: (*batch_shape)
    # self.knot_positions, self.b: (*batch_shape, num_pieces)

    # axis=None - passed at inference when num_samples is None
    # The shape of x is (*batch_shape).
    # The shapes of the parameters should be:
    # gamma: (*batch_shape), knot_positions, b: (*batch_shape, num_pieces)
    # They match the self. counterparts so no reshaping is needed

    # axis=0 - passed at inference when num_samples is not None
    # The shape of x is (num_samples, *batch_shape).
    # The shapes of the parameters should be:
    # gamma: (num_samples, *batch_shape),
    # knot_positions, b: (num_samples, *batch_shape, num_pieces)
    # They do not match the self. counterparts and we need to expand
    # axis=0 for all of them.

    # axis=-2 - passed at training when we evaluate quantiles at
    # knot_positions in order to compute a_tilde
    # The shape of x is shape(x) = shape(knot_positions)
    #   = (*batch_shape, num_pieces).
    # The shapes of the parameters should be:
    # gamma: (*batch_shape, 1), knot_positions: (*batch_shape, 1, num_pieces),
    # b: (*batch_shape, 1, num_pieces)
    # They do not match the self. counterparts and we need to expand
    # axis=-1 for gamma and axis=-2 for the rest.

    if axis is not None:
        gamma = self.gamma.expand_dims(axis=axis if axis == 0 else -1)
        knot_positions = self.knot_positions.expand_dims(axis=axis)
        b = self.b.expand_dims(axis=axis)
    else:
        gamma, knot_positions, b = self.gamma, self.knot_positions, self.b

    x_minus_knots = F.broadcast_minus(
        x.expand_dims(axis=-1), knot_positions
    )

    quantile = F.broadcast_add(
        gamma, F.sum(F.broadcast_mul(b, F.relu(x_minus_knots)), axis=-1)
    )

    return quantile
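# The quantile function above is piecewise linear:
#   q(x) = gamma + sum_l b_l * max(x - knot_l, 0)
# A small numpy sketch with made-up parameter values (illustrative only):
import numpy as np

gamma = 1.0                                   # intercept
knot_positions = np.array([0.0, 0.5, 0.8])    # quantile levels of the knots
b = np.array([2.0, 1.0, 3.0])                 # slope increments per knot

def quantile(x):
    x = np.asarray(x)[..., None]              # broadcast against the knots
    return gamma + np.sum(b * np.maximum(x - knot_positions, 0.0), axis=-1)

print(quantile([0.1, 0.5, 0.9]))  # increasing values, as a quantile function must be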
def cdf(self, x: Tensor) -> Tensor:
    F = self.F
    x = x.expand_dims(axis=-1)
    # left_edges = self.bin_edges.slice_axis(axis=-1, begin=0, end=-1)
    mask = F.broadcast_lesser_equal(self.bin_centers, x)
    return F.broadcast_mul(self.bin_probs, mask).sum(axis=-1)
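# The CDF above sums the probability mass of all bins whose centers lie at or
# below x. A numpy sketch with made-up bins (illustrative only):
import numpy as np

bin_centers = np.array([0.0, 1.0, 2.0, 3.0])
bin_probs = np.array([0.1, 0.2, 0.3, 0.4])

def cdf(x):
    mask = bin_centers <= np.asarray(x)[..., None]
    return np.sum(bin_probs * mask, axis=-1)

print(cdf([0.5, 2.0, 10.0]))  # [0.1 0.6 1. ]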
def hybrid_forward(
    self,
    F,
    feat_static_cat: Tensor,
    feat_static_real: Tensor,
    past_time_feat: Tensor,
    past_target: Tensor,
    past_observed_values: Tensor,
    future_time_feat: Tensor,
    future_target: Tensor,
    future_observed_values: Tensor,
) -> Tensor:
    if self.ignore_future_targets:
        distr = self.distribution(
            feat_static_cat=feat_static_cat,
            feat_static_real=feat_static_real,
            past_time_feat=past_time_feat,
            past_target=past_target,
            past_observed_values=past_observed_values,
            future_time_feat=None,
            future_target=None,
            future_observed_values=future_observed_values,
        )

        loss = distr.loss(
            past_target.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            )
        )

        # (batch_size, seq_len, *target_shape)
        observed_values = past_observed_values.slice_axis(
            axis=1,
            begin=self.history_length - self.context_length,
            end=self.history_length,
        )
    else:
        distr = self.distribution(
            feat_static_cat=feat_static_cat,
            feat_static_real=feat_static_real,
            past_time_feat=past_time_feat,
            past_target=past_target,
            past_observed_values=past_observed_values,
            future_time_feat=future_time_feat,
            future_target=future_target,
            future_observed_values=future_observed_values,
        )

        # put together target sequence
        # (batch_size, seq_len, *target_shape)
        target = F.concat(
            past_target.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            ),
            future_target,
            dim=1,
        )

        # (batch_size, seq_len)
        loss = distr.loss(target)

        # (batch_size, seq_len, *target_shape)
        observed_values = F.concat(
            past_observed_values.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=self.history_length,
            ),
            future_observed_values,
            dim=1,
        )

    # mask the loss at one time step iff one or more observations is missing
    # in the target dimensions
    # (batch_size, seq_len)
    loss_weights = (
        observed_values
        if (len(self.target_shape) == 0)
        else observed_values.min(axis=-1, keepdims=False)
    )

    weighted_loss = weighted_average(
        F=F, x=loss, weights=loss_weights, axis=1
    )
    total_loss = F.sum(weighted_loss) / weighted_loss.shape[0]
    print_string = f'Forecasting loss: {total_loss.asscalar()}'

    # add self-supervised reconciliation loss
    if self.self_supervised_penalty > 0:
        agg_preds = F.take(
            distr.mean, F.array(list(self.hierarchy_agg_dict.keys()))
        )
        disagg_preds = F.concat(
            *[
                F.sum(F.take(distr.mean, F.array(disagg_idxs)), axis=0)
                for disagg_idxs in self.hierarchy_agg_dict.values()
            ],
            dim=0,
        ).reshape(agg_preds.shape)
        f_loss = F.sum(F.square(agg_preds - F.sum(disagg_preds, axis=0)))

    # add embedding reconciliation loss
    if self.embedding_agg_penalty > 0:
        embedded = self.embedder(
            F.expand_dims(
                F.array([i for i in range(self.cardinality[0])]), axis=1
            )
        )
        agg_embeds = F.take(
            embedded, F.array(list(self.hierarchy_agg_dict.keys()))
        )
        agg_copies = agg_embeds.copy().detach()
        disagg_embeds = [
            F.take(embedded, F.array(disagg_idxs))
            for disagg_idxs in self.hierarchy_agg_dict.values()
        ]

        disagg_lens = [len(disagg) for disagg in disagg_embeds]
        max_len = max(disagg_lens) + 1
        dim = embedded.shape[1]
        disagg_embeds = [
            F.concat(
                *[
                    disagg,
                    F.tile(agg, max_len - disagg.shape[0]).reshape(-1, dim),
                ],
                dim=0,
            ).reshape(-1, dim).expand_dims(axis=0)
            for agg, disagg in zip(agg_copies, disagg_embeds)
        ]
        disagg_embeds = F.concat(*disagg_embeds, dim=0)

        if self.embedding_dist_metric == 'cosine':
            agg_embeds = F.L2Normalization(agg_embeds).expand_dims(axis=2)
            disagg_embeds = F.L2Normalization(disagg_embeds, mode='spatial')
            e_loss = 1 - F.batch_dot(disagg_embeds, agg_embeds)
        else:
            agg_embeds = agg_embeds.expand_dims(axis=1)
            stability_constant = 1e-7
            e_loss = F.norm(
                agg_embeds - disagg_embeds + stability_constant, axis=2
            )
            e_loss = F.square(e_loss)

    if self.self_supervised_penalty > 0:
        total_f_loss = (
            F.sum(f_loss)
            / weighted_loss.shape[0]
            / len(self.hierarchy_agg_dict)
        )
        total_loss = total_loss + total_f_loss * F.array(
            [self.self_supervised_penalty]
        )

    if self.embedding_agg_penalty > 0:
        total_e_loss = F.sum(e_loss) / len(self.hierarchy_agg_dict)
        total_loss = total_loss + total_e_loss * F.array(
            [self.embedding_agg_penalty]
        )

    # print forecasting/reconciliation loss at each step
    if self.print_rec_penalty:
        if self.self_supervised_penalty > 0:
            print_string = (
                print_string
                + f', Self-supervised Loss: {total_f_loss.asscalar()}'
            )
        if self.embedding_agg_penalty > 0:
            print_string = (
                print_string
                + f', Embedding agg Loss: {total_e_loss.asscalar()}'
            )
        print(print_string)

    return total_loss, loss
def exact_inference(
    self, x_train: Tensor, y_train: Tensor, x_test: Tensor
) -> Tuple[Tensor, Tensor, Tensor]:
    """
    Parameters
    ----------
    x_train
        Training set of features of shape
        (batch_size, context_length, num_features).
    y_train
        Training labels of shape (batch_size, context_length).
    x_test
        Test set of features of shape
        (batch_size, prediction_length, num_features).

    Returns
    -------
    Tuple
        Tensor
            Predictive GP samples of shape
            (batch_size, prediction_length, num_samples).
        Tensor
            Predictive mean of the GP of shape
            (batch_size, prediction_length).
        Tensor
            Predictive standard deviation of the GP of shape
            (batch_size, prediction_length).
    """
    assert (
        self.context_length is not None
    ), "The value of `context_length` must be set."
    assert (
        self.prediction_length is not None
    ), "The value of `prediction_length` must be set."

    # Compute Cholesky factorization of training kernel matrix
    l_train = self._compute_cholesky_gp(
        self.kernel.kernel_matrix(x_train, x_train), self.context_length
    )

    lower_tri_solve = self.F.linalg.trsm(
        l_train, self.kernel.kernel_matrix(x_train, x_test)
    )
    predictive_mean = self.F.linalg.gemm2(
        lower_tri_solve,
        self.F.linalg.trsm(l_train, y_train.expand_dims(axis=-1)),
        transpose_a=True,
    ).squeeze(axis=-1)

    # Can rewrite second term as
    # :math:`||L^{-1} K(x_train, x_test)||_2^2`
    # and only solve 1 equation
    predictive_covariance = self.kernel.kernel_matrix(
        x_test, x_test
    ) - self.F.linalg.gemm2(
        lower_tri_solve, lower_tri_solve, transpose_a=True
    )

    # Extract diagonal entries of covariance matrix
    predictive_std = batch_diagonal(
        self.F,
        predictive_covariance,
        self.prediction_length,
        self.ctx,
        self.float_type,
    )

    # If self.sample_noise = True, predictive covariance has sigma^2 on
    # the diagonal
    if self.sample_noise:
        predictive_std = self.F.broadcast_add(
            predictive_std, self.sigma ** 2
        )
    predictive_std = self.F.sqrt(predictive_std).squeeze(axis=-1)

    # Compute sample from GP predictive distribution
    return (
        self.sample(predictive_mean, predictive_covariance),
        predictive_mean,
        predictive_std,
    )
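# exact_inference follows the standard GP regression equations: with
# K = k(X, X) + sigma^2 I and L = chol(K),
#   mean = k(X, X*)^T K^{-1} y,
#   cov  = k(X*, X*) - k(X, X*)^T K^{-1} k(X, X*),
# both evaluated via solves against L. A single-batch numpy sketch with an
# RBF kernel (illustrative only; np.linalg.solve stands in for the
# triangular solve used above):
import numpy as np

def rbf(a, b, length_scale=1.0):
    d2 = np.sum((a[:, None, :] - b[None, :, :]) ** 2, axis=-1)
    return np.exp(-0.5 * d2 / length_scale ** 2)

x_train = np.linspace(0, 1, 10)[:, None]
y_train = np.sin(2 * np.pi * x_train[:, 0])
x_test = np.linspace(0, 1, 5)[:, None]
noise = 1e-2

L = np.linalg.cholesky(rbf(x_train, x_train) + noise * np.eye(len(x_train)))
k_star = rbf(x_train, x_test)
v = np.linalg.solve(L, k_star)          # L^{-1} k(X, X*)
alpha = np.linalg.solve(L, y_train)     # L^{-1} y
predictive_mean = v.T @ alpha
predictive_cov = rbf(x_test, x_test) - v.T @ v
predictive_std = np.sqrt(np.clip(np.diag(predictive_cov), 0.0, None))
print(predictive_mean, predictive_std)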
def kalman_filter_step(
    F,
    target: Tensor,
    prior_mean: Tensor,
    prior_cov: Tensor,
    emission_coeff: Tensor,
    residual: Tensor,
    noise_std: Tensor,
    latent_dim: int,
    output_dim: int,
):
    """
    One step of the Kalman filter.

    This function computes the filtered state (mean and covariance) given the
    linear system coefficients, the prior state (mean and variance),
    as well as observations.

    Parameters
    ----------
    F
    target
        Observations of the system output, shape (batch_size, output_dim)
    prior_mean
        Prior mean of the latent state, shape (batch_size, latent_dim)
    prior_cov
        Prior covariance of the latent state, shape
        (batch_size, latent_dim, latent_dim)
    emission_coeff
        Emission coefficient, shape (batch_size, output_dim, latent_dim)
    residual
        Residual component, shape (batch_size, output_dim)
    noise_std
        Standard deviation of the output noise, shape (batch_size, output_dim)
    latent_dim
        Dimension of the latent state vector
    output_dim
        Dimension of the system output vector

    Returns
    -------
    Tensor
        Filtered_mean, shape (batch_size, latent_dim)
    Tensor
        Filtered_covariance, shape (batch_size, latent_dim, latent_dim)
    Tensor
        Log probability, shape (batch_size, )
    """
    # output_mean: mean of the target (batch_size, obs_dim)
    output_mean = F.linalg_gemm2(
        emission_coeff, prior_mean.expand_dims(axis=-1)
    ).squeeze(axis=-1)

    # noise covariance
    noise_cov = make_nd_diag(F=F, x=noise_std * noise_std, d=output_dim)

    S_hh_x_A_tr = F.linalg_gemm2(prior_cov, emission_coeff, transpose_b=True)

    # covariance of the target
    output_cov = F.linalg_gemm2(emission_coeff, S_hh_x_A_tr) + noise_cov

    # compute the Cholesky decomposition output_cov = LL^T
    L_output_cov = F.linalg_potrf(output_cov)

    # Compute Kalman gain matrix K:
    # K = S_hh X with X = A^T output_cov^{-1}
    # We have X = A^T output_cov^{-1} => X output_cov = A^T => X LL^T = A^T
    # We can thus obtain X by solving two linear systems involving L
    kalman_gain = F.linalg_trsm(
        L_output_cov,
        F.linalg_trsm(
            L_output_cov, S_hh_x_A_tr, rightside=True, transpose=True
        ),
        rightside=True,
    )

    # compute the error
    target_minus_residual = target - residual
    delta = target_minus_residual - output_mean

    # filtered estimates
    filtered_mean = prior_mean.expand_dims(axis=-1) + F.linalg_gemm2(
        kalman_gain, delta.expand_dims(axis=-1)
    )
    filtered_mean = filtered_mean.squeeze(axis=-1)

    # Joseph's symmetrized update for covariance:
    ImKA = F.broadcast_sub(
        F.eye(latent_dim), F.linalg_gemm2(kalman_gain, emission_coeff)
    )
    filtered_cov = F.linalg_gemm2(
        ImKA, F.linalg_gemm2(prior_cov, ImKA, transpose_b=True)
    ) + F.linalg_gemm2(
        kalman_gain, F.linalg_gemm2(noise_cov, kalman_gain, transpose_b=True)
    )

    # likelihood term: (batch_size,)
    log_p = MultivariateGaussian(output_mean, L_output_cov).log_prob(
        target_minus_residual
    )

    return filtered_mean, filtered_cov, log_p
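# kalman_filter_step is one textbook Kalman update: predict the output from
# the prior, form the gain K = P A^T (A P A^T + R)^{-1}, correct the mean with
# the innovation, and update the covariance with the Joseph form (which keeps
# it symmetric and PSD). A single-example numpy sketch (illustrative only):
import numpy as np

latent_dim, output_dim = 2, 1
A = np.array([[1.0, 0.5]])            # emission_coeff (output_dim x latent_dim)
R = np.array([[0.1]])                 # observation noise covariance
prior_mean = np.zeros(latent_dim)
prior_cov = np.eye(latent_dim)
target = np.array([1.3])

output_mean = A @ prior_mean
S = A @ prior_cov @ A.T + R           # innovation covariance
K = prior_cov @ A.T @ np.linalg.inv(S)

filtered_mean = prior_mean + K @ (target - output_mean)
I_KA = np.eye(latent_dim) - K @ A
filtered_cov = I_KA @ prior_cov @ I_KA.T + K @ R @ K.T   # Joseph form
print(filtered_mean, filtered_cov)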
def hybrid_forward(
    self,
    F,
    feat_static_cat: Tensor,
    past_target: Tensor,
    past_observed_values: Tensor,
    past_time_feat: Tensor,
    future_time_feat: Tensor,
    scale: Tensor,
) -> Tensor:
    """
    Computes prediction samples for the wavenet model.

    Parameters
    ----------
    F
    feat_static_cat
        Static categorical features: (batch_size, num_cat_features)
    past_target
        Past target: (batch_size, receptive_field)
    past_observed_values
        Observed value indicator for the past target:
        (batch_size, receptive_field)
    past_time_feat
        Past time features: (batch_size, num_time_features, receptive_field)
    future_time_feat
        Future time features: (batch_size, num_time_features, pred_length)
    scale
        scale of the time series: (batch_size, 1)

    Returns
    -------
    Tensor
        Prediction samples with shape (batch_size, num_samples, pred_length)
    """

    def blow_up(u):
        """
        Expand to (batch_size x num_samples)
        """
        return F.repeat(u, repeats=self.num_samples, axis=0)

    past_target = past_target.astype("int32")
    full_features = self.get_full_features(
        F,
        feat_static_cat=feat_static_cat,
        past_observed_values=past_observed_values,
        past_time_feat=past_time_feat,
        future_time_feat=future_time_feat,
        future_observed_values=None,
        scale=scale,
    )

    # To compute queues for the first step, we need features from
    # -self.pred_length - self.receptive_field + 1 to -self.pred_length + 1
    features_end_ix = (
        -self.pred_length + 1 if self.pred_length > 1 else None
    )
    queues = self.get_initial_conv_queues(
        F,
        past_target=F.slice_axis(
            past_target, begin=-self.receptive_field, end=None, axis=-1
        ),
        features=F.slice_axis(
            full_features,
            begin=-self.pred_length - self.receptive_field + 1,
            end=features_end_ix,
            axis=-1,
        ),
    )
    queues = [blow_up(queue) for queue in queues]

    res = F.slice_axis(past_target, begin=-2, end=None, axis=-1)
    res = blow_up(res)

    for n in range(self.pred_length):
        # Generate one-step ahead predictions. The input consists of target
        # and features corresponding to the last two time steps.
        current_target = F.slice_axis(res, begin=-2, end=None, axis=-1)
        current_features = F.slice_axis(
            full_features,
            begin=self.receptive_field + n - 1,
            end=self.receptive_field + n + 1,
            axis=-1,
        )
        embedding = self.target_feature_embedding(
            F,
            target=current_target,
            features=blow_up(current_features),
        )

        # (batch_size, 1, num_bins) where 1 corresponds to the time axis.
        unnormalized_outputs, queues = self.base_net(
            F, embedding, one_step_prediction=True, queues=queues
        )

        if self.temperature > 0:
            # (batch_size, 1, num_bins) where 1 corresponds to the time axis.
            probs = F.softmax(
                unnormalized_outputs / self.temperature, axis=-1
            )
            # (batch_size, 1)
            y = F.sample_multinomial(probs)
        else:
            # (batch_size, 1)
            y = F.argmax(unnormalized_outputs, axis=-1)
        y = y.astype("int32")
        res = F.concat(res, y, num_args=2, dim=-1)

    samples = F.slice_axis(res, begin=-self.pred_length, end=None, axis=-1)
    samples = samples.reshape(
        shape=(-1, self.num_samples, self.pred_length)
    )
    samples = self.post_transform(samples)
    samples = F.broadcast_mul(scale.expand_dims(axis=1), samples)

    return samples
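# The sampling loop above either takes the argmax of the bin logits or, when
# temperature > 0, samples from softmax(logits / temperature); higher
# temperatures flatten the distribution and increase sample diversity. A numpy
# sketch of that choice (illustrative only):
import numpy as np

def sample_bin(logits, temperature, rng=np.random.default_rng(0)):
    if temperature > 0:
        scaled = logits / temperature
        probs = np.exp(scaled - scaled.max())
        probs /= probs.sum()
        return int(rng.choice(len(logits), p=probs))
    return int(np.argmax(logits))

logits = np.array([0.1, 2.0, 0.3])
print(sample_bin(logits, temperature=0.0))  # always bin 1
print(sample_bin(logits, temperature=1.0))  # usually bin 1, sometimes others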
def hybrid_forward(self, F, x: Tensor) -> Tuple[Tensor]:
    return (self.value * F.ones_like(x.sum(axis=-1)),)
def s(low: Tensor, high: Tensor) -> Tensor:
    raw_samples = self.F.sample_uniform(
        low=low.zeros_like(), high=high.ones_like(), dtype=dtype
    )
    return low + raw_samples * (high - low)
def unroll_encoder(
    self,
    F,
    feat_static_cat: Tensor,  # (batch_size, num_features)
    feat_static_real: Tensor,  # (batch_size, num_features)
    past_time_feat: Tensor,  # (batch_size, history_length, num_features)
    past_target: Tensor,  # (batch_size, history_length, *target_shape)
    past_observed_values: Tensor,  # (batch_size, history_length, *target_shape)
    future_time_feat: Optional[
        Tensor
    ],  # (batch_size, prediction_length, num_features)
    future_target: Optional[
        Tensor
    ],  # (batch_size, prediction_length, *target_shape)
) -> Tuple[Tensor, List, Tensor, Tensor]:
    """
    Unrolls the LSTM encoder over past and, if present, future data.
    Returns outputs and state of the encoder, plus the scale of past_target
    and a vector of static features that was constructed and fed as input
    to the encoder. All tensor arguments should have NTC layout.
    """

    if future_time_feat is None or future_target is None:
        time_feat = past_time_feat.slice_axis(
            axis=1,
            begin=self.history_length - self.context_length,
            end=None,
        )
        sequence = past_target
        sequence_length = self.history_length
        subsequences_length = self.context_length
    else:
        time_feat = F.concat(
            past_time_feat.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            ),
            future_time_feat,
            dim=1,
        )
        sequence = F.concat(past_target, future_target, dim=1)
        sequence_length = self.history_length + self.prediction_length
        subsequences_length = self.context_length + self.prediction_length

    # (batch_size, sub_seq_len, *target_shape, num_lags)
    lags = self.get_lagged_subsequences(
        F=F,
        sequence=sequence,
        sequence_length=sequence_length,
        indices=self.lags_seq,
        subsequences_length=subsequences_length,
    )

    # scale is computed on the context length last units of the past target
    # scale shape is (batch_size, 1, *target_shape)
    _, scale = self.scaler(
        past_target.slice_axis(
            axis=1, begin=-self.context_length, end=None
        ),
        past_observed_values.slice_axis(
            axis=1, begin=-self.context_length, end=None
        ),
    )

    # (batch_size, num_features)
    embedded_cat = self.embedder(feat_static_cat)

    # in addition to embedding features, use the log scale as it can help
    # prediction too
    # (batch_size, num_features + prod(target_shape))
    static_feat = F.concat(
        embedded_cat,
        feat_static_real,
        F.log(scale)
        if len(self.target_shape) == 0
        else F.log(scale.squeeze(axis=1)),
        dim=1,
    )

    # (batch_size, subsequences_length, num_features + 1)
    repeated_static_feat = static_feat.expand_dims(axis=1).repeat(
        axis=1, repeats=subsequences_length
    )

    # (batch_size, sub_seq_len, *target_shape, num_lags)
    lags_scaled = F.broadcast_div(lags, scale.expand_dims(axis=-1))

    # from (batch_size, sub_seq_len, *target_shape, num_lags)
    # to (batch_size, sub_seq_len, prod(target_shape) * num_lags)
    input_lags = F.reshape(
        data=lags_scaled,
        shape=(
            -1,
            subsequences_length,
            len(self.lags_seq) * prod(self.target_shape),
        ),
    )

    # (batch_size, sub_seq_len, input_dim)
    inputs = F.concat(input_lags, time_feat, repeated_static_feat, dim=-1)

    # unroll encoder
    outputs, state = self.rnn.unroll(
        inputs=inputs,
        length=subsequences_length,
        layout="NTC",
        merge_outputs=True,
    )

    # outputs: (batch_size, seq_len, num_cells)
    # state: list of (batch_size, num_cells) tensors
    # scale: (batch_size, 1, *target_shape)
    # static_feat: (batch_size, num_features + prod(target_shape))
    return outputs, state, scale, static_feat
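# get_lagged_subsequences gathers, for every position of the unrolled window,
# the target values that lie `lag` steps in the past, producing a
# (batch, subsequences_length, num_lags) block that is then divided by the
# scale. A numpy sketch of the gathering step (illustrative only):
import numpy as np

def lagged_subsequences(sequence, indices, subsequences_length):
    # sequence: (batch, sequence_length); indices: list of lags, e.g. [1, 7]
    lagged = []
    for lag in indices:
        begin = -lag - subsequences_length
        end = -lag if lag > 0 else None
        lagged.append(sequence[:, begin:end])
    return np.stack(lagged, axis=-1)   # (batch, subsequences_length, num_lags)

seq = np.arange(10.0)[None, :]         # one series with values 0..9
print(lagged_subsequences(seq, indices=[1, 2], subsequences_length=3))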
def process_dynamic_cat(self, F, feature: Tensor) -> Tensor:
    return self.embed_dynamic(feature.astype(self.dtype))
def hybrid_forward(
    self,
    F,
    feat_static_cat: Tensor,
    feat_static_real: Tensor,
    past_time_feat: Tensor,
    past_target: Tensor,
    past_observed_values: Tensor,
    future_time_feat: Tensor,
    future_target: Tensor,
    future_observed_values: Tensor,
) -> Tensor:
    """
    Computes the loss for training DeepAR; all input tensors representing
    time series have NTC layout.

    Parameters
    ----------
    F
    feat_static_cat : (batch_size, num_features)
    feat_static_real : (batch_size, num_features)
    past_time_feat : (batch_size, history_length, num_features)
    past_target : (batch_size, history_length, *target_shape)
    past_observed_values : (batch_size, history_length, *target_shape)
    future_time_feat : (batch_size, prediction_length, num_features)
    future_target : (batch_size, prediction_length, *target_shape)
    future_observed_values : (batch_size, prediction_length, *target_shape)

    Returns
    -------
    loss with shape (batch_size, context + prediction_length, 1)
    """
    distr = self.distribution(
        feat_static_cat=feat_static_cat,
        feat_static_real=feat_static_real,
        past_time_feat=past_time_feat,
        past_target=past_target,
        past_observed_values=past_observed_values,
        future_time_feat=future_time_feat,
        future_target=future_target,
        future_observed_values=future_observed_values,
    )

    # put together target sequence
    # (batch_size, seq_len, *target_shape)
    target = F.concat(
        past_target.slice_axis(
            axis=1,
            begin=self.history_length - self.context_length,
            end=None,
        ),
        future_target,
        dim=1,
    )

    # (batch_size, seq_len)
    loss = distr.loss(target)

    # (batch_size, seq_len, *target_shape)
    observed_values = F.concat(
        past_observed_values.slice_axis(
            axis=1,
            begin=self.history_length - self.context_length,
            end=self.history_length,
        ),
        future_observed_values,
        dim=1,
    )

    # mask the loss at one time step iff one or more observations is missing
    # in the target dimensions
    # (batch_size, seq_len)
    loss_weights = (
        observed_values
        if (len(self.target_shape) == 0)
        else observed_values.min(axis=-1, keepdims=False)
    )

    weighted_loss = weighted_average(
        F=F, x=loss, weights=loss_weights, axis=1
    )

    return weighted_loss, loss
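# The per-time-step losses are averaged with the observation indicator as
# weights, so missing targets do not contribute to the training objective. A
# numpy sketch of that weighted average (illustrative only, mirroring the role
# of `weighted_average`; the library's implementation may differ in details):
import numpy as np

def masked_weighted_average(x, weights, axis):
    # Sum of weighted terms divided by the total weight (guarding against 0).
    total = np.maximum(weights.sum(axis=axis), 1.0)
    return (x * weights).sum(axis=axis) / total

loss = np.array([[1.0, 2.0, 3.0]])
observed = np.array([[1.0, 0.0, 1.0]])   # the middle step is missing
print(masked_weighted_average(loss, observed, axis=1))  # [2.]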
def kalman_filter(
    self, targets: Tensor, observed: Tensor
) -> Tuple[Tensor, ...]:
    """
    Performs Kalman filtering given observations.

    Parameters
    ----------
    targets
        Observations, shape (batch_size, seq_length, output_dim)
    observed
        Flag tensor indicating which observations are genuine (1.0) and
        which are missing (0.0)

    Returns
    -------
    Tensor
        Log probabilities, shape (batch_size, seq_length)
    Tensor
        Mean of p(l_T | l_{T-1}), where T is seq_length, with shape
        (batch_size, latent_dim)
    Tensor
        Covariance of p(l_T | l_{T-1}), where T is seq_length, with shape
        (batch_size, latent_dim, latent_dim)
    """
    F = self.F
    # targets[t]: (batch_size, obs_dim)
    targets = targets.split(
        axis=1, num_outputs=self.seq_length, squeeze_axis=True
    )

    log_p_seq = []

    mean = self.prior_mean
    cov = self.prior_cov

    observed = (
        observed.split(
            axis=1, num_outputs=self.seq_length, squeeze_axis=True
        )
        if observed is not None
        else None
    )

    for t in range(self.seq_length):
        # Compute the filtered distribution
        #   p(l_t | z_1, ..., z_{t + 1})
        # and log - probability
        #   log p(z_t | z_0, z_{t - 1})
        filtered_mean, filtered_cov, log_p = kalman_filter_step(
            F,
            target=targets[t],
            prior_mean=mean,
            prior_cov=cov,
            emission_coeff=self.emission_coeff[t],
            residual=self.residuals[t],
            noise_std=self.noise_std[t],
            latent_dim=self.latent_dim,
            output_dim=self.output_dim,
        )

        log_p_seq.append(log_p.expand_dims(axis=1))

        # Mean of p(l_{t+1} | l_t)
        mean = F.linalg_gemm2(
            self.transition_coeff[t],
            (
                filtered_mean.expand_dims(axis=-1)
                if observed is None
                else F.where(
                    observed[t], x=filtered_mean, y=mean
                ).expand_dims(axis=-1)
            ),
        ).squeeze(axis=-1)

        # Covariance of p(l_{t+1} | l_t)
        cov = F.linalg_gemm2(
            self.transition_coeff[t],
            F.linalg_gemm2(
                (
                    filtered_cov
                    if observed is None
                    else F.where(observed[t], x=filtered_cov, y=cov)
                ),
                self.transition_coeff[t],
                transpose_b=True,
            ),
        ) + F.linalg_gemm2(
            self.innovation_coeff[t],
            self.innovation_coeff[t],
            transpose_a=True,
        )

    # Return sequence of log likelihoods, as well as
    # final mean and covariance of p(l_T | l_{T-1}) where T is seq_length
    return F.concat(*log_p_seq, dim=1), mean, cov
def create_network_input(
    self,
    F,
    feat_static_cat: Tensor,  # (batch_size, num_features)
    past_time_feat: Tensor,  # (batch_size, num_features, history_length)
    past_target: Tensor,  # (batch_size, history_length, 1)
    past_observed_values: Tensor,  # (batch_size, history_length)
    future_time_feat: Optional[
        Tensor
    ],  # (batch_size, num_features, prediction_length)
    future_target: Optional[Tensor],  # (batch_size, prediction_length)
) -> Tuple[Tensor, Tensor, Tensor]:
    """
    Creates inputs for the transformer network.
    All tensor arguments should have NTC layout.
    """

    if future_time_feat is None or future_target is None:
        time_feat = past_time_feat.slice_axis(
            axis=1,
            begin=self.history_length - self.context_length,
            end=None,
        )
        sequence = past_target
        sequence_length = self.history_length
        subsequences_length = self.context_length
    else:
        time_feat = F.concat(
            past_time_feat.slice_axis(
                axis=1,
                begin=self.history_length - self.context_length,
                end=None,
            ),
            future_time_feat,
            dim=1,
        )
        sequence = F.concat(past_target, future_target, dim=1)
        sequence_length = self.history_length + self.prediction_length
        subsequences_length = self.context_length + self.prediction_length

    # (batch_size, sub_seq_len, *target_shape, num_lags)
    lags = self.get_lagged_subsequences(
        F=F,
        sequence=sequence,
        sequence_length=sequence_length,
        indices=self.lags_seq,
        subsequences_length=subsequences_length,
    )

    # scale is computed on the context length last units of the past target
    # scale shape is (batch_size, 1, *target_shape)
    _, scale = self.scaler(
        past_target.slice_axis(
            axis=1, begin=-self.context_length, end=None
        ),
        past_observed_values.slice_axis(
            axis=1, begin=-self.context_length, end=None
        ),
    )

    embedded_cat = self.embedder(feat_static_cat)

    # in addition to embedding features, use the log scale as it can help
    # prediction too
    # (batch_size, num_features + prod(target_shape))
    static_feat = F.concat(
        embedded_cat,
        F.log(scale)
        if len(self.target_shape) == 0
        else F.log(scale.squeeze(axis=1)),
        dim=1,
    )

    repeated_static_feat = static_feat.expand_dims(axis=1).repeat(
        axis=1, repeats=subsequences_length
    )

    # (batch_size, sub_seq_len, *target_shape, num_lags)
    lags_scaled = F.broadcast_div(lags, scale.expand_dims(axis=-1))

    # from (batch_size, sub_seq_len, *target_shape, num_lags)
    # to (batch_size, sub_seq_len, prod(target_shape) * num_lags)
    input_lags = F.reshape(
        data=lags_scaled,
        shape=(
            -1,
            subsequences_length,
            len(self.lags_seq) * prod(self.target_shape),
        ),
    )

    # (batch_size, sub_seq_len, input_dim)
    inputs = F.concat(input_lags, time_feat, repeated_static_feat, dim=-1)

    return inputs, scale, static_feat
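# Both encoders above scale the lagged inputs by a per-series scale computed
# on the context window and also feed log(scale) as a static feature. A numpy
# sketch of a simple mean-absolute scaler over observed values (illustrative
# only; the library's scaler may differ in details):
import numpy as np

def mean_scale(target, observed, min_scale=1e-10):
    # target, observed: (batch, context_length)
    num_observed = np.maximum(observed.sum(axis=1, keepdims=True), 1.0)
    scale = np.abs(target * observed).sum(axis=1, keepdims=True) / num_observed
    return np.maximum(scale, min_scale)

target = np.array([[2.0, 4.0, 0.0, 6.0]])
observed = np.array([[1.0, 1.0, 0.0, 1.0]])
scale = mean_scale(target, observed)
print(scale, target / scale)   # scale = [[4.]]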