def distribution(
    self,
    feat_static_cat: Tensor,
    feat_static_real: Tensor,
    past_time_feat: Tensor,
    past_target: Tensor,
    past_observed_values: Tensor,
    future_time_feat: Tensor,
    future_target: Tensor,
    future_observed_values: Tensor,
    return_rnn_outputs: bool = False,
) -> Union[Distribution, Tuple[Distribution, Tensor]]:
    """
    Returns the distribution predicted by the model on the range of
    past_target and future_target.

    The distribution is obtained by unrolling the network with the true
    target; this is also the distribution that is minimized during
    training. This can be used in anomaly detection, see for instance
    examples/anomaly_detection.py.

    Input arguments are the same as for the hybrid_forward method.

    Returns
    -------
    Distribution
        A distribution object whose mean has shape
        (batch_size, context_length + prediction_length).
    Tensor (optional)
        When return_rnn_outputs=True, rnn_outputs is returned as well, so
        that it can be used for regularization.
    """
    # unroll the decoder in "training mode", i.e. by providing future data as well
    F = getF(feat_static_cat)
    rnn_outputs, _, scale, _ = self.unroll_encoder(
        F=F,
        feat_static_cat=feat_static_cat,
        feat_static_real=feat_static_real,
        past_time_feat=past_time_feat,
        past_target=past_target,
        past_observed_values=past_observed_values,
        future_time_feat=future_time_feat,
        future_target=future_target,
    )

    distr_args = self.proj_distr_args(rnn_outputs)

    # Return the output of the RNN layers when return_rnn_outputs=True, so
    # that it can be used for regularization later. We assume no dropout on
    # the outputs, so they can be used directly for activation regularization.
    return (
        (
            self.distr_output.distribution(distr_args, scale=scale),
            rnn_outputs,
        )
        if return_rnn_outputs
        else self.distr_output.distribution(distr_args, scale=scale)
    )
def distribution(
    self,
    cond_mean: Tensor,
    interval_alpha_bias: Optional[Tensor] = None,
    size_alpha_bias: Optional[Tensor] = None,
) -> Tuple[Distribution, ...]:
    F = getF(cond_mean)
    cond_interval, cond_size = F.split(cond_mean, num_outputs=2, axis=-1)

    alpha_biases = [
        F.broadcast_mul(F.ones_like(cond_interval), bias)
        if bias is not None
        else None
        for bias in [interval_alpha_bias, size_alpha_bias]
    ]

    distr_params = zip(
        [self.interval_distr_output, self.size_distr_output],
        [cond_interval, cond_size],
        alpha_biases,
    )

    return tuple(
        (
            do.distribution(mean)
            if len(do.args_dim) == 1
            else do.distribution(
                [mean, F.Activation(alpha_bias, "softrelu") + 1e-5]
            )
        )
        for do, mean, alpha_bias in distr_params
    )
def get_issm_coeff(
    self, seasonal_indicators: Tensor  # (batch_size, time_length)
) -> Tuple[Tensor, Tensor, Tensor]:
    F = getF(seasonal_indicators)
    emission_coeff_ls, transition_coeff_ls, innovation_coeff_ls = zip(
        self.nonseasonal_issm.get_issm_coeff(seasonal_indicators),
        *[
            issm.get_issm_coeff(
                seasonal_indicators.slice_axis(axis=-1, begin=ix, end=ix + 1)
            )
            for ix, issm in enumerate(self.seasonal_issms)
        ],
    )

    # stack emission and innovation coefficients
    emission_coeff = F.concat(*emission_coeff_ls, dim=-1)
    innovation_coeff = F.concat(*innovation_coeff_ls, dim=-1)

    # the transition coefficient is block diagonal
    transition_coeff = _make_block_diagonal(transition_coeff_ls)

    return emission_coeff, transition_coeff, innovation_coeff
def __init__(
    self,
    amplitude: Tensor,
    length_scale: Tensor,
    frequency: Tensor,
    F=None,
) -> None:
    """
    Parameters
    ----------
    amplitude : Tensor
        Periodic kernel amplitude hyper-parameter of shape
        (batch_size, 1, 1).
    length_scale : Tensor
        Periodic kernel length scale hyper-parameter of shape
        (batch_size, 1, 1).
    frequency : Tensor
        Periodic kernel frequency hyper-parameter of shape
        (batch_size, 1, 1).
    F : ModuleType
        A module that can either refer to the Symbol API or the NDArray
        API in MXNet.
    """
    self.F = F if F else getF(amplitude)
    self.amplitude = amplitude
    self.length_scale = length_scale
    self.frequency = frequency
def s(xi: Tensor, beta: Tensor) -> Tensor:
    F = getF(xi)
    sample_U = uniform.Uniform(
        F.zeros_like(xi), F.ones_like(xi)
    ).sample()
    boxcox = box_cox_transform.BoxCoxTransform(-xi, F.array([0]))
    sample_X = -1 * boxcox.f(1 - sample_U) * beta
    return sample_X
def log_survival(self, x: Tensor) -> Tensor:
    r"""
    Logarithm of the survival function
    :math:`\log S(x) = \log(1 - CDF(x))`.

    We define :math:`z = (\log(x) - \mu) / \sigma` and obtain the survival
    function as :math:`S(x) = sigmoid(-z)`, or equivalently
    :math:`\log S(x) = -\log(1 + \exp(z))`.
    """
    log_x = x.clip(1e-20, np.inf).log()
    z = (log_x - self.mu) / self.sigma
    F = getF(x)
    return -F.Activation(z, "softrelu")
def log_intensity(self, x: Tensor) -> Tensor:
    r"""
    Logarithm of the intensity (a.k.a. hazard) function.

    The intensity is defined as :math:`\lambda(x) = p(x) / S(x)`.

    We define :math:`z = (\log(x) - \mu) / \sigma` and obtain the intensity
    as :math:`\lambda(x) = sigmoid(z) / (\sigma \cdot x)`, or equivalently
    :math:`\log \lambda(x) = z - \log(1 + \exp(z)) - \log(\sigma) - \log(x)`.
    """
    log_x = x.clip(1e-20, np.inf).log()
    z = (log_x - self.mu) / self.sigma
    F = getF(x)
    return z - self.sigma.log() - F.Activation(z, "softrelu") - log_x
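# A small plain-numpy sketch (illustrative only, not part of the class)
# checking that the closed form above agrees with lambda(x) = p(x) / S(x)
# when log(x) ~ Logistic(mu, sigma), i.e. CDF(x) = sigmoid(z), S(x) = sigmoid(-z).
import numpy as np

mu, sigma, x = 0.3, 0.7, 2.5
z = (np.log(x) - mu) / sigma
S = 1.0 / (1.0 + np.exp(z))             # survival function sigmoid(-z)
p = np.exp(z) * S ** 2 / (sigma * x)    # density d/dx sigmoid(z)
log_lam = z - np.log(sigma) - np.log1p(np.exp(z)) - np.log(x)
assert np.isclose(np.log(p / S), log_lam)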
def emission_coeff(
    self, feature: Tensor  # (batch_size, time_length, 1)
) -> Tensor:
    F = getF(feature)
    _emission_coeff = F.ones(shape=(1, 1, 1, self.latent_dim()))

    # get the right shape: (batch_size, time_length, obs_dim, latent_dim)
    zeros = _broadcast_param(
        feature.squeeze(axis=2),
        axes=[2, 3],
        sizes=[1, self.latent_dim()],
    )

    return _emission_coeff.broadcast_like(zeros)
def log_intensity(self, y: Tensor) -> Tensor:
    r"""
    Logarithm of the intensity (a.k.a. hazard) function.

    The intensity is defined as :math:`\lambda(y) = p(y) / S(y)`.
    """
    F = getF(y)
    lp = 0.0
    x = y
    for t in self.transforms[::-1]:
        x = t.f_inv(y)
        ladj = t.log_abs_det_jac(x, y)
        lp -= sum_trailing_axes(F, ladj, self.event_dim - t.event_dim)
        y = x
    return self.base_distribution.log_intensity(x) + lp
def forwardshift(A):
    """
    Shift an array's content forward by one time step along the time axis
    (axis 1), keeping the shape identical by padding on the left with zeros.

    Parameters
    ----------
    A : nd.NDArray
        Shape (N, T, ...), the tensor whose entries will be shifted forward
        by one time step.
    """
    F = getF(A)
    A = F.Concat(
        F.zeros_like(F.slice_axis(A, axis=1, begin=0, end=1)), A, dim=1
    )
    return F.slice_axis(A, axis=1, begin=0, end=-1)
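# A minimal usage sketch (illustrative only), assuming an MXNet NDArray input
# of shape (N, T, ...): the last time step is dropped and a zero is prepended.
import mxnet as mx

A = mx.nd.array([[[1.0], [2.0], [3.0]]])  # shape (1, 3, 1)
shifted = forwardshift(A)                 # [[[0.], [1.], [2.]]]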
def sample(
    self,
    num_samples=None,
    dtype=np.float32,
    lower_bound: Optional[Tensor] = None,
) -> Tensor:
    r"""
    Draw samples from the distribution.

    We generate samples as :math:`u \sim Uniform(0, 1), x = S^{-1}(u)`,
    where :math:`S^{-1}` is the inverse of the survival function
    :math:`S(x) = 1 - CDF(x)`.

    Parameters
    ----------
    num_samples
        Number of samples to generate.
    dtype
        Data type of the generated samples.
    lower_bound
        If None, generate samples as usual. If lower_bound is provided,
        all generated samples will be larger than the specified values,
        that is, we sample from `p(x | x > lower_bound)`.
        Shape: `(*batch_size)`

    Returns
    -------
    x
        Sampled inter-event times. Shape: `(num_samples, *batch_size)`
    """
    F = getF(self.mu)
    if num_samples is not None:
        sample_shape = (num_samples,) + self.batch_shape
    else:
        sample_shape = self.batch_shape
    u = F.uniform(0, 1, shape=sample_shape)

    # Make sure that the generated samples are larger than lower_bound.
    # This is easy to ensure when using inverse-survival sampling: we
    # simply multiply `u ~ Uniform(0, 1)` by `S(y)` to ensure that `x > y`.
    with autograd.pause():
        if lower_bound is not None:
            survival = self.log_survival(lower_bound).exp()
            u = u * survival
        x = (self.mu + self.sigma * (F.log1p(-u) - F.log(u))).exp()
    return x
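# A plain-numpy sketch (illustrative only) of the inverse-survival sampling
# used above: for S(x) = sigmoid(-(log(x) - mu) / sigma), the inverse is
# x = exp(mu + sigma * (log1p(-u) - log(u))), so applying S to the generated
# samples recovers the uniform draws.
import numpy as np

mu, sigma = 0.5, 0.8
u = np.random.uniform(size=1000)
x = np.exp(mu + sigma * (np.log1p(-u) - np.log(u)))
z = (np.log(x) - mu) / sigma
assert np.allclose(1.0 / (1.0 + np.exp(z)), u)  # S(x) == u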
def distribution(
    self,
    feat_static_cat: Tensor,
    feat_static_real: Tensor,
    past_time_feat: Tensor,
    past_target: Tensor,
    past_observed_values: Tensor,
    future_time_feat: Tensor,
    future_target: Tensor,
    future_observed_values: Tensor,
) -> Distribution:
    """
    Returns the distribution predicted by the model on the range of
    past_target and future_target.

    The distribution is obtained by unrolling the network with the true
    target; this is also the distribution that is minimized during
    training. This can be used in anomaly detection, see for instance
    examples/anomaly_detection.py.

    Input arguments are the same as for the hybrid_forward method.

    Returns
    -------
    Distribution
        A distribution object whose mean has shape
        (batch_size, context_length + prediction_length).
    """
    # unroll the decoder in "training mode", i.e. by providing future data as well
    F = getF(feat_static_cat)
    rnn_outputs, _, scale, _ = self.unroll_encoder(
        F=F,
        feat_static_cat=feat_static_cat,
        feat_static_real=feat_static_real,
        past_time_feat=past_time_feat,
        past_target=past_target,
        past_observed_values=past_observed_values,
        future_time_feat=future_time_feat,
        future_target=future_target,
    )

    distr_args = self.proj_distr_args(rnn_outputs)

    return self.distr_output.distribution(distr_args, scale=scale)
def log_abs_det(A: Tensor) -> Tensor:
    """
    Logarithm of the absolute value of the determinant of matrix `A`.

    Parameters
    ----------
    A
        Tensor matrix from which to compute the log absolute value of its
        determinant.

    Returns
    -------
    Tensor
    """
    F = getF(A)
    A_squared = F.linalg.gemm2(A, A, transpose_a=True)
    L = F.linalg.potrf(A_squared)
    return F.diag(L, axis1=-2, axis2=-1).abs().log().sum(-1)
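# A minimal usage sketch (illustrative only), assuming an MXNet NDArray input:
# since A^T A = L L^T, the sum of the log-diagonal entries of the Cholesky
# factor L equals log|det(A)|. For diag(2, 3), |det| = 6, so the result is log(6).
import mxnet as mx

A = mx.nd.array([[[2.0, 0.0], [0.0, 3.0]]])  # shape (batch, 2, 2)
print(log_abs_det(A))                        # approx [1.7918] == log(6)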
def transition_coeff(
    self, feature: Tensor  # (batch_size, time_length, 1)
) -> Tensor:
    F = getF(feature)
    _transition_coeff = (
        F.eye(self.latent_dim()).expand_dims(axis=0).expand_dims(axis=0)
    )

    # get the right shape: (batch_size, time_length, latent_dim, latent_dim)
    zeros = _broadcast_param(
        feature.squeeze(axis=2),
        axes=[2, 3],
        sizes=[self.latent_dim(), self.latent_dim()],
    )

    return _transition_coeff.broadcast_like(zeros)
def forwardshift(A):
    """
    Shift an array's content forward by one time step along the time axis
    (axis 1), keeping the shape identical by repeating the first element.

    Parameters
    ----------
    A : nd.NDArray
        Shape (N, T, ...), the tensor whose entries will be shifted forward
        by one time step.
    """
    F = getF(A)
    return F.Concat(
        F.slice_axis(A, axis=1, begin=0, end=1),
        F.slice_axis(A, axis=1, begin=0, end=-1),
        dim=1,
    )
def _make_block_diagonal(blocks: List[Tensor]) -> Tensor:
    assert (
        len(blocks) > 0
    ), "You need at least one tensor to make a block-diagonal tensor"

    if len(blocks) == 1:
        return blocks[0]

    F = getF(blocks[0])

    # combine the blocks pairwise into a single block-diagonal tensor
    block_diagonal = _make_2_block_diagonal(F, blocks[0], blocks[1])
    for i in range(2, len(blocks)):
        block_diagonal = _make_2_block_diagonal(
            F=F, left=block_diagonal, right=blocks[i]
        )

    return block_diagonal
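# An illustrative call (shapes assumed, mirroring the transition coefficients
# above): if the helper composes blocks along the trailing two dimensions,
# stacking a (..., 2, 2) block and a (..., 3, 3) block is expected to yield a
# (..., 5, 5) block-diagonal tensor.
import mxnet as mx

left = mx.nd.ones((1, 1, 2, 2))
right = mx.nd.ones((1, 1, 3, 3))
out = _make_block_diagonal([left, right])  # expected shape: (1, 1, 5, 5)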
def emission_coeff(
    self, seasonal_indicators: Tensor  # (batch_size, time_length)
) -> Tensor:
    F = getF(seasonal_indicators)
    _emission_coeff = F.ones(shape=(1, 1, 1, self.latent_dim()))

    # get the right shape: (batch_size, seq_length, obs_dim, latent_dim)
    zeros = _broadcast_param(
        F.zeros_like(
            seasonal_indicators.slice_axis(
                axis=-1, begin=0, end=1
            ).squeeze(axis=-1)
        ),
        axes=[2, 3],
        sizes=[1, self.latent_dim()],
    )

    return _emission_coeff.broadcast_like(zeros)
def transition_coeff(
    self, seasonal_indicators: Tensor  # (batch_size, time_length)
) -> Tensor:
    F = getF(seasonal_indicators)
    _transition_coeff = (
        F.eye(self.latent_dim()).expand_dims(axis=0).expand_dims(axis=0)
    )

    # get the right shape: (batch_size, seq_length, latent_dim, latent_dim)
    zeros = _broadcast_param(
        F.zeros_like(
            seasonal_indicators.slice_axis(
                axis=-1, begin=0, end=1
            ).squeeze(axis=-1)
        ),
        axes=[2, 3],
        sizes=[self.latent_dim(), self.latent_dim()],
    )

    return _transition_coeff.broadcast_like(zeros)
def distr(
    self,
    rnn_outputs: Tensor,
    time_features: Tensor,
    scale: Tensor,
    lags_scaled: Tensor,
    target_dimension_indicator: Tensor,
    seq_len: int,
):
    """
    Returns the distribution of GPVAR with respect to the RNN outputs.

    Parameters
    ----------
    rnn_outputs
        Outputs of the unrolled RNN (batch_size, seq_len, num_cells)
    time_features
        Dynamic time features (batch_size, seq_len, num_features)
    scale
        Mean scale for each time series (batch_size, 1, target_dim)
    lags_scaled
        Scaled lags used for RNN input (batch_size, seq_len, target_dim, num_lags)
    target_dimension_indicator
        Indices of the target dimension (batch_size, target_dim)
    seq_len
        Length of the sequences

    Returns
    -------
    distr
        Distribution instance
    distr_args
        Distribution arguments
    """
    F = getF(rnn_outputs)

    # (batch_size, target_dim, embed_dim)
    index_embeddings = self.embed(target_dimension_indicator)

    # broadcast to (batch_size, seq_len, target_dim, embed_dim)
    repeated_index_embeddings = index_embeddings.expand_dims(axis=1).repeat(
        axis=1, repeats=seq_len
    )

    # broadcast to (batch_size, seq_len, target_dim, num_features)
    time_features = time_features.expand_dims(axis=2).repeat(
        axis=2, repeats=self.target_dim_sample
    )

    # (batch_size, seq_len, target_dim, embed_dim + num_cells + num_inputs)
    distr_input = F.concat(
        rnn_outputs, repeated_index_embeddings, time_features, dim=-1
    )

    # TODO 1 pass inputs in proj args
    distr_args = self.proj_dist_args(distr_input)

    # compute likelihood of target given the predicted parameters
    distr = self.distr_output.distribution(
        distr_args, scale=scale, dim=self.target_dim_sample
    )

    return distr, distr_args
def innovation_coeff(self, seasonal_indicators: Tensor) -> Tensor:
    F = getF(seasonal_indicators)
    # seasonal_indicators = F.modulo(seasonal_indicators - 1, self.latent_dim)
    return F.one_hot(seasonal_indicators, depth=self.latent_dim()).squeeze(
        axis=2
    )
def emission_coeff(self, seasonal_indicators: Tensor) -> Tensor:
    F = getF(seasonal_indicators)
    return F.one_hot(seasonal_indicators, depth=self.latent_dim())
def __init__(self, xi: Tensor, beta: Tensor, F=None) -> None:
    self.xi = xi
    self.beta = beta
    self.F = F if F else getF(xi)  # assuming xi and beta are of the same type
def F(self):
    return getF(self.xi)
def innovation_coeff(self, feature: Tensor) -> Tensor:
    F = getF(feature)
    return F.one_hot(feature, depth=self.latent_dim()).squeeze(axis=2)
def __init__(
    self,
    sigma: Tensor,
    kernel: Kernel,
    prediction_length: Optional[int] = None,
    context_length: Optional[int] = None,
    num_samples: Optional[int] = None,
    float_type: DType = np.float64,
    jitter_method: str = "iter",
    max_iter_jitter: int = 10,
    neg_tol: float = -1e-8,
    diag_weight: float = 1e-6,
    increase_jitter: int = 10,
    sample_noise: bool = True,
    F=None,
) -> None:
    r"""
    Parameters
    ----------
    sigma
        Noise parameter of shape (batch_size, num_data_points, 1), where
        num_data_points is the number of rows in the Cholesky matrix.
    kernel
        Kernel object.
    prediction_length
        Prediction length.
    context_length
        Training length.
    num_samples
        The number of samples to be drawn.
    float_type
        Determines whether to use single or double precision.
    jitter_method
        Whether to use the iterative jitter method or an eigenvalue
        decomposition, depending on the problem size.
    max_iter_jitter
        Maximum number of iterations for the jitter method to iteratively
        make the matrix positive definite.
    neg_tol
        Parameter in the jitter methods used to eliminate matrices with
        diagonal elements smaller than this value when checking whether a
        matrix is positive definite.
    diag_weight
        Multiple of the mean of the diagonal entries used to initialize
        the jitter.
    increase_jitter
        Factor by which the jitter is multiplied at each iteration.
    sample_noise
        Boolean that determines whether to add :math:`\sigma^2 I` to the
        predictive covariance matrix.
    F
        A module that can either refer to the Symbol API or the NDArray
        API in MXNet.
    """
    assert (
        prediction_length is None or prediction_length > 0
    ), "The value of `prediction_length` should be > 0"
    assert (
        context_length is None or context_length > 0
    ), "The value of `context_length` should be > 0"
    assert (
        num_samples is None or num_samples > 0
    ), "The value of `num_samples` should be > 0"

    self.sigma = sigma
    self.kernel = kernel
    self.prediction_length = prediction_length
    self.context_length = (
        context_length if context_length is not None else prediction_length
    )
    self.num_samples = num_samples
    self.F = F if F else getF(sigma)
    self.float_type = float_type
    self.jitter_method = jitter_method
    self.max_iter_jitter = max_iter_jitter
    self.neg_tol = neg_tol
    self.diag_weight = diag_weight
    self.increase_jitter = increase_jitter
    self.sample_noise = sample_noise
def emission_coeff(self, feature: Tensor) -> Tensor:
    F = getF(feature)
    return F.one_hot(feature, depth=self.latent_dim())