def test_deterministic_l2(mu: float, hybridize: bool) -> None:
    """
    Test to check that maximizing the likelihood recovers the parameters.
    This test uses the Gaussian distribution with fixed variance and sample
    mean. This essentially reduces to deterministic L2.
    """
    # generate samples
    mus = mx.nd.zeros(NUM_SAMPLES) + mu
    deterministic_distr = Gaussian(mu=mus, sigma=0.1 * mx.nd.ones_like(mus))
    samples = deterministic_distr.sample()

    class GaussianFixedVarianceOutput(GaussianOutput):
        @classmethod
        def domain_map(cls, F, mu, sigma):
            sigma = 0.1 * F.ones_like(sigma)
            return mu.squeeze(axis=-1), sigma.squeeze(axis=-1)

    mu_hat, _ = maximum_likelihood_estimate_sgd(
        GaussianFixedVarianceOutput(),
        samples,
        init_biases=[3 * mu, 0.1],
        hybridize=hybridize,
        num_epochs=PositiveInt(1),
    )

    assert (
        np.abs(mu_hat - mu) < TOL * mu
    ), f"mu did not match: mu = {mu}, mu_hat = {mu_hat}"
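# A minimal standalone sketch (plain NumPy, not part of the test suite) of why
# fixing sigma reduces maximum likelihood to an L2 / sample-mean problem: with
# sigma held constant, the Gaussian negative log-likelihood differs from the
# squared error only by a positive scale factor and an additive constant, so
# both are minimized by the sample mean. All names below are illustrative.
def _sketch_fixed_variance_mle_is_l2():
    import numpy as np

    rng = np.random.default_rng(0)
    x = rng.normal(loc=2.5, scale=0.1, size=1_000)
    sigma_fixed = 0.1

    def neg_log_likelihood(mu):
        # Gaussian NLL with fixed sigma, up to the 2*pi constant
        return 0.5 * np.sum((x - mu) ** 2) / sigma_fixed ** 2 + x.size * np.log(sigma_fixed)

    def l2_loss(mu):
        return np.sum((x - mu) ** 2)

    grid = np.linspace(2.0, 3.0, 1001)
    mu_nll = grid[np.argmin([neg_log_likelihood(m) for m in grid])]
    mu_l2 = grid[np.argmin([l2_loss(m) for m in grid])]
    assert np.isclose(mu_nll, mu_l2)               # same minimizer
    assert np.isclose(mu_l2, x.mean(), atol=1e-3)  # which is (close to) the sample mean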
def test_gaussian_likelihood(mu: float, sigma: float, hybridize: bool):
    """
    Test to check that maximizing the likelihood recovers the parameters.
    """
    # generate samples
    mus = mx.nd.zeros((NUM_SAMPLES,)) + mu
    sigmas = mx.nd.zeros((NUM_SAMPLES,)) + sigma

    distr = Gaussian(mus, sigmas)
    samples = distr.sample()

    init_biases = [
        mu - START_TOL_MULTIPLE * TOL * mu,
        inv_softplus(sigma - START_TOL_MULTIPLE * TOL * sigma),
    ]

    mu_hat, sigma_hat = maximum_likelihood_estimate_sgd(
        GaussianOutput(),
        samples,
        init_biases=init_biases,
        hybridize=hybridize,
        learning_rate=PositiveFloat(0.001),
        num_epochs=PositiveInt(5),
    )

    assert (
        np.abs(mu_hat - mu) < TOL * mu
    ), f"mu did not match: mu = {mu}, mu_hat = {mu_hat}"
    assert (
        np.abs(sigma_hat - sigma) < TOL * sigma
    ), f"sigma did not match: sigma = {sigma}, sigma_hat = {sigma_hat}"
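# Sketch (assumption: GaussianOutput maps its raw scale parameter through a
# softplus so that sigma stays positive) of why the initial bias for sigma is
# passed through inv_softplus above: softplus(inv_softplus(y)) == y, so the
# model starts out emitting roughly the desired sigma. Names are illustrative.
def _sketch_inv_softplus_init():
    import numpy as np

    def softplus(x):
        return np.log1p(np.exp(x))

    def inv_softplus(y):
        return np.log(np.expm1(y))

    sigma_target = 1.5
    raw_init = inv_softplus(sigma_target)
    assert np.isclose(softplus(raw_init), sigma_target)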
def sample(self, mean, log_std):
    std = log_std.exp()
    distribution = Gaussian(mu=mean, sigma=std)
    # for reparameterization trick (mu + std * N(0, 1))
    sample = distribution.sample(dtype="float64")
    sample_log_prob = distribution.log_prob(sample)
    return self.scale_and_bound(sample, sample_log_prob, mean)
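# A minimal sketch of the reparameterization trick referenced in the comment
# above (standalone NumPy, names are illustrative only): a draw from
# N(mu, std^2) can be written as mu + std * eps with eps ~ N(0, 1), which keeps
# the sample differentiable with respect to mu and std.
def _sketch_reparameterization_trick():
    import numpy as np

    rng = np.random.default_rng(0)
    mu, std = 1.0, 0.5
    eps = rng.standard_normal(100_000)  # eps ~ N(0, 1)
    samples = mu + std * eps            # samples ~ N(mu, std^2)
    assert abs(samples.mean() - mu) < 1e-2
    assert abs(samples.std() - std) < 1e-2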
def test_box_cox_transform(
    lambdas: Tuple[float, float],
    mu_sigma: Tuple[float, float],
    hybridize: bool,
):
    """
    Test to check that maximizing the likelihood recovers the parameters.
    """
    # test instance
    lam_1, lam_2 = lambdas
    mu, sigma = mu_sigma

    # generate samples
    lambdas_1 = mx.nd.zeros((NUM_SAMPLES,)) + lam_1
    lambdas_2 = mx.nd.zeros((NUM_SAMPLES,)) + lam_2
    transform = InverseBoxCoxTransform(lambdas_1, lambdas_2)

    mus = mx.nd.zeros((NUM_SAMPLES,)) + mu
    sigmas = mx.nd.zeros((NUM_SAMPLES,)) + sigma
    gaussian_distr = Gaussian(mus, sigmas)

    # Here the base distribution is Gaussian, which is transformed to a
    # non-Gaussian distribution via the inverse Box-Cox transform.
    # Sampling from `trans_distr` gives non-Gaussian samples.
    trans_distr = TransformedDistribution(gaussian_distr, transform)

    # Given the non-Gaussian samples, recover the true parameters of the
    # Box-Cox transformation as well as of the underlying Gaussian distribution.
    samples = trans_distr.sample()

    init_biases = [
        mu - START_TOL_MULTIPLE * TOL * mu,
        inv_softplus(sigma - START_TOL_MULTIPLE * TOL * sigma),
        lam_1 - START_TOL_MULTIPLE * TOL * lam_1,
        inv_softplus(lam_2 - START_TOL_MULTIPLE * TOL * lam_2),
    ]

    mu_hat, sigma_hat, lam_1_hat, lam_2_hat = maximum_likelihood_estimate_sgd(
        TransformedDistributionOutput(
            GaussianOutput(),
            InverseBoxCoxTransformOutput(lb_obs=lam_2, fix_lambda_2=True),
        ),
        samples,
        init_biases=init_biases,
        hybridize=hybridize,
        learning_rate=PositiveFloat(0.01),
        num_epochs=PositiveInt(18),
    )

    assert (
        np.abs(lam_1_hat - lam_1) < TOL * lam_1
    ), f"lam_1 did not match: lam_1 = {lam_1}, lam_1_hat = {lam_1_hat}"

    # lam_2 is fixed (fix_lambda_2=True), so its estimate is not checked:
    # assert (
    #     np.abs(lam_2_hat - lam_2) < TOL * lam_2
    # ), f"lam_2 did not match: lam_2 = {lam_2}, lam_2_hat = {lam_2_hat}"

    assert (
        np.abs(mu_hat - mu) < TOL * np.abs(mu)
    ), f"mu did not match: mu = {mu}, mu_hat = {mu_hat}"
    assert (
        np.abs(sigma_hat - sigma) < TOL * sigma
    ), f"sigma did not match: sigma = {sigma}, sigma_hat = {sigma_hat}"
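# A hedged sketch of the Box-Cox transform pair the test above exercises,
# written in its standard textbook form with a shift parameter lambda_2 (the
# exact parameterization used by InverseBoxCoxTransform may differ in detail).
def _sketch_box_cox_roundtrip():
    import numpy as np

    def box_cox(x, lam_1, lam_2):
        z = x + lam_2
        return np.log(z) if lam_1 == 0.0 else (np.power(z, lam_1) - 1.0) / lam_1

    def inv_box_cox(y, lam_1, lam_2):
        z = np.exp(y) if lam_1 == 0.0 else np.power(lam_1 * y + 1.0, 1.0 / lam_1)
        return z - lam_2

    x = np.array([0.5, 1.0, 2.0, 5.0])
    y = box_cox(x, lam_1=0.3, lam_2=1.0)
    assert np.allclose(inv_box_cox(y, lam_1=0.3, lam_2=1.0), x)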
    return np.mean(np.abs(x - y))


NUM_SAMPLES = 1_000
NUM_SAMPLES_LARGE = 100_000

SHAPE = (2, 1, 3)


@pytest.mark.parametrize(
    "distr1, distr2, p",
    [
        (
            Gaussian(
                mu=mx.nd.zeros(shape=SHAPE),
                sigma=1e-3 + 0.2 * mx.nd.ones(shape=SHAPE),
            ),
            Gaussian(
                mu=mx.nd.ones(shape=SHAPE),
                sigma=1e-3 + 0.1 * mx.nd.ones(shape=SHAPE),
            ),
            0.2 * mx.nd.ones(shape=SHAPE),
        ),
        (
            StudentT(
                mu=mx.nd.ones(shape=SHAPE),
                sigma=1e-1 + mx.nd.zeros(shape=SHAPE),
                nu=mx.nd.zeros(shape=SHAPE) + 2.2,
            ),
            Gaussian(
                mu=-mx.nd.ones(shape=SHAPE),
def sample(
    self, num_samples: Optional[int] = None, scale: Optional[Tensor] = None
) -> Tensor:
    r"""
    Generates samples from the LDS: p(z_1, z_2, \ldots, z_{`seq_length`}).

    Parameters
    ----------
    num_samples
        Number of samples to generate
    scale
        Scale of each sequence in x, shape (batch_size, output_dim)

    Returns
    -------
    Tensor
        Samples, shape (num_samples, batch_size, seq_length, output_dim)
    """
    F = self.F

    # Note on shapes: here we work with tensors of the following shape
    # in each time step t: (num_samples, batch_size, dim, dim),
    # where dim can be obs_dim or latent_dim or a constant 1 to facilitate
    # generalized matrix multiplication (gemm2)

    # Sample observation noise for all time steps
    # noise_std: (batch_size, seq_length, obs_dim, 1)
    noise_std = F.stack(*self.noise_std, axis=1).expand_dims(axis=-1)

    # samples_eps_obs[t]: (num_samples, batch_size, obs_dim, 1)
    samples_eps_obs = (
        Gaussian(noise_std.zeros_like(), noise_std)
        .sample(num_samples)
        .split(axis=-3, num_outputs=self.seq_length, squeeze_axis=True)
    )

    # Sample standard normal for all time steps
    # samples_std_normal[t]: (num_samples, batch_size, obs_dim, 1)
    samples_std_normal = (
        Gaussian(noise_std.zeros_like(), noise_std.ones_like())
        .sample(num_samples)
        .split(axis=-3, num_outputs=self.seq_length, squeeze_axis=True)
    )

    # Sample the prior state.
    # samples_lat_state: (num_samples, batch_size, latent_dim, 1)
    # The prior covariance is observed to be slightly negative definite
    # whenever there is excessive zero padding at the beginning of the time
    # series. We add positive tolerance to the diagonal to avoid numerical
    # issues. Note that `jitter_cholesky` adds positive tolerance only if the
    # decomposition without jitter fails.
    state = MultivariateGaussian(
        self.prior_mean,
        jitter_cholesky(
            F, self.prior_cov, self.latent_dim, float_type=np.float32
        ),
    )
    samples_lat_state = state.sample(num_samples).expand_dims(axis=-1)

    samples_seq = []
    for t in range(self.seq_length):
        # Expand all coefficients to include samples in axis 0
        # emission_coeff_t: (num_samples, batch_size, obs_dim, latent_dim)
        # transition_coeff_t:
        #   (num_samples, batch_size, latent_dim, latent_dim)
        # innovation_coeff_t: (num_samples, batch_size, 1, latent_dim)
        emission_coeff_t, transition_coeff_t, innovation_coeff_t = [
            _broadcast_param(coeff, axes=[0], sizes=[num_samples])
            if num_samples is not None
            else coeff
            for coeff in [
                self.emission_coeff[t],
                self.transition_coeff[t],
                self.innovation_coeff[t],
            ]
        ]

        # Expand residuals as well
        # residual_t: (num_samples, batch_size, obs_dim, 1)
        residual_t = (
            _broadcast_param(
                self.residuals[t].expand_dims(axis=-1),
                axes=[0],
                sizes=[num_samples],
            )
            if num_samples is not None
            else self.residuals[t].expand_dims(axis=-1)
        )

        # (num_samples, batch_size, 1, obs_dim)
        samples_t = (
            F.linalg_gemm2(emission_coeff_t, samples_lat_state)
            + residual_t
            + samples_eps_obs[t]
        )
        samples_t = (
            samples_t.swapaxes(dim1=2, dim2=3)
            if num_samples is not None
            else samples_t.swapaxes(dim1=1, dim2=2)
        )
        samples_seq.append(samples_t)

        # sample next state: (num_samples, batch_size, latent_dim, 1)
        samples_lat_state = F.linalg_gemm2(
            transition_coeff_t, samples_lat_state
        ) + F.linalg_gemm2(
            innovation_coeff_t, samples_std_normal[t], transpose_a=True
        )

    # (num_samples, batch_size, seq_length, obs_dim)
    samples = F.concat(*samples_seq, dim=-2)
    return (
        samples
        if scale is None
        else F.broadcast_mul(
            samples,
            scale.expand_dims(axis=1).expand_dims(axis=0)
            if num_samples is not None
            else scale.expand_dims(axis=1),
        )
    )
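# A compact NumPy sketch (hypothetical: single sequence, no batching, obs_dim
# = 1, names illustrative only) of the recursion implemented in `sample`
# above: at each step the observation is emitted from the latent state plus
# observation noise, and the state evolves via the transition matrix plus
# scaled standard-normal innovations.
def _sketch_lds_sampling():
    import numpy as np

    rng = np.random.default_rng(0)
    latent_dim, obs_dim, seq_length = 2, 1, 5

    transition_coeff = 0.9 * np.eye(latent_dim)
    emission_coeff = np.ones((obs_dim, latent_dim))
    innovation_coeff = 0.1 * np.ones((latent_dim, 1))
    noise_std = 0.05

    lat_state = rng.standard_normal((latent_dim, 1))  # draw from the prior state
    samples_seq = []
    for _ in range(seq_length):
        eps_obs = noise_std * rng.standard_normal((obs_dim, 1))
        samples_seq.append(emission_coeff @ lat_state + eps_obs)  # emission step
        lat_state = (
            transition_coeff @ lat_state
            + innovation_coeff * rng.standard_normal()            # transition step
        )
    return np.concatenate(samples_seq, axis=1)  # shape (obs_dim, seq_length)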
    StudentT,
    Uniform,
    TransformedDistribution,
    Dirichlet,
    DirichletMultinomial,
)
from gluonts.distribution.bijection import AffineTransformation
from gluonts.distribution.box_cox_transform import BoxCoxTransform


@pytest.mark.parametrize(
    "distr, expected_batch_shape, expected_event_shape",
    [
        (
            Gaussian(
                mu=mx.nd.zeros(shape=(3, 4, 5)),
                sigma=mx.nd.ones(shape=(3, 4, 5)),
            ),
            (3, 4, 5),
            (),
        ),
        (
            Gamma(
                alpha=mx.nd.ones(shape=(3, 4, 5)),
                beta=mx.nd.ones(shape=(3, 4, 5)),
            ),
            (3, 4, 5),
            (),
        ),
        (
            Beta(
                alpha=mx.nd.ones(shape=(3, 4, 5)),
def sample(
    self, num_samples: Optional[int] = None, scale: Optional[Tensor] = None
) -> Tensor:
    r"""
    Generates samples from the LDS: p(z_1, z_2, \ldots, z_{`seq_length`}).

    Parameters
    ----------
    num_samples
        Number of samples to generate
    scale
        Scale of each sequence in x, shape (batch_size, output_dim)

    Returns
    -------
    Tensor
        Samples, shape (num_samples, batch_size, seq_length, output_dim)
    """
    F = self.F

    # Note on shapes: here we work with tensors of the following shape
    # in each time step t: (num_samples, batch_size, dim, dim),
    # where dim can be obs_dim or latent_dim or a constant 1 to facilitate
    # generalized matrix multiplication (gemm2)

    # Sample observation noise for all time steps
    # noise_std: (batch_size, seq_length, obs_dim, 1)
    noise_std = F.stack(*self.noise_std, axis=1).expand_dims(axis=-1)

    # samples_eps_obs[t]: (num_samples, batch_size, obs_dim, 1)
    samples_eps_obs = (
        Gaussian(noise_std.zeros_like(), noise_std)
        .sample(num_samples)
        .split(axis=2, num_outputs=self.seq_length, squeeze_axis=True)
    )

    # Sample standard normal for all time steps
    # samples_std_normal[t]: (num_samples, batch_size, obs_dim, 1)
    samples_std_normal = (
        Gaussian(noise_std.zeros_like(), noise_std.ones_like())
        .sample(num_samples)
        .split(axis=2, num_outputs=self.seq_length, squeeze_axis=True)
    )

    # Sample the prior state.
    # samples_lat_state: (num_samples, batch_size, latent_dim, 1)
    state = MultivariateGaussian(
        self.prior_mean, F.linalg_potrf(self.prior_cov)
    )
    samples_lat_state = state.sample(num_samples).expand_dims(axis=-1)

    samples_seq = []
    for t in range(self.seq_length):
        # Expand all coefficients to include samples in axis 0
        # emission_coeff_t: (num_samples, batch_size, obs_dim, latent_dim)
        # transition_coeff_t:
        #   (num_samples, batch_size, latent_dim, latent_dim)
        # innovation_coeff_t: (num_samples, batch_size, 1, latent_dim)
        emission_coeff_t, transition_coeff_t, innovation_coeff_t = [
            _broadcast_param(coeff, axes=[0], sizes=[num_samples])
            for coeff in [
                self.emission_coeff[t],
                self.transition_coeff[t],
                self.innovation_coeff[t],
            ]
        ]

        # Expand residuals as well
        # residual_t: (num_samples, batch_size, obs_dim, 1)
        residual_t = _broadcast_param(
            self.residuals[t].expand_dims(axis=-1),
            axes=[0],
            sizes=[num_samples],
        )

        # (num_samples, batch_size, 1, obs_dim)
        samples_t = (
            F.linalg_gemm2(emission_coeff_t, samples_lat_state)
            + residual_t
            + samples_eps_obs[t]
        ).swapaxes(dim1=2, dim2=3)
        samples_seq.append(samples_t)

        # sample next state: (num_samples, batch_size, latent_dim, 1)
        samples_lat_state = F.linalg_gemm2(
            transition_coeff_t, samples_lat_state
        ) + F.linalg_gemm2(
            innovation_coeff_t, samples_std_normal[t], transpose_a=True
        )

    # (num_samples, batch_size, seq_length, obs_dim)
    samples = F.concat(*samples_seq, dim=2)
    return (
        samples
        if scale is None
        else F.broadcast_mul(samples, scale.expand_dims(axis=1))
    )
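# A hedged NumPy sketch of the prior-state sampling step above: a multivariate
# Gaussian is sampled through the Cholesky factor L of its covariance (here
# np.linalg.cholesky stands in for F.linalg_potrf), i.e. x = mean + L @ eps
# with eps ~ N(0, I), so that Cov[x] = L @ L.T = cov. Names are illustrative.
def _sketch_cholesky_sampling():
    import numpy as np

    rng = np.random.default_rng(0)
    mean = np.array([1.0, -1.0])
    cov = np.array([[1.0, 0.3],
                    [0.3, 0.5]])
    L = np.linalg.cholesky(cov)

    eps = rng.standard_normal((100_000, 2))  # rows ~ N(0, I)
    samples = mean + eps @ L.T               # rows ~ N(mean, cov)
    return np.cov(samples, rowvar=False)     # close to cov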