    def sample(self, mean: Tensor, covariance: Tensor) -> Tensor:
        r"""
        Parameters
        ----------
        mean
            The mean vector of the GP of shape (batch_size, prediction_length).
        covariance
            The covariance matrix of the GP of shape
            (batch_size, prediction_length, prediction_length).

        Returns
        -------
        Tensor
            Samples from the Gaussian Process of shape
            (batch_size, prediction_length, num_samples). Samples are drawn as
            :math:`\mu + Lz`, where :math:`L` is the Cholesky factor of the
            covariance matrix with a noise tolerance added to its diagonal and
            :math:`z \sim N(0, I)`.
        """
        assert (
            self.num_samples is not None
        ), "The value of `num_samples` must be set."
        assert (
            self.prediction_length is not None
        ), "The value of `prediction_length` must be set."
        samples = MultivariateGaussian(
            mean,
            self._compute_cholesky_gp(
                covariance, self.prediction_length, self.sample_noise
            ),
        ).sample_rep(
            self.num_samples, dtype=self.float_type
        )  # shape (num_samples, batch_size, prediction_length)
        return self.F.transpose(samples, axes=(1, 2, 0))
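
For reference, the Cholesky-based sampling scheme described in the docstring can be sketched in plain NumPy, independent of MXNet; the function name sample_gp_numpy and the fixed jitter value are illustrative, not part of the GluonTS API.

import numpy as np

def sample_gp_numpy(mean, covariance, num_samples, jitter=1e-6):
    """Draw GP samples as mean + L z with z ~ N(0, I).

    mean:       (batch_size, prediction_length)
    covariance: (batch_size, prediction_length, prediction_length)
    returns:    (batch_size, prediction_length, num_samples)
    """
    batch_size, prediction_length = mean.shape
    # add a small noise tolerance to the diagonal before factorizing
    covariance = covariance + jitter * np.eye(prediction_length)
    L = np.linalg.cholesky(covariance)                     # (b, p, p)
    z = np.random.standard_normal(
        (batch_size, prediction_length, num_samples)
    )
    return mean[:, :, None] + L @ z                        # (b, p, num_samples)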
Example #2
    def log_prob(self, x_train: Tensor, y_train: Tensor) -> Tensor:
        r"""
        This method computes the negative marginal log likelihood
        :math:`\frac{1}{2} (d \log(2\pi) + \log(|K|) + y^TK^{-1}y)`,
        where :math:`d` is the dimension.
        This can be written in terms of the Cholesky factor :math:`L` as
        :math:`\log(|K|) = \log(|LL^T|) = \log(|L||L^T|) = \log(|L|^2) = 2\log(|L|)`
        :math:`= 2\log\big(\prod_{i=1}^d L_{ii}\big) = 2 \sum_{i=1}^d \log(L_{ii})` and
        :math:`y^TK^{-1}y = (y^TL^{-T})(L^{-1}y) = (L^{-1}y)^T(L^{-1}y) = ||L^{-1}y||_2^2`.

        Parameters
        ----------
        x_train
            Training set of features of shape (batch_size, context_length, num_features).
        y_train
            Training labels of shape (batch_size, context_length).

        Returns
        -------
        Tensor
            The negative log marginal likelihood of shape (batch_size,).
        """
        assert (
            self.context_length is not None
        ), "The value of `context_length` must be set."
        return -MultivariateGaussian(
            self.F.zeros_like(y_train),  # 0 mean gaussian process prior
            self._compute_cholesky_gp(
                self.kernel.kernel_matrix(x_train, x_train),
                self.context_length,
            ),
        ).log_prob(y_train)
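
The Cholesky identities quoted in the docstring translate directly into a short NumPy/SciPy sketch of the negative log marginal likelihood for a zero-mean GP prior; gp_neg_log_marginal_likelihood and the jitter constant are illustrative names, not GluonTS functions.

import numpy as np
from scipy.linalg import cholesky, solve_triangular

def gp_neg_log_marginal_likelihood(K, y, jitter=1e-6):
    """0.5 * (d*log(2*pi) + log|K| + y^T K^{-1} y) for a zero-mean GP prior.

    K: (d, d) kernel matrix, y: (d,) training targets.
    """
    d = y.shape[0]
    L = cholesky(K + jitter * np.eye(d), lower=True)
    # log|K| = 2 * sum_i log(L_ii)
    log_det = 2.0 * np.sum(np.log(np.diag(L)))
    # y^T K^{-1} y = ||L^{-1} y||_2^2
    alpha = solve_triangular(L, y, lower=True)
    return 0.5 * (d * np.log(2.0 * np.pi) + log_det + alpha @ alpha)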
Example #3
    def log_prob(self, x_train: Tensor, y_train: Tensor) -> Tensor:
        r"""
        This method computes the negative marginal log likelihood
        
        .. math::
            :nowrap:

                \begin{aligned}
                    \frac{1}{2} [d \log(2\pi) + \log(|K|) + y^TK^{-1}y],
                \end{aligned}

        where :math:`d` is the number of data points.
        This can be written in terms of the Cholesky factor  :math:`L` as

        .. math::
            :nowrap:

            \begin{aligned}
                \log(|K|) = \log(|LL^T|) &= \log(|L||L|^T) = \log(|L|^2) = 2\log(|L|) \\
                &= 2\log\big(\prod_i^n L_{ii}\big) = 2 \sum_i^N \log(L_{ii})
            \end{aligned}
                 and

        .. math::
            :nowrap:

                 \begin{aligned}
                    y^TK^{-1}y = (y^TL^{-T})(L^{-1}y) = (L^{-1}y)^T(L^{-1}y) = ||L^{-1}y||_2^2.
                \end{aligned}

        Parameters
        ----------
        x_train
            Training set of features of shape (batch_size, context_length, num_features).
        y_train
            Training labels of shape (batch_size, context_length).

        Returns
        -------
        Tensor
            The negative log marginal likelihood of shape (batch_size,).
        """
        assert (
            self.context_length is not None
        ), "The value of `context_length` must be set."
        return -MultivariateGaussian(
            self.F.zeros_like(y_train),  # 0 mean gaussian process prior
            self._compute_cholesky_gp(
                self.kernel.kernel_matrix(x_train, x_train),
                self.context_length,
            ),
        ).log_prob(y_train)
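
A quick numerical check of the two identities used above, against numpy.linalg.slogdet and a direct solve (illustrative only):

import numpy as np

rng = np.random.default_rng(0)
d = 5
A = rng.standard_normal((d, d))
K = A @ A.T + d * np.eye(d)            # symmetric positive definite
y = rng.standard_normal(d)

L = np.linalg.cholesky(K)
# log|K| = 2 * sum_i log(L_ii)
assert np.isclose(2 * np.sum(np.log(np.diag(L))), np.linalg.slogdet(K)[1])
# y^T K^{-1} y = ||L^{-1} y||_2^2
v = np.linalg.solve(L, y)
assert np.isclose(v @ v, y @ np.linalg.solve(K, y))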
Example #4
def test_multivariate_gaussian() -> None:
    num_samples = 2000
    dim = 2

    mu = np.arange(0, dim) / float(dim)

    L_diag = np.ones((dim, ))
    L_low = 0.1 * np.ones((dim, dim)) * np.tri(dim, k=-1)
    L = np.diag(L_diag) + L_low
    Sigma = L.dot(L.transpose())

    distr = MultivariateGaussian(mu=mx.nd.array(mu), L=mx.nd.array(L))

    samples = distr.sample(num_samples)

    mu_hat, L_hat = maximum_likelihood_estimate_sgd(
        MultivariateGaussianOutput(dim=dim),
        samples,
        # TODO: we would need to rework biases a bit to use them in the
        # multivariate case
        init_biases=None,
        hybridize=False,
        learning_rate=PositiveFloat(0.01),
        num_epochs=PositiveInt(10),
    )

    distr = MultivariateGaussian(mu=mx.nd.array([mu_hat]),
                                 L=mx.nd.array([L_hat]))

    Sigma_hat = distr.variance[0].asnumpy()

    assert np.allclose(
        mu_hat, mu, atol=0.1, rtol=0.1
    ), f"mu did not match: mu = {mu}, mu_hat = {mu_hat}"
    assert np.allclose(
        Sigma_hat, Sigma, atol=0.1, rtol=0.1
    ), f"Sigma did not match: Sigma = {Sigma}, Sigma_hat = {Sigma_hat}"
Example #5
def kalman_filter_step(
    F,
    target: Tensor,
    prior_mean: Tensor,
    prior_cov: Tensor,
    emission_coeff: Tensor,
    residual: Tensor,
    noise_std: Tensor,
    latent_dim: int,
    output_dim: int,
):
    """
    One step of the Kalman filter.

    This function computes the filtered state (mean and covariance) given the
    linear system coefficients, the prior state (mean and covariance), and the
    observations.

    Parameters
    ----------
    F
        MXNet function namespace (`mx.nd` or `mx.sym`).
    target
        Observations of the system output, shape (batch_size, output_dim)
    prior_mean
        Prior mean of the latent state, shape (batch_size, latent_dim)
    prior_cov
        Prior covariance of the latent state, shape
        (batch_size, latent_dim, latent_dim)
    emission_coeff
        Emission coefficient, shape (batch_size, output_dim, latent_dim)
    residual
        Residual component, shape (batch_size, output_dim)
    noise_std
        Standard deviation of the output noise, shape (batch_size, output_dim)
    latent_dim
        Dimension of the latent state vector
    output_dim
        Dimension of the observations

    Returns
    -------
    Tensor
        Filtered_mean, shape (batch_size, latent_dim)
    Tensor
        Filtered_covariance, shape (batch_size, latent_dim, latent_dim)
    Tensor
        Log probability, shape (batch_size, )
    """
    # output_mean: mean of the target (batch_size, obs_dim)
    output_mean = F.linalg_gemm2(
        emission_coeff, prior_mean.expand_dims(axis=-1)).squeeze(axis=-1)

    # noise covariance
    noise_cov = make_nd_diag(F=F, x=noise_std * noise_std, d=output_dim)

    S_hh_x_A_tr = F.linalg_gemm2(prior_cov, emission_coeff, transpose_b=True)

    # covariance of the target
    output_cov = F.linalg_gemm2(emission_coeff, S_hh_x_A_tr) + noise_cov

    # compute the Cholesky decomposition output_cov = LL^T
    L_output_cov = F.linalg_potrf(output_cov)

    # Compute Kalman gain matrix K:
    # K = S_hh X with X = A^T output_cov^{-1}
    # We have X = A^T output_cov^{-1} => X output_cov = A^T => X LL^T = A^T
    # We can thus obtain X by solving two linear systems involving L
    kalman_gain = F.linalg_trsm(
        L_output_cov,
        F.linalg_trsm(L_output_cov,
                      S_hh_x_A_tr,
                      rightside=True,
                      transpose=True),
        rightside=True,
    )

    # compute the error
    target_minus_residual = target - residual
    delta = target_minus_residual - output_mean

    # filtered estimates
    filtered_mean = prior_mean.expand_dims(axis=-1) + F.linalg_gemm2(
        kalman_gain, delta.expand_dims(axis=-1))
    filtered_mean = filtered_mean.squeeze(axis=-1)

    # Joseph's symmetrized update for covariance:
    ImKA = F.broadcast_sub(F.eye(latent_dim),
                           F.linalg_gemm2(kalman_gain, emission_coeff))

    filtered_cov = F.linalg_gemm2(
        ImKA, F.linalg_gemm2(
            prior_cov, ImKA, transpose_b=True)) + F.linalg_gemm2(
                kalman_gain,
                F.linalg_gemm2(noise_cov, kalman_gain, transpose_b=True))

    # likelihood term: (batch_size,)
    log_p = MultivariateGaussian(output_mean,
                                 L_output_cov).log_prob(target_minus_residual)

    return filtered_mean, filtered_cov, log_p
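
Dropping the batch dimension and MXNet, the same filter step can be sketched in NumPy/SciPy, following the Cholesky-solve trick described in the comments (solving output_cov X = A prior_cov rather than forming an explicit inverse). The name kalman_filter_step_numpy is illustrative; this is not the GluonTS implementation.

import numpy as np
from scipy.linalg import cho_factor, cho_solve
from scipy.stats import multivariate_normal

def kalman_filter_step_numpy(target, prior_mean, prior_cov,
                             emission_coeff, residual, noise_std):
    """One Kalman filter step for a single (unbatched) series.

    emission_coeff: (output_dim, latent_dim); target, residual, noise_std:
    (output_dim,); prior_mean: (latent_dim,); prior_cov: (latent_dim, latent_dim).
    """
    A = emission_coeff
    noise_cov = np.diag(noise_std ** 2)

    output_mean = A @ prior_mean
    S_hh_x_A_tr = prior_cov @ A.T
    output_cov = A @ S_hh_x_A_tr + noise_cov

    # Kalman gain K = prior_cov A^T output_cov^{-1}, obtained from a Cholesky
    # solve of output_cov X = A prior_cov instead of inverting output_cov.
    chol = cho_factor(output_cov, lower=True)
    kalman_gain = cho_solve(chol, S_hh_x_A_tr.T).T

    delta = (target - residual) - output_mean
    filtered_mean = prior_mean + kalman_gain @ delta

    # Joseph's symmetrized covariance update
    ImKA = np.eye(prior_cov.shape[0]) - kalman_gain @ A
    filtered_cov = (ImKA @ prior_cov @ ImKA.T
                    + kalman_gain @ noise_cov @ kalman_gain.T)

    # log-likelihood of the residual-adjusted observation
    log_p = multivariate_normal(output_mean, output_cov).logpdf(target - residual)
    return filtered_mean, filtered_cov, log_p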
Example #6
    def sample_marginals(self,
                         num_samples: Optional[int] = None,
                         scale: Optional[Tensor] = None) -> Tensor:
        r"""
        Generates samples from the marginals :math:`p(z_t)`,
        :math:`t = 1, \ldots, \text{seq\_length}`.

        Parameters
        ----------
        num_samples
            Number of samples to generate
        scale
            Scale of each sequence in x, shape (batch_size, output_dim)

        Returns
        -------
        Tensor
            Samples, shape (num_samples, batch_size, seq_length, output_dim)
        """
        F = self.F

        state_mean = self.prior_mean.expand_dims(axis=-1)
        state_cov = self.prior_cov

        output_mean_seq = []
        output_cov_seq = []

        for t in range(self.seq_length):
            # compute and store observation mean at time t
            output_mean = F.linalg_gemm2(
                self.emission_coeff[t],
                state_mean) + self.residuals[t].expand_dims(axis=-1)

            output_mean_seq.append(output_mean)

            # compute and store observation cov at time t
            output_cov = F.linalg_gemm2(
                self.emission_coeff[t],
                F.linalg_gemm2(
                    state_cov, self.emission_coeff[t], transpose_b=True),
            ) + make_nd_diag(F=F,
                             x=self.noise_std[t] * self.noise_std[t],
                             d=self.output_dim)

            output_cov_seq.append(output_cov.expand_dims(axis=1))

            state_mean = F.linalg_gemm2(self.transition_coeff[t], state_mean)

            state_cov = F.linalg_gemm2(
                self.transition_coeff[t],
                F.linalg_gemm2(
                    state_cov, self.transition_coeff[t], transpose_b=True),
            ) + F.linalg_gemm2(
                self.innovation_coeff[t],
                self.innovation_coeff[t],
                transpose_a=True,
            )

        output_mean = F.concat(*output_mean_seq, dim=1)
        output_cov = F.concat(*output_cov_seq, dim=1)

        L = F.linalg_potrf(output_cov)

        output_distribution = MultivariateGaussian(output_mean, L)

        samples = output_distribution.sample(num_samples=num_samples)

        return (samples if scale is None else F.broadcast_mul(
            samples, scale.expand_dims(axis=1)))
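
Stripped of batching and MXNet, the moment propagation in the loop above reduces to the standard LDS recursions m_{t+1} = A_t m_t and P_{t+1} = A_t P_t A_t^T + g_t g_t^T, with observation moments C_t m_t + b_t and C_t P_t C_t^T + diag(s_t^2). A minimal single-series sketch with illustrative names:

import numpy as np

def lds_marginal_moments(prior_mean, prior_cov, transition, emission,
                         innovation, residuals, noise_std):
    """Marginal mean and covariance of the observations at each time step.

    Per time step t: transition[t] is (latent_dim, latent_dim), emission[t]
    is (output_dim, latent_dim), innovation[t] is (latent_dim,) (rank-one
    innovation as above), residuals[t] and noise_std[t] are (output_dim,).
    """
    m, P = prior_mean, prior_cov
    means, covs = [], []
    for A, C, g, b, s in zip(transition, emission, innovation,
                             residuals, noise_std):
        # observation moments at time t
        means.append(C @ m + b)
        covs.append(C @ P @ C.T + np.diag(s ** 2))
        # propagate the latent-state moments to t + 1
        m = A @ m
        P = A @ P @ A.T + np.outer(g, g)
    return means, covs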
Example #7
    def sample(self,
               num_samples: Optional[int] = None,
               scale: Optional[Tensor] = None) -> Tensor:
        r"""
        Generates samples from the LDS, i.e., from
        :math:`p(z_1, z_2, \ldots, z_{\text{seq\_length}})`.

        Parameters
        ----------
        num_samples
            Number of samples to generate
        scale
            Scale of each sequence in x, shape (batch_size, output_dim)

        Returns
        -------
        Tensor
            Samples, shape (num_samples, batch_size, seq_length, output_dim)
        """
        F = self.F

        # Note on shapes: here we work with tensors of the following shape
        # in each time step t: (num_samples, batch_size, dim, dim),
        # where dim can be obs_dim or latent_dim or a constant 1 to facilitate
        # generalized matrix multiplication (gemm2)

        # Sample observation noise for all time steps
        # noise_std: (batch_size, seq_length, obs_dim, 1)
        noise_std = F.stack(*self.noise_std, axis=1).expand_dims(axis=-1)

        # samples_eps_obs[t]: (num_samples, batch_size, obs_dim, 1)
        samples_eps_obs = (Gaussian(noise_std.zeros_like(),
                                    noise_std).sample(num_samples).split(
                                        axis=-3,
                                        num_outputs=self.seq_length,
                                        squeeze_axis=True))

        # Sample standard normal for all time steps
        # samples_eps_std_normal[t]: (num_samples, batch_size, obs_dim, 1)
        samples_std_normal = (Gaussian(
            noise_std.zeros_like(),
            noise_std.ones_like()).sample(num_samples).split(
                axis=-3, num_outputs=self.seq_length, squeeze_axis=True))

        # Sample the prior state.
        # samples_lat_state: (num_samples, batch_size, latent_dim, 1)
        # The prior covariance is observed to be slightly negative definite whenever there is
        # excessive zero padding at the beginning of the time series.
        # We add positive tolerance to the diagonal to avoid numerical issues.
        # Note that `jitter_cholesky` adds positive tolerance only if the decomposition without jitter fails.
        state = MultivariateGaussian(
            self.prior_mean,
            jitter_cholesky(F,
                            self.prior_cov,
                            self.latent_dim,
                            float_type=np.float32),
        )
        samples_lat_state = state.sample(num_samples).expand_dims(axis=-1)

        samples_seq = []
        for t in range(self.seq_length):
            # Expand all coefficients to include samples in axis 0
            # emission_coeff_t: (num_samples, batch_size, obs_dim, latent_dim)
            # transition_coeff_t:
            #   (num_samples, batch_size, latent_dim, latent_dim)
            # innovation_coeff_t: (num_samples, batch_size, 1, latent_dim)
            emission_coeff_t, transition_coeff_t, innovation_coeff_t = [
                _broadcast_param(coeff, axes=[0], sizes=[num_samples])
                if num_samples is not None else coeff for coeff in [
                    self.emission_coeff[t],
                    self.transition_coeff[t],
                    self.innovation_coeff[t],
                ]
            ]

            # Expand residuals as well
            # residual_t: (num_samples, batch_size, obs_dim, 1)
            residual_t = (_broadcast_param(
                self.residuals[t].expand_dims(axis=-1),
                axes=[0],
                sizes=[num_samples],
            ) if num_samples is not None else self.residuals[t].expand_dims(
                axis=-1))

            # (num_samples, batch_size, 1, obs_dim)
            samples_t = (F.linalg_gemm2(emission_coeff_t, samples_lat_state) +
                         residual_t + samples_eps_obs[t])
            samples_t = (samples_t.swapaxes(dim1=2, dim2=3) if num_samples
                         is not None else samples_t.swapaxes(dim1=1, dim2=2))
            samples_seq.append(samples_t)

            # sample next state: (num_samples, batch_size, latent_dim, 1)
            samples_lat_state = F.linalg_gemm2(
                transition_coeff_t, samples_lat_state) + F.linalg_gemm2(
                    innovation_coeff_t,
                    samples_std_normal[t],
                    transpose_a=True)

        # (num_samples, batch_size, seq_length, obs_dim)
        samples = F.concat(*samples_seq, dim=-2)
        return (samples if scale is None else F.broadcast_mul(
            samples,
            scale.expand_dims(axis=1).expand_dims(
                axis=0) if num_samples is not None else scale.expand_dims(
                    axis=1),
        ))
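
The jitter_cholesky helper referenced in the comments is part of GluonTS; a simplified stand-in that captures the retry-with-growing-jitter idea looks roughly like this (illustrative, not the library code):

import numpy as np

def jitter_cholesky_numpy(cov, max_tries=5, initial_jitter=1e-6):
    """Cholesky factorization that adds diagonal jitter only if the plain
    decomposition fails (a simplified stand-in, not the GluonTS helper)."""
    try:
        return np.linalg.cholesky(cov)
    except np.linalg.LinAlgError:
        pass
    jitter = initial_jitter * float(np.mean(np.diag(cov)))
    for _ in range(max_tries):
        try:
            return np.linalg.cholesky(cov + jitter * np.eye(cov.shape[-1]))
        except np.linalg.LinAlgError:
            jitter *= 10.0
    raise np.linalg.LinAlgError("matrix not positive definite, even with jitter")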
Example #8
     ),
     (3, 4, 5),
     (),
 ),
 (
     StudentT(
         mu=mx.nd.zeros(shape=(3, 4, 5)),
         sigma=mx.nd.ones(shape=(3, 4, 5)),
         nu=mx.nd.ones(shape=(3, 4, 5)),
     ),
     (3, 4, 5),
     (),
 ),
 (
     MultivariateGaussian(
         mu=mx.nd.zeros(shape=(3, 4, 5)),
         L=make_nd_diag(F=mx.nd, x=mx.nd.ones(shape=(3, 4, 5)), d=5),
     ),
     (3, 4),
     (5, ),
 ),
 (Dirichlet(alpha=mx.nd.ones(shape=(3, 4, 5))), (3, 4), (5, )),
 (
     DirichletMultinomial(
         dim=5, n_trials=9, alpha=mx.nd.ones(shape=(3, 4, 5))),
     (3, 4),
     (5, ),
 ),
 (
     Laplace(mu=mx.nd.zeros(shape=(3, 4, 5)),
             b=mx.nd.ones(shape=(3, 4, 5))),
     (3, 4, 5),
Example #9
    def sample(
        self, num_samples: Optional[int] = None, scale: Optional[Tensor] = None
    ) -> Tensor:
        r"""
        Generates samples from the LDS, i.e., from
        :math:`p(z_1, z_2, \ldots, z_{\text{seq\_length}})`.

        Parameters
        ----------
        num_samples
            Number of samples to generate
        scale
            Scale of each sequence in x, shape (batch_size, output_dim)

        Returns
        -------
        Tensor
            Samples, shape (num_samples, batch_size, seq_length, output_dim)
        """
        F = self.F

        # Note on shapes: here we work with tensors of the following shape
        # in each time step t: (num_samples, batch_size, dim, dim),
        # where dim can be obs_dim or latent_dim or a constant 1 to facilitate
        # generalized matrix multiplication (gemm2)

        # Sample observation noise for all time steps
        # noise_std: (batch_size, seq_length, obs_dim, 1)
        noise_std = F.stack(*self.noise_std, axis=1).expand_dims(axis=-1)

        # samples_eps_obs[t]: (num_samples, batch_size, obs_dim, 1)
        samples_eps_obs = (
            Gaussian(noise_std.zeros_like(), noise_std)
            .sample(num_samples)
            .split(axis=2, num_outputs=self.seq_length, squeeze_axis=True)
        )

        # Sample standard normal for all time steps
        # samples_eps_std_normal[t]: (num_samples, batch_size, obs_dim, 1)
        samples_std_normal = (
            Gaussian(noise_std.zeros_like(), noise_std.ones_like())
            .sample(num_samples)
            .split(axis=2, num_outputs=self.seq_length, squeeze_axis=True)
        )

        # Sample the prior state.
        # samples_lat_state: (num_samples, batch_size, latent_dim, 1)
        state = MultivariateGaussian(
            self.prior_mean, F.linalg_potrf(self.prior_cov)
        )
        samples_lat_state = state.sample(num_samples).expand_dims(axis=-1)

        samples_seq = []
        for t in range(self.seq_length):
            # Expand all coefficients to include samples in axis 0
            # emission_coeff_t: (num_samples, batch_size, obs_dim, latent_dim)
            # transition_coeff_t:
            #   (num_samples, batch_size, latent_dim, latent_dim)
            # innovation_coeff_t: (num_samples, batch_size, 1, latent_dim)
            emission_coeff_t, transition_coeff_t, innovation_coeff_t = [
                _broadcast_param(coeff, axes=[0], sizes=[num_samples])
                for coeff in [
                    self.emission_coeff[t],
                    self.transition_coeff[t],
                    self.innovation_coeff[t],
                ]
            ]

            # Expand residuals as well
            # residual_t: (num_samples, batch_size, obs_dim, 1)
            residual_t = _broadcast_param(
                self.residuals[t].expand_dims(axis=-1),
                axes=[0],
                sizes=[num_samples],
            )

            # (num_samples, batch_size, 1, obs_dim)
            samples_t = (
                F.linalg_gemm2(emission_coeff_t, samples_lat_state)
                + residual_t
                + samples_eps_obs[t]
            ).swapaxes(dim1=2, dim2=3)
            samples_seq.append(samples_t)

            # sample next state: (num_samples, batch_size, latent_dim, 1)
            samples_lat_state = F.linalg_gemm2(
                transition_coeff_t, samples_lat_state
            ) + F.linalg_gemm2(
                innovation_coeff_t, samples_std_normal[t], transpose_a=True
            )

        # (num_samples, batch_size, seq_length, obs_dim)
        samples = F.concat(*samples_seq, dim=2)
        return (
            samples
            if scale is None
            else F.broadcast_mul(samples, scale.expand_dims(axis=1))
        )
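
Ignoring batching, scaling, and the shape gymnastics needed for gemm2, the sampling loop above is plain ancestral sampling from the LDS. A minimal single-series, single-sample NumPy sketch with illustrative names and shapes:

import numpy as np

def sample_lds_numpy(prior_mean, prior_L, transition, emission,
                     innovation, residuals, noise_std, rng=None):
    """Draw one trajectory z_1, ..., z_T from an LDS (single series, one sample).

    Per time step t: transition[t] is (latent_dim, latent_dim), emission[t]
    is (output_dim, latent_dim), innovation[t] is (latent_dim,), residuals[t]
    and noise_std[t] are (output_dim,); prior_L is a Cholesky factor of the
    prior state covariance.
    """
    rng = np.random.default_rng() if rng is None else rng
    # sample the prior latent state: h ~ N(prior_mean, prior_L prior_L^T)
    h = prior_mean + prior_L @ rng.standard_normal(prior_mean.shape)
    outputs = []
    for A, C, g, b, s in zip(transition, emission, innovation,
                             residuals, noise_std):
        # observation: z_t = C h_t + b_t + eps_t, eps_t ~ N(0, diag(s^2))
        outputs.append(C @ h + b + s * rng.standard_normal(s.shape))
        # next latent state: h_{t+1} = A h_t + g w_t, w_t ~ N(0, 1)
        h = A @ h + g * rng.standard_normal()
    return np.stack(outputs)              # (seq_length, output_dim)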