Example #1
File: hmm.py Project: Tommliu/mx-ml
    def _forward(self, Obs):
        """
        Computes the forward probability trellis for an HMM parameterized by
        :math:`(A, B, \pi)`.

        Notes
        -----
        The forward trellis (sometimes referred to as `alpha` in the HMM
        literature) is a 2D array where entry `i`, `j` represents the probability
        under the HMM of being in latent state `i` after seeing the first `j`
        observations:

        .. math:: \mathtt{forward[i,j]} = P(o_1,\ldots,o_j,q_j=i|A,B,\pi)

        Here :math:`q_j = i` indicates that the hidden state at time `j` is of
        type `i`.

        The DP step is::

            forward[i,j] = sum_{s'=1}^N forward[s',j-1] * A[s',i] * B[i,o_j]
                         = sum_{s'=1}^N P(o_1,\ldots,o_{j-1},q_{j-1}=s'|A,B,pi) *
                           P(q_j=i|q_{j-1}=s') * P(o_j|q_j=i)

        In words, ``forward[i,j]`` is the weighted sum of the values computed
        on the previous timestep. The weight on each previous state value is
        the product of the probability of transitioning from that state to
        state `i` and the probability of emitting observation `j` in state `i`.

        Parameters
        ----------
        Obs : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)`
            An observation sequence of length `T`.

        Returns
        -------
        forward : :py:class:`ndarray <numpy.ndarray>` of shape `(N, T)`
            The forward trellis of log probabilities.
        """
        eps = self.eps
        T = Obs.shape[0]

        # initialize the forward probability matrix
        forward = np.zeros((self.N, T))

        ot = Obs[0]
        for s in range(self.N):
            forward[s, 0] = np.log(self.pi[s] + eps) + np.log(self.B[s, ot] + eps)

        for t in range(1, T):
            ot = Obs[t]
            for s in range(self.N):
                forward[s, t] = logsumexp(
                    [
                        forward[s_, t - 1]
                        + np.log(self.A[s_, s] + eps)
                        + np.log(self.B[s, ot] + eps)
                        for s_ in range(self.N)
                    ]
                )
        return forward
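The recursion above can also be written with the inner loop over previous states vectorized. Below is a minimal, self-contained sketch of the same log-space forward pass; the toy parameters (pi, A, B) are invented for illustration and are not taken from the class above.

import numpy as np

def _col_logsumexp(scores):
    # log-sum-exp down each column of a 2D array
    m = scores.max(axis=0)
    return m + np.log(np.exp(scores - m).sum(axis=0))

def forward_log(Obs, pi, A, B, eps=np.finfo(float).eps):
    N, T = A.shape[0], Obs.shape[0]
    log_pi, log_A, log_B = np.log(pi + eps), np.log(A + eps), np.log(B + eps)

    forward = np.zeros((N, T))
    forward[:, 0] = log_pi + log_B[:, Obs[0]]
    for t in range(1, T):
        # scores[s', s] = forward[s', t-1] + log A[s', s]
        scores = forward[:, t - 1][:, None] + log_A
        forward[:, t] = _col_logsumexp(scores) + log_B[:, Obs[t]]
    return forward

# toy HMM: 2 hidden states, 3 observation symbols
pi = np.array([0.6, 0.4])
A = np.array([[0.7, 0.3], [0.4, 0.6]])
B = np.array([[0.5, 0.4, 0.1], [0.1, 0.3, 0.6]])
print(forward_log(np.array([0, 2, 1, 2]), pi, A, B))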
Example #2
File: hmm.py Project: Tommliu/mx-ml
    def _backward(self, Obs):
        """
        Compute the backward probability trellis for an HMM parameterized by
        :math:`(A, B, \pi)`.

        Notes
        -----
        The backward trellis (sometimes referred to as `beta` in the HMM
        literature) is a 2D array where entry `i`, `j` represents the probability
        of seeing the observations from time `j+1` onward given that the HMM is
        in state `i` at time `j`:

        .. math:: \mathtt{backward[i,j]} = P(o_{j+1},o_{j+2},...,o_T|q_j=i,A,B,\pi)

        Here :math:`q_j = i` indicates that the hidden state at time `j` is of type `i`.

        The DP step is::

            backward[i,j] = sum_{s'=1}^N backward[s',j+1] * A[i, s'] * B[s',o_{j+1}]
                          = sum_{s'=1}^N P(o_{j+2},o_{j+3},...,o_T|q_{j+1}=s',A,B,pi) *
                            P(q_{j+1}=s'|q_{j}=i) * P(o_{j+1}|q_{j+1}=s')

        In words, ``backward[i,j]`` is the weighted sum of the values computed
        on the following timestep. The weight on each state value from the
        `j+1`'th timestep is the product of the probability of transitioning from
        state `i` to that state and the probability of emitting observation `j+1`
        from that state.

        Parameters
        ----------
        Obs : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)`
            A single observation sequence of length `T`.

        Returns
        -------
        backward : :py:class:`ndarray <numpy.ndarray>` of shape `(N, T)`
            The backward trellis of log probabilities.
        """
        eps = self.eps
        T = Obs.shape[0]

        # initialize the backward trellis
        backward = np.zeros((self.N, T))

        for s in range(self.N):
            backward[s, T - 1] = 0

        for t in reversed(range(T - 1)):
            ot1 = Obs[t + 1]
            for s in range(self.N):
                backward[s, t] = logsumexp(
                    [
                        np.log(self.A[s, s_] + eps)
                        + np.log(self.B[s_, ot1] + eps)
                        + backward[s_, t + 1]
                        for s_ in range(self.N)
                    ]
                )
        return backward
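As a sanity check, the backward trellis can be combined with the initial-state and first-emission terms to recover the same sequence likelihood the forward pass produces. Below is a self-contained sketch on an invented toy HMM, verified against a brute-force enumeration over all hidden-state paths.

import numpy as np
from itertools import product

def backward_log(Obs, A, B, eps=np.finfo(float).eps):
    N, T = A.shape[0], Obs.shape[0]
    log_A, log_B = np.log(A + eps), np.log(B + eps)
    backward = np.zeros((N, T))  # log beta[:, T-1] = 0
    for t in reversed(range(T - 1)):
        # scores[s, s'] = log A[s, s'] + log B[s', o_{t+1}] + backward[s', t+1]
        scores = log_A + log_B[:, Obs[t + 1]] + backward[:, t + 1]
        m = scores.max(axis=1)
        backward[:, t] = m + np.log(np.exp(scores - m[:, None]).sum(axis=1))
    return backward

pi = np.array([0.6, 0.4])
A = np.array([[0.7, 0.3], [0.4, 0.6]])
B = np.array([[0.5, 0.4, 0.1], [0.1, 0.3, 0.6]])
Obs = np.array([0, 2, 1])

beta = backward_log(Obs, A, B)
log_likelihood = np.log(np.sum(pi * B[:, Obs[0]] * np.exp(beta[:, 0])))

# brute force: P(O) = sum over all paths of pi[q_0] B[q_0, o_0] prod_t A[q_{t-1}, q_t] B[q_t, o_t]
p_brute = 0.0
for path in product(range(A.shape[0]), repeat=len(Obs)):
    p = pi[path[0]] * B[path[0], Obs[0]]
    for t in range(1, len(Obs)):
        p *= A[path[t - 1], path[t]] * B[path[t], Obs[t]]
    p_brute += p

print(log_likelihood, np.log(p_brute))  # agree up to the eps smoothing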
Example #3
    def _loss(self, X, target, neg_samples):
        """Actual computation of NCE loss"""
        fstr = "X must have shape (n_ex, n_c, n_in), but got {} dims instead"
        assert X.ndim == 3, fstr.format(X.ndim)

        W = self.parameters["W"]
        b = self.parameters["b"]

        # sample negative samples from the noise distribution
        if neg_samples is None:
            neg_samples = self.noise_sampler(self.num_negative_samples)
        assert len(neg_samples) == self.num_negative_samples

        # get the probability of the negative sample class and the target
        # class under the noise distribution
        p_neg_samples = self.noise_sampler.probs[neg_samples]
        p_target = np.atleast_2d(self.noise_sampler.probs[target])

        # save the noise samples for debugging
        noise_samples = (neg_samples, p_target, p_neg_samples)

        # compute the logit for the negative samples and target
        Z_target = X @ W[target].T + b[0, target]
        Z_neg = X @ W[neg_samples].T + b[0, neg_samples]

        # subtract the log probability of each label under the noise dist
        if self.subtract_log_label_prob:
            n, m = Z_target.shape[0], Z_neg.shape[0]
            Z_target[range(n), ...] -= np.log(p_target)
            Z_neg[range(m), ...] -= np.log(p_neg_samples)

        # only retain the probability of the target under its associated
        # minibatch example
        aa, _, cc = Z_target.shape
        Z_target = Z_target[range(aa), :, range(cc)][..., None]

        # p_target = (n_ex, n_c, 1)
        # p_neg = (n_ex, n_c, n_samples)
        pred_p_target = self.act_fn(Z_target)
        pred_p_neg = self.act_fn(Z_neg)

        # if we're in evaluation mode, ignore the negative samples - just
        # return the binary cross entropy on the targets
        y_pred = pred_p_target
        if self.trainable:
            # (n_ex, n_c, 1 + n_samples) (target is first column)
            y_pred = np.concatenate((y_pred, pred_p_neg), axis=-1)

        n_targets = 1
        y_true = np.zeros_like(y_pred)
        y_true[..., :n_targets] = 1

        # binary cross entropy
        eps = 2.220446049250313e-16
        np.clip(y_pred, eps, 1 - eps, y_pred)
        loss = -np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
        return loss, Z_target, Z_neg, y_pred, y_true, noise_samples
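For reference, the hard-coded eps above is machine epsilon (np.finfo(float).eps), and the final step is an ordinary element-wise binary cross-entropy in which the target column of y_true is 1 and the noise columns are 0. A stripped-down sketch with invented predictions (not the layer's actual outputs):

import numpy as np

eps = np.finfo(float).eps                # == 2.220446049250313e-16
y_pred = np.array([[[0.8, 0.1, 0.3]]])   # (n_ex=1, n_c=1, 1 target + 2 noise samples)
y_true = np.zeros_like(y_pred)
y_true[..., 0] = 1                       # the target occupies the first column

y_pred = np.clip(y_pred, eps, 1 - eps)
loss = -np.sum(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
print(loss)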
Example #4
    def loss(y, y_pred):
        """
        Compute the cross-entropy (log) loss.

        Notes
        -----
        This method returns the sum (not the average!) of the losses for each
        sample.

        Parameters
        ----------
        y : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            Class labels (one-hot with `m` possible classes) for each of `n`
            examples.
        y_pred : :py:class:`ndarray <numpy.ndarray>` of shape (n, m)
            Probabilities of each of `m` classes for the `n` examples in the
            batch.

        Returns
        -------
        loss : float
            The sum of the cross-entropy across classes and examples.
        """
        is_binary(y)
        is_stochastic(y_pred)

        # prevent taking the log of 0
        eps = 2.220446049250313e-16

        # each example is associated with a single class; sum the negative log
        # probability of the correct label over all samples in the batch.
        # observe that we are taking advantage of the fact that y is one-hot
        # encoded
        cross_entropy = -np.sum(y * np.log(y_pred + eps))
        return cross_entropy
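A quick usage sketch with one-hot labels and row-stochastic predictions (the computation is inlined here rather than calling the method, and the numbers are invented). Because y is one-hot, the sum reduces to the negative log probability assigned to each correct class:

import numpy as np

y = np.array([[1, 0, 0],
              [0, 0, 1]])
y_pred = np.array([[0.7, 0.2, 0.1],
                   [0.1, 0.3, 0.6]])

eps = np.finfo(float).eps
cross_entropy = -np.sum(y * np.log(y_pred + eps))
print(cross_entropy, -(np.log(0.7) + np.log(0.6)))  # the two values match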
Example #5
def logsumexp(log_probs, axis=None):
    """
    Reimplementation of scipy.special.logsumexp.
    See: http://bayesjumping.net/log-sum-exp-trick/
    """
    _max = np.max(log_probs)
    ds = log_probs - _max
    exp_sum = np.exp(ds).sum(axis=axis)
    return _max + np.log(exp_sum)
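A brief check of the logsumexp defined above against SciPy's implementation (assuming SciPy is available). Note that this version always subtracts the global max; that is still mathematically correct when an axis is given, just slightly less robust than a per-axis max:

import numpy as np
from scipy.special import logsumexp as sp_logsumexp

log_probs = np.log(np.array([[0.1, 0.2, 0.3],
                             [0.4, 0.5, 0.6]]))
print(logsumexp(log_probs), sp_logsumexp(log_probs))                  # scalars, all elements
print(logsumexp(log_probs, axis=0), sp_logsumexp(log_probs, axis=0))  # per-column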
Example #6
    def fn(self, z):
        """
        Evaluate the softplus activation on the elements of input `z`.

        .. math::

            \\text{SoftPlus}(z_i) = \log(1 + e^{z_i})
        """
        return np.log(np.exp(z) + 1)
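One caveat: np.log(np.exp(z) + 1) overflows for large positive z. A numerically stable sketch of the same function uses np.logaddexp, since log(1 + e^z) = log(e^0 + e^z):

import numpy as np

def softplus_stable(z):
    # log(1 + exp(z)) computed without overflowing for large z
    return np.logaddexp(0, z)

z = np.array([-500.0, 0.0, 500.0])
print(softplus_stable(z))  # [0., 0.6931..., 500.]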
Example #7
def log_gaussian_pdf(x_i, mu, sigma):
    """
    Compute log N(x_i | mu, sigma)
    """
    n = len(mu)
    a = n * np.log(2 * np.pi)
    _, b = np.linalg.slogdet(sigma)

    y = np.linalg.solve(sigma, x_i - mu)
    c = np.dot(x_i - mu, y)
    return -0.5 * (a + b + c)
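A quick cross-check of log_gaussian_pdf against SciPy (assuming SciPy is available), using an invented 2D Gaussian:

import numpy as np
from scipy.stats import multivariate_normal

mu = np.array([0.0, 1.0])
sigma = np.array([[2.0, 0.3],
                  [0.3, 1.0]])
x_i = np.array([0.5, 0.5])

print(log_gaussian_pdf(x_i, mu, sigma))
print(multivariate_normal.logpdf(x_i, mean=mu, cov=sigma))  # should match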
Example #8
File: hmm.py Project: Tommliu/mx-ml
def logsumexp(log_probs, axis=None):
    """
    Reimplementation of scipy.special.logsumexp.
    See: http://bayesjumping.net/log-sum-exp-trick/
    """
    _max = np.max(log_probs)
    ds = log_probs - _max
    exp_sum = np.exp(ds).sum(axis=axis)
    return float(_max + np.log(exp_sum))
Example #9
File: lda.py Project: Tommliu/mx-ml
    def VLB(self):
        """
        Return the variational lower bound associated with the current model
        parameters.
        """
        phi = self.phi
        alpha = self.alpha
        beta = self.beta
        gamma = self.gamma
        corpus = self.corpus

        D = self.D
        T = self.T
        N = self.N

        a, b, c, _d = 0, 0, 0, 0
        for d in range(D):
            a += (
                gammaln(np.sum(alpha))
                - np.sum(gammaln(alpha))
                + np.sum([(alpha[t] - 1) * dg(gamma, d, t) for t in range(T)])
            )

            _d += (
                gammaln(np.sum(gamma[d, :]))
                - np.sum(gammaln(gamma[d, :]))
                + np.sum([(gamma[d, t] - 1) * dg(gamma, d, t) for t in range(T)])
            )

            for n in range(N[d]):
                w_n = int(corpus[d][n])

                b += np.sum([phi[d][n, t] * dg(gamma, d, t) for t in range(T)])
                c += np.sum([phi[d][n, t] * np.log(beta[w_n, t]) for t in range(T)])
                _d += np.sum([phi[d][n, t] * np.log(phi[d][n, t]) for t in range(T)])

        return a + b + c - _d
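The dg helper used above is not shown in this snippet. In standard mean-field LDA the corresponding quantity is the Dirichlet expectation E_q[log theta_{d,t}] = digamma(gamma[d,t]) - digamma(sum_t gamma[d,t]); the sketch below is a hypothetical stand-in under that assumption, not necessarily the project's actual dg:

import numpy as np
from scipy.special import digamma

def dg_sketch(gamma, d, t):
    # hypothetical helper: E_q[log theta_{d,t}] under Dirichlet(gamma[d, :])
    return digamma(gamma[d, t]) - digamma(gamma[d, :].sum())

gamma = np.array([[1.0, 2.0, 3.0]])
print(dg_sketch(gamma, 0, 1))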
Example #10
    def _E_step(self):
        for i in range(self.N):
            x_i = self.X[i, :]

            denom_vals = []
            for c in range(self.C):
                pi_c = self.pi[c]
                mu_c = self.mu[c, :]
                sigma_c = self.sigma[c, :, :]

                log_pi_c = np.log(pi_c)
                log_p_x_i = log_gaussian_pdf(x_i, mu_c, sigma_c)

                # log N(X_i | mu_c, Sigma_c) + log pi_c
                denom_vals.append(log_p_x_i + log_pi_c)

            # log \sum_c exp{ log N(X_i | mu_c, Sigma_c) + log pi_c }
            log_denom = logsumexp(denom_vals)
            q_i = np.exp([num - log_denom for num in denom_vals])
            assert_allclose(np.sum(q_i), 1, err_msg="{}".format(np.sum(q_i)))

            self.Q[i, :] = q_i
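The same E-step can be written without the explicit loops. Below is a self-contained, vectorized sketch on invented data: the responsibilities are a softmax over log pi_c + log N(x_i | mu_c, Sigma_c), computed in log space (SciPy is assumed available for the Gaussian density and logsumexp):

import numpy as np
from scipy.special import logsumexp as sp_logsumexp
from scipy.stats import multivariate_normal

rng = np.random.default_rng(0)
X = rng.normal(size=(5, 2))                      # 5 points in 2 dimensions
pi = np.array([0.5, 0.5])                        # mixing weights
mu = np.array([[0.0, 0.0], [1.0, 1.0]])          # component means
sigma = np.stack([np.eye(2), np.eye(2)])         # component covariances

# log_r[i, c] = log pi_c + log N(x_i | mu_c, Sigma_c)
log_r = np.stack(
    [np.log(pi[c]) + multivariate_normal.logpdf(X, mean=mu[c], cov=sigma[c])
     for c in range(len(pi))],
    axis=1,
)
log_r -= sp_logsumexp(log_r, axis=1, keepdims=True)
Q = np.exp(log_r)
print(Q.sum(axis=1))                             # each row sums to 1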
Example #11
File: dsp.py Project: Tommliu/mx-ml
def mfcc(
    x,
    fs=44000,
    n_mfccs=13,
    alpha=0.95,
    center=True,
    n_filters=20,
    window="hann",
    normalize=True,
    lifter_coef=22,
    stride_duration=0.01,
    window_duration=0.025,
    replace_intercept=True,
):
    """
    Compute the Mel-frequency cepstral coefficients (MFCC) for a signal.

    Notes
    -----
    Computing MFCC features proceeds in the following stages:

        1. Convert the signal into overlapping frames and apply a window fn
        2. Compute the power spectrum at each frame
        3. Apply the mel filterbank to the power spectra to get mel filterbank powers
        4. Take the logarithm of the mel filterbank powers at each frame
        5. Take the discrete cosine transform (DCT) of the log filterbank
           energies and retain only the first k coefficients to further reduce
           the dimensionality

    MFCCs were developed in the context of HMM-GMM automatic speech recognition
    (ASR) systems and can be used to provide a somewhat speaker/pitch
    invariant representation of phonemes.

    Parameters
    ----------
    x : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        A 1D signal consisting of `N` samples.
    fs : int
        The sample rate/frequency for the signal. Default is 44000.
    n_mfccs : int
        The number of cepstral coefficients to return (including the intercept
        coefficient). Default is 13.
    alpha : float in [0, 1)
        The preemphasis coefficient. A value of 0 corresponds to no
        filtering. Default is 0.95.
    center : bool
        Whether the kth frame of the signal should *begin* at index ``x[k *
        stride_len]`` (center = False) or be *centered* at ``x[k * stride_len]``
        (center = True). Default is True.
    n_filters : int
        The number of filters to include in the Mel filterbank. Default is 20.
    normalize : bool
        Whether to mean-normalize the MFCC values. Default is True.
    lifter_coef : int in :math:`[0, +\infty]`
        The cepstral filter coefficient. 0 corresponds to no filtering, larger
        values correspond to greater amounts of smoothing. Default is 22.
    window : {'hamming', 'hann', 'blackman_harris'}
        The windowing function to apply to the signal before taking the DFT.
        Default is 'hann'.
    stride_duration : float
        The duration of the hop between consecutive windows (in seconds).
        Default is 0.01.
    window_duration : float
        The duration of each frame / window (in seconds). Default is 0.025.
    replace_intercept : bool
        Replace the first MFCC coefficient (the intercept term) with the
        log of the total frame energy instead. Default is True.

    Returns
    -------
    mfccs : :py:class:`ndarray <numpy.ndarray>` of shape `(G, C)`
        Matrix of Mel-frequency cepstral coefficients. Rows correspond to
        frames, columns to cepstral coefficients
    """
    # map the power spectrum for the (framed + windowed representation of) `x`
    # onto the mel scale
    filter_energies, frame_energies = mel_spectrogram(
        x=x,
        fs=fs,
        alpha=alpha,
        center=center,
        window=window,
        n_filters=n_filters,
        mean_normalize=False,
        window_duration=window_duration,
        stride_duration=stride_duration,
    )

    log_energies = 10 * np.log10(filter_energies)

    # perform a DCT on the log-mel coefficients to further reduce the data
    # dimensionality -- the early DCT coefficients will capture the majority of
    # the data, allowing us to discard coefficients > n_mfccs
    mfccs = np.array([DCT(frame) for frame in log_energies])[:, :n_mfccs]

    mfccs = cepstral_lifter(mfccs, D=lifter_coef)
    mfccs -= np.mean(mfccs, axis=0) if normalize else 0

    if replace_intercept:
        # the 0th MFCC coefficient doesn't tell us anything about the spectrum;
        # replace it with the log of the frame energy for something more
        # informative
        mfccs[:, 0] = np.log(frame_energies)
    return mfccs
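A hedged usage sketch, assuming dsp.py from the same project (with its mel_spectrogram, DCT, and cepstral_lifter helpers) is importable; the one-second 440 Hz test tone is invented for illustration:

import numpy as np

fs = 44000
t = np.arange(fs) / fs
x = np.sin(2 * np.pi * 440 * t)       # one second of a 440 Hz tone

feats = mfcc(x, fs=fs, n_mfccs=13)
print(feats.shape)                    # (n_frames, 13)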
Example #12
    def marginal_log_likelihood(self, kernel_params=None):
        """
        Compute the log of the marginal likelihood (i.e., the log model
        evidence), :math:`p(y \mid X, \\text{kernel_params})`.

        Notes
        -----
        Under the GP regression model, the marginal likelihood is normally
        distributed:

        .. math::

            y | X, \\theta  \sim  \mathcal{N}(0, K + \\alpha I)

        Hence,

        .. math::

            \log p(y \mid X, \\theta) =
                -0.5 \log \det(K + \\alpha I) -
                    0.5 y^\\top (K + \\alpha I)^{-1} y - \\frac{n}{2} \log 2 \pi

        where :math:`K = \\text{kernel}(X, X)`, :math:`\\theta` is the set of
        kernel parameters, and `n` is the number of dimensions in `K`.

        Parameters
        ----------
        kernel_params : dict
            Parameters for the kernel function. If None, calculate the
            marginal likelihood under the kernel parameters defined at model
            initialization. Default is None.

        Returns
        -------
        marginal_log_likelihood : float
            The log likelihood of the training targets given the kernel
            parameterized by `kernel_params` and the training inputs,
            marginalized over all functions `f`.
        """
        X = self.parameters["X"]
        y = self.parameters["y"]
        alpha = self.hyperparameters["alpha"]

        K = self.parameters["GP_cov"]
        if kernel_params is not None:
            # create a new kernel with parameters `kernel_params` and recalc
            # the GP covariance matrix
            summary_dict = self.kernel.summary_dict()
            summary_dict["parameters"].update(kernel_params)
            kernel = KernelInitializer(summary_dict)()
            K = kernel(X, X)

        # add isotropic noise to kernel diagonal
        K += np.eye(K.shape[0]) * alpha

        Kinv = inv(K)
        Klogdet = -0.5 * slogdet(K)[1]
        const = K.shape[0] / 2 * np.log(2 * np.pi)

        # handle both uni- and multidimensional target values
        if y.ndim == 1:
            y = y[:, np.newaxis]

        # sum over each dimension of y

        marginal_ll = np.sum([
            Klogdet - 0.5 * np.dot(np.dot(_y.T, Kinv), _y) - const
            for _y in y.T
        ])
        return marginal_ll
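A standalone sketch of the same computation on an invented kernel matrix, cross-checked against a zero-mean Gaussian density (assuming SciPy is available), since log p(y | X, theta) = log N(y; 0, K + alpha I):

import numpy as np
from numpy.linalg import inv, slogdet
from scipy.stats import multivariate_normal

rng = np.random.default_rng(0)
X = rng.normal(size=(6, 1))
alpha = 1e-2
K = np.exp(-0.5 * (X - X.T) ** 2)     # a toy RBF kernel with unit lengthscale
K += alpha * np.eye(K.shape[0])       # isotropic noise on the diagonal
y = rng.normal(size=6)

Kinv = inv(K)
marginal_ll = (
    -0.5 * slogdet(K)[1]
    - 0.5 * y @ Kinv @ y
    - K.shape[0] / 2 * np.log(2 * np.pi)
)
print(marginal_ll)
print(multivariate_normal.logpdf(y, mean=np.zeros(6), cov=K))  # should match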
Example #13
File: hmm.py Project: Tommliu/mx-ml
    def _Mstep(self, gamma, xi, phi):
        """
        Run a single M-step update for the Baum-Welch/Forward-Backward
        algorithm.

        Parameters
        ----------
        gamma : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, T)`
            The estimated state-occupancy count matrix.
        xi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, N, T)`
            The estimated state-state transition count matrix.
        phi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N)`
            The estimated starting count matrix for each latent state.

        Returns
        -------
        A : :py:class:`ndarray <numpy.ndarray>` of shape `(N, N)`
            The estimated transition matrix.
        B : :py:class:`ndarray <numpy.ndarray>` of shape `(N, V)`
            The estimated emission matrix.
        pi : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
            The estimated prior probabilities for each latent state.
        """
        eps = self.eps

        # initialize the estimated transition (A) and emission (B) matrices
        A = np.zeros((self.N, self.N))
        B = np.zeros((self.N, self.V))
        pi = np.zeros(self.N)

        count_gamma = np.zeros((self.I, self.N, self.V))
        count_xi = np.zeros((self.I, self.N, self.N))

        for i in range(self.I):
            Obs = self.O[i, :]
            for si in range(self.N):
                for vk in range(self.V):
                    if not (Obs == vk).any():
                        # symbol vk never appears in this sequence
                        count_gamma[i, si, vk] = np.log(eps)
                    else:
                        count_gamma[i, si, vk] = logsumexp(gamma[i, si, Obs == vk])

                for sj in range(self.N):
                    count_xi[i, si, sj] = logsumexp(xi[i, si, sj, :])

        pi = logsumexp(phi, axis=0) - np.log(self.I + eps)
        np.testing.assert_almost_equal(np.exp(pi).sum(), 1)

        for si in range(self.N):
            for vk in range(self.V):
                B[si, vk] = logsumexp(count_gamma[:, si, vk]) - logsumexp(
                    count_gamma[:, si, :]
                )

            for sj in range(self.N):
                A[si, sj] = logsumexp(count_xi[:, si, sj]) - logsumexp(
                    count_xi[:, si, :]
                )

            np.testing.assert_almost_equal(np.exp(A[si, :]).sum(), 1)
            np.testing.assert_almost_equal(np.exp(B[si, :]).sum(), 1)
        return np.exp(A), np.exp(B), np.exp(pi)
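The normalization at the heart of the M-step above is a log-space row normalization: each row of a log-count matrix has the logsumexp of that row subtracted, so the exponentiated rows sum to one. A tiny standalone illustration on invented counts (SciPy's logsumexp is used for brevity):

import numpy as np
from scipy.special import logsumexp as sp_logsumexp

log_counts = np.log(np.array([[3.0, 1.0],
                              [2.0, 6.0]]))
log_A = log_counts - sp_logsumexp(log_counts, axis=1, keepdims=True)
print(np.exp(log_A))              # rows: [0.75, 0.25] and [0.25, 0.75]
print(np.exp(log_A).sum(axis=1))  # each row sums to 1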
Example #14
File: hmm.py Project: Tommliu/mx-ml
    def _Estep(self):
        """
        Run a single E-step update for the Baum-Welch/Forward-Backward
        algorithm. This step estimates ``xi`` and ``gamma``, the expected
        state-state transition counts and the expected state-occupancy counts,
        respectively.

        ``xi[i,j,k]`` gives the probability of being in state `i` at time `k`
        and state `j` at time `k+1` given the observed sequence `O` and the
        current estimates for transition (`A`) and emission (`B`) matrices::

            xi[i,j,k] = P(q_k=i,q_{k+1}=j|O,A,B,pi)
                      = P(q_k=i,q_{k+1}=j,O|A,B,pi) / P(O|A,B,pi)
                      = [
                            P(o_1,o_2,...,o_k,q_k=i|A,B,pi) *
                            P(q_{k+1}=j|q_k=i) * P(o_{k+1}|q_{k+1}=j) *
                            P(o_{k+2},o_{k+3},...,o_T|q_{k+1}=j,A,B,pi)
                        ] / P(O|A,B,pi)
                      = [
                            fwd[i, k] * self.A[i, j] *
                            self.B[j, o_{k+1}] * bwd[j, k + 1]
                        ] / fwd[:, T].sum()

        The expected number of transitions from state `i` to state `j` across the
        entire sequence is then the sum over all timesteps: ``xi[i,j,:].sum()``.

        ``gamma[i,j]`` gives the probability of being in state `i` at time `j`

        .. math:: \mathtt{gamma[i,j]} = P(q_j = i \mid O, A, B, \pi)

        Returns
        -------
        gamma : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, T)`
            The estimated state-occupancy count matrix.
        xi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, N, T)`
            The estimated state-state transition count matrix.
        phi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N)`
            The estimated prior counts for each latent state.
        """
        eps = self.eps

        gamma = np.zeros((self.I, self.N, self.T))
        xi = np.zeros((self.I, self.N, self.N, self.T))
        phi = np.zeros((self.I, self.N))

        for i in range(self.I):
            Obs = self.O[i, :]
            fwd = self._forward(Obs)
            bwd = self._backward(Obs)
            log_likelihood = logsumexp(fwd[:, self.T - 1])

            t = self.T - 1
            for si in range(self.N):
                gamma[i, si, t] = fwd[si, t] + bwd[si, t] - log_likelihood
                phi[i, si] = fwd[si, 0] + bwd[si, 0] - log_likelihood

            for t in range(self.T - 1):
                ot1 = Obs[t + 1]
                for si in range(self.N):
                    gamma[i, si, t] = fwd[si, t] + bwd[si, t] - log_likelihood
                    for sj in range(self.N):
                        xi[i, si, sj, t] = (
                            fwd[si, t]
                            + np.log(self.A[si, sj] + eps)
                            + np.log(self.B[sj, ot1] + eps)
                            + bwd[sj, t + 1]
                            - log_likelihood
                        )

        return gamma, xi, phi
Example #15
File: hmm.py Project: Tommliu/mx-ml
    def decode(self, O):
        """
        Given the HMM parameterized by :math:`(A, B, \pi)` and an observation
        sequence :math:`O = o_1, \ldots, o_T`, compute the most probable
        sequence of latent states, :math:`Q = q_1, \ldots, q_T`.

        Notes
        -----
        HMM decoding is done efficiently via DP using the Viterbi algorithm,
        which produces a 2D trellis, ``viterbi``, where entry `i`, `j` represents the
        probability under the HMM of being in state `i` at time `j` after having
        passed through the *most probable* state sequence :math:`q_1,\ldots,q_{j-1}`:

        .. math::

            \mathtt{viterbi[i,j]} =
                \max_{q_1,\ldots,q_{j-1}} P(o_1,\ldots,o_j,q_1,\ldots,q_{j-1},q_j=i \mid A,B,\pi)

        Here :math:`q_j = i` indicates that the hidden state at time `j` is of
        type `i`, and :math:`\max_{q_1,\ldots,q_{j-1}}` represents the maximum over
        all possible latent state sequences for the first `j-1` observations.

        The DP step is:

        .. math::

            \mathtt{viterbi[i,j]}  &=  \max_{s'=1}^N \mathtt{viterbi[s',j-1]} \cdot
            \mathtt{A[s',i]} \cdot \mathtt{B[i,o_j]} \\

                                   &=  \max_{s'=1}^N
                                   P(o_1,\ldots,o_{j-1},q_1,\ldots,q_{j-2},q_{j-1}=s' \mid A,B,\pi)
                                   P(q_j=i \mid q_{j-1}=s') P(o_j \mid q_j=i)

        In words, ``viterbi[i,j]`` is the maximum over the weighted values
        computed on the previous timestep. The weight on each value is the
        product of the probability of transitioning from that state to state
        `i` and the probability of emitting observation `j` in state `i`.

        To compute the most probable state sequence we maintain a second
        trellis, ``back_pointer``, whose `i`, `j` entry contains the value of the
        latent state at timestep `j-1` that is most likely to lead to latent
        state `i` at timestep `j`.

        When we have completed the ``viterbi`` and ``back_pointer`` trellises for
        all `T` timesteps/observations, we greedily move backwards through the
        ``back_pointer`` trellis to construct the best path for the full
        sequence of observations.

        Parameters
        ----------
        O : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)`
            An observation sequence of length `T`.

        Returns
        -------
        best_path : list of length `T`
            The most probable sequence of latent states for observations `O`.
        best_path_log_prob : float
            The log probability of the latent state sequence in `best_path`
            under the HMM.
        """
        eps = self.eps

        if O.ndim == 1:
            O = O.reshape(1, -1)

        # number of observations in each sequence
        T = O.shape[1]

        # number of training sequences
        I = O.shape[0]
        if I != 1:
            raise ValueError("Can only decode a single sequence (O.shape[0] must be 1)")

        # initialize the viterbi and back_pointer matrices
        viterbi = np.zeros((self.N, T))
        back_pointer = np.zeros((self.N, T)).astype(int)

        ot = O[0, 0]
        for s in range(self.N):
            back_pointer[s, 0] = 0
            viterbi[s, 0] = np.log(self.pi[s] + eps) + np.log(self.B[s, ot] + eps)

        for t in range(1, T):
            ot = O[0, t]
            for s in range(self.N):
                seq_probs = [
                    viterbi[s_, t - 1]
                    + np.log(self.A[s_, s] + eps)
                    + np.log(self.B[s, ot] + eps)
                    for s_ in range(self.N)
                ]

                viterbi[s, t] = np.max(seq_probs)
                back_pointer[s, t] = np.argmax(seq_probs)

        best_path_log_prob = viterbi[:, T - 1].max()

        # backtrack through the trellis to get the most likely sequence of
        # latent states
        pointer = viterbi[:, T - 1].argmax()
        best_path = [pointer]
        for t in reversed(range(1, T)):
            pointer = back_pointer[pointer, t]
            best_path.append(pointer)
        best_path = best_path[::-1]
        return best_path, best_path_log_prob
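A self-contained log-space Viterbi sketch that mirrors the method above on an invented 2-state HMM, with a brute-force check that the recovered path maximizes the joint probability over all state sequences:

import numpy as np
from itertools import product

eps = np.finfo(float).eps
pi = np.array([0.6, 0.4])
A = np.array([[0.7, 0.3], [0.4, 0.6]])
B = np.array([[0.5, 0.4, 0.1], [0.1, 0.3, 0.6]])
O = np.array([0, 2, 1])
N, T = A.shape[0], O.shape[0]

viterbi = np.zeros((N, T))
back_pointer = np.zeros((N, T), dtype=int)
viterbi[:, 0] = np.log(pi + eps) + np.log(B[:, O[0]] + eps)
for t in range(1, T):
    # scores[s', s] = viterbi[s', t-1] + log A[s', s] + log B[s, o_t]
    scores = viterbi[:, t - 1][:, None] + np.log(A + eps) + np.log(B[:, O[t]] + eps)
    viterbi[:, t] = scores.max(axis=0)
    back_pointer[:, t] = scores.argmax(axis=0)

# backtrack through the trellis
pointer = viterbi[:, -1].argmax()
best_path = [pointer]
for t in reversed(range(1, T)):
    pointer = back_pointer[pointer, t]
    best_path.append(pointer)
best_path = [int(s) for s in best_path[::-1]]

# brute force: the recovered path should maximize the joint probability
def joint(q):
    p = pi[q[0]] * B[q[0], O[0]]
    for t in range(1, T):
        p *= A[q[t - 1], q[t]] * B[q[t], O[t]]
    return p

print(best_path, list(max(product(range(N), repeat=T), key=joint)))  # should agree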