def transform(self, labels, categories=None):
    """
    Convert a list of labels into a one-hot encoding.

    Parameters
    ----------
    labels : list of length `N`
        A list of category labels.
    categories : list of length `C`
        List of the unique category labels for the items to encode. Default
        is None.

    Returns
    -------
    Y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, C)`
        The one-hot encoded labels. Each row corresponds to an example, with
        a single 1 in the column corresponding to the respective label.
    """
    if not self._is_fit:
        categories = set(labels) if categories is None else categories
        self.fit(categories)

    # `labels` is a plain list, so compare its elements directly rather than
    # calling array-conversion methods on it
    unknown = list(set(labels) - set(self.cat2idx.keys()))
    assert len(unknown) == 0, "Unrecognized label(s): {}".format(unknown)

    N, C = len(labels), len(self.cat2idx)
    cols = np.array([self.cat2idx[c] for c in labels])

    Y = np.zeros((N, C))
    Y[np.arange(N), cols] = 1
    return Y
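def _onehot_demo():
    """
    A minimal standalone sketch of the one-hot logic in ``transform`` above,
    using hypothetical labels and only NumPy (no encoder class needed).
    """
    labels = ["cat", "dog", "cat"]
    cat2idx = {c: i for i, c in enumerate(sorted(set(labels)))}
    Y = np.zeros((len(labels), len(cat2idx)))
    Y[np.arange(len(labels)), [cat2idx[c] for c in labels]] = 1
    assert np.array_equal(Y, [[1, 0], [0, 1], [1, 0]])
    return Y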
def _init_params(self): init_weights = WeightInitializer(str(self.act_fn), mode=self.init) self.X = [] b = np.zeros((1, self.n_classes)) W = init_weights((self.n_classes, self.n_in)) self.parameters = {"W": W, "b": b} self.gradients = {"W": np.zeros_like(W), "b": np.zeros_like(b)} self.derived_variables = { "y_pred": [], "target": [], "true_w": [], "true_b": [], "sampled_b": [], "sampled_w": [], "out_labels": [], "target_logits": [], "noise_samples": [], "noise_logits": [], } self.is_initialized = True
def fit(self, X): """ Store the feature-wise mean and standard deviation across the samples in `X` for future scaling. Parameters ---------- X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, C)` An array of N samples, each with dimensionality `C` """ if not isinstance(X, np.ndarray): X = np.array(X) if X.shape[0] < 2: raise ValueError("`X` must contain at least 2 samples") std = np.ones(X.shape[1]) mean = np.zeros(X.shape[1]) if self.with_mean: mean = np.mean(X, axis=0) if self.with_std: std = np.std(X, axis=0, ddof=0) self._mean = mean self._std = std self._is_fit = True
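def _standardize_demo():
    """
    Standalone sketch of the scaling a fitted scaler would apply: subtract
    the stored feature-wise means and divide by the stored standard
    deviations. Hypothetical data; assumes only NumPy.
    """
    X = np.array([[1.0, 2.0], [3.0, 6.0], [5.0, 10.0]])
    mean, std = X.mean(axis=0), X.std(axis=0, ddof=0)
    X_scaled = (X - mean) / std
    assert np.allclose(X_scaled.mean(axis=0), 0)
    assert np.allclose(X_scaled.std(axis=0), 1)
    return X_scaled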
def conv2D_naive(X, W, stride, pad, dilation=0):
    """
    A slow but more straightforward implementation of a 2D "convolution"
    (technically, cross-correlation) of input `X` with a collection of
    kernels `W`.

    Notes
    -----
    This implementation uses ``for`` loops and direct indexing to perform the
    convolution. As a result, it is slower than the vectorized :func:`conv2D`
    function that relies on the :func:`col2im` and :func:`im2col`
    transformations.

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
        Input volume.
    W : :py:class:`ndarray <numpy.ndarray>` of shape `(kernel_rows, kernel_cols, in_ch, out_ch)`
        The volume of convolution weights/kernels.
    stride : int
        The stride of each convolution kernel.
    pad : tuple, int, or 'same'
        The padding amount. If 'same', add padding to ensure that the output
        of a 2D convolution with a kernel of `kernel_shape` and stride
        `stride` produces an output volume of the same dimensions as the
        input. If 2-tuple, specifies the number of padding rows and columns
        to add *on both sides* of the rows/columns in `X`. If 4-tuple,
        specifies the number of rows/columns to add to the top, bottom, left,
        and right of the input volume.
    dilation : int
        Number of pixels inserted between kernel elements. Default is 0.

    Returns
    -------
    Z : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, out_rows, out_cols, out_ch)`
        The convolution of `X` with `W`.
    """
    s, d = stride, dilation
    X_pad, p = pad2D(X, pad, W.shape[:2], stride=s, dilation=d)

    pr1, pr2, pc1, pc2 = p
    fr, fc, in_ch, out_ch = W.shape
    n_ex, in_rows, in_cols, in_ch = X.shape

    # update effective filter shape based on dilation factor
    fr, fc = fr * (d + 1) - d, fc * (d + 1) - d

    out_rows = int((in_rows + pr1 + pr2 - fr) / s + 1)
    out_cols = int((in_cols + pc1 + pc2 - fc) / s + 1)

    Z = np.zeros((n_ex, out_rows, out_cols, out_ch))
    for m in range(n_ex):
        for c in range(out_ch):
            for i in range(out_rows):
                for j in range(out_cols):
                    i0, i1 = i * s, (i * s) + fr
                    j0, j1 = j * s, (j * s) + fc
                    window = X_pad[m, i0:i1:(d + 1), j0:j1:(d + 1), :]
                    Z[m, i, j, c] = np.sum(window * W[:, :, :, c])
    return Z
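def _conv2D_naive_demo():
    """
    Hypothetical spot-check of ``conv2D_naive`` against SciPy's 2D
    cross-correlation for the simplest case: one example, one input and one
    output channel, stride 1, no padding, no dilation. Assumes SciPy is
    available and that ``pad2D`` accepts an integer pad of 0.
    """
    from scipy.signal import correlate2d

    X = np.random.randn(1, 5, 5, 1)  # (n_ex, in_rows, in_cols, in_ch)
    W = np.random.randn(3, 3, 1, 1)  # (fr, fc, in_ch, out_ch)
    Z = conv2D_naive(X, W, stride=1, pad=0)
    ref = correlate2d(X[0, :, :, 0], W[:, :, 0, 0], mode="valid")
    assert np.allclose(Z[0, :, :, 0], ref)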
def autocorrelate1D(x):
    r"""
    Autocorrelate a 1D signal `x` with itself.

    Notes
    -----
    The `k` th term in the 1 dimensional autocorrelation is

    .. math::

        a_k = \sum_n x_{n+k} x_n

    NB. This is a naive :math:`O(N^2)` implementation. For a faster
    :math:`O(N \log N)` approach using the FFT, see [1].

    References
    ----------
    .. [1] https://en.wikipedia.org/wiki/Autocorrelation#Efficient_computation

    Parameters
    ----------
    x : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        A 1D signal consisting of N samples.

    Returns
    -------
    auto : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The autocorrelation of `x` with itself.
    """
    N = len(x)
    auto = np.zeros(N)
    for k in range(N):
        for n in range(N - k):
            auto[k] += x[n + k] * x[n]
    return auto
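def _autocorrelate1D_fft(x):
    r"""
    A sketch of the :math:`O(N \log N)` FFT approach referenced in the
    docstring above: zero-pad to length `2N` to avoid circular wraparound,
    then use the fact that the autocorrelation is the inverse transform of
    the signal's power spectrum. Assumes only NumPy.
    """
    N = len(x)
    Fx = np.fft.rfft(x, n=2 * N)  # zero-padded forward transform
    auto = np.fft.irfft(Fx * np.conj(Fx))[:N]  # keep lags 0..N-1
    return auto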
def _M_step(self): C, N, X = self.C, self.N, self.X denoms = np.sum(self.Q, axis=0) # update cluster priors self.pi = denoms / N # update cluster means nums_mu = [np.dot(self.Q[:, c], X) for c in range(C)] for ix, (num, den) in enumerate(zip(nums_mu, denoms)): self.mu[ix, :] = num / den if den > 0 else np.zeros_like(num) # update cluster covariances for c in range(C): mu_c = self.mu[c, :] n_c = denoms[c] outer = np.zeros((self.d, self.d)) for i in range(N): wic = self.Q[i, c] xi = self.X[i, :] outer += wic * np.outer(xi - mu_c, xi - mu_c) outer = outer / n_c if n_c > 0 else outer self.sigma[c, :, :] = outer assert_allclose(np.sum(self.pi), 1, err_msg="{}".format(np.sum(self.pi)))
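def _covariance_update_demo():
    """
    A vectorized sketch of the covariance update in ``_M_step`` above, using
    ``np.einsum`` in place of the per-sample loop. Hypothetical toy values
    for the responsibilities ``Q``, data ``X``, and means ``mu``.
    """
    N, C, d = 6, 2, 3
    rng = np.random.RandomState(0)
    X = rng.randn(N, d)
    Q = rng.rand(N, C)
    Q /= Q.sum(axis=1, keepdims=True)  # normalize responsibilities per sample
    mu = rng.randn(C, d)

    diff = X[:, None, :] - mu[None, :, :]  # (N, C, d)
    # weighted sum of outer products: sum_n Q[n,c] * diff[n,c] diff[n,c]^T
    sigma = np.einsum("nc,ncd,nce->cde", Q, diff, diff)  # (C, d, d)
    sigma /= Q.sum(axis=0)[:, None, None]
    return sigma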
def _forward(self, Obs): """ Computes the forward probability trellis for an HMM parameterized by :math:`(A, B, \pi)`. Notes ----- The forward trellis (sometimes referred to as `alpha` in the HMM literature), is a 2D array where entry `i`, `j` represents the probability under the HMM of being in latent state `i` after seeing the first `j` observations: .. math:: \mathtt{forward[i,j]} = P(o_1,\ldots,o_j,q_j=i|A,B,\pi) Here :math:`q_j = i` indicates that the hidden state at time `j` is of type `i`. The DP step is:: forward[i,j] = sum_{s'=1}^N forward[s',j-1] * A[s',i] * B[i,o_j] = sum_{s'=1}^N P(o_1,\ldots,o_{j-1},q_{j-1}=s'|A,B,pi) * P(q_j=i|q_{j-1}=s') * P(o_j|q_j=i) In words, ``forward[i,j]`` is the weighted sum of the values computed on the previous timestep. The weight on each previous state value is the product of the probability of transitioning from that state to state `i` and the probability of emitting observation `j` in state `i`. Parameters ---------- Obs : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)` An observation sequence of length `T`. Returns ------- forward : :py:class:`ndarray <numpy.ndarray>` of shape `(N, T)` The forward trellis. """ eps = self.eps T = Obs.shape[0] # initialize the forward probability matrix forward = np.zeros((self.N, T)) ot = Obs[0] for s in range(self.N): forward[s, 0] = np.log(self.pi[s] + eps) + np.log(self.B[s, ot] + eps) for t in range(1, T): ot = Obs[t] for s in range(self.N): forward[s, t] = logsumexp( [ forward[s_, t - 1] + np.log(self.A[s_, s] + eps) + np.log(self.B[s, ot] + eps) for s_ in range(self.N) ] ) return forward
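def _forward_demo():
    """
    Sanity-check sketch of the forward recursion above on a hypothetical
    2-state toy HMM, in probability space rather than log space: the
    likelihood from the forward sum must match brute-force enumeration over
    all latent state paths.
    """
    import itertools

    A = np.array([[0.7, 0.3], [0.4, 0.6]])  # transition probabilities
    B = np.array([[0.9, 0.1], [0.2, 0.8]])  # emission probabilities
    pi = np.array([0.6, 0.4])               # initial state priors
    O = [0, 1, 0]

    # forward recursion
    fwd = pi * B[:, O[0]]
    for o in O[1:]:
        fwd = (fwd @ A) * B[:, o]

    # brute force over all latent state paths
    p_brute = sum(
        pi[q[0]]
        * B[q[0], O[0]]
        * np.prod([A[q[t - 1], q[t]] * B[q[t], O[t]] for t in range(1, len(O))])
        for q in itertools.product([0, 1], repeat=len(O))
    )
    assert np.isclose(fwd.sum(), p_brute)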
def _backward(self, Obs):
    r"""
    Compute the backward probability trellis for an HMM parameterized by
    :math:`(A, B, \pi)`.

    Notes
    -----
    The backward trellis (sometimes referred to as `beta` in the HMM
    literature), is a 2D array where entry `i`,`j` represents the
    probability of seeing the observations from time `j+1` onward given that
    the HMM is in state `i` at time `j`

    .. math::

        \mathtt{backward[i,j]} = P(o_{j+1},o_{j+2},\ldots,o_T \mid q_j=i,A,B,\pi)

    Here :math:`q_j = i` indicates that the hidden state at time `j` is of
    type `i`.

    The DP step is::

        backward[i,j] = sum_{s'=1}^N backward[s',j+1] * A[i,s'] * B[s',o_{j+1}]
                      = sum_{s'=1}^N P(o_{j+2},o_{j+3},...,o_T|q_{j+1}=s',A,B,pi) *
                          P(q_{j+1}=s'|q_{j}=i) * P(o_{j+1}|q_{j+1}=s')

    In words, ``backward[i,j]`` is the weighted sum of the values computed
    on the following timestep. The weight on each state value from the
    `j+1`'th timestep is the product of the probability of transitioning
    from state `i` to that state and the probability of emitting observation
    `j+1` from that state.

    Parameters
    ----------
    Obs : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)`
        A single observation sequence of length `T`.

    Returns
    -------
    backward : :py:class:`ndarray <numpy.ndarray>` of shape `(N, T)`
        The backward trellis.
    """
    eps = self.eps
    T = Obs.shape[0]

    # initialize the backward trellis; log(1) = 0 for the final timestep
    backward = np.zeros((self.N, T))
    for s in range(self.N):
        backward[s, T - 1] = 0

    for t in reversed(range(T - 1)):
        ot1 = Obs[t + 1]
        for s in range(self.N):
            backward[s, t] = logsumexp(
                [
                    np.log(self.A[s, s_] + eps)
                    + np.log(self.B[s_, ot1] + eps)
                    + backward[s_, t + 1]
                    for s_ in range(self.N)
                ]
            )
    return backward
def col2im(X_col, X_shape, W_shape, pad, stride, dilation=0):
    """
    Take columns of a 2D matrix and rearrange them into the blocks/windows
    of a 4D image volume.

    Notes
    -----
    A NumPy reimagining of MATLAB's ``col2im`` 'sliding' function.

    Code extended from Andrej Karpathy's ``im2col.py``.

    Parameters
    ----------
    X_col : :py:class:`ndarray <numpy.ndarray>` of shape `(Q, Z)`
        The columnized version of `X` (assumed to include padding)
    X_shape : 4-tuple containing `(n_ex, in_rows, in_cols, in_ch)`
        The original dimensions of `X` (not including padding)
    W_shape : 4-tuple containing `(kernel_rows, kernel_cols, in_ch, out_ch)`
        The dimensions of the weights in the present convolutional layer
    pad : 4-tuple of `(top, bottom, left, right)`
        Number of zero-padding rows/cols to add to `X`
    stride : int
        The stride of each convolution kernel
    dilation : int
        Number of pixels inserted between kernel elements. Default is 0.

    Returns
    -------
    img : :py:class:`ndarray <numpy.ndarray>` of shape `(n_ex, in_rows, in_cols, in_ch)`
        The reshaped `X_col` input matrix
    """
    if not (isinstance(pad, tuple) and len(pad) == 4):
        raise TypeError("pad must be a 4-tuple, but got: {}".format(pad))

    s, d = stride, dilation
    pr1, pr2, pc1, pc2 = pad
    fr, fc, n_in, n_out = W_shape
    n_ex, in_rows, in_cols, n_in = X_shape

    X_pad = np.zeros((n_ex, n_in, in_rows + pr1 + pr2, in_cols + pc1 + pc2))
    k, i, j = _im2col_indices((n_ex, n_in, in_rows, in_cols), fr, fc, pad, s, d)

    X_col_reshaped = X_col.reshape(n_in * fr * fc, -1, n_ex)
    X_col_reshaped = X_col_reshaped.transpose(2, 0, 1)

    # accumulate (rather than overwrite) values in overlapping windows
    np.add.at(X_pad, (slice(None), k, i, j), X_col_reshaped)

    pr2 = None if pr2 == 0 else -pr2
    pc2 = None if pc2 == 0 else -pc2
    return X_pad[:, :, pr1:pr2, pc1:pc2]
def _initialize_params(self): """ Randomly initialize the starting GMM parameters. """ C, d = self.C, self.d rr = np.random.rand(C) self.pi = rr / rr.sum() # cluster priors self.Q = np.zeros((self.N, C)) # variational distribution q(T) self.mu = np.random.uniform(-5, 10, C * d).reshape(C, d) # cluster means self.sigma = np.array([np.identity(d) for _ in range(C)]) # cluster covariances self.best_pi = None self.best_mu = None self.best_sigma = None self.best_elbo = -np.inf
def fit(self, X, y): """ Fit the GP prior to the training data. Parameters ---------- X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)` A training dataset of `N` examples, each with dimensionality `M`. y : :py:class:`ndarray <numpy.ndarray>` of shape `(N, O)` A collection of real-valued training targets for the examples in `X`, each with dimension `O`. """ mu = np.zeros(X.shape[0]) K = self.kernel(X, X) self.parameters["X"] = X self.parameters["y"] = y self.parameters["GP_cov"] = K self.parameters["GP_mean"] = mu
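def _rbf_kernel_sketch(X1, X2, length_scale=1.0):
    """
    A minimal sketch of the sort of Gram matrix the ``kernel`` call in
    ``fit`` above might compute, assuming an RBF kernel (hypothetical; the
    actual kernel is chosen at construction time).
    """
    # pairwise squared Euclidean distances between rows of X1 and X2
    D2 = (
        np.sum(X1 ** 2, axis=1)[:, None]
        + np.sum(X2 ** 2, axis=1)[None, :]
        - 2 * X1 @ X2.T
    )
    return np.exp(-0.5 * D2 / length_scale ** 2)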
def _encode_dense(self, examples): N = len(examples) table = np.zeros([N, self.n_dim]) # dense for row, feat_dict in enumerate(examples): for f_id, val in feat_dict.items(): if isinstance(f_id, str): f_id = f_id.encode("utf-8") # use json module to convert the feature id into a unique # string compatible with the buffer API (required by hashlib) if isinstance(f_id, (tuple, dict, list)): f_id = json.dumps(f_id, sort_keys=True).encode("utf-8") h = int(self.hash(f_id).hexdigest(), base=16) col = h % self.n_dim table[row, col] += np.sign(h) * val return table
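def _feature_hashing_demo():
    """
    Standalone sketch of the hashing trick used in ``_encode_dense``: hash
    each feature id into one of ``n_dim`` buckets and accumulate its value
    there. Hypothetical examples; uses ``hashlib.md5`` where the class uses
    its configured hash function.
    """
    import hashlib

    examples = [{"cat": 1.0, "dog": 2.0}, {"dog": 1.0}]
    n_dim = 8
    table = np.zeros((len(examples), n_dim))
    for row, feats in enumerate(examples):
        for f_id, val in feats.items():
            h = int(hashlib.md5(f_id.encode("utf-8")).hexdigest(), base=16)
            table[row, h % n_dim] += val
    return table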
def sample(self, X, n_samples=1, dist="posterior_predictive"):
    """
    Sample functions from the GP prior or posterior predictive
    distribution.

    Parameters
    ----------
    X : :py:class:`ndarray <numpy.ndarray>` of shape `(N, M)`
        The collection of datapoints to generate predictions on. Only used
        if `dist` = 'posterior_predictive'.
    n_samples : int
        The number of samples to generate. Default is 1.
    dist : {"posterior_predictive", "prior"}
        The distribution to draw samples from. Default is
        "posterior_predictive".

    Returns
    -------
    samples : :py:class:`ndarray <numpy.ndarray>` of shape `(n_samples, O, N)`
        The generated samples for the points in `X`.
    """
    mvnorm = np.random.multivariate_normal

    if dist == "prior":
        mu = np.zeros((X.shape[0], 1))
        cov = self.kernel(X, X)
    elif dist == "posterior_predictive":
        mu, _, cov = self.predict(X, return_cov=True)
    else:
        raise ValueError("Unrecognized dist: '{}'".format(dist))

    if mu.ndim == 1:
        mu = mu[:, np.newaxis]

    samples = np.array([mvnorm(_mu, cov, size=n_samples) for _mu in mu.T])
    return samples.swapaxes(0, 1)
def calc_conv_out_dims(X_shape, W_shape, stride=1, pad=0, dilation=0):
    """
    Compute the dimension of the output volume for the specified
    convolution.

    Parameters
    ----------
    X_shape : 3-tuple or 4-tuple
        The dimensions of the input volume to the convolution. If 3-tuple,
        entries are expected to be (`n_ex`, `in_length`, `in_ch`). If
        4-tuple, entries are expected to be (`n_ex`, `in_rows`, `in_cols`,
        `in_ch`).
    W_shape : 3-tuple or 4-tuple
        The dimensions of the weight volume for the convolution. If 3-tuple,
        entries are expected to be (`f_len`, `in_ch`, `out_ch`). If 4-tuple,
        entries are expected to be (`fr`, `fc`, `in_ch`, `out_ch`).
    stride : int
        The stride for the convolution kernel. Default is 1.
    pad : tuple, int, or {'same', 'causal'}
        The padding amount. If 'same', add padding to ensure that the output
        length of a 1D convolution with a kernel of `kernel_shape` and
        stride `stride` is the same as the input length. If 'causal' compute
        padding such that the output both has the same length as the input
        AND ``output[t]`` does not depend on ``input[t + 1:]``. If 2-tuple,
        specifies the number of padding columns to add on each side of the
        sequence. Default is 0.
    dilation : int
        The dilation of the convolution kernel. Default is 0.

    Returns
    -------
    out_dims : 3-tuple or 4-tuple
        The dimensions of the output volume. If 3-tuple, entries are
        (`n_ex`, `out_length`, `out_ch`). If 4-tuple, entries are (`n_ex`,
        `out_rows`, `out_cols`, `out_ch`).
    """
    dummy = np.zeros(X_shape)
    s, p, d = stride, pad, dilation
    if len(X_shape) == 3:
        _, p = pad1D(dummy, p)
        pw1, pw2 = p
        fw, in_ch, out_ch = W_shape
        n_ex, in_length, in_ch = X_shape

        # adjust effective filter size to account for dilation
        _fw = fw * (d + 1) - d
        out_length = (in_length + pw1 + pw2 - _fw) // s + 1
        out_dims = (n_ex, out_length, out_ch)

    elif len(X_shape) == 4:
        _, p = pad2D(dummy, p)
        pr1, pr2, pc1, pc2 = p
        fr, fc, in_ch, out_ch = W_shape
        n_ex, in_rows, in_cols, in_ch = X_shape

        # adjust effective filter size to account for dilation
        _fr, _fc = fr * (d + 1) - d, fc * (d + 1) - d
        out_rows = (in_rows + pr1 + pr2 - _fr) // s + 1
        out_cols = (in_cols + pc1 + pc2 - _fc) // s + 1
        out_dims = (n_ex, out_rows, out_cols, out_ch)

    else:
        raise ValueError("Unrecognized number of input dims: {}".format(len(X_shape)))
    return out_dims
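def _conv_out_dims_demo():
    """
    Worked check of the output-size arithmetic above, with no module
    dependencies: a 4x4 input, 3x3 kernel, stride 1, no padding, and no
    dilation should yield a 2x2 output.
    """
    in_rows, fr, s, d, pr1, pr2 = 4, 3, 1, 0, 0, 0
    _fr = fr * (d + 1) - d  # effective kernel size is still 3
    out_rows = (in_rows + pr1 + pr2 - _fr) // s + 1
    assert out_rows == 2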
def mel_filterbank(N, n_filters=20, fs=44000, min_freq=0, max_freq=None, normalize=True):
    """
    Compute the filters in a Mel filterbank and return the corresponding
    transformation matrix.

    Notes
    -----
    The Mel scale is a perceptual scale designed to simulate the way the
    human ear works. Pitches judged by listeners to be equal in perceptual /
    psychological distance have equal distance on the Mel scale.
    Practically, this corresponds to a scale with higher resolution at low
    frequencies and lower resolution at higher (> 500 Hz) frequencies.

    Each filter in the Mel filterbank is triangular with a response of 1 at
    its center and a linear decay on both sides until it reaches the center
    frequency of the next adjacent filter.

    This implementation is based on code in the (superb) LibROSA package [1].

    References
    ----------
    .. [1] McFee et al. (2015). "librosa: Audio and music signal analysis in
       Python", *Proceedings of the 14th Python in Science Conference*
       https://librosa.github.io

    Parameters
    ----------
    N : int
        The number of DFT bins.
    n_filters : int
        The number of mel filters to include in the filterbank. Default is
        20.
    fs : int
        The sample rate/frequency for the signal. Default is 44000.
    min_freq : int
        Minimum filter frequency (in Hz). Default is 0.
    max_freq : int
        Maximum filter frequency (in Hz). Default is None, which sets it to
        `fs / 2` (the Nyquist frequency).
    normalize : bool
        If True, scale the Mel filter weights by their area in Mel space.
        Default is True.

    Returns
    -------
    fbank : :py:class:`ndarray <numpy.ndarray>` of shape `(n_filters, N // 2 + 1)`
        The mel-filterbank transformation matrix. Rows correspond to
        filters, columns to DFT bins.
    """
    max_freq = fs / 2 if max_freq is None else max_freq
    min_mel, max_mel = hz2mel(min_freq), hz2mel(max_freq)

    fbank = np.zeros((n_filters, N // 2 + 1))

    # uniformly spaced values on the mel scale, translated back into Hz
    mel_bins = mel2hz(np.linspace(min_mel, max_mel, n_filters + 2))

    # the centers of the frequency bins for the DFT
    hz_bins = dft_bins(N, fs)

    mel_spacing = np.diff(mel_bins)

    # ramps[i] = mel_bins[i] - hz_bins
    ramps = mel_bins.reshape(-1, 1) - hz_bins.reshape(1, -1)
    for i in range(n_filters):
        # calc the filter values on the left and right across the bins ...
        left = -ramps[i] / mel_spacing[i]
        right = ramps[i + 2] / mel_spacing[i + 1]

        # .. and set them to zero when they cross the x-axis
        fbank[i] = np.maximum(0, np.minimum(left, right))

    if normalize:
        energy_norm = 2.0 / (mel_bins[2:n_filters + 2] - mel_bins[:n_filters])
        fbank *= energy_norm[:, np.newaxis]
    return fbank
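def _mel_conversion_sketch(hz=1000.0):
    """
    A sketch of the Hz <-> Mel conversion assumed by ``mel_filterbank``
    above. The module's ``hz2mel`` / ``mel2hz`` may use different constants;
    the O'Shaughnessy formula below is the most common convention.
    """
    def hz2mel_(hz):
        return 2595 * np.log10(1 + hz / 700)

    def mel2hz_(mel):
        return 700 * (10 ** (mel / 2595) - 1)

    assert np.isclose(mel2hz_(hz2mel_(hz)), hz)  # the two maps invert each other
    return hz2mel_(hz)  # 1 kHz is ~1000 mels under this convention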
def _Mstep(self, gamma, xi, phi):
    """
    Run a single M-step update for the Baum-Welch/Forward-Backward
    algorithm.

    Parameters
    ----------
    gamma : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, T)`
        The estimated state-occupancy count matrix.
    xi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, N, T)`
        The estimated state-state transition count matrix.
    phi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N)`
        The estimated starting count matrix for each latent state.

    Returns
    -------
    A : :py:class:`ndarray <numpy.ndarray>` of shape `(N, N)`
        The estimated transition matrix.
    B : :py:class:`ndarray <numpy.ndarray>` of shape `(N, V)`
        The estimated emission matrix.
    pi : :py:class:`ndarray <numpy.ndarray>` of shape `(N,)`
        The estimated prior probabilities for each latent state.
    """
    eps = self.eps

    # initialize the estimated transition (A) and emission (B) matrices
    A = np.zeros((self.N, self.N))
    B = np.zeros((self.N, self.V))
    pi = np.zeros(self.N)

    count_gamma = np.zeros((self.I, self.N, self.V))
    count_xi = np.zeros((self.I, self.N, self.N))

    for i in range(self.I):
        Obs = self.O[i, :]
        for si in range(self.N):
            for vk in range(self.V):
                # if symbol vk never appears in this sequence, assign it
                # (log) probability ~0 rather than -inf
                if not (Obs == vk).any():
                    count_gamma[i, si, vk] = np.log(eps)
                else:
                    count_gamma[i, si, vk] = logsumexp(gamma[i, si, Obs == vk])

            for sj in range(self.N):
                count_xi[i, si, sj] = logsumexp(xi[i, si, sj, :])

    pi = logsumexp(phi, axis=0) - np.log(self.I + eps)
    np.testing.assert_almost_equal(np.exp(pi).sum(), 1)

    for si in range(self.N):
        for vk in range(self.V):
            B[si, vk] = logsumexp(count_gamma[:, si, vk]) - logsumexp(
                count_gamma[:, si, :]
            )

        for sj in range(self.N):
            A[si, sj] = logsumexp(count_xi[:, si, sj]) - logsumexp(
                count_xi[:, si, :]
            )

        np.testing.assert_almost_equal(np.exp(A[si, :]).sum(), 1)
        np.testing.assert_almost_equal(np.exp(B[si, :]).sum(), 1)
    return np.exp(A), np.exp(B), np.exp(pi)
def _Estep(self):
    r"""
    Run a single E-step update for the Baum-Welch/Forward-Backward
    algorithm. This step estimates ``xi`` and ``gamma``, the expected
    state-state transition counts and the expected state-occupancy counts,
    respectively.

    ``xi[i,j,k]`` gives the probability of being in state `i` at time `k`
    and state `j` at time `k+1` given the observed sequence `O` and the
    current estimates for transition (`A`) and emission (`B`) matrices::

        xi[i,j,k] = P(q_k=i,q_{k+1}=j|O,A,B,pi)
                  = P(q_k=i,q_{k+1}=j,O|A,B,pi) / P(O|A,B,pi)
                  = [
                        P(o_1,o_2,...,o_k,q_k=i|A,B,pi) *
                        P(q_{k+1}=j|q_k=i) * P(o_{k+1}|q_{k+1}=j) *
                        P(o_{k+2},o_{k+3},...,o_T|q_{k+1}=j,A,B,pi)
                    ] / P(O|A,B,pi)
                  = [
                        fwd[i, k] * self.A[i, j] *
                        self.B[j, o_{k+1}] * bwd[j, k + 1]
                    ] / fwd[:, T].sum()

    The expected number of transitions from state `i` to state `j` across
    the entire sequence is then the sum over all timesteps:
    ``xi[i,j,:].sum()``.

    ``gamma[i,j]`` gives the probability of being in state `i` at time `j`

    .. math::

        \mathtt{gamma[i,j]} = P(q_j = i \mid O, A, B, \pi)

    Returns
    -------
    gamma : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, T)`
        The estimated state-occupancy count matrix.
    xi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N, N, T)`
        The estimated state-state transition count matrix.
    phi : :py:class:`ndarray <numpy.ndarray>` of shape `(I, N)`
        The estimated prior counts for each latent state.
    """
    eps = self.eps

    gamma = np.zeros((self.I, self.N, self.T))
    xi = np.zeros((self.I, self.N, self.N, self.T))
    phi = np.zeros((self.I, self.N))

    for i in range(self.I):
        Obs = self.O[i, :]
        fwd = self._forward(Obs)
        bwd = self._backward(Obs)
        log_likelihood = logsumexp(fwd[:, self.T - 1])

        t = self.T - 1
        for si in range(self.N):
            gamma[i, si, t] = fwd[si, t] + bwd[si, t] - log_likelihood
            phi[i, si] = fwd[si, 0] + bwd[si, 0] - log_likelihood

        for t in range(self.T - 1):
            ot1 = Obs[t + 1]
            for si in range(self.N):
                gamma[i, si, t] = fwd[si, t] + bwd[si, t] - log_likelihood
                for sj in range(self.N):
                    xi[i, si, sj, t] = (
                        fwd[si, t]
                        + np.log(self.A[si, sj] + eps)
                        + np.log(self.B[sj, ot1] + eps)
                        + bwd[sj, t + 1]
                        - log_likelihood
                    )
    return gamma, xi, phi
def decode(self, O):
    r"""
    Given the HMM parameterized by :math:`(A, B, \pi)` and an observation
    sequence :math:`O = o_1, \ldots, o_T`, compute the most probable
    sequence of latent states, :math:`Q = q_1, \ldots, q_T`.

    Notes
    -----
    HMM decoding is done efficiently via DP using the Viterbi algorithm,
    which produces a 2D trellis, ``viterbi``, where entry `i`, `j`
    represents the probability under the HMM of being in state `i` at time
    `j` after having passed through the *most probable* state sequence
    :math:`q_1,\ldots,q_{j-1}`:

    .. math::

        \mathtt{viterbi[i,j]} =
            \max_{q_1,\ldots,q_{j-1}}
                P(o_1,\ldots,o_j,q_1,\ldots,q_{j-1},q_j=i \mid A,B,\pi)

    Here :math:`q_j = i` indicates that the hidden state at time `j` is of
    type `i`, and :math:`\max_{q_1,\ldots,q_{j-1}}` represents the maximum
    over all possible latent state sequences for the first `j-1`
    observations. The DP step is:

    .. math::

        \mathtt{viterbi[i,j]}
            &= \max_{s'=1}^N \mathtt{viterbi[s',j-1]} \cdot
                \mathtt{A[s',i]} \cdot \mathtt{B[i,o_j]} \\
            &= \max_{s'=1}^N
                P(o_1,\ldots,o_{j-1},q_1,\ldots,q_{j-2},q_{j-1}=s' \mid A,B,\pi)
                \; P(q_j=i \mid q_{j-1}=s') \; P(o_j \mid q_j=i)

    In words, ``viterbi[i,j]`` is the maximum over the values computed on
    the previous timestep, where each value is weighted by the product of
    the probability of transitioning from that state to state `i` and the
    probability of emitting observation `j` in state `i`.

    To compute the most probable state sequence we maintain a second
    trellis, ``back_pointer``, whose `i`, `j` entry contains the value of
    the latent state at timestep `j-1` that is most likely to lead to latent
    state `i` at timestep `j`. When we have completed the ``viterbi`` and
    ``back_pointer`` trellises for all `T` timesteps/observations, we
    greedily move backwards through the ``back_pointer`` trellis to
    construct the best path for the full sequence of observations.

    Parameters
    ----------
    O : :py:class:`ndarray <numpy.ndarray>` of shape `(T,)`
        An observation sequence of length `T`.

    Returns
    -------
    best_path : list of length `T`
        The most probable sequence of latent states for observations `O`.
    best_path_log_prob : float
        The log probability of the latent state sequence in `best_path`
        under the HMM.
    """
    eps = self.eps

    if O.ndim == 1:
        O = O.reshape(1, -1)

    # number of observations in each sequence
    T = O.shape[1]

    # number of training sequences
    I = O.shape[0]
    if I != 1:
        raise ValueError("Can only decode a single sequence (O.shape[0] must be 1)")

    # initialize the viterbi and back_pointer matrices
    viterbi = np.zeros((self.N, T))
    back_pointer = np.zeros((self.N, T)).astype(int)

    ot = O[0, 0]
    for s in range(self.N):
        back_pointer[s, 0] = 0
        viterbi[s, 0] = np.log(self.pi[s] + eps) + np.log(self.B[s, ot] + eps)

    for t in range(1, T):
        ot = O[0, t]
        for s in range(self.N):
            seq_probs = [
                viterbi[s_, t - 1]
                + np.log(self.A[s_, s] + eps)
                + np.log(self.B[s, ot] + eps)
                for s_ in range(self.N)
            ]
            viterbi[s, t] = np.max(seq_probs)
            back_pointer[s, t] = np.argmax(seq_probs)

    best_path_log_prob = viterbi[:, T - 1].max()

    # backtrack through the trellis to get the most likely sequence of
    # latent states
    pointer = viterbi[:, T - 1].argmax()
    best_path = [pointer]
    for t in reversed(range(1, T)):
        pointer = back_pointer[pointer, t]
        best_path.append(pointer)
    best_path = best_path[::-1]

    return best_path, best_path_log_prob
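def _viterbi_demo():
    """
    Sanity-check sketch of the Viterbi recursion above on a hypothetical
    2-state toy HMM, in probability space rather than log space: the DP
    best path must match brute-force enumeration over all state paths.
    """
    import itertools

    A = np.array([[0.7, 0.3], [0.4, 0.6]])  # transition probabilities
    B = np.array([[0.9, 0.1], [0.2, 0.8]])  # emission probabilities
    pi = np.array([0.6, 0.4])               # initial state priors
    O = [0, 1, 0]

    def path_prob(q):
        p = pi[q[0]] * B[q[0], O[0]]
        for t in range(1, len(O)):
            p *= A[q[t - 1], q[t]] * B[q[t], O[t]]
        return p

    # brute-force most probable path
    best = max(itertools.product([0, 1], repeat=len(O)), key=path_prob)

    # Viterbi DP
    v = pi * B[:, O[0]]
    ptr = []
    for o in O[1:]:
        scores = v[:, None] * A            # scores[s', s] = v[s'] * A[s', s]
        ptr.append(scores.argmax(axis=0))  # best previous state for each s
        v = scores.max(axis=0) * B[:, o]

    # backtrack from the most probable final state
    path = [int(v.argmax())]
    for bp in reversed(ptr):
        path.append(int(bp[path[-1]]))
    path = path[::-1]

    assert tuple(path) == best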