def train_unsupervised(self, unlabeled_sequences, update_outputs=True, **kwargs):
    """
    Fit an HMM to unlabeled observation sequences with Baum-Welch.

    Baum-Welch is an EM variant: it needs only the observed symbols, not
    the hidden state sequences.  The procedure follows 'A Tutorial on
    Hidden Markov Models and Selected Applications in Speech
    Recognition', Lawrence Rabiner, IEEE, 1989.

    :param unlabeled_sequences: the training data, a set of
        sequences of observations
    :type unlabeled_sequences: list
    :param update_outputs: when false, the model's output distributions
        are neither made mutable nor re-estimated; only the transition
        distributions are updated
    :type update_outputs: bool

    kwargs may include following parameters:

    :param model: a HiddenMarkovModelTagger instance used to begin
        the Baum-Welch algorithm
    :param max_iterations: the maximum number of EM iterations
        (defaults to 1000)
    :param convergence_logprob: the maximum change in log probability to
        allow convergence (defaults to 1e-6)

    :return: the trained model
    :rtype: HiddenMarkovModelTagger
    """
    # Use the caller's starting model if there is one; otherwise build a
    # randomly initialised HMM that the EM loop will refine.
    model = kwargs.get('model')
    if not model:
        priors = RandomProbDist(self._states)
        transitions = DictionaryConditionalProbDist(
            {state: RandomProbDist(self._states) for state in self._states})
        outputs = DictionaryConditionalProbDist(
            {state: RandomProbDist(self._symbols) for state in self._states})
        model = HiddenMarkovModelTagger(
            self._symbols, self._states, transitions, outputs, priors)

    # From here on the trainer works with the model's own inventories.
    self._states = model._states
    self._symbols = model._symbols

    num_states = len(self._states)
    num_symbols = len(self._symbols)
    symbol_numbers = {sym: idx for idx, sym in enumerate(self._symbols)}

    # Wrap the distributions in mutable versions so they can be updated
    # in place.  The priors are deliberately left untouched:
    # model._priors = MutableProbDist(model._priors, self._states)
    model._transitions = DictionaryConditionalProbDist(
        {s: MutableProbDist(model._transitions[s], self._states)
         for s in self._states})

    if update_outputs:
        model._outputs = DictionaryConditionalProbDist(
            {s: MutableProbDist(model._outputs[s], self._symbols)
             for s in self._states})

    model.reset_cache()

    # EM loop settings and bookkeeping.
    max_iterations = kwargs.get('max_iterations', 1000)
    epsilon = kwargs.get('convergence_logprob', 1e-6)
    iteration = 0
    last_logprob = None
    converged = False

    while not converged and iteration < max_iterations:
        # Log-domain accumulators, reset for every pass over the data.
        A_numer = _ninf_array((num_states, num_states))
        B_numer = _ninf_array((num_states, num_symbols))
        A_denom = _ninf_array(num_states)
        B_denom = _ninf_array(num_states)

        logprob = 0
        for raw_sequence in unlabeled_sequences:
            observations = list(raw_sequence)
            if not observations:
                continue

            (seq_logprob, seq_A_numer, seq_A_denom,
             seq_B_numer, seq_B_denom) = self._baum_welch_step(
                observations, model, symbol_numbers)

            # Fold this sequence's sums into the global ones, offset by
            # the sequence log probability.
            for row in range(num_states):
                A_numer[row] = np.logaddexp2(
                    A_numer[row], seq_A_numer[row] - seq_logprob)
                B_numer[row] = np.logaddexp2(
                    B_numer[row], seq_B_numer[row] - seq_logprob)

            A_denom = np.logaddexp2(A_denom, seq_A_denom - seq_logprob)
            B_denom = np.logaddexp2(B_denom, seq_B_denom - seq_logprob)

            logprob += seq_logprob

        # Re-estimate the transition and output distributions from the
        # accumulated sums.
        for row, source_state in enumerate(self._states):
            raw_A = A_numer[row] - A_denom[row]
            raw_B = B_numer[row] - B_denom[row]

            # Normalise each row (see p.391 Huang et al).  With K = sum(P),
            # Pi' = Pi / K, i.e. log2(Pi') = log2(Pi) - log2(K).
            norm_A = raw_A - logsumexp2(raw_A)
            norm_B = raw_B - logsumexp2(raw_B)

            transition_dist = model._transitions[source_state]
            for target_state, value in zip(self._states, norm_A):
                transition_dist.update(target_state, value)

            if update_outputs:
                output_dist = model._outputs[source_state]
                for symbol, value in zip(self._symbols, norm_B):
                    output_dist.update(symbol, value)

            # Rabiner says the priors don't need to be updated.  I don't
            # believe him. FIXME

        # Converged once the total log probability stops moving; the
        # first iteration has no previous value to compare against.
        if iteration > 0 and abs(logprob - last_logprob) < epsilon:
            converged = True

        print('iteration', iteration, 'logprob', logprob)
        iteration += 1
        last_logprob = logprob

    return model
def cpd(array, conditions, samples):
    """
    Build a conditional probability distribution from rows of values.

    Each row of ``array`` is paired positionally with an entry of
    ``conditions``; the row is turned into a distribution over ``samples``
    via ``pd`` and stored under that condition.

    :param array: iterable of value rows, one per condition
    :param conditions: iterable of condition keys, aligned with ``array``
    :param samples: the samples passed through to ``pd`` for every row
    :return: a DictionaryConditionalProbDist mapping each condition to
        the distribution built from its row
    """
    per_condition = {
        condition: pd(row, samples)
        for row, condition in zip(array, conditions)
    }
    return DictionaryConditionalProbDist(per_condition)
def train_unsupervised(self, unlabeled_sequences, **kwargs):
    """
    Trains the HMM using the Baum-Welch algorithm to maximise the
    probability of the data sequence. This is a variant of the EM
    algorithm, and is unsupervised in that it doesn't need the state
    sequences for the symbols. The code is based on 'A Tutorial on Hidden
    Markov Models and Selected Applications in Speech Recognition',
    Lawrence Rabiner, IEEE, 1989.

    One progress line ('iteration <n> logprob <value>') is printed per EM
    iteration.

    :return: the trained model
    :rtype: HiddenMarkovModelTagger
    :param unlabeled_sequences: the training data, a set of
        sequences of observations; each observation is indexed with
        ``_TEXT`` to obtain its symbol
    :type unlabeled_sequences: list

    kwargs may include following parameters:

    :param model: a HiddenMarkovModelTagger instance used to begin
        the Baum-Welch algorithm
    :param max_iterations: the maximum number of EM iterations
        (defaults to 1000)
    :param convergence_logprob: the maximum change in log probability to
        allow convergence (defaults to 1e-6)
    :raise KeyError: if an observed symbol is not one of the trainer's
        symbols (the symbol-to-index lookup is unguarded)
    """
    N = len(self._states)
    M = len(self._symbols)
    # Column index of every symbol in the B (output) accumulators.
    symbol_dict = dict((self._symbols[i], i) for i in range(M))

    # create a uniform HMM, which will be iteratively refined, unless
    # given an existing model
    model = kwargs.get('model')
    if not model:
        priors = UniformProbDist(self._states)
        transitions = DictionaryConditionalProbDist(
            dict((state, UniformProbDist(self._states))
                 for state in self._states))
        output = DictionaryConditionalProbDist(
            dict((state, UniformProbDist(self._symbols))
                 for state in self._states))
        model = HiddenMarkovModelTagger(self._symbols, self._states,
                                        transitions, output, priors)

    # update model prob dists so that they can be modified
    model._priors = MutableProbDist(model._priors, self._states)
    model._transitions = DictionaryConditionalProbDist(
        dict((s, MutableProbDist(model._transitions[s], self._states))
             for s in self._states))
    model._outputs = DictionaryConditionalProbDist(
        dict((s, MutableProbDist(model._outputs[s], self._symbols))
             for s in self._states))

    # iterate until convergence
    converged = False
    last_logprob = None
    iteration = 0
    max_iterations = kwargs.get('max_iterations', 1000)
    epsilon = kwargs.get('convergence_logprob', 1e-6)
    while not converged and iteration < max_iterations:
        # Global log-domain accumulators, reset on every EM iteration.
        A_numer = ones((N, N), float64) * _NINF
        B_numer = ones((N, M), float64) * _NINF
        A_denom = ones(N, float64) * _NINF
        B_denom = ones(N, float64) * _NINF

        logprob = 0
        for sequence in unlabeled_sequences:
            sequence = list(sequence)
            if not sequence:
                continue

            # compute forward and backward probabilities
            alpha = model._forward_probability(sequence)
            beta = model._backward_probability(sequence)

            # find the log probability of the sequence
            T = len(sequence)
            lpk = _log_add(*alpha[T - 1, :])
            logprob += lpk

            # now update A and B (transition and output probabilities)
            # using the alpha and beta values. Please refer to Rabiner's
            # paper for details, it's too hard to explain in comments
            local_A_numer = ones((N, N), float64) * _NINF
            local_B_numer = ones((N, M), float64) * _NINF
            local_A_denom = ones(N, float64) * _NINF
            local_B_denom = ones(N, float64) * _NINF

            # for each position, accumulate sums for A and B
            for t in range(T):
                x = sequence[t][_TEXT]  # not found? FIXME
                if t < T - 1:
                    xnext = sequence[t + 1][_TEXT]  # not found? FIXME
                xi = symbol_dict[x]
                for i in range(N):
                    si = self._states[i]
                    if t < T - 1:
                        for j in range(N):
                            sj = self._states[j]
                            local_A_numer[i, j] = _log_add(
                                local_A_numer[i, j],
                                alpha[t, i] +
                                model._transitions[si].logprob(sj) +
                                model._outputs[sj].logprob(xnext) +
                                beta[t + 1, j])
                        local_A_denom[i] = _log_add(
                            local_A_denom[i], alpha[t, i] + beta[t, i])
                    else:
                        # Last position: the A denominator already holds
                        # the sum over t < T-1, so adding this final term
                        # gives the B denominator, which spans every t.
                        local_B_denom[i] = _log_add(
                            local_A_denom[i], alpha[t, i] + beta[t, i])

                    local_B_numer[i, xi] = _log_add(
                        local_B_numer[i, xi], alpha[t, i] + beta[t, i])

            # add these sums to the global A and B values
            for i in range(N):
                for j in range(N):
                    A_numer[i, j] = _log_add(
                        A_numer[i, j], local_A_numer[i, j] - lpk)
                for k in range(M):
                    B_numer[i, k] = _log_add(
                        B_numer[i, k], local_B_numer[i, k] - lpk)

                A_denom[i] = _log_add(A_denom[i], local_A_denom[i] - lpk)
                B_denom[i] = _log_add(B_denom[i], local_B_denom[i] - lpk)

        # use the calculated values to update the transition and output
        # probability values
        for i in range(N):
            si = self._states[i]
            for j in range(N):
                sj = self._states[j]
                model._transitions[si].update(
                    sj, A_numer[i, j] - A_denom[i])
            for k in range(M):
                ok = self._symbols[k]
                model._outputs[si].update(ok, B_numer[i, k] - B_denom[i])
            # Rabiner says the priors don't need to be updated.  I don't
            # believe him. FIXME

        # test for convergence
        if iteration > 0 and abs(logprob - last_logprob) < epsilon:
            converged = True

        # A single pre-formatted argument keeps the output identical under
        # both the Python 2 print statement and the Python 3 print function
        # (the bare `print a, b` statement form is a SyntaxError on 3.x).
        print('iteration %s logprob %s' % (iteration, logprob))
        iteration += 1
        last_logprob = logprob

    return model