def prob_classify(self, feat):
    probs = self.root.prob_classify(feat)
    # Passing in self.labels() ensures it doesn't have any of
    # label_classifiers.keys()
    mult = MutableProbDist(probs, self.labels(), store_logs=False)
    for classifier in self.label_classifiers.values():
        pd = classifier.prob_classify(feat)
        for sample in pd.samples():
            mult.update(sample, pd.prob(sample), log=False)
    return mult
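# Illustration of the MutableProbDist update semantics relied on above (a
# minimal sketch, assuming NLTK's probability classes; the demo name below
# is hypothetical). update() overwrites a single sample's probability in
# place, without renormalizing the others.
def _example_mutable_update():
    from nltk.probability import DictionaryProbDist, MutableProbDist
    base = DictionaryProbDist({'x': 0.5, 'y': 0.5})
    mut = MutableProbDist(base, ['x', 'y'], store_logs=False)
    mut.update('x', 0.9, log=False)  # a raw probability, not a log
    return mut.prob('x')             # now 0.9; 'y' is left at 0.5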
def prob_dist_to_dictionary_prob_dist(dist, mutable=False, samples=None):
    """
    Takes a probability distribution estimated in any way (e.g. from a
    freq dist) and produces a corresponding dictionary prob dist that
    just stores the probability of every sample. Can be used to turn
    any kind of prob dist into a dictionary-based one, including a
    MutableProbDist.

    @type mutable: bool
    @param mutable: if True, the returned dist is a mutable prob dist

    """
    # We may want to give a different set of samples, for example, if there
    # are samples not represented in the original dist
    if samples is None:
        samples = dist.samples()
    probs = {}
    for sample in samples:
        probs[sample] = dist.prob(sample)
    # We'd expect these to sum to one, but normalize just in case
    dictpd = DictionaryProbDist(probs, normalize=True)
    if mutable:
        # Convert to a mutable distribution
        dictpd = MutableProbDist(dictpd, samples)
    return dictpd
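# A minimal usage sketch for the function above (illustrative; assumes
# NLTK's FreqDist and MLEProbDist; the demo name is hypothetical):
# freeze an MLE estimate into a plain dictionary-backed distribution.
def _example_prob_dist_to_dictionary_prob_dist():
    from nltk.probability import FreqDist, MLEProbDist
    mle = MLEProbDist(FreqDist(['a', 'b', 'a']))
    dictpd = prob_dist_to_dictionary_prob_dist(mle)
    # Probabilities are now stored directly rather than recomputed
    assert abs(dictpd.prob('a') - 2.0 / 3.0) < 1e-9
    return dictpd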
def train_unsupervised(self, unlabeled_sequences, update_outputs=True,
                       **kwargs):
    """
    Trains the HMM using the Baum-Welch algorithm to maximise the
    probability of the data sequence. This is a variant of the EM
    algorithm, and is unsupervised in that it doesn't need the state
    sequences for the symbols. The code is based on 'A Tutorial on
    Hidden Markov Models and Selected Applications in Speech
    Recognition', Lawrence Rabiner, IEEE, 1989.

    :return: the trained model
    :rtype: HiddenMarkovModelTagger
    :param unlabeled_sequences: the training data, a set of
        sequences of observations
    :type unlabeled_sequences: list

    kwargs may include the following parameters:

    :param model: a HiddenMarkovModelTagger instance used to begin
        the Baum-Welch algorithm
    :param max_iterations: the maximum number of EM iterations
    :param convergence_logprob: the maximum change in log probability to
        allow convergence
    """

    # create a uniform HMM, which will be iteratively refined, unless
    # given an existing model
    model = kwargs.get('model')
    if not model:
        priors = RandomProbDist(self._states)
        transitions = DictionaryConditionalProbDist(
            dict((state, RandomProbDist(self._states))
                 for state in self._states))
        outputs = DictionaryConditionalProbDist(
            dict((state, RandomProbDist(self._symbols))
                 for state in self._states))
        model = HiddenMarkovModelTagger(self._symbols, self._states,
                                        transitions, outputs, priors)

    self._states = model._states
    self._symbols = model._symbols

    N = len(self._states)
    M = len(self._symbols)
    symbol_numbers = dict((sym, i) for i, sym in enumerate(self._symbols))

    # update model prob dists so that they can be modified
    # model._priors = MutableProbDist(model._priors, self._states)

    model._transitions = DictionaryConditionalProbDist(
        dict((s, MutableProbDist(model._transitions[s], self._states))
             for s in self._states))

    if update_outputs:
        model._outputs = DictionaryConditionalProbDist(
            dict((s, MutableProbDist(model._outputs[s], self._symbols))
                 for s in self._states))

    model.reset_cache()

    # iterate until convergence
    converged = False
    last_logprob = None
    iteration = 0
    max_iterations = kwargs.get('max_iterations', 1000)
    epsilon = kwargs.get('convergence_logprob', 1e-6)

    while not converged and iteration < max_iterations:
        A_numer = _ninf_array((N, N))
        B_numer = _ninf_array((N, M))
        A_denom = _ninf_array(N)
        B_denom = _ninf_array(N)

        logprob = 0
        for sequence in unlabeled_sequences:
            sequence = list(sequence)
            if not sequence:
                continue

            (lpk, seq_A_numer, seq_A_denom,
             seq_B_numer, seq_B_denom) = self._baum_welch_step(
                 sequence, model, symbol_numbers)

            # add these sums to the global A and B values
            for i in range(N):
                A_numer[i] = np.logaddexp2(A_numer[i],
                                           seq_A_numer[i] - lpk)
                B_numer[i] = np.logaddexp2(B_numer[i],
                                           seq_B_numer[i] - lpk)

            A_denom = np.logaddexp2(A_denom, seq_A_denom - lpk)
            B_denom = np.logaddexp2(B_denom, seq_B_denom - lpk)

            logprob += lpk

        # use the calculated values to update the transition and output
        # probability values
        for i in range(N):
            logprob_Ai = A_numer[i] - A_denom[i]
            logprob_Bi = B_numer[i] - B_denom[i]

            # We should normalize all probabilities (see p.391 Huang et al)
            # Let sum(P) be K.
            # We can divide each Pi by K to make sum(P) == 1.
            #   Pi' = Pi/K
            #   log2(Pi') = log2(Pi) - log2(K)
            logprob_Ai -= logsumexp2(logprob_Ai)
            logprob_Bi -= logsumexp2(logprob_Bi)

            # update output and transition probabilities
            si = self._states[i]

            for j in range(N):
                sj = self._states[j]
                model._transitions[si].update(sj, logprob_Ai[j])

            if update_outputs:
                for k in range(M):
                    ok = self._symbols[k]
                    model._outputs[si].update(ok, logprob_Bi[k])

        # Rabiner says the priors don't need to be updated. I don't
        # believe him. FIXME

        # test for convergence
        if iteration > 0 and abs(logprob - last_logprob) < epsilon:
            converged = True

        print('iteration', iteration, 'logprob', logprob)
        iteration += 1
        last_logprob = logprob

    return model
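# A minimal usage sketch for train_unsupervised (illustrative; follows the
# NLTK trainer API this code is based on; the demo name is hypothetical).
# Each training sequence is a list of (symbol, tag) pairs; the tags are
# ignored during unsupervised training, so None is fine.
def _example_train_unsupervised():
    from nltk.tag.hmm import HiddenMarkovModelTrainer
    trainer = HiddenMarkovModelTrainer(states=['H', 'C'],
                                       symbols=['1', '2', '3'])
    sequences = [[(sym, None) for sym in '1232'],
                 [(sym, None) for sym in '3311']]
    # A handful of EM iterations on two toy observation sequences
    return trainer.train_unsupervised(sequences, max_iterations=5)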
def train_unsupervised(self, unlabeled_sequences, **kwargs):
    """
    Trains the HMM using the Baum-Welch algorithm to maximise the
    probability of the data sequence. This is a variant of the EM
    algorithm, and is unsupervised in that it doesn't need the state
    sequences for the symbols. The code is based on 'A Tutorial on
    Hidden Markov Models and Selected Applications in Speech
    Recognition', Lawrence Rabiner, IEEE, 1989.

    :return: the trained model
    :rtype: HiddenMarkovModelTagger
    :param unlabeled_sequences: the training data, a set of
        sequences of observations
    :type unlabeled_sequences: list

    kwargs may include the following parameters:

    :param model: a HiddenMarkovModelTagger instance used to begin
        the Baum-Welch algorithm
    :param max_iterations: the maximum number of EM iterations
    :param convergence_logprob: the maximum change in log probability to
        allow convergence
    """

    N = len(self._states)
    M = len(self._symbols)
    symbol_dict = dict((self._symbols[i], i) for i in range(M))

    # create a uniform HMM, which will be iteratively refined, unless
    # given an existing model
    model = kwargs.get('model')
    if not model:
        priors = UniformProbDist(self._states)
        transitions = DictionaryConditionalProbDist(
            dict((state, UniformProbDist(self._states))
                 for state in self._states))
        output = DictionaryConditionalProbDist(
            dict((state, UniformProbDist(self._symbols))
                 for state in self._states))
        model = HiddenMarkovModelTagger(self._symbols, self._states,
                                        transitions, output, priors)

    # update model prob dists so that they can be modified
    model._priors = MutableProbDist(model._priors, self._states)
    model._transitions = DictionaryConditionalProbDist(
        dict((s, MutableProbDist(model._transitions[s], self._states))
             for s in self._states))
    model._outputs = DictionaryConditionalProbDist(
        dict((s, MutableProbDist(model._outputs[s], self._symbols))
             for s in self._states))

    # iterate until convergence
    converged = False
    last_logprob = None
    iteration = 0
    max_iterations = kwargs.get('max_iterations', 1000)
    epsilon = kwargs.get('convergence_logprob', 1e-6)
    while not converged and iteration < max_iterations:
        A_numer = ones((N, N), float64) * _NINF
        B_numer = ones((N, M), float64) * _NINF
        A_denom = ones(N, float64) * _NINF
        B_denom = ones(N, float64) * _NINF

        logprob = 0
        for sequence in unlabeled_sequences:
            sequence = list(sequence)
            if not sequence:
                continue

            # compute forward and backward probabilities
            alpha = model._forward_probability(sequence)
            beta = model._backward_probability(sequence)

            # find the log probability of the sequence
            T = len(sequence)
            lpk = _log_add(*alpha[T-1, :])
            logprob += lpk

            # now update A and B (transition and output probabilities)
            # using the alpha and beta values. Please refer to Rabiner's
            # paper for details, it's too hard to explain in comments
            local_A_numer = ones((N, N), float64) * _NINF
            local_B_numer = ones((N, M), float64) * _NINF
            local_A_denom = ones(N, float64) * _NINF
            local_B_denom = ones(N, float64) * _NINF

            # for each position, accumulate sums for A and B
            for t in range(T):
                x = sequence[t][_TEXT]  # not found? FIXME
                if t < T - 1:
                    xnext = sequence[t+1][_TEXT]  # not found? FIXME
                xi = symbol_dict[x]
                for i in range(N):
                    si = self._states[i]
                    if t < T - 1:
                        for j in range(N):
                            sj = self._states[j]
                            local_A_numer[i, j] = \
                                _log_add(local_A_numer[i, j],
                                         alpha[t, i] +
                                         model._transitions[si].logprob(sj) +
                                         model._outputs[sj].logprob(xnext) +
                                         beta[t+1, j])
                        local_A_denom[i] = _log_add(local_A_denom[i],
                                                    alpha[t, i] + beta[t, i])
                    else:
                        # At the last position, folding the final
                        # alpha*beta term into the running A denominator
                        # (which covers t < T-1) gives the sum over all
                        # positions, which is B's denominator
                        local_B_denom[i] = _log_add(local_A_denom[i],
                                                    alpha[t, i] + beta[t, i])
                    local_B_numer[i, xi] = _log_add(local_B_numer[i, xi],
                                                    alpha[t, i] + beta[t, i])

            # add these sums to the global A and B values
            for i in range(N):
                for j in range(N):
                    A_numer[i, j] = _log_add(A_numer[i, j],
                                             local_A_numer[i, j] - lpk)
                for k in range(M):
                    B_numer[i, k] = _log_add(B_numer[i, k],
                                             local_B_numer[i, k] - lpk)
                A_denom[i] = _log_add(A_denom[i], local_A_denom[i] - lpk)
                B_denom[i] = _log_add(B_denom[i], local_B_denom[i] - lpk)

        # use the calculated values to update the transition and output
        # probability values
        for i in range(N):
            si = self._states[i]
            for j in range(N):
                sj = self._states[j]
                model._transitions[si].update(sj,
                                              A_numer[i, j] - A_denom[i])
            for k in range(M):
                ok = self._symbols[k]
                model._outputs[si].update(ok, B_numer[i, k] - B_denom[i])
        # Rabiner says the priors don't need to be updated. I don't
        # believe him. FIXME

        # test for convergence
        if iteration > 0 and abs(logprob - last_logprob) < epsilon:
            converged = True

        print('iteration', iteration, 'logprob', logprob)
        iteration += 1
        last_logprob = logprob

    return model
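# Worked illustration of the log-space accumulation used above (a sketch;
# _log_add is assumed to behave like numpy's base-2 logaddexp2, and the
# demo name is hypothetical). Summing probabilities p1 + p2 without
# underflow is done on their logs:
def _example_log_add():
    import numpy as np
    log_p1 = np.log2(0.5)
    log_p2 = np.log2(0.25)
    total = np.logaddexp2(log_p1, log_p2)  # log2(0.5 + 0.25)
    assert abs(2.0 ** total - 0.75) < 1e-12
    return total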
def train(self, emissions, max_iterations=None,
          convergence_logprob=None, logger=None, processes=1,
          save=True, save_intermediate=False):
    """
    Performs unsupervised training using Baum-Welch EM.

    This is an instance method, because it is performed on a model
    that has already been initialized. You might, for example, create
    such a model using C{initialize_chord_types}.

    This is based on the training procedure in NLTK for HMMs:
    C{nltk.tag.hmm.HiddenMarkovModelTrainer.train_unsupervised}.

    @type emissions: list of lists of emissions
    @param emissions: training data. Each element is a list of
        emissions representing a sequence in the training data.
        Each emission is an emission like those used for
        L{jazzparser.misc.raphsto.RaphstoHmm.emission_log_probability},
        i.e. a list of note observations

    @type max_iterations: int
    @param max_iterations: maximum number of iterations to allow for
        EM (default 100). Overrides the corresponding module option

    @type convergence_logprob: float
    @param convergence_logprob: maximum change in log probability to
        consider convergence to have been reached (default 1e-3).
        Overrides the corresponding module option

    @type logger: logging.Logger
    @param logger: a logger to send progress logging to

    @type processes: int
    @param processes: number of processes to spawn. A pool of this
        many processes will be used to compute distribution updates
        for sequences in parallel during each iteration.

    @type save: bool
    @param save: save the model at the end of training

    @type save_intermediate: bool
    @param save_intermediate: save the model after each iteration.
        Implies C{save}

    """
    from . import raphsto_d
    if logger is None:
        from jazzparser.utils.loggers import create_dummy_logger
        logger = create_dummy_logger()

    if save_intermediate:
        save = True

    # No point in creating more processes than there are sequences
    if processes > len(emissions):
        processes = len(emissions)

    self.model.add_history("Beginning Baum-Welch unigram training on %s"
                           % get_host_info_string())
    self.model.add_history("Training on %d sequences (with %s chords)" %
                           (len(emissions),
                            ", ".join("%d" % len(seq) for seq in emissions)))

    # Use kwargs if given, otherwise module options
    if max_iterations is None:
        max_iterations = self.options['max_iterations']
    if convergence_logprob is None:
        convergence_logprob = self.options['convergence_logprob']

    # Enumerate the states
    state_ids = dict((state, num) for (num, state) in
                     enumerate(self.model.label_dom))
    # Enumerate the beat values (they're probably consecutive ints, but
    # let's not rely on it)
    beat_ids = dict((beat, num) for (num, beat) in
                    enumerate(self.model.beat_dom))
    num_beats = len(beat_ids)
    # Enumerate the d-values (d-function's domain)
    d_ids = dict((d, num) for (num, d) in
                 enumerate(self.model.emission_dist_dom))
    num_ds = len(d_ids)

    # Make a mutable distribution for the emission distribution we'll
    # be updating
    emission_mdist = DictionaryConditionalProbDist(
        dict((s, MutableProbDist(self.model.emission_dist[s],
                                 self.model.emission_dist_dom))
             for s in self.model.emission_dist.conditions()))
    # Create dummy distributions to fill the places of the transition
    # distribution components
    key_mdist = DictionaryConditionalProbDist({})
    chord_mdist = DictionaryConditionalProbDist({})
    chord_uni_mdist = MutableProbDist({}, [])

    # Construct a model using these mutable distributions so we can
    # evaluate using them
    model = self.model_cls(key_mdist, chord_mdist, emission_mdist,
                           chord_uni_mdist,
                           chord_set=self.model.chord_set)

    iteration = 0
    last_logprob = None
    while iteration < max_iterations:
        logger.info("Beginning iteration %d" % iteration)
        current_logprob = 0.0

        # ems contains the new emission numerator probabilities
        # ems[r][d] = Sum_{d(y_n^k, x_n)=d, r_n^k=r}
        #                 alpha(x_n).beta(x_n) /
        #             Sum_{x'_n} (alpha(x'_n).beta(x'_n))
        ems = zeros((num_beats, num_ds), float64)
        # And these are the denominators
        ems_denom = zeros(num_beats, float64)

        def _training_callback(result):
            """
            Callback for the _sequence_updates processes that takes
            the updates from a single sequence and adds them onto the
            global update accumulators.

            """
            # _sequence_updates() returns all of this as a tuple
            (ems_local, ems_denom_local, seq_logprob) = result
            # Add these probabilities from this sequence to the
            # global matrices
            # Emission numerator
            array_add(ems, ems_local, ems)
            # Denominators
            array_add(ems_denom, ems_denom_local, ems_denom)
        ## End of _training_callback

        # Only use a process pool if more than one process was requested
        if processes > 1:
            # Create a process pool to use for training
            logger.info("Creating a pool of %d processes" % processes)
            pool = Pool(processes=processes)

            async_results = []
            for seq_i, sequence in enumerate(emissions):
                logger.info("Iteration %d, sequence %d"
                            % (iteration, seq_i))
                T = len(sequence)
                if T == 0:
                    continue
                # Fire off a new call to the process pool for every
                # sequence
                async_results.append(
                    pool.apply_async(_sequence_updates_uni,
                                     (sequence, model,
                                      self.model.label_dom,
                                      state_ids, beat_ids, d_ids,
                                      raphsto_d),
                                     callback=_training_callback))
            pool.close()
            # Wait for all the workers to complete
            pool.join()

            # Call get() on every AsyncResult so that any exceptions in
            # workers get raised
            for res in async_results:
                # If there was an exception in _sequence_update, it
                # will get raised here
                res_tuple = res.get()
                # Add this sequence's logprob into the total for all
                # sequences
                current_logprob += res_tuple[2]
        else:
            logger.info("Not using a process pool: processing sequences "
                        "sequentially")
            # Process every sequence in turn in this process
            for sequence in emissions:
                if len(sequence) > 0:
                    updates = _sequence_updates_uni(
                        sequence, model, self.model.label_dom,
                        state_ids, beat_ids, d_ids, raphsto_d)
                    _training_callback(updates)
                    # Update the overall logprob
                    current_logprob += updates[2]

        # Update the model's probabilities from the accumulated values
        for beat in self.model.beat_dom:
            denom = ems_denom[beat_ids[beat]]
            for d in self.model.emission_dist_dom:
                if denom == 0.0:
                    # Zero denominator: fall back on a uniform probability
                    prob = - logprob(len(d_ids))
                else:
                    # Add a small amount to the numerator and denominator
                    # to smooth zero counts
                    prob = logprob(ems[beat_ids[beat]][d_ids[d]]
                                   + ADD_SMALL) - \
                           logprob(denom + len(d_ids) * ADD_SMALL)
                model.emission_dist[beat].update(d, prob)

        # Clear the model's cache so we get the new probabilities
        model.clear_cache()

        logger.info("Training data log prob: %s" % current_logprob)
        if last_logprob is not None and current_logprob < last_logprob:
            logger.error("Log probability dropped by %s" %
                         (last_logprob - current_logprob))
        if last_logprob is not None:
            logger.info("Log prob change: %s" %
                        (current_logprob - last_logprob))
        # Check whether the log probability has converged
        if iteration > 0 and \
                abs(current_logprob - last_logprob) < convergence_logprob:
            # Don't iterate any more
            logger.info("Distribution has converged: ceasing training")
            break

        iteration += 1
        last_logprob = current_logprob
        # Update the main model
        # Only save if we've been asked to save between iterations
        self.update_model(model, save=save_intermediate)

    self.model.add_history("Completed Baum-Welch unigram training")
    # Update the distribution's parameters with those we've trained
    self.update_model(model, save=save)
    return