def train_transitions(labelled_sequences, additional_transitions, estimator=None):
    """Train an HMM tagger from labelled sequences plus extra transition counts.

    Counts starting states, state-to-state transitions and per-state output
    symbols over ``labelled_sequences``, adds ``additional_transitions`` to
    the observed transition counts, and turns the counts into probability
    distributions with ``estimator``.

    :param labelled_sequences: iterable of sequences of tokens; for each
        token, ``token[0]`` is read as the state and ``token[1]`` as the
        symbol.
    :param additional_transitions: transition counts added to the observed
        ones with ``+``; presumably a ``ConditionalFreqDist`` keyed by source
        state -- confirm against callers.
    :param estimator: callable ``(freq_dist, bins)`` returning a probability
        distribution. When ``None``, ``MLEProbDist(freq_dist)`` is used and
        ``bins`` is ignored.
    :return: ``hmm.HiddenMarkovModelTagger`` built from the observed states,
        the observed symbols, the transition distribution, the output
        distribution and the starting-state distribution.
    """
    # default to the MLE estimate
    if estimator is None:

        def _mle_estimator(fdist, bins):
            return MLEProbDist(fdist)

        estimator = _mle_estimator

    # Insertion-ordered dicts act as ordered sets: O(1) membership while
    # keeping the first-seen order (and first-seen object) that a list with
    # ``not in`` checks would have produced.
    seen_states = {}
    seen_symbols = {}

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[0]
            symbol = token[1]
            if lasts is None:
                starting[state] += 1
            else:
                transitions[lasts][state] += 1
            outputs[state][symbol] += 1
            lasts = state

            # update the state and symbol collections
            if state not in seen_states:
                seen_states[state] = None
            if symbol not in seen_symbols:
                seen_symbols[symbol] = None

    known_states = list(seen_states)
    known_symbols = list(seen_symbols)

    # create probability distributions (with smoothing)
    # NOTE(review): N only counts states seen in labelled_sequences; states
    # that occur solely in additional_transitions are not included in N or
    # in known_states -- confirm this is intended.
    N = len(known_states)
    pi = estimator(starting, N)
    # Use the ``+`` operator rather than calling ``__add__`` directly so an
    # unsupported operand raises TypeError instead of silently passing
    # NotImplemented on to ConditionalProbDist.
    A = ConditionalProbDist(transitions + additional_transitions, estimator, N)
    B = ConditionalProbDist(outputs, estimator, len(known_symbols))

    return hmm.HiddenMarkovModelTagger(known_states, known_symbols, A, B, pi)