コード例 #1
0
ファイル: decipher.py プロジェクト: emulhall/COMP550
def train_transitions(labelled_sequences,
                      additional_transitions,
                      estimator=None):
    """Train a supervised HMM tagger, folding in extra transition counts.

    Counts starting states, state-to-state transitions and per-state output
    symbols over ``labelled_sequences``, adds ``additional_transitions`` to
    the counted transitions, and turns the counts into probability
    distributions using ``estimator``.

    :param labelled_sequences: iterable of sequences of tokens. Each token is
        read as ``token[0]`` (the state) and ``token[1]`` (the symbol).
        States and symbols are used as frequency-distribution keys, so they
        must be hashable.
    :param additional_transitions: extra transition counts that are added to
        the transitions counted from ``labelled_sequences`` (expected to be
        a ``ConditionalFreqDist``; anything else makes the addition raise
        ``TypeError``).
    :param estimator: callable taking ``(fdist, bins)`` and returning a
        probability distribution. Defaults to the maximum-likelihood
        estimate, which ignores ``bins``.
    :return: an ``hmm.HiddenMarkovModelTagger`` built from the observed
        states and symbols (in first-seen order) and the estimated
        transition, output and prior distributions.
    """
    # default to the MLE estimate
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # States and symbols are kept in first-seen order because those lists are
    # handed to the tagger; the companion sets make the "already seen?" test
    # O(1) instead of a linear scan of the list for every token.
    known_states = []
    known_symbols = []
    seen_states = set()
    seen_symbols = set()

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        previous_state = None
        for token in sequence:
            state = token[0]
            symbol = token[1]
            if previous_state is None:
                # first token of the sequence contributes to the priors
                starting[state] += 1
            else:
                transitions[previous_state][state] += 1
            outputs[state][symbol] += 1
            previous_state = state

            # update the state and symbol lists
            if state not in seen_states:
                seen_states.add(state)
                known_states.append(state)

            if symbol not in seen_symbols:
                seen_symbols.add(symbol)
                known_symbols.append(symbol)

    # create probability distributions (with smoothing)
    # NOTE: the bin count only reflects states seen in labelled_sequences;
    # states that occur solely in additional_transitions are not counted.
    num_states = len(known_states)
    pi = estimator(starting, num_states)
    # Use the ``+`` operator rather than calling ``__add__`` unbound so that
    # an unsupported operand raises TypeError here instead of passing
    # ``NotImplemented`` on to ConditionalProbDist.
    A = ConditionalProbDist(
        transitions + additional_transitions,
        estimator, num_states)
    B = ConditionalProbDist(outputs, estimator, len(known_symbols))
    # NOTE(review): NLTK declares HiddenMarkovModelTagger(symbols, states,
    # transitions, outputs, priors); the positional order below passes the
    # state list first -- confirm this is intended for the caller's token
    # layout before changing it.
    return hmm.HiddenMarkovModelTagger(known_states, known_symbols, A, B, pi)