Ejemplo n.º 1
0
def train_hmm_tagger(data):
    # HMM
    # Use the tag unigrams and bigrams calculated above to construct a hidden Markov tagger.
    #
    # - Add one state per tag
    #     - The emission distribution at each state should be estimated with the formula: $P(w|t) = \frac{C(t, w)}{C(t)}$
    # - Add an edge from the starting state `basic_model.start` to each tag
    #     - The transition probability should be estimated with the formula: $P(t|start) = \frac{C(start, t)}{C(start)}$
    # - Add an edge from each tag to the end state `basic_model.end`
    #     - The transition probability should be estimated with the formula: $P(end|t) = \frac{C(t, end)}{C(t)}$
    # - Add an edge between _every_ pair of tags
    #     - The transition probability should be estimated with the formula: $P(t_2|t_1) = \frac{C(t_1, t_2)}{C(t_1)}$
    basic_model = HiddenMarkovModel(name="base-hmm-tagger")

    state_dict = {}
    states = []
    emission_counts = pair_counts(*list(zip(
        *data.training_set.stream()))[::-1])
    for tag in emission_counts.keys():
        tag_count = tag_unigrams[tag]
        probs = {}
        for w in emission_counts[tag]:
            probs[w] = emission_counts[tag][w] / tag_count
        emission_p = DiscreteDistribution(probs)
        state = State(emission_p, name="" + tag)
        basic_model.add_state(state)
        state_dict[tag] = state

    for tag in tag_starts:
        basic_model.add_transition(basic_model.start, state_dict[tag],
                                   tag_starts[tag] / len(data.training_set.Y))
        basic_model.add_transition(state_dict[tag], basic_model.end,
                                   tag_ends[tag] / tag_unigrams[tag])

    for (tag1, tag2) in tag_bigrams:
        basic_model.add_transition(
            state_dict[tag1], state_dict[tag2],
            tag_bigrams[(tag1, tag2)] / tag_unigrams[tag1])

    # finalize the model
    basic_model.bake()

    assert all(
        tag in set(s.name for s in basic_model.states)
        for tag in data.training_set.tagset
    ), "Every state in your network should use the name of the associated tag, which must be one of the training set tags."
    assert basic_model.edge_count() == 168, (
        "Your network should have an edge from the start node to each state, one edge between every "
        +
        "pair of tags (states), and an edge from each state to the end node.")
    HTML(
        '<div class="alert alert-block alert-success">Your HMM network topology looks good!</div>'
    )
    return basic_model
model.add_transition(model.start, sunny_state, 0.5)
model.add_transition(model.start, rainy_state, 0.5)

# add sunny day transitions (we already know estimates of these probabilities
# from the problem statement)
model.add_transition(sunny_state, sunny_state, 0.8)  # 80% sunny->sunny
model.add_transition(sunny_state, rainy_state, 0.2)  # 20% sunny->rainy

# TODO: add rainy day transitions using the probabilities specified in the transition table
model.add_transition(rainy_state, sunny_state, 0.4)  # 40% rainy->sunny
model.add_transition(rainy_state, rainy_state, 0.6)  # 60% rainy->rainy

# finally, call the .bake() method to finalize the model
model.bake()

assert model.edge_count() == 6, "There should be two edges from model.start, two from Rainy, and two from Sunny"
assert model.node_count() == 4, "The states should include model.start, model.end, Rainy, and Sunny"
print("Great! You've finished the model.")





show_model(model, figsize=(5, 5), filename="example.png", overwrite=True, show_ends=False)


column_order = ["Example Model-start", "Sunny", "Rainy", "Example Model-end"]  # Override the Pomegranate default order
column_names = [s.name for s in model.states]
order_index = [column_names.index(c) for c in column_order]

# re-order the rows/columns to match the specified column order
Ejemplo n.º 3
0
# End - Number of senteces ending with tag over count of tag appereances
for tag in tag_starts:
    basic_model.add_transition(basic_model.start, s[tag],
                               tag_starts[tag] / len(data.training_set.Y))
    basic_model.add_transition(s[tag], basic_model.end,
                               tag_ends[tag] / tag_unigrams[tag])

for (tag1, tag2) in tag_bigrams:
    basic_model.add_transition(s[tag1], s[tag2],
                               tag_bigrams[(tag1, tag2)] / tag_unigrams[tag1])

basic_model.bake()

assert all(tag in set(s.name for s in basic_model.states) for tag in data.training_set.tagset), \
       "Every state in your network should use the name of the associated tag, which must be one of the training set tags."
assert basic_model.edge_count() == 168, \
       ("Your network should have an edge from the start node to each state, one edge between every " +
        "pair of tags (states), and an edge from each state to the end node.")

hmm_training_acc = accuracy(data.training_set.X, data.training_set.Y,
                            basic_model)
print("training accuracy basic hmm model: {:.2f}%".format(100 *
                                                          hmm_training_acc))

hmm_testing_acc = accuracy(data.testing_set.X, data.testing_set.Y, basic_model)
print("testing accuracy basic hmm model: {:.2f}%".format(100 *
                                                         hmm_testing_acc))

assert hmm_training_acc > 0.97, "Uh oh. Your HMM accuracy on the training set doesn't look right."
assert hmm_training_acc > 0.955, "Uh oh. Your HMM accuracy on the training set doesn't look right."
Ejemplo n.º 4
0
    basic_model.add_transition(tag_state,basic_model.end,end_prob[tag_state.name])
    


transition_prob_pair={}

for key in tag_bigrams.keys():
    transition_prob_pair[key]=tag_bigrams.get(key)/tags_count[key[0]]
for tag_state in to_pass_states :
    for next_tag_state in to_pass_states :
        basic_model.add_transition(tag_state,next_tag_state,transition_prob_pair[(tag_state.name,next_tag_state.name)])

basic_model.bake()

assert all(tag in set(s.name for s in basic_model.states) for tag in data.training_set.tagset),        "Every state in your network should use the name of the associated tag, which must be one of the training set tags."
assert basic_model.edge_count() == 168,        ("Your network should have an edge from the start node to each state, one edge between every " +
        "pair of tags (states), and an edge from each state to the end node.")
HTML('<div class="alert alert-block alert-success">Your HMM network topology looks good!</div>')

hmm_training_acc = accuracy(data.training_set.X, data.training_set.Y, basic_model)
print("training accuracy basic hmm model: {:.2f}%".format(100 * hmm_training_acc))

hmm_testing_acc = accuracy(data.testing_set.X, data.testing_set.Y, basic_model)
print("testing accuracy basic hmm model: {:.2f}%".format(100 * hmm_testing_acc))

assert hmm_training_acc > 0.97, "Uh oh. Your HMM accuracy on the training set doesn't look right."
assert hmm_testing_acc > 0.955, "Uh oh. Your HMM accuracy on the testing set doesn't look right."
HTML('<div class="alert alert-block alert-success">Your HMM tagger accuracy looks correct! Congratulations, you\'ve finished the project.</div>')

for key in data.testing_set.keys[:3]:
    print("Sentence Key: {}\n".format(key))
        transition_probability = tag_bigrams[bigram] / tag_unigrams[tag1]
        sum_of_probabilities += transition_probability
        basic_model.add_transition(state1, state2, transition_probability)



#==============================================================
# finalize the model
#==============================================================


# NOTE: YOU SHOULD NOT NEED TO MODIFY ANYTHING BELOW THIS LINE
basic_model.bake()
print("Number of nodes or states: ", basic_model.node_count())
print("Number of edges: ", basic_model.edge_count())

assert all(tag in set(s.name for s in basic_model.states) for tag in data.training_set.tagset), \
       "Every state in your network should use the name of the associated tag, which must be one of the training set tags."
assert basic_model.edge_count() == 168, \
       ("Your network should have an edge from the start node to each state, one edge between every " +
        "pair of tags (states), and an edge from each state to the end node.")
HTML('<div class="alert alert-block alert-success">Your HMM network topology looks good!</div>')




#==============================================================
# evaluate train / test data set metrics
#==============================================================
Ejemplo n.º 6
0
from pomegranate import HiddenMarkovModel, DiscreteDistribution, State
import numpy as np

model = HiddenMarkovModel(name='weather')

sunny_emissions = DiscreteDistribution({'yes': 0.1, 'no': 0.9})
sunny_state = State(sunny_emissions, name='sunny')

rainy_emissions = DiscreteDistribution({'yes': 0.8, 'no': 0.2})
rainy_state = State(rainy_emissions, name='rainy')

model.add_states(sunny_state, rainy_state)

model.add_transition(model.start, sunny_state, 0.5)
model.add_transition(model.start, rainy_state, 0.5)

model.add_transition(sunny_state, rainy_state, 0.2)
model.add_transition(sunny_state, sunny_state, 0.8)

model.add_transition(rainy_state, rainy_state, 0.6)
model.add_transition(rainy_state, sunny_state, 0.4)

model.bake()

states = np.array([s.name for s in model.states])
print('{} states: {}'.format(model.node_count(), states))
transitions = model.dense_transition_matrix()

print('{} transitions probabilities between states \n{}'.format(
    model.edge_count(), transitions))