/
HMMTagger.py
149 lines (124 loc) · 5.03 KB
/
HMMTagger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
from nltk import FreqDist, ConditionalProbDist, ConditionalFreqDist, MLEProbDist, bigrams, ngrams
import time
import re
class HMMTagger(object):
    """Bigram Hidden Markov Model part-of-speech tagger.

    Trains MLE emission and transition distributions from a tagged corpus
    and decodes unseen sentences with the Viterbi algorithm.  Hapax
    legomena (words seen exactly once in training) are collapsed into a
    single UNK token so unknown words can still be scored at tag time.
    """

    # Sentence-boundary sentinels and the unknown-word token.  Kept as
    # module globals (the original design) because every method below
    # refers to them by bare name.
    global START_TAG
    START_TAG = "<s>"
    global END_TAG
    END_TAG = "</s>"
    global UNK
    UNK = "UNK"

    def __init__(self, training_sents, n=2, smoothing=None):
        """training_sents -- list of sentences, each a list of (word, tag)
        pairs.  n -- n-gram order (only bigrams are actually used).
        smoothing -- "LAP" enables add-one smoothing of transition
        probabilities; anything else falls back to plain MLE."""
        self.n = n
        self.smoothing = smoothing
        # Flatten the corpus into one token stream with boundary markers.
        self.tagged_sents = self.addStartAndEndMarkers(training_sents)
        self.train()

    def train(self):
        """Build the conditional frequency and probability tables."""
        # Tags are unaffected by UNK replacement, so extract them first.
        tags = [tag for (_, tag) in self.tagged_sents]
        self.replaceUnique()
        # Emission model: condition on the tag, sample the word,
        # e.g. the probability that a VB is the word 'race'.
        self.emission_frequencies = ConditionalFreqDist(
            [tup[::-1] for tup in self.tagged_sents])
        self.tagset_size = len(self.emission_frequencies.conditions())
        self.emission_probabilities = ConditionalProbDist(
            self.emission_frequencies, MLEProbDist)
        # Transition model: P(tag_i | tag_{i-1}) over the flat tag stream.
        self.transition_frequencies = ConditionalFreqDist(bigrams(tags))
        self.transition_probabilities = ConditionalProbDist(
            self.transition_frequencies, MLEProbDist)
        # word -> tag counts; used to restrict the Viterbi search to tags
        # actually observed with each word.
        self.word_tag_frequencies = ConditionalFreqDist(self.tagged_sents)

    def replaceUnique(self):
        """Replace hapax legomena (frequency-1 words) with the UNK label."""
        word_frequencies = FreqDist(word for (word, _) in self.tagged_sents)
        self.lexicon_size = len(word_frequencies)
        hapaxes = set(word_frequencies.hapaxes())
        self.tagged_sents = [(UNK, tag) if word in hapaxes else (word, tag)
                             for (word, tag) in self.tagged_sents]

    def addStartAndEndMarkers(self, training_sents):
        """Flatten the sentences into one token list, wrapping each
        sentence in (START_TAG, START_TAG) ... (END_TAG, END_TAG)."""
        res = []
        for sent in training_sents:
            res.append((START_TAG, START_TAG))
            res.extend(sent)
            res.append((END_TAG, END_TAG))
        return res

    def get_transition_probability(self, prev_tag, tag):
        """Return P(tag | prev_tag); add-one smoothed when self.smoothing
        is "LAP", plain MLE otherwise."""
        if self.smoothing == "LAP":
            prev_tag_count = self.transition_frequencies[prev_tag].N()
            # Use the raw bigram count directly; the original recovered it
            # via freq(tag) * N(), which reintroduces float round-off.
            bigram_count = self.transition_frequencies[prev_tag][tag]
            # NOTE(review): add-one smoothing of transitions conventionally
            # divides by the tag-set size; lexicon_size is kept here to
            # preserve the original model's scores -- confirm intent.
            return (bigram_count + 1) / (1.0 * prev_tag_count + self.lexicon_size)
        return self.transition_probabilities[prev_tag].prob(tag)

    def viterbi_col(self, word, prev=None):
        """Compute one Viterbi table column for `word`.

        prev -- the previous column (dict: tag -> best path probability),
        or None for the first word of a sentence.  Returns (vit, back):
        this column's path probabilities and its backpointers."""
        vit = {}
        back = {}
        # Only consider tags ever observed with this word in training.
        for tag in self.word_tag_frequencies[word].keys():
            if tag == START_TAG:
                continue
            emission = self.emission_probabilities[tag].prob(word)
            if prev:
                best_prev_tag = self.get_prev_tag(tag, prev, word)
                transition = self.get_transition_probability(best_prev_tag, tag)
                vit[tag] = prev[best_prev_tag] * transition * emission
                back[tag] = best_prev_tag
            else:
                # First column: transitions come from the start marker.
                transition = self.get_transition_probability(START_TAG, tag)
                vit[tag] = transition * emission
                back[tag] = START_TAG
        return (vit, back)

    def viterbi(self, words_to_tag):
        """Return the best tag sequence for `words_to_tag`, including the
        START_TAG/END_TAG sentinels at either end."""
        if not words_to_tag:
            # Guard: the original crashed (unbound `prev`) on empty input.
            return [START_TAG, END_TAG]
        res = []           # per-position dict: tag -> best path probability
        backpointers = []  # per-position dict: tag -> best previous tag
        for wordindex, current_word in enumerate(words_to_tag):
            if self.is_unknown(current_word):
                current_word = UNK
            if wordindex == 0:
                vit, back = self.viterbi_col(current_word)
            else:
                vit, back = self.viterbi_col(current_word, res[-1])
            res.append(vit)
            backpointers.append(back)
        backpointers.reverse()
        return self.construct_solution(backpointers, res[-1])

    def is_unknown(self, word):
        """True when no tag ever emits `word` in the trained model."""
        for tag in self.emission_probabilities.conditions():
            if self.emission_probabilities[tag].prob(word) > 0:
                return False
        return True

    def construct_solution(self, back, prev):
        """Follow the (already reversed) backpointer columns from END_TAG
        to recover the best tag sequence in forward order.

        back -- list of backpointer dicts, last column first.
        prev -- the final Viterbi column (tag -> path probability)."""
        current_best_tag = self.get_prev_tag(END_TAG, prev)
        best_seq = [END_TAG, current_best_tag]
        for column in back:
            current_best_tag = column[current_best_tag]
            best_seq.append(current_best_tag)
        best_seq.reverse()
        return best_seq

    def get_prev_tag(self, tag, prev, curr_word=None):
        """Return the tag A in `prev` maximizing
        prev[A] * P(tag | A) (* P(curr_word | tag) when curr_word given).

        Bug fix: the original used `prev.keys()[0]` as the fallback, which
        raises TypeError on Python 3 (dict views are not subscriptable)."""
        best_prev = next(iter(prev))  # fallback so we never return None
        best_prob = 0.0
        for prevtag in prev:
            prob = prev[prevtag] * self.transition_probabilities[prevtag].prob(tag)
            if curr_word:
                # Constant factor across candidates; kept for fidelity with
                # the original scoring (it cannot change the argmax unless
                # the emission probability is zero).
                prob *= self.emission_probabilities[tag].prob(curr_word)
            if prob > best_prob:
                best_prob = prob
                best_prev = prevtag
        return best_prev

    def tag_sents(self, test_sents):
        """Tag the given text sentence by sentence, stripping the
        START/END sentinel tags from each result."""
        return [self.viterbi(sent)[1:-1] for sent in test_sents]