import abc
import math
import random
from copy import deepcopy

from nltk import ConditionalFreqDist
from tqdm import tqdm

from athnlp.readers.brown_pos_corpus import BrownPosTag

# Number of epochs without an improvement in training error before training stops early
MAX_EPOCHS_WITHOUT_IMPROVEMENTS = 5


class POSTagger:
    def __init__(self, corpus):
        """
        Constructor

        :param corpus: the corpus to train and evaluate on
        :type corpus: BrownPosTag
        """
        self.name = "POS Tagger"
        self.corpus = corpus
        self.pos_tags = corpus.dictionary.y_dict.names
        self.vocab_size = len(corpus.dictionary.x_dict)

    @abc.abstractmethod
    def _get_word_feature(self, word, sent):
        """
        Encode the given word (using the sentence context if applicable)

        :param word: word to encode
        :type word: str
        :param sent: the sentence the word occurs in
        :type sent: Sequence
        :return: encoded word feature(s)
        :rtype: Any
        """
        pass

    @abc.abstractmethod
    def _get_prediction(self, feature):
        """
        Predict a POS tag for the given word feature

        :param feature: encoded word to predict a tag for
        :return: tag prediction
        :rtype: str
        """
        pass

    @abc.abstractmethod
    def train(self):
        """
        Train the tagger model on the given corpus
        """
        pass

    def evaluate(self):
        """
        Evaluate the tagger on the train, dev and test splits

        :return: list of (dataset name, accuracy) tuples
        :rtype: list[(str, float)]
        """
accuracies = []
# Evaluate all data splits
        for (split_name, split_data) in [("Train", self.corpus.train),
                                         ("Dev", self.corpus.dev),
                                         ("Test", self.corpus.test)]:
            # Loop over every word of every sentence and count the share of correct predictions
total = 0
correct = 0
for sent in tqdm(split_data, desc=f"Evaluating {split_name}"):
for (word, gold_tag) in sent.get_tag_word_tuples():
total += 1
correct += self._get_prediction(self._get_word_feature(word, sent)) == gold_tag
accuracies.append((split_name, correct / total * 100))
accuracies_string = ' '.join(f"{name}: {acc:3.2f}%" for name, acc in accuracies)
print(f"Accuracies:\n{accuracies_string}")
return accuracies
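
    # Example of the printed summary (illustrative numbers only, not real results):
    #   Accuracies:
    #   Train: 94.12% Dev: 89.30% Test: 88.75%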


class MajorityClassPOSTagger(POSTagger):
    def __init__(self, corpus):
        super().__init__(corpus)
        self.name = "Majority Class POS Tagger"
        self.word_pos_cfd = None

    def train(self):
        """
        Trains a simple baseline that assigns every word in the vocabulary its majority-class tag,
        regardless of context
        """
        self.word_pos_cfd = ConditionalFreqDist(tp for seq_list in self.corpus.train
                                                for tp in seq_list.get_tag_word_tuples())

    def _get_word_feature(self, word, sent):
return word

    def _get_prediction(self, word):
        if word in self.word_pos_cfd:
            return self.word_pos_cfd[word].most_common(1)[0][0]
        # Unknown words get an empty tag, which always counts as a mistake
        return ""


class UnigramPerceptronPOSTagger(POSTagger):
    def __init__(self, corpus):
        """
        Constructor

        :param corpus: the corpus to train and evaluate on
        :type corpus: BrownPosTag
        """
        super().__init__(corpus)
        self.name = "Unigram Perceptron POS Tagger"
        # Sparse weights: one {feature: weight} dict per POS tag
        self.weights_per_label = {}
        self.weights_per_label_best = {}

    def train(self):
        """
        Train the POS tagger with perceptron updates
        """
        # Initialise an empty sparse weight vector ({feature: weight}) for every tag
        for tag in self.corpus.dictionary.y_dict.names:
            self.weights_per_label[tag] = {}
        print("Training perceptron")
        mistakes_best = math.inf
        epochs_without_improvements = 0
        epoch = 0
        # Train until the error has not improved for MAX_EPOCHS_WITHOUT_IMPROVEMENTS epochs
        while epochs_without_improvements < MAX_EPOCHS_WITHOUT_IMPROVEMENTS:
            # Shuffle the training order every epoch so the updates are not repeated identically
            random.shuffle(self.corpus.train)
epoch += 1
mistakes = 0
count_examples = 0
# Loop over all examples and learn from them
for sent in tqdm(self.corpus.train, desc=f"Epoch {epoch}"):
for (word, tag) in sent.get_tag_word_tuples():
count_examples += 1
feature = self._get_word_feature(word, sent)
# Get prediction
pred_tag = self._get_prediction(feature)
                    # On a misprediction, update the weights of both affected labels:
                    # promote the gold tag's features and demote the predicted tag's features
                    if pred_tag != tag:
                        for f in feature:
                            self.weights_per_label[tag][f] = self.weights_per_label[tag].get(f, 0) + 1
                            self.weights_per_label[pred_tag][f] = self.weights_per_label[pred_tag].get(f, 0) - 1
                        mistakes += 1
            # Track progress per epoch; training stops once the error stops improving
            print(f"Error: {mistakes / count_examples * 100:3.2f}%")
if mistakes < mistakes_best:
self.weights_per_label_best = deepcopy(self.weights_per_label)
epochs_without_improvements = 0
mistakes_best = mistakes
else:
epochs_without_improvements += 1
        # Keep the weights of the best epoch rather than the last one
        self.weights_per_label = self.weights_per_label_best
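
    # Worked example of the update rule (hypothetical weights): if the gold tag of "run"
    # is "VERB" but "NOUN" currently scores highest, the unigram feature "run" is updated as
    #   weights_per_label["VERB"]["run"] += 1   # promote the gold label
    #   weights_per_label["NOUN"]["run"] -= 1   # demote the wrongly predicted label
    # which pulls future predictions for "run" towards "VERB".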

    def _get_word_feature(self, word, sent):
        # Unigram feature: just the word itself
        return [word]

    def _get_prediction(self, feature):
        # Return the label whose weight vector gives the feature the highest score
        best_label = ""
        best_score = -math.inf
for (label, weights) in self.weights_per_label.items():
score = sum(weights.get(f, 0) for f in feature)
if score > best_score:
best_label = label
best_score = score
return best_label
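
    # Scoring sketch (hypothetical weights): with feature = ["run"] and
    #   weights_per_label = {"NOUN": {"run": 2}, "VERB": {"run": 5}}
    # the scores are NOUN: 2 and VERB: 5, so _get_prediction returns "VERB".
    # Features without a stored weight contribute 0 via weights.get(f, 0).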


class BOWPerceptronPOSTagger(UnigramPerceptronPOSTagger):
def __init__(self, corpus):
super().__init__(corpus)
self.name = "Bag of words perceptron POS tagger"

    def _get_word_feature(self, word, sent):
        # Add every word of the sentence to the feature list, marked as context
        feature = [f"context_{w}" for w in sent.x]
        # Repeat the target word once per context feature so it outweighs any single context word
        feature.extend([word] * len(feature))
return feature
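
    # Feature sketch (assuming sent.x holds the sentence's words): for "the dog barks"
    # and target word "dog", the feature list is
    #   ["context_the", "context_dog", "context_barks", "dog", "dog", "dog"]
    # i.e. the target word is repeated once per context word.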


if __name__ == "__main__":
print("Loading corpus")
corpus = BrownPosTag()
print("Testing models")
for tagger_type in [MajorityClassPOSTagger, UnigramPerceptronPOSTagger, BOWPerceptronPOSTagger]:
tagger = tagger_type(corpus)
print(f"\n\n==={tagger.name}===\n")
tagger.train()
tagger.evaluate()