-
Notifications
You must be signed in to change notification settings - Fork 0
/
NLTK_Book_Ch6.py
364 lines (268 loc) · 13.9 KB
/
NLTK_Book_Ch6.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
# -*- coding: utf-8 -*-
"""
Created on Sun Apr 16 18:41:17 2017
@author: Eric Nelson M07296609
"""
#2 Using any of the three classifiers described in this chapter, and any features you can think of, build the best
#name gender classifier you can. Begin by splitting the Names Corpus into three subsets: 500 words for the test set,
#500 words for the dev-test set, and the remaining 6900 words for the training set. Then, starting with the example
#name gender classifier, make incremental improvements. Use the dev-test set to check your progress. Once you are
#satisfied with your classifier, check its final performance on the test set. How does the performance on the test set
#compare to the performance on the dev-test set? Is this what you'd expect?
import nltk
from nltk.corpus import names
import random

# Exercise 2 setup: pair every name in the Names Corpus with the gender of the
# file it was read from (male file first), then shuffle so that the slices
# taken later are random samples.
labeled_names = [
    (name, gender)
    for filename, gender in (('male.txt', 'male'), ('female.txt', 'female'))
    for name in names.words(filename)
]
random.shuffle(labeled_names)
def gender_features2(name):
    """Build the feature dictionary used by the name-gender classifier.

    Parameters:
        name: the name to describe, as a string.

    Returns:
        A dict with the lower-cased first letter, first two letters, last
        letter and last three letters, the length of the name, and for each
        of the letters in 'aeiouy' a count and a presence flag.

    Slices are used instead of name[0] / name[-1] so that an empty string
    yields empty-string features rather than raising IndexError.
    """
    lowered = name.lower()  # computed once; reused for every vowel below
    features = {
        "first_letter": name[:1].lower(),
        "first_letters": name[:2].lower(),
        "length": len(name),
        "last_letter": name[-1:].lower(),
        "last_letters": name[-3:].lower(),
    }
    for letter in 'aeiouy':
        features["count({})".format(letter)] = lowered.count(letter)
        features["has({})".format(letter)] = letter in lowered
    return features
# Convert every labelled name into a (feature dict, label) pair.
featuresets = [(gender_features2(name), label) for name, label in labeled_names]

# First 500 -> dev-test, next 500 -> test, everything else -> training.
dev_test_set = featuresets[:500]
test_set = featuresets[500:1000]
train_set = featuresets[1000:]

classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features(50)

print(nltk.classify.accuracy(classifier, dev_test_set))
#after much comparison between train and dev_test and reworking the model, the dev_test results are 0.824
print(nltk.classify.accuracy(classifier, test_set))
#the results from test_set which was never used when building the model is 0.812
#it would make sense that our test_set performance would be slightly worse than dev_test_set
#We built the model on train and test on dev_test_set and reworked the model until we had good
#results on both train and dev_test_set. However, that means we probably still overfit a bit to those two
#datasets. We would expect to perform slightly worse on the test_set and that is what happened.
#3 The Senseval 2 Corpus contains data intended to train word-sense disambiguation classifiers. It contains data for
#four words: hard, interest, line, and serve. Choose one of these four words, and load the corresponding data:
#Using this dataset, build a classifier that predicts the correct sense tag for a given instance. See the corpus HOWTO
#at http://nltk.org/howto for information on using the instance objects returned by the Senseval 2 Corpus.
import nltk
from nltk.corpus import senseval
import random

#I chose to use the word 'serve'
instances = senseval.instances('serve.pos')
# A tenth of the instance count; used further down as the test-set size.
size = int(0.1 * len(instances))

# Walk the first five instances and assemble the two tokens to the left of the
# target, the target itself, the two tokens to the right, and the sense tags.
# The strings are only assigned, never printed.
for inst in instances[:5]:
    p = inst.position
    left = ' '.join([w for (w, t) in inst.context[p-2:p]])
    word = ' '.join([w for (w, t) in inst.context[p:p+1]])
    right = ' '.join([w for (w, t) in inst.context[p+1:p+3]])
    senses = ' '.join(inst.senses)
def features(instance):
    """Extract neighbouring-token features for one word-sense instance.

    Parameters:
        instance: an object with a ``position`` attribute (index of the
            target word) and a ``context`` attribute (a sequence whose items
            are indexed as item[0] = word, item[1] = tag).

    Returns:
        A dict with keys 'wp'/'tp' (previous word and tag) and 'wf'/'tf'
        (following word and tag). When the target is the first token the
        previous-token features are the sentinel (0, 'BOS'); when it is the
        last token the following-token features are the sentinel (-1, 'EOS')
        instead of raising IndexError.
    """
    context = instance.context
    p = instance.position
    feat = {}

    ## previous word and tag
    if p:  ## > 0
        feat['wp'] = context[p-1][0]
        feat['tp'] = context[p-1][1]
    else:
        ## use BOS if it is the first word (p is 0 here)
        feat['wp'] = (p, 'BOS')
        feat['tp'] = (p, 'BOS')

    ## following word and tag
    if p + 1 < len(context):
        feat['wf'] = context[p+1][0]
        feat['tf'] = context[p+1][1]
    else:
        ## target is the last token: use a position-independent EOS marker
        feat['wf'] = (-1, 'EOS')
        feat['tf'] = (-1, 'EOS')
    return feat
# Keep only instances carrying exactly one sense tag and pair each one's
# features with that tag.
featureset = [
    (features(inst), inst.senses[0])
    for inst in instances
    if len(inst.senses) == 1
]

### shuffle them randomly
random.shuffle(featureset)

# The first `size` items are held out for testing; the rest are for training.
test = featureset[:size]
train = featureset[size:]

classifier = nltk.NaiveBayesClassifier.train(train)
print(nltk.classify.accuracy(classifier, train))
#0.710
print(nltk.classify.accuracy(classifier, test))
#0.661
#4 Using the movie review document classifier discussed in this chapter, generate a list of the 30 features that the
#classifier finds to be most informative. Can you explain why these particular features are informative? Do you find
#any of them surprising?
import random
from nltk.corpus import movie_reviews

# One (token list, category) pair per review file, shuffled so the train/test
# slices taken later are random.
documents = [
    (list(movie_reviews.words(fileid)), category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
random.shuffle(documents)

# Frequency of every lower-cased token in the corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Ask for the 2000 most frequent words explicitly instead of slicing
# list(all_words), so the selection does not depend on the order in which a
# FreqDist happens to iterate.
word_features = [word for word, _count in all_words.most_common(2000)]
def document_features(document):
    """Return boolean 'contains(word)' features for one document.

    `document` is an iterable of tokens. For every entry of the module-level
    `word_features` list the result records whether that word occurs in the
    document.
    """
    present = set(document)  # set membership keeps each lookup cheap
    return {
        'contains({})'.format(word): (word in present)
        for word in word_features
    }
# Featurise every review, hold out the first 100 for testing, train on the
# rest and list the 30 most informative features.
featuresets = [(document_features(tokens), label) for tokens, label in documents]
test_set = featuresets[:100]
train_set = featuresets[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features(30)
#of the top 30 feature results from the classifier, some words are easy to identify why they are informative
#from a negative review standpoint, it makes sense that terms like amateurish, nagging, and fluke would be
#strong indicators of a negative review. On the other side, terms like palpable, layered, and indelible make
#sense for reviews that are positive about a movie.
#Some surprising results include that unfairly (positive), weaknesses (positive), and dread (positive) are
#categorized as they are. It might help if we knew the context of these words or at least the words that came
#before and after each of them. For example, if the word weaknesses is preceded by the word 'few' then a negative
#word takes on a positive connotation.
#5 Select one of the classification tasks described in this chapter, such as name gender detection, document
#classification, part-of-speech tagging, or dialog act classification. Using the same training and test data, and the
#same feature extractor, build three classifiers for the task: a decision tree, a naive Bayes classifier, and a Maximum
#Entropy classifier. Compare the performance of the three classifiers on your selected task. How do you think that your
#results might be different if you used a different feature extractor?
# Exercise 5 setup: rebuild the labelled name list (male file first, then
# female) and reshuffle it.
labeled_names = [
    (name, gender)
    for filename, gender in (('male.txt', 'male'), ('female.txt', 'female'))
    for name in names.words(filename)
]
random.shuffle(labeled_names)
def gender_features2(name):
    """Return the letter-based features of `name` for gender classification.

    The dict holds the lower-cased first letter, first two letters, last
    letter and last three letters, the name's length, and a count plus a
    presence flag for each letter of 'aeiouy'.

    An empty `name` produces empty-string letter features (slicing is used
    rather than indexing, which would raise IndexError).
    """
    lowered = name.lower()  # lower-case once instead of once per vowel
    features = {}
    features["first_letter"] = name[:1].lower()
    features["first_letters"] = name[:2].lower()
    features["length"] = len(name)
    features["last_letter"] = name[-1:].lower()
    features["last_letters"] = name[-3:].lower()
    for letter in 'aeiouy':
        features["count({})".format(letter)] = lowered.count(letter)
        features["has({})".format(letter)] = (letter in lowered)
    return features
# Same features and the same split for all three classifiers: the first 500
# names are the test set, the remainder the training set.
featuresets = [(gender_features2(name), label) for name, label in labeled_names]
test_set = featuresets[:500]
train_set = featuresets[500:]

# Naive Bayes
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, train_set))
#0.84
print(nltk.classify.accuracy(classifier, test_set))
#0.82

# Decision tree
classifier2 = nltk.DecisionTreeClassifier.train(train_set)
print(nltk.classify.accuracy(classifier2, train_set))
#.96
print(nltk.classify.accuracy(classifier2, test_set))
#.744

# Maximum entropy
classifier3 = nltk.MaxentClassifier.train(train_set)
print(nltk.classify.accuracy(classifier3, train_set))
#0.88
print(nltk.classify.accuracy(classifier3, test_set))
#0.812
# In this scenario, the Naive Bayes Classifier reigned supreme. This was while using the first letter, first two
#letters, length, last letter, last three letters, and count of vowels to classify a given name's gender. The sheer
#number of possible features causes the decision tree to strongly overfit on the training data and perform much
#worse on the test data. If we limited the number of features, thereby limiting the branches in the tree, it would
#likely have less of an overfitting issue. I would have expected the Naive Bayes to perform worse out of sample given
#that the model assumes the features are independent and in this case there are many features that are highly dependent
#such as the first letter and first two letters of a name and the last letter and last three letters of a name.
#There is a chance the naive bayes would perform better if we kept the features more independent.
#In this case, even though the entropy classifier is similar to the naive bayes and runs iteratively, it had more
#significant overfitting issues.
#6 The synonyms strong and powerful pattern differently (try combining them with chip and sales). What features are
#relevant in this distinction? Build a classifier that predicts when each word should be used.
from nltk.util import ngrams
from nltk.corpus import brown
import random

words = brown.words()
bigs = list(nltk.bigrams(words))  # not used below; kept as a module-level name
trigrams = list(ngrams(words, 3))

# Collect the (previous word, next word) pair around every occurrence of each
# target word. The comparison must be an equality test: ('powerful') is just a
# parenthesised string, so `b in ('powerful')` was a substring check that also
# selected tokens such as 'we' or 'on'. Lower-casing lets sentence-initial
# "Strong"/"Powerful" count as well.
power_gram = [(a, c) for (a, b, c) in trigrams if b.lower() == 'powerful']
strong_gram = [(a, c) for (a, b, c) in trigrams if b.lower() == 'strong']
labels = ([(a, b, 'powerful') for (a, b) in power_gram]
          + [(a, b, 'strong') for (a, b) in strong_gram])
random.shuffle(labels)


def features(before, after):
    """Return the context features for one occurrence of a target word.

    Parameters:
        before: the token immediately preceding the target word.
        after: the token immediately following the target word.

    Returns:
        A dict with the keys "pre" and "post" holding those two tokens.
    """
    return {"pre": before, "post": after}


sets = [(features(a, b), word) for (a, b, word) in labels]

# Split proportionally (10% dev-test, 10% test, 80% train). Fixed 500-item
# held-out slices can exceed the number of genuine strong/powerful contexts
# and leave nothing to train on.
holdout = max(1, len(sets) // 10)
dev_test = sets[:holdout]
test = sets[holdout:2 * holdout]
train = sets[2 * holdout:]

classifier = nltk.NaiveBayesClassifier.train(train)
classifier.show_most_informative_features(50)
print(nltk.classify.accuracy(classifier, train))
print(nltk.classify.accuracy(classifier, dev_test))
print(nltk.classify.accuracy(classifier, test))
# NOTE: the results previously recorded here were measured with the substring
# filter described above, so they describe a different data set and need to be
# regenerated:
#   train 0.984, dev-test 0.942, test 0.916 accuracy;
#   top indicators: preceding the word 'the' (365x strong), preceding the word
#   'have' (260x powerful), following the word 'which' (86x powerful), and
#   following the word 'if' (71x powerful).
# The approach itself is unchanged: the words directly before and after
# 'powerful' / 'strong' are the features used to predict which of the two
# should appear in a given context.
import nltk
from nltk.corpus import names
import random

# Label each name with the gender of its source file (male file first) and
# shuffle the combined list.
labeled_names = [
    (name, gender)
    for filename, gender in (('male.txt', 'male'), ('female.txt', 'female'))
    for name in names.words(filename)
]
random.shuffle(labeled_names)
def gender_features2(name):
    """Describe `name` with letter-based features for gender classification.

    Returns a dict containing the lower-cased first letter, first two
    letters, last letter and last three letters, the length of the name, and
    for every letter in 'aeiouy' how often it occurs and whether it occurs.

    Slicing (name[:1], name[-1:]) replaces indexing so that an empty name
    gives empty-string features instead of an IndexError.
    """
    lowered = name.lower()  # shared by all vowel counts and flags
    features = {
        "first_letter": name[:1].lower(),
        "first_letters": name[:2].lower(),
        "length": len(name),
        "last_letter": name[-1:].lower(),
        "last_letters": name[-3:].lower(),
    }
    for letter in 'aeiouy':
        features["count({})".format(letter)] = lowered.count(letter)
        features["has({})".format(letter)] = letter in lowered
    return features
featuresets = [(gender_features2(n), gender) for (n, gender) in labeled_names]

# Slice the name feature sets built on the line above. `labels` holds the
# (before, after, word) triples from the strong/powerful exercise, which are
# not (featureset, label) pairs and cannot be used to train this classifier.
dev_test_set, test_set, train_set = featuresets[:500], featuresets[500:1000], featuresets[1000:]

classifier = nltk.NaiveBayesClassifier.train(train_set)
classifier.show_most_informative_features(50)
print(nltk.classify.accuracy(classifier, dev_test_set))
print(nltk.classify.accuracy(classifier, test_set))
import nltk
from nltk.corpus import brown

# All Brown tokens; the first tenth becomes `test`, the rest `train`.
words = brown.words()
size = int(0.1 * len(words))
test = words[:size]
train = words[size:]
def pos_features(sentence, i, history):
    """Build the features for tagging the word at index `i` of `sentence`.

    Parameters:
        sentence: sequence of word strings.
        i: index of the word being tagged.
        history: tags already assigned to the words before index `i`.

    Returns:
        A dict with the last one, two and three characters of the word, plus
        the previous word and its tag ("<START>" for both when i == 0).
    """
    # A stray "[1]" book callout after the colon in the def line was removed;
    # it turned the signature into a one-line function and left the body as a
    # syntax error.
    word = sentence[i]
    features = {"suffix(1)": word[-1:],
                "suffix(2)": word[-2:],
                "suffix(3)": word[-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]
    return features
class ConsecutivePosTagger(nltk.TaggerI):
    """Tagger that classifies words left to right, feeding each prediction
    into the features of the following word.

    A naive Bayes classifier is trained on pos_features() feature sets and
    stored in ``self.classifier``.
    """

    def __init__(self, train_sents):
        """Train the classifier.

        Parameters:
            train_sents: iterable of tagged sentences, each a sequence of
                (word, tag) pairs.
        """
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []  # gold tags of the words seen so far in this sentence
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append((featureset, tag))
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)

    def tag(self, sentence):
        """Tag a sentence.

        Parameters:
            sentence: sequence of word strings.

        Returns:
            A list of (word, tag) pairs in sentence order.
        """
        history = []  # tags predicted so far; used as features for later words
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        # Materialise the pairs: a bare zip() is a one-shot iterator in
        # Python 3, which callers cannot index, print or iterate twice.
        return list(zip(sentence, history))
# Train on 90% of the tagged 'news' sentences and score on the first 10%.
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))
# NOTE: two leftover scratch statements were removed from this spot because
# they could only raise:
#   * a bare `pos_features()` call - the function requires sentence, i and
#     history arguments;
#   * a second ConsecutivePosTagger built from `train` / `test`, which are
#     slices of the untagged brown.words() list, not lists of tagged sentences.
import random
import nltk
from nltk.corpus import brown

# Copy the tokens into a real list first: random.shuffle swaps items in place,
# and the object returned by brown.words() is a read-only corpus view.
words = list(brown.words())
random.shuffle(words)
size = int(len(words) * 0.1)
train, test = words[size:], words[:size]

all_words = nltk.FreqDist(w.lower() for w in words)
# Request the 2000 most frequent words explicitly rather than slicing
# list(all_words), so the choice does not depend on FreqDist iteration order.
word_features = [word for word, _count in all_words.most_common(2000)]
def document_features(document):
    """Flag which of the module-level `word_features` occur in `document`.

    `document` is an iterable of tokens; the result maps 'contains(word)' to
    True or False for every word in `word_features`.
    """
    seen = set(document)  # one pass over the document, then cheap lookups
    return dict(
        ('contains({})'.format(word), word in seen)
        for word in word_features
    )
# NOTE(review): the argument to brown.words() is normally a fileid; 'powerful'
# looks like a word rather than a Brown file name - confirm what was intended.
print(document_features(brown.words('powerful')))
# NOTE(review): `documents` is the movie-review list built for exercise 4,
# while `word_features` at this point comes from the Brown corpus - verify
# that mixing the two is deliberate.
featuresets = [(document_features(d), c) for (d,c) in documents]
# First 100 feature sets are held out for testing; the rest train the model.
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)