-
Notifications
You must be signed in to change notification settings - Fork 0
/
test.py
70 lines (54 loc) · 1.9 KB
/
test.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/env python
import nltk
import re
import random
from nltk.corpus import movie_reviews
def gender_features(word):
    """Return the last-letter feature dict for a word (e.g. a first name).

    Args:
        word: The string to featurize.

    Returns:
        A dict with the single key ``'last_letter'`` mapped to the final
        character of ``word``, or to ``''`` when ``word`` is empty.
    """
    # Slice rather than index: word[-1] raises IndexError on an empty string.
    return {'last_letter': word[-1:]}
# Frequency distribution of every lower-cased token in the movie_reviews corpus.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

# Vocabulary used by document_features: the 2000 most frequent words.
# Rank explicitly by count instead of slicing keys(): keys() is only
# frequency-ordered in NLTK 2, and on Python 3 it is a view that cannot be
# sliced at all.
word_features = sorted(all_words, key=all_words.get, reverse=True)[:2000]
def document_features(document):
    """Build the bag-of-words feature dict for one document.

    Args:
        document: An iterable of word tokens.

    Returns:
        A dict with one ``'contains(<word>)'`` key per entry of the
        module-level ``word_features``; each value is True when that word
        occurs in ``document`` and False otherwise.
    """
    # A set makes each membership test below a constant-time lookup.
    vocabulary = set(document)
    return {'contains(%s)' % term: term in vocabulary
            for term in word_features}
def main(test_size=100, num_informative=5):
    """Train and evaluate a Naive Bayes sentiment classifier on movie_reviews.

    Every review is turned into a feature dict with document_features, the
    shuffled feature sets are split into a held-out test set and a training
    set, and the classifier's accuracy and most informative features are
    printed.

    Args:
        test_size: Number of shuffled documents held out for testing.
        num_informative: How many features to list in the final report.

    NOTE: a commented-out name-gender experiment built on gender_features
    used to sit at the top of this function; it was removed as dead code.
    """
    # One (token list, category) pair per review file in the corpus.
    documents = [(list(movie_reviews.words(fileid)), category)
                 for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]
    # The list above is built category by category, so shuffle before
    # slicing off the test set.
    random.shuffle(documents)

    featuresets = [(document_features(d), c) for (d, c) in documents]
    train_set, test_set = featuresets[test_size:], featuresets[:test_size]
    # Report the split sizes only; printing train_set itself would dump one
    # full feature dict per training document.
    print('train: %d documents, test: %d documents'
          % (len(train_set), len(test_set)))
    print('')

    classifier = nltk.NaiveBayesClassifier.train(train_set)
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(nltk.classify.accuracy(classifier, test_set))
    classifier.show_most_informative_features(num_informative)
    print('done')


if __name__ == '__main__':
    main()