-
Notifications
You must be signed in to change notification settings - Fork 0
/
uebung_1.py
173 lines (122 loc) · 4.32 KB
/
uebung_1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#Sentiment Analyse und Media Monitoring
#### Heath Gordon ####
### 16 March 2016 #####
"""
Some thoughts in retrospect:
I probably should have made a 'get all words' function.
There are some nested for loops that could have been list comprehensions.
"""
################
import random, string
from nltk.corpus import movie_reviews, stopwords
from nltk.util import ngrams
import nltk
from sklearn import cross_validation
def get_stopwords():
    """
    Return the English NLTK stopwords plus every punctuation character.

    NLTK's stopword list does not contain punctuation, so each distinct
    character of string.punctuation is appended to it.
    """
    english_stopwords = stopwords.words('english')
    punctuation_marks = set(string.punctuation)
    english_stopwords.extend(punctuation_marks)
    return english_stopwords
def get_documents():
    """
    Load the NLTK movie reviews and return them in random order.

    Each review is a (list of words, category) pair.
    """
    print("Retrieving Movie Reviews\n")
    labelled = []
    for label in movie_reviews.categories():
        for review_id in movie_reviews.fileids(label):
            labelled.append((list(movie_reviews.words(review_id)), label))
    # random.shuffle() reorders its argument in place and returns None;
    # sampling the full length returns a new, shuffled list instead.
    return random.sample(labelled, len(labelled))
def get_features(documents):
    """
    Extract unigram features from the training set.

    documents: sequence of (words, label) pairs; only the words are used.
    Returns a list of the 200 most frequent words that are not stopwords
    (fewer if the documents contain fewer distinct words).
    """
    print("\tExtracting Unigram features\n")
    # A set makes the per-word stopword test O(1) instead of a list scan
    excluded = set(get_stopwords())
    word_counts = nltk.FreqDist(
        word
        for document in documents
        for word in document[0]
        if word not in excluded
    )
    # most_common() yields (word, count) pairs; keep only the words
    return [word for word, _ in word_counts.most_common(200)]
def get_more_features(documents):
    """
    Extract bigram features from the training set.

    documents: sequence of (words, label) pairs; only the words are used.
    Returns a list of the 200 most frequent bigrams (as 2-tuples) formed
    from each document's words after stopwords have been removed.

    Bigrams are counted per document, so the last word of one review is
    never paired with the first word of the next.
    """
    print ("\tExtracting Bigram features\n")
    # A set makes the per-word stopword test O(1) instead of a list scan
    excluded = set(get_stopwords())
    bigram_counts = nltk.FreqDist()
    for document in documents:
        kept_words = [word for word in document[0] if word not in excluded]
        bigram_counts.update(ngrams(kept_words, 2))
    # NOTE(review): these bigrams join words that were only adjacent after
    # stopword removal, while featuresets() matches against bigrams of the
    # raw tokens -- confirm that this mismatch is intended.
    return [bigram for bigram, _ in bigram_counts.most_common(200)]
def featuresets(features, documents):
    """
    Build a labelled feature dictionary for every document.

    features:  unigram features (single tokens) and bigram features
               (2-tuples of tokens), in any order.
    documents: sequence of reviews; review[0] holds the tokens and
               review[1] the label (e.g. positive or negative).

    Returns a list of (feature_dict, label) pairs, one per document, in
    input order. feature_dict maps each feature that occurs in the
    document to True; features that do not occur are left out.
    """
    feature_set = []
    print("\tCreating Feature Set\n")
    for review in documents:
        tokens = list(review[0])
        # Sets give O(1) membership tests; scanning the token and bigram
        # lists once per feature was quadratic in practice.
        unigrams_present = set(tokens)
        # Adjacent token pairs, i.e. the document's bigrams
        bigrams_present = set(zip(tokens, tokens[1:]))
        review_hash = {}
        for feature in features:
            # Tuples are bigram features, everything else is a unigram
            if isinstance(feature, tuple):
                present = bigrams_present
            else:
                present = unigrams_present
            if feature in present:
                review_hash[feature] = True
        feature_set.append((review_hash, review[1]))
    return feature_set
def train_test(train, test):
    """
    Train a Naive Bayes classifier and return its accuracy.

    train: labelled feature sets used to train the classifier.
    test:  labelled feature sets the trained classifier is scored on.

    Also prints the ten most informative features of the trained
    classifier. Returns the accuracy computed by nltk.classify.util.
    """
    # Parenthesised so the line is valid on Python 3 as well as Python 2
    print("\tTraining and Testing Classifier\n")
    classifier = nltk.NaiveBayesClassifier.train(train)
    # comment out the next line to hide the most informative features
    classifier.show_most_informative_features(10)
    return nltk.classify.util.accuracy(classifier, test)
def main():
    """
    Run a hand-rolled 10-fold cross-validation and print the mean accuracy.

    For every fold the unigram and bigram features are extracted from the
    training part only, both parts are converted to feature sets, and a
    classifier is trained and scored. The fold accuracies are averaged.
    """
    folds = 10
    accuracy = 0
    reviews_list = get_documents()
    # Floor division keeps the slice bounds integers on Python 3 too.
    # Reviews beyond folds * fold_size are never used for testing.
    fold_size = len(reviews_list) // folds
    for i in range(folds):
        # Single-string form prints the same text on Python 2 and 3
        print("Round  %d" % (i + 1))
        # split up the set: fold i is held out, the rest is for training
        start = i * fold_size
        stop = start + fold_size
        test_set = reviews_list[start:stop]
        train_set = reviews_list[:start] + reviews_list[stop:]
        # features come from the training data only
        unigram_feats = get_features(train_set)
        bigram_feats = get_more_features(train_set)
        all_feats = unigram_feats + bigram_feats
        train_set = featuresets(all_feats, train_set)
        test_set = featuresets(all_feats, test_set)
        accuracy += train_test(train_set, test_set)
    print("Final Accuracy:  %s" % (accuracy / folds))


if __name__ == '__main__':
    main()