# EmotionAnalysis.py
# This is a program for analysing the sentiment of an input sentence
# Author: Sophist Wu
# 15:40 2011-8-10
import time
# Capture the start time, formatted as the locale's time ('%X') followed by
# the locale's date ('%x'), and echo it.
before = time.strftime('%X %x')
print before
# Progress message printed before the NLTK imports that follow.
print "importing dependencies from nltk..."
import collections
from datetime import datetime
from nltk import metrics # for defining function precision_recall(.., ..)
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import movie_reviews
from nltk.corpus import reuters
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy # for later testing the accuracy of the classifier
from nltk.tokenize import word_tokenize # for word_tokenize method
from nltk.probability import FreqDist, ConditionalFreqDist
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Return the words that score at least ``min_score`` for some label.

    Args:
        labelled_words: iterable of ``(label, words)`` pairs, where ``words``
            is an iterable of the tokens observed under ``label``.
        score_fn: association measure, called as
            ``score_fn(n_ii, (n_ix, n_xi), n_xx)`` with the counts described
            in the body below.
        min_score: threshold a word's score must reach (for any one label)
            to be included in the result.

    Returns:
        A set containing every word whose score reached ``min_score`` under
        one or more labels.
    """
    word_fd = FreqDist()
    label_word_fd = ConditionalFreqDist()

    for label, words in labelled_words:
        for word in words:
            # Item assignment works on both old and new FreqDist classes;
            # FreqDist.inc() no longer exists in NLTK 3.
            word_fd[word] += 1
            label_word_fd[label][word] += 1

    # Total number of word occurrences across all labels.
    n_xx = label_word_fd.N()
    high_info_words = set()

    for label in label_word_fd.conditions():
        label_fd = label_word_fd[label]
        # Number of word occurrences under this label.
        n_xi = label_fd.N()
        # .items() is available on Python 2 and 3; .iteritems() is not.
        for word, n_ii in label_fd.items():
            # n_ii: occurrences of word under this label;
            # n_ix: occurrences of word under every label.
            n_ix = word_fd[word]
            if score_fn(n_ii, (n_ix, n_xi), n_xx) >= min_score:
                high_info_words.add(word)

    return high_info_words
# precision_recall calculates the per-label precision and recall of the classifier
def precision_recall(classifier, testfeats):
    """Compute per-label precision and recall of ``classifier`` on ``testfeats``.

    Args:
        classifier: object providing ``classify(feats)`` and ``labels()``.
        testfeats: iterable of ``(feats, label)`` pairs, where ``label`` is
            the expected (reference) label for ``feats``.

    Returns:
        A ``(precisions, recalls)`` pair of dicts keyed by each label in
        ``classifier.labels()``; the values are whatever
        ``metrics.precision`` / ``metrics.recall`` return for that label's
        reference and predicted index sets.
    """
    refsets = collections.defaultdict(set)
    testsets = collections.defaultdict(set)

    for i, (feats, label) in enumerate(testfeats):
        refsets[label].add(i)
        # Record the prediction as well; without this the predicted sets stay
        # empty and no label can ever be scored against real output.
        observed = classifier.classify(feats)
        testsets[observed].add(i)

    precisions = {}
    recalls = {}
    for label in classifier.labels():
        precisions[label] = metrics.precision(refsets[label], testsets[label])
        recalls[label] = metrics.recall(refsets[label], testsets[label])

    return precisions, recalls
def bag_of_words(words):
    """Return a feature dict mapping each distinct item of *words* to ``True``."""
    return dict.fromkeys(words, True)
def bag_of_words_not_in_set(words, badwords):
    """Return bag-of-words features for the items of *words* not in *badwords*."""
    kept = set(words).difference(set(badwords))
    return bag_of_words(kept)
def bag_of_non_stopwords(words, stopfile='english'):
    """Return bag-of-words features for *words* minus ``stopwords.words(stopfile)``."""
    return bag_of_words_not_in_set(words, stopwords.words(stopfile))
def bag_of_bigrams_words(words, score_fn=BigramAssocMeasures.chi_sq, n=200):
    """Return bag-of-words features over *words* plus selected bigrams.

    The bigrams are those returned by the collocation finder's
    ``nbest(score_fn, n)`` for *words*; they are concatenated onto *words*
    with ``+`` before the feature dict is built.
    """
    finder = BigramCollocationFinder.from_words(words)
    top_bigrams = finder.nbest(score_fn, n)
    tokens_and_bigrams = words + top_bigrams
    return bag_of_words(tokens_and_bigrams)
def label_feats_from_corpus(corp, feature_detector=bag_of_words):
    """Group per-file feature sets of a categorized corpus by category.

    For every category in ``corp.categories()`` and every file id listed for
    that category, ``feature_detector`` is applied to the file's words.

    Returns:
        A ``defaultdict(list)`` mapping category -> list of feature sets, in
        the order the corpus lists its file ids.
    """
    feats_by_label = collections.defaultdict(list)
    for category in corp.categories():
        for fid in corp.fileids(categories=[category]):
            document = corp.words(fileids=[fid])
            feats_by_label[category].append(feature_detector(document))
    return feats_by_label
def split_label_feats(lfeats, split=0.75):
    """Split each label's feature sets into a training and a test portion.

    Args:
        lfeats: mapping of label -> sliceable sequence of feature sets.
        split: fraction of each label's feature sets that goes to training;
            the cutoff index is ``int(len(feats) * split)``, so it is
            truncated towards zero.

    Returns:
        A ``(train_feats, test_feats)`` pair of lists of
        ``(feature set, label)`` tuples.
    """
    train_feats = []
    test_feats = []
    # .items() exists on both Python 2 and 3 mappings; .iteritems() is
    # Python 2 only and fails on Python 3 dicts.
    for label, feats in lfeats.items():
        cutoff = int(len(feats) * split)
        train_feats.extend((feat, label) for feat in feats[:cutoff])
        test_feats.extend((feat, label) for feat in feats[cutoff:])
    return train_feats, test_feats
def reuters_high_info_words(score_fn=BigramAssocMeasures.chi_sq):
    """Return ``high_information_words`` computed over the reuters corpus.

    Each reuters category is paired with the words of that category, and the
    resulting list is scored with ``score_fn``.
    """
    labeled_words = [
        (category, reuters.words(categories=[category]))
        for category in reuters.categories()
    ]
    return high_information_words(labeled_words, score_fn=score_fn)
def reuters_train_test_feats(feature_detector=bag_of_words):
    """Build training and test instances from the reuters corpus.

    A file goes to the training list when its id starts with ``'training'``
    and to the test list otherwise. Each instance pairs the detected features
    of the file's words with ``reuters.categories(fileid)``.

    Returns:
        A ``(train_feats, test_feats)`` pair of lists of
        ``(feats, labels)`` tuples.
    """
    train_feats, test_feats = [], []
    for fileid in reuters.fileids():
        # Anything not marked 'training' is treated as test data.
        target = train_feats if fileid.startswith('training') else test_feats
        instance = (
            feature_detector(reuters.words(fileid)),
            reuters.categories(fileid),
        )
        target.append(instance)
    return train_feats, test_feats
# Build one bag-of-words feature dict per movie_reviews file, grouped by the
# corpus category the file belongs to.
lfeats = label_feats_from_corpus(movie_reviews)
# Split every category's feature dicts using split_label_feats' default
# fraction (0.75 to training, the remainder to test).
train_feats, test_feats = split_label_feats(lfeats)
# Define the classifier to analyse the emotion of the input sentence
nb_classifier = NaiveBayesClassifier.train(train_feats)
# Some tests
def a():
    """Interactively classify sentences typed at the prompt.

    Loops without an exit condition: reads a sentence, tokenizes it, builds
    its bag-of-words features, classifies them with the module-level
    ``nb_classifier``, then prints the predicted label and the elapsed
    analysis time in milliseconds.
    """
    while True:
        print("\nplease enter a sentense to be analysed:")
        sentence = raw_input('--> ')
        # Take full timestamps. The .microsecond field of a datetime holds
        # only the sub-second part (0-999999), so a difference of two such
        # fields ignores whole seconds, can be negative across a second
        # boundary, and is in microseconds rather than milliseconds.
        start = time.time()
        print("starting to analyse...")
        sent = word_tokenize(sentence)
        posfeat = bag_of_words(sent)
        result = nb_classifier.classify(posfeat)
        after = time.time()
        print("analyse finished...")
        print("result: %s" % (result,))
        # Seconds -> milliseconds, matching the unit named in the message.
        timeconsuming = (after - start) * 1000.0
        print("It takes  %s milliseconds to analyse the sentence" % (timeconsuming,))
#print accuracy(nb_classifier, test_feats)
#probs = nb_classifier.prob_classify(test_feats[0][0])
#print probs.samples()
# Enter the interactive analysis loop; a() has no exit condition of its own,
# so this call only ends when an exception (e.g. an interrupt) is raised.
a()
#negfeat = bag_of_words(['baidu', 'sucks'])
#print nb_classifier.classify(negfeat)
# precision functioning
# nb_precisions, nb_recalls = precision_recall(nb_classifier, posfeat)
#accuracy(nb_classifier, posfeat)
#probs = nb_classifier.prob_classify(posfeat[0][0])
#print accuracy(nb_classifier, test_feats)
#nb_classifier.show_most_informative_features(n=100)
#from nltk.probability import LaplaceProbDist
#nb_classifier = NaiveBayesClassifier.train(train_feats, estimator=LaplaceProbDist)
#from nltk.probability import FreqDist, MLEProbDist, entropy
#fd = FreqDist({'pos': 30, 'neg': 10})
#print entropy(MLEProbDist(fd))
#fd['neg'] = 25
#print entropy(MLEProbDist(fd))
#fd['neg'] = 30
#print entropy(MLEProbDist(fd))
#fd['neg'] = 1
#print entropy(MLEProbDist(fd))