/
sentiment.py
70 lines (53 loc) · 1.96 KB
/
sentiment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""
@package sentiment
Twitter sentiment analysis.
This code performs sentiment analysis on Tweets.
A custom feature extractor looks for key words and emoticons. These are fed in
to a naive Bayes classifier to assign a label of 'positive', 'negative', or
'neutral'. Optionally, a principal components transform (PCT) is used to lessen
the influence of covariant features.
"""
import csv, random
import nltk
import tweet_features, tweet_pca
#import pdb
# read all tweets and labels
fp = open( 'sentiment.csv', 'rb' )
reader = csv.reader( fp, delimiter=',', quotechar='"', escapechar='\\' )
tweets = []
for row in reader:
#tweets.append( [row[1], row[4]] );
text = nltk.word_tokenize(row[1], language='english')
text = nltk.pos_tag(text)
tweets.append([text,row[4]])
#added: pos tag is added as feature
print tweets
# Collapse 'irrelevant' into 'neutral' so downstream stages only ever
# see three classes: positive, negative, neutral.
for labeled in tweets:
    labeled[1] = 'neutral' if labeled[1] == 'irrelevant' else labeled[1]
# Split into training and test sets.
random.shuffle(tweets)

# Build (feature-dict, label) pairs for every tagged tweet.
fvecs = [(tweet_features.make_tweet_dict(t), s) for (t, s) in tweets]

# BUG FIX: the original assigned fvecs[:10] to BOTH v_train and v_test,
# so accuracy was measured on the training data itself. Use a disjoint
# 75%/25% split instead (shuffle above randomizes the order first).
cut = (len(fvecs) * 3) // 4
v_train = fvecs[:cut]
v_test = fvecs[cut:]

# dump tweets for which our feature extractor found nothing
#for i in range(0, len(tweets)):
#    if tweet_features.is_zero_dict( fvecs[i][0] ):
#        print tweets[i][1] + ': ' + tweets[i][0]

# Apply PCA reduction. output_dim=1.0 — presumably a fraction of
# variance/dimensions to keep; confirm against tweet_pca.tweet_pca_reduce.
(v_train, v_test) = \
    tweet_pca.tweet_pca_reduce( v_train, v_test, output_dim=1.0 )
# train classifier
classifier = nltk.NaiveBayesClassifier.train(v_train);
#classifier = nltk.classify.maxent.train_maxent_classifier_with_gis(v_train);
# classify and dump results for interpretation
print '\nAccuracy %f\n' % nltk.classify.accuracy(classifier, v_test)
#print classifier.show_most_informative_features(200)
# build confusion matrix over test set
test_truth = [s for (t,s) in v_test]
test_predict = [classifier.classify(t) for (t,s) in v_test]
print test_predict
print 'Confusion Matrix'
print nltk.ConfusionMatrix( test_truth, test_predict )