/
classifier_main.py
151 lines (130 loc) · 5.45 KB
/
classifier_main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
''' This is a main module of sorts: the first half defines the helper functions; see the second half of the file for the part that actually runs anything.
'''
import dataset_loader as loader
import svm_classifier as svm
import sentiment_lexicon as sentlex
import numpy as np
import scipy.stats as stats
# Module-level state shared by the functions below. Everything starts out as
# an empty-string placeholder and is overwritten by load_sets():
#   train_set    -- result of loader.tweets_to_tweetlist() on the training file
#   lexicon_set  -- result of loader.tweets_to_wordlist() on the training file
#   test_set     -- result of loader.tweets_to_tweetlist() on the test file
#   train_labels -- the values of train_set
#   train_tweets -- the keys of train_set after svm.vectorize_tweets()
#   lexicon      -- result of sentlex.get_ratioDict()
train_set = lexicon_set = test_set = ''
train_labels = train_tweets = lexicon = ''
def get_labels(tweets, dictionary):
    '''Look up the label of every tweet.

    tweets     -- iterable of keys that are present in ``dictionary``
    dictionary -- mapping from tweet to label

    Returns the labels as a list, in the same order as ``tweets``.
    Raises KeyError if a tweet has no entry in ``dictionary``.
    '''
    return [dictionary[tweet] for tweet in tweets]
def calculate_accuracy(predicted, actual):
    '''Calculate the accuracy of the results.

    predicted -- mapping from item to predicted label
    actual    -- mapping from item to expected label; it must contain every
                 key of ``predicted`` (a missing key raises KeyError)

    Returns the fraction of keys in ``predicted`` whose label equals the one
    in ``actual``, as a float between 0.0 and 1.0.  An empty ``predicted``
    yields 0.0 instead of raising ZeroDivisionError.
    '''
    if not predicted:
        # Nothing was predicted, so there is nothing to divide by.
        return 0.0
    matches = sum(1 for key in predicted if predicted[key] == actual[key])
    # float() keeps the division fractional under Python 2 as well.
    return float(matches) / len(predicted)
def load_sets(training_file, test_file):
    '''Load the test and train sets, train the SVM and prepare the lexicon.

    training_file -- training data file, handed to the loader unchanged
    test_file     -- test data file, handed to the loader unchanged

    Nothing is returned: the results are stored in the module-level names
    train_set, lexicon_set, test_set, train_labels, train_tweets and lexicon.
    '''
    global train_set, lexicon_set, test_set, train_labels, train_tweets, lexicon
    # Loading the training and lexicon sets (both are read from the training file)
    train_set = loader.tweets_to_tweetlist(training_file, neutral=True)
    lexicon_set = loader.tweets_to_wordlist(training_file, neutral=False)
    # Loading the test set
    test_set = loader.tweets_to_tweetlist(test_file, neutral=True)
    # Extracting the labels and tweets of the train set, transforming the
    # tweets into vectors and training the SVM analyzer. keys()/values() are
    # materialised as lists so the helpers receive real sequences on Python 3
    # as well (there they would otherwise be dict views).
    train_labels = list(train_set.values())
    train_tweets = svm.vectorize_tweets(list(train_set.keys()))
    svm.train(train_tweets, train_labels)
    # Building the lexicon. The trailing 10 is passed through unchanged; its
    # meaning is defined by sentlex.get_ratioDict.
    lexicon = sentlex.get_ratioDict(lexicon_set['positive'], lexicon_set['negative'], 10)
def predict(test_tweets, lexicon_only=False):
    '''Run a prediction on the given tweets.

    test_tweets  -- the tweets to classify; passed to sentlex.predict together
                    with the module-level ``lexicon``
    lexicon_only -- if true, return the lexicon prediction as it is

    Returns the mapping produced by sentlex.predict.  Unless ``lexicon_only``
    is set, the entries that sentlex.get_unclassified reports as undecided
    are overwritten with the SVM's predictions for those tweets.
    '''
    predicted = sentlex.predict(test_tweets, lexicon)
    if lexicon_only:
        return predicted

    undecided = sentlex.get_unclassified(predicted)
    if len(undecided) == 0:
        # The lexicon settled every tweet, so there is nothing to hand to the
        # SVM; skipping the call avoids vectorising and predicting on an
        # empty batch.
        return predicted

    # NOTE(review): the False flag presumably tells the vectoriser not to
    # refit on these tweets -- confirm in svm_classifier.
    undecided_transformed = svm.vectorize_tweets(undecided, False)
    predicted_from_svm = svm.predict(undecided_transformed)
    # SVM results come back in the same order as the undecided tweets.
    predicted.update(zip(undecided, predicted_from_svm))
    return predicted
def conduct_test(test_set, lexicon_only=False, svm_only=False, number_of_tweets=3700, batch_size=100):
    '''Run a test on the test set and return the accuracy of every batch.

    The first ``number_of_tweets`` entries of the set are divided into groups
    of ``batch_size`` tweets; whatever lies beyond ``number_of_tweets`` forms
    one final group.  Empty groups are skipped, so a test set with
    ``number_of_tweets`` entries or fewer no longer produces empty batches.

    test_set         -- mapping from tweet to expected label
    lexicon_only     -- if true, score only the lexicon prediction
    svm_only         -- if true, score only the SVM (via svm.score); takes
                        precedence over ``lexicon_only``
    number_of_tweets -- how many tweets are split into regular batches
    batch_size       -- number of tweets per regular batch (default 100)

    Returns a list with one score per batch.
    Raises ValueError if ``batch_size`` is not positive.
    '''
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Materialise the (tweet, label) pairs once: this keeps tweets and labels
    # aligned, avoids rebuilding the lists for every batch, and works on
    # Python 3 where keys()/values()/items() cannot be sliced.
    items = list(test_set.items())
    limit = min(number_of_tweets, len(items))

    # Regular batches are capped at ``limit`` so that no tweet is scored both
    # in the last regular batch and in the remainder.
    batches = [items[start:min(start + batch_size, limit)]
               for start in range(0, limit, batch_size)]
    remainder = items[limit:]
    if remainder:
        batches.append(remainder)

    scores = []
    for batch in batches:
        tweets = [tweet for tweet, _ in batch]
        if svm_only:
            labels = [label for _, label in batch]
            transformed = svm.vectorize_tweets(tweets, False)
            scores.append(svm.score(transformed, labels))
        else:
            predicted = predict(tweets, lexicon_only)
            scores.append(calculate_accuracy(predicted, dict(batch)))
    return scores
def get_stats(measures):
    '''Return the descriptive statistics of a list of results.

    measures -- sequence of numeric scores

    The returned dictionary holds the mean, median, standard deviation (as
    computed by np.std), a (min, max) tuple, the number of samples, the size
    of the module-level ``test_set`` and the result of stats.shapiro.
    '''
    return {
        'mean': np.mean(measures),
        'median': np.median(measures),
        'std': np.std(measures),
        'min - max': (np.min(measures), np.max(measures)),
        'number of samples': len(measures),
        # Read from the module-level test_set, not from the argument.
        'test set size': len(test_set.keys()),
        'Shapiro normality': stats.shapiro(measures),
    }
###############################################################################
# This part loads and runs the classifiers, prints statistics and conducts
# the t-tests.

# Baseline accuracy for the one-sample t-tests. calculate_accuracy() returns
# fractions between 0 and 1, so the 41.03% baseline has to be expressed on the
# same scale; testing fractions against 41.03 would be significant no matter
# what the classifiers did.
# NOTE(review): assumes svm.score() also reports a fraction -- confirm in
# svm_classifier.
BASELINE_ACCURACY = 0.4103

#load sets
load_sets("train.train", "test.test")

#conduct experiments
# Single-argument print(...) behaves the same as a Python 2 print statement
# and as the Python 3 print function.
print(" SVM + Lexicon Analyzer: ")
svm_lex = conduct_test(test_set, lexicon_only=False, svm_only=False)
print(get_stats(svm_lex))

print("SVM only: ")
svm_res = conduct_test(test_set, lexicon_only=False, svm_only=True)
print(get_stats(svm_res))

print(" Lexicon only: ")
lex = conduct_test(test_set, lexicon_only=True, svm_only=False)
print(get_stats(lex))

# Test equality of variances. We already know they're equal, so we're running the t-tests right away.
variance = stats.levene(svm_lex, svm_res, lex)
print(variance)

#t-tests (paired: every list holds one score per batch, in the same batch order)
print("-------------------------")
print("T-test: SVM lexicon and svm only:")
print(stats.ttest_rel(svm_lex, svm_res))
print("-------------------------")
print("T-test: SVM lexicon and lexicon")
print(stats.ttest_rel(svm_lex, lex))
print("--------------------------")
print("T-test: SVM only and lexicon:")
print(stats.ttest_rel(svm_res, lex))

#one-sample t-tests against the baseline
print("T-test baseline(41.03: ")
print("SVM lexicon: ")
print(stats.ttest_1samp(svm_lex, BASELINE_ACCURACY))
print("SVM: ")
print(stats.ttest_1samp(svm_res, BASELINE_ACCURACY))
print("lexicon: ")
print(stats.ttest_1samp(lex, BASELINE_ACCURACY))