-
Notifications
You must be signed in to change notification settings - Fork 1
/
polarity.py
94 lines (69 loc) · 5.03 KB
/
polarity.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
from features import features_polarity as features
from classifiers import LogisticRegression, SVM, MajorityClassifier
import learningCurves
from utilities import *
from feature_selection import selection
import regularization
from evaluation import measures
from feature_selection import selection
def classify(messages_train,labels_train,messages_test,process_messages_train,process_messages_test,tokens_train,tokens_test,process_tokens_train,process_tokens_test,pos_tags_train,pos_tags_test,negationList,clusters,slangDictionary,lexicons,mpqa_lexicons):
    """Train a binary polarity classifier and label the test messages.

    Pipeline: build POS-tag unigram/bigram/trigram score dictionaries and
    MPQA lexicon scores from the *training* data only, extract one feature
    vector per message for both sets, regularize, train a linear SVM, and
    return its predictions for messages_test.

    Returns:
        prediction -- per-test-message labels as produced by SVM.predict;
            training labels are 0 (negative) and 1 (everything else), so
            predictions are presumably in {0, 1}.

    NOTE(review): evaluate() below scores against classes {-1, 1} -- confirm
    the caller remaps these {0, 1} predictions before evaluating.
    """
    # Label encoding for training:
    # 0 - negative messages
    # 1 - positives messages
    # NOTE(review): any label that is not "negative" (including "neutral",
    # if present) is collapsed into class 1 -- confirm inputs are two-class.
    labels_train = [0 if x=="negative" else 1 for x in labels_train]
    #compute pos tag bigrams for all messages
    pos_bigrams_train = getBigrams(pos_tags_train)
    pos_bigrams_test = getBigrams(pos_tags_test)
    #compute pos tag trigrams for all messages
    pos_trigrams_train = getTrigrams(pos_tags_train)
    pos_trigrams_test = getTrigrams(pos_tags_test)
    #get the unique pos tags, bigrams and trigrams from training set
    unique_pos_tags = getPosTagsSet(pos_tags_train)
    unique_bigrams = getBigramsSet(pos_bigrams_train)
    unique_trigrams= getTrigramsSet(pos_trigrams_train)
    #calculate pos tag (unigram) score for each category
    #both dictionaries will be used for training and testing (cannot create new for testing because we don't know the labels of the new messages)
    pos_tags_scores_negative = posTagsScore(unique_pos_tags,0,pos_tags_train,labels_train)
    pos_tags_scores_positive = posTagsScore(unique_pos_tags,1,pos_tags_train,labels_train)
    #calculate pos bigram score for each category
    #both dictionaries will be used for training and testing (cannot create new for testing because we don't know the labels of the new messages)
    pos_bigrams_scores_negative = posBigramsScore(unique_bigrams,0,pos_bigrams_train,labels_train)
    pos_bigrams_scores_positive = posBigramsScore(unique_bigrams,1,pos_bigrams_train,labels_train)
    #calculate pos trigram score for each category
    #both dictionaries will be used for training and testing (cannot create new for testing because we don't know the labels of the new messages)
    pos_trigrams_scores_negative = posTrigramsScore(unique_trigrams,0,pos_trigrams_train,labels_train)
    pos_trigrams_scores_positive = posTrigramsScore(unique_trigrams,1,pos_trigrams_train,labels_train)
    #assign a precision and F1 score to each word of to all mpqa lexicons
    mpqaScores = getScores(mpqa_lexicons,process_messages_train,labels_train)
    #get features from train messages
    features_train = features.getFeatures(messages_train,process_messages_train,tokens_train,process_tokens_train,pos_tags_train,slangDictionary,lexicons,mpqa_lexicons,pos_bigrams_train,pos_trigrams_train,pos_bigrams_scores_negative,pos_bigrams_scores_positive,pos_trigrams_scores_negative,pos_trigrams_scores_positive,pos_tags_scores_negative,pos_tags_scores_positive,mpqaScores,negationList,clusters)
    #regularize train features
    features_train=regularization.regularize(features_train)
    #get features from test messages (same score dictionaries as training -- see note above each *Score call)
    features_test = features.getFeatures(messages_test,process_messages_test,tokens_test,process_tokens_test,pos_tags_test,slangDictionary,lexicons,mpqa_lexicons,pos_bigrams_test,pos_trigrams_test,pos_bigrams_scores_negative,pos_bigrams_scores_positive,pos_trigrams_scores_negative,pos_trigrams_scores_positive,pos_tags_scores_negative,pos_tags_scores_positive,mpqaScores,negationList,clusters)
    #regularize test features
    features_test=regularization.regularize(features_test)
    #feature selection (disabled)
    #features_train, features_test = selection.feature_selection(features_train,labels_train,features_test,1150)
    #C parameter of SVM -- presumably chosen by an earlier hyperparameter search; TODO confirm
    C = 0.001953125
    #C = 19.3392161013
    #train classifier and return trained model
    #model = LogisticRegression.train(features_train,labels_train)
    model = SVM.train(features_train,labels_train,c=C,k="linear")
    #predict labels
    #prediction = LogisticRegression.predict(features_test,model)
    prediction = SVM.predict(features_test,model)
    return prediction
def evaluate(prediction,labels_test):
labels_test = [0 if x=="neutral" else 1 if x=="positive" else -1 for x in labels_test]
#logistic regression evaluation
print "Average F1 : " +str(measures.avgF1(labels_test,prediction,-1,1))
#print "Baseline AverageF1 : " +str(measures.avgF1(labels_test,baseline_prediction))
print "Accuracy : " +str(measures.accuracy(labels_test,prediction))
#print "Baseline Accuracy : "+str(measures.accuracy(labels_test,baseline_prediction))
print "F1 negative : " +str(measures.F1(labels_test,prediction,-1))
print "F1 positive : " +str(measures.F1(labels_test,prediction,1))
print "Precision negative: " +str(measures.precision(labels_test,prediction,-1))
print "Precision positive: " +str(measures.precision(labels_test,prediction,1))
print "Recall negative : " +str(measures.recall(labels_test,prediction,-1))
print "Recall positive : " +str(measures.recall(labels_test,prediction,1))