forked from stathius/yenlp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
sentiwordnet.py
118 lines (108 loc) · 4.1 KB
/
sentiwordnet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from nltk.corpus import wordnet as wordnet
from nltk.corpus import sentiwordnet as swn
import nltk
def wordnet_pos_code(tag):
    """Map a Penn Treebank POS tag prefix to its WordNet POS constant.

    Returns the empty string for tags with no WordNet counterpart
    (e.g. determiners, prepositions).
    """
    prefix_to_wn = (
        ('NN', wordnet.NOUN),
        ('VB', wordnet.VERB),
        ('JJ', wordnet.ADJ),
        ('RB', wordnet.ADV),
    )
    for prefix, wn_code in prefix_to_wn:
        if tag.startswith(prefix):
            return wn_code
    return ''
def pos_tag(sentence):
    """Tokenize *sentence* and POS-tag it with NLTK's default tagger.

    Returns a list of dicts, one per token, of the form
    ``{'word': token, 'pos': penn_treebank_tag}``.
    """
    return [{'word': token, 'pos': tag}
            for token, tag in nltk.pos_tag(nltk.word_tokenize(sentence))]
def word_sense_cdf(word, context, wn_pos):
    '''Word sense disambiguation by matching-word frequency between the
    context and each sense's definition (gloss). Adapted from
    www.slideshare.net/faigg/tutotial-of-sentiment-analysis

    Returns the best-matching Synset, or None when *word* has no synsets
    for the given WordNet POS code.
    '''
    senses = wordnet.synsets(word, wn_pos)
    if not senses:
        return None
    # Count, per sense, how many of its gloss words appear in the context.
    cfd = nltk.ConditionalFreqDist(
        (sense, def_word)
        for sense in senses
        for def_word in sense.definition().split()
        if def_word in context)
    # Default to the most common sense (WordNet orders synsets by frequency).
    best_sense = senses[0]
    for sense in senses:
        try:
            if cfd[sense].max() > cfd[best_sense].max():
                best_sense = sense
        except ValueError:
            # BUGFIX: was a bare `except: pass`, which hid every error.
            # FreqDist.max() raises ValueError when a sense had no gloss
            # words in the context; that is the only expected failure here.
            pass
    return best_sense
def word_sense_similarity(word, context, dummy = None):
    '''Another word sense disambiguation technique. It's VERY SLOW.
    Adapted from: pythonhosted.org/sentiment_classifier

    Picks the synset of *word* whose path similarity to the synsets of a
    single context word is highest. *dummy* is unused; it keeps the
    signature interchangeable with word_sense_cdf (word, context, pos).
    Returns a Synset or None when nothing scored above zero.
    '''
    # NOTE(review): the original indentation was lost in extraction; the
    # nesting below is reconstructed to match the cited
    # pythonhosted sentiment_classifier source -- verify against upstream.
    wordsynsets = wordnet.synsets(word)
    bestScore = 0.0
    result = None
    for synset in wordsynsets:
        for w in nltk.word_tokenize(context):
            # Accumulate similarity of this candidate synset against all
            # synsets of one context token.
            score = 0.0
            for wsynset in wordnet.synsets(w):
                sim = wordnet.path_similarity(wsynset, synset)
                if(sim == None):
                    # No path between the two synsets in the hierarchy.
                    continue
                else:
                    score += sim
            if (score > bestScore):
                bestScore = score
                result = synset
    return result
def sentiwordnet_classify(text):
    '''Breaks a multi sentence text to separate sentences.
    This improves context for the word sense disambiguation.
    Returns 'pos', 'neg', or None when both scores are exactly zero.'''
    total = 0
    total_thr = 0
    for sentence in nltk.sent_tokenize(text):
        plain, thresholded = sentence_score(sentence)
        total += plain
        total_thr += thresholded
    # Trust the thresholded value more when classifying.
    if total_thr != 0:
        return 'pos' if total_thr > 0 else 'neg'
    if total != 0:
        return 'pos' if total > 0 else 'neg'
    return None
def sentence_score(text, threshold = 0.75, wsd = word_sense_cdf):
    '''Classifies a phrase according to sentiment analysis based
    on WordNet and SentiWordNet. It also computes a thresholded
    score by ignoring strongly objective words.

    Returns (pos_score - neg_score, pos_score_thr - neg_score_thr),
    where the thresholded pair only counts words whose objectivity
    score is below *threshold*. *wsd* is the word-sense-disambiguation
    function, called as wsd(word, text, wordnet_pos_code(tag)).
    '''
    tagged_words = pos_tag(text)
    obj_score = 0       # cumulative objectivity (accumulated but not returned)
    pos_score = 0       # positive score
    neg_score = 0       # negative score
    pos_score_thr = 0   # positive score, strongly-objective words excluded
    neg_score_thr = 0   # negative score, strongly-objective words excluded
    for word in tagged_words:
        # pos_tag() never sets a 'punct' key, so this guard is always true
        # here; presumably kept for tokens produced by another tagger.
        if 'punct' not in word:
            sense = wsd(word['word'], text, wordnet_pos_code(word['pos']))
            if sense is not None:
                sent = swn.senti_synset(sense.name())
                # BUGFIX: original used the Python 2-only '<>' operator,
                # a SyntaxError in Python 3; replaced with '!='.
                # Fully objective words (obj == 1) carry no sentiment.
                if sent is not None and sent.obj_score() != 1:
                    obj_score += float(sent.obj_score())
                    pos_score += float(sent.pos_score())
                    neg_score += float(sent.neg_score())
                    if sent.obj_score() < threshold:
                        pos_score_thr += float(sent.pos_score())
                        neg_score_thr += float(sent.neg_score())
    return (pos_score - neg_score, pos_score_thr - neg_score_thr)