-
Notifications
You must be signed in to change notification settings - Fork 0
/
Osgood.py
44 lines (35 loc) · 925 Bytes
/
Osgood.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import unicodedata
from pickle import dump

import nltk
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import WordNetError
from nltk.stem.wordnet import WordNetLemmatizer
# Build the list of "subjective" words from the IMDb vocabulary: a word is
# kept when one of its first two WordNet adjective senses ('.a.01'/'.a.02')
# has a SentiWordNet objectivity score below 0.5 — i.e. the sense carries
# more sentiment than objective meaning.
#
# Reads:   aclImdb/imdb.vocab  (one word per line)
# Writes:  sword.set           (pickled list of ASCII word strings)
# Prints:  the number of subjective words found.

# One lemmatizer for the whole run (the original rebuilt it per word).
lemmatizer = WordNetLemmatizer()
subject_words = []
forms = ['.a.01', '.a.02']  # SentiWordNet synset-name suffixes to probe

with open('aclImdb/imdb.vocab') as vocab:
    for line in vocab:
        # Strip BEFORE lemmatizing — the original fed the trailing newline
        # to the lemmatizer, which could block the lemma lookup.
        word = line.strip()
        if not word:
            continue
        root = lemmatizer.lemmatize(word)
        # Fold any non-ASCII lemma down to its ASCII skeleton so the
        # synset lookup key ('word.a.01') is plain ASCII.
        root = unicodedata.normalize('NFKD', root).encode('ascii', 'ignore').decode('ascii')
        for form in forms:
            try:
                if swn.senti_synset(root + form).obj_score() < 0.5:
                    subject_words.append(root)
                    break  # one subjective sense is enough; next word
            except WordNetError:
                # This word has no such adjective sense — try the next form.
                pass

# pickle needs a binary handle; the original's text-mode 'w' corrupts
# the dump on Python 3.
with open('sword.set', 'wb') as out:
    dump(subject_words, out)

print(len(subject_words))