/
util.py
65 lines (59 loc) · 1.63 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import EnglishStemmer
import nltk
from nltk import word_tokenize
import urllib2
import json
URL_SENTIMENT140 = "http://www.sentiment140.com/api/bulkClassifyJson"
def remove_stopwords(sentence):
    """
    Removes stopwords from the sentence.

    :param sentence: (str) sentence
    :returns: (str) cleaned sentence without any stopwords
    """
    # Set lookup is O(1) per token vs O(n) for the raw stopword list.
    sw = set(stopwords.words('english'))
    words = get_words(sentence)
    # Keep only non-stopword tokens and rejoin them with single spaces.
    # (Removed an unused `cleaned = []` local that was never populated.)
    return ' '.join(w for w in words if w not in sw)
def get_words(sentence):
    """
    Extracts words/tokens from a sentence.

    :param sentence: (str) sentence
    :returns: (list) tokens produced by NLTK's word_tokenize
    """
    return word_tokenize(sentence)
# Shared stemmer instance: EnglishStemmer keeps no per-call state, so
# constructing a new one on every stem_word() call was wasted work
# (intent() calls this once per verb token).
_ENGLISH_STEMMER = EnglishStemmer()


def stem_word(word):
    """
    Stem a word with the Snowball English stemmer.

    :param word: (str) text word
    :returns: (str) stemmed word
    """
    return _ENGLISH_STEMMER.stem(word)
def sentiment(data):
    """
    Sentiment analysis using the Sentiment140 bulk-classify HTTP API.

    :param data: (list) tweets, in the JSON structure Sentiment140 expects
    :returns: tweets tagged with polarity (parsed JSON response)
    """
    payload = json.dumps(data)
    # POST the JSON payload; passing a body makes urlopen issue a POST.
    response = urllib2.urlopen(URL_SENTIMENT140, payload)
    try:
        # Read and parse inside try so the connection is released even if
        # reading or JSON decoding raises.
        return json.loads(response.read())
    finally:
        response.close()  # was leaked in the original: never closed
def intent(data):
    """
    Define intent of the tweet by extracting and stemming its verbs.

    :param data: (str) cleaned up tweet
    :returns: (list) stemmed verb tokens, in order of appearance
    """
    # Penn Treebank verb tags: base, gerund, past, participle, present forms.
    accept = ['VB', 'VBG', 'VBD', 'VBN', 'VBP', 'VBZ']
    tagged = nltk.pos_tag(word_tokenize(data))
    return [stem_word(token) for token, tag in tagged if tag in accept]