/
feature_and_opinion_extraction.py
191 lines (174 loc) · 7.43 KB
/
feature_and_opinion_extraction.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
from spacy.en import English
from pattern.en import lemma, sentiment
from spacy.parts_of_speech import NOUN, VERB, ADV, ADJ
from nltk.corpus import stopwords
import apriori
import pandas as pd
import numpy as np
class FeatureAndOpinionExtractor(object):
    """
    Extracts Features and Opinions from reviews.

    `data` is a pandas DataFrame with one review per row in a 'text' column.
    Constructing the extractor adds these columns to it in place:
    'sentences', 'nouns', 'adjectives', 'adverbs', 'verbs', 'polarity' and
    'scores'.
    """

    # Text type on both Python 2 (unicode) and Python 3 (str), so the class
    # does not depend on the Python-2-only `unicode` builtin.
    _text_type = type(u'')

    def __init__(self, data, lang):
        """
        input : DataFrame, callable
        data : reviews in a 'text' column; new columns are added in place.
        lang : called with no arguments to build the NLP pipeline kept in
               self.nlp.
        """
        self.data = data
        self.nlp = lang()
        self.frequent_features = []
        self.feature_phrases = []
        self.feature_words = []
        self.features = []
        self._preprocess()
        # The class defines no `_get_sentiment`; `_get_polarity` is the
        # method that computes the sentiment columns.
        self._get_polarity()

    def _preprocess(self):
        """
        Preprocesses the data and calls the functions to extract features and its opinions.
        """
        self.data['sentences'] = self.data['text'].apply(self._tokenize_sent)
        self.data['nouns'] = self.data['sentences'].apply(self._get_nouns)
        # The frequent-feature / pruning pipeline is currently disabled:
        # self._get_frequent_features()
        # self._compactness_pruning()
        # self._redundancy_pruning()
        # self._get_features()
        self._extract_opinions()

    def _tokenize_sent(self, review):
        """
        input : string (bytes or text)
        output : list
        Returns list of sentences of a review.

        Non-ASCII characters are dropped and the text is split on '.', so
        the list may contain empty or whitespace-only pieces.
        """
        if isinstance(review, bytes):
            text = review.decode('ascii', 'ignore')
        else:
            # Calling .decode() on text makes Python 2 encode it strictly
            # first, which raises on non-ASCII input; strip those
            # characters explicitly instead.
            text = review.encode('ascii', 'ignore').decode('ascii')
        return text.split('.')

    def _get_nouns(self, review):
        """
        input : list of sentences
        output : list of lists
        Returns features(nouns) from each sentence of a review, lower-cased
        and lemmatised.
        """
        text_type = self._text_type
        review_features = []
        for sent in review:
            doc = self.nlp(sent)
            nouns = [text_type(lemma(str(word).lower()))
                     for word in doc if word.pos == NOUN]
            review_features.append(nouns)
        return review_features

    def _get_frequent_features(self, min_support=0.01):
        """
        Frequent Features are found using apriori algorithm.

        input : float (minimum support, default 0.01)
        Stores the frequent items in self.frequent_features.

        NOTE(review): reads the 'noun_and_np' column, which _preprocess does
        not create (it writes 'nouns'); presumably the caller's data carries
        it -- confirm before re-enabling this step.
        """
        feature_terms = [sent_terms
                         for review_terms in self.data['noun_and_np'].values
                         for sent_terms in review_terms]
        candidates = apriori.createC1(feature_terms)
        # Built as real lists so this also works where map() is lazy.
        transactions = [set(terms) for terms in feature_terms]
        frequent, support_data = apriori.scanD(transactions, candidates,
                                               min_support)
        self.frequent_features = ["".join(list(itemset))
                                  for itemset in frequent]

    def _distance(self, sentence, feature_phrase, max_gap=3):
        """
        input : string, string, int
        output : bool
        Returns True if every word of the phrase occurs in the sentence and
        at most `max_gap` words lie between each pair of consecutive phrase
        words, else False.

        Words are located with str.find, i.e. first occurrence, substring
        match. Phrases with fewer than two words return False.
        """
        words = feature_phrase.split()
        if len(words) < 2:
            return False
        starts = [sentence.find(word) for word in words]
        if -1 in starts:
            # A missing word would otherwise turn into a bogus slice bound.
            return False
        for word, start, next_start in zip(words, starts, starts[1:]):
            between = sentence[start + len(word):next_start]
            if len(between.split()) > max_gap:
                return False
        return True

    def _is_compact(self, feature_phrase):
        """
        input : string
        output : bool
        Returns whether the input feature phrase is compact or not.

        Only phrases of two or three words are considered; such a phrase is
        compact once two sentences (taken from reviews whose text contains
        the phrase) pass the _distance check.
        """
        if not 1 < len(feature_phrase.split()) <= 3:
            return False
        # Literal match: the phrase is plain text, not a regular expression.
        mask = self.data['text'].str.contains(feature_phrase, regex=False,
                                              na=False)
        count = 0
        for review in self.data[mask]['sentences'].values:
            for sent in review:
                if self._distance(sent, feature_phrase):
                    count += 1
                    if count == 2:
                        return True
        return False

    def _compactness_pruning(self):
        """Keeps the frequent features that are compact phrases (see _is_compact)."""
        # Stored under `feature_phrases`, the attribute that
        # _redundancy_pruning and _get_features read.
        self.feature_phrases = [phrase for phrase in self.frequent_features
                                if self._is_compact(phrase)]

    def _is_redundant(self, ftr, phrase_list):
        """
        input : string, list
        output : bool
        Returns True when the feature has enough support of its own.

        Despite the name, True means "keep": _redundancy_pruning appends the
        feature to self.feature_words when this returns True.

        Without phrases, the feature needs at least 3 reviews whose text
        contains it. With phrases, it needs 3 such reviews whose
        'noun_and_np' entry contains none of the phrases.

        NOTE(review): the counter used to be reset for every review, so it
        could not add up across reviews; it now accumulates, matching the
        no-phrase branch. The nesting here is a reconstruction -- confirm.
        NOTE(review): frozenset(phrase) is the set of the phrase's
        characters, and 'noun_and_np' is not created by _preprocess; the
        containment test is kept as written -- verify against the data.
        """
        mask = self.data['text'].str.contains(ftr, regex=False, na=False)
        matching = self.data[mask]
        if not phrase_list:
            return len(matching) >= 3
        support = 0
        for terms in matching['noun_and_np'].values:
            term_set = frozenset(terms)
            if any(frozenset(phrase).issubset(term_set)
                   for phrase in phrase_list):
                continue
            support += 1
            if support == 3:
                return True
        return False

    def _redundancy_pruning(self):
        """Prunes redundant single word features"""
        single_words = [feature for feature in self.frequent_features
                        if len(feature.split()) == 1]
        for ftr in single_words:
            # Feature phrases that contain this word as a substring.
            phrase_list = [phrase for phrase in self.feature_phrases
                           if ftr in phrase]
            if self._is_redundant(ftr, phrase_list):
                self.feature_words.append(ftr)

    def _get_features(self):
        """Combines feature words and phrases, dropping English stop words."""
        stop = set(stopwords.words('english'))
        candidates = self.feature_words + self.feature_phrases
        self.features = [feature for feature in candidates
                         if feature not in stop]

    def _remove_stop_words(self, review):
        """
        input : list of lists of strings
        output : list of lists of strings
        Returns the review with English stop words removed from each sentence.
        """
        stop = set(stopwords.words('english'))
        return [[item for item in sent if item.lower() not in stop]
                for sent in review]

    def _extract_pos(self, review, pos):
        """
        input : list of sentences, part-of-speech constant
        output : list of lists of strings
        Returns, for each sentence, the words whose part of speech equals
        `pos` and that are not English stop words.
        """
        text_type = self._text_type
        stop = set(stopwords.words('english'))
        pos_list = []
        for sent in review:
            doc = self.nlp(text_type(sent))
            # Compare as text: encoded bytes never match the stop words on
            # Python 3.
            pos_ext = [text_type(word) for word in doc
                       if word.pos == pos
                       and text_type(word).lower() not in stop]
            pos_list.append(pos_ext)
        return pos_list

    def _extract_opinions(self):
        """
        Extracts adjectives, adverbs, verbs for each sentence of a review.
        """
        sentences = self.data['sentences']
        for column, pos in (('adjectives', ADJ),
                            ('adverbs', ADV),
                            ('verbs', VERB)):
            self.data[column] = sentences.apply(self._extract_pos,
                                                args=(pos,))

    def _get_polarity(self):
        """
        Stores the polarity of every sentence in the 'polarity' column and
        the normalized scores in the 'scores' column.
        """
        # sentiment() yields a (polarity, subjectivity) pair; only the
        # polarity can be binned into a score.
        self.data['polarity'] = self.data['sentences'].apply(
            lambda review: [sentiment(sent)[0] for sent in review])
        polarities = [polarity
                      for sent_polarities in self.data['polarity'].values
                      for polarity in sent_polarities]
        self._get_normalized_score(polarities)

    def _get_normalized_score(self, polarities):
        """
        input : flat list of numbers, one per sentence in row order
        Bins the polarities into 9 equal-width bins labelled 1 to 5 in steps
        of 0.5 and stores them per review, as a list of labels, in the
        'scores' column.
        """
        labels = [1, 1.5, 2, 2.5, 3, 3.5, 4, 4.5, 5]
        if len(polarities):
            binned = pd.cut(polarities, bins=len(labels), right=True,
                            labels=labels, retbins=False, precision=2,
                            include_lowest=True)
            normalized = list(binned)
        else:
            # pd.cut cannot bin an empty sequence.
            normalized = []
        scores = []
        start = 0
        for review in self.data['sentences']:
            end = start + len(review)
            scores.append(normalized[start:end])
            start = end
        # A Series of lists keeps one entry per review regardless of how many
        # sentences each review has.
        self.data['scores'] = pd.Series(scores, index=self.data.index,
                                        dtype=object)
# if __name__ == "__main__":
# reviews = pd.read_pickle("../data/cleaned_review_data.pkl")
# foe = FeatureAndOpinionExtractor(reviews, English)
# reviews = pd.read_pickle("../data/italian_cleaned_review_data.pkl")