/
nlp_amazon_data_preparation_prediction_deploy.py
302 lines (175 loc) · 7.7 KB
/
nlp_amazon_data_preparation_prediction_deploy.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
# -*- coding: utf-8 -*-
"""NLP_Amazon_Data_Preparation_prediction-deploy.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1yRYpeoDWvhvD9ZXk6xfCAs13sGs5GUwD
"""
## import all the necessary libraries
import warnings
#Ignoring unnecessory warnings
warnings.filterwarnings("ignore")
import numpy as np #for large and multi-dimensional arrays
import pandas as pd #for data manipulation and analysis
import nltk
##reading dataset
df = pd.read_csv('F:\Data Science projects\By_me\ML\Amazon\Dataset/Reviews.csv')
print(df.shape)
df.head()
df.columns
"""### Data Preparation"""
### add some columns for upvote metrics
df['Helpful %'] = np.where(df['HelpfulnessDenominator'] > 0, df['HelpfulnessNumerator'] / df['HelpfulnessDenominator'], -1)
df.head()
"""#### assigning different different labels to helpful% according to its value"""
df['Helpful %'].unique()
pd.cut(df['Helpful %'] , bins = [-1, 0, 0.2, 0.4, 0.6, 0.8, 1.0], labels = ['Empty', '0-20%', '20-40%', '40-60%', '60-80%', '80-100%'])
df['%upvote'] = pd.cut( df['Helpful %'] , bins = [-1, 0, 0.2, 0.4, 0.6, 0.8, 1.0], labels = ['Empty', '0-20%', '20-40%', '40-60%', '60-80%', '80-100%'])
df.head()
df.groupby(['Score', '%upvote']).agg('count')
"""#### considering only Id Column, as I have to count Total Upvotes for different different categories"""
df.groupby(['Score', '%upvote']).agg({'Id':'count'})
df_s=df.groupby(['Score', '%upvote']).agg({'Id':'count'}).reset_index()
df_s
"""#### create Pivot Table for better conclusion"""
df_s.pivot(index='%upvote',columns='Score')
"""#### create heatmap of it,for better Visualisations"""
import seaborn as sns
import matplotlib.pyplot as plt
sns.heatmap(df_s.pivot(index='%upvote',columns='Score'),annot=True,cmap = 'YlGnBu')
plt.title('How helpful users find among user scores')
"""Key message from above:
Reviews are skewed towards positive
More than half of the reviews are with zero votes
Many people agree with score 5 reviews
"""
df.shape
df.head()
df['Score'].unique()
df2 = df[df['Score'] != 3]
X = df2['Text']
y_dict = {1:0, 2:0, 4:1, 5:1}
y = df2['Score'].map(y_dict)
"""#### Score prediction"""
"""### convert your text into vectors using NLP"""
from sklearn.feature_extraction.text import CountVectorizer
c = CountVectorizer(stop_words = 'english')
##takes almost 2 mins to execute
X_c = c.fit_transform(X)
print('features: {}'.format(X_c.shape[1]))
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_c, y)
print(' train records: {}'.format(X_train.shape[0]))
from sklearn.linear_model import LogisticRegression
log=LogisticRegression()
ml =log.fit(X_train, y_train)
acc = ml.score(X_test, y_test)
print ('Model Accuracy: {}'.format(acc))
"""#### fetch Top 20 Positive & Top 20 negative words"""
w = c.get_feature_names()
w
coef = ml.coef_.tolist()[0]
coef
coeff_df = pd.DataFrame({'Word' : w, 'Coefficient' : coef})
coeff_df
coeff_df = coeff_df.sort_values(['Coefficient', 'Word'],ascending=False)
coeff_df
print('-Top 20 positive-')
print(coeff_df.head(20).to_string(index=False))
print('\n')
print('-Top 20 negative-')
print(coeff_df.tail(20).to_string(index=False))
"""#### lets try to create a function so that I can apply mutliple NLP Techniques + Multiple Ml algos in such a way that I will acheive my best accuracy"""
### create a function
def text_fit(X, y, nlp_model,ml_model,coef_show=1):
X_c = nlp_model.fit_transform(X)
print('features: {}'.format(X_c.shape[1]))
X_train, X_test, y_train, y_test = train_test_split(X_c, y)
print(' train records: {}'.format(X_train.shape[0]))
print(' test records: {}'.format(X_test.shape[0]))
ml =ml_model.fit(X_train, y_train)
acc = ml.score(X_test, y_test)
print ('Model Accuracy: {}'.format(acc))
if coef_show == 1:
w = nlp_model.get_feature_names()
coef = ml.coef_.tolist()[0]
coeff_df = pd.DataFrame({'Word' : w, 'Coefficient' : coef})
coeff_df = coeff_df.sort_values(['Coefficient', 'Word'], ascending=[0, 1])
print('\n')
print('-Top 20 positive-')
print(coeff_df.head(20).to_string(index=False))
print('\n')
print('-Top 20 negative-')
print(coeff_df.tail(20).to_string(index=False))
from sklearn.feature_extraction.text import CountVectorizer
c = CountVectorizer(stop_words = 'english')
from sklearn.linear_model import LogisticRegression
text_fit(X, y, c, LogisticRegression())
"""### Lets define a predict function"""
from sklearn.metrics import confusion_matrix,accuracy_score
def predict(X, y, nlp_model,ml_model):
X_c = nlp_model.fit_transform(X)
print('features: {}'.format(X_c.shape[1]))
X_train, X_test, y_train, y_test = train_test_split(X_c, y)
print(' train records: {}'.format(X_train.shape[0]))
print(' test records: {}'.format(X_test.shape[0]))
ml =ml_model.fit(X_train, y_train)
predictions=ml.predict(X_test)
cm=confusion_matrix(predictions,y_test)
print(cm)
acc=accuracy_score(predictions,y_test)
print(acc)
from sklearn.feature_extraction.text import CountVectorizer
c = CountVectorizer(stop_words = 'english')
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
predict(X,y,c,lr)
"""#### Accuracy is around 93.9% - not bad. However we notice that some of those significant coefficients are not meaningful, e.g. 280mg."""
from sklearn.dummy import DummyClassifier
### calling function for dummy classifier
text_fit(X, y, c, DummyClassifier(),0)
"""#### Logistic regression model on TFIDF"""
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words = 'english')
text_fit(X, y, tfidf, LogisticRegression())
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words = 'english')
predict(X, y, tfidf, LogisticRegression())
"""Accurany is roughly the same - 93.5%. However we notice that the significant words make much more sense now, with higher coefficient magnitude as well!"""
"""#### Upvote prediction
We will be focusing on score 5 reviews, and get rid of comments with neutral votes
"""
data = df[df['Score'] == 5]
data.columns
data2 = data[data['%upvote'].isin(['0-20%', '20-40%', '60-80%', '80-100%'])]
data2.shape
X = data2['Text']
y_dict = {'0-20%': 0, '20-40%': 0, '60-80%': 1, '80-100%': 1}
y = data2['%upvote'].map(y_dict)
print(y.value_counts())
"""#### The target class 'y' is highly skewed , we will observe positive upvotes are too much higher than negative ones.
Let's resample the data to get balanced data:
"""
from sklearn.feature_extraction.text import TfidfVectorizer
tf=TfidfVectorizer()
X_c=tf.fit_transform(X)
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X_c,y,train_size=0.7)
y_test.value_counts()
## RandomOverSampler to handle imbalanced data
from imblearn.over_sampling import RandomOverSampler
os = RandomOverSampler()
X_train_res, y_train_res = os.fit_sample(X_c, y)
X_train_res.shape,y_train_res.shape
from collections import Counter
print('Original dataset shape {}'.format(Counter(y)))
print('Resampled dataset shape {}'.format(Counter(y_train_res)))
from sklearn.linear_model import LogisticRegression
log_class=LogisticRegression()
grid={'C':10.0 **np.arange(-2,3),'penalty':['l1','l2']}
import numpy as np
from sklearn.model_selection import GridSearchCV
clf=GridSearchCV(estimator=log_class,param_grid=grid,cv=5,n_jobs=-1,scoring='f1_macro')
clf.fit(X_train_res,y_train_res)
y_pred=clf.predict(X_test)
print(confusion_matrix(y_test,y_pred))
print(accuracy_score(y_test,y_pred))