# review_classifier.py
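"""Sentiment classifier for Amazon Electronics product reviews.

The script cleans the raw gzipped review dump, trains a gensim Doc2Vec model on a
random sample of reviews, infers a document vector for each train/test review, and
then fits a classifier (logistic regression, random forest, gradient boosting, or a
small TensorFlow feed-forward network) to separate positive (4-5 star) from negative
(1-2 star) reviews.
"""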
import html, re, string, gzip, time, json
from pprint import pprint
import numpy as np
from scipy.stats import randint
import argparse
import multiprocessing
import tensorflow as tf
from gensim import utils
from gensim.models.doc2vec import LabeledSentence
from gensim.models import Doc2Vec
from random import shuffle
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn import cross_validation, grid_search
from sklearn.grid_search import RandomizedSearchCV, GridSearchCV
# random seed
seed = 7
cores = multiprocessing.cpu_count()
regex = re.compile('[%s]' % re.escape(string.punctuation))
class LabeledLineSentence(object):
    # this class was taken from the tutorial at https://linanqiu.github.io/2015/10/07/word2vec-sentiment/
    def __init__(self, sources):
        self.sources = sources
        flipped = {}
        # make sure that the prefixes are unique
        for key, value in sources.items():
            if value not in flipped:
                flipped[value] = [key]
            else:
                raise Exception('Non-unique prefix encountered')

    def __iter__(self):
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    yield LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no])

    def to_array(self):
        self.sentences = []
        for source, prefix in self.sources.items():
            with utils.smart_open(source) as fin:
                for item_no, line in enumerate(fin):
                    self.sentences.append(LabeledSentence(utils.to_unicode(line).split(), [prefix + '_%s' % item_no]))
        return self.sentences

    def sentences_perm(self):
        shuffle(self.sentences)
        return self.sentences
# a quick generator that parses the gzipped datafile; each line evaluates to a dict with (at least) 'reviewText' and 'overall' keys
def parse(path):
    with gzip.open(path, 'r') as g:
        for l in g:
            yield eval(l)
# this was only a test function used to debug
def test_parse():
    for i, review in enumerate(parse('reviews_Electronics_5.json.gz')):
        print(int(review['overall']))
        print(type(review['overall']))
        time.sleep(5)
def create_json():
    # here, I create a full json file where the review texts have been cleaned (no punctuation, lower case)
    f = open("reviews_strict.json", 'w')
    for l in parse('reviews_Electronics_5.json.gz'):
        l['reviewText'] = regex.sub('', html.unescape(l['reviewText'])).lower()
        l = json.dumps(l)
        f.write(l + '\n')
    f.close()
def review_lines(train_size, test_size):
    # here I create train and test files where each line is a cleaned review, together with the matching target files
    # in order to get this done in finite time, only the first O(10^5)-O(10^6) reviews are scanned,
    # and random documents are picked from that part of the corpus
    f = open("train.txt", 'w')
    g = open("test.txt", 'w')
    h = open("train_target.txt", 'w')
    q = open("test_target.txt", 'w')
    limit = 300000
    shuffle_indices = np.random.permutation(np.arange(limit))
    for i, review in enumerate(parse('reviews_Electronics_5.json.gz')):
        if i in shuffle_indices[:train_size]:
            if int(review['overall']) == 5 or int(review['overall']) == 4:
                review = regex.sub('', html.unescape(review['reviewText'])).lower()
                f.write(review + '\n')
                h.write('1' + '\n')
            elif int(review['overall']) == 1 or int(review['overall']) == 2:
                review = regex.sub('', html.unescape(review['reviewText'])).lower()
                f.write(review + '\n')
                h.write('0' + '\n')
        elif i in shuffle_indices[train_size:train_size + test_size]:
            if int(review['overall']) == 5 or int(review['overall']) == 4:
                review = regex.sub('', html.unescape(review['reviewText'])).lower()
                g.write(review + '\n')
                q.write('1' + '\n')
            elif int(review['overall']) == 1 or int(review['overall']) == 2:
                review = regex.sub('', html.unescape(review['reviewText'])).lower()
                g.write(review + '\n')
                q.write('0' + '\n')
        elif i > limit:
            break
    f.close()
    g.close()
    h.close()
    q.close()
    print('Completed generating test and training data...')
def d2v_source(train_size):
    # here I create a document where each line is a review, used to train Doc2Vec
    # in order to get this done in finite time, only the first O(10^5)-O(10^6) reviews are scanned; this can be increased to improve performance
    f = open("d2v_train.txt", 'w')
    limit = 300000
    shuffle_indices = np.random.permutation(np.arange(limit))
    for i, review in enumerate(parse('reviews_Electronics_5.json.gz')):
        if i in shuffle_indices[:train_size]:
            if int(review['overall']) == 5 or int(review['overall']) == 4:
                review = regex.sub('', html.unescape(review['reviewText'])).lower()
                f.write(review + '\n')
            elif int(review['overall']) == 1 or int(review['overall']) == 2:
                review = regex.sub('', html.unescape(review['reviewText'])).lower()
                f.write(review + '\n')
        elif i > limit:
            break
    f.close()
def create_doc2vec_model(vectorsize):
    # this creates the Doc2Vec model from the d2v_train.txt corpus
    sources = {'d2v_train.txt': 'TRAIN'}  # ,'test.txt':'TEST' }
    sentences = LabeledLineSentence(sources)
    model = Doc2Vec(min_count=1, window=10, size=vectorsize, sample=1e-4, negative=5, workers=cores, alpha=0.025, min_alpha=0.025)
    model.build_vocab(sentences.to_array())
    print('Starting to train...')
    for epoch in range(10):
        print('Epoch ', epoch)
        model.train(sentences.sentences_perm())  # reshuffle every epoch so that SGD (stochastic gradient descent) can meaningfully converge
    model.save('./amzn.d2v')
    return model
    # print(model.most_similar('good'))
def transform_input(vectorsize):
    # this loads the premade model saved as amzn.d2v and writes its vectors into arrays that can be fed into the scikit-learn algorithms
    print('Loading Doc2Vec model...')
    try:
        model = Doc2Vec.load('./amzn.d2v')
    except Exception as exception:
        print('No existing model found. Starting to create a model...')
        train_size = 50000
        d2v_source(train_size)
        model = create_doc2vec_model(vectorsize)
    # load or generate train and test data
    try:
        with open('train.txt') as f:
            train_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('test.txt') as f:
            test_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('train_target.txt') as f:
            target = np.asarray([int(line.rstrip('\n')) for line in f])
        with open('test_target.txt') as f:
            target_test = np.asarray([int(line.rstrip('\n')) for line in f])
    except Exception as exception:
        print('No train data found. Generating new train and test files....')
        train_size = 50000
        test_size = 20000
        review_lines(train_size, test_size)
        with open('train.txt') as f:
            train_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('test.txt') as f:
            test_raw = np.asarray([line.rstrip('\n') for line in f])
        with open('train_target.txt') as f:
            target = np.asarray([int(line.rstrip('\n')) for line in f])
        with open('test_target.txt') as f:
            target_test = np.asarray([int(line.rstrip('\n')) for line in f])
    # infer vectors for the reviews of the train and test sets
    # this is done by splitting each review into a list of tokens and converting that into a vector
    # this takes forever... so the vectors are only inferred for new train and test sets and then saved to disk
    try:
        train_arrays = np.loadtxt('train_vectors.txt')
        test_arrays = np.loadtxt('test_vectors.txt')
    except Exception as exception:
        train_arrays = np.zeros((target.shape[0], vectorsize))
        test_arrays = np.zeros((target_test.shape[0], vectorsize))
        print('Vectorizing the train and test data...')
        for i in range(target.shape[0]):
            train_arrays[i, :] = model.infer_vector(train_raw[i].split())
        for i in range(target_test.shape[0]):
            test_arrays[i, :] = model.infer_vector(test_raw[i].split())
        np.savetxt('train_vectors.txt', train_arrays)
        np.savetxt('test_vectors.txt', test_arrays)
    return train_arrays, target, test_arrays, target_test
def train_ML_model(M_var, vectorsize):
    # logistic regression is the default machine learning algorithm; a random forest classifier, a gradient boosting
    # classifier, or a neural network can be chosen instead with the -M flag
    train, target, test, target_test = transform_input(vectorsize)
    if M_var == 'lr':
        classifier = LogisticRegression()
        print('Logistic regression chosen...')
    if M_var == 'rf':
        classifier = RandomForestClassifier(n_estimators=100, n_jobs=2, oob_score=True, min_samples_split=3, min_samples_leaf=4)
        print('Random Forest chosen...')
    if M_var == 'gb':
        classifier = GradientBoostingClassifier(n_estimators=200, learning_rate=0.6, subsample=0.9, max_depth=1, random_state=0)
        print('Gradient Boosting chosen...')
    if M_var == 'nn':
        print('Neural network chosen...')
        neural_network(train, target, test, target_test, vectorsize)
    if M_var in ['lr', 'rf', 'gb']:
        classifier.fit(train, target)
        print('The model has a %s test score' % classifier.score(test, target_test))
def batch_iter(input_data, target, batch_size, shuffle=True):
    # generates an iterator over batches of the training data
    data_size = np.shape(input_data)[0]
    num_batches_per_epoch = int(data_size / batch_size) + 1
    if shuffle:
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = input_data[shuffle_indices]
        shuffled_target = target[shuffle_indices]
    else:
        shuffled_data = input_data
        shuffled_target = target
    for batch_num in range(num_batches_per_epoch):
        start_index = batch_num * batch_size
        end_index = min((batch_num + 1) * batch_size, data_size)
        yield shuffled_data[start_index:end_index], shuffled_target[start_index:end_index]
def create_neural_network_model(data, keep_hidden, vectorsize):
    # a feed-forward network with one active hidden layer (a second hidden layer is left commented out);
    # the input dimension depends on the size of the doc2vec vectors
    n_input = vectorsize
    n_hidden = int(1.5 * vectorsize)
    n_hidden2 = int(1.5 * vectorsize)
    hidden_layer1 = {'weights': tf.Variable(tf.random_normal([n_input, n_hidden])), 'biases': tf.Variable(tf.random_normal([n_hidden]))}
    # hidden_layer2 = {'weights': tf.Variable(tf.random_normal([n_hidden, n_hidden2])), 'biases': tf.Variable(tf.random_normal([n_hidden2]))}
    output_layer = {'weights': tf.Variable(tf.random_normal([n_hidden, 2])), 'biases': tf.Variable(tf.random_normal([2]))}
    layer1 = tf.add(tf.matmul(data, hidden_layer1['weights']), hidden_layer1['biases'])
    layer1 = tf.nn.relu(layer1)
    layer1 = tf.nn.dropout(layer1, keep_hidden)
    # layer2 = tf.add(tf.matmul(layer1, hidden_layer2['weights']), hidden_layer2['biases'])
    # layer2 = tf.nn.relu(layer2)
    # layer2 = tf.nn.dropout(layer2, keep_hidden)
    output = tf.add(tf.matmul(layer1, output_layer['weights']), output_layer['biases'])
    return output
def neural_network(train, target, test, target_test, vectorsize):
    # this initializes and trains the neural network
    x = tf.placeholder('float', [None, vectorsize])  # data
    y = tf.placeholder(dtype=tf.int64)
    keep_hidden = tf.placeholder('float')
    prediction = create_neural_network_model(x, keep_hidden, vectorsize)
    cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=prediction, labels=y))
    optimizer = tf.train.AdamOptimizer().minimize(cost)
    n_epochs = 20
    batch_size = 100
    with tf.Session() as sess:
        sess.run(tf.initialize_all_variables())
        for epoch in range(n_epochs):
            batches = batch_iter(train, target, batch_size)
            epoch_loss = 0
            for batch in batches:
                x_batch, y_batch = batch
                _, c = sess.run([optimizer, cost], feed_dict={x: x_batch, y: y_batch, keep_hidden: 0.7})
                epoch_loss += c
            print('Epoch', (epoch + 1), 'completed out of', n_epochs, 'loss:', "%.3f" % epoch_loss)
        correct = tf.equal(tf.argmax(prediction, 1), y)
        accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
        print('Accuracy', accuracy.eval({x: test, y: target_test, keep_hidden: 1.0}))
def main():
    # size of the Doc2Vec (document) vectors
    vectorsize = 200
    # number of documents that are used to create the Doc2Vec model
    train_size = 50000
    parser = argparse.ArgumentParser()
    parser.add_argument("-M", dest="M_input", help="ML method", type=str, choices=['lr', 'rf', 'gb', 'nn'])
    # note: because of type=bool, any non-empty value passed to -r enables the rebuild (e.g. "-r True")
    parser.add_argument("-r", default=False, dest="remake", help='create new Doc2Vec model, default: no', type=bool, choices=[True])
    args = parser.parse_args()
    if args.M_input:
        M_var = args.M_input
    else:
        M_var = 'lr'
    if args.remake:
        remake_doc2vec = bool(args.remake)
    else:
        remake_doc2vec = False
    if remake_doc2vec:
        print('Generating a new Doc2Vec model...')
        d2v_source(train_size)
        create_doc2vec_model(vectorsize)
    train_ML_model(M_var, vectorsize)


if __name__ == '__main__':
    main()
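
# Example usage, based on the argparse options above (assumes the Amazon Electronics
# review dump reviews_Electronics_5.json.gz is in the working directory):
#   python review_classifier.py                 # Doc2Vec features + logistic regression (default)
#   python review_classifier.py -M nn           # Doc2Vec features + the TensorFlow feed-forward network
#   python review_classifier.py -M gb -r True   # rebuild the Doc2Vec model first, then gradient boosting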