/
train.py
141 lines (115 loc) · 4.75 KB
/
train.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
from collections import Counter
from pybrain.datasets import ClassificationDataSet
from pybrain.datasets import SupervisedDataSet
from pybrain.utilities import percentError
from pybrain.tools.shortcuts import buildNetwork
from pybrain.supervised.trainers import BackpropTrainer
from pybrain.structure.modules import SoftmaxLayer
from pybrain.tools.validation import ModuleValidator
from pybrain.tools.validation import CrossValidator
from sklearn import preprocessing
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import nltk
from nltk.stem import PorterStemmer
import pickle
# NLTK helpers shared by every call to normalise_review. They are built on
# first use so the stop-word corpus is read only once per run instead of once
# per review.
_NORMALISE_RESOURCES = {}


def _normalise_resources():
    """Return the shared (stemmer, stop-word set) pair, creating it on first use."""
    if not _NORMALISE_RESOURCES:
        _NORMALISE_RESOURCES['stemmer'] = PorterStemmer()
        _NORMALISE_RESOURCES['stops'] = frozenset(stopwords.words("english"))
    return _NORMALISE_RESOURCES['stemmer'], _NORMALISE_RESOURCES['stops']


def normalise_review(raw_review):
    """Reduce a raw review to its stemmed adjectives and adverbs.

    The text is stripped of every character that is not an ASCII letter,
    lower-cased, split on whitespace and filtered against the English
    stop-word list. The remaining words are POS-tagged and only those tagged
    as adjectives or adverbs are kept, each one stemmed.

    Args:
        raw_review: the review text as a string.

    Returns:
        The kept stems joined by single spaces; an empty string when no word
        survives the filtering.
    """
    stemmer, stops = _normalise_resources()
    # Every non-letter becomes a separator before the text is split into words.
    words = re.sub(r"[^a-zA-Z]", " ", raw_review).lower().split()
    meaningful_words = [w for w in words if w not in stops]
    # We consider all the adjectives (JJ*) and adverbs (RB*).
    tag_list = ('JJ', 'JJR', 'JJS', 'RB', 'RBS', 'RBR')
    normalised_data = [
        str(stemmer.stem(word))
        for word, tag in nltk.pos_tag(meaningful_words)
        if tag in tag_list
    ]
    return " ".join(normalised_data)
# Cleaned review texts and their ratings, kept in lockstep (same index = same row).
clean_train_reviews = []
rating_list = []
# Reading the CSV file for reviews; only the two columns we use are loaded.
fields = ['rating', 'review']
reviews = pd.read_csv('reviews.csv', skipinitialspace=True, usecols=fields)
print("Collecting Reviews.")
skipped_rows = 0
for raw_rating, raw_review in zip(reviews.rating, reviews.review):
    # Only the conversions are guarded: a row whose rating is not numeric or
    # whose text cannot be turned into a str is skipped. Anything else (for
    # example missing NLTK data) is a real error and must not be hidden.
    try:
        rating = float(raw_rating)
        review = str(raw_review)
    except (ValueError, TypeError):
        skipped_rows += 1
        continue
    normalised_data = normalise_review(review)
    # Reviews with no adjective/adverb left carry no features; drop them.
    if normalised_data:
        clean_train_reviews.append(normalised_data)
        rating_list.append(rating)
if skipped_rows:
    print("Skipped %d malformed rows." % skipped_rows)
# scikit-learn's bag-of-words tool, configured to tokenise on words and to
# keep at most 1600 vocabulary entries (max_features).
vectorizer = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words=None,
    max_features=1600
)
# fit_transform learns the vocabulary from the cleaned reviews and returns
# their feature vectors; the retained terms are then read back as vocab.
train_data_features = vectorizer.fit_transform(clean_train_reviews)
vocab = vectorizer.get_feature_names()
# Every review becomes one row of term counts with one column per vocab
# entry, in vocab order; only the retained top features are counted.
list_features = []
for cleaned_review in clean_train_reviews:
    term_counts = Counter(nltk.word_tokenize(cleaned_review))
    # A Counter yields 0 for terms that do not occur in this review.
    list_features.append([term_counts[term] for term in vocab])
# We standardise every feature column to zero mean and unit variance.
# (StandardScaler does not squeeze values into a fixed range such as -1 to 1.)
scaler = preprocessing.StandardScaler().fit(list_features)
list_features = np.array(list_features)
# The fitted scaler is stored so the same transformation can be reloaded
# later; the with-block closes the file even if pickling fails.
with open('STDSCALER', 'wb') as fileObject:
    pickle.dump(scaler, fileObject)
list_features = scaler.transform(list_features)
# We now start building our network.
# The input width comes from the feature matrix instead of a hard-coded 1600:
# max_features is only an upper bound, so a small corpus yields fewer columns.
n_features = list_features.shape[1]


def _as_classification_set(samples):
    """Copy each sample's input and target into a new two-class ClassificationDataSet."""
    copied = ClassificationDataSet(n_features, 1, nb_classes=2)
    for n in range(samples.getLength()):
        sample = samples.getSample(n)
        copied.addSample(sample[0], sample[1])
    return copied


alldata = ClassificationDataSet(n_features, 1, nb_classes=2)
for feature_row, label in zip(list_features, rating_list):
    alldata.addSample(feature_row, label)
# Our dataset is divided, with 80% as training data and
# 20% as test data
tstdata_temp, trndata_temp = alldata.splitWithProportion(0.2)
# NOTE(review): the halves are rebuilt sample by sample, presumably because
# splitWithProportion does not hand back ClassificationDataSet objects in the
# PyBrain version used here; confirm before simplifying.
tstdata = _as_classification_set(tstdata_temp)
trndata = _as_classification_set(trndata_temp)
# Switch the targets to one output unit per class.
trndata._convertToOneOfMany()
tstdata._convertToOneOfMany()
# Feed-forward network sized from the training set, with a single hidden
# layer of 120 units and a softmax output layer.
fnn = buildNetwork(trndata.indim, 120, trndata.outdim, outclass=SoftmaxLayer)
trainer = BackpropTrainer(
    fnn,
    dataset=trndata,
    momentum=0.1,
    verbose=True,
    weightdecay=0.00001
)
modval = ModuleValidator()
# We define the number of iterations we want to train our model.
# NOTE(review): the original indentation was lost; the cross-validation step
# is assumed to belong to the loop body because its message reports the loop
# index i. Confirm against the upstream file.
for i in range(100):
    trainer.trainEpochs(1)
    predicted_classes = trainer.testOnClassData(dataset=trndata)
    trnresult = percentError(predicted_classes, trndata['class'])
    print("%s %s %s %s" % ("epoch : ", trainer.totalepochs, " train error: ", trnresult))
    # We validate our model by applying the n-folds technique and check the
    # Mean Square Error
    cv = CrossValidator(trainer, trndata, n_folds=5, valfunc=modval.MSE)
    print("MSE %f at loop %i" % (cv.validate(), i))
# Finally we test our data on the model we built.
# perror is the error percentage on the held-out set, so 100.0 - perror is the
# accuracy; the message now says so instead of calling it an error.
perror = percentError(trainer.testOnClassData(dataset=tstdata), tstdata['class'])
print(" Percent accuracy on test data is -  %s" % (100.0 - perror))
# We also take a dump of the model and the top features.
# Each file is written inside a with-block so it is closed even if pickling
# raises part-way through.
with open('ANNDUMP', 'wb') as fileObject2:
    pickle.dump(fnn, fileObject2)
with open('VOCABDUMP', 'wb') as fObject:
    pickle.dump(vocab, fObject)