dlt_clstm_w2v_v4.py
# -*- coding: utf-8 -*-
"""
Implements a CNN-biLSTM model for text classification using w2v embeddings.
Trains a Bidirectional LSTM on As data.
v2 introduces sequential expansion of the training data with a fixed seed.
v3 implements the fixed validation dataset approach and fixes the numpy seed.
Created on Wed Jan 09 09:14:51 2019
@author: 13963
"""
import csv
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from keras.initializers import Constant
from keras.layers import (LSTM, Bidirectional, Conv1D, Dense, Dropout,
                          Embedding, MaxPooling1D)
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from numpy.random import seed
from sklearn import metrics
from sklearn.metrics import roc_curve

seed(1)  # fix the numpy seed so data handling is repeatable
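# Note: seeding numpy alone does not make Keras/TensorFlow training fully
# deterministic; the backend keeps its own RNG and GPU kernels can introduce
# run-to-run variation, so repeated runs may still differ slightly.
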
if __name__ == "__main__":
    path = os.path.dirname(os.path.abspath(__file__))

    def plot_epoch_loss(history):
        """Plot training/validation loss and accuracy over epochs."""
        loss = history.history['loss']
        val_loss = history.history['val_loss']
        epochs = range(1, len(loss) + 1)
        plt.clf()
        plt.plot(epochs, loss, 'bo', label='Training loss')
        plt.plot(epochs, val_loss, 'b', label='Validation loss')
        plt.title('Training and validation loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        pn = os.path.join(ofp_p, ofn_p + "TVLoss_" + str(X_train.shape[0]) + ".png")
        plt.savefig(pn)
        plt.close()
        plt.clf()
        acc = history.history['acc']
        val_acc = history.history['val_acc']
        plt.plot(epochs, acc, 'bo', label='Training acc')
        plt.plot(epochs, val_acc, 'b', label='Validation acc')
        plt.title('Training and validation accuracy')
        plt.xlabel('Epochs')
        plt.ylabel('Acc')
        plt.legend()
        pn = os.path.join(ofp_p, ofn_p + "TVAccuracy_" + str(X_train.shape[0]) + ".png")
        plt.savefig(pn)
        plt.close()
    def cnnlstm_fit():
        """Train the CNN-biLSTM on the current training slice and score it."""
        start_time = time.time()
        global X_val, X_train, X_test, y_train, y_val, y_test
        print('X_train shape:', X_train.shape)
        print('X_test shape:', X_test.shape)
        y_train = np.array(y_train)
        model = Sequential()
        model.add(Embedding(num_words, EMBEDDING_DIM,
                            embeddings_initializer=Constant(embedding_matrix),
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False))  # frozen pre-trained w2v embeddings
        model.add(Conv1D(128, 5, activation='relu'))
        model.add(MaxPooling1D(pool_size=4))
        model.add(Bidirectional(LSTM(64)))
        model.add(Dropout(0.5))
        model.add(Dense(1, activation='sigmoid'))
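        # Pipeline: frozen w2v embeddings -> Conv1D learns local n-gram features
        # -> max-pooling shortens the sequence 4x -> a bidirectional LSTM reads
        # the pooled features in both directions -> one sigmoid unit outputs the
        # probability of relevance.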
        # try using different optimizers and different optimizer configs
        model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])
        print('Train...')
        history = model.fit(X_train, y_train,
                            batch_size=batch_size,
                            epochs=epochs,
                            validation_data=(X_val, y_val))
        plot_epoch_loss(history)  # plot the loss and accuracy curves
        prob_val = model.predict(X_val, verbose=0)  # probability predictions on the validation data
        prob_val = [float(i) for i in prob_val]  # probability of relevance
        tr_val = [float(i) for i in y_val]  # true validation labels as floats
        fpr, tpr, thresholds = roc_curve(tr_val, prob_val)
        optco = thresholds[np.argmax(tpr > 0.95)]  # optimal probability cutoff
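        # np.argmax over a boolean array returns the index of the first True;
        # since roc_curve returns thresholds in decreasing order, optco is the
        # highest cutoff whose TPR exceeds 0.95, i.e. the most conservative
        # threshold that still recalls at least 95% of relevant validation records.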
        pred_val = [1. if i > optco else 0. for i in prob_val]  # predict 1 if prob > the cutoff from the val data
        pf1 = metrics.f1_score(tr_val, pred_val)  # predicted F1
        ppr = metrics.precision_score(tr_val, pred_val)  # predicted precision
        prec = metrics.recall_score(tr_val, pred_val)  # predicted recall
        proc = metrics.roc_auc_score(tr_val, prob_val)  # predicted ROC AUC, measured on the val data
        p_prre_auc = metrics.average_precision_score(tr_val, prob_val)  # predicted PR AUC
        prob_test = model.predict(X_test, verbose=0)  # probability predictions on the held-out (test) data
        prob_test = [float(i) for i in prob_test]  # probability of relevance
        pred_test = [1. if i > optco else 0. for i in prob_test]  # apply the val-derived cutoff to the test data
        tr_test = [float(i) for i in y_test]  # true test labels as floats
        af1 = metrics.f1_score(tr_test, pred_test)  # actual F1
        apr = metrics.precision_score(tr_test, pred_test)  # actual precision
        arec = metrics.recall_score(tr_test, pred_test)  # actual recall
        aroc = metrics.roc_auc_score(tr_test, prob_test)  # actual ROC AUC
        a_prre_auc = metrics.average_precision_score(tr_test, prob_test)  # actual PR AUC
        ndata = X_train.shape[0]
        t = (time.time() - start_time) / 3600.  # time taken in hours
        r = [ndata, t, pf1, af1, ppr, apr, prec, arec, proc, aroc, p_prre_auc, a_prre_auc]
        print("Time to run CNN_LSTM classification model = --- %s hours ---" % t)
        print(r)
        return r
    ver = "W2V_v3_MoA"
    MAX_SEQUENCE_LENGTH = 1000  # max document length in tokens
    MAX_NUM_WORDS = 20000  # vocabulary size
    EMBEDDING_DIM = 1000  # length of the w2v embedding vectors
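    # Note: EMBEDDING_DIM must match the vector size the loaded w2v model was
    # trained with; the embedding matrix below is filled row-by-row from it.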
    ifp = path  # input file path
    ifn = r"\Data\As_All.csv"  # input filename
    txtcol = 2  # text column (zero-indexed)
    targcol = 4  # target column (zero-indexed)
    ofp = path + r"\Output-CNN_BiLSTM"  # output file path
    if not os.path.isdir(ofp):
        os.makedirs(ofp)
    ofn = r'\CNN_biLSTM_LearningCurve_' + ver + '.csv'  # output filename
    ofp_p = ofp  # output path for the plots
    ofn_p = r'CNN_biLSTM_'  # filename prefix for the plots
    csize = 500  # chunk size for incrementing the training data
    vsize = 500  # validation set size
    tsize = 2500  # test set size
    batch_size = 128  # batch size
    epochs = 10  # number of epochs
    fpm = path + r'\W2V\Models'  # file path to the stored w2v models
    w2vmodel = Word2Vec.load(fpm + r'\abstracts_phrases_dl.model')  # load the w2v model
    voc = set(w2vmodel.wv.vocab.keys())  # model vocabulary, as a set for fast lookup
    print('Found %s word vectors.' % len(voc))
    ifpn = ifp + ifn  # input filepath and name
    df = pd.read_csv(ifpn, header=0)  # read the original data as a dataframe
    df = df.sample(frac=1, replace=False, random_state=45)  # reshuffle with a fixed seed
    print(df.head(3))
    header = list(df.columns.values)  # original data header
    txtcolhd = header[txtcol]  # header of the text column in the input file
    targcolhd = header[targcol]  # header of the target column in the input file
    df_text = df[txtcolhd].tolist()  # text column as a list
    # note: rows whose text cell is not a string are dropped here; if any were,
    # texts and labels would no longer align, so the input is assumed clean
    txt = [x.encode('ascii', 'ignore').decode('ascii')
           for x in df_text if isinstance(x, str)]  # strip non-ascii characters
    df_target = df[targcolhd].values  # target values as a vector
    # second, prepare the text samples and their labels
    print('Processing text dataset')
    texts = txt
    labels = df_target
    print('Found %s texts.' % len(texts))
    # finally, vectorize the text samples into a 2D integer tensor
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
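    # pad_sequences pads and truncates at the start of each sequence by default,
    # so every row of `data` is exactly MAX_SEQUENCE_LENGTH integers long.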
    num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i > MAX_NUM_WORDS:
            continue
        if word in voc:  # the term is in the w2v vocabulary
            embedding_matrix[i] = w2vmodel.wv[word]  # look up its w2v representation
        else:  # the word is absent from the w2v vocabulary
            embedding_matrix[i] = w2vmodel.wv['computer']  # use a dummy word's representation
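    # Caveat: every out-of-vocabulary token shares the single vector of the
    # arbitrary in-vocabulary word 'computer'; leaving such rows at the zero
    # vector (the matrix default) is a common alternative.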
    # The pre-trained embeddings are loaded into the Embedding layer inside
    # cnnlstm_fit(); trainable=False keeps them fixed during training.
    # carve out the validation data
    X_val = data[0:vsize, :]
    y_val = np.array(labels[0:vsize]).astype(int)
    # carve out the test data
    X_test = data[vsize:tsize + vsize, :]
    y_test = np.array(labels[vsize:tsize + vsize]).astype(int)
    X_r = data[tsize + vsize:, :]  # residual X for training
    y_r = labels[tsize + vsize:]  # residual y
    print('Shape of X_test:', X_test.shape)
    print('Shape of X_val:', X_val.shape)
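    # Because the validation and test sets are carved out once up front, every
    # learning-curve point below is scored against the same held-out data.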
    finres = []  # final results list
    for i in range(1, int(X_r.shape[0] / csize) + 1):  # loop over training chunks of growing size
        X_train = X_r[:i * csize, :]
        y_train = y_r[:i * csize]
        print('Shape of X_train:', X_train.shape)
        finres.append(cnnlstm_fit())  # fit the model and append its results
    print(finres)
    header_pr = ["n_Training", "RunTime-Hrs",
                 "Predicted F1-score", "Actual F1-score",
                 "Predicted Precision", "Actual Precision",
                 "Predicted Recall", "Actual Recall",
                 "Predicted ROC-AUC", "Actual ROC-AUC",
                 "Predicted PrRec-AUC", "Actual PrRec-AUC"]
    ofpn = ofp + ofn  # output filepath and name
    with open(ofpn, "w") as output:  # write the results
        writer = csv.writer(output, lineterminator='\n')
        writer.writerow(header_pr)
        writer.writerows(finres)