# modelGeneration.py
from __future__ import division
import math
import csv
import numpy as np
import gensim
from sklearn import svm
import time
import datetime
import os
import preprocess

REPORT_FILES = ['nlp_data/CleanedBrainsFull.csv', 'nlp_data/CleanedCTPAFull.csv',
                'nlp_data/CleanedPlainabFull.csv', 'nlp_data/CleanedPvabFull.csv']
REPORT_FILES_BRAINS = ['nlp_data/CleanedBrainsFull.csv']
REPORT_FILES_CTPA = ['nlp_data/CleanedCTPAFull.csv']
REPORT_FILES_PLAINAB = ['nlp_data/CleanedPlainabFull.csv']
REPORT_FILES_PVAB = ['nlp_data/CleanedPvabFull.csv']
REPORT_FILES_LABELLED = ['nlp_data/CleanedBrainsLabelled.csv', 'nlp_data/CleanedCTPALabelled.csv',
                         'nlp_data/CleanedPlainabLabelled.csv', 'nlp_data/CleanedPvabLabelled.csv']
REPORT_FILES_LABELLED_BRAINS = ['nlp_data/CleanedBrainsLabelled.csv']
REPORT_FILES_LABELLED_CTPA = ['nlp_data/CleanedCTPALabelled.csv']
REPORT_FILES_LABELLED_PLAINAB = ['nlp_data/CleanedPlainabLabelled.csv']
REPORT_FILES_LABELLED_PVAB = ['nlp_data/CleanedPvabLabelled.csv']
DIAGNOSES = ['Brains', 'CTPA', 'Plainab', 'Pvab']

# builds and saves dictionary and corpus (in BOW form) from the report files
def buildDictionary():
    reports = preprocess.getProcessedReports()
    print("files loaded")

    # build dictionary
    dictionary = gensim.corpora.Dictionary(reports)
    # dictionary.filter_extremes(no_below=3)
    dictionary.save('./model_files/reports.dict')
    print(dictionary)
    print("dictionary finished")

    # build corpus
    corpus = [dictionary.doc2bow(report) for report in reports]
    gensim.corpora.MmCorpus.serialize('./model_files/reports.mm', corpus)
    # print(corpus)
    print("corpus finished")

# NO NEED TO CALL THIS FUNCTION DIRECTLY
# builds and saves the index file used to compute similarity between documents
# input is the corpus file
def build_similarityIndex(corpus):
    index = gensim.similarities.SparseMatrixSimilarity(corpus, num_features=corpus.num_terms)
    index.save('./model_files/reports.index')
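
# A minimal query sketch, assuming buildDictionary() and build_similarityIndex()
# have already run: the index returns one cosine similarity per indexed report
# for a BOW query vector. Function name and tokens are hypothetical.
def _exampleBowQuery():
    dictionary = gensim.corpora.Dictionary.load('./model_files/reports.dict')
    index = gensim.similarities.SparseMatrixSimilarity.load('./model_files/reports.index')
    queryBow = dictionary.doc2bow(['subdural', 'haematoma'])  # hypothetical tokens
    sims = index[queryBow]  # numpy array: one similarity score per report
    print(sims.argsort()[-5:][::-1])  # indices of the 5 most similar reports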

# NO NEED TO CALL THIS FUNCTION DIRECTLY
# applies the Tf-Idf transformation to generate a new model, corpus and index
# input is the corpus file
def transform_tfidf(corpus):
    tfidf_model = gensim.models.TfidfModel(corpus)
    tfidf_model.save('./model_files/reports.tfidf_model')
    newCorpus = tfidf_model[corpus]
    gensim.corpora.MmCorpus.serialize('./model_files/reports_tfidf.mm', newCorpus)
    index = gensim.similarities.SparseMatrixSimilarity(newCorpus, num_features=corpus.num_terms)
    index.save('./model_files/reports_tfidf.index')
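
# Sketch of querying in Tf-Idf space: the query must pass through the same
# saved Tf-Idf model before hitting the Tf-Idf index, otherwise the query and
# the indexed documents live in different spaces. Tokens are hypothetical.
def _exampleTfidfQuery():
    dictionary = gensim.corpora.Dictionary.load('./model_files/reports.dict')
    tfidf_model = gensim.models.TfidfModel.load('./model_files/reports.tfidf_model')
    index = gensim.similarities.SparseMatrixSimilarity.load('./model_files/reports_tfidf.index')
    queryBow = dictionary.doc2bow(['pulmonary', 'embolus'])  # hypothetical tokens
    sims = index[tfidf_model[queryBow]]
    print(sims.argsort()[-5:][::-1])  # indices of the 5 most similar reports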

# NO NEED TO CALL THIS FUNCTION DIRECTLY
# applies the LSI transformation to generate a new model, corpus and index
# inputs are the corpus file and dictionary file
def transform_lsi(corpus, dictionary):
    lsi_model = gensim.models.LsiModel(corpus, id2word=dictionary, num_topics=10)
    lsi_model.save('./model_files/reports.lsi_model')
    newCorpus = lsi_model[corpus]
    gensim.corpora.MmCorpus.serialize('./model_files/reports_lsi.mm', newCorpus)
    index = gensim.similarities.MatrixSimilarity(newCorpus)
    index.save('./model_files/reports_lsi.index')
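
# Sketch of an LSI query. In this pipeline buildModels() trains the LSI model
# on the Tf-Idf corpus, so a raw query is chained BOW -> Tf-Idf -> LSI before
# being compared against the LSI index. Function name and tokens are hypothetical.
def _exampleLsiQuery():
    dictionary = gensim.corpora.Dictionary.load('./model_files/reports.dict')
    tfidf_model = gensim.models.TfidfModel.load('./model_files/reports.tfidf_model')
    lsi_model = gensim.models.LsiModel.load('./model_files/reports.lsi_model')
    index = gensim.similarities.MatrixSimilarity.load('./model_files/reports_lsi.index')
    queryBow = dictionary.doc2bow(['midline', 'shift'])  # hypothetical tokens
    sims = index[lsi_model[tfidf_model[queryBow]]]
    print(sims.argsort()[-5:][::-1])  # indices of the 5 most similar reports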

# NO NEED TO CALL THIS FUNCTION DIRECTLY
# applies the LDA transformation to generate a new model, corpus and index
# inputs are the corpus file and dictionary file
# num_topics tested with 10, 20, 30, 35, 40 and 50
def transform_lda(corpus, dictionary):
    # lda_model = gensim.models.LdaMulticore(corpus, id2word=dictionary, num_topics=30)
    lda_model = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=30)
    lda_model.save('./model_files/reports.lda_model')
    newCorpus = lda_model[corpus]
    gensim.corpora.MmCorpus.serialize('./model_files/reports_lda.mm', newCorpus)
    index = gensim.similarities.MatrixSimilarity(newCorpus)
    index.save('./model_files/reports_lda.index')
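
# Sketch of inspecting the trained topics, e.g. when comparing the num_topics
# settings listed above. show_topics() with its defaults returns a formatted
# topic summary in this gensim API; the function name is hypothetical.
def _exampleLdaTopics():
    lda_model = gensim.models.LdaModel.load('./model_files/reports.lda_model')
    for topic in lda_model.show_topics():
        print(topic)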

# calls the model building and transformation functions to create the model files
# for the BOW, TFIDF and LSI representations
def buildModels():
    # load the dictionary
    dictionary = gensim.corpora.Dictionary.load('./model_files/reports.dict')
    print(dictionary)
    # print(dictionary.token2id)

    # load the corpus
    corpus = gensim.corpora.MmCorpus('./model_files/reports.mm')
    # print(corpus)
    print('Example case report under BOW representation: ')
    print(corpus[200])
    # print(list(corpus))

    # build index for similarity comparison using BOW representation
    build_similarityIndex(corpus)

    # transform model using TFIDF
    transform_tfidf(corpus)
    tfidf_corpus = gensim.corpora.MmCorpus('./model_files/reports_tfidf.mm')
    print('Example case report under Tf-Idf transformation: ')
    print(list(tfidf_corpus)[200])

    # transform model using LSI
    transform_lsi(tfidf_corpus, dictionary)
    lsi_corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')
    # lsi_model.print_topics()
    print('Example case report under LSI transformation: ')
    print(list(lsi_corpus)[200])

    # # transform model using LDA
    # transform_lda(corpus, dictionary)
    # lda_corpus = gensim.corpora.MmCorpus('./model_files/reports_lda.mm')
    # # lda_model.print_topics()
    # print('Example case report under LDA transformation: ')
    # print(list(lda_corpus)[200])
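
# Sketch: buildModels() prints report 200 under each representation; the same
# report can also serve as a similarity query against the BOW index built
# above (random access works because serialize() also writes an .mm.index).
def _exampleSimilarToReport200():
    corpus = gensim.corpora.MmCorpus('./model_files/reports.mm')
    index = gensim.similarities.SparseMatrixSimilarity.load('./model_files/reports.index')
    sims = index[corpus[200]]  # cosine similarity of report 200 to every report
    print(sims.argsort()[-5:][::-1])  # top 5; report 200 itself should rank first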

# function to test the functionality of Word2Vec
def buildWord2VecModel():
    reports = preprocess.getProcessedReports()
    model = gensim.models.Word2Vec(reports, min_count=3)
    model.init_sims(replace=True)
    model.save("./model_files/reports.word2vec_model")
    print(model)
    # model = gensim.models.Word2Vec.load("zzmodel")
    print("----------------------------------similarity test")
    print(model.similarity("head", "brain"))
    print("----------------------------------raw numpy vector of word")
    print(model["age"])
    print("----------------------------------remove outlier")
    print(model.doesnt_match("hours four age".split()))
    print("----------------------------------similar words")
    print(model.most_similar("haem"))
    print("script finished")

# builds and saves the Doc2Vec model of all the processed reports
# doc2vec performs better with dbow than dm
# tested with hidden layer size 100,200,300
def buildDoc2VecModel():
    reports = preprocess.getProcessedReports()

    # construct tagged documents from the reports (one tag per report: its index)
    taggedDocuments = []
    for i in range(len(reports)):
        taggedDocument = gensim.models.doc2vec.TaggedDocument(words=reports[i], tags=[i])
        taggedDocuments.append(taggedDocument)

    # model = gensim.models.Doc2Vec(taggedDocuments)
    model = gensim.models.Doc2Vec(size=300, min_count=5, workers=16, dm=1, dbow_words=1, negative=20)
    model.build_vocab(taggedDocuments)

    # train for 10 epochs, manually decaying the learning rate each pass
    # (one train() call per epoch matches the older gensim API this file uses)
    model.alpha = 0.025  # initial learning rate
    for epoch in range(10):
        print(epoch)
        model.train(taggedDocuments)
        model.alpha -= 0.001
        model.min_alpha = model.alpha
    model.save("./model_files/reports.doc2vec_model")
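
# Sketch: embedding an unseen report with the saved Doc2Vec model via
# infer_vector, then finding its nearest neighbours among the training
# reports. Function name and tokens are hypothetical.
def _exampleDoc2VecInference():
    model = gensim.models.Doc2Vec.load("./model_files/reports.doc2vec_model")
    vector = model.infer_vector(['no', 'acute', 'abnormality'])  # hypothetical tokens
    print(model.docvecs.most_similar([vector], topn=5))  # (tag, cosine) pairs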

# generates an SVM model for classifying reports as either positive or negative
# based on diagnosis
# uses a MmCorpus file
def testClassification():
    threshold = 0.001
    corpus = gensim.corpora.MmCorpus('./model_files/reports_lsi.mm')

    # convert the corpus to a dense numpy matrix, take the transpose and convert it to a list
    corpusList = [list(x) for x in zip(*gensim.matutils.corpus2dense(corpus, corpus.num_terms, dtype=np.float64))]
    # corpusList = [list(x) for x in np.asarray(corpus)[:,:,1]]
    reports = preprocess.getReports()
    numFolds = 5  # number of folds for cross validation

    # create the output directory
    directory = "label_tests/" + datetime.datetime.now().strftime('%m_%d_%H_%M') + "/"
    os.makedirs(directory)

    with open(directory + "labelClassification.csv", 'w') as writeFile:
        writer = csv.writer(writeFile)
        writer.writerow(["score", "output label", "expected label", "report"])

        for j in range(len(REPORT_FILES_LABELLED)):
            writer.writerow("")
            writer.writerow("")
            writer.writerow([DIAGNOSES[j]])

            # fetch corpus and labels
            labelledCorpus = []
            unlabelledCorpus = []
            # the labelled data sit at the start of each diagnosis block in the
            # data set; recover the corpus indices of these labelled examples
            for i in range(preprocess.getNumReports(REPORT_FILES[:j]), preprocess.getNumReports(REPORT_FILES[:j]) + preprocess.getNumReports([REPORT_FILES_LABELLED[j]])):
                labelledCorpus.append(corpusList[i])
            for i in range(preprocess.getNumReports(REPORT_FILES[:j]) + preprocess.getNumReports([REPORT_FILES_LABELLED[j]]), preprocess.getNumReports(REPORT_FILES[:j]) + preprocess.getNumReports([REPORT_FILES[j]])):
                unlabelledCorpus.append(corpusList[i])
            labels = np.asarray(preprocess.getData([REPORT_FILES_LABELLED[j]]))[:, 2]

            ############### THIS CODE BLOCK REMOVES EXCESS NEGATIVE LABELS TO EQUALISE THE CLASS DISTRIBUTION. TO BE REMOVED IN FUTURE.
            count = 0
            deletes = []
            for x in range(len(labels)):
                if (labels[x] == "negative"):
                    count = count + 1
                    deletes.append(x)
                if (count == (len(labels) - (list(labels).count("positive")) * 2)):
                    break
            labelledCorpus = np.delete(labelledCorpus, deletes, axis=0)
            labels = np.delete(labels, deletes)
            ##################

            numData = len(labels)  # size of the labelled data set

            # build classifier
            classifier = svm.SVC(kernel='linear').fit(labelledCorpus, labels)

            # compute output label and corresponding score
            output_test = classifier.predict(unlabelledCorpus)
            output_scores_test = classifier.decision_function(unlabelledCorpus)

            # sort labels and documents by decision-function score
            sortList = list(zip(output_scores_test, output_test, unlabelledCorpus))
            sortList.sort(key=lambda entry: entry[0])
            output_scores_test, output_test, unlabelledCorpus = zip(*sortList)

            # save the low-confidence results (|score| below the threshold) to file
            for r in range(len(unlabelledCorpus)):
                if (abs(output_scores_test[r]) < threshold):
                    reportIdx = corpusList.index(list(unlabelledCorpus[r]))
                    writer.writerow("")
                    writer.writerow([reportIdx, output_scores_test[r], output_test[r]])
                    writer.writerow([reports[reportIdx]])
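
# A minimal driver sketch, assuming the nlp_data files and a ./model_files
# directory already exist: the dictionary and BOW corpus must be built before
# the derived models, and the LSI corpus must exist before testClassification().
if __name__ == '__main__':
    buildDictionary()
    buildModels()
    testClassification()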