forked from RockStarCoders/alienMarkovNetworks
/
LRClassifier.py
520 lines (416 loc) · 20.7 KB
/
LRClassifier.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
import os
import sys
import datetime
import numpy as np
from cStringIO import StringIO
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
import pickle
import pomio, FeatureGenerator
import matplotlib.pyplot as plt
import matplotlib
import NeuralNet
import sklearn.ensemble
def trainLogisticRegressionModel(
    featureData, labels, Cvalue, outputClassifierFile, scaleData=True, requireAllClasses=True
):
    """Train an L1-regularised logistic regression classifier on pixel features.

    featureData:          (numPixels, numFeatures) numpy array.
    labels:               1d numpy array of length numPixels.
    Cvalue:               inverse regularisation strength passed to sklearn.
    outputClassifierFile: path the fitted classifier is pickled to.
    scaleData:            if True, standardise features to zero mean / unit variance.
    requireAllClasses:    if True, assert the training labels cover all classes.

    Returns the fitted sklearn LogisticRegression object.
    See http://scikit-learn.org/dev/modules/generated/sklearn.linear_model.LogisticRegression.html
    """
    numTrainDataPoints = np.shape(featureData)[0]
    numDataLabels = np.size(labels)
    assert np.size(np.shape(labels)) == 1, \
        ("Labels should be a 1d array. Shape of labels = " + str(np.shape(labels)))
    assert numTrainDataPoints == numDataLabels, \
        ("The length of the feature and label data arrays must be equal. Num data points=" +
         str(numTrainDataPoints) + ", labels=" + str(numDataLabels))
    classLabels = np.unique(labels)
    assert not requireAllClasses or \
        (np.size(classLabels) == pomio.getNumClasses() or
         np.size(classLabels) == pomio.getNumLabels()), \
        "Training data does not contains all classes::\n\t" + str(classLabels)
    if scaleData:
        featureData = preprocessing.scale(featureData)
    # BUGFIX of stale comment: this deliberately uses penalty='l1' (sparse
    # weights), NOT sklearn's default 'l2'; remaining params are defaults.
    lrc = LogisticRegression(penalty='l1', dual=False, tol=0.0001, C=Cvalue,
                             fit_intercept=True, intercept_scaling=1)
    lrc.fit(featureData, labels)
    # Persist the fitted model so it can be reloaded with loadClassifier().
    pickleObject(lrc, outputClassifierFile)
    print("LogisticRegression classifier saved to " + str(outputClassifierFile))
    return lrc
def testClassifier(classifier, testFeatureData, testLabels, resultsFile, scaleData=True):
# predict on testFeatures, compare to testClassLabels, return the results
accuracy = classifier.score(testFeatureData, testLabels)
return accuracy
def crossValidation_Cparam(trainingData, validationData, classifierBaseFile, classifierBaseTestOutputFile, C_min, C_max, C_increment):
    """Train and validate LR classifiers over a range of C regularisation values.

    trainingData / validationData:  [featureArray, labelArray] pairs.
    classifierBaseFile:             base path for pickled classifiers (one per C).
    classifierBaseTestOutputFile:   base path for per-C result files.
    C_min, C_max, C_increment:      sweep definition; if C_min == C_max a single
                                    classifier is trained at that C.

    Returns a numpy array: [C, validationAccuracy] for the single-C case, else
    one row [C, trainAccuracy, validationAccuracy] per C value swept.
    """
    assert C_min <= C_max, "C_min param value should be less than or equal to C_max param value."
    assert C_increment != 0, "You specified min and max for C param; C_increment value must NOT be 0"
    # BUGFIX: the original failure message concatenated an int with a str and
    # would itself raise TypeError if the assert ever fired.
    assert np.size(np.shape(trainingData)) == 2 and np.size(np.shape(validationData)) == 2, \
        ("Training data and test data must be lists of length 2 (first element "
         "the feature array, second the label array).")
    # Unpack training data and sanity-check feature/label counts.
    trainFeatureData = trainingData[0]
    trainLabels = trainingData[1]
    assert np.shape(trainFeatureData)[0] == np.size(trainLabels), \
        "TRAIN data error:: Number of feature vectors must equal number of labels."
    # Unpack validation data likewise.
    validationFeatureData = validationData[0]
    validationLabels = validationData[1]
    assert np.shape(validationFeatureData)[0] == np.size(validationLabels), \
        "VALIDATION data error:: Number of feature vectors must equal number of labels."
    print("Now training classifiers for specified C params::")
    cvResult = None
    if C_min == C_max:
        # Single C value: train once and report validation accuracy.
        if C_min <= 0:
            # sklearn requires C > 0; nudge to a tiny positive value.
            print("C-param is 0 - must be greater than 0. Added 10^-6 to give 0.000001 as first param value")
            C_min = 10 ** (-6)
        print("\nTraining classifier C=" + str(C_min))
        # BUGFIX: filename now uses the adjusted C_min, and the classifier is
        # trained with C_min rather than a hard-coded C=0.1.
        classifierName = classifierBaseFile + "_" + str(C_min) + ".pkl"
        classifier = trainLogisticRegressionModel(trainFeatureData, trainLabels, C_min, classifierName, True)
        meanAccuracy = testClassifier(classifier, validationFeatureData, validationLabels,
                                      classifierBaseTestOutputFile + "_" + str(C_min) + ".csv", True)
        cvResult = np.array([C_min, meanAccuracy])
        return cvResult
    else:
        # Sweep C values, persisting each classifier and accumulating results.
        # BUGFIX: the sweep now starts at C_min instead of always at 0.
        Crange = np.arange(C_min, C_max + C_increment, C_increment)
        print("Training classifiers with C params in range: " + str(Crange))
        for C_param in Crange:
            if C_param <= 0:
                print("Min C_param is 0 - must be greater than 0. Added 10^-6 to give 0.000001 as first param value")
                C_param = 10 ** (-6)
            print("\nTraining classifier C=" + str(C_param))
            classifier = trainLogisticRegressionModel(trainFeatureData, trainLabels, C_param,
                                                      classifierBaseFile + "_" + str(C_param) + ".pkl", True)
            trainAccuracy = classifier.score(trainFeatureData, trainLabels)
            print("Classifier trained, now using scikit learn test function, trainingAccuracy = " +
                  str(trainAccuracy * 100) + " %")
            cvAccuracy = testClassifier(classifier, validationFeatureData, validationLabels,
                                        classifierBaseTestOutputFile + str(C_param) + "_" + ".csv", True)
            print("LR classifier, C_para = " + str(C_param) + " cv_accuracy = " + str(cvAccuracy * 100) + " %")
            # BUGFIX: comparing a numpy array to None with == is elementwise
            # and ambiguous in a boolean context; use "is None".
            if cvResult is None:
                cvResult = np.array([C_param, trainAccuracy, cvAccuracy])
            else:
                cvResult = np.vstack([cvResult, np.array([C_param, trainAccuracy, cvAccuracy])])
        return cvResult
#
# Prediction util methods
#
def generateImagePredictionClassDist(rgbImage, classifier, requireAllClasses=True):
    """Predict a per-pixel class probability distribution for an RGB image.

    rgbImage:   (rows, cols, 3) numpy array.
    classifier: fitted sklearn-style classifier exposing predict / predict_proba.
    requireAllClasses: if True, assert the classifier outputs all known classes.

    Returns an (rows, cols, N) numpy array, N = number of classes the
    classifier predicts over, for use in subsequent modelling.
    """
    # TODO Broaden to cope with more classifiers :)
    # BUGFIX: removed the dead local "testClassifier = None", which shadowed
    # the module-level testClassifier() function and was never used.
    nbRows, nbCols = rgbImage[:, :, 0].shape
    # Build per-pixel features and predict a class distribution for each pixel.
    imagePixelFeatures = FeatureGenerator.generatePixelFeaturesForImage(rgbImage)
    predictedPixelLabels = classifier.predict(imagePixelFeatures)
    predictionProbs = classifier.predict_proba(imagePixelFeatures)
    print("\nShape of predicted labels:: " + str(np.shape(predictedPixelLabels)))
    print("\nShape of prediction probs:: " + str(np.shape(predictionProbs)))
    numClasses = pomio.getNumClasses()
    assert not requireAllClasses or \
        (np.shape(predictionProbs)[1] == numClasses or
         np.shape(predictionProbs)[1] == numClasses + 1), \
        "Classifer prediction does not match all classes (23 or 24):: " + \
        str(np.shape(predictionProbs)[1])
    print(predictionProbs)
    # Reshape row-major: (numPixels, N) -> (rows, cols, N).
    print('reshaping to  ' + str((nbCols, nbRows, predictionProbs.shape[1])))
    predictionProbs = np.reshape(predictionProbs, (nbRows, nbCols, predictionProbs.shape[1]))
    return predictionProbs
#
# File IO util functions
#
def persistLabeledDataToFile(features, labels, baseFileLocation, persistenceType):
    """Persist a (features, labels) pair using the requested serialisation.

    persistenceType selects the backend: "csv", "numpy" or "pickle";
    baseFileLocation is the path prefix handed to the chosen writer.
    """
    assert persistenceType in ("numpy", "pickle", "csv"), \
        "persistenceType must be string value \"pickle\" or \"csv\""
    if persistenceType == "csv":
        writeLabeledDataToCSVFile(features, labels, baseFileLocation)
    elif persistenceType == "numpy":
        saveLabeledDataToNumpyFile(features, labels, baseFileLocation)
    else:
        # "pickle": store both arrays together as a single two-element list.
        pickleObject([features, labels], baseFileLocation)
#
# CSV file utils
#
def writeLabeledDataToCSVFile(features, labels, baseFileLocation):
    """Append features and labels to the CSV pair <base>Data.csv / <base>Labels.csv."""
    dataFile = str(baseFileLocation + "Data.csv")
    labelFile = str(baseFileLocation + "Labels.csv")
    print("Saving image features and labels to::\n\t " + dataFile + " \n\t " + labelFile)
    # Each helper derives its own filename from the same base location.
    writeFeaturesToCsv(features, baseFileLocation)
    writeLabelsToCsv(labels, baseFileLocation)
def writeFeaturesToCsv(features, baseCsvFilename):
    """Append a feature array to <baseCsvFilename>Data.csv, one row per vector.

    Creates the file if it does not exist. Values are comma-delimited with 8
    decimal places; read back with np.loadtxt(..., delimiter=',').
    """
    featuresFile = str(baseCsvFilename + "Data.csv")
    # BUGFIX: context-managed handle (the old open/write/close leaked the file
    # on exception); savetxt writes straight to the handle, so the StringIO
    # staging buffer was unnecessary.
    with open(featuresFile, 'a') as dataFile:
        np.savetxt(dataFile, features, fmt='%.08f', delimiter=',')
def writeLabelsToCsv(labels, baseCsvFilename):
    """Append integer class labels to <baseCsvFilename>Labels.csv, one per row.

    Creates the file if it does not exist; labels are cast to int before
    writing, matching the original behaviour.
    """
    labelFile = str(baseCsvFilename + "Labels.csv")
    # BUGFIX: context-managed handle (previously leaked if savetxt raised);
    # write directly rather than staging through a StringIO buffer.
    with open(labelFile, 'a') as dataFile:
        np.savetxt(dataFile, labels.astype('int'), fmt='%d', delimiter=',')
def readLabeledDataFromCsv(baseFilename):
    """Load the <base>Data.csv / <base>Labels.csv pair written by the CSV writers.

    Returns [featureArray, labelArray], both as float numpy arrays.
    """
    return [
        np.loadtxt(baseFilename + "Data.csv", dtype='float', delimiter=','),
        np.loadtxt(baseFilename + "Labels.csv", dtype='float', delimiter=','),
    ]
# Numpy serialisation utils
def saveLabeledDataToNumpyFile(features, labels, baseFileLocation):
    """Persist features and labels as <base>Data.npy and <base>Labels.npy."""
    # Each array goes to its own .npy file derived from the base path.
    saveNumpyData(features, str(baseFileLocation + "Data.npy"))
    saveNumpyData(labels, str(baseFileLocation + "Labels.npy"))
def saveNumpyData(data, filename):
    """Serialise *data* to *filename* in NumPy .npy binary format."""
    np.save(filename, data)
def readLabeledNumpyData(baseFileLocation):
    """Load the <base>Data.npy / <base>Labels.npy pair as [features, labels]."""
    return [loadNumpyData(str(baseFileLocation + "Data.npy")),
            loadNumpyData(str(baseFileLocation + "Labels.npy"))]
def loadNumpyData(filename):
    """Load and return a numpy array from a .npy file.

    BUGFIX: previously the (binary) .npy file was opened in text mode 'r' and
    the handle was never closed; np.load accepts a path directly and manages
    the file itself.
    """
    return np.load(filename)
# Classifier IO utils
def pickleObject(obj, filename):
    """Pickle *obj* to *filename*, appending a .pkl extension if missing.

    Uses binary pickle protocol 1 (the old `True` third argument), matching
    the on-disk format produced before this fix.
    """
    # Normalise the filename first so there is a single write path
    # (the original duplicated the open/dump/close in both branches).
    if not filename.endswith(".pkl"):
        print("Input filename did not end in .pkl - adding .pkl to filename.")
        filename = str(filename) + ".pkl"
    # BUGFIX: open in binary mode ('wb') - pickle output is binary and a
    # text-mode handle corrupts it on some platforms; `with` guarantees the
    # handle is closed even if dump raises.
    with open(filename, "wb") as f:
        pickle.dump(obj, f, 1)
def loadObject(filename):
    """Unpickle and return the object stored in *filename*.

    Appends a .pkl extension when the name does not already carry one,
    mirroring pickleObject.
    """
    # Normalise the filename, then use a single context-managed read path
    # (the original duplicated open/load/close in both branches and never
    # used try/finally, leaking the handle if load raised).
    if not filename.endswith('.pkl'):
        print("Input filename did not end in .pkl - trying filename with type appended....")
        filename = str(filename) + ".pkl"
    with open(filename, "rb") as f:
        return pickle.load(f)
def loadClassifier(fullFilename):
    """Convenience alias: unpickle a previously saved classifier from disk."""
    return loadObject(fullFilename)
def scaleInputData(inputFeatureData):
    """Standardise a numeric feature array to zero mean / unit variance per column.

    Assumes a 2d numeric numpy array: [[data...], [data...], ...].
    """
    asFloat = inputFeatureData.astype('float')
    return preprocessing.scale(asFloat)
def makedear(base, dirname):
    """Create the directory base + dirname if it does not already exist.

    (The name is historical; behaves like a safe single-level mkdir.)
    """
    fn = base + dirname
    # BUGFIX: EAFP instead of isdir-then-mkdir, which raced with concurrent
    # creation. An already-existing directory is not an error; anything else
    # (e.g. the path exists as a file, or permission denied) still raises.
    try:
        os.mkdir(fn)
    except OSError:
        if not os.path.isdir(fn):
            raise
if __name__ == "__main__":
    #
    # Simple runtime tests
    #
    # TODO look at sklearn pipeline to get some automation here
    #
    # Usage:
    #
    # LRClassifier.py <MSRCDataPath> <outputDir> <subsetType=1(20 imgs),2(1 img),3(all imgs)> <examplesPerImage> <nbToTrainOn>
    #
    #
    # PARAMETERS
    #
    # Number of pixels per image for which to make a feature vector.
    examplesPerImage = int(sys.argv[4])
    # Cap on the number of pixel examples used to fit the classifier.
    nbToTrainOn = int(sys.argv[5])
    # Feature toggles: 0 disables the validation / test splits below.
    doVal = 0
    doTest = 0
    msrcData = sys.argv[1] #"/home/amb/dev/mrf/data/MSRC_ObjCategImageDatabase_v2"
    timestamp = str(datetime.datetime.now())
    # Output file resources
    outDir = sys.argv[2]
    trainingPixelBaseFilename = outDir + "/training/trainPixelFeature"
    validationPixelBaseFilename = outDir + "/crossValidation/data/cvPixelFeature"
    validationResultsBaseFilename = outDir + "/crossValidation/results/logRegClassifier_CrossValidResult"
    testingPixelBaseFilename = outDir + "/testing/data/testPixelFeature"
    testingResultsBaseFilename = outDir + "/testing/results/testPixelFeature"
    classifierBaseFilename = outDir + "/classifierModels/randForestClassifier"
    # Build the output directory tree (each call is a no-op if it exists).
    makedear( outDir, "" )
    makedear( outDir, "/training" )
    makedear( outDir, "/crossValidation" )
    makedear( outDir, "/crossValidation/data" )
    makedear( outDir, "/crossValidation/results" )
    makedear( outDir, "/testing" )
    makedear( outDir, "/testing/data" )
    makedear( outDir, "/testing/results" )
    makedear( outDir, "/classifierModels" )
    subsetType = int(sys.argv[3])
    # Load dem images
    # subsetType 1 = a fixed 40-image subset, 2 = a single image, else all.
    if subsetType == 1:
        msrcImages = pomio.msrc_loadImages(msrcData, [\
            'Images/10_1_s.bmp',\
            'Images/10_2_s.bmp',\
            'Images/11_1_s.bmp',\
            'Images/11_2_s.bmp',\
            'Images/12_1_s.bmp',\
            'Images/12_2_s.bmp',\
            'Images/13_1_s.bmp',\
            'Images/13_2_s.bmp',\
            'Images/14_1_s.bmp',\
            'Images/14_2_s.bmp',\
            'Images/15_1_s.bmp',\
            'Images/15_2_s.bmp',\
            'Images/16_1_s.bmp',\
            'Images/16_2_s.bmp',\
            'Images/17_1_s.bmp',\
            'Images/17_2_s.bmp',\
            'Images/18_1_s.bmp',\
            'Images/18_2_s.bmp',\
            'Images/19_1_s.bmp',\
            'Images/19_2_s.bmp',\
            'Images/1_1_s.bmp',\
            'Images/1_2_s.bmp',\
            'Images/20_1_s.bmp',\
            'Images/20_2_s.bmp',\
            'Images/2_1_s.bmp',\
            'Images/2_2_s.bmp',\
            'Images/3_1_s.bmp',\
            'Images/3_2_s.bmp',\
            'Images/4_1_s.bmp',\
            'Images/4_2_s.bmp',\
            'Images/5_1_s.bmp',\
            'Images/5_2_s.bmp',\
            'Images/6_1_s.bmp',\
            'Images/6_2_s.bmp',\
            'Images/7_1_s.bmp',\
            'Images/7_2_s.bmp',\
            'Images/8_1_s.bmp',\
            'Images/8_2_s.bmp',\
            'Images/9_1_s.bmp',\
            'Images/9_2_s.bmp'])
    elif subsetType == 2:
        msrcImages = pomio.msrc_loadImages(msrcData, ['Images/7_3_s.bmp'] )
    else:
        # Load all images
        msrcImages = pomio.msrc_loadImages(msrcData)
    # Optionally carve out validation/test splits and featurise them.
    if doVal or doTest:
        scale = 0.1
        # Generate data from images and save to file
        print "\nProcessing " + str(scale*100) + \
            "% of MSRC data on a 60/20/20 split serialised for easier file IO"
        splitData = pomio.splitInputDataset_msrcData(
            msrcImages,
            datasetScale=scale,
            keepClassDistForTraining=True,
            trainSplit=0.6,
            validationSplit=0.2,
            testSplit=0.2
            )
        validationDataset = splitData[1]
        testDataset = splitData[2]
        if doVal:
            print "Processing validation data::"
            validationData = FeatureGenerator.processLabeledImageData(validationDataset, ignoreVoid=True)
        if doTest:
            print "Processing test data::"
            testingData = FeatureGenerator.processLabeledImageData(testDataset, ignoreVoid=True)
    else:
        # Just training data
        splitData = [msrcImages,None,None]
    # prepare training data
    trainDataset = splitData[0]
    # Concatenate the per-image label vectors into one flat label array.
    trainLabels = None
    for idx in range(0, np.size(trainDataset)):
        # NOTE(review): comparing to None with == breaks once trainLabels is a
        # numpy array (becomes an elementwise comparison); should be
        # "trainLabels is None" - confirm and fix.
        if trainLabels == None:
            trainLabels = FeatureGenerator.reshapeImageLabels(trainDataset[idx])
        else:
            trainLabels = np.append( trainLabels , FeatureGenerator.reshapeImageLabels(trainDataset[idx]) )
    print "\nProcessing training data::"
    trainingData = FeatureGenerator.processLabeledImageData( trainDataset, ignoreVoid = True, nbPerImage = examplesPerImage)
    if doVal:
        # cross-validation on C param
        print "\nNow using validation data set to evaluate different C param values @" , datetime.datetime.now()
        C_min = 0
        C_max = 1.0
        C_increment = 0.5
        cvResult = crossValidation_Cparam(trainingData, validationData, classifierBaseFilename, validationResultsBaseFilename, C_min, C_max, C_increment)
        print "Completed @ " + str(datetime.datetime.now()), "\nCV results for different C params:\n" , cvResult
    else:
        # Just train on a subset!!
        # train on a set number of examples (or total pixels if less than specified value)
        totalExamples = trainingData[0].shape[0] # num rows = num pixel features
        if (totalExamples < nbToTrainOn):
            print "Required number of samples (" , nbToTrainOn , " ) > number of pixels in image (" , totalExamples , " ). Will use all pixel features."
            nbToTrainOn = totalExamples
        print 'TRAINING CLASSIFIER on %d-sample subset of %d examples of dimension %d...' % \
            ( nbToTrainOn, trainingData[0].shape[0], trainingData[0].shape[1] )
        # Uniform random sample (without replacement) of pixel examples.
        subset = np.random.choice( trainingData[0].shape[0], nbToTrainOn, replace=False ) # Just specified samples
        # Hard-coded classifier selection: the `if 0 / elif 0 / else` chain
        # means only the random-forest branch currently runs.
        if 0:
            # logistic regression
            # Use fixed C
            C = 0.5
            classifier = trainLogisticRegressionModel(
                trainingData[0][subset,:], trainingData[1][subset], C, classifierBaseFilename, \
                scaleData=True, \
                requireAllClasses=False
                )
        elif 0:
            # Neural Network
            # Construct nn dataset
            datmat = trainingData[0][subset,:]
            labvec = trainingData[1][subset]
            nbFeatures = datmat.shape[1]
            nbClasses = pomio.getNumClasses()
            nbHidden = 100
            maxIter = 200
            classifier = NeuralNet.NNet(nbFeatures, nbClasses, nbHidden)
            nnds = classifier.createTrainingSetFromMatrix( datmat, labvec )
            classifier.trainNetworkBackprop(nnds,maxIter)
        else:
            #classifier = None
            # Random forest
            datmat = trainingData[0][subset,:]
            labvec = trainingData[1][subset]
            print '**Training a random forest on %d examples...' % len(labvec)
            print 'Labels represented: ', np.unique( labvec )
            classifier = sklearn.ensemble.RandomForestClassifier(\
                n_estimators=100)
            classifier = classifier.fit( datmat, labvec )
        # Persist whichever classifier was trained above.
        pickleObject(classifier, classifierBaseFilename)
        print "Rand forest classifier saved to " + str(classifierBaseFilename)
    #
    # classifierVersion = "_0.5"
    # filetype = ".pkl"
    # classifierFilename = classifierBaseFilename + classifierVersion + filetype
    #
    # print "Reading pre-built classifier from file::" , classifierFilename , "@" , datetime.datetime.now()
    # classifier = loadClassifier(classifierFilename)
    #
    # predictImage = pomio.msrc_loadImages(msrcData)[0]
    # Interactive visualisation: predict every loaded image and display the
    # original next to its predicted label map, advancing on a button press.
    # NOTE(review): `classifier` is only assigned in the `else` (training)
    # branch above; this loop presumably assumes doVal == 0 - confirm.
    plt.interactive(1)
    plt.figure()
    pomio.showClassColours()
    plt.figure()
    for img in msrcImages:
        print "\nRead in image %s from the MSRC dataset::" % img.m_imgFn
        imageFeatures = FeatureGenerator.generatePixelFeaturesForImage(img.m_img)
        predLabs = classifier.predict(imageFeatures)
        print "\nGenerating prediction of shape ", predLabs.shape, "::" , predLabs
        # Per-pixel label vector reshaped back to the image's (rows, cols).
        predImg = np.reshape( predLabs, img.m_img[:,:,0].shape )
        #print "\nGenerating the probability dist. for each pixel over class labels @" , datetime.datetime.now()
        #imageClassDist = generateImagePredictionClassDist(img.m_img, classifier,False)
        # Display
        print "Unique labels from clfr = ", np.unique(predLabs)
        plt.subplot(1,2,1)
        plt.imshow(img.m_img)
        plt.subplot(1,2,2)
        pomio.showLabels( predImg )
        plt.waitforbuttonpress()
    print "\tCompleted @ " + str(datetime.datetime.now())
    plt.interactive(0)
    plt.show()