# MainScript.py — 123 lines (74 loc) · 3.46 KB
import time
import ClassifierSelector as classifierSelector
import DataReader as dataReader
import RegularFeatureExtractor as regularFeatExtr
import Validator as validator
import Utils as utils
import Visualizer as visualizer
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import *
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
def predictForSubmission():
    """Train a classifier on the full training set, then predict class
    probabilities for every serialized test-data partition and assemble
    the submission CSV.

    Side effects: writes one prediction CSV chunk per partition via
    dataReader.writePredToCsv, then merges them with postProcessCsv.
    """
    startTime = time.time()
    allAlgorithmStartTime = startTime

    numberOfTrainingExamples = -1  # -1 selects all available training examples

    # BUG FIX: the original call passed numberOfTrainingExamples
    # positionally, so -1 bound to the first parameter (trainData) of
    # trainClassifierOnTrainingData; the `trainData is None` guard then
    # failed and -1 was fed straight into feature extraction instead of
    # triggering a data load. Passing by keyword restores the intent.
    classifier = trainClassifierOnTrainingData(
        numberOfTrainingExamples=numberOfTrainingExamples)

    print("Beginning to load test data...")

    partitionNumber = utils.numberOfPartitions
    for index in range(partitionNumber):
        miniTestData = dataReader.getSerializedMiniTestData(index)

        xTest, yTest = constructTestData(miniTestData)

        print("Predicting...")
        yPred = classifier.predict_proba(xTest)

        dataReader.writePredToCsv(yPred, index)

    print("Post processing...")
    dataReader.postProcessCsv()

    print("Total run time:{}".format(time.time() - allAlgorithmStartTime))
def trainClassifierOnTrainingData(trainData=None, numberOfTrainingExamples = -1, margins=None):
    """Fit and return a classifier.

    When no training frame is supplied, one is loaded through dataReader
    (numberOfTrainingExamples / margins control the load). The target
    feature is converted to numeric form, the regular feature set is
    extracted, and fitting is delegated to the classifier selector.
    """
    # Load lazily: only hit the data reader when the caller gave us nothing.
    if trainData is None:
        trainData = dataReader.getTrainData(numberOfTrainingExamples, margins)

    # Feature engineering: numeric target first, then the regular features.
    numericFrame = regularFeatExtr.convertTargetFeatureToNumeric(trainData)
    xTrain, yTrain = regularFeatExtr.getRegularFeatures(numericFrame, True)

    # Train and hand back the selected classifier.
    return classifierSelector.trainClassifier(xTrain, yTrain)
def constructTestData(testData):
    """Turn a raw test frame into an (xTest, yTest) feature/label pair."""
    # Same two-step pipeline as training, but with the training flag off.
    numericFrame = regularFeatExtr.convertTargetFeatureToNumeric(testData)
    return regularFeatExtr.getRegularFeatures(numericFrame, False)
def constructTrainingData(trainDataSize):
    """Load trainDataSize training rows and return (xTrain, yTrain).

    The suffix frame is appended so that every class is represented
    before feature extraction runs.
    """
    # Load the requested slice and append the class-coverage suffix rows.
    rawFrame = dataReader.getTrainData(trainDataSize)
    rawFrame = rawFrame.append(dataReader.getSuffixDataFrame())

    # Feature engineering: numeric target, then the regular feature set.
    numericFrame = regularFeatExtr.convertTargetFeatureToNumeric(rawFrame)
    xTrain, yTrain = regularFeatExtr.getRegularFeatures(numericFrame, True)

    return xTrain, yTrain
def testGeneralPerformanceUsingCrossValidationScore():
    """Estimate generalization performance with a single stratified
    shuffle split (50k train / 100k test) scored by log-loss, and print
    the mean cross-validation score.
    """
    # train 28k and test = 7k
    # trainDataSize = 35000
    trainDataSize = 150000

    # NOTE(review): alternative estimators tried previously:
    # classifierSelector.constructRandomForestClassifier(), SVC(verbose=1)
    classifier = classifierSelector.constructGradientBoostingClassifier()

    xTrain, yTrain = constructTrainingData(trainDataSize)

    splitter = StratifiedShuffleSplit(yTrain, n_iter=1,
                                      train_size=50000, test_size=100000)
    cv_scores = cross_val_score(classifier, xTrain, yTrain,
                                cv=splitter, n_jobs=-1,
                                scoring="log_loss", verbose=1)

    print("Mean score is {}".format(cv_scores.mean()))
def testParameterPerformance():
    """Train on a 10k slice, predict hard labels for a 100k test slice,
    and run the project validator over the predictions; prints total
    wall-clock run time.
    """
    startTime = time.time()
    allAlgorithmStartTime = startTime

    # Slice sizes for the disjoint train/test frames.
    trainDataSize = 10000
    testDataSize = 100000

    trainData, testData = utils.getDifferentTrainAndTestData(
        trainDataSize, testDataSize)

    # in order to assure that we have members form each class present
    testData = testData.append(dataReader.getSuffixDataFrame())

    classifier = trainClassifierOnTrainingData(trainData=trainData)

    xTest, yTest = constructTestData(testData)
    yPred = classifier.predict(xTest)

    validator.performValidation(yPred, yTest)

    print("Total run time:{} s".format((time.time() - allAlgorithmStartTime)))