def createEnsembleBasedODifferentTrainingSets():
    """Train classifiers on different training ranges and merge their predictions.

    The steps are, in order:

    1. Split the range 0..878000 into four consecutive (lower, upper) pairs.
    2. Train one classifier per pair via
       ``mainScript.trainClassifierOnTrainingData(margins=...)``, dispatched
       through ``Parallel(n_jobs=-1)``.
    3. For every serialized mini test batch, hand each trained classifier to
       ``constructPredictionWithOutput``.
    4. Run ``dataReader.postProcessCsv`` on one ``out<index>.csv`` file per
       classifier.
    5. Call ``createEnsembleBasedOnExitingPredictions`` on every CSV in the
       ensemble directory.

    Takes no arguments and returns nothing.
    """
    # Five evenly spaced integer boundaries yield four adjacent ranges.
    # NOTE(review): 878000 is presumably the number of training rows and the
    # pairs are presumably row bounds -- confirm against
    # mainScript.trainClassifierOnTrainingData.
    margins = np.linspace(0, 878000, 5, dtype=int)
    marginTuples = list(zip(margins[:-1], margins[1:]))

    # One training job per range.
    allClassifiers = Parallel(n_jobs=-1)(
        delayed(mainScript.trainClassifierOnTrainingData)(margins=marginTuple)
        for marginTuple in marginTuples
    )

    # Batches form the outer loop so each batch is loaded and converted only
    # once, then reused by every classifier.
    for batchIndex in range(utils.numberOfPartitions):
        print("Predicting batch {}".format(batchIndex))
        miniTestData = dataReader.getSerializedMiniTestData(batchIndex)
        # Only the features are needed for prediction; the second value
        # returned by constructTestData is not used here.
        xTest, _ = mainScript.constructTestData(miniTestData)
        for classifierIndex, currentClassifier in enumerate(allClassifiers):
            constructPredictionWithOutput(
                currentClassifier, classifierIndex, xTest, batchIndex
            )

    # Post-process one output file per classifier.
    # NOTE(review): these names presumably match the files written by
    # constructPredictionWithOutput -- confirm. The backslash separators are
    # kept exactly as in the original.
    print("Post processing everything...")
    for index in range(len(allClassifiers)):
        outputFileName = "data\\ensembleTraining\\out" + str(index) + ".csv"
        dataReader.postProcessCsv(outputFileName=outputFileName)

    # NOTE(review): despite the keyword name, this looks like a glob pattern
    # rather than a regular expression; it selects every .csv in the
    # directory, not only the out<index>.csv files handled above.
    print("Merging all solutions....")
    fileRegex = "data\\ensembleTraining\\*.csv"
    createEnsembleBasedOnExitingPredictions(fileRegex=fileRegex)
def predictForSubmission():
    """Train one classifier and write class-probability predictions for all test batches.

    Trains via ``trainClassifierOnTrainingData(-1)``, then for each of the
    ``utils.numberOfPartitions`` serialized mini test batches calls
    ``predict_proba`` and passes the result with the batch index to
    ``dataReader.writePredToCsv``. Afterwards it calls
    ``dataReader.postProcessCsv()`` and prints the total elapsed wall-clock
    time.

    Takes no arguments and returns nothing.
    """
    allAlgorithmStartTime = time.time()

    # NOTE(review): -1 presumably means "use every training example" --
    # confirm against trainClassifierOnTrainingData.
    numberOfTrainingExamples = -1
    classifier = trainClassifierOnTrainingData(numberOfTrainingExamples)

    print("Beginning to load test data...")
    for index in range(utils.numberOfPartitions):
        miniTestData = dataReader.getSerializedMiniTestData(index)
        # Only the features are needed for prediction; the second value
        # returned by constructTestData is not used here.
        xTest, _ = constructTestData(miniTestData)
        print("Predicting...")
        yPred = classifier.predict_proba(xTest)
        dataReader.writePredToCsv(yPred, index)

    print("Post processing...")
    dataReader.postProcessCsv()
    print("Total run time:{}".format(time.time() - allAlgorithmStartTime))