def testCvPrune(self): 
     numExamples = 500
     X, y = data.make_regression(numExamples)  
     
     y = Standardiser().standardiseArray(y)
     
     numTrain = numpy.round(numExamples * 0.33)     
     numValid = numpy.round(numExamples * 0.33) 
     
     trainX = X[0:numTrain, :]
     trainY = y[0:numTrain]
     validX = X[numTrain:numTrain+numValid, :]
     validY = y[numTrain:numTrain+numValid]
     testX = X[numTrain+numValid:, :]
     testY = y[numTrain+numValid:]
     
     learner = DecisionTreeLearner()
     learner.learnModel(trainX, trainY)
     error1 = Evaluator.rootMeanSqError(learner.predict(testX), testY)
     
     #print(learner.getTree())
     unprunedTree = learner.tree.copy() 
     learner.setGamma(1000)
     learner.cvPrune(trainX, trainY)
     
     self.assertEquals(unprunedTree.getNumVertices(), learner.tree.getNumVertices())
     learner.setGamma(100)
     learner.cvPrune(trainX, trainY)
     
     #Test if pruned tree is subtree of current: 
     for vertexId in learner.tree.getAllVertexIds(): 
         self.assertTrue(vertexId in unprunedTree.getAllVertexIds())
         
     #The error should be better after pruning 
     learner.learnModel(trainX, trainY)
     #learner.cvPrune(validX, validY, 0.0, 5)
     learner.repPrune(validX, validY)
   
     error2 = Evaluator.rootMeanSqError(learner.predict(testX), testY)
     
     self.assertTrue(error1 >= error2)
Example #2
0
trainX = X[0:numTrainExamples, :]
trainY = y[0:numTrainExamples]
validX = X[numTrainExamples:numTrainExamples+numValidExamples, :]
validY = y[numTrainExamples:numTrainExamples+numValidExamples]
testX = X[numTrainExamples+numValidExamples:, :]
testY = y[numTrainExamples+numValidExamples:]

learner = DecisionTreeLearner(minSplit=1, maxDepth=50)
learner.learnModel(trainX, trainY)


#Seem to be optimal 
alphaThreshold = 100.0
learner.setAlphaThreshold(alphaThreshold)
learner.repPrune(validX, validY)
#learner.tree = learner.tree.cut(3)

predY = learner.predict(testX)

plt.figure(0)
plt.scatter(testX[:, 0], testX[:, 1], c=testY, s=50, vmin=0, vmax=1)
plt.colorbar()

plt.figure(1)
plt.scatter(testX[:, 0], testX[:, 1], c=predY, s=50, vmin=0, vmax=1)
plt.colorbar()

colormap  = matplotlib.cm.get_cmap()

def displayTree(learner, vertexId, minX0, maxX0, minX1, maxX1, colormap):