def testCARTPrune(self): 
    numExamples = 500
    X, y = data.make_regression(numExamples)

    y = Standardiser().standardiseArray(y)

    # Slice indices must be integers, so round and cast the split sizes
    numTrain = int(numpy.round(numExamples * 0.33))
    numValid = int(numpy.round(numExamples * 0.33))

    trainX = X[0:numTrain, :]
    trainY = y[0:numTrain]
    validX = X[numTrain:numTrain+numValid, :]
    validY = y[numTrain:numTrain+numValid]
    testX = X[numTrain+numValid:, :]
    testY = y[numTrain+numValid:]

    learner = DecisionTreeLearner(pruneType="none", maxDepth=10, minSplit=2)
    learner.learnModel(trainX, trainY)

    # gamma bounds the number of vertices in the pruned tree
    learner = DecisionTreeLearner(pruneType="CART", maxDepth=10, minSplit=2, gamma=1000)
    learner.learnModel(trainX, trainY)
    self.assertTrue(learner.tree.getNumVertices() <= 1000)
    predY = learner.predict(trainX)

    learner.setGamma(200)
    learner.learnModel(trainX, trainY)
    self.assertTrue(learner.tree.getNumVertices() <= 200)

    learner.setGamma(100)
    learner.learnModel(trainX, trainY)
    self.assertTrue(learner.tree.getNumVertices() <= 100)

    learner = DecisionTreeLearner(pruneType="none", maxDepth=10, minSplit=2)
    learner.learnModel(trainX, trainY)
    predY2 = learner.predict(trainX)

    # A gamma as large as the unpruned tree (here 1000) implies no pruning
    nptst.assert_array_equal(predY, predY2)

    # Full pruning: gamma=1 collapses the tree to a single vertex
    learner = DecisionTreeLearner(pruneType="CART", maxDepth=3, gamma=1)
    learner.learnModel(trainX, trainY)
    self.assertEqual(learner.tree.getNumVertices(), 1)
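
# The gamma values above cap the size of the CART-pruned tree. For reference,
# stock scikit-learn exposes the analogous control as minimal cost-complexity
# pruning via ccp_alpha; a minimal sketch (the dataset and the alpha stride
# are made up, not taken from the test above):
import numpy
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor

X, y = make_regression(n_samples=500, random_state=0)
# cost_complexity_pruning_path returns the effective alphas at which
# subtrees are collapsed; a larger ccp_alpha means heavier pruning
path = DecisionTreeRegressor(random_state=0).cost_complexity_pruning_path(X, y)
for alpha in path.ccp_alphas[::10]:
    pruned = DecisionTreeRegressor(random_state=0, ccp_alpha=alpha).fit(X, y)
    print(alpha, pruned.tree_.node_count)  # node count shrinks as alpha grows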
def testCvPrune(self): 
    numExamples = 500
    X, y = data.make_regression(numExamples)

    y = Standardiser().standardiseArray(y)

    # Slice indices must be integers, so round and cast the split sizes
    numTrain = int(numpy.round(numExamples * 0.33))
    numValid = int(numpy.round(numExamples * 0.33))

    trainX = X[0:numTrain, :]
    trainY = y[0:numTrain]
    validX = X[numTrain:numTrain+numValid, :]
    validY = y[numTrain:numTrain+numValid]
    testX = X[numTrain+numValid:, :]
    testY = y[numTrain+numValid:]

    learner = DecisionTreeLearner()
    learner.learnModel(trainX, trainY)
    error1 = Evaluator.rootMeanSqError(learner.predict(testX), testY)

    #print(learner.getTree())
    unprunedTree = learner.tree.copy()
    learner.setGamma(1000)
    learner.cvPrune(trainX, trainY)

    # A large gamma leaves the tree unpruned
    self.assertEqual(unprunedTree.getNumVertices(), learner.tree.getNumVertices())
    learner.setGamma(100)
    learner.cvPrune(trainX, trainY)

    # Test that the pruned tree is a subtree of the unpruned one
    for vertexId in learner.tree.getAllVertexIds():
        self.assertTrue(vertexId in unprunedTree.getAllVertexIds())

    # The test error should be no worse after pruning on the validation set
    learner.learnModel(trainX, trainY)
    #learner.cvPrune(validX, validY, 0.0, 5)
    learner.repPrune(validX, validY)

    error2 = Evaluator.rootMeanSqError(learner.predict(testX), testY)

    self.assertTrue(error1 >= error2)
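
# Evaluator.rootMeanSqError comes from the same library as DecisionTreeLearner.
# The quantity asserted above is plain RMSE; a minimal stand-alone equivalent,
# assuming numpy arrays of matching shape:
import numpy

def rootMeanSqError(predY, y):
    # Root mean squared error between predictions and targets
    return numpy.sqrt(numpy.mean((predY - y)**2))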
def testPredict(self): 
    generator = ExamplesGenerator()

    for i in range(10):
        numExamples = numpy.random.randint(1, 200)
        numFeatures = numpy.random.randint(1, 20)
        minSplit = numpy.random.randint(1, 50)
        maxDepth = numpy.random.randint(0, 10)

        X, y = generator.generateBinaryExamples(numExamples, numFeatures)
        y = numpy.array(y, float)  # numpy.float is removed in NumPy >= 1.24

        learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth)
        learner.learnModel(X, y)

        predY = learner.predict(X)

        tree = learner.tree

        # Predicting on the training data should reproduce the training indices
        for vertexId in tree.getAllVertexIds():
            nptst.assert_array_equal(tree.getVertex(vertexId).getTrainInds(), tree.getVertex(vertexId).getTestInds())

        # Compare against the sklearn tree (min_density only exists in older
        # scikit-learn releases; it was removed in later versions)
        regressor = DecisionTreeRegressor(min_samples_split=minSplit, max_depth=maxDepth, min_density=0.0)
        regressor.fit(X, y)

        sktree = regressor.tree_

        # Note that the sklearn algorithm appears to combine nodes with the same value
        #self.assertEqual(sktree.node_count, tree.getNumVertices())
        self.assertEqual(sktree.feature[0], tree.getRoot().getFeatureInd())
        self.assertEqual(sktree.value[0], tree.getRoot().getValue())
        self.assertAlmostEqual(sktree.threshold[0], tree.getRoot().getThreshold(), 3)

        predY2 = regressor.predict(X)

        # This is not always exact: when two thresholds give the same error we
        # choose the largest, and it is unclear how sklearn breaks such ties
        self.assertTrue(abs(numpy.linalg.norm(predY - y) - numpy.linalg.norm(predY2 - y))/numExamples < 0.05)
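
# The sktree attributes compared above (feature, threshold, value) belong to
# sklearn's Tree structure. A standalone sketch of reading the root split;
# the toy data is made up, and in current sklearn value has shape
# (n_nodes, 1, 1) for single-output regression:
import numpy
from sklearn.tree import DecisionTreeRegressor

X = numpy.random.rand(100, 3)
y = X[:, 0] + 0.1 * numpy.random.randn(100)
regressor = DecisionTreeRegressor(max_depth=2).fit(X, y)
sktree = regressor.tree_
# Root split: feature index, threshold, and the node's mean target value
print(sktree.feature[0], sktree.threshold[0], sktree.value[0][0][0])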
Example #4
            meanPenalties[k] += currentPenalties
            meanTrainError += trainErrors
            predY = bestLearner.predict(testX)
            meanErrors[k] += bestLearner.getMetricMethod()(testY, predY)
    
            
            #Compute ideal penalties and error on training data 
            meanIdealPenalities[k] += learner.parallelPenaltyGrid(trainX, trainY, testX, testY, paramDict)
            for i in range(len(paramDict["setGamma"])):
                allError = 0    
                learner.setGamma(paramDict["setGamma"][i])
                for trainInds, testInds in idx: 
                    validX = trainX[trainInds, :]
                    validY = trainY[trainInds]
                    learner.learnModel(validX, validY)
                    predY = learner.predict(trainX)
                    allError += learner.getMetricMethod()(predY, trainY)
                meanAllErrors[i] += allError/float(len(idx))
            
        k += 1
        
        
    numRealisations = float(numRealisations)
    meanErrors /= numRealisations
    meanPenalties /= numRealisations
    meanIdealPenalities /= numRealisations

    print(meanErrors)
    
    plt.plot(sampleSizes, meanPenalties*numpy.sqrt(sampleSizes), label="Penalty")
    plt.plot(sampleSizes, meanIdealPenalities*numpy.sqrt(sampleSizes), label="Ideal penalty")
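
# The cross-validation loop above consumes idx, which is assumed to hold
# (trainInds, testInds) pairs. With scikit-learn these can be generated by
# KFold; a sketch (the array shape and fold count are made up):
import numpy
from sklearn.model_selection import KFold

trainX = numpy.random.rand(50, 4)  # stand-in for the trainX used above
idx = list(KFold(n_splits=5, shuffle=True, random_state=0).split(trainX))
for trainInds, testInds in idx:
    print(trainInds.shape, testInds.shape)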
Example #5
validX = X[numTrainExamples:numTrainExamples+numValidExamples, :]
validY = y[numTrainExamples:numTrainExamples+numValidExamples]
testX = X[numTrainExamples+numValidExamples:, :]
testY = y[numTrainExamples+numValidExamples:]

learner = DecisionTreeLearner(minSplit=1, maxDepth=50)
learner.learnModel(trainX, trainY)


#This threshold seems to be optimal 
alphaThreshold = 100.0
learner.setAlphaThreshold(alphaThreshold)
learner.repPrune(validX, validY)
#learner.tree = learner.tree.cut(3)

predY = learner.predict(testX)

plt.figure(0)
plt.scatter(testX[:, 0], testX[:, 1], c=testY, s=50, vmin=0, vmax=1)
plt.colorbar()

plt.figure(1)
plt.scatter(testX[:, 0], testX[:, 1], c=predY, s=50, vmin=0, vmax=1)
plt.colorbar()

colormap = matplotlib.cm.get_cmap()

def displayTree(learner, vertexId, minX0, maxX0, minX1, maxX1, colormap): 
    vertex = learner.tree.getVertex(vertexId)
    if learner.tree.isLeaf(vertexId):
        p = mpatches.Rectangle([minX0, minX1], maxX0-minX0, maxX1-minX1, facecolor=colormap(vertex.getValue()), edgecolor="black")
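
# displayTree above recurses over the DecisionTreeLearner tree and is cut off
# here. A self-contained sketch of the same idea using sklearn's tree_ arrays
# instead (children_left == -1 marks a leaf; the data and depth are made up):
import numpy
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm
from sklearn.tree import DecisionTreeRegressor

X = numpy.random.rand(200, 2)
y = (X[:, 0] + X[:, 1] > 1).astype(float)
t = DecisionTreeRegressor(max_depth=4).fit(X, y).tree_
cmap = matplotlib.cm.get_cmap()

def drawPartition(nodeId, minX0, maxX0, minX1, maxX1, ax):
    if t.children_left[nodeId] == -1:
        # Leaf: draw its cell, coloured by the predicted value
        ax.add_patch(mpatches.Rectangle((minX0, minX1), maxX0 - minX0,
            maxX1 - minX1, facecolor=cmap(t.value[nodeId][0][0]), edgecolor="black"))
    elif t.feature[nodeId] == 0:
        # Internal node splitting on x0: recurse into the two half-cells
        drawPartition(t.children_left[nodeId], minX0, t.threshold[nodeId], minX1, maxX1, ax)
        drawPartition(t.children_right[nodeId], t.threshold[nodeId], maxX0, minX1, maxX1, ax)
    else:
        # Internal node splitting on x1
        drawPartition(t.children_left[nodeId], minX0, maxX0, minX1, t.threshold[nodeId], ax)
        drawPartition(t.children_right[nodeId], minX0, maxX0, t.threshold[nodeId], maxX1, ax)

fig, ax = plt.subplots()
drawPartition(0, 0.0, 1.0, 0.0, 1.0, ax)
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
plt.show()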