def testRecursiveSetPrune(self): 
     numExamples = 1000
     X, y = data.make_regression(numExamples)  
     
     y = Standardiser().normaliseArray(y)
     
     numTrain = int(numpy.round(numExamples * 0.66))
     
     trainX = X[0:numTrain, :]
     trainY = y[0:numTrain]
     testX = X[numTrain:, :]
     testY = y[numTrain:]
     
     learner = DecisionTreeLearner()
     learner.learnModel(trainX, trainY)
     
     rootId = (0,)
     learner.tree.getVertex(rootId).setTestInds(numpy.arange(testX.shape[0]))
     learner.recursiveSetPrune(testX, testY, rootId)
     
     for vertexId in learner.tree.getAllVertexIds(): 
         tempY = testY[learner.tree.getVertex(vertexId).getTestInds()]
         predY = numpy.ones(tempY.shape[0])*learner.tree.getVertex(vertexId).getValue()
         error = numpy.sum((tempY-predY)**2)
         self.assertAlmostEqual(error, learner.tree.getVertex(vertexId).getTestError())
         
     #Check that the test indices of the leaves together cover all test indices 
     inds = numpy.array([], dtype=int)
     
     for vertexId in learner.tree.leaves(): 
         inds = numpy.union1d(inds, learner.tree.getVertex(vertexId).getTestInds())
         
     nptst.assert_array_equal(inds, numpy.arange(testY.shape[0]))
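
The per-vertex check above reduces to a simple quantity: the squared error of predicting the vertex's constant value for every test example routed to it. A minimal sketch of that computation (the helper name is hypothetical):

import numpy

def vertexTestError(testY, testInds, vertexValue):
    #Sum of squared residuals when the vertex predicts a single constant
    #value for all test examples routed to it, as the loop above verifies
    residuals = testY[testInds] - vertexValue
    return numpy.sum(residuals**2)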
 def testLearningRate(self): 
     numExamples = 100
     trainX, trainY = data.make_regression(numExamples) 
     trainX = Standardiser().normaliseArray(trainX)
     trainY = Standardiser().normaliseArray(trainY)
     learner = DecisionTreeLearner(pruneType="CART", maxDepth=20, minSplit=1)
     
     
     foldsSet = numpy.arange(2, 7, 2)
     
     gammas = numpy.array(numpy.round(2**numpy.arange(1, 8, 1)-1), dtype=int)
     paramDict = {} 
     paramDict["setGamma"] = gammas
     
     betaGrid = learner.learningRate(trainX, trainY, foldsSet, paramDict)
     
     #Compute beta more directly 
     numParams = gammas.shape[0]
     sampleSize = trainX.shape[0]
     sampleMethod = Sampling.crossValidation
     Cvs = numpy.array([1])
     penalties = numpy.zeros((foldsSet.shape[0], numParams))
     betas = numpy.zeros(gammas.shape[0])
     
     for k in range(foldsSet.shape[0]): 
         folds = foldsSet[k]
         logging.debug("Folds " + str(folds))
         
         idx = sampleMethod(folds, trainX.shape[0])   
         
         #Now try penalisation
         resultsList = learner.parallelPen(trainX, trainY, idx, paramDict, Cvs)
         bestLearner, trainErrors, currentPenalties = resultsList[0]
         penalties[k, :] = currentPenalties
     
     for i in range(gammas.shape[0]): 
         inds = numpy.logical_and(numpy.isfinite(penalties[:, i]), penalties[:, i]>0)
         tempPenalties = penalties[:, i][inds]
         tempfoldsSet = numpy.array(foldsSet, dtype=float)[inds]
         
         if tempPenalties.shape[0] > 1: 
             x = numpy.log((tempfoldsSet-1)/tempfoldsSet*sampleSize)
             y = numpy.log(tempPenalties)+numpy.log(tempfoldsSet)   
         
             clf = linear_model.LinearRegression()
             clf.fit(numpy.array([x]).T, y)
             betas[i] = clf.coef_[0]    
             
     betas = -betas   
     
     nptst.assert_array_equal(betaGrid, betas)
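
The direct beta computation above can be packaged for a single gamma: keep the finite, positive penalty estimates, fit a line in log-log space against the effective training size, and negate the slope. A sketch, assuming learningRate performs exactly this fit:

import numpy
from sklearn import linear_model

def estimateBeta(foldsSet, penalties, sampleSize):
    #penalties holds one estimate per fold count V; keep only finite,
    #positive entries, then regress log(penalty*V) on log((V-1)/V * n)
    inds = numpy.logical_and(numpy.isfinite(penalties), penalties > 0)
    V = numpy.array(foldsSet, dtype=float)[inds]
    if V.shape[0] <= 1:
        return 0.0
    x = numpy.log((V - 1)/V * sampleSize)
    y = numpy.log(penalties[inds]) + numpy.log(V)
    clf = linear_model.LinearRegression()
    clf.fit(x.reshape(-1, 1), y)
    return -clf.coef_[0]

#Usage for the loop above: betas[i] = estimateBeta(foldsSet, penalties[:, i], sampleSize)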
 def profileLearnModel(self):
     numExamples = 1000
     numFeatures = 50
     minSplit = 10
     maxDepth = 20
     
     generator = ExamplesGenerator()
     X, y = generator.generateBinaryExamples(numExamples, numFeatures)   
     y = numpy.array(y, dtype=float)
         
     learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth, pruneType="REP-CV") 
     #learner.learnModel(X, y)
     #print("Done")
     ProfileUtils.profile('learner.learnModel(X, y)', globals(), locals())
     
     print(learner.getTree().getNumVertices())
 def profilePredict(self): 
     #Make the prediction function faster
     numExamples = 1000
     numFeatures = 20
     minSplit = 1
     maxDepth = 20
     
     generator = ExamplesGenerator()
     X, y = generator.generateBinaryExamples(numExamples, numFeatures)   
         
     learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth) 
     learner.learnModel(X, y)
     
     print(learner.getTree().getNumVertices())
     ProfileUtils.profile('learner.predict(X)', globals(), locals())
     
     print(learner.getTree().getNumVertices())
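
ProfileUtils.profile takes a statement string plus the caller's namespaces, which suggests a thin wrapper over the standard library profiler. A stand-in built on cProfile.runctx, assuming that behaviour:

import cProfile
import pstats

def profileStatement(statement, globs, locs):
    #Run the statement under cProfile in the given namespaces and
    #print the ten most expensive calls by cumulative time
    profiler = cProfile.Profile()
    profiler.runctx(statement, globs, locs)
    pstats.Stats(profiler).sort_stats("cumulative").print_stats(10)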
 def testPredict(self): 
     
     generator = ExamplesGenerator()         
     
     for i in range(10):        
         numExamples = numpy.random.randint(1, 200)
         numFeatures = numpy.random.randint(1, 20)
         minSplit = numpy.random.randint(1, 50)
         maxDepth = numpy.random.randint(1, 10) #sklearn below requires max_depth >= 1
         
         X, y = generator.generateBinaryExamples(numExamples, numFeatures)   
         y = numpy.array(y, dtype=float)
             
         learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth) 
         learner.learnModel(X, y)    
         
         predY = learner.predict(X)
         
         tree = learner.tree            
         
         for vertexId in tree.getAllVertexIds(): 
             
             nptst.assert_array_equal(tree.getVertex(vertexId).getTrainInds(), tree.getVertex(vertexId).getTestInds())
             
         #Compare against sklearn tree  
         #sklearn requires min_samples_split >= 2; a node with one sample cannot split anyway
         regressor = DecisionTreeRegressor(min_samples_split=max(minSplit, 2), max_depth=maxDepth)
         regressor.fit(X, y)
         
         sktree = regressor.tree_
         
         #Note that the sklearn algorithm appears to combine nodes with same value 
         #self.assertEquals(sktree.node_count, tree.getNumVertices())
         self.assertEqual(sktree.feature[0], tree.getRoot().getFeatureInd())
         self.assertEqual(sktree.value[0], tree.getRoot().getValue())
         self.assertAlmostEqual(sktree.threshold[0], tree.getRoot().getThreshold(), 3)
         
         predY2 = regressor.predict(X)
         
         #This is not always precise: when two thresholds give the same error we 
         #pick the largest, and it is unclear how sklearn breaks such ties
         self.assertTrue(abs(numpy.linalg.norm(predY-y)- numpy.linalg.norm(predY2-y))/numExamples < 0.05)  
 def testParallelPen(self): 
     #Check if penalisation == inf when treeSize < gamma 
     numExamples = 100
     X, y = data.make_regression(numExamples) 
     learner = DecisionTreeLearner(pruneType="CART", maxDepth=10, minSplit=2)
     
     paramDict = {} 
     paramDict["setGamma"] = numpy.array(numpy.round(2**numpy.arange(1, 10, 0.5)-1), dtype=numpy.int)
     
     folds = 3
     alpha = 1.0
     Cvs = numpy.array([(folds-1)*alpha])
     
     idx = Sampling.crossValidation(folds, X.shape[0])
     
     resultsList = learner.parallelPen(X, y, idx, paramDict, Cvs)
     
     learner, trainErrors, currentPenalties = resultsList[0]
     
     learner.setGamma(2**10)
     treeSize = 0
     #Let's work out the size of the unpruned tree 
     for trainInds, testInds in idx: 
         trainX = X[trainInds, :]
         trainY = y[trainInds]
         
         learner.learnModel(trainX, trainY)
         treeSize += learner.tree.size 
     
     treeSize /= float(folds)         
     
     self.assertTrue(numpy.isinf(currentPenalties[paramDict["setGamma"]>treeSize]).all())      
     self.assertTrue(not numpy.isinf(currentPenalties[paramDict["setGamma"]<treeSize]).all())
 def testprune(self): 
     learner = DecisionTreeLearner(minSplit=5)
     learner.learnModel(self.X, self.y)
     
     unprunedTree = learner.getTree().copy()
     
     learner.cartPrune(self.X, self.y)
 
     self.assertTrue(learner.tree.isSubtree(unprunedTree))
Example #9
    def profileModelSelect(self):
        learner = DecisionTreeLearner(minSplit=5, maxDepth=30, pruneType="CART") 
        numExamples = 1000
        numFeatures = 10
        
        folds = 5
        
        paramDict = {} 
        paramDict["setGamma"] =  numpy.array(numpy.round(2**numpy.arange(1, 7.5, 0.5)-1), dtype=numpy.int)

        X = numpy.random.rand(numExamples, numFeatures)
        Y = numpy.array(numpy.random.rand(numExamples) < 0.1, dtype=int)*2-1

        def run():
            for i in range(5):
                print("Iteration " + str(i))
                idx = Sampling.crossValidation(folds, numExamples)
                learner.parallelModelSelect(X, Y, idx, paramDict)

        ProfileUtils.profile('run()', globals(), locals())
    def testCARTPrune(self): 
        numExamples = 500
        X, y = data.make_regression(numExamples)  
        
        y = Standardiser().standardiseArray(y)
        
        numTrain = int(numpy.round(numExamples * 0.33))
        numValid = int(numpy.round(numExamples * 0.33))
        
        trainX = X[0:numTrain, :]
        trainY = y[0:numTrain]
        validX = X[numTrain:numTrain+numValid, :]
        validY = y[numTrain:numTrain+numValid]
        testX = X[numTrain+numValid:, :]
        testY = y[numTrain+numValid:]
        
        learner = DecisionTreeLearner(pruneType="none", maxDepth=10, minSplit=2)
        learner.learnModel(trainX, trainY)    
        
        learner = DecisionTreeLearner(pruneType="CART", maxDepth=10, minSplit=2, gamma=1000)
        learner.learnModel(trainX, trainY)
        self.assertTrue(learner.tree.getNumVertices() <= 1000)
        predY = learner.predict(trainX)

        learner.setGamma(200)
        learner.learnModel(trainX, trainY)
        self.assertTrue(learner.tree.getNumVertices() <= 200)
        
        learner.setGamma(100)
        learner.learnModel(trainX, trainY)
        self.assertTrue(learner.tree.getNumVertices() <= 100)
        

        learner = DecisionTreeLearner(pruneType="none", maxDepth=10, minSplit=2)
        learner.learnModel(trainX, trainY)
        predY2 = learner.predict(trainX)
        
        #A gamma larger than the tree (1000 here) implies no pruning 
        nptst.assert_array_equal(predY, predY2)
        
        #Full pruning 
        learner = DecisionTreeLearner(pruneType="CART", maxDepth=3, gamma=1)
        learner.learnModel(trainX, trainY)
        self.assertEqual(learner.tree.getNumVertices(), 1)
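
The asserts above use gamma as an upper bound on the number of vertices in the CART-pruned tree. A quick sketch of that bound, reusing trainX and trainY from the test above (not part of the original suite):

#Each pruned tree must respect its gamma vertex bound
learner = DecisionTreeLearner(pruneType="CART", maxDepth=10, minSplit=2)
for gamma in [1000, 200, 100, 10, 1]:
    learner.setGamma(gamma)
    learner.learnModel(trainX, trainY)
    assert learner.tree.getNumVertices() <= gamma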
 def testLearnModel(self): 
     #First check the integrity of the trees
     generator = ExamplesGenerator()         
     
     for i in range(5):        
         numExamples = numpy.random.randint(1, 200)
         numFeatures = numpy.random.randint(1, 10)
         minSplit = numpy.random.randint(1, 50)
         maxDepth = numpy.random.randint(1, 10)
         
         X, y = generator.generateBinaryExamples(numExamples, numFeatures)
         y = numpy.array(y, dtype=float)
     
         learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth) 
         learner.learnModel(X, y)        
         tree = learner.getTree() 
         
         for vertexId in tree.getAllVertexIds(): 
             vertex = tree.getVertex(vertexId)
             if vertex.getFeatureInd() is not None: 
                 meanValue = y[vertex.getTrainInds()].mean()
                 self.assertEqual(meanValue, vertex.getValue())
                 if tree.isNonLeaf(vertexId): 
                     self.assertTrue(0 <= vertex.getFeatureInd() < X.shape[1]) 
                     self.assertTrue(X[:, vertex.getFeatureInd()].min() <= vertex.getThreshold() <= X[:, vertex.getFeatureInd()].max())
                 self.assertTrue(vertex.getTrainInds().shape[0] >= 1)
         
         
         self.assertTrue(tree.depth() <= maxDepth)
         #Check that each split contains indices from parent 
         root = tree.getRootId()
         vertexStack = [root]
         
         while len(vertexStack) != 0: 
             vertexId = vertexStack.pop()
             neighbours = tree.children(vertexId)
             
             if len(neighbours) > 2: 
                 self.fail("Cannot have more than 2 children") 
             elif len(neighbours) > 0: 
                 inds1 = tree.getVertex(neighbours[0]).getTrainInds()
                 inds2 = tree.getVertex(neighbours[1]).getTrainInds()
                 
                 nptst.assert_array_equal(numpy.union1d(inds1, inds2), numpy.unique(tree.getVertex(vertexId).getTrainInds()))
                 
                 vertexStack.append(neighbours[0])
                 vertexStack.append(neighbours[1])
     
     #Try a tree of depth 0 
     #learner = DecisionTreeLearner(minSplit=10, maxDepth=0) 
     #learner.learnModel(self.X, self.y)        
     #tree = learner.getTree()
     
     #self.assertEquals(tree.depth(), 0)
     
     #Try minSplit > numExamples 
     #learner = DecisionTreeLearner(minSplit=self.numExamples+1, maxDepth=0) 
     #learner.learnModel(self.X, self.y)        
     #tree = learner.getTree()
     
     #self.assertEquals(tree.getNumVertices(), 1)
     
     #Try a simple tree of depth 1 
     learner = DecisionTreeLearner(minSplit=1, maxDepth=1) 
     learner.learnModel(self.X, self.y)     
     
     bestFeature = 0 
     bestError = 10**6 
     bestThreshold = 0         
     
     for i in range(self.X.shape[1]): 
         vals = numpy.unique(self.X[:, i])
         
         for j in range(vals.shape[0]-1):             
             threshold = (vals[j+1]+vals[j])/2
             leftInds = self.X[:, i] <= threshold
             rightInds = self.X[:, i] > threshold
             
             valLeft = numpy.mean(self.y[leftInds])
             valRight = numpy.mean(self.y[rightInds])
             
             error = ((self.y[leftInds] - valLeft)**2).sum() + ((self.y[rightInds] - valRight)**2).sum()
             
             if error < bestError: 
                 bestError = error 
                 bestFeature = i 
                 bestThreshold = threshold 
     
     self.assertAlmostEqual(bestThreshold, learner.tree.getRoot().getThreshold())
     self.assertAlmostEqual(bestError, learner.tree.getRoot().getError(), 5)
     self.assertEqual(bestFeature, learner.tree.getRoot().getFeatureInd())
     
     #Now test that pruning reduces the tree size (reusing the final random X, y from the loop above) 
     learner = DecisionTreeLearner(minSplit=1, maxDepth=10) 
     learner.learnModel(X, y)
     numVertices1 = learner.getTree().getNumVertices()       
     
     learner = DecisionTreeLearner(minSplit=1, maxDepth=10, pruneType="REP-CV") 
     learner.learnModel(X, y) 
     numVertices2 = learner.getTree().getNumVertices()   
     
     self.assertTrue(numVertices1 >= numVertices2)
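
The brute-force depth-1 search above generalises into a small helper: candidate thresholds are midpoints of consecutive unique feature values, and the winning split minimises the summed squared error around the two child means. A restatement for reference:

import numpy

def bestRegressionSplit(X, y):
    #Exhaustive best single split under squared error, mirroring the test above
    bestFeature, bestThreshold, bestError = 0, 0.0, numpy.inf
    for i in range(X.shape[1]):
        vals = numpy.unique(X[:, i])
        for j in range(vals.shape[0] - 1):
            threshold = (vals[j+1] + vals[j])/2
            leftY = y[X[:, i] <= threshold]
            rightY = y[X[:, i] > threshold]
            error = ((leftY - leftY.mean())**2).sum() + ((rightY - rightY.mean())**2).sum()
            if error < bestError:
                bestError, bestFeature, bestThreshold = error, i, threshold
    return bestFeature, bestThreshold, bestError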
    def testModelSelect(self): 
        
        """
        We test the results on some data and compare to SVR. 
        """
        numExamples = 200
        X, y = data.make_regression(numExamples, noise=0.5)  
        
        X = Standardiser().standardiseArray(X)
        y = Standardiser().standardiseArray(y)
        
        trainX = X[0:100, :]
        trainY = y[0:100]
        testX = X[100:, :]
        testY = y[100:]
        
        learner = DecisionTreeLearner(maxDepth=20, minSplit=10, pruneType="REP-CV")
        learner.setPruneCV(8)
        
        paramDict = {} 
        paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 10) 
        paramDict["setPruneCV"] = numpy.arange(6, 11, 2, numpy.int)
        
        folds = 5
        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)


        predY = bestTree.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)
        
        
        learner = DecisionTreeLearner(maxDepth=20, minSplit=5, pruneType="CART")
        
        paramDict = {} 
        paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 50) 
        
        folds = 5
        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)


        predY = bestTree.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)
              
        return #Skip the SVM comparison below
        #Let's compare to the SVM 
        learner2 = LibSVM(kernel='gaussian', type="Epsilon_SVR") 
        
        paramDict = {} 
        paramDict["setC"] = 2.0**numpy.arange(-10, 14, 2, dtype=numpy.float)
        paramDict["setGamma"] = 2.0**numpy.arange(-10, 4, 2, dtype=numpy.float)
        paramDict["setEpsilon"] = learner2.getEpsilons()
        
        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestSVM, cvGrid = learner2.parallelModelSelect(trainX, trainY, idx, paramDict)

        predY = bestSVM.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)
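
Evaluator.rootMeanSqError is used throughout as the test metric; assuming it is the usual root mean squared error, a numpy equivalent is:

import numpy

def rootMeanSqError(testY, predY):
    #Assumed equivalent of Evaluator.rootMeanSqError
    return numpy.sqrt(numpy.mean((testY - predY)**2))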
Example #14
foldsSet = numpy.arange(2, 13, 2)
alpha = 1.0

gammas = numpy.array(numpy.round(2**numpy.arange(1, 7.5, 0.5)-1), dtype=int)

paramDict = {} 
paramDict["setGamma"] = gammas
numParams = paramDict["setGamma"].shape[0]

sampleMethod = Sampling.crossValidation
numProcesses = multiprocessing.cpu_count()
Cvs = numpy.array([1])

for datasetName, numRealisations in datasets:
    logging.debug("Dataset " + datasetName)
    learner = DecisionTreeLearner(criterion="mse", maxDepth=100, minSplit=1, pruneType="CART", processes=numProcesses)
    learner.setChunkSize(3)   
    
    outfileName = outputDir + datasetName + "Beta"

    for m in range(sampleSizes.shape[0]): 
        sampleSize = sampleSizes[m]
        logging.debug("Sample size " + str(sampleSize))
    
        penalties = numpy.zeros((foldsSet.shape[0], numParams))
        betas = numpy.zeros((gammas.shape[0], sampleSizes.shape[0]))
        
        for j in range(numRealisations):      
            logging.debug("Realisation: " + str(j))
            
            trainX, trainY, testX, testY = loadMethod(dataDir, datasetName, j)
 def testCvPrune(self): 
     numExamples = 500
     X, y = data.make_regression(numExamples)  
     
     y = Standardiser().standardiseArray(y)
     
     numTrain = int(numpy.round(numExamples * 0.33))
     numValid = int(numpy.round(numExamples * 0.33))
     
     trainX = X[0:numTrain, :]
     trainY = y[0:numTrain]
     validX = X[numTrain:numTrain+numValid, :]
     validY = y[numTrain:numTrain+numValid]
     testX = X[numTrain+numValid:, :]
     testY = y[numTrain+numValid:]
     
     learner = DecisionTreeLearner()
     learner.learnModel(trainX, trainY)
     error1 = Evaluator.rootMeanSqError(learner.predict(testX), testY)
     
     #print(learner.getTree())
     unprunedTree = learner.tree.copy() 
     learner.setGamma(1000)
     learner.cvPrune(trainX, trainY)
     
     self.assertEqual(unprunedTree.getNumVertices(), learner.tree.getNumVertices())
     learner.setGamma(100)
     learner.cvPrune(trainX, trainY)
     
     #Test if pruned tree is subtree of current: 
     for vertexId in learner.tree.getAllVertexIds(): 
         self.assertTrue(vertexId in unprunedTree.getAllVertexIds())
         
     #The error should be no worse after pruning 
     learner.learnModel(trainX, trainY)
     #learner.cvPrune(validX, validY, 0.0, 5)
     learner.repPrune(validX, validY)
   
     error2 = Evaluator.rootMeanSqError(learner.predict(testX), testY)
     
     self.assertTrue(error1 >= error2)
Example #16
for datasetName, numRealisations in datasets:
    logging.debug("Dataset " + datasetName)
    meanErrors = numpy.zeros(sampleSizes.shape[0])   
    meanPenalties = numpy.zeros(sampleSizes.shape[0])
    meanIdealPenalities = numpy.zeros(sampleSizes.shape[0])
    
    k = 0 
    for sampleSize in sampleSizes: 
        logging.debug("Sample size " + str(sampleSize))
        errors = numpy.zeros(numRealisations)
        
        sampleMethod = Sampling.crossValidation
        
        #Setting maxDepth = 50 and minSplit = 5 does not affect the results 
        numProcesses = multiprocessing.cpu_count()
        learner = DecisionTreeLearner(criterion="mse", maxDepth=100, minSplit=1, pruneType="CART", processes=numProcesses)
        learner.setChunkSize(3)
        
        paramDict = {} 
        paramDict["setGamma"] = numpy.array([31], dtype=numpy.int)
        numParams = paramDict["setGamma"].shape[0]
        
        alpha = 1.0
        folds = 4
        numRealisations = 10
        
        Cvs = numpy.array([folds-1])*alpha
        
        
        meanAllErrors = numpy.zeros(numParams) 
        meanTrainError = numpy.zeros(numParams)
Example #17
            else: 
                y[i] = 0.38 

y += numpy.random.randn(numExamples)*noise

numTrainExamples = int(numExamples*0.1) 
numValidExamples = int(numExamples*0.1)

trainX = X[0:numTrainExamples, :]
trainY = y[0:numTrainExamples]
validX = X[numTrainExamples:numTrainExamples+numValidExamples, :]
validY = y[numTrainExamples:numTrainExamples+numValidExamples]
testX = X[numTrainExamples+numValidExamples:, :]
testY = y[numTrainExamples+numValidExamples:]

learner = DecisionTreeLearner(minSplit=1, maxDepth=50)
learner.learnModel(trainX, trainY)


#Seems to be optimal 
alphaThreshold = 100.0
learner.setAlphaThreshold(alphaThreshold)
learner.repPrune(validX, validY)
#learner.tree = learner.tree.cut(3)

predY = learner.predict(testX)

plt.figure(0)
plt.scatter(testX[:, 0], testX[:, 1], c=testY, s=50, vmin=0, vmax=1)
plt.colorbar()