Example 1
 def recursiveSplit(self, X, y, argsortX, nodeId): 
     """
     Given a sample of data and a node index, find the best split and
     add children to the tree accordingly.
     """
     if len(nodeId)-1 >= self.maxDepth: 
         return 
     
     node = self.tree.getVertex(nodeId)
     bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit(self.minSplit, X, y, node.getTrainInds(), argsortX)
 
     #The split may leave one side empty; these are index arrays, so check
     #their length rather than their sum before splitting
     if bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0: 
         node.setError(bestError)
         node.setFeatureInd(bestFeatureInd)
         node.setThreshold(bestThreshold)
         
         leftChildId = self.getLeftChildId(nodeId)
         leftChild = DecisionNode(bestLeftInds, y[bestLeftInds].mean())
         self.tree.addChild(nodeId, leftChildId, leftChild)
         
         if leftChild.getTrainInds().shape[0] >= self.minSplit: 
             self.recursiveSplit(X, y, argsortX, leftChildId)
         
         rightChildId = self.getRightChildId(nodeId)
         rightChild = DecisionNode(bestRightInds, y[bestRightInds].mean())
         self.tree.addChild(nodeId, rightChildId, rightChild)
         
         if rightChild.getTrainInds().shape[0] >= self.minSplit: 
             self.recursiveSplit(X, y, argsortX, rightChildId)
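
The getLeftChildId and getRightChildId helpers are not shown here, but the test
code further down (which prunes and compares the vertices (0, ), (0, 0) and
(0, 1)) suggests node ids are path tuples, with 0 appended for a left child and
1 for a right child, so that len(nodeId) - 1 in the depth check above equals
the node's depth. A minimal sketch under that assumption:

    def getLeftChildId(self, nodeId):
        #Hypothetical: extend the path tuple, e.g. (0, ) -> (0, 0)
        return nodeId + (0, )

    def getRightChildId(self, nodeId):
        #Hypothetical: extend the path tuple, e.g. (0, ) -> (0, 1)
        return nodeId + (1, )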
Example 2
    def learnModel(self, X, y):
        nodeId = (0, )
        self.tree = DictTree()
        rootNode = DecisionNode(numpy.arange(X.shape[0]), y.mean())
        self.tree.setVertex(nodeId, rootNode)

        #Compute the within-column ranks of X: argsort applied twice gives
        #each element's position in its sorted column (used by recursiveSplit)
        argsortX = numpy.zeros(X.shape, numpy.int64)
        for i in range(X.shape[1]):
            argsortX[:, i] = numpy.argsort(X[:, i])
            argsortX[:, i] = numpy.argsort(argsortX[:, i])

        self.growSkLearn(X, y)
        #self.recursiveSplit(X, y, argsortX, nodeId)
        self.unprunedTreeSize = self.tree.size

        if self.pruneType == "REP":
            #Note: this should use a separate validation set
            self.repPrune(X, y)
        elif self.pruneType == "REP-CV":
            self.cvPrune(X, y)
        elif self.pruneType == "CART":
            self.cartPrune(X, y)
        elif self.pruneType == "none":
            pass
        else:
            raise ValueError("Unknown pruning type " + self.pruneType)
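
The double argsort above is a standard rank trick: the first argsort gives the
ordering of a column, and argsorting that ordering gives each element's rank.
A short self-contained illustration:

    import numpy

    x = numpy.array([0.9, 0.1, 0.3])
    order = numpy.argsort(x)      #[1, 2, 0]: indices that would sort x
    ranks = numpy.argsort(order)  #[2, 0, 1]: rank of each element of x
    assert (numpy.sort(x)[ranks] == x).all()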
Example 3
 def growTree(self, X, y, argsortX, startId): 
     """
     Grow a tree using a stack. Given a sample of data and a node index, we
     find the best split and add children to the tree accordingly. We perform
     pre-pruning based on the penalty.
     """
     eps = 10**-4 
     idStack = [startId]
     
     while len(idStack) != 0: 
         nodeId = idStack.pop()
         node = self.tree.getVertex(nodeId)
         accuracies, thresholds = findBestSplitRisk(self.minSplit, X, y, node.getTrainInds(), argsortX)
     
         #Choose best feature based on gains 
         accuracies += eps 
         bestFeatureInd = Util.randomChoice(accuracies)[0]
         bestThreshold = thresholds[bestFeatureInd]
     
         nodeInds = node.getTrainInds()
         bestLeftInds = numpy.sort(nodeInds[X[nodeInds, bestFeatureInd] < bestThreshold])
         bestRightInds = numpy.sort(nodeInds[X[nodeInds, bestFeatureInd] >= bestThreshold])
         
         #The split may leave one side empty; these are index arrays, so check
         #their length rather than their sum before splitting
         if bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0 and self.tree.depth() < self.maxDepth: 
             node.setError(1-accuracies[bestFeatureInd])
             node.setFeatureInd(bestFeatureInd)
             node.setThreshold(bestThreshold)            
                         
             leftChildId = self.getLeftChildId(nodeId)
             leftChild = DecisionNode(bestLeftInds, Util.mode(y[bestLeftInds]))
             self.tree.addChild(nodeId, leftChildId, leftChild)
             
             if leftChild.getTrainInds().shape[0] >= self.minSplit: 
                 idStack.append(leftChildId)
             
             rightChildId = self.getRightChildId(nodeId)
             rightChild = DecisionNode(bestRightInds, Util.mode(y[bestRightInds]))
             self.tree.addChild(nodeId, rightChildId, rightChild)
             
             if rightChild.getTrainInds().shape[0] >= self.minSplit: 
                 idStack.append(rightChildId)
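
Util.randomChoice is not shown in this example; from its use above it appears
to draw a feature index with probability proportional to the (eps-shifted)
accuracies, randomising the otherwise greedy split choice. A minimal sketch
under that assumption:

    import numpy

    def randomChoice(weights):
        #Hypothetical stand-in for Util.randomChoice: sample one index with
        #probability proportional to the non-negative entries of weights
        probs = weights / weights.sum()
        return numpy.random.choice(weights.shape[0], size=1, p=probs)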
Example 4
    def recursiveSplit(self, X, y, argsortX, nodeId):
        """
        Given a sample of data and a node index, find the best split and
        add children to the tree accordingly.
        """
        if len(nodeId) - 1 >= self.maxDepth:
            return

        node = self.tree.getVertex(nodeId)
        bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit(
            self.minSplit, X, y, node.getTrainInds(), argsortX)

        #The split may leave one side empty; these are index arrays, so check
        #their length rather than their sum before splitting
        if bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0:
            node.setError(bestError)
            node.setFeatureInd(bestFeatureInd)
            node.setThreshold(bestThreshold)

            leftChildId = self.getLeftChildId(nodeId)
            leftChild = DecisionNode(bestLeftInds, y[bestLeftInds].mean())
            self.tree.addChild(nodeId, leftChildId, leftChild)

            if leftChild.getTrainInds().shape[0] >= self.minSplit:
                self.recursiveSplit(X, y, argsortX, leftChildId)

            rightChildId = self.getRightChildId(nodeId)
            rightChild = DecisionNode(bestRightInds, y[bestRightInds].mean())
            self.tree.addChild(nodeId, rightChildId, rightChild)

            if rightChild.getTrainInds().shape[0] >= self.minSplit:
                self.recursiveSplit(X, y, argsortX, rightChildId)
Example 5
    def learnModel(self, X, y):
        if numpy.unique(y).shape[0] != 2:
            raise ValueError("Must provide binary labels")
        if not numpy.issubdtype(y.dtype, numpy.integer):
            raise ValueError("Labels must be integers")

        self.shapeX = X.shape
        #Compute the within-column ranks of X (argsort applied twice)
        argsortX = numpy.zeros(X.shape, numpy.int64)
        for i in range(X.shape[1]):
            argsortX[:, i] = numpy.argsort(X[:, i])
            argsortX[:, i] = numpy.argsort(argsortX[:, i])

        rootId = (0, )
        idStack = [rootId]
        self.tree = DictTree()
        rootNode = DecisionNode(numpy.arange(X.shape[0]), Util.mode(y))
        self.tree.setVertex(rootId, rootNode)
        bestError = float("inf")
        bestTree = self.tree

        #First grow a selection of trees

        while len(idStack) != 0:
            #Prune the current node away and grow from that node
            nodeId = idStack.pop()

            for i in range(self.sampleSize):
                self.tree = bestTree.deepCopy()
                try:
                    node = self.tree.getVertex(nodeId)
                except ValueError:
                    print(nodeId)
                    print(self.tree)
                    raise

                self.tree.pruneVertex(nodeId)
                self.growTree(X, y, argsortX, nodeId)
                self.prune(X, y)
                error = self.treeObjective(X, y)

                if error < bestError:
                    bestError = error
                    bestTree = self.tree.deepCopy()

            children = bestTree.children(nodeId)
            idStack.extend(children)

        self.tree = bestTree
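
treeObjective is not shown in this example; in penalty-based trees it is
typically the empirical error plus a complexity term weighted by gamma. A rough
sketch of that pattern (the penalty form here is illustrative, not necessarily
the one PenaltyDecisionTree uses):

    def treeObjective(self, X, y):
        #Hypothetical: penalised empirical error; larger gamma favours
        #smaller trees. The real penalty may differ, e.g. sqrt of tree size.
        predY = self.predict(X)
        error = (predY != y).mean()
        return error + self.gamma * self.tree.getNumVertices()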
Example 6
    def testPrune(self):
        startId = (0, )
        minSplit = 20
        maxDepth = 5
        gamma = 0.05
        learner = PenaltyDecisionTree(minSplit=minSplit,
                                      maxDepth=maxDepth,
                                      gamma=gamma,
                                      pruning=False)

        trainX = self.X[100:, :]
        trainY = self.y[100:]
        testX = self.X[0:100, :]
        testY = self.y[0:100]

        argsortX = numpy.zeros(trainX.shape, numpy.int64)
        for i in range(trainX.shape[1]):
            argsortX[:, i] = numpy.argsort(trainX[:, i])
            argsortX[:, i] = numpy.argsort(argsortX[:, i])

        learner.tree = DictTree()
        rootNode = DecisionNode(numpy.arange(trainX.shape[0]),
                                Util.mode(trainY))
        learner.tree.setVertex(startId, rootNode)
        learner.growTree(trainX, trainY, argsortX, startId)
        learner.shapeX = trainX.shape
        learner.predict(trainX, trainY)
        learner.computeAlphas()

        obj1 = learner.treeObjective(trainX, trainY)
        size1 = learner.tree.getNumVertices()

        #Now we'll prune
        learner.prune(trainX, trainY)

        obj2 = learner.treeObjective(trainX, trainY)
        size2 = learner.tree.getNumVertices()

        self.assertTrue(obj1 >= obj2)
        self.assertTrue(size1 >= size2)

        #Check there are no nodes with alpha>alphaThreshold
        for vertexId in learner.tree.getAllVertexIds():
            self.assertTrue(
                learner.tree.getVertex(vertexId).alpha <=
                learner.alphaThreshold)
Example 7
    def growTree(self, X, y, argsortX, startId):
        """
        Grow a tree using a stack. Given a sample of data and a node index, we
        find the best split and add children to the tree accordingly. We perform
        pre-pruning based on the penalty.
        """
        eps = 10**-4
        idStack = [startId]

        while len(idStack) != 0:
            nodeId = idStack.pop()
            node = self.tree.getVertex(nodeId)
            accuracies, thresholds = findBestSplitRisk(self.minSplit, X, y,
                                                       node.getTrainInds(),
                                                       argsortX)

            #Choose best feature based on gains
            accuracies += eps
            bestFeatureInd = Util.randomChoice(accuracies)[0]
            bestThreshold = thresholds[bestFeatureInd]

            nodeInds = node.getTrainInds()
            bestLeftInds = numpy.sort(
                nodeInds[X[nodeInds, bestFeatureInd] < bestThreshold])
            bestRightInds = numpy.sort(
                nodeInds[X[nodeInds, bestFeatureInd] >= bestThreshold])

            #The split may leave one side empty; these are index arrays, so
            #check their length rather than their sum before splitting
            if (bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0
                    and self.tree.depth() < self.maxDepth):
                node.setError(1 - accuracies[bestFeatureInd])
                node.setFeatureInd(bestFeatureInd)
                node.setThreshold(bestThreshold)

                leftChildId = self.getLeftChildId(nodeId)
                leftChild = DecisionNode(bestLeftInds,
                                         Util.mode(y[bestLeftInds]))
                self.tree.addChild(nodeId, leftChildId, leftChild)

                if leftChild.getTrainInds().shape[0] >= self.minSplit:
                    idStack.append(leftChildId)

                rightChildId = self.getRightChildId(nodeId)
                rightChild = DecisionNode(bestRightInds,
                                          Util.mode(y[bestRightInds]))
                self.tree.addChild(nodeId, rightChildId, rightChild)

                if rightChild.getTrainInds().shape[0] >= self.minSplit:
                    idStack.append(rightChildId)
Example 8
    def testGrowTree(self):
        startId = (0, )
        minSplit = 20
        maxDepth = 3
        gamma = 0.01
        learner = PenaltyDecisionTree(minSplit=minSplit,
                                      maxDepth=maxDepth,
                                      gamma=gamma,
                                      pruning=False)

        trainX = self.X[100:, :]
        trainY = self.y[100:]
        testX = self.X[0:100, :]
        testY = self.y[0:100]

        argsortX = numpy.zeros(trainX.shape, numpy.int64)
        for i in range(trainX.shape[1]):
            argsortX[:, i] = numpy.argsort(trainX[:, i])
            argsortX[:, i] = numpy.argsort(argsortX[:, i])

        learner.tree = DictTree()
        rootNode = DecisionNode(numpy.arange(trainX.shape[0]),
                                Util.mode(trainY))
        learner.tree.setVertex(startId, rootNode)

        #Note that this matches with the case where we create a new tree each time
        numpy.random.seed(21)
        bestError = float("inf")

        for i in range(20):
            learner.tree.pruneVertex(startId)
            learner.growTree(trainX, trainY, argsortX, startId)

            predTestY = learner.predict(testX)
            error = Evaluator.binaryError(predTestY, testY)
            #print(Evaluator.binaryError(predTestY, testY), learner.tree.getNumVertices())

            if error < bestError:
                bestError = error
                bestTree = learner.tree.copy()

            self.assertTrue(learner.tree.depth() <= maxDepth)

            for vertexId in learner.tree.nonLeaves():
                self.assertTrue(
                    learner.tree.getVertex(vertexId).getTrainInds().shape[0] >=
                    minSplit)

        bestError1 = bestError
        learner.tree = bestTree

        #Now we test growing a tree from a non-root vertex
        numpy.random.seed(21)
        for i in range(20):
            learner.tree.pruneVertex((0, 1))
            learner.growTree(trainX, trainY, argsortX, (0, 1))

            self.assertTrue(
                learner.tree.getVertex((0, )) == bestTree.getVertex((0, )))
            self.assertTrue(
                learner.tree.getVertex((0, 0)) == bestTree.getVertex((0, 0)))

            predTestY = learner.predict(testX)
            error = Evaluator.binaryError(predTestY, testY)

            if error < bestError:
                bestError = error
                bestTree = learner.tree.copy()
            #print(Evaluator.binaryError(predTestY, testY), learner.tree.getNumVertices())
        self.assertTrue(bestError1 >= bestError)
Example 9
    def growSkLearn(self, X, y):
        """
        Grow a decision tree from sklearn. 
        """

        from sklearn.tree import DecisionTreeRegressor
        regressor = DecisionTreeRegressor(max_depth=self.maxDepth,
                                          min_samples_split=self.minSplit)
        regressor.fit(X, y)

        #Convert the sklearn tree into our tree
        nodeId = (0, )
        nodeStack = [(nodeId, 0)]

        node = DecisionNode(numpy.arange(X.shape[0]), regressor.tree_.value[0])
        self.tree.setVertex(nodeId, node)

        while len(nodeStack) != 0:
            nodeId, nodeInd = nodeStack.pop()

            node = self.tree.getVertex(nodeId)
            #tree_.best_error was removed from sklearn; tree_.impurity (the
            #node's impurity) is the closest available substitute
            node.setError(regressor.tree_.impurity[nodeInd])
            node.setFeatureInd(regressor.tree_.feature[nodeInd])
            node.setThreshold(regressor.tree_.threshold[nodeInd])

            if regressor.tree_.children_left[nodeInd] != -1:
                leftChildInds = node.getTrainInds()[
                    X[node.getTrainInds(),
                      node.getFeatureInd()] < node.getThreshold()]
                leftChildId = self.getLeftChildId(nodeId)
                leftChild = DecisionNode(
                    leftChildInds,
                    regressor.tree_.value[regressor.tree_.children_left[nodeInd]])
                self.tree.addChild(nodeId, leftChildId, leftChild)
                nodeStack.append((leftChildId,
                                  regressor.tree_.children_left[nodeInd]))

            if regressor.tree_.children_right[nodeInd] != -1:
                rightChildInds = node.getTrainInds()[
                    X[node.getTrainInds(),
                      node.getFeatureInd()] >= node.getThreshold()]
                rightChildId = self.getRightChildId(nodeId)
                rightChild = DecisionNode(
                    rightChildInds,
                    regressor.tree_.value[regressor.tree_.children_right[nodeInd]])
                self.tree.addChild(nodeId, rightChildId, rightChild)
                nodeStack.append((rightChildId,
                                  regressor.tree_.children_right[nodeInd]))
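
A quick way to sanity-check this conversion is to walk the fitted sklearn tree
directly with the same children_left/children_right arrays. A self-contained
sketch on hypothetical random data:

    import numpy
    from sklearn.tree import DecisionTreeRegressor

    X = numpy.random.rand(100, 3)
    y = numpy.random.rand(100)
    regressor = DecisionTreeRegressor(max_depth=3, min_samples_split=10)
    regressor.fit(X, y)

    stack = [0]
    while len(stack) != 0:
        nodeInd = stack.pop()
        left = regressor.tree_.children_left[nodeInd]
        right = regressor.tree_.children_right[nodeInd]
        if left != -1:
            #Internal node: push both children (-1 marks a leaf)
            stack.extend([left, right])
        else:
            print(nodeInd, regressor.tree_.value[nodeInd])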
Example 10
 def growSkLearn(self, X, y): 
     """
     Grow a decision tree from sklearn. 
     """
     
     from sklearn.tree import DecisionTreeRegressor
     regressor = DecisionTreeRegressor(max_depth = self.maxDepth, min_samples_split=self.minSplit)
     regressor.fit(X, y)
     
     #Convert the sklearn tree into our tree 
     nodeId = (0, )          
     nodeStack = [(nodeId, 0)] 
     
     node = DecisionNode(numpy.arange(X.shape[0]), regressor.tree_.value[0])
     self.tree.setVertex(nodeId, node)
     
     while len(nodeStack) != 0: 
         nodeId, nodeInd = nodeStack.pop()
         
         node = self.tree.getVertex(nodeId)
         #tree_.best_error was removed from sklearn; tree_.impurity (the
         #node's impurity) is the closest available substitute
         node.setError(regressor.tree_.impurity[nodeInd])
         node.setFeatureInd(regressor.tree_.feature[nodeInd])
         node.setThreshold(regressor.tree_.threshold[nodeInd])
             
         if regressor.tree_.children_left[nodeInd] != -1: 
             leftChildInds = node.getTrainInds()[X[node.getTrainInds(), node.getFeatureInd()] < node.getThreshold()] 
             leftChildId = self.getLeftChildId(nodeId)
             leftChild = DecisionNode(leftChildInds, regressor.tree_.value[regressor.tree_.children_left[nodeInd]])
             self.tree.addChild(nodeId, leftChildId, leftChild)
             nodeStack.append((leftChildId, regressor.tree_.children_left[nodeInd]))
             
         if regressor.tree_.children_right[nodeInd] != -1: 
             rightChildInds = node.getTrainInds()[X[node.getTrainInds(), node.getFeatureInd()] >= node.getThreshold()]
             rightChildId = self.getRightChildId(nodeId)
             rightChild = DecisionNode(rightChildInds, regressor.tree_.value[regressor.tree_.children_right[nodeInd]])
             self.tree.addChild(nodeId, rightChildId, rightChild)
             nodeStack.append((rightChildId, regressor.tree_.children_right[nodeInd]))