Example #1
0
 def testFindBestSplitRisk(self):
     """
     Exercise findBestSplitRisk on two datasets: one where feature 2
     perfectly separates the two classes at value 10, and one of uniform
     random values. Results are printed rather than asserted.
     """
     minSplit = 1
     numExamples = 20
     numFeatures = 10
     X = numpy.zeros((numExamples, numFeatures))
     # Bug fix: numpy.int was removed in NumPy 1.24 -- use the builtin int,
     # which NumPy maps to its default integer dtype.
     y = numpy.ones(numExamples, int)

     # Feature 2 takes values 0..9 for the first class and 10..19 for the
     # second, so a threshold of 10 splits the labels exactly.
     X[0:10, 2] = numpy.arange(10)
     X[10:, 2] = numpy.arange(10) + 10
     y[0:10] = -1

     y += 1

     nodeInds = numpy.arange(X.shape[0])

     def computeArgsort(Z):
         # Rank of each element within its column (argsort of argsort).
         ranks = numpy.zeros(Z.shape, int)
         for i in range(Z.shape[1]):
             ranks[:, i] = numpy.argsort(numpy.argsort(Z[:, i]))
         return ranks

     argsortX = computeArgsort(X)

     errors, thresholds = findBestSplitRisk(minSplit, X, y, nodeInds, argsortX)
     print(errors, thresholds)

     # Bug fix: recompute argsortX for the new random matrix. The original
     # code reused the ranks of the old X, so the second call operated on
     # inconsistent data.
     X = numpy.random.rand(numExamples, numFeatures)
     argsortX = computeArgsort(X)
     errors, thresholds = findBestSplitRisk(minSplit, X, y, nodeInds, argsortX)
     print(errors, thresholds)
    def growTree(self, X, y, argsortX, startId):
        """
        Grow a tree using a stack. Given a sample of data and a node index, we
        find the best split and add children to the tree accordingly. We
        perform pre-pruning based on the penalty.

        :param X: data matrix (examples x features)
        :param y: label vector aligned with the rows of X
        :param argsortX: per-column ranks of X (argsort of argsort), passed
            through to findBestSplitRisk
        :param startId: id of the tree vertex to start growing from
        """
        eps = 10**-4
        idStack = [startId]

        while len(idStack) != 0:
            nodeId = idStack.pop()
            node = self.tree.getVertex(nodeId)
            accuracies, thresholds = findBestSplitRisk(self.minSplit, X, y,
                                                       node.getTrainInds(),
                                                       argsortX)

            # Choose best feature based on gains; eps lets randomChoice
            # break ties between equally good features.
            accuracies += eps
            bestFeatureInd = Util.randomChoice(accuracies)[0]
            bestThreshold = thresholds[bestFeatureInd]

            # Boolean mask over this node's training examples; indexing
            # nodeInds directly with the mask is equivalent to the former
            # nodeInds[numpy.arange(n)[mask]] double-indexing.
            nodeInds = node.getTrainInds()
            featureVals = X[:, bestFeatureInd][nodeInds]
            bestLeftInds = numpy.sort(nodeInds[featureVals < bestThreshold])
            bestRightInds = numpy.sort(nodeInds[featureVals >= bestThreshold])

            # The split may have 0 items in one set, so don't split.
            # Bug fix: test the number of indices, not their sum -- a child
            # holding only example index 0 sums to 0 yet is non-empty.
            # NOTE(review): self.tree.depth() looks like the depth of the
            # whole tree, not of this node -- confirm that is intended.
            if (bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0
                    and self.tree.depth() < self.maxDepth):
                node.setError(1 - accuracies[bestFeatureInd])
                node.setFeatureInd(bestFeatureInd)
                node.setThreshold(bestThreshold)

                leftChildId = self.getLeftChildId(nodeId)
                leftChild = DecisionNode(bestLeftInds,
                                         Util.mode(y[bestLeftInds]))
                self.tree.addChild(nodeId, leftChildId, leftChild)

                # Only recurse into children large enough to split again.
                if leftChild.getTrainInds().shape[0] >= self.minSplit:
                    idStack.append(leftChildId)

                rightChildId = self.getRightChildId(nodeId)
                rightChild = DecisionNode(bestRightInds,
                                          Util.mode(y[bestRightInds]))
                self.tree.addChild(nodeId, rightChildId, rightChild)

                if rightChild.getTrainInds().shape[0] >= self.minSplit:
                    idStack.append(rightChildId)
 def growTree(self, X, y, argsortX, startId):
     """
     Grow a tree using a stack. Given a sample of data and a node index, we
     find the best split and add children to the tree accordingly. We perform
     pre-pruning based on the penalty.

     :param X: data matrix (examples x features)
     :param y: label vector aligned with the rows of X
     :param argsortX: per-column ranks of X, forwarded to findBestSplitRisk
     :param startId: id of the tree vertex to start growing from
     """
     eps = 10**-4
     idStack = [startId]

     while len(idStack) != 0:
         nodeId = idStack.pop()
         node = self.tree.getVertex(nodeId)
         accuracies, thresholds = findBestSplitRisk(self.minSplit, X, y, node.getTrainInds(), argsortX)

         # Choose best feature based on gains; eps allows randomChoice to
         # break ties between equally accurate features.
         accuracies += eps
         bestFeatureInd = Util.randomChoice(accuracies)[0]
         bestThreshold = thresholds[bestFeatureInd]

         nodeInds = node.getTrainInds()
         featureVals = X[:, bestFeatureInd][nodeInds]
         # Boolean-mask indexing is equivalent to the previous
         # nodeInds[numpy.arange(n)[mask]] form, without the extra array.
         bestLeftInds = numpy.sort(nodeInds[featureVals < bestThreshold])
         bestRightInds = numpy.sort(nodeInds[featureVals >= bestThreshold])

         # The split may have 0 items in one set, so don't split.
         # Bug fix: check the count of indices rather than their sum; an
         # index set containing only example 0 has sum 0 but is non-empty.
         if bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0 and self.tree.depth() < self.maxDepth:
             node.setError(1 - accuracies[bestFeatureInd])
             node.setFeatureInd(bestFeatureInd)
             node.setThreshold(bestThreshold)

             leftChildId = self.getLeftChildId(nodeId)
             leftChild = DecisionNode(bestLeftInds, Util.mode(y[bestLeftInds]))
             self.tree.addChild(nodeId, leftChildId, leftChild)

             if leftChild.getTrainInds().shape[0] >= self.minSplit:
                 idStack.append(leftChildId)

             rightChildId = self.getRightChildId(nodeId)
             rightChild = DecisionNode(bestRightInds, Util.mode(y[bestRightInds]))
             self.tree.addChild(nodeId, rightChildId, rightChild)

             if rightChild.getTrainInds().shape[0] >= self.minSplit:
                 idStack.append(rightChildId)