def recursiveSplit(self, X, y, argsortX, nodeId):
    """
    Given a sample of data and a node id, find the best split for that node
    and recursively add children to the tree accordingly.

    :param X: 2D array of examples (rows) by features (columns).
    :param y: 1D array of real-valued targets (children predict the mean).
    :param argsortX: per-column rank array of X, precomputed by the caller.
    :param nodeId: tuple id of the node to split; its length encodes depth.
    """
    # The node id tuple grows by one element per level, so its length
    # minus one is the node's depth in the tree.
    if len(nodeId) - 1 >= self.maxDepth:
        return

    node = self.tree.getVertex(nodeId)
    bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = \
        findBestSplit(self.minSplit, X, y, node.getTrainInds(), argsortX)

    # Only split when both children receive at least one example.
    # Fixed: the original tested .sum() != 0, which wrongly rejects a
    # child whose only training index is 0 (sum of indices == 0).
    if bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0:
        node.setError(bestError)
        node.setFeatureInd(bestFeatureInd)
        node.setThreshold(bestThreshold)

        # Create both children symmetrically; recurse only into children
        # large enough to be split again.
        childPairs = ((bestLeftInds, self.getLeftChildId(nodeId)),
                      (bestRightInds, self.getRightChildId(nodeId)))
        for childInds, childId in childPairs:
            child = DecisionNode(childInds, y[childInds].mean())
            self.tree.addChild(nodeId, childId, child)
            if child.getTrainInds().shape[0] >= self.minSplit:
                self.recursiveSplit(X, y, argsortX, childId)
def learnModel(self, X, y):
    """
    Learn a regression tree on (X, y), then prune it according to
    self.pruneType ("REP", "REP-CV", "CART" or "none").

    :param X: 2D array of examples by features.
    :param y: 1D array of real-valued targets (root predicts y.mean()).
    :raise ValueError: if self.pruneType is not a recognised pruning type.
    """
    nodeId = (0, )
    self.tree = DictTree()
    rootNode = DecisionNode(numpy.arange(X.shape[0]), y.mean())
    self.tree.setVertex(nodeId, rootNode)

    # We compute a sorted version of X: applying argsort twice gives the
    # rank of each element within its column.
    # Fixed: numpy.int (an alias for builtin int) was removed in NumPy 1.24.
    argsortX = numpy.zeros(X.shape, int)
    for i in range(X.shape[1]):
        argsortX[:, i] = numpy.argsort(X[:, i])
        argsortX[:, i] = numpy.argsort(argsortX[:, i])

    # The tree is grown via sklearn; the in-house recursive splitter is
    # kept here disabled (it would use argsortX).
    self.growSkLearn(X, y)
    #self.recursiveSplit(X, y, argsortX, nodeId)
    self.unprunedTreeSize = self.tree.size

    if self.pruneType == "REP":
        # Note: This should be a seperate validation set
        self.repPrune(X, y)
    elif self.pruneType == "REP-CV":
        self.cvPrune(X, y)
    elif self.pruneType == "CART":
        self.cartPrune(X, y)
    elif self.pruneType == "none":
        pass
    else:
        raise ValueError("Unknown pruning type " + self.pruneType)
def growTree(self, X, y, argsortX, startId):
    """
    Grow a tree iteratively using a stack of node ids, starting from
    startId. For each node we score candidate features with
    findBestSplitRisk and pick one at random weighted by its accuracy;
    we perform pre-pruning based on the penalty.

    :param X: 2D array of examples by features.
    :param y: 1D array of class labels (children predict the mode).
    :param argsortX: per-column rank array of X, precomputed by the caller.
    :param startId: tuple id of the node to grow from.
    """
    # Small additive constant so zero-accuracy features remain selectable
    # by the weighted random choice below.
    eps = 10**-4
    idStack = [startId]

    while len(idStack) != 0:
        nodeId = idStack.pop()
        node = self.tree.getVertex(nodeId)
        accuracies, thresholds = findBestSplitRisk(self.minSplit, X, y,
                                                   node.getTrainInds(),
                                                   argsortX)

        # Choose best feature randomly, weighted by accuracy gain.
        accuracies += eps
        bestFeatureInd = Util.randomChoice(accuracies)[0]
        bestThreshold = thresholds[bestFeatureInd]

        # Partition this node's training indices by the chosen threshold.
        # Simplified from nodeInds[arange(n)[mask]], which is equivalent
        # to nodeInds[mask] for a boolean mask of length n.
        nodeInds = node.getTrainInds()
        featureVals = X[nodeInds, bestFeatureInd]
        bestLeftInds = numpy.sort(nodeInds[featureVals < bestThreshold])
        bestRightInds = numpy.sort(nodeInds[featureVals >= bestThreshold])

        # Only split when both children are non-empty and depth allows it.
        # Fixed: the original tested .sum() != 0, which wrongly rejects a
        # child whose only training index is 0 (sum of indices == 0).
        if bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0 \
                and self.tree.depth() < self.maxDepth:
            node.setError(1 - accuracies[bestFeatureInd])
            node.setFeatureInd(bestFeatureInd)
            node.setThreshold(bestThreshold)

            # Add both children; push onto the stack only those large
            # enough to be split again (left pushed first, as before).
            childPairs = ((bestLeftInds, self.getLeftChildId(nodeId)),
                          (bestRightInds, self.getRightChildId(nodeId)))
            for childInds, childId in childPairs:
                child = DecisionNode(childInds, Util.mode(y[childInds]))
                self.tree.addChild(nodeId, childId, child)
                if child.getTrainInds().shape[0] >= self.minSplit:
                    idStack.append(childId)
def recursiveSplit(self, X, y, argsortX, nodeId):
    """
    Given a sample of data and a node id, find the best split for that node
    and recursively add children to the tree accordingly.

    :param X: 2D array of examples (rows) by features (columns).
    :param y: 1D array of real-valued targets (children predict the mean).
    :param argsortX: per-column rank array of X, precomputed by the caller.
    :param nodeId: tuple id of the node to split; its length encodes depth.
    """
    # The node id tuple grows by one element per level, so its length
    # minus one is the node's depth in the tree.
    if len(nodeId) - 1 >= self.maxDepth:
        return

    node = self.tree.getVertex(nodeId)
    bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = \
        findBestSplit(self.minSplit, X, y, node.getTrainInds(), argsortX)

    # Only split when both children receive at least one example.
    # Fixed: the original tested .sum() != 0, which wrongly rejects a
    # child whose only training index is 0 (sum of indices == 0).
    if bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0:
        node.setError(bestError)
        node.setFeatureInd(bestFeatureInd)
        node.setThreshold(bestThreshold)

        # Create both children symmetrically; recurse only into children
        # large enough to be split again.
        childPairs = ((bestLeftInds, self.getLeftChildId(nodeId)),
                      (bestRightInds, self.getRightChildId(nodeId)))
        for childInds, childId in childPairs:
            child = DecisionNode(childInds, y[childInds].mean())
            self.tree.addChild(nodeId, childId, child)
            if child.getTrainInds().shape[0] >= self.minSplit:
                self.recursiveSplit(X, y, argsortX, childId)
def learnModel(self, X, y):
    """
    Learn a penalty decision tree on binary-labelled data by repeatedly
    pruning and regrowing subtrees, keeping the tree with the lowest
    objective.

    :param X: 2D array of examples by features.
    :param y: 1D integer array with exactly two distinct label values.
    :raise ValueError: if y is not binary or not of integer dtype.
    """
    if numpy.unique(y).shape[0] != 2:
        raise ValueError("Must provide binary labels")
    # Fixed: numpy.int was removed in NumPy 1.24; accept any integer
    # dtype (int32, int64, ...) rather than only the platform int.
    if not numpy.issubdtype(y.dtype, numpy.integer):
        raise ValueError("Labels must be integers")
    self.shapeX = X.shape

    # Rank of each element within its column: argsort applied twice.
    # Fixed: numpy.int (alias of builtin int) removed in NumPy 1.24.
    argsortX = numpy.zeros(X.shape, int)
    for i in range(X.shape[1]):
        argsortX[:, i] = numpy.argsort(X[:, i])
        argsortX[:, i] = numpy.argsort(argsortX[:, i])

    rootId = (0, )
    idStack = [rootId]
    self.tree = DictTree()
    rootNode = DecisionNode(numpy.arange(X.shape[0]), Util.mode(y))
    self.tree.setVertex(rootId, rootNode)
    bestError = float("inf")
    bestTree = self.tree

    # First grow a selection of trees: for each visited node, prune the
    # subtree rooted there and regrow it self.sampleSize times, keeping
    # whichever full tree achieves the lowest objective.
    while len(idStack) != 0:
        nodeId = idStack.pop()

        for i in range(self.sampleSize):
            self.tree = bestTree.deepCopy()
            try:
                # Accessed only to fail fast (with diagnostics) if the id
                # is missing from the copied tree.
                node = self.tree.getVertex(nodeId)
            except ValueError:
                print(nodeId)
                print(self.tree)
                raise
            self.tree.pruneVertex(nodeId)
            self.growTree(X, y, argsortX, nodeId)
            self.prune(X, y)
            error = self.treeObjective(X, y)

            if error < bestError:
                bestError = error
                bestTree = self.tree.deepCopy()

        # Continue the search from the children of the best tree so far.
        children = bestTree.children(nodeId)
        idStack.extend(children)

    self.tree = bestTree
def testPrune(self):
    """
    Check that pruning never increases the tree objective or the tree
    size, and that no remaining node has alpha above alphaThreshold.
    """
    startId = (0, )
    minSplit = 20
    maxDepth = 5
    gamma = 0.05
    learner = PenaltyDecisionTree(minSplit=minSplit,
                                  maxDepth=maxDepth,
                                  gamma=gamma,
                                  pruning=False)
    trainX = self.X[100:, :]
    trainY = self.y[100:]

    # Rank of each element within its column: argsort applied twice.
    # Fixed: numpy.int (alias of builtin int) removed in NumPy 1.24.
    argsortX = numpy.zeros(trainX.shape, int)
    for i in range(trainX.shape[1]):
        argsortX[:, i] = numpy.argsort(trainX[:, i])
        argsortX[:, i] = numpy.argsort(argsortX[:, i])

    learner.tree = DictTree()
    rootNode = DecisionNode(numpy.arange(trainX.shape[0]),
                            Util.mode(trainY))
    learner.tree.setVertex(startId, rootNode)
    learner.growTree(trainX, trainY, argsortX, startId)
    learner.shapeX = trainX.shape
    learner.predict(trainX, trainY)
    learner.computeAlphas()

    obj1 = learner.treeObjective(trainX, trainY)
    size1 = learner.tree.getNumVertices()

    # Now we'll prune: objective and size must not increase.
    learner.prune(trainX, trainY)
    obj2 = learner.treeObjective(trainX, trainY)
    size2 = learner.tree.getNumVertices()

    self.assertTrue(obj1 >= obj2)
    self.assertTrue(size1 >= size2)

    # Check there are no nodes with alpha > alphaThreshold.
    for vertexId in learner.tree.getAllVertexIds():
        self.assertTrue(
            learner.tree.getVertex(vertexId).alpha <= learner.alphaThreshold)
def growTree(self, X, y, argsortX, startId):
    """
    Grow a tree iteratively using a stack of node ids, starting from
    startId. For each node we score candidate features with
    findBestSplitRisk and pick one at random weighted by its accuracy;
    we perform pre-pruning based on the penalty.

    :param X: 2D array of examples by features.
    :param y: 1D array of class labels (children predict the mode).
    :param argsortX: per-column rank array of X, precomputed by the caller.
    :param startId: tuple id of the node to grow from.
    """
    # Small additive constant so zero-accuracy features remain selectable
    # by the weighted random choice below.
    eps = 10**-4
    idStack = [startId]

    while len(idStack) != 0:
        nodeId = idStack.pop()
        node = self.tree.getVertex(nodeId)
        accuracies, thresholds = findBestSplitRisk(self.minSplit, X, y,
                                                   node.getTrainInds(),
                                                   argsortX)

        # Choose best feature randomly, weighted by accuracy gain.
        accuracies += eps
        bestFeatureInd = Util.randomChoice(accuracies)[0]
        bestThreshold = thresholds[bestFeatureInd]

        # Partition this node's training indices by the chosen threshold.
        # Simplified from nodeInds[arange(n)[mask]], which is equivalent
        # to nodeInds[mask] for a boolean mask of length n.
        nodeInds = node.getTrainInds()
        featureVals = X[nodeInds, bestFeatureInd]
        bestLeftInds = numpy.sort(nodeInds[featureVals < bestThreshold])
        bestRightInds = numpy.sort(nodeInds[featureVals >= bestThreshold])

        # Only split when both children are non-empty and depth allows it.
        # Fixed: the original tested .sum() != 0, which wrongly rejects a
        # child whose only training index is 0 (sum of indices == 0).
        if bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0 \
                and self.tree.depth() < self.maxDepth:
            node.setError(1 - accuracies[bestFeatureInd])
            node.setFeatureInd(bestFeatureInd)
            node.setThreshold(bestThreshold)

            # Add both children; push onto the stack only those large
            # enough to be split again (left pushed first, as before).
            childPairs = ((bestLeftInds, self.getLeftChildId(nodeId)),
                          (bestRightInds, self.getRightChildId(nodeId)))
            for childInds, childId in childPairs:
                child = DecisionNode(childInds, Util.mode(y[childInds]))
                self.tree.addChild(nodeId, childId, child)
                if child.getTrainInds().shape[0] >= self.minSplit:
                    idStack.append(childId)
def testGrowTree(self):
    """
    Check growTree: repeated regrowth respects maxDepth and minSplit, and
    regrowing only a subtree (from a non-root vertex) leaves untouched
    vertices intact and does at least as well as growing from the root.
    """
    startId = (0, )
    minSplit = 20
    maxDepth = 3
    gamma = 0.01
    learner = PenaltyDecisionTree(minSplit=minSplit,
                                  maxDepth=maxDepth,
                                  gamma=gamma,
                                  pruning=False)
    trainX = self.X[100:, :]
    trainY = self.y[100:]
    testX = self.X[0:100, :]
    testY = self.y[0:100]

    # Rank of each element within its column: argsort applied twice.
    # Fixed: numpy.int (alias of builtin int) removed in NumPy 1.24.
    argsortX = numpy.zeros(trainX.shape, int)
    for i in range(trainX.shape[1]):
        argsortX[:, i] = numpy.argsort(trainX[:, i])
        argsortX[:, i] = numpy.argsort(argsortX[:, i])

    learner.tree = DictTree()
    rootNode = DecisionNode(numpy.arange(trainX.shape[0]),
                            Util.mode(trainY))
    learner.tree.setVertex(startId, rootNode)

    # Note that this matches with the case where we create a new tree
    # each time.
    numpy.random.seed(21)
    bestError = float("inf")

    for i in range(20):
        learner.tree.pruneVertex(startId)
        learner.growTree(trainX, trainY, argsortX, startId)
        predTestY = learner.predict(testX)
        error = Evaluator.binaryError(predTestY, testY)
        #print(Evaluator.binaryError(predTestY, testY), learner.tree.getNumVertices())

        if error < bestError:
            bestError = error
            bestTree = learner.tree.copy()

    self.assertTrue(learner.tree.depth() <= maxDepth)
    for vertexId in learner.tree.nonLeaves():
        self.assertTrue(
            learner.tree.getVertex(vertexId).getTrainInds().shape[0] >= minSplit)

    bestError1 = bestError
    learner.tree = bestTree

    # Now we test growing a tree from a non-root vertex.
    numpy.random.seed(21)
    for i in range(20):
        learner.tree.pruneVertex((0, 1))
        learner.growTree(trainX, trainY, argsortX, (0, 1))

        # Vertices outside the regrown subtree must be unchanged.
        self.assertTrue(
            learner.tree.getVertex((0, )) == bestTree.getVertex((0, )))
        self.assertTrue(
            learner.tree.getVertex((0, 0)) == bestTree.getVertex((0, 0)))

        predTestY = learner.predict(testX)
        error = Evaluator.binaryError(predTestY, testY)

        if error < bestError:
            bestError = error
            bestTree = learner.tree.copy()
        #print(Evaluator.binaryError(predTestY, testY), learner.tree.getNumVertices())

    self.assertTrue(bestError1 >= bestError)
def growSkLearn(self, X, y):
    """
    Grow a decision tree from sklearn.
    """
    # NOTE(review): this walks scikit-learn's private tree_ structure using
    # attributes (best_error, a 2-column children array) that do not exist in
    # modern scikit-learn, which exposes impurity / children_left /
    # children_right instead — presumably written against an old sklearn
    # release; verify before upgrading the dependency.
    from sklearn.tree import DecisionTreeRegressor
    regressor = DecisionTreeRegressor(max_depth=self.maxDepth,
                                      min_samples_split=self.minSplit)
    regressor.fit(X, y)

    #Convert the sklearn tree into our tree
    nodeId = (0, )
    # Stack of (our tree id, sklearn node index) pairs for traversal.
    nodeStack = [(nodeId, 0)]
    node = DecisionNode(numpy.arange(X.shape[0]), regressor.tree_.value[0])
    self.tree.setVertex(nodeId, node)

    while len(nodeStack) != 0:
        nodeId, nodeInd = nodeStack.pop()
        node = self.tree.getVertex(nodeId)
        node.setError(regressor.tree_.best_error[nodeInd])
        node.setFeatureInd(regressor.tree_.feature[nodeInd])
        node.setThreshold(regressor.tree_.threshold[nodeInd])

        # children[nodeInd, 0] is the left-child index; -1 marks a leaf.
        # Left child gets this node's training indices whose feature value
        # is strictly below the threshold.
        if regressor.tree_.children[nodeInd, 0] != -1:
            leftChildInds = node.getTrainInds()[
                X[node.getTrainInds(), node.getFeatureInd()] < node.getThreshold()]
            leftChildId = self.getLeftChildId(nodeId)
            leftChild = DecisionNode(
                leftChildInds,
                regressor.tree_.value[regressor.tree_.children[nodeInd, 0]])
            self.tree.addChild(nodeId, leftChildId, leftChild)
            nodeStack.append((self.getLeftChildId(nodeId),
                              regressor.tree_.children[nodeInd, 0]))

        # Right child gets indices at or above the threshold.
        if regressor.tree_.children[nodeInd, 1] != -1:
            rightChildInds = node.getTrainInds()[
                X[node.getTrainInds(), node.getFeatureInd()] >= node.getThreshold()]
            rightChildId = self.getRightChildId(nodeId)
            rightChild = DecisionNode(
                rightChildInds,
                regressor.tree_.value[regressor.tree_.children[nodeInd, 1]])
            self.tree.addChild(nodeId, rightChildId, rightChild)
            nodeStack.append((self.getRightChildId(nodeId),
                              regressor.tree_.children[nodeInd, 1]))
def growSkLearn(self, X, y):
    """
    Grow a decision tree from sklearn.

    Fits a DecisionTreeRegressor and mirrors the fitted tree's structure
    (errors, split features, thresholds, per-node training indices) into
    our own tree representation via a stack-based traversal.
    """
    from sklearn.tree import DecisionTreeRegressor
    regressor = DecisionTreeRegressor(max_depth=self.maxDepth,
                                      min_samples_split=self.minSplit)
    regressor.fit(X, y)

    #Convert the sklearn tree into our tree
    skTree = regressor.tree_
    rootId = (0, )
    self.tree.setVertex(rootId,
                        DecisionNode(numpy.arange(X.shape[0]), skTree.value[0]))
    # Pairs of (our tree id, sklearn node index) awaiting conversion.
    nodeStack = [(rootId, 0)]

    while nodeStack:
        nodeId, nodeInd = nodeStack.pop()
        node = self.tree.getVertex(nodeId)
        node.setError(skTree.best_error[nodeInd])
        node.setFeatureInd(skTree.feature[nodeInd])
        node.setThreshold(skTree.threshold[nodeInd])

        leftInd = skTree.children[nodeInd, 0]
        rightInd = skTree.children[nodeInd, 1]
        parentInds = node.getTrainInds()
        parentVals = X[parentInds, node.getFeatureInd()]

        # An sklearn child index of -1 marks a missing child (leaf side).
        if leftInd != -1:
            childId = self.getLeftChildId(nodeId)
            childInds = parentInds[parentVals < node.getThreshold()]
            self.tree.addChild(nodeId, childId,
                               DecisionNode(childInds, skTree.value[leftInd]))
            nodeStack.append((childId, leftInd))

        if rightInd != -1:
            childId = self.getRightChildId(nodeId)
            childInds = parentInds[parentVals >= node.getThreshold()]
            self.tree.addChild(nodeId, childId,
                               DecisionNode(childInds, skTree.value[rightInd]))
            nodeStack.append((childId, rightInd))