def recursiveSplit(self, X, y, argsortX, nodeId):
    """
    Recursively split the node ``nodeId`` and attach the resulting children.

    The best split for the node's training indices is obtained from
    ``findBestSplit``. If both sides of that split are non-empty, the node
    records the split (error, feature index, threshold) and two child
    ``DecisionNode`` objects are added to ``self.tree``, each labelled with
    the mean of ``y`` over its indices. A child is split further only when
    it holds at least ``self.minSplit`` training indices.

    :param X: The examples; passed through to ``findBestSplit``.
    :param y: The target values, indexed by the training index arrays.
    :param argsortX: Passed through to ``findBestSplit`` unchanged.
    :param nodeId: Identifier of the vertex of ``self.tree`` to split. Its
        length minus one is treated as the depth of the node.
    """
    # Stop once the node already sits at the maximum allowed depth.
    if len(nodeId) - 1 >= self.maxDepth:
        return

    node = self.tree.getVertex(nodeId)
    bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit(
        self.minSplit, X, y, node.getTrainInds(), argsortX)

    # The split may have 0 items in one set, so don't split. Count the
    # elements instead of summing them: these arrays are used as indices
    # into y, so a side holding only index 0 sums to 0 although it is
    # not empty.
    if bestLeftInds.shape[0] == 0 or bestRightInds.shape[0] == 0:
        return

    node.setError(bestError)
    node.setFeatureInd(bestFeatureInd)
    node.setThreshold(bestThreshold)

    leftChildId = self.getLeftChildId(nodeId)
    leftChild = DecisionNode(bestLeftInds, y[bestLeftInds].mean())
    self.tree.addChild(nodeId, leftChildId, leftChild)

    if leftChild.getTrainInds().shape[0] >= self.minSplit:
        self.recursiveSplit(X, y, argsortX, leftChildId)

    rightChildId = self.getRightChildId(nodeId)
    rightChild = DecisionNode(bestRightInds, y[bestRightInds].mean())
    self.tree.addChild(nodeId, rightChildId, rightChild)

    if rightChild.getTrainInds().shape[0] >= self.minSplit:
        self.recursiveSplit(X, y, argsortX, rightChildId)
def recursiveSplit(self, X, y, argsortX, nodeId):
    """
    Find the best split for the node ``nodeId`` and grow the tree below it.

    ``findBestSplit`` is called on the training indices stored in the node.
    When the split leaves at least one index on each side, the split
    parameters are stored on the node and a left and a right
    ``DecisionNode`` (valued with the mean of ``y`` over their indices) are
    added as children. Each child with at least ``self.minSplit`` indices
    is then split recursively.

    :param X: The examples; passed through to ``findBestSplit``.
    :param y: The target values, indexed by the training index arrays.
    :param argsortX: Passed through to ``findBestSplit`` unchanged.
    :param nodeId: Identifier of the vertex of ``self.tree`` to split. Its
        length minus one is compared against ``self.maxDepth``.
    """
    # Nodes at the maximum depth are left as leaves.
    if len(nodeId) - 1 >= self.maxDepth:
        return

    node = self.tree.getVertex(nodeId)
    bestError, bestFeatureInd, bestThreshold, bestLeftInds, bestRightInds = findBestSplit(
        self.minSplit, X, y, node.getTrainInds(), argsortX)

    # The split may have 0 items in one set, so don't split. The test is on
    # the number of indices, not their sum: summing index values would
    # treat the non-empty set {0} as empty.
    if bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0:
        node.setError(bestError)
        node.setFeatureInd(bestFeatureInd)
        node.setThreshold(bestThreshold)

        leftChildId = self.getLeftChildId(nodeId)
        leftChild = DecisionNode(bestLeftInds, y[bestLeftInds].mean())
        self.tree.addChild(nodeId, leftChildId, leftChild)

        if leftChild.getTrainInds().shape[0] >= self.minSplit:
            self.recursiveSplit(X, y, argsortX, leftChildId)

        rightChildId = self.getRightChildId(nodeId)
        rightChild = DecisionNode(bestRightInds, y[bestRightInds].mean())
        self.tree.addChild(nodeId, rightChildId, rightChild)

        if rightChild.getTrainInds().shape[0] >= self.minSplit:
            self.recursiveSplit(X, y, argsortX, rightChildId)
def growTree(self, X, y, argsortX, startId):
    """
    Grow the tree below ``startId`` iteratively, using an explicit stack.

    For every node popped from the stack, ``findBestSplitRisk`` supplies one
    accuracy and one threshold per feature. A feature is drawn with
    ``Util.randomChoice`` from the accuracies (shifted by a small epsilon),
    and the node's training indices are partitioned on that feature at its
    threshold. If both parts are non-empty and the tree is still shallower
    than ``self.maxDepth``, the split is stored on the node and two child
    ``DecisionNode`` objects, valued with the mode of ``y`` over their
    indices, are added. Children holding at least ``self.minSplit`` indices
    are pushed on the stack for further splitting.

    :param X: The examples, indexed as ``X[:, featureInd]``.
    :param y: The labels, indexed by the training index arrays.
    :param argsortX: Passed through to ``findBestSplitRisk`` unchanged.
    :param startId: Identifier of the vertex of ``self.tree`` to grow from.
    """
    eps = 10**-4
    idStack = [startId]

    while idStack:
        nodeId = idStack.pop()
        node = self.tree.getVertex(nodeId)
        nodeInds = node.getTrainInds()
        accuracies, thresholds = findBestSplitRisk(self.minSplit, X, y, nodeInds, argsortX)

        #Choose best feature based on gains
        accuracies += eps
        bestFeatureInd = Util.randomChoice(accuracies)[0]
        bestThreshold = thresholds[bestFeatureInd]

        # Partition the node's indices on the chosen feature. Both
        # comparisons are written out so that rows comparing False on both
        # sides (e.g. NaN) stay excluded from both children.
        featureValues = X[:, bestFeatureInd][nodeInds]
        bestLeftInds = numpy.sort(nodeInds[featureValues < bestThreshold])
        bestRightInds = numpy.sort(nodeInds[featureValues >= bestThreshold])

        # The split may have 0 items in one set, so don't split. Emptiness
        # is tested on the number of indices: summing the index values
        # would wrongly reject a side that contains only index 0.
        # NOTE(review): the depth test uses the depth of the whole tree, not
        # of the current node - confirm this is the intended stopping rule.
        if (bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0
                and self.tree.depth() < self.maxDepth):
            node.setError(1 - accuracies[bestFeatureInd])
            node.setFeatureInd(bestFeatureInd)
            node.setThreshold(bestThreshold)

            leftChildId = self.getLeftChildId(nodeId)
            leftChild = DecisionNode(bestLeftInds, Util.mode(y[bestLeftInds]))
            self.tree.addChild(nodeId, leftChildId, leftChild)

            if leftChild.getTrainInds().shape[0] >= self.minSplit:
                idStack.append(leftChildId)

            rightChildId = self.getRightChildId(nodeId)
            rightChild = DecisionNode(bestRightInds, Util.mode(y[bestRightInds]))
            self.tree.addChild(nodeId, rightChildId, rightChild)

            if rightChild.getTrainInds().shape[0] >= self.minSplit:
                idStack.append(rightChildId)
def growSkLearn(self, X, y):
    """
    Fit a scikit-learn regression tree and copy its structure into ``self.tree``.

    A ``DecisionTreeRegressor`` is trained with ``self.maxDepth`` and
    ``self.minSplit``. Its nodes are then visited with a stack, starting
    at sklearn node 0 which is stored under the id ``(0, )``. Every visited
    node receives the sklearn error, feature index and threshold; for each
    existing sklearn child a ``DecisionNode`` is added whose training
    indices are the parent's indices on the matching side of the threshold.

    :param X: The examples used to fit the regressor.
    :param y: The target values used to fit the regressor.
    """
    from sklearn.tree import DecisionTreeRegressor

    regressor = DecisionTreeRegressor(max_depth=self.maxDepth, min_samples_split=self.minSplit)
    regressor.fit(X, y)

    # NOTE(review): ``best_error`` and ``children`` look like attributes of
    # an older scikit-learn Tree API - confirm the installed version still
    # provides them.
    skTree = regressor.tree_

    #Convert the sklearn tree into our tree
    rootId = (0, )
    rootNode = DecisionNode(numpy.arange(X.shape[0]), skTree.value[0])
    self.tree.setVertex(rootId, rootNode)

    # Each entry pairs our node id with the index of the sklearn node.
    pending = [(rootId, 0)]

    while pending:
        nodeId, nodeInd = pending.pop()

        node = self.tree.getVertex(nodeId)
        node.setError(skTree.best_error[nodeInd])
        node.setFeatureInd(skTree.feature[nodeInd])
        node.setThreshold(skTree.threshold[nodeInd])

        # A child index of -1 means there is no child on that side.
        leftInd = skTree.children[nodeInd, 0]
        if leftInd != -1:
            parentInds = node.getTrainInds()
            belowThreshold = X[parentInds, node.getFeatureInd()] < node.getThreshold()
            leftChildId = self.getLeftChildId(nodeId)
            leftChild = DecisionNode(parentInds[belowThreshold], skTree.value[leftInd])
            self.tree.addChild(nodeId, leftChildId, leftChild)
            pending.append((leftChildId, leftInd))

        rightInd = skTree.children[nodeInd, 1]
        if rightInd != -1:
            parentInds = node.getTrainInds()
            atOrAboveThreshold = X[parentInds, node.getFeatureInd()] >= node.getThreshold()
            rightChildId = self.getRightChildId(nodeId)
            rightChild = DecisionNode(parentInds[atOrAboveThreshold], skTree.value[rightInd])
            self.tree.addChild(nodeId, rightChildId, rightChild)
            pending.append((rightChildId, rightInd))
def growTree(self, X, y, argsortX, startId):
    """
    Grow a tree using a stack, starting from the node ``startId``.

    Each node taken from the stack is scored with ``findBestSplitRisk``,
    which returns per-feature accuracies and thresholds. The split feature
    is sampled by ``Util.randomChoice`` from the accuracies after adding a
    small epsilon, and the node's training indices are divided into those
    below the threshold (left) and those at or above it (right). The split
    is applied only if neither side is empty and ``self.tree.depth()`` is
    below ``self.maxDepth``; the children are labelled with the mode of
    ``y`` over their indices and are queued for splitting when they hold
    at least ``self.minSplit`` indices.

    :param X: The examples, indexed as ``X[:, featureInd]``.
    :param y: The labels, indexed by the training index arrays.
    :param argsortX: Passed through to ``findBestSplitRisk`` unchanged.
    :param startId: Identifier of the vertex of ``self.tree`` to grow from.
    """
    eps = 10**-4
    idStack = [startId]

    while idStack:
        nodeId = idStack.pop()
        node = self.tree.getVertex(nodeId)
        nodeInds = node.getTrainInds()
        accuracies, thresholds = findBestSplitRisk(self.minSplit, X, y, nodeInds, argsortX)

        #Choose best feature based on gains
        accuracies += eps
        bestFeatureInd = Util.randomChoice(accuracies)[0]
        bestThreshold = thresholds[bestFeatureInd]

        # A boolean mask over nodeInds selects each side directly; the
        # explicit ">=" keeps values that fail both comparisons (e.g. NaN)
        # out of both children.
        nodeValues = X[:, bestFeatureInd][nodeInds]
        bestLeftInds = numpy.sort(nodeInds[nodeValues < bestThreshold])
        bestRightInds = numpy.sort(nodeInds[nodeValues >= bestThreshold])

        # The split may have 0 items in one set, so don't split. The number
        # of indices is checked rather than their sum, because a side made
        # up of index 0 alone sums to 0 while being non-empty.
        bothSidesPopulated = bestLeftInds.shape[0] != 0 and bestRightInds.shape[0] != 0

        # NOTE(review): depth() is taken on the whole tree rather than on
        # this node - confirm that is the intended limit.
        if bothSidesPopulated and self.tree.depth() < self.maxDepth:
            node.setError(1 - accuracies[bestFeatureInd])
            node.setFeatureInd(bestFeatureInd)
            node.setThreshold(bestThreshold)

            leftChildId = self.getLeftChildId(nodeId)
            leftChild = DecisionNode(bestLeftInds, Util.mode(y[bestLeftInds]))
            self.tree.addChild(nodeId, leftChildId, leftChild)

            if leftChild.getTrainInds().shape[0] >= self.minSplit:
                idStack.append(leftChildId)

            rightChildId = self.getRightChildId(nodeId)
            rightChild = DecisionNode(bestRightInds, Util.mode(y[bestRightInds]))
            self.tree.addChild(nodeId, rightChildId, rightChild)

            if rightChild.getTrainInds().shape[0] >= self.minSplit:
                idStack.append(rightChildId)
def growSkLearn(self, X, y):
    """
    Grow a decision tree with sklearn and mirror it in ``self.tree``.

    The regressor is fitted with ``max_depth=self.maxDepth`` and
    ``min_samples_split=self.minSplit``. Starting from sklearn node 0
    (stored here as vertex ``(0, )`` holding all row indices of ``X``), the
    sklearn nodes are walked with a stack. Error, feature index and
    threshold are copied onto each vertex, and a child vertex is created
    for every sklearn child, holding the parent's training indices that
    fall on its side of the threshold.

    :param X: The examples used to fit the regressor.
    :param y: The target values used to fit the regressor.
    """
    from sklearn.tree import DecisionTreeRegressor

    regressor = DecisionTreeRegressor(max_depth=self.maxDepth, min_samples_split=self.minSplit)
    regressor.fit(X, y)

    # NOTE(review): ``tree_.best_error`` / ``tree_.children`` appear to come
    # from an older scikit-learn release - verify against the pinned version.
    fitted = regressor.tree_

    #Convert the sklearn tree into our tree
    nodeId = (0, )
    self.tree.setVertex(nodeId, DecisionNode(numpy.arange(X.shape[0]), fitted.value[0]))

    # Stack of (our vertex id, sklearn node index) pairs still to convert.
    nodeStack = [(nodeId, 0)]

    while nodeStack:
        nodeId, nodeInd = nodeStack.pop()

        node = self.tree.getVertex(nodeId)
        node.setError(fitted.best_error[nodeInd])
        node.setFeatureInd(fitted.feature[nodeInd])
        node.setThreshold(fitted.threshold[nodeInd])

        # Left side: rows whose feature value is strictly below the threshold.
        skLeft = fitted.children[nodeInd, 0]
        if skLeft != -1:
            trainInds = node.getTrainInds()
            leftChildInds = trainInds[X[trainInds, node.getFeatureInd()] < node.getThreshold()]
            leftChildId = self.getLeftChildId(nodeId)
            self.tree.addChild(nodeId, leftChildId, DecisionNode(leftChildInds, fitted.value[skLeft]))
            nodeStack.append((leftChildId, skLeft))

        # Right side: rows whose feature value is at or above the threshold.
        skRight = fitted.children[nodeInd, 1]
        if skRight != -1:
            trainInds = node.getTrainInds()
            rightChildInds = trainInds[X[trainInds, node.getFeatureInd()] >= node.getThreshold()]
            rightChildId = self.getRightChildId(nodeId)
            self.tree.addChild(nodeId, rightChildId, DecisionNode(rightChildInds, fitted.value[skRight]))
            nodeStack.append((rightChildId, skRight))