def testCARTPrune(self):
    """Check that CART cost-complexity pruning bounds the tree size by gamma.

    Learns on a regression problem, then verifies that decreasing gamma
    yields correspondingly smaller trees, that gamma large enough to disable
    pruning reproduces the unpruned predictions, and that full pruning
    collapses the tree to a single vertex.
    """
    numExamples = 500
    X, y = data.make_regression(numExamples)
    y = Standardiser().standardiseArray(y)

    # numpy.round returns a float; cast to int so it is a valid slice index
    # (float indices raise on NumPy >= 1.12)
    numTrain = int(numpy.round(numExamples * 0.33))

    trainX = X[0:numTrain, :]
    trainY = y[0:numTrain]

    learner = DecisionTreeLearner(pruneType="none", maxDepth=10, minSplit=2)
    learner.learnModel(trainX, trainY)

    # With a very large gamma the tree should stay within the bound
    learner = DecisionTreeLearner(pruneType="CART", maxDepth=10, minSplit=2, gamma=1000)
    learner.learnModel(trainX, trainY)
    self.assertTrue(learner.tree.getNumVertices() <= 1000)
    predY = learner.predict(trainX)

    learner.setGamma(200)
    learner.learnModel(trainX, trainY)
    self.assertTrue(learner.tree.getNumVertices() <= 200)

    learner.setGamma(100)
    learner.learnModel(trainX, trainY)
    self.assertTrue(learner.tree.getNumVertices() <= 100)

    learner = DecisionTreeLearner(pruneType="none", maxDepth=10, minSplit=2)
    learner.learnModel(trainX, trainY)
    predY2 = learner.predict(trainX)

    #Gamma = 0 implies no pruning
    nptst.assert_array_equal(predY, predY2)

    #Full pruning
    learner = DecisionTreeLearner(pruneType="CART", maxDepth=3, gamma=1)
    learner.learnModel(trainX, trainY)
    # assertEquals is a deprecated unittest alias; use assertEqual
    self.assertEqual(learner.tree.getNumVertices(), 1)
def testCvPrune(self):
    """Check cross-validation pruning: huge gamma leaves the tree intact,
    smaller gamma prunes to a subtree, and REP pruning on a validation set
    does not worsen test error.
    """
    numExamples = 500
    X, y = data.make_regression(numExamples)
    y = Standardiser().standardiseArray(y)

    # numpy.round returns a float; cast to int so it is a valid slice index
    # (float indices raise on NumPy >= 1.12)
    numTrain = int(numpy.round(numExamples * 0.33))
    numValid = int(numpy.round(numExamples * 0.33))

    trainX = X[0:numTrain, :]
    trainY = y[0:numTrain]
    validX = X[numTrain:numTrain + numValid, :]
    validY = y[numTrain:numTrain + numValid]
    testX = X[numTrain + numValid:, :]
    testY = y[numTrain + numValid:]

    learner = DecisionTreeLearner()
    learner.learnModel(trainX, trainY)
    error1 = Evaluator.rootMeanSqError(learner.predict(testX), testY)
    #print(learner.getTree())

    unprunedTree = learner.tree.copy()

    # A very large gamma should prune nothing
    learner.setGamma(1000)
    learner.cvPrune(trainX, trainY)
    # assertEquals is a deprecated unittest alias; use assertEqual
    self.assertEqual(unprunedTree.getNumVertices(), learner.tree.getNumVertices())

    learner.setGamma(100)
    learner.cvPrune(trainX, trainY)

    #Test if pruned tree is subtree of current:
    for vertexId in learner.tree.getAllVertexIds():
        self.assertTrue(vertexId in unprunedTree.getAllVertexIds())

    #The error should be better after pruning
    learner.learnModel(trainX, trainY)
    #learner.cvPrune(validX, validY, 0.0, 5)
    learner.repPrune(validX, validY)

    error2 = Evaluator.rootMeanSqError(learner.predict(testX), testY)
    self.assertTrue(error1 >= error2)
def testPredict(self):
    """Compare predictions of our DecisionTreeLearner against sklearn's
    DecisionTreeRegressor over random binary-labelled problems.

    Checks that train/test index bookkeeping agrees at every vertex, that
    the root split matches sklearn's, and that the overall prediction error
    is close to sklearn's (exact equality is not expected — see note below).
    """
    generator = ExamplesGenerator()

    for i in range(10):
        numExamples = numpy.random.randint(1, 200)
        numFeatures = numpy.random.randint(1, 20)
        minSplit = numpy.random.randint(1, 50)
        maxDepth = numpy.random.randint(0, 10)

        X, y = generator.generateBinaryExamples(numExamples, numFeatures)
        # numpy.float was removed in NumPy 1.20; the builtin float is the
        # equivalent (float64) dtype
        y = numpy.array(y, float)

        learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth)
        learner.learnModel(X, y)
        predY = learner.predict(X)

        tree = learner.tree
        for vertexId in tree.getAllVertexIds():
            nptst.assert_array_equal(tree.getVertex(vertexId).getTrainInds(), tree.getVertex(vertexId).getTestInds())

        #Compare against sklearn
        # NOTE(review): min_density was removed from sklearn in 0.14 — this
        # call only works on older sklearn versions; confirm the target version
        regressor = DecisionTreeRegressor(min_samples_split=minSplit, max_depth=maxDepth, min_density=0.0)
        regressor.fit(X, y)
        sktree = regressor.tree_

        #Note that the sklearn algorithm appears to combine nodes with same value
        #self.assertEquals(sktree.node_count, tree.getNumVertices())
        # assertEquals/assertAlmostEquals are deprecated unittest aliases
        self.assertEqual(sktree.feature[0], tree.getRoot().getFeatureInd())
        self.assertEqual(sktree.value[0], tree.getRoot().getValue())
        self.assertAlmostEqual(sktree.threshold[0], tree.getRoot().getThreshold(), 3)

        predY2 = regressor.predict(X)

        #Note that this is not always precise because if two thresholds give the same error we choose the largest
        #and not sure how it is chosen in sklearn (or if the code is correct)
        self.assertTrue(abs(numpy.linalg.norm(predY - y) - numpy.linalg.norm(predY2 - y)) / numExamples < 0.05)