def setUp(self):
    """Build a deterministic binary-labelled dataset for the tests.

    Seeds numpy's RNG so every test sees the same examples, and makes
    all floating-point warnings raise so numerical issues fail loudly.
    """
    numpy.random.seed(21)
    numpy.seterr("raise")  # promote FP warnings (overflow, invalid, ...) to exceptions
    self.numExamples = 200
    self.numFeatures = 10
    examplesGen = ExamplesGenerator()
    self.X, self.y = examplesGen.generateBinaryExamples(self.numExamples, self.numFeatures)
def setUp(self):
    """Build a small deterministic binary-labelled dataset for the tests.

    Seeds numpy's RNG for reproducibility and makes floating-point
    warnings raise. Labels are cast to float since the learner under
    test performs regression on them.
    """
    numpy.random.seed(21)
    numpy.seterr("raise")  # promote FP warnings to exceptions
    self.numExamples = 20
    self.numFeatures = 5
    generator = ExamplesGenerator()
    self.X, self.y = generator.generateBinaryExamples(self.numExamples, self.numFeatures)
    # numpy.float was a deprecated alias for the builtin float (removed in NumPy 1.24)
    self.y = numpy.array(self.y, float)
def profileDecisionTreeRegressor(self):
    """Profile sklearn's DecisionTreeRegressor.fit on random binary data."""
    numExamples, numFeatures = 1000, 20
    minSplit, maxDepth = 10, 20

    X, y = ExamplesGenerator().generateBinaryExamples(numExamples, numFeatures)

    regressor = DecisionTreeRegressor(min_split=minSplit, max_depth=maxDepth, min_density=0.0)
    ProfileUtils.profile('regressor.fit(X, y)', globals(), locals())
def profileLearnModel(self):
    """Profile DecisionTreeLearner.learnModel (with REP-CV pruning) on random data.

    Prints the number of vertices of the learnt tree afterwards as a
    sanity check that a non-trivial tree was built.
    """
    numExamples = 1000
    numFeatures = 50
    minSplit = 10
    maxDepth = 20

    generator = ExamplesGenerator()
    X, y = generator.generateBinaryExamples(numExamples, numFeatures)
    # numpy.float was a deprecated alias for the builtin float (removed in NumPy 1.24)
    y = numpy.array(y, float)

    learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth, pruneType="REP-CV")
    ProfileUtils.profile('learner.learnModel(X, y) ', globals(), locals())
    print(learner.getTree().getNumVertices())
def profilePredict(self):
    """Profile DecisionTreeLearner.predict on a deep, fully-grown tree."""
    numExamples, numFeatures = 1000, 20
    minSplit, maxDepth = 1, 20

    X, y = ExamplesGenerator().generateBinaryExamples(numExamples, numFeatures)

    learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth)
    learner.learnModel(X, y)

    # Report tree size before and after prediction
    print(learner.getTree().getNumVertices())
    ProfileUtils.profile('learner.predict(X)', globals(), locals())
    print(learner.getTree().getNumVertices())
def testPredict(self):
    """Check predictions on randomised datasets and compare against sklearn.

    For each random dataset we verify that every vertex assigned the same
    indices at train and test time, that the root split agrees with
    sklearn's DecisionTreeRegressor, and that both trees achieve a
    similar training error.
    """
    generator = ExamplesGenerator()

    for i in range(10):
        numExamples = numpy.random.randint(1, 200)
        numFeatures = numpy.random.randint(1, 20)
        minSplit = numpy.random.randint(1, 50)
        maxDepth = numpy.random.randint(0, 10)

        X, y = generator.generateBinaryExamples(numExamples, numFeatures)
        # numpy.float was a deprecated alias for the builtin float (removed in NumPy 1.24)
        y = numpy.array(y, float)

        learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth)
        learner.learnModel(X, y)
        predY = learner.predict(X)

        tree = learner.tree
        for vertexId in tree.getAllVertexIds():
            nptst.assert_array_equal(tree.getVertex(vertexId).getTrainInds(), tree.getVertex(vertexId).getTestInds())

        # Compare against sklearn tree
        regressor = DecisionTreeRegressor(min_samples_split=minSplit, max_depth=maxDepth, min_density=0.0)
        regressor.fit(X, y)
        sktree = regressor.tree_

        # Note that the sklearn algorithm appears to combine nodes with same value
        # self.assertEqual(sktree.node_count, tree.getNumVertices())
        # assertEquals/assertAlmostEquals were deprecated aliases removed in Python 3.12
        self.assertEqual(sktree.feature[0], tree.getRoot().getFeatureInd())
        self.assertEqual(sktree.value[0], tree.getRoot().getValue())
        self.assertAlmostEqual(sktree.threshold[0], tree.getRoot().getThreshold(), 3)

        predY2 = regressor.predict(X)

        # Note that this is not always precise because if two thresholds give the
        # same error we choose the largest and not sure how it is chosen in sklearn
        self.assertTrue(abs(numpy.linalg.norm(predY - y) - numpy.linalg.norm(predY2 - y)) / numExamples < 0.05)
def profileFindBestSplit(self):
    """Profile the findBestSplit3 routine on random Fortran-ordered data.

    Precomputes, per feature, the rank of each example's value (argsort
    applied twice yields ranks), as findBestSplit3 expects.
    """
    numExamples = 1000
    numFeatures = 100
    minSplit = 1
    maxDepth = 20

    generator = ExamplesGenerator()
    X, y = generator.generateBinaryExamples(numExamples, numFeatures)
    X = numpy.array(X, order="F")  # column-major, matching per-feature access

    nodeInds = numpy.arange(X.shape[0])
    # numpy.int was a deprecated alias for the builtin int (removed in NumPy 1.24)
    argsortX = numpy.zeros(X.shape, int, order="F")

    for i in range(X.shape[1]):
        # argsort of an argsort gives the rank of each element
        argsortX[:, i] = numpy.argsort(X[:, i])
        argsortX[:, i] = numpy.argsort(argsortX[:, i])

    def run():
        for i in range(10):
            findBestSplit3(minSplit, X, y, nodeInds, argsortX)

    ProfileUtils.profile('run()', globals(), locals())
def testLearnModel(self):
    """Check the structural integrity of learnt trees and the root split.

    First verifies, on random datasets, that vertex values are the mean
    of their training labels, thresholds lie within feature ranges, the
    depth bound holds, and children partition their parent's indices.
    Then brute-forces the best depth-1 split on self.X/self.y and checks
    it matches the learner's root. Finally checks pruning never grows
    the tree.
    """
    # First check the integrity of the trees
    generator = ExamplesGenerator()

    for i in range(5):
        numExamples = numpy.random.randint(1, 200)
        numFeatures = numpy.random.randint(1, 10)
        minSplit = numpy.random.randint(1, 50)
        maxDepth = numpy.random.randint(1, 10)

        X, y = generator.generateBinaryExamples(numExamples, numFeatures)
        # numpy.float was a deprecated alias for the builtin float (removed in NumPy 1.24)
        y = numpy.array(y, float)

        learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth)
        learner.learnModel(X, y)
        tree = learner.getTree()

        for vertexId in tree.getAllVertexIds():
            vertex = tree.getVertex(vertexId)
            if vertex.getFeatureInd() is not None:
                meanValue = y[vertex.getTrainInds()].mean()
                # assertEquals was a deprecated alias removed in Python 3.12
                self.assertEqual(meanValue, vertex.getValue())
                if tree.isNonLeaf(vertexId):
                    self.assertTrue(0 <= vertex.getFeatureInd() < X.shape[1])
                    self.assertTrue(X[:, vertex.getFeatureInd()].min() <= vertex.getThreshold() <= X[:, vertex.getFeatureInd()].max())
            self.assertTrue(vertex.getTrainInds().shape[0] >= 1)

        self.assertTrue(tree.depth() <= maxDepth)

        # Check that each split contains indices from parent
        root = tree.getRootId()
        vertexStack = [root]

        while len(vertexStack) != 0:
            vertexId = vertexStack.pop()
            neighbours = tree.children(vertexId)

            if len(neighbours) > 2:
                self.fail("Cannot have more than 2 children")
            elif len(neighbours) > 0:
                inds1 = tree.getVertex(neighbours[0]).getTrainInds()
                inds2 = tree.getVertex(neighbours[1]).getTrainInds()
                nptst.assert_array_equal(numpy.union1d(inds1, inds2), numpy.unique(tree.getVertex(vertexId).getTrainInds()))
                vertexStack.append(neighbours[0])
                vertexStack.append(neighbours[1])

    # Try a simple tree of depth 1
    learner = DecisionTreeLearner(minSplit=1, maxDepth=1)
    learner.learnModel(self.X, self.y)

    bestFeature = 0
    bestError = 10**6
    bestThreshold = 0

    # Brute-force the best single split. Search all features of self.X:
    # the previous code reused the loop variable numFeatures from the
    # random-dataset loop above, which could be smaller than self.X.shape[1].
    for i in range(self.X.shape[1]):
        vals = numpy.unique(self.X[:, i])

        for j in range(vals.shape[0] - 1):
            threshold = (vals[j + 1] + vals[j]) / 2
            leftInds = self.X[:, i] <= threshold
            rightInds = self.X[:, i] > threshold
            valLeft = numpy.mean(self.y[leftInds])
            valRight = numpy.mean(self.y[rightInds])
            error = ((self.y[leftInds] - valLeft)**2).sum() + ((self.y[rightInds] - valRight)**2).sum()

            if error < bestError:
                bestError = error
                bestFeature = i
                bestThreshold = threshold

    self.assertAlmostEqual(bestThreshold, learner.tree.getRoot().getThreshold())
    self.assertAlmostEqual(bestError, learner.tree.getRoot().getError(), 5)
    self.assertEqual(bestFeature, learner.tree.getRoot().getFeatureInd())

    # Now we will test pruning works
    learner = DecisionTreeLearner(minSplit=1, maxDepth=10)
    learner.learnModel(X, y)
    numVertices1 = learner.getTree().getNumVertices()

    learner = DecisionTreeLearner(minSplit=1, maxDepth=10, pruneType="REP-CV")
    learner.learnModel(X, y)
    numVertices2 = learner.getTree().getNumVertices()
    self.assertTrue(numVertices1 >= numVertices2)