def setUp(self):
     # Seed the RNG so the generated examples are identical on every run.
     numpy.random.seed(21)
     # Escalate all floating point warnings (divide, overflow, ...) to exceptions.
     numpy.seterr(all="raise")

     self.numExamples = 200
     self.numFeatures = 10

     self.X, self.y = ExamplesGenerator().generateBinaryExamples(self.numExamples, self.numFeatures)
 def setUp(self):
     """Seed the RNG, raise on floating point errors, and generate a small binary dataset."""
     numpy.random.seed(21)
     numpy.seterr("raise")
     self.numExamples = 20
     self.numFeatures = 5
     
     generator = ExamplesGenerator() 
     self.X, self.y = generator.generateBinaryExamples(self.numExamples, self.numFeatures)
     #numpy.float was removed in NumPy 1.24; the builtin float is the documented replacement 
     self.y = numpy.array(self.y, float)
 def profileDecisionTreeRegressor(self): 
     """Profile fitting a scikit-learn DecisionTreeRegressor on random binary examples."""
     numExamples = 1000
     numFeatures = 20
     minSplit = 10
     maxDepth = 20
     
     generator = ExamplesGenerator()
     X, y = generator.generateBinaryExamples(numExamples, numFeatures)   
         
     #min_split is not a DecisionTreeRegressor parameter (testPredict in this file already 
     #uses min_samples_split); min_density was a performance-only knob removed in scikit-learn 0.17 
     regressor = DecisionTreeRegressor(min_samples_split=minSplit, max_depth=maxDepth)
     
     ProfileUtils.profile('regressor.fit(X, y)', globals(), locals())
# Example #4 (score: 0)
 def profileDecisionTreeRegressor(self): 
     """Profile fitting a scikit-learn DecisionTreeRegressor on random binary examples."""
     numExamples = 1000
     numFeatures = 20
     minSplit = 10
     maxDepth = 20
     
     generator = ExamplesGenerator()
     X, y = generator.generateBinaryExamples(numExamples, numFeatures)   
         
     #min_split is not a DecisionTreeRegressor parameter (testPredict in this file already 
     #uses min_samples_split); min_density was a performance-only knob removed in scikit-learn 0.17 
     regressor = DecisionTreeRegressor(min_samples_split=minSplit, max_depth=maxDepth)
     
     ProfileUtils.profile('regressor.fit(X, y)', globals(), locals())
 def profileLearnModel(self):
     """Profile DecisionTreeLearner.learnModel with REP-CV pruning on random binary examples."""
     numExamples = 1000
     numFeatures = 50
     minSplit = 10
     maxDepth = 20
     
     generator = ExamplesGenerator()
     X, y = generator.generateBinaryExamples(numExamples, numFeatures)   
     #numpy.float was removed in NumPy 1.24; the builtin float is equivalent 
     y = numpy.array(y, float)
         
     learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth, pruneType="REP-CV") 
     ProfileUtils.profile('learner.learnModel(X, y) ', globals(), locals())
     
     #Report the size of the tree after pruning 
     print(learner.getTree().getNumVertices())
# Example #6 (score: 0)
 def profileLearnModel(self):
     """Profile DecisionTreeLearner.learnModel with REP-CV pruning on random binary examples."""
     numExamples = 1000
     numFeatures = 50
     minSplit = 10
     maxDepth = 20
     
     generator = ExamplesGenerator()
     X, y = generator.generateBinaryExamples(numExamples, numFeatures)   
     #numpy.float was removed in NumPy 1.24; the builtin float is equivalent 
     y = numpy.array(y, float)
         
     learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth, pruneType="REP-CV") 
     ProfileUtils.profile('learner.learnModel(X, y) ', globals(), locals())
     
     #Report the size of the tree after pruning 
     print(learner.getTree().getNumVertices())
 def profilePredict(self): 
     """Profile DecisionTreeLearner.predict -- the prediction function should be made faster."""
     numExamples = 1000
     numFeatures = 20
     minSplit = 1
     maxDepth = 20
     
     X, y = ExamplesGenerator().generateBinaryExamples(numExamples, numFeatures)
         
     learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth) 
     learner.learnModel(X, y)
     
     # Print the vertex count before and after prediction so any change is visible.
     print(learner.getTree().getNumVertices())
     ProfileUtils.profile('learner.predict(X)', globals(), locals())
     
     print(learner.getTree().getNumVertices())
# Example #8 (score: 0)
 def profilePredict(self): 
     """Profile DecisionTreeLearner.predict -- the prediction function should be made faster."""
     numExamples = 1000
     numFeatures = 20
     minSplit = 1
     maxDepth = 20
     
     X, y = ExamplesGenerator().generateBinaryExamples(numExamples, numFeatures)
         
     learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth) 
     learner.learnModel(X, y)
     
     # Print the vertex count before and after prediction so any change is visible.
     print(learner.getTree().getNumVertices())
     ProfileUtils.profile('learner.predict(X)', globals(), locals())
     
     print(learner.getTree().getNumVertices())
 def testPredict(self): 
     """
     Check predictions on the training data route examples to the same leaves as 
     training did, and compare the learnt root split against scikit-learn's tree.
     """
     generator = ExamplesGenerator()         
     
     for i in range(10):        
         numExamples = numpy.random.randint(1, 200)
         numFeatures = numpy.random.randint(1, 20)
         minSplit = numpy.random.randint(1, 50)
         maxDepth = numpy.random.randint(0, 10)
         
         X, y = generator.generateBinaryExamples(numExamples, numFeatures)   
         #numpy.float was removed in NumPy 1.24; the builtin float is equivalent 
         y = numpy.array(y, float)
             
         learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth) 
         learner.learnModel(X, y)    
         
         predY = learner.predict(X)
         
         tree = learner.tree            
         
         for vertexId in tree.getAllVertexIds(): 
             #Predicting on the training data should reproduce the training partition 
             nptst.assert_array_equal(tree.getVertex(vertexId).getTrainInds(), tree.getVertex(vertexId).getTestInds())
             
         #Compare against sklearn tree (min_density was a performance-only knob removed in scikit-learn 0.17)
         regressor = DecisionTreeRegressor(min_samples_split=minSplit, max_depth=maxDepth)
         regressor.fit(X, y)
         
         sktree = regressor.tree_
         
         #Note that the sklearn algorithm appears to combine nodes with same value 
         #self.assertEqual(sktree.node_count, tree.getNumVertices())
         #assertEquals/assertAlmostEquals are deprecated aliases removed in Python 3.12 
         self.assertEqual(sktree.feature[0], tree.getRoot().getFeatureInd())
         self.assertEqual(sktree.value[0], tree.getRoot().getValue())
         self.assertAlmostEqual(sktree.threshold[0], tree.getRoot().getThreshold(), 3)
         
         predY2 = regressor.predict(X)
         
         #Note that this is not always precise because if two thresholds give the same error we choose the largest 
         #and not sure how it is chosen in sklearn (or if the code is correct)
         self.assertTrue(abs(numpy.linalg.norm(predY-y)- numpy.linalg.norm(predY2-y))/numExamples < 0.05)  
# Example #10 (score: 0)
 def profileFindBestSplit(self):
     """Profile findBestSplit3 on Fortran-ordered data with precomputed per-feature ranks."""
     numExamples = 1000
     numFeatures = 100
     minSplit = 1
     
     generator = ExamplesGenerator()
     X, y = generator.generateBinaryExamples(numExamples, numFeatures)
     X = numpy.array(X, order="F")
     
     nodeInds = numpy.arange(X.shape[0])
     #numpy.int was removed in NumPy 1.24; numpy.int_ is the equivalent platform integer 
     argsortX = numpy.zeros(X.shape, numpy.int_, order="F")      
     
     for i in range(X.shape[1]): 
         #argsort applied twice yields the rank of each element within its column 
         argsortX[:, i] = numpy.argsort(X[:, i])
         argsortX[:, i] = numpy.argsort(argsortX[:, i])            
     
     def run(): 
         for i in range(10): 
             findBestSplit3(minSplit, X, y, nodeInds, argsortX) 
     
     ProfileUtils.profile('run()', globals(), locals())
 def testLearnModel(self): 
     """
     Check the integrity of learnt trees: vertex values and thresholds, depth bound, 
     parent/child index consistency, a depth-1 split against a brute force search, 
     and that REP-CV pruning never grows the tree.
     """
     #First check the integrity of the trees 
     generator = ExamplesGenerator()         
     
     for i in range(5):        
         numExamples = numpy.random.randint(1, 200)
         numFeatures = numpy.random.randint(1, 10)
         minSplit = numpy.random.randint(1, 50)
         maxDepth = numpy.random.randint(1, 10)
         
         X, y = generator.generateBinaryExamples(numExamples, numFeatures)
         #numpy.float was removed in NumPy 1.24; the builtin float is equivalent 
         y = numpy.array(y, float)
     
         learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth) 
         learner.learnModel(X, y)        
         tree = learner.getTree() 
         
         for vertexId in tree.getAllVertexIds(): 
             vertex = tree.getVertex(vertexId)
             #Use "is not None" rather than "!= None" for None comparisons 
             if vertex.getFeatureInd() is not None: 
                 #Each vertex value should be the mean label of its training examples 
                 meanValue = y[vertex.getTrainInds()].mean()
                 #assertEquals/assertAlmostEquals are deprecated aliases removed in Python 3.12 
                 self.assertEqual(meanValue, vertex.getValue())
                 if tree.isNonLeaf(vertexId): 
                     self.assertTrue(0 <= vertex.getFeatureInd() < X.shape[1]) 
                     self.assertTrue(X[:, vertex.getFeatureInd()].min() <= vertex.getThreshold() <= X[:, vertex.getFeatureInd()].max())
                 self.assertTrue(vertex.getTrainInds().shape[0] >= 1)
         
         
         self.assertTrue(tree.depth() <= maxDepth)
         #Check that each split contains indices from parent 
         root = tree.getRootId()
         vertexStack = [root]
         
         while len(vertexStack) != 0: 
             vertexId = vertexStack.pop()
             neighbours = tree.children(vertexId)
             
             if len(neighbours) > 2: 
                 self.fail("Cannot have more than 2 children") 
             elif len(neighbours) > 0: 
                 inds1 = tree.getVertex(neighbours[0]).getTrainInds()
                 inds2 = tree.getVertex(neighbours[1]).getTrainInds()
                 
                 #The children partition the parent's training indices 
                 nptst.assert_array_equal(numpy.union1d(inds1, inds2), numpy.unique(tree.getVertex(vertexId).getTrainInds()))
                 
                 vertexStack.append(neighbours[0])
                 vertexStack.append(neighbours[1])
     
     #Try a tree of depth 0 
     #learner = DecisionTreeLearner(minSplit=10, maxDepth=0) 
     #learner.learnModel(self.X, self.y)        
     #tree = learner.getTree()
     
     #self.assertEqual(tree.depth(), 0)
     
     #Try minSplit > numExamples 
     #learner = DecisionTreeLearner(minSplit=self.numExamples+1, maxDepth=0) 
     #learner.learnModel(self.X, self.y)        
     #tree = learner.getTree()
     
     #self.assertEqual(tree.getNumVertices(), 1)
     
     #Try a simple tree of depth 1 
     learner = DecisionTreeLearner(minSplit=1, maxDepth=1) 
     learner.learnModel(self.X, self.y)     
     
     bestFeature = 0 
     bestError = 10**6 
     bestThreshold = 0         
     
     #Bug fix: this previously iterated range(numFeatures), the leftover loop variable 
     #from above, which need not match the number of columns of self.X 
     for i in range(self.X.shape[1]): 
         vals = numpy.unique(self.X[:, i])
         
         for j in range(vals.shape[0]-1):             
             threshold = (vals[j+1]+vals[j])/2
             leftInds = self.X[:, i] <= threshold
             rightInds = self.X[:, i] > threshold
             
             valLeft = numpy.mean(self.y[leftInds])
             valRight = numpy.mean(self.y[rightInds])
             
             error = ((self.y[leftInds] - valLeft)**2).sum() + ((self.y[rightInds] - valRight)**2).sum()
             
             if error < bestError: 
                 bestError = error 
                 bestFeature = i 
                 bestThreshold = threshold 
     
     self.assertAlmostEqual(bestThreshold, learner.tree.getRoot().getThreshold())
     self.assertAlmostEqual(bestError, learner.tree.getRoot().getError(), 5)
     self.assertEqual(bestFeature, learner.tree.getRoot().getFeatureInd())
     
     #Now we will test pruning works 
     #NOTE(review): X, y here are left over from the loop above -- confirm whether 
     #self.X, self.y were intended 
     learner = DecisionTreeLearner(minSplit=1, maxDepth=10) 
     learner.learnModel(X, y)
     numVertices1 = learner.getTree().getNumVertices()       
     
     learner = DecisionTreeLearner(minSplit=1, maxDepth=10, pruneType="REP-CV") 
     learner.learnModel(X, y) 
     numVertices2 = learner.getTree().getNumVertices()   
     
     self.assertTrue(numVertices1 >= numVertices2)