def testSetErrorCost(self):
    """A larger error cost should lower the positive-class training error."""
    try:
        import sklearn
    except ImportError as error:
        return

    eg = ExamplesGenerator()
    X, y = eg.generateBinaryExamples(1000, 100)

    svm = LibSVM()
    svm.setKernel("linear", 0)
    svm.setC(0.1)

    # Train twice with a low then a high error cost and record each error rate
    observedErrors = []
    for cost in (0.1, 0.9):
        svm.setErrorCost(cost)
        svm.learnModel(X, y)
        predY = svm.classify(X)
        observedErrors.append(Evaluator.binaryErrorP(y, predY))

    self.assertTrue(observedErrors[0] > observedErrors[1])
def testVariableImportance(self):
    """Smoke test: variableImportance runs on a trained TreeRank model."""
    generator = ExamplesGenerator()
    X, y, c = generator.generateBinaryExamples(numExamples=500, verbose=True)

    ranker = TreeRank(self.leafRanklearner)
    ranker.learnModel(X, y)
    importances = ranker.variableImportance(X, y)
def setUp(self):
    """Seed the RNG, make float errors raise, and generate a small binary data set."""
    numpy.random.seed(21)
    numpy.seterr("raise")

    self.numExamples = 20
    self.numFeatures = 5
    generator = ExamplesGenerator()
    self.X, self.y = generator.generateBinaryExamples(self.numExamples, self.numFeatures)
    # numpy.float was removed in NumPy 1.24; builtin float gives the same 64-bit dtype
    self.y = numpy.array(self.y, float)
def setUp(self):
    """Seed the RNG, make float errors raise, and build a binary example set."""
    numpy.random.seed(21)
    numpy.seterr("raise")

    self.numExamples = 200
    self.numFeatures = 10
    self.X, self.y = ExamplesGenerator().generateBinaryExamples(
        self.numExamples, self.numFeatures)
def testGetModel(self):
    """getWeights returns the weight vector and bias of a learnt LibSVM model."""
    try:
        import sklearn
    except ImportError as error:
        return

    X, y = ExamplesGenerator().generateBinaryExamples(50, 3)

    svm = LibSVM()
    svm.learnModel(X, y)
    weights, b = svm.getWeights()
def testVariableImportance(self):
    """Smoke test for TreeRankForest.variableImportance; prints output for manual inspection."""
    X, y, c = ExamplesGenerator().generateBinaryExamples(numExamples=100, verbose=True)

    forest = TreeRankForest(self.leafRanklearner)
    forest.setFeatureSize(0.5)
    forest.setNumTrees(20)
    forest.setSampleSize(1.0)
    forest.learnModel(X, y)

    importances = forest.variableImportance(X, y)

    #Seems to work, sort of
    print(c)
    print(importances)
    print(numpy.argsort(c))
    print(numpy.argsort(importances))
def setUp(self):
    """Seed the RNG, build a binary data set and a LibSVM with parameter grids.

    Skips silently (returns) when sklearn is unavailable, matching the guarded
    test methods in this suite.
    """
    try:
        import sklearn
    except ImportError as error:
        logging.debug(error)
        return

    numpy.random.seed(21)
    numExamples = 100
    numFeatures = 10
    eg = ExamplesGenerator()
    self.X, self.y = eg.generateBinaryExamples(numExamples, numFeatures)

    self.svm = LibSVM()
    # numpy.float was removed in NumPy 1.24; builtin float gives the same dtype
    self.svm.Cs = 2.0**numpy.arange(-2, 2, dtype=float)
    self.svm.gammas = 2.0**numpy.arange(-3, 1, dtype=float)
    self.svm.epsilons = 2.0**numpy.arange(-2, 0, dtype=float)

    numpy.set_printoptions(linewidth=150, suppress=True, precision=3)
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
def testPredict(self):
    """Compare DecisionTreeLearner predictions with sklearn's DecisionTreeRegressor on random data."""
    generator = ExamplesGenerator()

    for i in range(10):
        numExamples = numpy.random.randint(1, 200)
        numFeatures = numpy.random.randint(1, 20)
        minSplit = numpy.random.randint(1, 50)
        maxDepth = numpy.random.randint(0, 10)

        X, y = generator.generateBinaryExamples(numExamples, numFeatures)
        # numpy.float was removed in NumPy 1.24; builtin float is the same dtype
        y = numpy.array(y, float)

        learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth)
        learner.learnModel(X, y)
        predY = learner.predict(X)

        # Predicting on the training set must route each example to the vertex
        # it was trained on
        tree = learner.tree
        for vertexId in tree.getAllVertexIds():
            nptst.assert_array_equal(tree.getVertex(vertexId).getTrainInds(), tree.getVertex(vertexId).getTestInds())

        # Compare against sklearn tree. The min_density parameter was removed
        # from scikit-learn (deprecated in 0.14, gone since 0.16) so it is no
        # longer passed here.
        regressor = DecisionTreeRegressor(min_samples_split=minSplit, max_depth=maxDepth)
        regressor.fit(X, y)
        sktree = regressor.tree_

        # Note that the sklearn algorithm appears to combine nodes with same value
        #self.assertEqual(sktree.node_count, tree.getNumVertices())
        self.assertEqual(sktree.feature[0], tree.getRoot().getFeatureInd())
        self.assertEqual(sktree.value[0], tree.getRoot().getValue())
        self.assertAlmostEqual(sktree.threshold[0], tree.getRoot().getThreshold(), 3)

        predY2 = regressor.predict(X)
        # Note that this is not always precise because if two thresholds give the
        # same error we choose the largest, and sklearn's tie-breaking may differ
        self.assertTrue(abs(numpy.linalg.norm(predY-y) - numpy.linalg.norm(predY2-y))/numExamples < 0.05)
def setUp(self):
    """Generate 1000 binary examples shared by the tests in this case."""
    self.X, self.y = ExamplesGenerator().generateBinaryExamples(1000)
def testLearnModel(self):
    """Check the structural integrity of learnt trees, the depth-1 split choice,
    and that REP-CV pruning never grows the tree."""
    #First check the integrety of the trees
    generator = ExamplesGenerator()

    for i in range(5):
        numExamples = numpy.random.randint(1, 200)
        numFeatures = numpy.random.randint(1, 10)
        minSplit = numpy.random.randint(1, 50)
        maxDepth = numpy.random.randint(1, 10)

        X, y = generator.generateBinaryExamples(numExamples, numFeatures)
        # numpy.float was removed in NumPy 1.24; builtin float is the same dtype
        y = numpy.array(y, float)

        learner = DecisionTreeLearner(minSplit=minSplit, maxDepth=maxDepth)
        learner.learnModel(X, y)
        tree = learner.getTree()

        for vertexId in tree.getAllVertexIds():
            vertex = tree.getVertex(vertexId)
            if vertex.getFeatureInd() is not None:
                # A vertex's value is the mean label of its training indices
                meanValue = y[vertex.getTrainInds()].mean()
                self.assertEqual(meanValue, vertex.getValue())
            if tree.isNonLeaf(vertexId):
                self.assertTrue(0 <= vertex.getFeatureInd() < X.shape[1])
                self.assertTrue(X[:, vertex.getFeatureInd()].min() <= vertex.getThreshold() <= X[:, vertex.getFeatureInd()].max())
            self.assertTrue(vertex.getTrainInds().shape[0] >= 1)

        self.assertTrue(tree.depth() <= maxDepth)

        #Check that each split contains indices from parent
        root = tree.getRootId()
        vertexStack = [root]

        while len(vertexStack) != 0:
            vertexId = vertexStack.pop()
            neighbours = tree.children(vertexId)

            if len(neighbours) > 2:
                self.fail("Cannot have more than 2 children")
            elif len(neighbours) > 0:
                # A binary split must partition exactly the parent's indices
                inds1 = tree.getVertex(neighbours[0]).getTrainInds()
                inds2 = tree.getVertex(neighbours[1]).getTrainInds()
                nptst.assert_array_equal(numpy.union1d(inds1, inds2), numpy.unique(tree.getVertex(vertexId).getTrainInds()))
                vertexStack.append(neighbours[0])
                vertexStack.append(neighbours[1])

    #Try a tree of depth 0
    #learner = DecisionTreeLearner(minSplit=10, maxDepth=0)
    #learner.learnModel(self.X, self.y)
    #tree = learner.getTree()
    #self.assertEqual(tree.depth(), 0)

    #Try minSplit > numExamples
    #learner = DecisionTreeLearner(minSplit=self.numExamples+1, maxDepth=0)
    #learner.learnModel(self.X, self.y)
    #tree = learner.getTree()
    #self.assertEqual(tree.getNumVertices(), 1)

    #Try a simple tree of depth 1
    learner = DecisionTreeLearner(minSplit=1, maxDepth=1)
    learner.learnModel(self.X, self.y)

    # Brute-force search for the best single split over self.X. The original
    # code iterated range(numFeatures) — a leftover variable from the random
    # loop above — instead of the actual number of columns of self.X.
    bestFeature = 0
    bestError = 10**6
    bestThreshold = 0

    for featInd in range(self.X.shape[1]):
        vals = numpy.unique(self.X[:, featInd])

        for j in range(vals.shape[0]-1):
            threshold = (vals[j+1]+vals[j])/2
            leftInds = self.X[:, featInd] <= threshold
            rightInds = self.X[:, featInd] > threshold
            valLeft = numpy.mean(self.y[leftInds])
            valRight = numpy.mean(self.y[rightInds])
            error = ((self.y[leftInds] - valLeft)**2).sum() + ((self.y[rightInds] - valRight)**2).sum()

            if error < bestError:
                bestError = error
                bestFeature = featInd
                bestThreshold = threshold

    self.assertAlmostEqual(bestThreshold, learner.tree.getRoot().getThreshold())
    self.assertAlmostEqual(bestError, learner.tree.getRoot().getError(), 5)
    self.assertEqual(bestFeature, learner.tree.getRoot().getFeatureInd())

    #Now we will test pruning works
    learner = DecisionTreeLearner(minSplit=1, maxDepth=10)
    learner.learnModel(X, y)
    numVertices1 = learner.getTree().getNumVertices()

    learner = DecisionTreeLearner(minSplit=1, maxDepth=10, pruneType="REP-CV")
    learner.learnModel(X, y)
    numVertices2 = learner.getTree().getNumVertices()
    self.assertTrue(numVertices1 >= numVertices2)