def testPredict2(self):
    #Test on Gauss2D dataset
    dataDir = PathDefaults.getDataDir()

    fileName = dataDir + "Gauss2D_learn.csv"
    XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
    X = XY[:, 0:2]
    y = XY[:, 2]

    fileName = dataDir + "Gauss2D_test.csv"
    testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
    testX = testXY[:, 0:2]
    testY = testXY[:, 2]

    X = Standardiser().standardiseArray(X)
    testX = Standardiser().standardiseArray(testX)

    maxDepths = range(3, 10)
    trainAucs = numpy.array([0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508])
    testAucs = numpy.array([0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400])
    i = 0

    #The results are approximately the same, but not exactly
    for maxDepth in maxDepths:
        treeRank = TreeRank(self.leafRanklearner)
        treeRank.setMaxDepth(maxDepth)
        treeRank.learnModel(X, y)

        trainScores = treeRank.predict(X)
        testScores = treeRank.predict(testX)

        self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 2)
        self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
        i += 1
def testLocalAuc(self):
    testY = numpy.array([-1, -1, 1, 1, 1, 1, 1, -1, -1, 1])
    predY = numpy.array([0.987, 0.868, 0.512, 0.114, 0.755, 0.976, 0.05, 0.371, 0.629, 0.819])

    self.assertEquals(Evaluator.localAuc(testY, predY, 1.0), Evaluator.auc(predY, testY))
    self.assertEquals(Evaluator.localAuc(testY, predY, 0.0), 0)
    self.assertEquals(Evaluator.localAuc(testY, testY, 0.2), 1.0)
def testAveragePrecisionFromLists(self):
    predList = [4, 2, 10]
    testList = [4, 2, 15, 16]
    self.assertEquals(Evaluator.averagePrecisionFromLists(testList, predList), 0.5)

    predList = [0, 1, 2, 3, 4, 5]
    testList = [0, 3, 4, 5]
    self.assertAlmostEquals(Evaluator.averagePrecisionFromLists(testList, predList), 0.691666666666)
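#For reference, a minimal sketch of the average precision the assertions above
#imply: the sum of precision@k over the ranks k at which a relevant item is
#retrieved, divided by the total number of relevant items. The helper name
#averagePrecisionSketch is hypothetical and is not part of Evaluator.
def averagePrecisionSketch(testList, predList):
    hits = 0
    total = 0.0
    for k, item in enumerate(predList):
        if item in testList:
            hits += 1
            total += float(hits)/(k + 1)
    return total/len(testList)

#e.g. averagePrecisionSketch([4, 2, 15, 16], [4, 2, 10]) returns 0.5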
def testBinaryError(self):
    testY = numpy.array([1, 1, -1, 1])
    predY = numpy.array([-1, 1, -1, 1])
    predY2 = numpy.array([-1, -1, -1, 1])
    predY3 = numpy.array([-1, -1, 1, -1])

    self.assertTrue(Evaluator.binaryError(testY, predY) == 0.25)
    self.assertTrue(Evaluator.binaryError(testY, testY) == 0.0)
    self.assertTrue(Evaluator.binaryError(predY, predY) == 0.0)
    self.assertTrue(Evaluator.binaryError(testY, predY2) == 0.5)
    self.assertTrue(Evaluator.binaryError(testY, predY3) == 1.0)
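#A minimal sketch of the behaviour testBinaryError checks: the fraction of
#positions at which the two label vectors disagree. binaryErrorSketch is a
#hypothetical name, not the Evaluator implementation.
def binaryErrorSketch(testY, predY):
    if testY.shape[0] != predY.shape[0]:
        raise ValueError("Label vectors must have the same length")
    return numpy.mean(testY != predY)

#e.g. binaryErrorSketch(numpy.array([1, 1, -1, 1]), numpy.array([-1, 1, -1, 1])) returns 0.25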
def testMeanAbsError(self):
    testY = numpy.array([1, 2, 1.5])
    predY = numpy.array([2, 1, 0.5])

    self.assertEquals(Evaluator.meanAbsError(testY, predY), 1.0)
    self.assertEquals(Evaluator.meanAbsError(testY, testY), 0.0)

    testY = numpy.random.rand(10)
    predY = numpy.random.rand(10)
    error = numpy.abs(testY - predY).mean()
    self.assertEquals(error, Evaluator.meanAbsError(testY, predY))
def testWeightedRootMeanSqError(self):
    y = numpy.array([0.1, 0.2, 0.3])
    predY = numpy.array([0.1, 0.2, 0.3])
    self.assertEquals(Evaluator.weightedRootMeanSqError(y, predY), 0.0)

    #Errors on larger ys are weighted more
    predY = numpy.array([0.0, 0.2, 0.3])
    predY2 = numpy.array([0.1, 0.2, 0.4])
    self.assertTrue(Evaluator.weightedRootMeanSqError(y, predY) < Evaluator.weightedRootMeanSqError(y, predY2))
def testLearnModel2(self):
    #We want to make sure the learnt tree with gamma = 0 minimises the
    #empirical risk
    minSplit = 20
    maxDepth = 3
    gamma = 0.01

    learner = PenaltyDecisionTree(minSplit=minSplit, maxDepth=maxDepth, gamma=gamma, pruning=False)

    #Vary sampleSize
    numpy.random.seed(21)
    learner.setSampleSize(1)
    learner.learnModel(self.X, self.y)
    error1 = learner.treeObjective(self.X, self.y)

    numpy.random.seed(21)
    learner.setSampleSize(5)
    learner.learnModel(self.X, self.y)
    error2 = learner.treeObjective(self.X, self.y)

    numpy.random.seed(21)
    learner.setSampleSize(10)
    learner.learnModel(self.X, self.y)
    error3 = learner.treeObjective(self.X, self.y)

    self.assertTrue(error1 >= error2)
    self.assertTrue(error2 >= error3)

    #Now vary max depth
    learner.gamma = 0

    numpy.random.seed(21)
    learner.setSampleSize(1)
    learner.minSplit = 1
    learner.maxDepth = 3
    learner.learnModel(self.X, self.y)
    predY = learner.predict(self.X)
    error1 = Evaluator.binaryError(self.y, predY)

    numpy.random.seed(21)
    learner.maxDepth = 5
    learner.learnModel(self.X, self.y)
    predY = learner.predict(self.X)
    error2 = Evaluator.binaryError(self.y, predY)

    numpy.random.seed(21)
    learner.maxDepth = 10
    learner.learnModel(self.X, self.y)
    predY = learner.predict(self.X)
    error3 = Evaluator.binaryError(self.y, predY)

    self.assertTrue(error1 >= error2)
    self.assertTrue(error2 >= error3)
def testSetC(self):
    rankSVM = RankSVM()

    rankSVM.setC(100.0)
    rankSVM.learnModel(self.X, self.y)
    predY = rankSVM.predict(self.X)
    auc1 = Evaluator.auc(predY, self.y)

    rankSVM.setC(0.1)
    rankSVM.learnModel(self.X, self.y)
    predY = rankSVM.predict(self.X)
    auc2 = Evaluator.auc(predY, self.y)

    self.assertTrue(auc1 != auc2)
def testRootMeanSqError(self):
    y = numpy.array([1, 2, 3])
    predY = numpy.array([1, 2, 3])
    self.assertEquals(Evaluator.rootMeanSqError(y, predY), 0.0)

    y = numpy.array([1, 2, 3])
    predY = numpy.array([1, 2, 2])
    self.assertEquals(Evaluator.rootMeanSqError(y, predY), float(1)/numpy.sqrt(3))

    predY = numpy.array([1, 2])
    self.assertRaises(ValueError, Evaluator.rootMeanSqError, y, predY)
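#A sketch of the root mean squared error asserted above, assuming the usual
#definition sqrt(mean((y - predY)**2)); rootMeanSqErrorSketch is hypothetical.
def rootMeanSqErrorSketch(y, predY):
    if y.shape[0] != predY.shape[0]:
        raise ValueError("Vectors must have the same length")
    return numpy.sqrt(numpy.mean((y - predY)**2))

#e.g. rootMeanSqErrorSketch(numpy.array([1, 2, 3]), numpy.array([1, 2, 2])) returns 1/sqrt(3)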
def testBinaryBootstrapError(self):
    testY = numpy.array([-1, -1, 1, 1, 1])
    predY = 1 - testY
    trainY = numpy.array([-1, -1, 1, 1, 1])
    predTrainY = 1 - trainY

    self.assertEquals(Evaluator.binaryBootstrapError(testY, testY, trainY, trainY, 0.5), 0.0)
    self.assertEquals(Evaluator.binaryBootstrapError(testY, testY, trainY, predTrainY, 0.5), 0.5)
    self.assertEquals(Evaluator.binaryBootstrapError(testY, testY, trainY, predTrainY, 0.1), 0.9)
    self.assertEquals(Evaluator.binaryBootstrapError(testY, predY, trainY, trainY, 0.1), 0.1)
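#The assertions above are consistent with a weighted combination of the test
#and training zero-one errors, in the style of the 0.632 bootstrap. This sketch
#is a hypothetical illustration of that combination, not the Evaluator
#implementation itself.
def binaryBootstrapErrorSketch(testY, predTestY, trainY, predTrainY, weight):
    testError = numpy.mean(testY != predTestY)
    trainError = numpy.mean(trainY != predTrainY)
    return weight*testError + (1 - weight)*trainError

#e.g. a zero test error and a total train error with weight=0.1 gives 0.9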
def evaluateCvOuter(self, X, Y, folds, leafRank):
    """
    Run cross validation and output some ROC curves. In this case Y is a 1D array.
    """
    Parameter.checkClass(X, numpy.ndarray)
    Parameter.checkClass(Y, numpy.ndarray)
    Parameter.checkInt(folds, 2, float('inf'))
    if Y.ndim != 1:
        raise ValueError("Expecting Y to be 1D")

    indexList = cross_val.StratifiedKFold(Y, folds)
    self.setLeafRank(leafRank)

    bestParams = []
    bestTrainAUCs = numpy.zeros(folds)
    bestTrainROCs = []
    bestTestAUCs = numpy.zeros(folds)
    bestTestROCs = []
    bestMetaDicts = []
    i = 0

    for trainInds, testInds in indexList:
        Util.printIteration(i, 1, folds)
        trainX, trainY = X[trainInds, :], Y[trainInds]
        testX, testY = X[testInds, :], Y[testInds]

        logging.debug("Distribution of labels in train: " + str(numpy.bincount(trainY)))
        logging.debug("Distribution of labels in test: " + str(numpy.bincount(testY)))

        self.learnModel(trainX, trainY)
        predTrainY = self.predict(trainX)
        predTestY = self.predict(testX)
        bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
        bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

        #Store the parameters and ROC curves
        bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
        bestTestROCs.append(Evaluator.roc(testY, predTestY))

        metaDict = {}
        bestMetaDicts.append(metaDict)

        i += 1

    logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
    logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))
    allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

    return (bestParams, allMetrics, bestMetaDicts)
def testAuc(self):
    testY = numpy.array([-1, -1, 1, 1])
    predY = numpy.array([-1, 0, 1, 1])
    predY2 = numpy.array([0.1, 0.2, 0.3, 0.4])

    self.assertEquals(Evaluator.auc(predY, testY), 1.0)
    self.assertEquals(Evaluator.auc(predY2, testY), 1.0)
    self.assertEquals(Evaluator.auc(-predY, testY), 0.0)

    numExamples = 1000
    testY = numpy.array(numpy.random.rand(numExamples) > 0.5, numpy.int)
    predY = numpy.random.rand(numExamples) > 0.5

    #For a random score the AUC is approximately 0.5
    self.assertAlmostEquals(Evaluator.auc(predY, testY), 0.5, 1)
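#A hedged sketch of the AUC these tests rely on: the probability that a
#randomly chosen positive example scores higher than a randomly chosen negative
#one. It assumes -1/+1 labels and ignores ties, both of which Evaluator.auc may
#handle differently; the name aucSketch is hypothetical.
def aucSketch(predY, testY):
    posScores = predY[testY == 1]
    negScores = predY[testY == -1]
    comparisons = posScores[:, numpy.newaxis] > negScores[numpy.newaxis, :]
    return numpy.mean(comparisons)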
def computeRankMetrics(self, X, Y, indexList, bestLearners, standardiserY, labelIndex):
    #Some code to do ranking using the learner predictors
    i = 0
    rankMetrics = numpy.zeros((len(indexList), self.boundsList[labelIndex].shape[0]-1))

    for idxtr, idxts in indexList:
        logging.info("Iteration " + str(i))
        trainX, testX = X[idxtr, :], X[idxts, :]
        trainY, testY = Y[idxtr], Y[idxts]

        bestLearners[i].learnModel(trainX, trainY)
        predY = bestLearners[i].predict(testX)
        gc.collect()

        #Now output 3 sets of ranked scores
        predY = standardiserY.unstandardiseArray(predY)
        testY = standardiserY.unstandardiseArray(testY)

        YScores = MetabolomicsUtils.scoreLabels(predY, self.boundsList[labelIndex])
        YIndList = MetabolomicsUtils.createIndicatorLabel(testY, self.boundsList[labelIndex])

        for j in range(self.boundsList[labelIndex].shape[0]-1):
            rankMetrics[i, j] = Evaluator.auc(YScores[:, j], YIndList[j])
        i += 1

    logging.debug(rankMetrics)
    return rankMetrics
def testAuc(self):
    self.treeRankForest.learnModel(self.X, self.Y)
    scores = self.treeRankForest.predictScores(self.X)

    auc1 = Evaluator.auc(scores, self.Y.ravel())
    auc2 = self.treeRankForest.aucFromROC(self.treeRankForest.predictROC(self.X, self.Y))

    self.assertAlmostEquals(auc1, auc2, places=4)
def testPredict2(self): #Test on Gauss2D dataset dataDir = PathDefaults.getDataDir() fileName = dataDir + "Gauss2D_learn.csv" XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",") X = XY[:, 0:2] y = XY[:, 2] y = y*2 - 1 fileName = dataDir + "Gauss2D_test.csv" testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",") testX = testXY[:, 0:2] testY = testXY[:, 2] testY = testY*2-1 X = Standardiser().standardiseArray(X) testX = Standardiser().standardiseArray(testX) numTrees = 5 minSplit = 50 maxDepths = range(3, 10) trainAucs = numpy.array([0.7252582, 0.7323278, 0.7350289, 0.7372529, 0.7399985, 0.7382176, 0.7395104, 0.7386347]) testAucs = numpy.array([0.6806122, 0.6851614, 0.6886183, 0.6904147, 0.6897266, 0.6874600, 0.6875980, 0.6878801]) i = 0 #The results are approximately the same, but not exactly for maxDepth in maxDepths: treeRankForest = TreeRankForest(self.leafRanklearner) treeRankForest.setMaxDepth(maxDepth) treeRankForest.setMinSplit(minSplit) treeRankForest.setNumTrees(numTrees) treeRankForest.learnModel(X, y) trainScores = treeRankForest.predict(X) testScores = treeRankForest.predict(testX) print(Evaluator.auc(trainScores, y), Evaluator.auc(testScores, testY)) self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 1) self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1) i+=1
def testPrecisionFromIndLists(self):
    predList = [4, 2, 10]
    testList = [4, 2]
    self.assertEquals(Evaluator.precisionFromIndLists(testList, predList), 2.0/3)

    testList = [4, 2, 10]
    self.assertEquals(Evaluator.precisionFromIndLists(testList, predList), 1)

    predList = [10, 2, 4]
    self.assertEquals(Evaluator.precisionFromIndLists(testList, predList), 1)

    testList = [1, 9, 11]
    self.assertEquals(Evaluator.precisionFromIndLists(testList, predList), 0)

    predList = [1, 2, 3, 4, 5]
    testList = [1, 9, 11]
    self.assertEquals(Evaluator.precisionFromIndLists(testList, predList), 1.0/5)
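#A sketch of the set-based precision checked above: the fraction of predicted
#indices that also appear in the test list. precisionFromIndListsSketch is a
#hypothetical name.
def precisionFromIndListsSketch(testList, predList):
    return len(set(testList) & set(predList))/float(len(predList))

#e.g. precisionFromIndListsSketch([4, 2], [4, 2, 10]) returns 2.0/3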
def computeBootstrapError(args):
    """
    Used in conjunction with the parallel model selection. Trains and then tests
    on a separate test set and evaluates the bootstrap error.
    """
    (trainX, trainY, testX, testY, learner) = args
    learner.learnModel(trainX, trainY)
    predTestY = learner.predict(testX)
    predTrainY = learner.predict(trainX)
    weight = 0.632
    return Evaluator.binaryBootstrapError(predTestY, testY, predTrainY, trainY, weight)
def testCvPrune(self):
    numExamples = 500
    X, y = data.make_regression(numExamples)
    y = Standardiser().standardiseArray(y)

    #Cast to int so the values can be used as slice indices
    numTrain = int(numpy.round(numExamples * 0.33))
    numValid = int(numpy.round(numExamples * 0.33))

    trainX = X[0:numTrain, :]
    trainY = y[0:numTrain]
    validX = X[numTrain:numTrain+numValid, :]
    validY = y[numTrain:numTrain+numValid]
    testX = X[numTrain+numValid:, :]
    testY = y[numTrain+numValid:]

    learner = DecisionTreeLearner()
    learner.learnModel(trainX, trainY)
    error1 = Evaluator.rootMeanSqError(learner.predict(testX), testY)
    #print(learner.getTree())

    unprunedTree = learner.tree.copy()
    learner.setGamma(1000)
    learner.cvPrune(trainX, trainY)
    self.assertEquals(unprunedTree.getNumVertices(), learner.tree.getNumVertices())

    learner.setGamma(100)
    learner.cvPrune(trainX, trainY)

    #Test if pruned tree is subtree of current:
    for vertexId in learner.tree.getAllVertexIds():
        self.assertTrue(vertexId in unprunedTree.getAllVertexIds())

    #The error should be better after pruning
    learner.learnModel(trainX, trainY)
    #learner.cvPrune(validX, validY, 0.0, 5)
    learner.repPrune(validX, validY)

    error2 = Evaluator.rootMeanSqError(learner.predict(testX), testY)
    self.assertTrue(error1 >= error2)
def meanAUC(self, predY, testY, labelIndex, standardiserY):
    predY = standardiserY.unstandardiseArray(predY)
    testY = standardiserY.unstandardiseArray(testY)

    YScores = MetabolomicsUtils.scoreLabels(predY, self.boundsList[labelIndex])
    YIndList = MetabolomicsUtils.createIndicatorLabel(testY, self.boundsList[labelIndex])

    rankMetrics = numpy.zeros(self.boundsList[labelIndex].shape[0]-1)
    for j in range(rankMetrics.shape[0]):
        rankMetrics[j] = Evaluator.auc(YScores[:, j], YIndList[j])

    return numpy.mean(rankMetrics)
def testBayesError(self):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

    sampleSize = 100
    trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
    testX, testY = X[sampleSize:, :], y[sampleSize:]

    #We form a test set from the grid points
    gridX = numpy.zeros((gridPoints.shape[0]**2, 2))
    for m in range(gridPoints.shape[0]):
        gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
        gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

    Cs = 2**numpy.arange(-5, 5, dtype=numpy.float)
    gammas = 2**numpy.arange(-5, 5, dtype=numpy.float)

    bestError = 1

    for C in Cs:
        for gamma in gammas:
            svm = LibSVM(kernel="gaussian", C=C, kernelParam=gamma)
            svm.learnModel(trainX, trainY)
            predY, decisionsY = svm.predict(gridX, True)
            decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
            error = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)

            predY, decisionsY = svm.predict(testX, True)
            error2 = Evaluator.binaryError(testY, predY)
            print(error, error2)

            #Keep the parameters with the smallest Bayes error
            if error < bestError:
                bestError = error
                bestC = C
                bestGamma = gamma

    svm = LibSVM(kernel="gaussian", C=bestC, kernelParam=bestGamma)
    svm.learnModel(trainX, trainY)
    predY, decisionsY = svm.predict(gridX, True)
    decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")

    plt.figure(0)
    plt.contourf(gridPoints, gridPoints, decisionGrid, 100)
    plt.colorbar()

    plt.figure(1)
    plt.scatter(X[y==1, 0], X[y==1, 1], c='r', label="+1")
    plt.scatter(X[y==-1, 0], X[y==-1, 1], c='b', label="-1")
    plt.legend()
    plt.show()
def computeIdealPenalty(args):
    """
    Find the complete penalty.
    """
    (X, y, fullX, C, gamma, gridPoints, pdfX, pdfY1X, pdfYminus1X) = args

    svm = LibSVM('gaussian', gamma, C)
    svm.learnModel(X, y)
    predY = svm.predict(X)

    predFullY, decisionsY = svm.predict(fullX, True)
    decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
    trueError = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)

    idealPenalty = trueError - Evaluator.binaryError(predY, y)

    return idealPenalty
def saveResult(self, X, Y, learner, paramDict, fileName):
    """
    Save a single result to file, checking if the results have already been computed
    """
    filelock = FileLock(fileName)
    gc.collect()

    if not filelock.isLocked() and not filelock.fileExists():
        filelock.lock()

        try:
            logging.debug("Computing file " + fileName)
            logging.debug("Shape of examples: " + str(X.shape) + ", number of +1: " + str(numpy.sum(Y==1)) + ", -1: " + str(numpy.sum(Y==-1)))

            #idxFull = Sampling.crossValidation(self.outerFolds, X.shape[0])
            idxFull = StratifiedKFold(Y, self.outerFolds)
            errors = numpy.zeros(self.outerFolds)

            for i, (trainInds, testInds) in enumerate(idxFull):
                logging.debug("Outer fold: " + str(i))

                trainX, trainY = X[trainInds, :], Y[trainInds]
                testX, testY = X[testInds, :], Y[testInds]

                #idx = Sampling.crossValidation(self.innerFolds, trainX.shape[0])
                idx = StratifiedKFold(trainY, self.innerFolds)
                logging.debug("Initial learner is " + str(learner))
                bestLearner, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)
                bestLearner = learner.getBestLearner(cvGrid, paramDict, trainX, trainY, idx, best="max")
                logging.debug("Best learner is " + str(bestLearner))

                bestLearner.learnModel(trainX, trainY)
                predY = bestLearner.predict(testX)
                errors[i] = Evaluator.auc(predY, testY)

            logging.debug("Mean auc: " + str(numpy.mean(errors)))
            numpy.save(fileName, errors)
            logging.debug("Saved results as : " + fileName)
        finally:
            filelock.unlock()
    else:
        logging.debug("File exists, or is locked: " + fileName)
def learnModelCut(self, X, Y, folds=4):
    """
    Perform model learning with tree cutting in order to choose a maximal
    depth. The best tree is chosen using cross validation and depths are
    selected from 0 to maxDepth. The best depth corresponds to the maximal
    AUC obtained using cross validation.

    :param X: A matrix with examples as rows
    :type X: :class:`ndarray`

    :param Y: A vector of binary labels as a 1D array
    :type Y: :class:`ndarray`

    :param folds: The number of cross validation folds.
    :type folds: :class:`int`
    """
    indexList = cross_val.StratifiedKFold(Y, folds)
    depths = numpy.arange(1, self.maxDepth)
    meanAUCs = numpy.zeros(depths.shape[0])

    for trainInds, testInds in indexList:
        trainX, trainY = X[trainInds, :], Y[trainInds]
        testX, testY = X[testInds, :], Y[testInds]

        self.learnModel(trainX, trainY)
        fullTree = self.tree

        for i in range(fullTree.depth()):
            d = depths[i]
            self.tree = TreeRank.cut(fullTree, d)
            predTestY = self.predict(testX)
            meanAUCs[i] += Evaluator.auc(predTestY, testY)/float(folds)

    bestDepth = depths[numpy.argmax(meanAUCs)]
    self.learnModel(X, Y)
    self.tree = TreeRank.cut(self.tree, bestDepth)
def greedyMC2(lists, itemList, trainList, n):
    """
    A method to greedily select a subset of the output lists such that the
    average precision is maximised
    """
    #Use a list (not a range object) so that pop works below
    currentListsInds = list(range(len(lists)))
    newListsInds = []
    currentAvPrecision = 0
    lastAvPrecision = -0.1

    while currentAvPrecision - lastAvPrecision > 0:
        lastAvPrecision = currentAvPrecision
        averagePrecisions = numpy.zeros(len(currentListsInds))

        for i, j in enumerate(currentListsInds):
            newListsInds.append(j)

            newLists = []
            for k in newListsInds:
                newLists.append(lists[k])

            rankAggregate, scores = RankAggregator.MC2(newLists, itemList)
            averagePrecisions[i] = Evaluator.averagePrecisionFromLists(trainList, rankAggregate[0:n], n)
            newListsInds.remove(j)

        j = numpy.argmax(averagePrecisions)
        currentAvPrecision = averagePrecisions[j]

        if currentAvPrecision > lastAvPrecision:
            newListsInds.append(currentListsInds.pop(j))

    return newListsInds
#Figure out why the penalty is increasing
X = trainX
y = trainY

for i in range(foldsSet.shape[0]):
    folds = foldsSet[i]
    idx = Sampling.crossValidation(folds, validX.shape[0])

    penalty = 0
    fullError = 0
    trainError = 0

    learner.learnModel(validX, validY)
    predY = learner.predict(X)
    predValidY = learner.predict(validX)
    idealPenalty = Evaluator.rootMeanSqError(predY, y) - Evaluator.rootMeanSqError(predValidY, validY)

    for trainInds, testInds in idx:
        trainX = validX[trainInds, :]
        trainY = validY[trainInds]

        #learner.setGamma(gamma)
        #learner.setC(C)
        learner.learnModel(trainX, trainY)
        predY = learner.predict(validX)
        predTrainY = learner.predict(trainX)

        fullError += Evaluator.rootMeanSqError(predY, validY)
        trainError += Evaluator.rootMeanSqError(predTrainY, trainY)
        penalty += Evaluator.rootMeanSqError(predY, validY) - Evaluator.rootMeanSqError(predTrainY, trainY)

    print((folds-1)*fullError/folds, (folds-1)*trainError/folds, (folds-1)*penalty/folds)
tau = 1.0
lmbda = 0.1
linearKernel = LinearKernel()
permutationKernel = PermutationGraphKernel(tau, linearKernel)
randomWalkKernel = RandWalkGraphKernel(lmbda)

K1 = numpy.zeros((numGraphs, numGraphs))
K2 = numpy.zeros((numGraphs, numGraphs))

for i in range(0, numGraphs):
    print(("i=" + str(i)))
    for j in range(0, numGraphs):
        print(("j=" + str(j)))
        K1[i, j] = permutationKernel.evaluate(graphs[i], graphs[j])
        K2[i, j] = randomWalkKernel.evaluate(graphs[i], graphs[j])

D1 = KernelUtils.computeDistanceMatrix(K1)
D2 = KernelUtils.computeDistanceMatrix(K2)

#Use integer division so numPairs can be used to build index ranges
numPairs = numGraphs//2
windowSize = 3

pairIndices = numpy.array([list(range(numPairs)), list(range(numPairs))]).T
pairIndices[:, 1] = numPairs + pairIndices[:, 1]

error1 = Evaluator.evaluateWindowError(D1, windowSize, pairIndices)
error2 = Evaluator.evaluateWindowError(D2, windowSize, pairIndices)

print(("Error 1: " + str(error1)))
print(("Error 2: " + str(error2)))
def testPredict(self):
    rankBoost = RankBoost()
    rankBoost.learnModel(self.X, self.y)
    predY = rankBoost.predict(self.X)

    self.assertTrue(Evaluator.auc(predY, self.y) <= 1.0 and Evaluator.auc(predY, self.y) >= 0.0)
def testGrowTree(self):
    startId = (0, )
    minSplit = 20
    maxDepth = 3
    gamma = 0.01

    learner = PenaltyDecisionTree(minSplit=minSplit, maxDepth=maxDepth, gamma=gamma, pruning=False)

    trainX = self.X[100:, :]
    trainY = self.y[100:]
    testX = self.X[0:100, :]
    testY = self.y[0:100]

    argsortX = numpy.zeros(trainX.shape, numpy.int)
    for i in range(trainX.shape[1]):
        argsortX[:, i] = numpy.argsort(trainX[:, i])
        argsortX[:, i] = numpy.argsort(argsortX[:, i])

    learner.tree = DictTree()
    rootNode = DecisionNode(numpy.arange(trainX.shape[0]), Util.mode(trainY))
    learner.tree.setVertex(startId, rootNode)

    #Note that this matches with the case where we create a new tree each time
    numpy.random.seed(21)
    bestError = float("inf")

    for i in range(20):
        learner.tree.pruneVertex(startId)
        learner.growTree(trainX, trainY, argsortX, startId)

        predTestY = learner.predict(testX)
        error = Evaluator.binaryError(predTestY, testY)
        #print(Evaluator.binaryError(predTestY, testY), learner.tree.getNumVertices())

        if error < bestError:
            bestError = error
            bestTree = learner.tree.copy()

        self.assertTrue(learner.tree.depth() <= maxDepth)

        for vertexId in learner.tree.nonLeaves():
            self.assertTrue(learner.tree.getVertex(vertexId).getTrainInds().shape[0] >= minSplit)

    bestError1 = bestError
    learner.tree = bestTree

    #Now we test growing a tree from a non-root vertex
    numpy.random.seed(21)
    for i in range(20):
        learner.tree.pruneVertex((0, 1))
        learner.growTree(trainX, trainY, argsortX, (0, 1))

        self.assertTrue(learner.tree.getVertex((0,)) == bestTree.getVertex((0,)))
        self.assertTrue(learner.tree.getVertex((0, 0)) == bestTree.getVertex((0, 0)))

        predTestY = learner.predict(testX)
        error = Evaluator.binaryError(predTestY, testY)

        if error < bestError:
            bestError = error
            bestTree = learner.tree.copy()
        #print(Evaluator.binaryError(predTestY, testY), learner.tree.getNumVertices())

    self.assertTrue(bestError1 >= bestError)
        minAlpha = alpha
    if alpha > maxAlpha:
        maxAlpha = alpha

numAlphas = 100
alphas = numpy.linspace(maxAlpha+0.1, minAlpha, numAlphas)
errors = numpy.zeros(numAlphas)

for i in range(alphas.shape[0]):
    #learner.learnModel(trainX, trainY)
    learner.setAlphaThreshold(alphas[i])
    learner.cvPrune(trainX, trainY)
    #learner.cvPrune(validX, validY, alphas[numpy.argmin(errors)])
    #learner.prune(validX, validY, alphas[i])
    predY = learner.predict(testX)
    errors[i] = Evaluator.rootMeanSqError(predY, testY)

plt.figure(3)
plt.scatter(alphas, errors)

#Now plot best tree
plt.figure(4)
learner.learnModel(trainX, trainY)
#learner.cvPrune(validX, validY, alphas[numpy.argmin(errors)])
learner.setAlphaThreshold(alphas[numpy.argmin(errors)])
learner.cvPrune(trainX, trainY)

rootId = learner.tree.getRootId()
displayTree(learner, rootId, 0, 1, 0, 1, colormap)
plt.show()
def testModelSelect(self):
    """
    We test the results on some data and compare to SVR.
    """
    numExamples = 200
    X, y = data.make_regression(numExamples, noise=0.5)

    X = Standardiser().standardiseArray(X)
    y = Standardiser().standardiseArray(y)

    trainX = X[0:100, :]
    trainY = y[0:100]
    testX = X[100:, :]
    testY = y[100:]

    learner = DecisionTreeLearner(maxDepth=20, minSplit=10, pruneType="REP-CV")
    learner.setPruneCV(8)

    paramDict = {}
    paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 10)
    paramDict["setPruneCV"] = numpy.arange(6, 11, 2, numpy.int)

    folds = 5
    idx = Sampling.crossValidation(folds, trainX.shape[0])
    bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)

    predY = bestTree.predict(testX)
    error = Evaluator.rootMeanSqError(testY, predY)
    print(error)

    learner = DecisionTreeLearner(maxDepth=20, minSplit=5, pruneType="CART")

    paramDict = {}
    paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 50)

    folds = 5
    idx = Sampling.crossValidation(folds, trainX.shape[0])
    bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)

    predY = bestTree.predict(testX)
    error = Evaluator.rootMeanSqError(testY, predY)
    print(error)
    return

    #Let's compare to the SVM
    learner2 = LibSVM(kernel='gaussian', type="Epsilon_SVR")

    paramDict = {}
    paramDict["setC"] = 2.0**numpy.arange(-10, 14, 2, dtype=numpy.float)
    paramDict["setGamma"] = 2.0**numpy.arange(-10, 4, 2, dtype=numpy.float)
    paramDict["setEpsilon"] = learner2.getEpsilons()

    idx = Sampling.crossValidation(folds, trainX.shape[0])
    bestSVM, cvGrid = learner2.parallelModelSelect(trainX, trainY, idx, paramDict)

    predY = bestSVM.predict(testX)
    error = Evaluator.rootMeanSqError(testY, predY)
    print(error)