Example #1
    def testParallelPenaltyGridRbf(self):
        svm = self.svm
        svm.setKernel("gaussian")
        trainX = self.X[0:40, :]
        trainY = self.y[0:40]

        idealPenalties = svm.parallelPenaltyGridRbf(trainX, trainY, self.X,
                                                    self.y)
        idealPenalties2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
        idealPenalties3 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))

        for i in range(svm.Cs.shape[0]):
            C = svm.Cs[i]
            for j in range(svm.gammas.shape[0]):
                gamma = svm.gammas[j]

                svm.setGamma(gamma)
                svm.setC(C)
                svm.learnModel(trainX, trainY)
                predY = svm.predict(self.X)
                predTrainY = svm.predict(trainX)
                penalty = Evaluator.binaryError(
                    predY, self.y) - Evaluator.binaryError(predTrainY, trainY)

                idealPenalties2[i, j] = penalty

                args = (trainX, trainY, self.X, self.y, svm)
                idealPenalties3[i, j] = computeIdealPenalty(args)

        tol = 10**-6
        self.assertTrue(
            numpy.linalg.norm(idealPenalties2.T - idealPenalties) < tol)
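The helper computeIdealPenalty is not shown in this snippet. Given the args tuple built in the loop and the grid it fills, a plausible sketch is the following; the body is inferred from this test, not taken from the library:

def computeIdealPenalty(args):
    #Inferred sketch: the ideal penalty is the gap between the error on the
    #full dataset and the training error, for a learner fitted on the training set
    (trainX, trainY, fullX, fullY, learner) = args
    learner.learnModel(trainX, trainY)
    predY = learner.predict(fullX)
    predTrainY = learner.predict(trainX)
    return Evaluator.binaryError(predY, fullY) - Evaluator.binaryError(predTrainY, trainY)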
Example #2
    def testParallelPenaltyGrid(self):
        svm = self.svm
        svm.setKernel("gaussian")
        trainX = self.X[0:40, :]
        trainY = self.y[0:40]
        
        paramDict = {} 
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()      

        idealPenalties = svm.parallelPenaltyGrid(trainX, trainY, self.X, self.y, paramDict)
        idealPenalties2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
        idealPenalties3 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))

        for i in range(svm.Cs.shape[0]):
            C = svm.Cs[i]
            for j in range(svm.gammas.shape[0]):
                gamma = svm.gammas[j]

                svm.setGamma(gamma)
                svm.setC(C)
                svm.learnModel(trainX, trainY)
                predY = svm.predict(self.X)
                predTrainY = svm.predict(trainX)
                penalty = Evaluator.binaryError(predY, self.y) - Evaluator.binaryError(predTrainY, trainY)

                idealPenalties2[i, j] = penalty

                args = (trainX, trainY, self.X, self.y, svm)
                idealPenalties3[i, j] = computeIdealPenalty(args)

        tol = 10**-6 
        self.assertTrue(numpy.linalg.norm(idealPenalties2.T - idealPenalties) < tol)
Example #3
    def testSetErrorCost(self):
        try:
            import sklearn
        except ImportError as error:
            return

        numExamples = 1000
        numFeatures = 100
        eg = ExamplesGenerator()
        X, y = eg.generateBinaryExamples(numExamples, numFeatures)
        svm = LibSVM()

        C = 0.1
        kernel = "linear"
        kernelParam = 0
        svm.setKernel(kernel, kernelParam)
        svm.setC(C)

        svm.setErrorCost(0.1)
        svm.learnModel(X, y)
        predY = svm.classify(X)
        e1 = Evaluator.binaryErrorP(y, predY)

        svm.setErrorCost(0.9)
        svm.learnModel(X, y)
        predY = svm.classify(X)
        e2 = Evaluator.binaryErrorP(y, predY)

        self.assertTrue(e1 > e2)
Example #4
    def testPredict2(self):
        # Test on Gauss2D dataset
        dataDir = PathDefaults.getDataDir()

        fileName = dataDir + "Gauss2D_learn.csv"
        XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
        X = XY[:, 0:2]
        y = XY[:, 2]

        fileName = dataDir + "Gauss2D_test.csv"
        testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1, 2, 3), delimiter=",")
        testX = testXY[:, 0:2]
        testY = testXY[:, 2]

        X = Standardiser().standardiseArray(X)
        testX = Standardiser().standardiseArray(testX)

        maxDepths = range(3, 10)
        trainAucs = numpy.array(
            [0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508, 0.7367508, 0.7367508]
        )
        testAucs = numpy.array([0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400, 0.6874400, 0.6874400])
        i = 0

        # The results are approximately the same, but not exactly
        for maxDepth in maxDepths:
            treeRank = TreeRank(self.leafRanklearner)
            treeRank.setMaxDepth(maxDepth)
            treeRank.learnModel(X, y)
            trainScores = treeRank.predict(X)
            testScores = treeRank.predict(testX)

            self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 2)
            self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
            i += 1
Example #6
    def testParallelPen(self):
        folds = 3
        Cv = numpy.array([4.0])
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")

        paramDict = {}
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()

        resultsList = svm.parallelPen(self.X, self.y, idx, paramDict, Cv)

        tol = 10**-6
        bestError = 1
        trainErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
        penalties2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
        meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))

        for i in range(svm.Cs.shape[0]):
            C = svm.Cs[i]
            for j in range(svm.gammas.shape[0]):
                gamma = svm.gammas[j]
                penalty = 0
                for trainInds, testInds in idx:
                    trainX = self.X[trainInds, :]
                    trainY = self.y[trainInds]

                    svm.setGamma(gamma)
                    svm.setC(C)
                    svm.learnModel(trainX, trainY)
                    predY = svm.predict(self.X)
                    predTrainY = svm.predict(trainX)
                    penalty += Evaluator.binaryError(
                        predY, self.y) - Evaluator.binaryError(
                            predTrainY, trainY)

                penalty = penalty * Cv[0] / len(idx)
                svm.learnModel(self.X, self.y)
                predY = svm.predict(self.X)
                trainErrors2[i, j] = Evaluator.binaryError(predY, self.y)
                penalties2[i, j] = penalty
                meanErrors2[i,
                            j] = Evaluator.binaryError(predY, self.y) + penalty

                if meanErrors2[i, j] < bestError:
                    bestC = C
                    bestGamma = gamma
                    bestError = meanErrors2[i, j]

        bestSVM, trainErrors, currentPenalties = resultsList[0]
        meanErrors = trainErrors + currentPenalties

        self.assertEquals(bestC, bestSVM.getC())
        self.assertEquals(bestGamma, bestSVM.getGamma())
        self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
        self.assertTrue(numpy.linalg.norm(trainErrors2.T - trainErrors) < tol)
        self.assertTrue(
            numpy.linalg.norm(penalties2.T - currentPenalties) < tol)
Example #7
    def testPredict(self):
        rankBoost = RankBoost()
        rankBoost.learnModel(self.X, self.y)
        predY = rankBoost.predict(self.X)

        self.assertTrue(
            Evaluator.auc(predY, self.y) <= 1.0
            and Evaluator.auc(predY, self.y) >= 0.0)
Example #8
    def evaluateCvOuter(self, X, Y, folds):
        """
        Run cross validation and output some ROC curves. In this case Y is a 1D array.

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param Y: A vector of labels
        :type Y: :class:`ndarray`

        :param folds: The number of cross validation folds
        :type folds: :class:`int`
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        indexList = cross_val.StratifiedKFold(Y, folds)

        bestParams = []
        bestTrainAUCs = numpy.zeros(folds)
        bestTrainROCs = []
        bestTestAUCs = numpy.zeros(folds)
        bestTestROCs = []
        bestMetaDicts = []
        i = 0

        for trainInds, testInds in indexList:
            Util.printIteration(i, 1, folds, "Outer CV: ")
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            self.learnModel(trainX, trainY)
            #self.learnModelCut(trainX, trainY)

            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
            bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

            #Store the parameters and ROC curves
            bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
            bestTestROCs.append(Evaluator.roc(testY, predTestY))

            metaDict = {}
            bestMetaDicts.append(metaDict)

            i += 1

        logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))
        allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

        return (bestParams, allMetrics, bestMetaDicts)
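A hedged usage sketch for this routine; learner stands for any object exposing evaluateCvOuter as above, and the data here is randomly generated for illustration:

import numpy

#Hypothetical data; learner is assumed to be an instance of the class above
X = numpy.random.randn(200, 10)
Y = numpy.array(numpy.random.rand(200) > 0.5, numpy.int32)

bestParams, allMetrics, bestMetaDicts = learner.evaluateCvOuter(X, Y, 5)
bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs = allMetrics
print("Mean test AUC: " + str(numpy.mean(bestTestAUCs)))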
Example #10
    def testParallelPen(self): 
        folds = 3
        Cv = numpy.array([4.0])
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")

        paramDict = {} 
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()            
        
        resultsList = svm.parallelPen(self.X, self.y, idx, paramDict, Cv)
        
        tol = 10**-6
        bestError = 1
        trainErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
        penalties2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
        meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))

        for i in range(svm.Cs.shape[0]):
            C = svm.Cs[i]
            for j in range(svm.gammas.shape[0]):
                gamma = svm.gammas[j]
                penalty = 0
                for trainInds, testInds in idx:
                    trainX = self.X[trainInds, :]
                    trainY = self.y[trainInds]

                    svm.setGamma(gamma)
                    svm.setC(C)
                    svm.learnModel(trainX, trainY)
                    predY = svm.predict(self.X)
                    predTrainY = svm.predict(trainX)
                    penalty += Evaluator.binaryError(predY, self.y) - Evaluator.binaryError(predTrainY, trainY)

                penalty = penalty*Cv[0]/len(idx)
                svm.learnModel(self.X, self.y)
                predY = svm.predict(self.X)
                trainErrors2[i, j] = Evaluator.binaryError(predY, self.y)
                penalties2[i, j] = penalty
                meanErrors2[i, j] = Evaluator.binaryError(predY, self.y) + penalty

                if meanErrors2[i, j] < bestError:
                    bestC = C
                    bestGamma = gamma
                    bestError = meanErrors2[i, j]

        bestSVM, trainErrors, currentPenalties = resultsList[0]
        meanErrors = trainErrors + currentPenalties

        self.assertEquals(bestC, bestSVM.getC())
        self.assertEquals(bestGamma, bestSVM.getGamma())
        self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
        self.assertTrue(numpy.linalg.norm(trainErrors2.T - trainErrors) < tol)
        self.assertTrue(numpy.linalg.norm(penalties2.T - currentPenalties) < tol)
Example #11
    def testLearnModel2(self):
        #We want to make sure the learnt tree with gamma = 0 minimises the
        #empirical risk
        minSplit = 20
        maxDepth = 3
        gamma = 0.01
        learner = PenaltyDecisionTree(minSplit=minSplit,
                                      maxDepth=maxDepth,
                                      gamma=gamma,
                                      pruning=False)

        #Vary sampleSize
        numpy.random.seed(21)
        learner.setSampleSize(1)
        learner.learnModel(self.X, self.y)
        error1 = learner.treeObjective(self.X, self.y)

        numpy.random.seed(21)
        learner.setSampleSize(5)
        learner.learnModel(self.X, self.y)
        error2 = learner.treeObjective(self.X, self.y)

        numpy.random.seed(21)
        learner.setSampleSize(10)
        learner.learnModel(self.X, self.y)
        error3 = learner.treeObjective(self.X, self.y)

        self.assertTrue(error1 >= error2)
        self.assertTrue(error2 >= error3)

        #Now vary max depth
        learner.gamma = 0

        numpy.random.seed(21)
        learner.setSampleSize(1)
        learner.minSplit = 1
        learner.maxDepth = 3
        learner.learnModel(self.X, self.y)
        predY = learner.predict(self.X)
        error1 = Evaluator.binaryError(self.y, predY)

        numpy.random.seed(21)
        learner.maxDepth = 5
        learner.learnModel(self.X, self.y)
        predY = learner.predict(self.X)
        error2 = Evaluator.binaryError(self.y, predY)

        numpy.random.seed(21)
        learner.maxDepth = 10
        learner.learnModel(self.X, self.y)
        predY = learner.predict(self.X)
        error3 = Evaluator.binaryError(self.y, predY)

        self.assertTrue(error1 >= error2)
        self.assertTrue(error2 >= error3)
Example #12
    def testLearnModel2(self):
        #We want to make sure the learnt tree with gamma = 0 minimises the
        #empirical risk
        minSplit = 20
        maxDepth = 3
        gamma = 0.01
        learner = PenaltyDecisionTree(minSplit=minSplit, maxDepth=maxDepth, gamma=gamma, pruning=False) 
        
        #Vary sampleSize
        numpy.random.seed(21)
        learner.setSampleSize(1)           
        learner.learnModel(self.X, self.y)        
        error1 = learner.treeObjective(self.X, self.y)

        numpy.random.seed(21)
        learner.setSampleSize(5)        
        learner.learnModel(self.X, self.y)
        error2 = learner.treeObjective(self.X, self.y)

        numpy.random.seed(21)                
        learner.setSampleSize(10)       
        learner.learnModel(self.X, self.y)
        error3 = learner.treeObjective(self.X, self.y)
        
        self.assertTrue(error1 >= error2)
        self.assertTrue(error2 >= error3)
        
        #Now vary max depth 
        learner.gamma = 0         
        
        numpy.random.seed(21)
        learner.setSampleSize(1) 
        learner.minSplit = 1
        learner.maxDepth = 3 
        learner.learnModel(self.X, self.y)
        predY = learner.predict(self.X)
        error1 = Evaluator.binaryError(self.y, predY)
        
        numpy.random.seed(21)
        learner.maxDepth = 5 
        learner.learnModel(self.X, self.y)
        predY = learner.predict(self.X)
        error2 = Evaluator.binaryError(self.y, predY)
        
        numpy.random.seed(21)
        learner.maxDepth = 10 
        learner.learnModel(self.X, self.y)
        predY = learner.predict(self.X)
        error3 = Evaluator.binaryError(self.y, predY)        
        
        self.assertTrue(error1 >= error2)
        self.assertTrue(error2 >= error3)
Example #13
    def testSetC(self):
        rankSVM = RankSVM()
        rankSVM.setC(100.0)
        rankSVM.learnModel(self.X, self.y)
        predY = rankSVM.predict(self.X)
        auc1 = Evaluator.auc(predY, self.y)

        rankSVM.setC(0.1)
        rankSVM.learnModel(self.X, self.y)
        predY = rankSVM.predict(self.X)
        auc2 = Evaluator.auc(predY, self.y)

        self.assertTrue(auc1 != auc2)
Example #14
    def evaluateCvOuter(self, X, Y, folds, leafRank):
        """
        Run cross validation and output some ROC curves. In this case Y is a 1D array.
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        indexList = cross_val.StratifiedKFold(Y, folds)
        self.setLeafRank(leafRank)

        bestParams = []
        bestTrainAUCs = numpy.zeros(folds)
        bestTrainROCs = []
        bestTestAUCs = numpy.zeros(folds)
        bestTestROCs = []
        bestMetaDicts = []
        i = 0

        for trainInds, testInds in indexList:
            Util.printIteration(i, 1, folds)
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            logging.debug("Distribution of labels in train: " +
                          str(numpy.bincount(trainY)))
            logging.debug("Distribution of labels in test: " +
                          str(numpy.bincount(testY)))

            self.learnModel(trainX, trainY)
            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
            bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

            #Store the parameters and ROC curves
            bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
            bestTestROCs.append(Evaluator.roc(testY, predTestY))

            metaDict = {}
            bestMetaDicts.append(metaDict)

            i += 1

        logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))
        allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

        return (bestParams, allMetrics, bestMetaDicts)
Example #15
    def evaluateCvOuter(self, X, Y, folds, leafRank):
        """
        Run cross validation and output some ROC curves. In this case Y is a 1D array.
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        indexList = cross_val.StratifiedKFold(Y, folds)
        self.setLeafRank(leafRank)

        bestParams = []
        bestTrainAUCs = numpy.zeros(folds)
        bestTrainROCs = []
        bestTestAUCs = numpy.zeros(folds)
        bestTestROCs = []
        bestMetaDicts = []
        i = 0

        for trainInds, testInds in indexList:
            Util.printIteration(i, 1, folds)
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            logging.debug("Distribution of labels in train: " + str(numpy.bincount(trainY)))
            logging.debug("Distribution of labels in test: " + str(numpy.bincount(testY)))

            self.learnModel(trainX, trainY)
            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
            bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

            #Store the parameters and ROC curves
            bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
            bestTestROCs.append(Evaluator.roc(testY, predTestY))

            metaDict = {}
            bestMetaDicts.append(metaDict)

            i += 1

        logging.debug("Mean test AUC = " + str(numpy.mean(bestTestAUCs)))
        logging.debug("Std test AUC = " + str(numpy.std(bestTestAUCs)))
        allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

        return (bestParams, allMetrics, bestMetaDicts)
Example #16
    def testPredict2(self):
        #Test on Gauss2D dataset
        dataDir = PathDefaults.getDataDir()

        fileName = dataDir + "Gauss2D_learn.csv"
        XY = numpy.loadtxt(fileName,
                           skiprows=1,
                           usecols=(1, 2, 3),
                           delimiter=",")
        X = XY[:, 0:2]
        y = XY[:, 2]

        fileName = dataDir + "Gauss2D_test.csv"
        testXY = numpy.loadtxt(fileName,
                               skiprows=1,
                               usecols=(1, 2, 3),
                               delimiter=",")
        testX = testXY[:, 0:2]
        testY = testXY[:, 2]

        X = Standardiser().standardiseArray(X)
        testX = Standardiser().standardiseArray(testX)

        maxDepths = range(3, 10)
        trainAucs = numpy.array([
            0.7194734, 0.7284824, 0.7332185, 0.7348198, 0.7366152, 0.7367508,
            0.7367508, 0.7367508
        ])
        testAucs = numpy.array([
            0.6789078, 0.6844632, 0.6867918, 0.6873420, 0.6874820, 0.6874400,
            0.6874400, 0.6874400
        ])
        i = 0

        #The results are approximately the same, but not exactly
        for maxDepth in maxDepths:
            treeRank = TreeRank(self.leafRanklearner)
            treeRank.setMaxDepth(maxDepth)
            treeRank.learnModel(X, y)
            trainScores = treeRank.predict(X)
            testScores = treeRank.predict(testX)

            self.assertAlmostEquals(Evaluator.auc(trainScores, y),
                                    trainAucs[i], 2)
            self.assertAlmostEquals(Evaluator.auc(testScores, testY),
                                    testAucs[i], 1)
            i += 1
Example #17
    def testAuc(self):
        self.treeRank.learnModel(self.X, self.Y)
        scores = self.treeRank.predictScores(self.X)

        auc1 = Evaluator.auc(scores, self.Y.ravel())
        auc2 = self.treeRank.aucFromROC(self.treeRank.predictROC(self.X, self.Y))

        self.assertAlmostEquals(auc1, auc2, places=4)
Example #18
    def testPredict2(self):
        #Test on Gauss2D dataset
        dataDir = PathDefaults.getDataDir()

        fileName = dataDir + "Gauss2D_learn.csv"
        XY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        X = XY[:, 0:2]
        y = XY[:, 2]
        
        y = y*2 - 1 

        fileName = dataDir + "Gauss2D_test.csv"
        testXY = numpy.loadtxt(fileName, skiprows=1, usecols=(1,2,3), delimiter=",")
        testX = testXY[:, 0:2]
        testY = testXY[:, 2]
        
        testY = testY*2-1

        X = Standardiser().standardiseArray(X)
        testX = Standardiser().standardiseArray(testX)

        numTrees = 5
        minSplit = 50 
        maxDepths = range(3, 10)
        trainAucs = numpy.array([0.7252582, 0.7323278, 0.7350289, 0.7372529, 0.7399985, 0.7382176, 0.7395104, 0.7386347])
        testAucs = numpy.array([0.6806122, 0.6851614, 0.6886183, 0.6904147, 0.6897266, 0.6874600, 0.6875980, 0.6878801])

        i = 0
        
        #The results are approximately the same, but not exactly 
        for maxDepth in maxDepths:
            treeRankForest = TreeRankForest(self.leafRanklearner)
            treeRankForest.setMaxDepth(maxDepth)
            treeRankForest.setMinSplit(minSplit)
            treeRankForest.setNumTrees(numTrees)
            treeRankForest.learnModel(X, y)
            trainScores = treeRankForest.predict(X)
            testScores = treeRankForest.predict(testX)

            print(Evaluator.auc(trainScores, y), Evaluator.auc(testScores, testY))

            self.assertAlmostEquals(Evaluator.auc(trainScores, y), trainAucs[i], 1)
            self.assertAlmostEquals(Evaluator.auc(testScores, testY), testAucs[i], 1)
            i+=1
Example #19
    def testSetSvmType(self):
        try:
            import sklearn
        except ImportError as error:
            return

        numExamples = 100
        numFeatures = 10
        X = numpy.random.randn(numExamples, numFeatures)
        X = Standardiser().standardiseArray(X)
        c = numpy.random.randn(numFeatures)

        y = numpy.dot(X, numpy.array([c]).T).ravel() + 1
        y2 = numpy.array(y > 0, numpy.int32) * 2 - 1

        svm = LibSVM()

        svm.setSvmType("Epsilon_SVR")

        self.assertEquals(svm.getType(), "Epsilon_SVR")

        #Try to get a good error
        Cs = 2**numpy.arange(-6, 4, dtype=numpy.float)
        epsilons = 2**numpy.arange(-6, 4, dtype=numpy.float)

        bestError = 10
        for C in Cs:
            for epsilon in epsilons:
                svm.setEpsilon(epsilon)
                svm.setC(C)
                svm.learnModel(X, y)
                yp = svm.predict(X)

                if Evaluator.rootMeanSqError(y, yp) < bestError:
                    bestError = Evaluator.rootMeanSqError(y, yp)

        self.assertTrue(
            bestError < Evaluator.rootMeanSqError(y, numpy.zeros(y.shape[0])))

        svm.setSvmType("C_SVC")
        svm.learnModel(X, y2)
        yp2 = svm.predict(X)

        self.assertTrue(0 <= Evaluator.binaryError(y2, yp2) <= 1)
Example #20
    def testSetSvmType(self):
        try:
            import sklearn
        except ImportError as error:
            return

        numExamples = 100
        numFeatures = 10
        X = numpy.random.randn(numExamples, numFeatures)
        X = Standardiser().standardiseArray(X)
        c = numpy.random.randn(numFeatures)

        y = numpy.dot(X, numpy.array([c]).T).ravel() + 1
        y2 = numpy.array(y > 0, numpy.int32)*2 -1 
        
        svm = LibSVM()

        svm.setSvmType("Epsilon_SVR")

        self.assertEquals(svm.getType(), "Epsilon_SVR")

        #Try to get a good error
        Cs = 2**numpy.arange(-6, 4, dtype=numpy.float)
        epsilons = 2**numpy.arange(-6, 4, dtype=numpy.float)

        bestError = 10 
        for C in Cs:
            for epsilon in epsilons:
                svm.setEpsilon(epsilon)
                svm.setC(C)
                svm.learnModel(X, y)
                yp = svm.predict(X)

                if Evaluator.rootMeanSqError(y, yp) < bestError:
                    bestError = Evaluator.rootMeanSqError(y, yp) 

        self.assertTrue(bestError < Evaluator.rootMeanSqError(y, numpy.zeros(y.shape[0])))
        
        svm.setSvmType("C_SVC")
        svm.learnModel(X, y2)
        yp2 = svm.predict(X)

        self.assertTrue(0 <= Evaluator.binaryError(y2, yp2)  <= 1)
Example #21
    def testPredict(self):
        generator = SVMLeafRank(self.paramDict, self.folds)
        learner = generator.generateLearner(self.X, self.y)

        predY = learner.predict(self.X)
        #Seems to work
        auc = learner.getMetricMethod()(predY, self.y)
        auc2 = Evaluator.auc(predY, self.y)

        self.assertEquals(auc, auc2)
Example #22
    def testClassify(self):
        try:
            import sklearn
        except ImportError as error:
            return

        self.svm.learnModel(self.X, self.y)
        predY = self.svm.classify(self.X)
        y = self.y

        e = Evaluator.binaryError(y, predY)

        #Now, permute examples
        perm = numpy.random.permutation(self.X.shape[0])
        predY = self.svm.classify(self.X[perm, :])
        y = y[perm]

        e2 = Evaluator.binaryError(y, predY)

        self.assertEquals(e, e2)
Example #23
def computeBootstrapError(args):
    """
    Used in conjunction with the parallel model selection. Trains and then tests
    on a separate test set and evaluates the bootstrap error.
    """
    (trainX, trainY, testX, testY, learner) = args
    learner.learnModel(trainX, trainY)
    predTestY = learner.predict(testX)
    predTrainY = learner.predict(trainX)
    weight = 0.632
    return Evaluator.binaryBootstrapError(predTestY, testY, predTrainY, trainY, weight)
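The fixed weight 0.632 suggests the classical .632 bootstrap, which presumably mixes the out-of-bag test error with the resubstitution error as 0.632·test + 0.368·train. A minimal sketch of calling it, assuming LibSVM and Evaluator are imported as in the surrounding examples:

import numpy

numpy.random.seed(21)
X = numpy.random.randn(100, 5)
y = numpy.sign(numpy.random.randn(100))

#Bootstrap sample for training, out-of-bag rows for testing
sampleInds = numpy.random.randint(0, 100, 100)
oobInds = numpy.setdiff1d(numpy.arange(100), sampleInds)

svm = LibSVM()
args = (X[sampleInds, :], y[sampleInds], X[oobInds, :], y[oobInds], svm)
error = computeBootstrapError(args)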
Example #25
    def testCvPrune(self):
        numExamples = 500
        X, y = data.make_regression(numExamples)

        y = Standardiser().standardiseArray(y)

        numTrain = int(numpy.round(numExamples * 0.33))
        numValid = int(numpy.round(numExamples * 0.33))

        trainX = X[0:numTrain, :]
        trainY = y[0:numTrain]
        validX = X[numTrain:numTrain+numValid, :]
        validY = y[numTrain:numTrain+numValid]
        testX = X[numTrain+numValid:, :]
        testY = y[numTrain+numValid:]

        learner = DecisionTreeLearner()
        learner.learnModel(trainX, trainY)
        error1 = Evaluator.rootMeanSqError(learner.predict(testX), testY)

        #print(learner.getTree())
        unprunedTree = learner.tree.copy()
        learner.setGamma(1000)
        learner.cvPrune(trainX, trainY)

        self.assertEquals(unprunedTree.getNumVertices(), learner.tree.getNumVertices())
        learner.setGamma(100)
        learner.cvPrune(trainX, trainY)

        #Test if the pruned tree is a subtree of the current one:
        for vertexId in learner.tree.getAllVertexIds():
            self.assertTrue(vertexId in unprunedTree.getAllVertexIds())

        #The error should be better after pruning
        learner.learnModel(trainX, trainY)
        #learner.cvPrune(validX, validY, 0.0, 5)
        learner.repPrune(validX, validY)

        error2 = Evaluator.rootMeanSqError(learner.predict(testX), testY)

        self.assertTrue(error1 >= error2)
Example #26
def computeBootstrapError(args):
    """
    Used in conjunction with the parallel model selection. Trains and then tests
    on a separate test set and evaluates the bootstrap error.
    """
    (trainX, trainY, testX, testY, learner) = args
    learner.learnModel(trainX, trainY)
    predTestY = learner.predict(testX)
    predTrainY = learner.predict(trainX)
    weight = 0.632
    return Evaluator.binaryBootstrapError(predTestY, testY, predTrainY, trainY,
                                          weight)
Example #27
    def testBayesError(self):
        dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
        data = numpy.load(dataDir + "toyData.npz")
        gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

        sampleSize = 100
        trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
        testX, testY = X[sampleSize:, :], y[sampleSize:]

        #We form a test set from the grid points
        gridX = numpy.zeros((gridPoints.shape[0]**2, 2))
        for m in range(gridPoints.shape[0]):
            gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
            gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

        Cs = 2**numpy.arange(-5, 5, dtype=numpy.float)
        gammas = 2**numpy.arange(-5, 5, dtype=numpy.float)

        bestError = 1 

        for C in Cs:
            for gamma in gammas:
                svm = LibSVM(kernel="gaussian", C=C, kernelParam=gamma)
                svm.learnModel(trainX, trainY)
                predY, decisionsY = svm.predict(gridX, True)
                decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
                error = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)

                predY, decisionsY = svm.predict(testX, True)
                error2 = Evaluator.binaryError(testY, predY)
                print(error, error2)

                if error < bestError:
                    bestError = error
                    bestC = C
                    bestGamma = gamma

        svm = LibSVM(kernel="gaussian", C=bestC, kernelParam=bestGamma)
        svm.learnModel(trainX, trainY)
        predY, decisionsY = svm.predict(gridX, True)

        plt.figure(0)
        plt.contourf(gridPoints, gridPoints, decisionGrid, 100)
        plt.colorbar()

        plt.figure(1)
        plt.scatter(X[y==1, 0], X[y==1, 1], c='r', label="+1")
        plt.scatter(X[y==-1, 0], X[y==-1, 1], c='b', label="-1")
        plt.legend()
        plt.show()
Example #28
def computeIdealPenalty(args):
    """
    Find the complete penalty.
    """
    (X, y, fullX, C, gamma, gridPoints, pdfX, pdfY1X, pdfYminus1X) = args

    svm = LibSVM('gaussian', gamma, C)
    svm.learnModel(X, y)
    predY = svm.predict(X)
    predFullY, decisionsY = svm.predict(fullX, True)
    decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
    trueError = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)
    idealPenalty = trueError - Evaluator.binaryError(predY, y)

    return idealPenalty
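A hedged driver for this variant, reusing the toy-data layout from Example #27's testBayesError (the file name, array ordering, and grid construction are taken from that test; the C and gamma values are illustrative):

import numpy

dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
data = numpy.load(dataDir + "toyData.npz")
gridPoints, X, y = data["arr_0"], data["arr_1"], data["arr_2"]
pdfX, pdfY1X, pdfYminus1X = data["arr_3"], data["arr_4"], data["arr_5"]

#Arrange the grid points exactly as testBayesError does
gridX = numpy.zeros((gridPoints.shape[0]**2, 2))
for m in range(gridPoints.shape[0]):
    gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
    gridX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

args = (X[0:100, :], y[0:100], gridX, 1.0, 0.5, gridPoints, pdfX, pdfY1X, pdfYminus1X)
idealPenalty = computeIdealPenalty(args)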
Example #29
    def testComputeTestError(self):
        C = 10.0
        gamma = 0.5

        numTrainExamples = int(self.X.shape[0]*0.5)

        trainX, trainY = self.X[0:numTrainExamples, :], self.y[0:numTrainExamples]
        testX, testY = self.X[numTrainExamples:, :], self.y[numTrainExamples:]

        svm = LibSVM('gaussian', gamma, C)
        args = (trainX, trainY, testX, testY, svm)
        error = computeTestError(args)

        svm = LibSVM('gaussian', gamma, C)
        svm.learnModel(trainX, trainY)
        predY = svm.predict(testX)
        self.assertEquals(Evaluator.binaryError(predY, testY), error)
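computeTestError is likewise not shown; the assertion above pins its behaviour down, so a reasonable sketch (an inference, not the library's code) is:

def computeTestError(args):
    #Inferred sketch: fit on the training set, return the binary test error
    (trainX, trainY, testX, testY, learner) = args
    learner.learnModel(trainX, trainY)
    predY = learner.predict(testX)
    return Evaluator.binaryError(predY, testY)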
Example #30
    def testParallelModelSelect(self):
        folds = 3
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")

        paramDict = {}
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()

        bestSVM, meanErrors = svm.parallelModelSelect(self.X, self.y, idx,
                                                      paramDict)

        tol = 10**-6
        bestError = 1
        meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
        print("Computing real grid")

        for i in range(svm.Cs.shape[0]):
            C = svm.Cs[i]
            for j in range(svm.gammas.shape[0]):
                gamma = svm.gammas[j]
                error = 0
                for trainInds, testInds in idx:
                    trainX = self.X[trainInds, :]
                    trainY = self.y[trainInds]
                    testX = self.X[testInds, :]
                    testY = self.y[testInds]

                    svm.setGamma(gamma)
                    svm.setC(C)
                    svm.learnModel(trainX, trainY)
                    predY = svm.predict(testX)
                    error += Evaluator.binaryError(predY, testY)

                meanErrors2[i, j] = error / len(idx)

                if error < bestError:
                    bestC = C
                    bestGamma = gamma
                    bestError = error

        self.assertEquals(bestC, bestSVM.getC())
        self.assertEquals(bestGamma, bestSVM.getGamma())
        self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
Example #31
    def testComputeTestError(self):
        C = 10.0
        gamma = 0.5

        numTrainExamples = int(self.X.shape[0] * 0.5)

        trainX, trainY = self.X[0:numTrainExamples, :], self.y[0:numTrainExamples]
        testX, testY = self.X[numTrainExamples:, :], self.y[numTrainExamples:]

        svm = LibSVM('gaussian', gamma, C)
        args = (trainX, trainY, testX, testY, svm)
        error = computeTestError(args)

        svm = LibSVM('gaussian', gamma, C)
        svm.learnModel(trainX, trainY)
        predY = svm.predict(testX)
        self.assertEquals(Evaluator.binaryError(predY, testY), error)
Example #32
    def testParallelModelSelect(self):
        folds = 3
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")

        paramDict = {} 
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()    
        
        bestSVM, meanErrors = svm.parallelModelSelect(self.X, self.y, idx, paramDict)
        
        tol = 10**-6 
        bestError = 1
        meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0])) 
        print("Computing real grid")

        for i in range(svm.Cs.shape[0]):
            C = svm.Cs[i]
            for j in range(svm.gammas.shape[0]):
                gamma = svm.gammas[j]
                error = 0
                for trainInds, testInds in idx:
                    trainX = self.X[trainInds, :]
                    trainY = self.y[trainInds]
                    testX = self.X[testInds, :]
                    testY = self.y[testInds]

                    svm.setGamma(gamma)
                    svm.setC(C)
                    svm.learnModel(trainX, trainY)
                    predY = svm.predict(testX)
                    error += Evaluator.binaryError(predY, testY)

                meanErrors2[i, j] = error/len(idx)

                if error < bestError:
                    bestC = C
                    bestGamma = gamma
                    bestError = error
            
        self.assertEquals(bestC, bestSVM.getC())
        self.assertEquals(bestGamma, bestSVM.getGamma())
        self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
Example #33
    def learnModelCut(self, X, Y, folds=4):
        """
        Perform model learning with tree cutting in order to choose a maximal
        depth. The best tree is chosen using cross validation and depths are
        selected from 0 to maxDepth. The best depth corresponds to the maximal
        AUC obtained using cross validation. 

        :param X: A matrix with examples as rows
        :type X: :class:`ndarray`

        :param Y: A vector of binary labels as a 1D array
        :type Y: :class:`ndarray`

        :param folds: The number of cross validation folds.
        :type folds: :class:`int`
        """

        indexList = cross_val.StratifiedKFold(Y, folds)
        depths = numpy.arange(1, self.maxDepth)
        meanAUCs = numpy.zeros(depths.shape[0])

        for trainInds, testInds in indexList:
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            self.learnModel(trainX, trainY)
            fullTree = self.tree

            for i in range(fullTree.depth()):
                d = depths[i]
                self.tree = TreeRank.cut(fullTree, d)
                predTestY = self.predict(testX)

                meanAUCs[i] += Evaluator.auc(predTestY, testY)/float(folds)

        bestDepth = depths[numpy.argmax(meanAUCs)]
        self.learnModel(X, Y)
        self.tree = TreeRank.cut(self.tree, bestDepth)
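A hedged usage sketch; TreeRank and the leaf-rank learner are assumed from the other examples, and X, Y stand for a real dataset:

treeRank = TreeRank(leafRanklearner)
treeRank.setMaxDepth(10)
#learnModelCut relearns on all data after choosing the CV-best depth
treeRank.learnModelCut(X, Y, folds=4)
scores = treeRank.predict(X)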
Example #34
    def testLearnModel(self):
        try:
            import sklearn
        except ImportError as error:
            return

        self.svm.learnModel(self.X, self.y)
        predY = self.svm.classify(self.X)
        y = self.y

        e = Evaluator.binaryError(y, predY)

        #Test for wrong labels
        numExamples = 6
        X = numpy.array([[-3], [-2], [-1], [1], [2], [3]], numpy.float)
        y = numpy.array([[-1], [-1], [-1], [1], [1], [5]])

        self.assertRaises(ValueError, self.svm.learnModel, X, y)

        #Try the regression SVM
        svm = LibSVM(type="Epsilon_SVR")
        y = numpy.random.rand(self.X.shape[0])
        svm.learnModel(self.X, y)
Example #35
    def testLearnModel(self):
        try:
            import sklearn
        except ImportError as error:
            return

        self.svm.learnModel(self.X, self.y)
        predY = self.svm.classify(self.X)
        y = self.y

        e = Evaluator.binaryError(y, predY)

        #Test for wrong labels
        numExamples = 6
        X = numpy.array([[-3], [-2], [-1], [1], [2], [3]], numpy.float)
        y = numpy.array([[-1], [-1], [-1], [1], [1], [5]])

        self.assertRaises(ValueError, self.svm.learnModel, X, y)

        #Try the regression SVM
        svm = LibSVM(type="Epsilon_SVR")
        y = numpy.random.rand(self.X.shape[0])
        svm.learnModel(self.X, y)
Example #36
    def greedyMC2(lists, itemList, trainList, n):
        """
        A method to greedily select a subset of the input lists such that
        the average precision is maximised
        """
        currentListsInds = list(range(len(lists)))
        newListsInds = []
        currentAvPrecision = 0
        lastAvPrecision = -0.1
        
        while currentAvPrecision - lastAvPrecision > 0: 
            lastAvPrecision = currentAvPrecision 
            averagePrecisions = numpy.zeros(len(currentListsInds))      
            
            for i, j in enumerate(currentListsInds):
                newListsInds.append(j)

                newLists = []                
                for k in newListsInds: 
                    newLists.append(lists[k])
                
                rankAggregate, scores = RankAggregator.MC2(newLists, itemList)
                averagePrecisions[i] = Evaluator.averagePrecisionFromLists(trainList, rankAggregate[0:n], n)
                newListsInds.remove(j)

            j = numpy.argmax(averagePrecisions)
            currentAvPrecision = averagePrecisions[j]
            
            if currentAvPrecision > lastAvPrecision: 
                newListsInds.append(currentListsInds.pop(j))
            
        return newListsInds 
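A toy invocation, assuming greedyMC2 is exposed as a static method of RankAggregator, as the call in Example #40 suggests:

#Three ranked lists over ten items; trainList holds the known-relevant items
lists = [[0, 1, 2, 3], [3, 2, 1, 0], [0, 2, 4, 6]]
itemList = list(range(10))
trainList = [0, 1]

listInds = RankAggregator.greedyMC2(lists, itemList, trainList, 4)
print(listInds)   #indices of the selected lists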
            
                
        
        
Example #37
    def testGrowTree(self):
        startId = (0, )
        minSplit = 20
        maxDepth = 3
        gamma = 0.01
        learner = PenaltyDecisionTree(minSplit=minSplit, maxDepth=maxDepth, gamma=gamma, pruning=False)

        trainX = self.X[100:, :]
        trainY = self.y[100:]
        testX = self.X[0:100, :]
        testY = self.y[0:100]

        argsortX = numpy.zeros(trainX.shape, numpy.int)
        for i in range(trainX.shape[1]):
            argsortX[:, i] = numpy.argsort(trainX[:, i])
            argsortX[:, i] = numpy.argsort(argsortX[:, i])

        learner.tree = DictTree()
        rootNode = DecisionNode(numpy.arange(trainX.shape[0]), Util.mode(trainY))
        learner.tree.setVertex(startId, rootNode)

        #Note that this matches with the case where we create a new tree each time
        numpy.random.seed(21)
        bestError = float("inf")

        for i in range(20):
            learner.tree.pruneVertex(startId)
            learner.growTree(trainX, trainY, argsortX, startId)

            predTestY = learner.predict(testX)
            error = Evaluator.binaryError(predTestY, testY)
            #print(Evaluator.binaryError(predTestY, testY), learner.tree.getNumVertices())

            if error < bestError:
                bestError = error
                bestTree = learner.tree.copy()

            self.assertTrue(learner.tree.depth() <= maxDepth)

            for vertexId in learner.tree.nonLeaves():
                self.assertTrue(learner.tree.getVertex(vertexId).getTrainInds().shape[0] >= minSplit)

        bestError1 = bestError
        learner.tree = bestTree

        #Now we test growing a tree from a non-root vertex
        numpy.random.seed(21)
        for i in range(20):
            learner.tree.pruneVertex((0, 1))
            learner.growTree(trainX, trainY, argsortX, (0, 1))

            self.assertTrue(learner.tree.getVertex((0,)) == bestTree.getVertex((0,)))
            self.assertTrue(learner.tree.getVertex((0, 0)) == bestTree.getVertex((0, 0)))

            predTestY = learner.predict(testX)
            error = Evaluator.binaryError(predTestY, testY)

            if error < bestError:
                bestError = error
                bestTree = learner.tree.copy()
            #print(Evaluator.binaryError(predTestY, testY), learner.tree.getNumVertices())
        self.assertTrue(bestError1 >= bestError)
Example #38
        minAlpha = alpha 
    if alpha > maxAlpha: 
        maxAlpha = alpha 
        
numAlphas = 100
alphas = numpy.linspace(maxAlpha+0.1, minAlpha, numAlphas)
errors = numpy.zeros(numAlphas)

for i in range(alphas.shape[0]): 
    #learner.learnModel(trainX, trainY)
    learner.setAlphaThreshold(alphas[i])
    learner.cvPrune(trainX, trainY)
    #learner.cvPrune(validX, validY, alphas[numpy.argmin(errors)])
    #learner.prune(validX, validY, alphas[i])
    predY = learner.predict(testX)
    errors[i] = Evaluator.rootMeanSqError(predY, testY)
    
plt.figure(3)
plt.scatter(alphas, errors)

#Now plot best tree 
plt.figure(4)
learner.learnModel(trainX, trainY)
#learner.cvPrune(validX, validY, alphas[numpy.argmin(errors)])
learner.setAlphaThreshold(alphas[numpy.argmin(errors)])
learner.cvPrune(trainX, trainY)
rootId = learner.tree.getRootId()
displayTree(learner, rootId, 0, 1, 0, 1, colormap)

plt.show()
    
Example #39
print(numpy.sum(y==2), numpy.sum(y==0)) 

trainSplit = 0.3
numTrainExamples = int(numExamples*trainSplit)

trainX = X[0:numTrainExamples, :]
trainY = y[0:numTrainExamples]
testX = X[numTrainExamples:, :]
testY = y[numTrainExamples:]

learner = PenaltyDecisionTree(minSplit=1, maxDepth=50, pruning=False)
learner.learnModel(trainX, trainY)

predY = learner.predict(trainX)
print(Evaluator.binaryError(predY, trainY))
print(learner.getTree())


plt.figure(0)
plt.scatter(testX[:, 0], testX[:, 1], c=testY, s=50, vmin=0, vmax=2)
plt.title("Test set")
plt.colorbar()

plt.figure(1)
plt.scatter(trainX[:, 0], trainX[:, 1], c=trainY, s=50, vmin=0, vmax=2)
plt.title("Training set")
plt.colorbar()

colormap = matplotlib.cm.get_cmap()
Example #40
            newTrainOutputList = []
            newTestOutputList = []
            for item in outputList: 
                if item not in testExpertMatchesInds: 
                    newTrainOutputList.append(item)
                if item not in trainExpertMatchesInds: 
                    newTestOutputList.append(item)
              
            trainOutputLists.append(newTrainOutputList)
            testOutputLists.append(newTestOutputList)
        
        for i, n in enumerate(ns):     
            for j, trainOutputList in enumerate(trainOutputLists): 
                testOutputList = testOutputLists[j]                
                
                trainPrecisions[i, j] = Evaluator.precisionFromIndLists(trainExpertMatchesInds, trainOutputList[0:n]) 
                testPrecisions[i, j] = Evaluator.precisionFromIndLists(testExpertMatchesInds, testOutputList[0:n]) 
                averageTrainPrecisions[s, i, j] = Evaluator.averagePrecisionFromLists(trainExpertMatchesInds, trainOutputList[0:n], n)
                averageTestPrecisions[s, i, j] = Evaluator.averagePrecisionFromLists(testExpertMatchesInds, testOutputList[0:n], n) 

        #Now look at rank aggregations
        relevantItems = set([])
        for trainOutputList in trainOutputLists: 
            relevantItems = relevantItems.union(trainOutputList)
        relevantItems = list(relevantItems)
        
        listInds = RankAggregator.greedyMC2(trainOutputLists, relevantItems, trainExpertMatchesInds, 20) 
        
        newOutputList = []
        for listInd in listInds: 
            newOutputList.append(testOutputLists[listInd])
Example #41
    def testGrowTree(self):
        startId = (0, )
        minSplit = 20
        maxDepth = 3
        gamma = 0.01
        learner = PenaltyDecisionTree(minSplit=minSplit,
                                      maxDepth=maxDepth,
                                      gamma=gamma,
                                      pruning=False)

        trainX = self.X[100:, :]
        trainY = self.y[100:]
        testX = self.X[0:100, :]
        testY = self.y[0:100]

        argsortX = numpy.zeros(trainX.shape, numpy.int)
        for i in range(trainX.shape[1]):
            argsortX[:, i] = numpy.argsort(trainX[:, i])
            argsortX[:, i] = numpy.argsort(argsortX[:, i])

        learner.tree = DictTree()
        rootNode = DecisionNode(numpy.arange(trainX.shape[0]),
                                Util.mode(trainY))
        learner.tree.setVertex(startId, rootNode)

        #Note that this matches with the case where we create a new tree each time
        numpy.random.seed(21)
        bestError = float("inf")

        for i in range(20):
            learner.tree.pruneVertex(startId)
            learner.growTree(trainX, trainY, argsortX, startId)

            predTestY = learner.predict(testX)
            error = Evaluator.binaryError(predTestY, testY)
            #print(Evaluator.binaryError(predTestY, testY), learner.tree.getNumVertices())

            if error < bestError:
                bestError = error
                bestTree = learner.tree.copy()

            self.assertTrue(learner.tree.depth() <= maxDepth)

            for vertexId in learner.tree.nonLeaves():
                self.assertTrue(
                    learner.tree.getVertex(vertexId).getTrainInds().shape[0] >=
                    minSplit)

        bestError1 = bestError
        learner.tree = bestTree

        #Now we test growing a tree from a non-root vertex
        numpy.random.seed(21)
        for i in range(20):
            learner.tree.pruneVertex((0, 1))
            learner.growTree(trainX, trainY, argsortX, (0, 1))

            self.assertTrue(
                learner.tree.getVertex((0, )) == bestTree.getVertex((0, )))
            self.assertTrue(
                learner.tree.getVertex((0, 0)) == bestTree.getVertex((0, 0)))

            predTestY = learner.predict(testX)
            error = Evaluator.binaryError(predTestY, testY)

            if error < bestError:
                bestError = error
                bestTree = learner.tree.copy()
            #print(Evaluator.binaryError(predTestY, testY), learner.tree.getNumVertices())
        self.assertTrue(bestError1 >= bestError)
Example #42
    def evaluateCvOuter(self, X, Y, folds, leafRank, innerFolds=3):
        """
        Run model selection and output some ROC curves. In this case Y is a 1D array. 
        """
        Parameter.checkClass(X, numpy.ndarray)
        Parameter.checkClass(Y, numpy.ndarray)
        Parameter.checkInt(folds, 2, float('inf'))
        if Y.ndim != 1:
            raise ValueError("Expecting Y to be 1D")

        indexList = cross_val.StratifiedKFold(Y, folds)

        maxDepths = numpy.flipud(numpy.arange(1, 12, 1))
        if leafRank == self.getTreeRankLib().LRforest:
            varSplits = numpy.arange(0.6, 1.01, 0.2)
        else:
            varSplits = numpy.array([1])
        #According to Nicolas nfcv>1 doesn't help
        nfcvs = [1]
        #This is tied in with depth 
        mincrit = 0.00
        #If minsplit is too low sometimes get a node with no positive labels
        minSplits = numpy.array([50])

        self.setLeafRank(leafRank)

        bestParams = []
        bestTrainAUCs = numpy.zeros(folds)
        bestTrainROCs = []
        bestTestAUCs = numpy.zeros(folds)
        bestTestROCs = []
        bestMetaDicts = []
        i = 0 

        for trainInds, testInds in indexList:
            trainX, trainY = X[trainInds, :], Y[trainInds]
            testX, testY = X[testInds, :], Y[testInds]

            meanParamAUCs = []
            paramList = [] 

            logging.debug("Distribution of labels in train: " + str(numpy.bincount(trainY)))
            logging.debug("Distribution of labels in test: " + str(numpy.bincount(testY)))

            for varSplit in varSplits:
                for nfcv in nfcvs:
                    for minSplit in minSplits:

                        self.setMaxDepth(maxDepths[0])
                        self.setVarSplit(varSplit)
                        self.setNfcv(nfcv)
                        self.setMinSplit(minSplit)
                        logging.debug(self)
                        idx = cross_val.StratifiedKFold(trainY, innerFolds)

                        j = 0
                        metrics = numpy.zeros((len(idx), maxDepths.shape[0]))

                        for idxtr, idxts in idx:
                            Util.printIteration(j, 1, innerFolds)

                            innerTrainX, innerTestX = trainX[idxtr, :], trainX[idxts, :]
                            innerTrainY, innerTestY = trainY[idxtr], trainY[idxts]

                            self.learnModel(innerTrainX, innerTrainY)

                            for k in range(maxDepths.shape[0]):
                                maxDepth = maxDepths[k]

                                robjects.globalenv["maxDepth"] = maxDepth
                                robjects.globalenv["tree"] = self.tree
                                nodeList = robjects.r('tree$nodes[tree$depth>=maxDepth]')
                                self.tree = self.treeRankLib.subTreeRank(self.tree, nodeList)

                                predY = self.predict(innerTestX)
                                gc.collect()

                                metrics[j, k] = Evaluator.auc(predY, innerTestY)
                                
                            j += 1

                        meanAUC = numpy.mean(metrics, 0)
                        varAUC = numpy.var(metrics, 0)
                        logging.warn(self.baseLib.warnings())
                        logging.debug("Mean AUCs and variances at each depth " + str((meanAUC, varAUC)))

                        for k in range(maxDepths.shape[0]):
                            maxDepth = maxDepths[k]
                            meanParamAUCs.append(meanAUC[k])
                            paramList.append((maxDepth, varSplit, nfcv, minSplit))

                        #Try to get some memory back
                        gc.collect()
                        robjects.r('gc(verbose=TRUE)')
                        robjects.r('memory.profile()')

                        #print(self.hp.heap())

            #Now choose best params
            bestInd = numpy.argmax(numpy.array(meanParamAUCs))

            self.setMaxDepth(paramList[bestInd][0])
            self.setVarSplit(paramList[bestInd][1])
            self.setNfcv(paramList[bestInd][2])
            self.setMinSplit(paramList[bestInd][3])

            self.learnModel(trainX, trainY)
            predTrainY = self.predict(trainX)
            predTestY = self.predict(testX)
            bestTrainAUCs[i] = Evaluator.auc(predTrainY, trainY)
            bestTestAUCs[i] = Evaluator.auc(predTestY, testY)

            #Store the parameters and ROC curves
            bestParams.append(paramList[bestInd])
            bestTrainROCs.append(Evaluator.roc(trainY, predTrainY))
            bestTestROCs.append(Evaluator.roc(testY, predTestY))

            metaDict = {}
            metaDict["size"] = self.getTreeSize()
            metaDict["depth"] = self.getTreeDepth()
            bestMetaDicts.append(metaDict)

            i += 1

        allMetrics = [bestTrainAUCs, bestTrainROCs, bestTestAUCs, bestTestROCs]

        return (bestParams, allMetrics, bestMetaDicts)
Example #43
#Figure out why the penalty is increasing 
X = trainX 
y = trainY 

for i in range(foldsSet.shape[0]): 
    folds = foldsSet[i]
    idx = Sampling.crossValidation(folds, validX.shape[0])
    
    penalty = 0
    fullError = 0 
    trainError = 0     
    
    learner.learnModel(validX, validY)
    predY = learner.predict(X)
    predValidY = learner.predict(validX)
    idealPenalty = Evaluator.rootMeanSqError(predY, y) - Evaluator.rootMeanSqError(predValidY, validY)
    
    for trainInds, testInds in idx:
        trainX = validX[trainInds, :]
        trainY = validY[trainInds]
    
        #learner.setGamma(gamma)
        #learner.setC(C)
        learner.learnModel(trainX, trainY)
        predY = learner.predict(validX)
        predTrainY = learner.predict(trainX)
        fullError += Evaluator.rootMeanSqError(predY, validY)
        trainError += Evaluator.rootMeanSqError(predTrainY, trainY)
        penalty += Evaluator.rootMeanSqError(predY, validY) - Evaluator.rootMeanSqError(predTrainY, trainY)
        
    print((folds-1)*fullError/folds, (folds-1)*trainError/folds, (folds-1)*penalty/folds)
Example #44
        methodNames = graphRanker.getNames()
        
        if runLSI: 
            outputFilename = dataset.getOutputFieldDir(field) + "outputListsLSI.npz"
        else: 
            outputFilename = dataset.getOutputFieldDir(field) + "outputListsLDA.npz"
            
        Util.savePickle([outputLists, trainExpertMatchesInds, testExpertMatchesInds], outputFilename, debug=True)
        
        numMethods = len(outputLists)
        precisions = numpy.zeros((len(ns), numMethods))
        averagePrecisions = numpy.zeros(numMethods)
        
        for i, n in enumerate(ns):     
            for j in range(len(outputLists)): 
                precisions[i, j] = Evaluator.precisionFromIndLists(testExpertMatchesInds, outputLists[j][0:n]) 
            
        for j in range(len(outputLists)):                 
            averagePrecisions[j] = Evaluator.averagePrecisionFromLists(testExpertMatchesInds, outputLists[j][0:averagePrecisionN], averagePrecisionN) 
        
        precisions2 = numpy.c_[numpy.array(ns), precisions]
        
        logging.debug(Latex.listToRow(methodNames))
        logging.debug("Computing Precision")
        logging.debug(Latex.array2DToRows(precisions2))
        logging.debug("Computing Average Precision")
        logging.debug(Latex.array1DToRow(averagePrecisions))
#Close the file
fich.close()

logging.debug("All done!")
Example #45
    def testModelSelect(self):
        """
        We test the results on some data and compare to SVR.
        """
        numExamples = 200
        X, y = data.make_regression(numExamples, noise=0.5)  
        
        X = Standardiser().standardiseArray(X)
        y = Standardiser().standardiseArray(y)
        
        trainX = X[0:100, :]
        trainY = y[0:100]
        testX = X[100:, :]
        testY = y[100:]
        
        learner = DecisionTreeLearner(maxDepth=20, minSplit=10, pruneType="REP-CV")
        learner.setPruneCV(8)
        
        paramDict = {} 
        paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 10) 
        paramDict["setPruneCV"] = numpy.arange(6, 11, 2, numpy.int)
        
        folds = 5
        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)


        predY = bestTree.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)
        
        
        learner = DecisionTreeLearner(maxDepth=20, minSplit=5, pruneType="CART")
        
        paramDict = {} 
        paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 50) 
        
        folds = 5
        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)


        predY = bestTree.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)
              
        return 
        #Let's compare to the SVM 
        learner2 = LibSVM(kernel='gaussian', type="Epsilon_SVR") 
        
        paramDict = {} 
        paramDict["setC"] = 2.0**numpy.arange(-10, 14, 2, dtype=numpy.float)
        paramDict["setGamma"] = 2.0**numpy.arange(-10, 4, 2, dtype=numpy.float)
        paramDict["setEpsilon"] = learner2.getEpsilons()
        
        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestSVM, cvGrid = learner2.parallelModelSelect(trainX, trainY, idx, paramDict)

        predY = bestSVM.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)
Example #46
    def testPredict(self):
        rankBoost = RankBoost()
        rankBoost.learnModel(self.X, self.y)
        predY = rankBoost.predict(self.X)

        self.assertTrue(Evaluator.auc(predY, self.y) <= 1.0 and Evaluator.auc(predY, self.y) >= 0.0)