Exemple #1
0
    def testCrossValidation(self):
        """
        Check the (train, test) index pairs returned by
        Sampling.crossValidation for 2, 3, 4 and numExamples folds over 10
        examples, and that invalid fold or example counts raise ValueError.
        """
        numExamples = 10
        folds = 2

        def checkFolds(indices, expected):
            #Compare the leading folds with the expected (train, test) pairs.
            #Only as many folds as there are expected pairs are checked.
            for i, expectedPair in enumerate(expected):
                actualPair = (list(indices[i][0]), list(indices[i][1]))
                self.assertEqual(actualPair, expectedPair)

        checkFolds(Sampling.crossValidation(folds, numExamples), [
            ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4]),
            ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),
        ])

        checkFolds(Sampling.crossValidation(3, numExamples), [
            ([3, 4, 5, 6, 7, 8, 9], [0, 1, 2]),
            ([0, 1, 2, 6, 7, 8, 9], [3, 4, 5]),
            ([0, 1, 2, 3, 4, 5], [6, 7, 8, 9]),
        ])

        checkFolds(Sampling.crossValidation(4, numExamples), [
            ([2, 3, 4, 5, 6, 7, 8, 9], [0, 1]),
            ([0, 1, 5, 6, 7, 8, 9], [2, 3, 4]),
            ([0, 1, 2, 3, 4, 7, 8, 9], [5, 6]),
            ([0, 1, 2, 3, 4, 5, 6], [7, 8, 9]),
        ])

        #Leave-one-out: only the first five folds are checked
        checkFolds(Sampling.crossValidation(numExamples, numExamples), [
            ([1, 2, 3, 4, 5, 6, 7, 8, 9], [0]),
            ([0, 2, 3, 4, 5, 6, 7, 8, 9], [1]),
            ([0, 1, 3, 4, 5, 6, 7, 8, 9], [2]),
            ([0, 1, 2, 4, 5, 6, 7, 8, 9], [3]),
            ([0, 1, 2, 3, 5, 6, 7, 8, 9], [4]),
        ])

        #More folds than examples, non-positive folds and a single example
        #must all be rejected
        self.assertRaises(ValueError, Sampling.crossValidation, numExamples+1, numExamples)
        self.assertRaises(ValueError, Sampling.crossValidation, 0, numExamples)
        self.assertRaises(ValueError, Sampling.crossValidation, -1, numExamples)
        self.assertRaises(ValueError, Sampling.crossValidation, folds, 1)
Exemple #2
0
 def testParallelPen(self):
     """
     Check the penalties returned by parallelPen for a CART-pruned decision
     tree: they must be infinite for every gamma larger than the mean size of
     the unpruned trees, and not all infinite for the smaller gammas.
     """
     #Check if penalisation == inf when treeSize < gamma
     numExamples = 100
     X, y = data.make_regression(numExamples)
     learner = DecisionTreeLearner(pruneType="CART", maxDepth=10, minSplit=2)

     paramDict = {}
     #The builtin int replaces numpy.int, which was only an alias for it and
     #has been removed from recent NumPy releases
     paramDict["setGamma"] = numpy.array(numpy.round(2**numpy.arange(1, 10, 0.5)-1), dtype=int)

     folds = 3
     alpha = 1.0
     Cvs = numpy.array([(folds-1)*alpha])

     idx = Sampling.crossValidation(folds, X.shape[0])

     resultsList = learner.parallelPen(X, y, idx, paramDict, Cvs)

     learner, trainErrors, currentPenalties = resultsList[0]

     learner.setGamma(2**10)
     treeSize = 0
     #Let's work out the size of the unpruned tree
     for trainInds, _ in idx:
         trainX = X[trainInds, :]
         trainY = y[trainInds]

         learner.learnModel(trainX, trainY)
         treeSize += learner.tree.size

     #Mean tree size over the folds
     treeSize /= float(folds)

     gammas = paramDict["setGamma"]
     self.assertTrue(numpy.isinf(currentPenalties[gammas > treeSize]).all())
     self.assertFalse(numpy.isinf(currentPenalties[gammas < treeSize]).all())
Exemple #3
0
    def testParallelVfPenRbf2(self):
        """
        Test parallelVfPenRbf for support vector regression: recompute the
        penalised error grid over (gamma, epsilon, C) sequentially and check
        that the returned learner and error grid agree with it.
        """
        folds = 3
        Cv = numpy.array([4.0])
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")
        svm.setSvmType("Epsilon_SVR")
        resultsList = svm.parallelVfPenRbf(self.X, self.y, idx, Cv, type="Epsilon_SVR")

        tol = 10**-6
        #Start from infinity so that any finite error is accepted and the
        #best parameters are bound whatever the scale of the errors
        bestError = float("inf")
        meanErrors2 = numpy.zeros(
            (svm.gammas.shape[0], svm.epsilons.shape[0], svm.Cs.shape[0]))

        for i, C in enumerate(svm.Cs):
            for j, gamma in enumerate(svm.gammas):
                for k, epsilon in enumerate(svm.epsilons):
                    #The penalty is Cv[0] times the mean over the folds of
                    #(error on all examples - error on the training fold)
                    penalty = 0
                    for trainInds, _ in idx:
                        trainX = self.X[trainInds, :]
                        trainY = self.y[trainInds]

                        svm.setGamma(gamma)
                        svm.setC(C)
                        svm.setEpsilon(epsilon)
                        svm.learnModel(trainX, trainY)
                        predY = svm.predict(self.X)
                        predTrainY = svm.predict(trainX)
                        penalty += svm.getMetricMethod()(predY, self.y)
                        penalty -= svm.getMetricMethod()(predTrainY, trainY)

                    penalty = penalty * Cv[0] / len(idx)
                    svm.learnModel(self.X, self.y)
                    predY = svm.predict(self.X)
                    meanErrors2[j, k, i] = svm.getMetricMethod()(predY, self.y) + penalty

                    if meanErrors2[j, k, i] < bestError:
                        bestC = C
                        bestGamma = gamma
                        bestEpsilon = epsilon
                        bestError = meanErrors2[j, k, i]

        bestSVM, trainErrors, currentPenalties = resultsList[0]
        meanErrors = trainErrors + currentPenalties

        self.assertEqual(bestC, bestSVM.getC())
        self.assertEqual(bestGamma, bestSVM.getGamma())
        self.assertEqual(bestEpsilon, bestSVM.getEpsilon())
        self.assertTrue(numpy.linalg.norm(meanErrors2 - meanErrors) < tol)
    def cvModelSelection(self, graph, paramList, paramFunc, folds, errorFunc):
        """
        Evaluate each parameter set of paramList by cross validation over the
        edges of graph.

        ParamList is a list of lists of parameters and paramFunc
        is a list of the corresponding functions to call with the parameters
        as arguments. Note that a parameter can also be a tuple which is expanded
        out before the function is called.

        e.g.
        paramList = [[1, 2], [2, 1], [12, 1]]
        paramFunc = [predictor.setC, predictor.setD]

        For every fold a training graph and a test graph are built from the
        corresponding edges of graph, the model is learnt on the training
        graph and errorFunc(y, predY) is recorded for the test edges.

        Returns a tuple (meanErrors, stdErrors) holding the mean and the
        standard deviation of the fold errors of each parameter set.
        """

        inds = Sampling.crossValidation(folds, graph.getNumEdges())
        errors = numpy.zeros((len(paramList), folds))
        allEdges = graph.getAllEdges()

        for i, paramSet in enumerate(paramList):
            logging.debug("Using paramSet=" + str(paramSet))

            #Configure the predictor; tuples (and tuple subclasses) are
            #expanded into positional arguments
            for j, param in enumerate(paramSet):
                if isinstance(param, tuple):
                    paramFunc[j](*param)
                else:
                    paramFunc[j](param)

            for j, (trainInds, testInds) in enumerate(inds):
                trainEdges = allEdges[trainInds, :]
                testEdges = allEdges[testInds, :]

                trainGraph = SparseGraph(graph.getVertexList(),
                                         graph.isUndirected())
                trainGraph.addEdges(trainEdges,
                                    graph.getEdgeValues(trainEdges))

                testGraph = SparseGraph(graph.getVertexList(),
                                        graph.isUndirected())
                testGraph.addEdges(testEdges, graph.getEdgeValues(testEdges))

                self.learnModel(trainGraph)

                predY = self.predictEdges(testGraph, testGraph.getAllEdges())
                y = testGraph.getEdgeValues(testGraph.getAllEdges())
                #Note that the order the edges is different in testGraphs as
                #opposed to graph when calling getAllEdges()

                errors[i, j] = errorFunc(y, predY)

            #This is the mean over all folds for the current parameter set
            logging.info("Mean error over folds: " +
                         str(numpy.mean(errors[i, :])))

        meanErrors = numpy.mean(errors, 1)
        stdErrors = numpy.std(errors, 1)

        return meanErrors, stdErrors
Exemple #5
0
    def testParallelPen(self):
        """
        Test parallelPen over a grid of C and gamma values: recompute the
        training errors and penalties sequentially and check that the
        returned learner and grids agree with them.
        """
        folds = 3
        Cv = numpy.array([4.0])
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")

        paramDict = {}
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()

        resultsList = svm.parallelPen(self.X, self.y, idx, paramDict, Cv)

        tol = 10**-6
        #Start from infinity so that any finite error is accepted and the
        #best parameters are bound even if every penalised error exceeds 1
        bestError = float("inf")
        gridShape = (svm.Cs.shape[0], svm.gammas.shape[0])
        trainErrors2 = numpy.zeros(gridShape)
        penalties2 = numpy.zeros(gridShape)
        meanErrors2 = numpy.zeros(gridShape)

        for i, C in enumerate(svm.Cs):
            for j, gamma in enumerate(svm.gammas):
                #The penalty is Cv[0] times the mean over the folds of
                #(error on all examples - error on the training fold)
                penalty = 0
                for trainInds, _ in idx:
                    trainX = self.X[trainInds, :]
                    trainY = self.y[trainInds]

                    svm.setGamma(gamma)
                    svm.setC(C)
                    svm.learnModel(trainX, trainY)
                    predY = svm.predict(self.X)
                    predTrainY = svm.predict(trainX)
                    penalty += Evaluator.binaryError(predY, self.y)
                    penalty -= Evaluator.binaryError(predTrainY, trainY)

                penalty = penalty * Cv[0] / len(idx)
                svm.learnModel(self.X, self.y)
                predY = svm.predict(self.X)
                trainErrors2[i, j] = Evaluator.binaryError(predY, self.y)
                penalties2[i, j] = penalty
                meanErrors2[i, j] = trainErrors2[i, j] + penalty

                if meanErrors2[i, j] < bestError:
                    bestC = C
                    bestGamma = gamma
                    bestError = meanErrors2[i, j]

        bestSVM, trainErrors, currentPenalties = resultsList[0]
        meanErrors = trainErrors + currentPenalties

        self.assertEqual(bestC, bestSVM.getC())
        self.assertEqual(bestGamma, bestSVM.getGamma())
        #The local grids are indexed (C, gamma), hence the transposes
        self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
        self.assertTrue(numpy.linalg.norm(trainErrors2.T - trainErrors) < tol)
        self.assertTrue(
            numpy.linalg.norm(penalties2.T - currentPenalties) < tol)
 def cvPrune(self, validX, validY): 
     """
     We do something like reduced error pruning but we use cross validation 
     to decide which nodes to prune. 

     The rows of validX are split into self.folds cross validation folds. For
     each fold the examples are passed down the existing tree: the training
     part gives every vertex a temporary value (the mean label of the training
     examples reaching it) and the test part is used to accumulate a squared
     test error at every vertex. Finally computeAlphas() and prune() are
     called.
     """
     
     #First set the value of the vertices using the training set. 
     #Reset all alphas to zero 
     inds = Sampling.crossValidation(self.folds, validX.shape[0])
     
     for i in self.tree.getAllVertexIds(): 
         self.tree.getVertex(i).setAlpha(0.0)
         self.tree.getVertex(i).setTestError(0.0)
     
     for trainInds, testInds in inds:             
         #The root receives all the training and test examples of this fold
         rootId = (0,)
         root = self.tree.getVertex(rootId)
         root.setTrainInds(trainInds)
         root.setTestInds(testInds)
         root.tempValue = numpy.mean(validY[trainInds])
         
         #Stack of (vertex id, temporary value) pairs still to be processed
         nodeStack = [(rootId, root.tempValue)]
         
         while len(nodeStack) != 0: 
             (nodeId, value) = nodeStack.pop()
             node = self.tree.getVertex(nodeId)
             tempTrainInds = node.getTrainInds()
             tempTestInds = node.getTestInds()
             #Add the squared error of this fold's test examples to the error
             #accumulated over the previous folds
             node.setTestError(numpy.sum((validY[tempTestInds] - node.tempValue)**2) + node.getTestError())
             childIds = [self.getLeftChildId(nodeId), self.getRightChildId(nodeId)]
             
             for childId in childIds:                 
                 if self.tree.vertexExists(childId): 
                     child = self.tree.getVertex(childId)
                     
                     #An id ending in 0 takes the examples below the threshold,
                     #any other id takes the remaining ones
                     if childId[-1] == 0: 
                         childInds = validX[tempTrainInds, node.getFeatureInd()] < node.getThreshold()
                     else: 
                         childInds = validX[tempTrainInds, node.getFeatureInd()] >= node.getThreshold()
                     
                     #With no training examples in the child, value keeps what
                     #it held before: the value popped for this node, or the
                     #mean just computed for the sibling handled first.
                     #NOTE(review): presumably the parent's value is intended
                     #in both cases - confirm
                     if childInds.sum() !=0:   
                         value = numpy.mean(validY[tempTrainInds[childInds]])
                         
                     child.tempValue = value 
                     child.setTrainInds(tempTrainInds[childInds])
                     nodeStack.append((childId, value))
                     
                     #Route the test examples with the same split rule
                     if childId[-1] == 0: 
                         childInds = validX[tempTestInds, node.getFeatureInd()] < node.getThreshold() 
                     else: 
                         childInds = validX[tempTestInds, node.getFeatureInd()] >= node.getThreshold()  
                      
                     child.setTestInds(tempTestInds[childInds])
     
     self.computeAlphas()
     self.prune()
    def cvModelSelection(self, graph, paramList, paramFunc, folds, errorFunc):
        """
        Evaluate each parameter set of paramList by cross validation over the
        edges of graph.

        ParamList is a list of lists of parameters and paramFunc
        is a list of the corresponding functions to call with the parameters
        as arguments. Note that a parameter can also be a tuple which is expanded
        out before the function is called.

        e.g.
        paramList = [[1, 2], [2, 1], [12, 1]]
        paramFunc = [predictor.setC, predictor.setD]

        For every fold a training graph and a test graph are built from the
        corresponding edges of graph, the model is learnt on the training
        graph and errorFunc(y, predY) is recorded for the test edges.

        Returns a tuple (meanErrors, stdErrors) holding the mean and the
        standard deviation of the fold errors of each parameter set.
        """

        inds = Sampling.crossValidation(folds, graph.getNumEdges())
        errors = numpy.zeros((len(paramList), folds))
        allEdges = graph.getAllEdges()

        for i, paramSet in enumerate(paramList):
            logging.debug("Using paramSet=" + str(paramSet))

            #Configure the predictor; tuples (and tuple subclasses) are
            #expanded into positional arguments
            for j, param in enumerate(paramSet):
                if isinstance(param, tuple):
                    paramFunc[j](*param)
                else:
                    paramFunc[j](param)

            for j, (trainInds, testInds) in enumerate(inds):
                trainEdges = allEdges[trainInds, :]
                testEdges = allEdges[testInds, :]

                trainGraph = SparseGraph(graph.getVertexList(), graph.isUndirected())
                trainGraph.addEdges(trainEdges, graph.getEdgeValues(trainEdges))

                testGraph = SparseGraph(graph.getVertexList(), graph.isUndirected())
                testGraph.addEdges(testEdges, graph.getEdgeValues(testEdges))

                self.learnModel(trainGraph)

                predY = self.predictEdges(testGraph, testGraph.getAllEdges())
                y = testGraph.getEdgeValues(testGraph.getAllEdges())
                #Note that the order the edges is different in testGraphs as
                #opposed to graph when calling getAllEdges()

                errors[i, j] = errorFunc(y, predY)

            #This is the mean over all folds for the current parameter set
            logging.info("Mean error over folds: " + str(numpy.mean(errors[i, :])))

        meanErrors = numpy.mean(errors, 1)
        stdErrors = numpy.std(errors, 1)

        return meanErrors, stdErrors
Exemple #8
0
    def testParallelPen(self):
        """
        Test parallelPen over a grid of C and gamma values: recompute the
        training errors and penalties sequentially and check that the
        returned learner and grids agree with them.
        """
        folds = 3
        Cv = numpy.array([4.0])
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")

        paramDict = {}
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()

        resultsList = svm.parallelPen(self.X, self.y, idx, paramDict, Cv)

        tol = 10**-6
        #Start from infinity so that any finite error is accepted and the
        #best parameters are bound even if every penalised error exceeds 1
        bestError = float("inf")
        gridShape = (svm.Cs.shape[0], svm.gammas.shape[0])
        trainErrors2 = numpy.zeros(gridShape)
        penalties2 = numpy.zeros(gridShape)
        meanErrors2 = numpy.zeros(gridShape)

        for i, C in enumerate(svm.Cs):
            for j, gamma in enumerate(svm.gammas):
                #The penalty is Cv[0] times the mean over the folds of
                #(error on all examples - error on the training fold)
                penalty = 0
                for trainInds, _ in idx:
                    trainX = self.X[trainInds, :]
                    trainY = self.y[trainInds]

                    svm.setGamma(gamma)
                    svm.setC(C)
                    svm.learnModel(trainX, trainY)
                    predY = svm.predict(self.X)
                    predTrainY = svm.predict(trainX)
                    penalty += Evaluator.binaryError(predY, self.y)
                    penalty -= Evaluator.binaryError(predTrainY, trainY)

                penalty = penalty*Cv[0]/len(idx)
                svm.learnModel(self.X, self.y)
                predY = svm.predict(self.X)
                trainErrors2[i, j] = Evaluator.binaryError(predY, self.y)
                penalties2[i, j] = penalty
                meanErrors2[i, j] = trainErrors2[i, j] + penalty

                if meanErrors2[i, j] < bestError:
                    bestC = C
                    bestGamma = gamma
                    bestError = meanErrors2[i, j]

        bestSVM, trainErrors, currentPenalties = resultsList[0]
        meanErrors = trainErrors + currentPenalties

        self.assertEqual(bestC, bestSVM.getC())
        self.assertEqual(bestGamma, bestSVM.getGamma())
        #The local grids are indexed (C, gamma), hence the transposes
        self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
        self.assertTrue(numpy.linalg.norm(trainErrors2.T - trainErrors) < tol)
        self.assertTrue(numpy.linalg.norm(penalties2.T - currentPenalties) < tol)
Exemple #9
0
    def generateLearner(self, X, y):
        """
        Train using the given examples and labels, and use model selection to
        find the best parameters.

        If self.sampleSize is None, model selection runs on all of X and y.
        Otherwise it runs on a random subset of self.sampleSize examples and
        the returned learner comes from getBestLearner, called with the
        errors of that search and the complete X and y.

        Raises ValueError if y does not hold exactly 2 distinct labels.
        """
        if numpy.unique(y).shape[0] != 2:
            #Show the offending labels before failing
            print(y)
            raise ValueError("Can only operate on binary data")

        #Do model selection first
        if self.sampleSize is None:
            idx = Sampling.crossValidation(self.folds, X.shape[0])
            learner, meanErrors = self.parallelModelSelect(X, y, idx, self.paramDict)
        else:
            idx = Sampling.crossValidation(self.folds, self.sampleSize)
            inds = numpy.random.permutation(X.shape[0])[0:self.sampleSize]
            #Only the error grid of the subsample search is used
            _, meanErrors = self.parallelModelSelect(X[inds, :], y[inds], idx, self.paramDict)
            learner = self.getBestLearner(meanErrors, self.paramDict, X, y)

        return learner
    def evaluateCv(self, X, y, folds, metricMethod=Evaluator.binaryError):
        """
        Cross validate this predictor on (X, y) with the given number of folds
        (checked with Parameter.checkInt using a lower bound of 2) and return
        the (mean, variance) of the results of AbstractPredictor.evaluateLearn
        for metricMethod, both taken along the first axis.
        """
        Parameter.checkInt(folds, 2, float('inf'))
        foldInds = Sampling.crossValidation(folds, y.shape[0])
        scores = AbstractPredictor.evaluateLearn(
            X, y, foldInds, self.learnModel, self.predict, metricMethod)

        return (numpy.mean(scores, 0), numpy.var(scores, 0))
Exemple #11
0
    def testParallelVfPenRbf2(self):
        """
        Test parallelVfPenRbf for support vector regression: recompute the
        penalised error grid over (gamma, epsilon, C) sequentially and check
        that the returned learner and error grid agree with it.
        """
        #Test support vector regression
        folds = 3
        Cv = numpy.array([4.0])
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")
        svm.setSvmType("Epsilon_SVR")
        resultsList = svm.parallelVfPenRbf(self.X, self.y, idx, Cv, type="Epsilon_SVR")

        tol = 10**-6
        #Start from infinity so that any finite error is accepted and the
        #best parameters are bound whatever the scale of the errors
        bestError = float("inf")
        meanErrors2 = numpy.zeros((svm.gammas.shape[0], svm.epsilons.shape[0], svm.Cs.shape[0]))

        for i, C in enumerate(svm.Cs):
            for j, gamma in enumerate(svm.gammas):
                for k, epsilon in enumerate(svm.epsilons):
                    #The penalty is Cv[0] times the mean over the folds of
                    #(error on all examples - error on the training fold)
                    penalty = 0
                    for trainInds, _ in idx:
                        trainX = self.X[trainInds, :]
                        trainY = self.y[trainInds]

                        svm.setGamma(gamma)
                        svm.setC(C)
                        svm.setEpsilon(epsilon)
                        svm.learnModel(trainX, trainY)
                        predY = svm.predict(self.X)
                        predTrainY = svm.predict(trainX)
                        penalty += svm.getMetricMethod()(predY, self.y)
                        penalty -= svm.getMetricMethod()(predTrainY, trainY)

                    penalty = penalty*Cv[0]/len(idx)
                    svm.learnModel(self.X, self.y)
                    predY = svm.predict(self.X)
                    meanErrors2[j, k, i] = svm.getMetricMethod()(predY, self.y) + penalty

                    if meanErrors2[j, k, i] < bestError:
                        bestC = C
                        bestGamma = gamma
                        bestEpsilon = epsilon
                        bestError = meanErrors2[j, k, i]

        bestSVM, trainErrors, currentPenalties = resultsList[0]
        meanErrors = trainErrors + currentPenalties

        self.assertEqual(bestC, bestSVM.getC())
        self.assertEqual(bestGamma, bestSVM.getGamma())
        self.assertEqual(bestEpsilon, bestSVM.getEpsilon())
        self.assertTrue(numpy.linalg.norm(meanErrors2 - meanErrors) < tol)
    def evaluateCv(self, X, y, folds, metricMethod=Evaluator.binaryError):
        """
        Run cross validation of this predictor over (X, y) and summarise the
        metric values.

        folds is checked with Parameter.checkInt using a lower bound of 2. The
        return value is the tuple (mean, var) of the results of
        AbstractPredictor.evaluateLearn, each computed along the first axis.
        """
        Parameter.checkInt(folds, 2, float('inf'))
        cvInds = Sampling.crossValidation(folds, y.shape[0])
        results = AbstractPredictor.evaluateLearn(
            X, y, cvInds, self.learnModel, self.predict, metricMethod)

        meanMetric = numpy.mean(results, 0)
        varMetric = numpy.var(results, 0)
        return (meanMetric, varMetric)
    def testParallelPenaltyGrid(self):
        """
        Run RandomForest.parallelPenaltyGrid on the first 40 examples against
        the full data set; the test only checks that the calls complete.
        """
        #The fold indices are generated but not used afterwards
        Sampling.crossValidation(3, self.X.shape[0])
        forest = RandomForest()

        subsetX = self.X[:40, :]
        subsetY = self.y[:40]

        paramGrid = {
            "setMinSplit": forest.getMinSplits(),
            "setMaxDepth": forest.getMaxDepths(),
        }

        forest.parallelPenaltyGrid(subsetX, subsetY, self.X, self.y, paramGrid)
Exemple #14
0
    def generateLearner(self, X, y):
        """
        Train using the given examples and labels, and use model selection to
        find the best parameters.

        If self.sampleSize is None, model selection runs on all of X and y.
        Otherwise it runs on a random subset of self.sampleSize examples and
        the returned learner comes from getBestLearner, called with the
        errors of that search and the complete X and y.

        Raises ValueError if y does not hold exactly 2 distinct labels.
        """
        if numpy.unique(y).shape[0] != 2:
            #Show the offending labels before failing
            print(y)
            raise ValueError("Can only operate on binary data")

        #Do model selection first
        if self.sampleSize is None:
            idx = Sampling.crossValidation(self.folds, X.shape[0])
            learner, meanErrors = self.parallelModelSelect(
                X, y, idx, self.paramDict)
        else:
            idx = Sampling.crossValidation(self.folds, self.sampleSize)
            inds = numpy.random.permutation(X.shape[0])[0:self.sampleSize]
            #Only the error grid of the subsample search is used
            _, meanErrors = self.parallelModelSelect(
                X[inds, :], y[inds], idx, self.paramDict)
            learner = self.getBestLearner(meanErrors, self.paramDict, X, y)

        return learner
Exemple #15
0
    def testParallelVfcvRbf2(self):
        """
        Test parallelVfcvRbf for SVM regression: recompute the cross
        validation error grid over (gamma, epsilon, C) sequentially and check
        that the returned learner and error grid agree with it.
        """
        #In this test we try SVM regression
        folds = 3
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")
        svm.setSvmType("Epsilon_SVR")
        bestSVM, meanErrors = svm.parallelVfcvRbf(self.X, self.y, idx, type="Epsilon_SVR")

        tol = 10**-6
        #Start from infinity so that any finite error is accepted and the
        #best parameters are bound whatever the scale of the errors
        bestError = float("inf")
        meanErrors2 = numpy.zeros(
            (svm.gammas.shape[0], svm.epsilons.shape[0], svm.Cs.shape[0]))

        for i, C in enumerate(svm.Cs):
            for j, gamma in enumerate(svm.gammas):
                for k, epsilon in enumerate(svm.epsilons):
                    #Sum of the test errors over the folds
                    error = 0
                    for trainInds, testInds in idx:
                        trainX = self.X[trainInds, :]
                        trainY = self.y[trainInds]
                        testX = self.X[testInds, :]
                        testY = self.y[testInds]

                        svm.setGamma(gamma)
                        svm.setC(C)
                        svm.setEpsilon(epsilon)
                        svm.learnModel(trainX, trainY)
                        predY = svm.predict(testX)
                        error += svm.getMetricMethod()(predY, testY)

                    meanErrors2[j, k, i] = error / len(idx)

                    #The best point is selected on the summed error
                    if error < bestError:
                        bestC = C
                        bestGamma = gamma
                        bestError = error
                        bestEpsilon = epsilon

        self.assertEqual(bestC, bestSVM.getC())
        self.assertEqual(bestGamma, bestSVM.getGamma())
        self.assertEqual(bestEpsilon, bestSVM.getEpsilon())
        self.assertTrue(numpy.linalg.norm(meanErrors2 - meanErrors) < tol)
    def testParallelPenaltyGrid(self):
        """
        Run DecisionTree.parallelVfcv on the full data set and then
        parallelPenaltyGrid on the first 40 examples against the full data
        set; the test only checks that the calls complete.
        """
        cvInds = Sampling.crossValidation(3, self.X.shape[0])
        tree = DecisionTree()
        #The results of the cross validation are not inspected
        tree.parallelVfcv(self.X, self.y, cvInds)

        subsetX = self.X[:40, :]
        subsetY = self.y[:40]

        paramGrid = {
            "setMinSplit": tree.getMinSplits(),
            "setMaxDepth": tree.getMaxDepths(),
        }

        tree.parallelPenaltyGrid(subsetX, subsetY, self.X, self.y, paramGrid)
Exemple #17
0
    def testParallelPenaltyGrid(self):
        """
        Exercise RandomForest.parallelPenaltyGrid with the first 40 examples
        as training data and the full data set as the second pair of
        arguments; nothing is asserted about the result.
        """
        numFolds = 3
        #The fold indices are generated but not used afterwards
        Sampling.crossValidation(numFolds, self.X.shape[0])
        forest = RandomForest()

        firstX = self.X[:40, :]
        firstY = self.y[:40]

        grid = {}
        grid["setMinSplit"] = forest.getMinSplits()
        grid["setMaxDepth"] = forest.getMaxDepths()

        forest.parallelPenaltyGrid(firstX, firstY, self.X, self.y, grid)
Exemple #18
0
    def testParallelVfcvRbf2(self):
        """
        Test parallelVfcvRbf for SVM regression: recompute the cross
        validation error grid over (gamma, epsilon, C) sequentially and check
        that the returned learner and error grid agree with it.
        """
        #In this test we try SVM regression
        folds = 3
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")
        svm.setSvmType("Epsilon_SVR")
        bestSVM, meanErrors = svm.parallelVfcvRbf(self.X, self.y, idx, type="Epsilon_SVR")

        tol = 10**-6
        #Start from infinity so that any finite error is accepted and the
        #best parameters are bound whatever the scale of the errors
        bestError = float("inf")
        meanErrors2 = numpy.zeros((svm.gammas.shape[0], svm.epsilons.shape[0], svm.Cs.shape[0]))

        for i, C in enumerate(svm.Cs):
            for j, gamma in enumerate(svm.gammas):
                for k, epsilon in enumerate(svm.epsilons):
                    #Sum of the test errors over the folds
                    error = 0
                    for trainInds, testInds in idx:
                        trainX = self.X[trainInds, :]
                        trainY = self.y[trainInds]
                        testX = self.X[testInds, :]
                        testY = self.y[testInds]

                        svm.setGamma(gamma)
                        svm.setC(C)
                        svm.setEpsilon(epsilon)
                        svm.learnModel(trainX, trainY)
                        predY = svm.predict(testX)
                        error += svm.getMetricMethod()(predY, testY)

                    meanErrors2[j, k, i] = error/len(idx)

                    #The best point is selected on the summed error
                    if error < bestError:
                        bestC = C
                        bestGamma = gamma
                        bestError = error
                        bestEpsilon = epsilon

        self.assertEqual(bestC, bestSVM.getC())
        self.assertEqual(bestGamma, bestSVM.getGamma())
        self.assertEqual(bestEpsilon, bestSVM.getEpsilon())
        self.assertTrue(numpy.linalg.norm(meanErrors2 - meanErrors) < tol)
Exemple #19
0
    def testParallelModelSelect(self):
        """
        Test parallelModelSelect over a grid of C and gamma values: recompute
        the cross validation error grid sequentially and check that the
        returned learner and error grid agree with it.
        """
        folds = 3
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")

        paramDict = {}
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()

        bestSVM, meanErrors = svm.parallelModelSelect(self.X, self.y, idx,
                                                      paramDict)

        tol = 10**-6
        #The error compared below is summed over the folds, so start from
        #infinity: any finite sum is accepted and bestC, bestGamma are bound
        bestError = float("inf")
        meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
        print("Computing real grid")

        for i, C in enumerate(svm.Cs):
            for j, gamma in enumerate(svm.gammas):
                #Sum of the test errors over the folds
                error = 0
                for trainInds, testInds in idx:
                    trainX = self.X[trainInds, :]
                    trainY = self.y[trainInds]
                    testX = self.X[testInds, :]
                    testY = self.y[testInds]

                    svm.setGamma(gamma)
                    svm.setC(C)
                    svm.learnModel(trainX, trainY)
                    predY = svm.predict(testX)
                    error += Evaluator.binaryError(predY, testY)

                meanErrors2[i, j] = error / len(idx)

                if error < bestError:
                    bestC = C
                    bestGamma = gamma
                    bestError = error

        self.assertEqual(bestC, bestSVM.getC())
        self.assertEqual(bestGamma, bestSVM.getGamma())
        #The local grid is indexed (C, gamma), hence the transpose
        self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
Exemple #20
0
    def testParallelModelSelect(self):
        """
        Test parallelModelSelect over a grid of C and gamma values: recompute
        the cross validation error grid sequentially and check that the
        returned learner and error grid agree with it.
        """
        folds = 3
        idx = Sampling.crossValidation(folds, self.X.shape[0])
        svm = self.svm
        svm.setKernel("gaussian")

        paramDict = {}
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()

        bestSVM, meanErrors = svm.parallelModelSelect(self.X, self.y, idx, paramDict)

        tol = 10**-6
        #The error compared below is summed over the folds, so start from
        #infinity: any finite sum is accepted and bestC, bestGamma are bound
        bestError = float("inf")
        meanErrors2 = numpy.zeros((svm.Cs.shape[0], svm.gammas.shape[0]))
        print("Computing real grid")

        for i, C in enumerate(svm.Cs):
            for j, gamma in enumerate(svm.gammas):
                #Sum of the test errors over the folds
                error = 0
                for trainInds, testInds in idx:
                    trainX = self.X[trainInds, :]
                    trainY = self.y[trainInds]
                    testX = self.X[testInds, :]
                    testY = self.y[testInds]

                    svm.setGamma(gamma)
                    svm.setC(C)
                    svm.learnModel(trainX, trainY)
                    predY = svm.predict(testX)
                    error += Evaluator.binaryError(predY, testY)

                meanErrors2[i, j] = error/len(idx)

                if error < bestError:
                    bestC = C
                    bestGamma = gamma
                    bestError = error

        self.assertEqual(bestC, bestSVM.getC())
        self.assertEqual(bestGamma, bestSVM.getGamma())
        #The local grid is indexed (C, gamma), hence the transpose
        self.assertTrue(numpy.linalg.norm(meanErrors2.T - meanErrors) < tol)
Exemple #21
0
    def testGetBestLearner(self):
        """
        Test getBestLearner with normModelSelect enabled on a random error
        grid: the C of the returned learner must be the one whose weight norm
        on the full data is closest to the mean weight norm, over the cross
        validation training sets, of the lowest-error parameters.
        """
        svm = self.svm
        paramDict = {}
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()

        errors = numpy.random.rand(svm.getCs().shape[0],
                                   svm.getGammas().shape[0])

        folds = 5
        idx = Sampling.crossValidation(folds, self.X.shape[0])

        svm.normModelSelect = True
        svm.setKernel("gaussian")
        learner = svm.getBestLearner(errors, paramDict, self.X, self.y, idx)

        #Read the selected C before the learner is reconfigured below
        bestC = learner.getC()

        #Find the best norm
        bestInds = numpy.unravel_index(numpy.argmin(errors), errors.shape)
        learner.setC(svm.getCs()[bestInds[0]])
        learner.setGamma(svm.getGammas()[bestInds[1]])

        norms = []
        for trainInds, _ in idx:
            validX = self.X[trainInds, :]
            validY = self.y[trainInds]
            learner.learnModel(validX, validY)

            norms.append(learner.weightNorm())

        bestNorm = numpy.array(norms).mean()

        #Weight norm on the full data for every candidate C
        norms = numpy.zeros(paramDict["setC"].shape[0])
        for i, C in enumerate(paramDict["setC"]):
            learner.setC(C)
            learner.learnModel(self.X, self.y)
            norms[i] = learner.weightNorm()

        bestC2 = paramDict["setC"][numpy.abs(norms - bestNorm).argmin()]

        self.assertEqual(bestC, bestC2)
Exemple #22
0
    def testGetBestLearner(self):
        """
        Check the C chosen by getBestLearner when normModelSelect is enabled.

        The expected C is recomputed by hand: take the grid point with the
        smallest error, average the weight norm obtained on each cross
        validation training split at that point, then pick the C whose weight
        norm on the full data set is closest to that average.
        """
        svm = self.svm
        paramDict = {}
        paramDict["setC"] = svm.getCs()
        paramDict["setGamma"] = svm.getGammas()

        # Random error grid: only the position of its minimum matters here
        errors = numpy.random.rand(svm.getCs().shape[0], svm.getGammas().shape[0])

        folds = 5
        idx = Sampling.crossValidation(folds, self.X.shape[0])

        svm.normModelSelect = True
        svm.setKernel("gaussian")
        learner = svm.getBestLearner(errors, paramDict, self.X, self.y, idx)

        bestC = learner.getC()

        # Set the learner to the parameters with the smallest error
        bestInds = numpy.unravel_index(numpy.argmin(errors), errors.shape)
        learner.setC(svm.getCs()[bestInds[0]])
        learner.setGamma(svm.getGammas()[bestInds[1]])

        # Mean weight norm over the training part of each fold
        norms = []
        for trainInds, _ in idx:
            validX = self.X[trainInds, :]
            validY = self.y[trainInds]
            learner.learnModel(validX, validY)

            norms.append(learner.weightNorm())

        bestNorm = numpy.array(norms).mean()

        # Weight norm on the full data set for every candidate C
        norms = numpy.zeros(paramDict["setC"].shape[0])
        for i, C in enumerate(paramDict["setC"]):
            learner.setC(C)
            learner.learnModel(self.X, self.y)
            norms[i] = learner.weightNorm()

        bestC2 = paramDict["setC"][numpy.abs(norms-bestNorm).argmin()]

        # assertEquals is a deprecated alias that was removed in Python 3.12
        self.assertEqual(bestC, bestC2)
    def learningRate(self, X, y, foldsSet, paramDict):
        """
        Find a matrix beta which has the same dimensions as the parameter grid.
        Each value in the grid represents the learning rate with respect to
        those particular parameters.

        For every entry of foldsSet the penalties are computed with
        parallelPen, and for every grid point a straight line is fitted to
        log(penalty) + log(folds) against log((folds - 1) / folds * numExamples).
        The negated slope of that line is the value returned for the grid point.

        :param X: The examples as rows
        :type X: :class:`numpy.ndarray`

        :param y: The binary -1/+1 labels
        :type y: :class:`numpy.ndarray`

        :param foldsSet: An array of folds to try.
        :type foldsSet: :class:`numpy.ndarray`

        :param paramDict: A dictionary index by the method name and with value as an array of values
        :type paramDict: :class:`dict`

        :return: An array with one axis per entry of paramDict. Grid points with
            fewer than two finite, positive penalties are returned as -1.
        :raises ImportError: if scikit-learn cannot be imported.
        """
        from sklearn import linear_model

        gridSize = [values.shape[0] for values in paramDict.values()]
        gridInds = [numpy.arange(size) for size in gridSize]

        betaGrid = numpy.ones(tuple(gridSize))

        # One penalty grid per number of folds
        penalties = numpy.zeros((foldsSet.shape[0],) + tuple(gridSize))
        Cvs = numpy.array([1])

        for i in range(foldsSet.shape[0]):
            folds = foldsSet[i]
            logging.debug("Folds " + str(folds))

            idx = Sampling.crossValidation(folds, X.shape[0])
            resultsList = self.parallelPen(X, y, idx, paramDict, Cvs)
            # Only the penalties of the single Cv value are needed
            _, _, currentPenalties = resultsList[0]
            penalties[i, :] = currentPenalties

        for inds in itertools.product(*gridInds):
            # Penalties at this grid point for every number of folds
            tempPenalties = penalties[(slice(None),) + tuple(inds)]

            # The logarithm below needs finite, strictly positive penalties
            penInds = numpy.logical_and(numpy.isfinite(tempPenalties),
                                        tempPenalties > 0)
            penInds = numpy.squeeze(penInds)
            tempPenalties = tempPenalties[penInds].flatten()
            # numpy.float was removed in NumPy 1.24; it was the builtin float
            tempfoldsSet = numpy.array(foldsSet, float)[penInds]

            # A slope needs at least two points
            if tempPenalties.shape[0] > 1:
                xp = numpy.log((tempfoldsSet - 1) / tempfoldsSet * X.shape[0])
                yp = numpy.log(tempPenalties) + numpy.log(tempfoldsSet)

                clf = linear_model.LinearRegression()
                clf.fit(numpy.array([xp]).T, yp)
                betaGrid[inds] = clf.coef_[0]

        return -betaGrid
    def cvPrune(self, validX, validY):
        """
        Prune the tree in the manner of reduced error pruning, using cross
        validation over the given examples to decide which nodes to prune.

        For each fold the training indices are pushed down the tree to set a
        temporary value on every vertex, and the squared error of that value
        on the fold's test indices is added to the vertex test error. The
        alphas are then computed and the tree is pruned.

        :param validX: The examples as rows
        :param validY: The labels of the examples
        """
        foldIndices = Sampling.crossValidation(self.folds, validX.shape[0])

        # Start from zero alpha and zero accumulated test error everywhere
        for vertexId in self.tree.getAllVertexIds():
            vertex = self.tree.getVertex(vertexId)
            vertex.setAlpha(0.0)
            vertex.setTestError(0.0)

        rootId = (0, )

        for trainInds, testInds in foldIndices:
            root = self.tree.getVertex(rootId)
            root.setTrainInds(trainInds)
            root.setTestInds(testInds)
            root.tempValue = numpy.mean(validY[trainInds])

            pending = [(rootId, root.tempValue)]

            while pending:
                nodeId, value = pending.pop()
                node = self.tree.getVertex(nodeId)
                nodeTrainInds = node.getTrainInds()
                nodeTestInds = node.getTestInds()

                # Add this fold's squared test error to the running total
                residuals = validY[nodeTestInds] - node.tempValue
                node.setTestError(
                    numpy.sum(residuals**2) + node.getTestError())

                leftId = self.getLeftChildId(nodeId)
                rightId = self.getRightChildId(nodeId)

                for childId in (leftId, rightId):
                    if not self.tree.vertexExists(childId):
                        continue

                    child = self.tree.getVertex(childId)
                    isLeft = childId[-1] == 0
                    featureInd = node.getFeatureInd()
                    threshold = node.getThreshold()

                    # Training examples that fall into this child
                    trainFeature = validX[nodeTrainInds, featureInd]
                    if isLeft:
                        trainMask = trainFeature < threshold
                    else:
                        trainMask = trainFeature >= threshold

                    # With no training examples the previous value is kept
                    if trainMask.sum() != 0:
                        value = numpy.mean(validY[nodeTrainInds[trainMask]])

                    child.tempValue = value
                    child.setTrainInds(nodeTrainInds[trainMask])
                    pending.append((childId, value))

                    # Test examples that fall into this child
                    testFeature = validX[nodeTestInds, featureInd]
                    if isLeft:
                        testMask = testFeature < threshold
                    else:
                        testMask = testFeature >= threshold

                    child.setTestInds(nodeTestInds[testMask])

        self.computeAlphas()
        self.prune()
Exemple #25
0
 def run():
     """Generate cross validation indices and call parallelPen, twice over."""
     for iteration in range(2):
         print("Iteration " + str(iteration))
         foldIndices = Sampling.crossValidation(self.folds, numExamples)
         learner.parallelPen(X, Y, foldIndices, self.paramDict, Cvs)
Exemple #26
0
    idx = sampleMethod(folds, validY.shape[0])
    svmGridResults = learner.parallelPen(validX, validY, idx, paramDict, Cvs)
    
    
    for result in svmGridResults: 
        learner, trainErrors, currentPenalties = result
        print(numpy.mean(trainErrors), numpy.mean(currentPenalties))
"""

# Figure out why the penalty is increasing.
# Keep references to the original training data: trainX and trainY are
# rebound to cross validation subsets of the validation data further down.
X = trainX 
y = trainY 

# Repeat the analysis for every number of folds in foldsSet
for i in range(foldsSet.shape[0]): 
    folds = foldsSet[i]
    idx = Sampling.crossValidation(folds, validX.shape[0])
    
    # Accumulators, all starting from zero for this number of folds
    penalty = 0
    fullError = 0 
    trainError = 0     
    
    # Fit on all of the validation data. The ideal penalty is the error on
    # (X, y) minus the error on the data the model was fitted to.
    learner.learnModel(validX, validY)
    predY = learner.predict(X)
    predValidY = learner.predict(validX)
    idealPenalty = Evaluator.rootMeanSqError(predY, y) - Evaluator.rootMeanSqError(predValidY, validY)
    
    for trainInds, testInds in idx:
        # Training part of the current fold, taken from the validation data
        trainX = validX[trainInds, :]
        trainY = validY[trainInds]
    
        #learner.setGamma(gamma)
 def testParallelVfcv(self):
     """Run parallelVfcv on a 3-fold split and unpack the pair it returns."""
     numFolds = 3
     foldIndices = Sampling.crossValidation(numFolds, self.X.shape[0])
     tree = DecisionTree()
     bestLearner, meanErrors = tree.parallelVfcv(self.X, self.y, foldIndices)
    def learningRate(self, X, y, foldsSet, paramDict):
        """
        Find a matrix beta which has the same dimensions as the parameter grid.
        Each value in the grid represents the learning rate with respect to
        those particular parameters.

        For every entry of foldsSet the penalties are computed with
        parallelPen, and for every grid point a straight line is fitted to
        log(penalty) + log(folds) against log((folds - 1) / folds * numExamples).
        The negated slope of that line is the value returned for the grid point.

        :param X: The examples as rows
        :type X: :class:`numpy.ndarray`

        :param y: The binary -1/+1 labels
        :type y: :class:`numpy.ndarray`

        :param foldsSet: An array of folds to try.
        :type foldsSet: :class:`numpy.ndarray`

        :param paramDict: A dictionary index by the method name and with value as an array of values
        :type paramDict: :class:`dict`

        :return: An array with one axis per entry of paramDict. Grid points with
            fewer than two finite, positive penalties are returned as -1.
        :raises ImportError: if scikit-learn cannot be imported.
        """
        from sklearn import linear_model

        gridSize = [values.shape[0] for values in paramDict.values()]
        gridInds = [numpy.arange(size) for size in gridSize]

        betaGrid = numpy.ones(tuple(gridSize))

        # One penalty grid per number of folds
        penalties = numpy.zeros((foldsSet.shape[0],) + tuple(gridSize))
        Cvs = numpy.array([1])

        for i in range(foldsSet.shape[0]):
            folds = foldsSet[i]
            logging.debug("Folds " + str(folds))

            idx = Sampling.crossValidation(folds, X.shape[0])
            resultsList = self.parallelPen(X, y, idx, paramDict, Cvs)
            # Only the penalties of the single Cv value are needed
            _, _, currentPenalties = resultsList[0]
            penalties[i, :] = currentPenalties

        for inds in itertools.product(*gridInds):
            # Penalties at this grid point for every number of folds
            tempPenalties = penalties[(slice(None),) + tuple(inds)]

            # The logarithm below needs finite, strictly positive penalties
            penInds = numpy.logical_and(numpy.isfinite(tempPenalties), tempPenalties > 0)
            penInds = numpy.squeeze(penInds)
            tempPenalties = tempPenalties[penInds].flatten()
            # numpy.float was removed in NumPy 1.24; it was the builtin float
            tempfoldsSet = numpy.array(foldsSet, float)[penInds]

            # A slope needs at least two points
            if tempPenalties.shape[0] > 1:
                xp = numpy.log((tempfoldsSet-1)/tempfoldsSet*X.shape[0])
                yp = numpy.log(tempPenalties)+numpy.log(tempfoldsSet)

                clf = linear_model.LinearRegression()
                clf.fit(numpy.array([xp]).T, yp)
                betaGrid[inds] = clf.coef_[0]

        return -betaGrid
Exemple #29
0
    def testModelSelect(self):
        """
        Run model selection for the decision tree learner on generated
        regression data, once with "REP-CV" pruning and once with "CART"
        pruning, printing the test error of the selected tree each time.

        A comparison to SVR follows the early return and is not executed.
        """
        numExamples = 200
        X, y = data.make_regression(numExamples, noise=0.5)

        X = Standardiser().standardiseArray(X)
        y = Standardiser().standardiseArray(y)

        # First half for model selection, second half for testing
        trainX = X[0:100, :]
        trainY = y[0:100]
        testX = X[100:, :]
        testY = y[100:]

        learner = DecisionTreeLearner(maxDepth=20, minSplit=10, pruneType="REP-CV")
        learner.setPruneCV(8)

        paramDict = {}
        paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 10)
        # numpy.int was removed in NumPy 1.24; it was the builtin int
        paramDict["setPruneCV"] = numpy.arange(6, 11, 2, int)

        folds = 5
        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)

        predY = bestTree.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)

        learner = DecisionTreeLearner(maxDepth=20, minSplit=5, pruneType="CART")

        paramDict = {}
        paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 50)

        folds = 5
        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)

        predY = bestTree.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)

        # NOTE: everything below is unreachable because of this return
        return
        #Let's compare to the SVM
        learner2 = LibSVM(kernel='gaussian', type="Epsilon_SVR")

        paramDict = {}
        # numpy.float was removed in NumPy 1.24; it was the builtin float
        paramDict["setC"] = 2.0**numpy.arange(-10, 14, 2, dtype=float)
        paramDict["setGamma"] = 2.0**numpy.arange(-10, 4, 2, dtype=float)
        paramDict["setEpsilon"] = learner2.getEpsilons()

        idx = Sampling.crossValidation(folds, trainX.shape[0])
        bestSVM, cvGrid = learner2.parallelModelSelect(trainX, trainY, idx, paramDict)

        predY = bestSVM.predict(testX)
        error = Evaluator.rootMeanSqError(testY, predY)
        print(error)
Exemple #30
0
 def run():
     """Generate cross validation indices and call parallelPen, five times over."""
     for iteration in range(5):
         print("Iteration " + str(iteration))
         foldIndices = Sampling.crossValidation(folds, numExamples)
         learner.parallelPen(X, Y, foldIndices, paramDict, Cvs)
 def run():
     """Generate cross validation indices and call parallelModelSelect, five times over."""
     for iteration in range(5):
         print("Iteration " + str(iteration))
         foldIndices = Sampling.crossValidation(folds, numExamples)
         learner.parallelModelSelect(X, Y, foldIndices, paramDict)
Exemple #32
0
 def run():
     """Generate cross validation indices and call parallelModelSelect, five times over."""
     for iteration in range(5):
         print("Iteration " + str(iteration))
         foldIndices = Sampling.crossValidation(self.folds, numExamples)
         learner.parallelModelSelect(X, Y, foldIndices, self.paramDict)
    meanPenalties = numpy.zeros((numGammas, numEpsilons, numCs))
    meanBetaPenalties = numpy.zeros((numGammas, numEpsilons, numCs))
    meanIdealPenalities = numpy.zeros((numGammas, numEpsilons, numCs))

    for j in range(numRealisations):
        print("")
        logging.debug("j=" + str(j))
        trainX, trainY, testX, testY = loadMethod(dataDir, datasetName, j)
        logging.debug("Loaded dataset with " + str(trainX.shape) +  " train and " + str(testX.shape) + " test examples")
        
        trainInds = numpy.random.permutation(trainX.shape[0])[0:sampleSize]
        trainX = trainX[trainInds,:]
        trainY = trainY[trainInds]
        
        idx = Sampling.crossValidation(folds, trainX.shape[0])

        Cvs = [(folds-1)*alpha, beta[j, sampleSizeInd, :]]    
    
        #Now try penalisation
        methodInd = 0
        resultsList = learner.parallelPen(trainX, trainY, idx, paramDict, Cvs)
        bestLearner, trainErrors, currentPenalties = resultsList[0]
        meanPenalties += currentPenalties
        predY = bestLearner.predict(testX)
                
        #Learning rate penalisation 
        methodInd = 1
        bestLearner, trainErrors, currentPenalties = resultsList[1]
        meanBetaPenalties += currentPenalties
        predY = bestLearner.predict(testX)