def testCrossValidation(self):
    """
    Check the (train, test) index pairs returned by Sampling.crossValidation
    for 2, 3, 4 and numExamples folds over 10 examples, and that invalid
    fold/example counts raise ValueError.
    """
    numExamples = 10
    folds = 2

    def checkFolds(indices, expected):
        # Compare each fold as a (train list, test list) pair so a failure
        # reports both halves of the split at once.
        for i, (trainInds, testInds) in enumerate(expected):
            self.assertEqual((list(indices[i][0]), list(indices[i][1])), (trainInds, testInds))

    indices = Sampling.crossValidation(folds, numExamples)
    checkFolds(indices, [
        ([5, 6, 7, 8, 9], [0, 1, 2, 3, 4]),
        ([0, 1, 2, 3, 4], [5, 6, 7, 8, 9]),
    ])

    indices = Sampling.crossValidation(3, numExamples)
    checkFolds(indices, [
        ([3, 4, 5, 6, 7, 8, 9], [0, 1, 2]),
        ([0, 1, 2, 6, 7, 8, 9], [3, 4, 5]),
        ([0, 1, 2, 3, 4, 5], [6, 7, 8, 9]),
    ])

    indices = Sampling.crossValidation(4, numExamples)
    checkFolds(indices, [
        ([2, 3, 4, 5, 6, 7, 8, 9], [0, 1]),
        ([0, 1, 5, 6, 7, 8, 9], [2, 3, 4]),
        ([0, 1, 2, 3, 4, 7, 8, 9], [5, 6]),
        ([0, 1, 2, 3, 4, 5, 6], [7, 8, 9]),
    ])

    # Leave-one-out: only the first five folds are checked.
    indices = Sampling.crossValidation(numExamples, numExamples)
    checkFolds(indices, [
        ([1, 2, 3, 4, 5, 6, 7, 8, 9], [0]),
        ([0, 2, 3, 4, 5, 6, 7, 8, 9], [1]),
        ([0, 1, 3, 4, 5, 6, 7, 8, 9], [2]),
        ([0, 1, 2, 4, 5, 6, 7, 8, 9], [3]),
        ([0, 1, 2, 3, 5, 6, 7, 8, 9], [4]),
    ])

    # Invalid arguments: more folds than examples, non-positive fold counts
    # and a single example.
    self.assertRaises(ValueError, Sampling.crossValidation, numExamples+1, numExamples)
    self.assertRaises(ValueError, Sampling.crossValidation, 0, numExamples)
    self.assertRaises(ValueError, Sampling.crossValidation, -1, numExamples)
    self.assertRaises(ValueError, Sampling.crossValidation, folds, 1)
def testRepCrossValidation(self):
    """
    For 1 and 2 repetitions, check that the train and test indices of each
    of the first `folds` splits together cover every example index.
    """
    numExamples = 10
    folds = 3

    for repetitions in (1, 2):
        splits = Sampling.repCrossValidation(folds, numExamples, repetitions)

        for foldInd in range(folds):
            covered = numpy.union1d(splits[foldInd][0], splits[foldInd][1])
            self.assertTrue((covered == numpy.arange(numExamples)).all())
def testParallelPen(self):
    """
    Check that parallelPen reports an infinite penalty for every gamma that
    exceeds the mean size of the unpruned trees, and that the penalties for
    smaller gammas are not all infinite.
    """
    #Check if penalisation == inf when treeSize < gamma
    numExamples = 100
    X, y = data.make_regression(numExamples)

    learner = DecisionTreeLearner(pruneType="CART", maxDepth=10, minSplit=2)

    paramDict = {}
    # The builtin int replaces the numpy.int alias, which was removed in NumPy 1.24.
    paramDict["setGamma"] = numpy.array(numpy.round(2**numpy.arange(1, 10, 0.5)-1), dtype=int)

    folds = 3
    alpha = 1.0
    Cvs = numpy.array([(folds-1)*alpha])

    idx = Sampling.crossValidation(folds, X.shape[0])

    resultsList = learner.parallelPen(X, y, idx, paramDict, Cvs)

    learner, trainErrors, currentPenalties = resultsList[0]

    learner.setGamma(2**10)
    treeSize = 0
    #Let's work out the size of the unpruned tree
    for trainInds, testInds in idx:
        trainX = X[trainInds, :]
        trainY = y[trainInds]

        learner.learnModel(trainX, trainY)
        treeSize += learner.tree.size

    # Mean tree size over the folds
    treeSize /= float(folds)

    self.assertTrue(numpy.isinf(currentPenalties[paramDict["setGamma"]>treeSize]).all())
    self.assertTrue(not numpy.isinf(currentPenalties[paramDict["setGamma"]<treeSize]).all())
def testShuffleSplit(self):
    """
    Check that each shuffle split covers all example indices, and that an
    explicit train proportion of 0.5 yields training sets of half the data.
    """
    numExamples = 10
    folds = 5

    # Default train proportion
    splits = Sampling.shuffleSplit(folds, numExamples)
    for foldInd in range(folds):
        covered = numpy.union1d(splits[foldInd][0], splits[foldInd][1])
        self.assertTrue((covered == numpy.arange(numExamples)).all())

    # Explicit train proportion of one half
    splits = Sampling.shuffleSplit(folds, numExamples, 0.5)
    trainSize = numExamples*0.5

    for foldInd in range(folds):
        covered = numpy.union1d(splits[foldInd][0], splits[foldInd][1])
        self.assertTrue((covered == numpy.arange(numExamples)).all())
        self.assertTrue(splits[foldInd][0].shape[0] == trainSize)

    # A proportion that does not divide the examples evenly: there are no
    # assertions here, the call only has to complete.
    splits = Sampling.shuffleSplit(folds, numExamples, 0.55)
def cvModelSelection(self, graph, paramList, paramFunc, folds, errorFunc):
    """
    Evaluate each parameter set by cross validation over the edges of graph.

    ParamList is a list of lists of parameters and paramFunc is a list of the
    corresponding functions to call with the parameters as arguments. Note that
    a parameter can also be a tuple which is expanded out before the function
    is called.

    e.g.
    paramList = [[1, 2], [2, 1], [12, 1]]
    paramFunc = [predictor.setC, predictor.setD]

    :param graph: The graph whose edges are split into train and test folds
    :param paramList: A list of parameter sets to try
    :param paramFunc: The setter functions, one per entry of a parameter set
    :param folds: The number of cross validation folds
    :param errorFunc: Called as errorFunc(y, predY) to score each fold
    :return: The mean and standard deviation of the fold errors, one entry per parameter set
    """
    inds = Sampling.crossValidation(folds, graph.getNumEdges())
    errors = numpy.zeros((len(paramList), folds))
    allEdges = graph.getAllEdges()

    for i, paramSet in enumerate(paramList):
        logging.debug("Using paramSet=" + str(paramSet))

        # Apply every parameter through its matching setter
        for j, param in enumerate(paramSet):
            if isinstance(param, tuple):
                paramFunc[j](*param)
            else:
                paramFunc[j](param)

        for j, (trainInds, testInds) in enumerate(inds):
            trainEdges = allEdges[trainInds, :]
            testEdges = allEdges[testInds, :]

            trainGraph = SparseGraph(graph.getVertexList(), graph.isUndirected())
            trainGraph.addEdges(trainEdges, graph.getEdgeValues(trainEdges))

            testGraph = SparseGraph(graph.getVertexList(), graph.isUndirected())
            testGraph.addEdges(testEdges, graph.getEdgeValues(testEdges))

            self.learnModel(trainGraph)

            predY = self.predictEdges(testGraph, testGraph.getAllEdges())
            y = testGraph.getEdgeValues(testGraph.getAllEdges())
            #Note that the order the edges is different in testGraphs as
            #opposed to graph when calling getAllEdges()

            errors[i, j] = errorFunc(y, predY)

        # NOTE(review): placed after the fold loop, so this logs the mean
        # over all folds for the current parameter set - confirm against
        # the original layout.
        logging.info("Error of current fold: " + str(numpy.mean(errors[i, :])))

    meanErrors = numpy.mean(errors, 1)
    stdErrors = numpy.std(errors, 1)

    return meanErrors, stdErrors
def cvPrune(self, validX, validY): """ We do something like reduced error pruning but we use cross validation to decide which nodes to prune. """ #First set the value of the vertices using the training set. #Reset all alphas to zero inds = Sampling.crossValidation(self.folds, validX.shape[0]) for i in self.tree.getAllVertexIds(): self.tree.getVertex(i).setAlpha(0.0) self.tree.getVertex(i).setTestError(0.0) for trainInds, testInds in inds: rootId = (0,) root = self.tree.getVertex(rootId) root.setTrainInds(trainInds) root.setTestInds(testInds) root.tempValue = numpy.mean(validY[trainInds]) nodeStack = [(rootId, root.tempValue)] while len(nodeStack) != 0: (nodeId, value) = nodeStack.pop() node = self.tree.getVertex(nodeId) tempTrainInds = node.getTrainInds() tempTestInds = node.getTestInds() node.setTestError(numpy.sum((validY[tempTestInds] - node.tempValue)**2) + node.getTestError()) childIds = [self.getLeftChildId(nodeId), self.getRightChildId(nodeId)] for childId in childIds: if self.tree.vertexExists(childId): child = self.tree.getVertex(childId) if childId[-1] == 0: childInds = validX[tempTrainInds, node.getFeatureInd()] < node.getThreshold() else: childInds = validX[tempTrainInds, node.getFeatureInd()] >= node.getThreshold() if childInds.sum() !=0: value = numpy.mean(validY[tempTrainInds[childInds]]) child.tempValue = value child.setTrainInds(tempTrainInds[childInds]) nodeStack.append((childId, value)) if childId[-1] == 0: childInds = validX[tempTestInds, node.getFeatureInd()] < node.getThreshold() else: childInds = validX[tempTestInds, node.getFeatureInd()] >= node.getThreshold() child.setTestInds(tempTestInds[childInds]) self.computeAlphas() self.prune()
def testBootstrap2(self):
    """
    Check the shape of each bootstrap2 split: the training indices have
    numExamples entries, the test indices have fewer, and together they
    cover every example index.
    """
    numExamples = 10
    folds = 2

    indices = Sampling.bootstrap2(folds, numExamples)

    for i in range(folds):
        self.assertEqual(indices[i][0].shape[0], numExamples)
        self.assertTrue(indices[i][1].shape[0] < numExamples)
        # The coverage check now uses fold i; it previously always
        # inspected fold 0.
        self.assertTrue((numpy.union1d(indices[i][0], indices[i][1]) == numpy.arange(numExamples)).all())
def generateLearner(self, X, y):
    """
    Train using the given examples and labels, and use model selection to
    find the best parameters.

    :param X: The examples as rows
    :param y: The labels, which must take exactly two distinct values
    :return: The learner returned by getBestLearner for the selected parameters
    :raises ValueError: If y does not contain exactly two distinct values
    """
    if numpy.unique(y).shape[0] != 2:
        print(y)
        raise ValueError("Can only operate on binary data")

    #Do model selection first
    if self.sampleSize is None:
        idx = Sampling.crossValidation(self.folds, X.shape[0])
        _, meanErrors = self.parallelModelSelect(X, y, idx, self.paramDict)
    else:
        # Model selection on a random subsample of sampleSize examples
        idx = Sampling.crossValidation(self.folds, self.sampleSize)
        inds = numpy.random.permutation(X.shape[0])[0:self.sampleSize]
        _, meanErrors = self.parallelModelSelect(X[inds, :], y[inds], idx, self.paramDict)

    # The final learner is built from the error grid using all of X and y
    learner = self.getBestLearner(meanErrors, self.paramDict, X, y)

    return learner
def evaluateCv(self, X, y, folds, metricMethod=Evaluator.binaryError):
    """
    Compute the cross validation according to a given metric.

    :param X: The examples
    :param y: The labels; the number of examples is taken from y.shape[0]
    :param folds: The number of folds, checked by Parameter.checkInt against the range 2 to infinity
    :param metricMethod: The metric handed to AbstractPredictor.evaluateLearn
    :return: A tuple (mean, variance) of the metrics, taken along axis 0
    """
    Parameter.checkInt(folds, 2, float('inf'))

    foldInds = Sampling.crossValidation(folds, y.shape[0])
    foldMetrics = AbstractPredictor.evaluateLearn(X, y, foldInds, self.learnModel, self.predict, metricMethod)

    return (numpy.mean(foldMetrics, 0), numpy.var(foldMetrics, 0))
def processSimpleDataset(name, numRealisations, split, ext=".csv", delimiter=",", usecols=None, skiprows=1, converters=None):
    """
    Load a delimited text dataset, treat the last column as the label, and
    save numRealisations shuffle splits of it through preprocessSave.

    The loading arguments (delimiter, usecols, skiprows, converters) are
    passed straight to numpy.loadtxt. The random seed is fixed at 21.
    """
    numpy.random.seed(21)

    regressionDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = regressionDir + name + ext
    print("Loading data from file " + fileName)
    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "/"

    table = numpy.loadtxt(fileName, delimiter=delimiter, skiprows=skiprows, usecols=usecols, converters=converters)
    # Every column but the last is a feature; the last is the label
    features = table[:, :-1]
    labels = table[:, -1]

    splitInds = Sampling.shuffleSplit(numRealisations, features.shape[0], split)
    preprocessSave(features, labels, outputDir, splitInds)
def testParallelPenaltyGrid(self):
    """
    Run RandomForest.parallelPenaltyGrid over the min split / max depth grid,
    training on the first 40 examples. There are no assertions: the call
    only has to complete.
    """
    folds = 3
    # The fold indices are computed but not used by the call below
    foldInds = Sampling.crossValidation(folds, self.X.shape[0])
    forest = RandomForest()

    trainX, trainY = self.X[0:40, :], self.y[0:40]

    paramDict = {
        "setMinSplit": forest.getMinSplits(),
        "setMaxDepth": forest.getMaxDepths(),
    }

    idealPenalties = forest.parallelPenaltyGrid(trainX, trainY, self.X, self.y, paramDict)
def testParallelModelSelect(self):
    """
    Run SoftImpute.parallelModelSelect on a random 10 by 10 sparse matrix
    over three values of k. There are no assertions: the call only has to
    complete.
    """
    X = scipy.sparse.rand(10, 10, 0.5).tocsr()
    # The cross validation splits are over the stored entries of X
    numEntries = X.getnnz()

    paramDict = {"setK": numpy.array([5, 10, 20])}

    folds = 3
    idx = Sampling.randCrossValidation(folds, numEntries)

    lmbdas = numpy.array([0.1])
    imputer = SoftImpute(lmbdas, k=10)
    learner, meanErrors = imputer.parallelModelSelect(X, idx, paramDict)
def testParallelPenaltyGrid(self):
    """
    Run DecisionTree.parallelVfcv followed by parallelPenaltyGrid over the
    min split / max depth grid, training on the first 40 examples. There are
    no assertions: the calls only have to complete.
    """
    folds = 3
    foldInds = Sampling.crossValidation(folds, self.X.shape[0])
    tree = DecisionTree()
    bestLearner, meanErrors = tree.parallelVfcv(self.X, self.y, foldInds)

    trainX, trainY = self.X[0:40, :], self.y[0:40]

    paramDict = {
        "setMinSplit": tree.getMinSplits(),
        "setMaxDepth": tree.getMaxDepths(),
    }

    idealPenalties = tree.parallelPenaltyGrid(trainX, trainY, self.X, self.y, paramDict)
def recommend(learner):
    """
    Take a list of coauthors and read in the complete graph into a sparse
    matrix X such that X_ij = k means author i has worked with j, k times.
    Then evaluate matrix factorisation on the resulting matrix.

    The non-zero entries of the first numExamples rows are split into
    numFolds cross validation folds, computeTestError is run on each
    (train, test, learner) triple in a process pool, and the mean test error
    is saved with numpy.savez.

    :param learner: The learner handed to computeTestError; its name() is used in the results file name
    """
    outputDir = PathDefaults.getOutputDir() + "erasm/"
    matrixFileName = outputDir + "Toy"

    numExamples = 50
    numFolds = 5

    X = scipy.io.mmread(matrixFileName)
    X = scipy.sparse.csr_matrix(X)
    logging.debug("Loaded matrix " + str(X.shape) + " with " + str(X.getnnz()) + " non zeros")
    X = X[0:numExamples ,:]
    X, maxS = preprocess(X)

    #Take out some ratings to form a training set
    rowInds, colInds = X.nonzero()
    # The entries are shuffled, so the contiguous folds below are random subsets
    randInds = numpy.random.permutation(rowInds.shape[0])
    indexList = Sampling.crossValidation(numFolds, rowInds.shape[0])

    paramList = []
    for trnIdx, tstIdx in indexList:
        trainInds = randInds[trnIdx]
        testInds = randInds[tstIdx]

        trainX = SparseUtils.selectMatrix(X, rowInds[trainInds], colInds[trainInds]).tocsr()
        testX = SparseUtils.selectMatrix(X, rowInds[testInds], colInds[testInds]).tocsr()

        paramList.append((trainX, testX, learner))

    pool = multiprocessing.Pool(processes=multiprocessing.cpu_count())
    try:
        results = pool.map(computeTestError, paramList)
    finally:
        # Always release the worker processes, even if a fold raised
        pool.close()
        pool.join()

    testErrors = numpy.array(results)
    meanTestErrors = testErrors.mean()
    logging.debug("Test errors = " + str(meanTestErrors))

    errorFileName = outputDir + "results_" + learner.name()
    numpy.savez(errorFileName, meanTestErrors)
    logging.debug("Saved results as " + errorFileName)
def processParkinsonsDataset(name, numRealisations):
    """
    Load the named ".data" file and save shuffle splits for two regression
    targets: column 5 (saved under "<name>-motor/") and column 6 (saved
    under "<name>-total/"). All remaining columns are used as features and
    the same splits are used for both targets. The random seed is fixed
    at 21.
    """
    numpy.random.seed(21)
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
    fileName = dataDir + name + ".data"

    XY = numpy.loadtxt(fileName, delimiter=",", skiprows=1)
    # Feature columns in ascending order; this no longer relies on the
    # iteration order of a set difference.
    targetCols = (5, 6)
    inds = [col for col in range(XY.shape[1]) if col not in targetCols]
    X = XY[:, inds]

    y1 = XY[:, 5]
    y2 = XY[:, 6]
    #We don't keep whole collections of patients
    split = 0.5

    idx = Sampling.shuffleSplit(numRealisations, X.shape[0], split)

    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "-motor/"
    preprocessSave(X, y1, outputDir, idx)

    outputDir = PathDefaults.getDataDir() + "modelPenalisation/regression/" + name + "-total/"
    preprocessSave(X, y2, outputDir, idx)
def run():
    """
    Call learner.parallelPen twice, each time on freshly computed cross
    validation folds, announcing every iteration on stdout.
    """
    for iteration in range(2):
        print("Iteration " + str(iteration))
        foldInds = Sampling.crossValidation(self.folds, numExamples)
        learner.parallelPen(X, Y, foldInds, self.paramDict, Cvs)
def testParallelVfcv(self):
    """
    Run DecisionTree.parallelVfcv on 3 cross validation folds of the fixture
    data. There are no assertions: the call only has to complete.
    """
    folds = 3
    foldInds = Sampling.crossValidation(folds, self.X.shape[0])

    tree = DecisionTree()
    bestLearner, meanErrors = tree.parallelVfcv(self.X, self.y, foldInds)
def run():
    """
    Call learner.parallelModelSelect five times, each time on freshly
    computed cross validation folds, announcing every iteration on stdout.
    """
    for iteration in range(5):
        print("Iteration " + str(iteration))
        foldInds = Sampling.crossValidation(folds, numExamples)
        learner.parallelModelSelect(X, Y, foldInds, paramDict)
def learningRate(self, X, y, foldsSet, paramDict):
    """
    Find a matrix beta which has the same dimensions as the parameter grid.
    Each value in the grid represents the learning rate with respect to
    those particular parameters.

    For every fold count in foldsSet the penalties are computed with
    parallelPen. Then, for each grid point, a linear regression is fitted
    over the finite, positive penalties and the negated slope is returned.
    Grid points with fewer than two usable penalties keep a value of -1.

    :param X: The examples as rows
    :type X: :class:`numpy.ndarray`

    :param y: The binary -1/+1 labels
    :type y: :class:`numpy.ndarray`

    :param foldsSet: A list of folds to try.
    :type foldsSet: :class:`numpy.ndarray`

    :param paramDict: A dictionary index by the method name and with value as an array of values
    :type paramDict: :class:`dict`

    :return: The negated grid of fitted slopes
    :raises ImportError: If sklearn is not installed
    """
    # Imported here so that sklearn is only needed when this method is used
    from sklearn import linear_model

    gridSize = []
    gridInds = []
    for key in paramDict:
        gridSize.append(paramDict[key].shape[0])
        gridInds.append(numpy.arange(paramDict[key].shape[0]))

    betaGrid = numpy.ones(tuple(gridSize))

    # penalties has one leading axis for the fold counts, then the grid axes
    gridSize.insert(0, foldsSet.shape[0])
    penalties = numpy.zeros(tuple(gridSize))
    Cvs = numpy.array([1])

    for i in range(foldsSet.shape[0]):
        folds = foldsSet[i]
        logging.debug("Folds " + str(folds))

        idx = Sampling.crossValidation(folds, X.shape[0])
        resultsList = self.parallelPen(X, y, idx, paramDict, Cvs)
        bestLearner, trainErrors, currentPenalties = resultsList[0]
        penalties[i, :] = currentPenalties

    for inds in itertools.product(*gridInds):
        # Select the penalties of this grid point across all fold counts
        inds2 = (slice(0, penalties.shape[0]),) + tuple(inds)
        tempPenalties = penalties[inds2]

        penInds = numpy.logical_and(numpy.isfinite(tempPenalties), tempPenalties>0)
        penInds = numpy.squeeze(penInds)
        tempPenalties = tempPenalties[penInds].flatten()
        # The builtin float replaces numpy.float, which was removed in NumPy 1.24
        tempfoldsSet = numpy.array(foldsSet, float)[penInds]

        # At least two points are needed to fit a slope
        if tempPenalties.shape[0] > 1:
            xp = numpy.log((tempfoldsSet-1)/tempfoldsSet*X.shape[0])
            yp = numpy.log(tempPenalties)+numpy.log(tempfoldsSet)

            clf = linear_model.LinearRegression()
            clf.fit(numpy.array([xp]).T, yp)
            betaGrid[inds] = clf.coef_[0]

    return -betaGrid
def runExperiment(self):
    """
    Run the selected clustering experiments and save results

    Two experiments are available, each enabled by a flag on self.algoArgs:
    soft impute (once per entry of algoArgs.svdAlgs) and SGD matrix
    factorisation. Each run is guarded by a FileLock on its results file and
    is skipped when that file is locked or already exists.
    """
    if self.algoArgs.runSoftImpute:
        logging.debug("Running soft impute")

        for svdAlg in self.algoArgs.svdAlgs:
            # The randomised SVD variants also encode p and q in the file name
            if svdAlg == "rsvd" or svdAlg == "rsvdUpdate" or svdAlg == "rsvdUpdate2":
                resultsFileName = self.resultsDir + "ResultsSoftImpute_alg=" + svdAlg + "_p=" + str(self.algoArgs.p)+ "_q=" + str(self.algoArgs.q) + "_updateAlg=" + self.algoArgs.updateAlg + ".npz"
            else:
                resultsFileName = self.resultsDir + "ResultsSoftImpute_alg=" + svdAlg + "_updateAlg=" + self.algoArgs.updateAlg + ".npz"

            fileLock = FileLock(resultsFileName)

            if not fileLock.isLocked() and not fileLock.fileExists():
                fileLock.lock()

                # The finally clause releases the lock even if the run fails
                try:
                    learner = IterativeSoftImpute(svdAlg=svdAlg, logStep=self.logStep, kmax=self.algoArgs.kmax, postProcess=self.algoArgs.postProcess, weighted=self.algoArgs.weighted, p=self.algoArgs.p, q=self.algoArgs.q, verbose=self.algoArgs.verbose, updateAlg=self.algoArgs.updateAlg)

                    if self.algoArgs.modelSelect:
                        trainIterator = self.getTrainIterator()
                        #Let's find the optimal lambda using the first matrix
                        X = trainIterator.next()
                        logging.debug("Performing model selection, taking subsample of entries of size " + str(self.sampleSize))
                        X = SparseUtils.submatrix(X, self.sampleSize)

                        cvInds = Sampling.randCrossValidation(self.algoArgs.folds, X.nnz)
                        meanErrors, stdErrors = learner.modelSelect(X, self.algoArgs.rhos, self.algoArgs.ks, cvInds)

                        logging.debug("Mean errors = " + str(meanErrors))
                        logging.debug("Std errors = " + str(stdErrors))

                        # The model selection grid is saved next to the results file
                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanErrors, stdErrors)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                        # Pick the (rho, k) pair at the minimum of the error
                        # grid: axis 0 indexes rhos, axis 1 indexes ks
                        rho = self.algoArgs.rhos[numpy.unravel_index(numpy.argmin(meanErrors), meanErrors.shape)[0]]
                        k = self.algoArgs.ks[numpy.unravel_index(numpy.argmin(meanErrors), meanErrors.shape)[1]]
                    else:
                        # Without model selection the first candidates are used
                        rho = self.algoArgs.rhos[0]
                        k = self.algoArgs.ks[0]

                    learner.setK(k)
                    learner.setRho(rho)
                    logging.debug(learner)
                    # A fresh iterator is fetched for the actual training run
                    trainIterator = self.getTrainIterator()
                    ZIter = learner.learnModel(trainIterator)

                    self.recordResults(ZIter, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runSgdMf:
        logging.debug("Running SGD MF")

        resultsFileName = self.resultsDir + "ResultsSgdMf.npz"
        fileLock = FileLock(resultsFileName)

        if not fileLock.isLocked() and not fileLock.fileExists():
            fileLock.lock()

            # The finally clause releases the lock even if the run fails
            try:
                learner = IterativeSGDNorm2Reg(k=self.algoArgs.ks[0], lmbda=self.algoArgs.lmbdas[0], gamma=self.algoArgs.gammas[0], eps=self.algoArgs.eps)

                if self.algoArgs.modelSelect:
                    # Let's find optimal parameters using the first matrix
                    learner.modelSelect(self.getTrainIterator().next(), self.algoArgs.ks, self.algoArgs.lmbdas, self.algoArgs.gammas, self.algoArgs.folds)
                    # NOTE(review): this assignment is repeated immediately
                    # below; the nesting was reconstructed from a collapsed
                    # line - confirm which of the two belongs in this branch.
                    trainIterator = self.getTrainIterator()

                trainIterator = self.getTrainIterator()
                ZIter = learner.learnModel(trainIterator)

                self.recordResults(ZIter, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    logging.info("All done: see you around!")
# Accumulators over the realisations, one entry per (gamma, epsilon, C)
# grid point.
meanPenalties = numpy.zeros((numGammas, numEpsilons, numCs))
meanBetaPenalties = numpy.zeros((numGammas, numEpsilons, numCs))
meanIdealPenalities = numpy.zeros((numGammas, numEpsilons, numCs))

for j in range(numRealisations):
    print("")
    logging.debug("j=" + str(j))
    trainX, trainY, testX, testY = loadMethod(dataDir, datasetName, j)
    logging.debug("Loaded dataset with " + str(trainX.shape) + " train and " + str(testX.shape) + " test examples")

    # Keep a random subsample of at most sampleSize training examples
    trainInds = numpy.random.permutation(trainX.shape[0])[0:sampleSize]
    trainX = trainX[trainInds,:]
    trainY = trainY[trainInds]

    idx = Sampling.crossValidation(folds, trainX.shape[0])
    # Two penalty constants are evaluated in one parallelPen call; their
    # results come back as resultsList[0] and resultsList[1]
    Cvs = [(folds-1)*alpha, beta[j, sampleSizeInd, :]]

    #Now try penalisation
    methodInd = 0
    resultsList = learner.parallelPen(trainX, trainY, idx, paramDict, Cvs)
    bestLearner, trainErrors, currentPenalties = resultsList[0]
    meanPenalties += currentPenalties
    predY = bestLearner.predict(testX)

    #Learning rate penalisation
    methodInd = 1
    bestLearner, trainErrors, currentPenalties = resultsList[1]
    meanBetaPenalties += currentPenalties
    predY = bestLearner.predict(testX)
def shuffleSplit90(repetitions, numExamples):
    """
    Shuffle split with a split argument of 0.9: a thin wrapper around
    Sampling.shuffleSplit with that value fixed.
    """
    trainProportion = 0.9
    return Sampling.shuffleSplit(repetitions, numExamples, trainProportion)
def testModelSelect(self):
    """
    We test the results on some data and compare to SVR.

    Model selection is run for a REP-CV pruned tree and a CART pruned tree
    and the test root mean squared error of each is printed. There are no
    assertions. The SVR comparison after the early return is currently
    disabled.
    """
    numExamples = 200
    X, y = data.make_regression(numExamples, noise=0.5)
    X = Standardiser().standardiseArray(X)
    y = Standardiser().standardiseArray(y)

    trainX = X[0:100, :]
    trainY = y[0:100]
    testX = X[100:, :]
    testY = y[100:]

    learner = DecisionTreeLearner(maxDepth=20, minSplit=10, pruneType="REP-CV")
    learner.setPruneCV(8)

    paramDict = {}
    paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 10)
    # The builtin int replaces numpy.int, which was removed in NumPy 1.24
    paramDict["setPruneCV"] = numpy.arange(6, 11, 2, int)

    folds = 5
    idx = Sampling.crossValidation(folds, trainX.shape[0])
    bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)

    predY = bestTree.predict(testX)
    error = Evaluator.rootMeanSqError(testY, predY)
    print(error)

    learner = DecisionTreeLearner(maxDepth=20, minSplit=5, pruneType="CART")

    paramDict = {}
    paramDict["setGamma"] = numpy.linspace(0.0, 1.0, 50)

    folds = 5
    idx = Sampling.crossValidation(folds, trainX.shape[0])
    bestTree, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)

    predY = bestTree.predict(testX)
    error = Evaluator.rootMeanSqError(testY, predY)
    print(error)

    # NOTE: the early return deliberately disables the SVR comparison below.
    return

    #Let's compare to the SVM
    learner2 = LibSVM(kernel='gaussian', type="Epsilon_SVR")

    paramDict = {}
    # The builtin float replaces numpy.float, which was removed in NumPy 1.24
    paramDict["setC"] = 2.0**numpy.arange(-10, 14, 2, dtype=float)
    paramDict["setGamma"] = 2.0**numpy.arange(-10, 4, 2, dtype=float)
    paramDict["setEpsilon"] = learner2.getEpsilons()

    idx = Sampling.crossValidation(folds, trainX.shape[0])
    bestSVM, cvGrid = learner2.parallelModelSelect(trainX, trainY, idx, paramDict)

    predY = bestSVM.predict(testX)
    error = Evaluator.rootMeanSqError(testY, predY)
    print(error)
def repCrossValidation3(folds, numExamples):
    """
    Repeated cross validation with the number of repetitions fixed at 3: a
    thin wrapper around Sampling.repCrossValidation.
    """
    numRepetitions = 3
    return Sampling.repCrossValidation(folds, numExamples, repetitions=numRepetitions)
idx = sampleMethod(folds, validY.shape[0]) svmGridResults = learner.parallelPen(validX, validY, idx, paramDict, Cvs) for result in svmGridResults: learner, trainErrors, currentPenalties = result print(numpy.mean(trainErrors), numpy.mean(currentPenalties)) """ #Figure out why the penalty is increasing X = trainX y = trainY for i in range(foldsSet.shape[0]): folds = foldsSet[i] idx = Sampling.crossValidation(folds, validX.shape[0]) penalty = 0 fullError = 0 trainError = 0 learner.learnModel(validX, validY) predY = learner.predict(X) predValidY = learner.predict(validX) idealPenalty = Evaluator.rootMeanSqError(predY, y) - Evaluator.rootMeanSqError(predValidY, validY) for trainInds, testInds in idx: trainX = validX[trainInds, :] trainY = validY[trainInds] #learner.setGamma(gamma)
def saveResults(self, labelIndex):
    """
    Compute the results and save them for a particular hormone. Does so for
    all leafranks

    Three families of learners are evaluated through self.saveResult, each
    written to its own file: RBF SVR, polynomial SVR and Lasso. The features
    are the selected rows of self.X with the ages appended as an extra
    column, and both X and Y are standardised.

    :param labelIndex: Index into self.YList and self.labelNames of the label to process
    """
    folds = 5
    # self.X is either one array shared by all labels, or indexed per label
    if isinstance(self.X, numpy.ndarray):
        X = self.X[self.YList[labelIndex][1], :]
    else:
        X = self.X[labelIndex][self.YList[labelIndex][1], :]

    X = numpy.c_[X, self.ages[self.YList[labelIndex][1]]]
    Y = self.YList[labelIndex][0]
    numExamples = X.shape[0]

    logging.debug("Shape of examples: " + str(X.shape))

    standardiserX = Standardiser()
    X = standardiserX.standardiseArray(X)

    standardiserY = Standardiser()
    Y = standardiserY.standardiseArray(Y)

    #We need to include the ROC curves
    indexList = Sampling.crossValidation(folds, numExamples)
    splitFunction = lambda trainX, trainY: Sampling.crossValidation(folds, trainX.shape[0])

    #We need a metric to minimise
    def invMeanAUC(predY, testY):
        return 1 - self.meanAUC(predY, testY, labelIndex, standardiserY)

    metricMethods = [invMeanAUC]

    #Now create a learnerIterator based on the SVM
    # The builtin float replaces numpy.float, which was removed in NumPy 1.24
    Cs = 2**numpy.arange(-8, 2, dtype=float)
    gammas = 2**numpy.arange(-10, 0, dtype=float)
    epsilons = 2**numpy.arange(-5, 0, dtype=float)

    fileName = self.resultsDir + self.labelNames[labelIndex] + "-svr_rbf-" + self.featuresName + ".dat"
    learnerIterator = []

    for C in Cs:
        for gamma in gammas:
            for epsilon in epsilons:
                learner = svm.SVR(C=C, gamma=gamma, epsilon=epsilon)
                # saveResult expects a learnModel method; alias it to fit
                learner.learnModel = learner.fit
                learnerIterator.append(learner)

    self.saveResult(X, Y, indexList, splitFunction, learnerIterator, metricMethods, fileName, labelIndex, standardiserY)

    #Try the polynomial SVM
    fileName = self.resultsDir + self.labelNames[labelIndex] + "-svr_poly-" + self.featuresName + ".dat"
    degrees = numpy.array([2, 3])
    # Start from an empty list so the polynomial results do not re-evaluate
    # the RBF learners built above (the Lasso section already resets it).
    learnerIterator = []

    for C in Cs:
        for degree in degrees:
            for epsilon in epsilons:
                learner = svm.SVR(kernel='poly', C=C, degree=degree, epsilon=epsilon)
                learner.learnModel = learner.fit
                learnerIterator.append(learner)

    self.saveResult(X, Y, indexList, splitFunction, learnerIterator, metricMethods, fileName, labelIndex, standardiserY)

    #Now try Lasso and ElasticNet
    fileName = self.resultsDir + self.labelNames[labelIndex] + "-lasso-" + self.featuresName + ".dat"
    alphas = 2**numpy.arange(-9, 0, dtype=float)
    learnerIterator = []

    for alpha in alphas:
        learner = linear_model.Lasso(alpha = alpha)
        learner.learnModel = learner.fit
        learnerIterator.append(learner)

    self.saveResult(X, Y, indexList, splitFunction, learnerIterator, metricMethods, fileName, labelIndex, standardiserY)