def testBayesError(self):
    """
    Grid-search C and gamma for a Gaussian-kernel LibSVM on the toy dataset,
    print the Bayes error (computed on the grid points) next to the binary
    error on the held-out examples for every pair, then plot the decision
    surface of the best pair and a scatter plot of the data.

    The first 100 examples of the loaded data are used for training and the
    remainder for the held-out binary error.
    """
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    data = numpy.load(dataDir + "toyData.npz")
    gridPoints, X, y, pdfX, pdfY1X, pdfYminus1X = (
        data["arr_0"],
        data["arr_1"],
        data["arr_2"],
        data["arr_3"],
        data["arr_4"],
        data["arr_5"],
    )

    sampleSize = 100
    trainX, trainY = X[0:sampleSize, :], y[0:sampleSize]
    testX, testY = X[sampleSize:, :], y[sampleSize:]

    #We form a test set from the grid points
    numPoints = gridPoints.shape[0]
    gridShape = (numPoints, numPoints)
    gridX = numpy.zeros((numPoints**2, 2))
    for m in range(numPoints):
        gridX[m*numPoints:(m+1)*numPoints, 0] = gridPoints
        gridX[m*numPoints:(m+1)*numPoints, 1] = gridPoints[m]

    # numpy.float was only an alias of the builtin float and no longer exists
    # in recent NumPy releases, so the builtin is used directly.
    Cs = 2**numpy.arange(-5, 5, dtype=float)
    gammas = 2**numpy.arange(-5, 5, dtype=float)

    # Start from +inf so that the first evaluated pair always becomes the
    # incumbent and bestC/bestGamma are guaranteed to be set.
    bestError = float("inf")
    bestC, bestGamma = None, None

    for C in Cs:
        for gamma in gammas:
            svm = LibSVM(kernel="gaussian", C=C, kernelParam=gamma)
            svm.learnModel(trainX, trainY)

            predY, decisionsY = svm.predict(gridX, True)
            decisionGrid = numpy.reshape(decisionsY, gridShape, order="F")
            error = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)

            predY, decisionsY = svm.predict(testX, True)
            error2 = Evaluator.binaryError(testY, predY)
            print(error, error2)

            if error < bestError:
                # Record the new minimum (the original assigned the other
                # way round, so the minimum was never tracked).
                bestError = error
                bestC = C
                bestGamma = gamma

    svm = LibSVM(kernel="gaussian", C=bestC, kernelParam=bestGamma)
    svm.learnModel(trainX, trainY)
    predY, decisionsY = svm.predict(gridX, True)
    # Rebuild the grid from the retrained best model; otherwise the plot would
    # show the decision values of the last (C, gamma) pair tried.
    decisionGrid = numpy.reshape(decisionsY, gridShape, order="F")

    plt.figure(0)
    plt.contourf(gridPoints, gridPoints, decisionGrid, 100)
    plt.colorbar()

    plt.figure(1)
    # Legend labels now match the class selected by each mask.
    plt.scatter(X[y==1, 0], X[y==1, 1], c='r', label="+1")
    plt.scatter(X[y==-1, 0], X[y==-1, 1], c='b', label="-1")
    plt.legend()
    plt.show()
import multiprocessing
import sys
from apgl.predictors.LibSVM import LibSVM, computeTestError
from apgl.predictors.DecisionTree import DecisionTree
from sandbox.util.FileLock import FileLock
from sandbox.util.PathDefaults import PathDefaults
from sandbox.util.Sampling import Sampling
from sandbox.util.Evaluator import Evaluator
from sandbox.util.Util import Util
from apgl.modelselect.ModelSelectUtils import ModelSelectUtils
import logging
import numpy
import os

# Script set-up: pick one regression dataset, load one train/test split of it
# and configure an epsilon-SVR learner together with its parameter grid.
datasets = ModelSelectUtils.getRegressionDatasets()
numProcesses = 8
dataDir = PathDefaults.getDataDir() + "modelPenalisation/regression/"
datasetName = datasets[9]
print(datasetName)

# Third argument of loadRegressDataset -- presumably the index of the
# realisation/split to load; confirm against ModelSelectUtils.
j = 0
trainX, trainY, testX, testY = ModelSelectUtils.loadRegressDataset(dataDir, datasetName, j)

learner = LibSVM(kernel='gaussian', type="Epsilon_SVR", processes=numProcesses)

# Powers of two for the parameter grid. The builtin float replaces
# numpy.float, which was only an alias for it and no longer exists in recent
# NumPy releases.
paramDict = {}
paramDict["setC"] = 2.0**numpy.arange(-10, 14, 2, dtype=float)
paramDict["setGamma"] = 2.0**numpy.arange(-10, 4, 2, dtype=float)
def runToyExp(datasetNames, sampleSizes, foldsSet, cvScalings, sampleMethods, numProcesses, fileNameSuffix):
    """
    Run the toy model selection experiment and save one results file per
    (dataset, sampling method) pair.

    For every realisation, sample size and number of folds, a random subsample
    of the data is drawn. On it the function computes the grid of ideal
    penalties, selects an SVM by V-fold cross validation, and selects SVMs by
    V-fold penalisation (one per penalty constant). Each selected SVM is scored
    with ModelSelectUtils.bayesError on a test set built from the grid points.

    A FileLock on the output file is used to skip results which already exist
    or which are locked; the lock is always released, even when the
    computation fails.

    :param datasetNames: sequence of entries where element 0 is the dataset
        name and element 1 is the number of realisations to run.
    :param sampleSizes: array of sample sizes (indexed and sized via shape[0]).
    :param foldsSet: array of numbers of folds.
    :param cvScalings: numpy array of penalty scalings; each is multiplied by
        (folds - 1) and a log-based constant (labelled "BIC penalisation"
        in the code) is prepended.
    :param sampleMethods: sequence of entries where element 0 is a name used
        in the output file name and element 1 is a function called as
        sampleMethod(folds, numExamples).
    :param numProcesses: passed to LibSVM as its ``processes`` argument.
    :param fileNameSuffix: string appended to the output file name.
    """
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/"

    # A default learner is created only to size the (C, gamma) result grids.
    svm = LibSVM()
    numCs = svm.getCs().shape[0]
    numGammas = svm.getGammas().shape[0]
    # Method 0 is cross validation, method 1 the log-based penalty constant,
    # followed by one method per entry of cvScalings.
    numMethods = 1 + (1 + cvScalings.shape[0])
    numParams = 2

    runIdeal = True
    runCv = True
    runVfpen = True

    for datasetInfo in datasetNames:
        datasetName = datasetInfo[0]
        numRealisations = datasetInfo[1]
        logging.debug("Learning using dataset " + datasetName)

        for sampleMethodInfo in sampleMethods:
            sampleMethod = sampleMethodInfo[1]
            outfileName = outputDir + datasetName + sampleMethodInfo[0] + fileNameSuffix
            fileLock = FileLock(outfileName + ".npz")

            if fileLock.isLocked() or fileLock.fileExists():
                logging.debug("Results already computed")
                continue

            fileLock.lock()
            try:
                resultShape = (numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods)
                gridDims = (numCs, numGammas)
                errors = numpy.zeros(resultShape)
                params = numpy.zeros(resultShape + (numParams,))
                errorGrids = numpy.zeros(resultShape + gridDims)
                approxGrids = numpy.zeros(resultShape + gridDims)
                # The ideal grids do not depend on the selection method.
                idealGrids = numpy.zeros(resultShape[:-1] + gridDims)

                data = numpy.load(dataDir + datasetName + ".npz")
                gridPoints, trainX, trainY, pdfX, pdfY1X, pdfYminus1X = (
                    data["arr_0"],
                    data["arr_1"],
                    data["arr_2"],
                    data["arr_3"],
                    data["arr_4"],
                    data["arr_5"],
                )

                # We form a test set from the grid points
                numPoints = gridPoints.shape[0]
                gridShape = (numPoints, numPoints)
                testX = numpy.zeros((numPoints ** 2, 2))
                for m in range(numPoints):
                    testX[m * numPoints : (m + 1) * numPoints, 0] = gridPoints
                    testX[m * numPoints : (m + 1) * numPoints, 1] = gridPoints[m]

                for j in range(numRealisations):
                    Util.printIteration(j, 1, numRealisations, "Realisation: ")

                    for k in range(sampleSizes.shape[0]):
                        sampleSize = sampleSizes[k]

                        for m in range(foldsSet.shape[0]):
                            folds = foldsSet[m]
                            logging.debug(
                                "Using sample size " + str(sampleSize) + " and " + str(folds) + " folds"
                            )

                            # Draw a fresh random subsample for this setting
                            perm = numpy.random.permutation(trainX.shape[0])
                            trainInds = perm[0:sampleSize]
                            validX = trainX[trainInds, :]
                            validY = trainY[trainInds]

                            svm = LibSVM(processes=numProcesses)

                            # Find ideal penalties
                            if runIdeal:
                                logging.debug("Finding ideal grid of penalties")
                                idealGrids[j, k, m, :, :] = parallelPenaltyGridRbf(
                                    svm, validX, validY, testX, gridPoints, pdfX, pdfY1X, pdfYminus1X
                                )

                            # Cross validation
                            if runCv:
                                logging.debug("Running V-fold cross validation")
                                methodInd = 0
                                idx = sampleMethod(folds, validY.shape[0])
                                bootstrap = sampleMethod == Sampling.bootstrap

                                bestSVM, cvGrid = svm.parallelVfcvRbf(validX, validY, idx, True, bootstrap)
                                predY, decisionsY = bestSVM.predict(testX, True)
                                decisionGrid = numpy.reshape(decisionsY, gridShape, order="F")
                                errors[j, k, m, methodInd] = ModelSelectUtils.bayesError(
                                    gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X
                                )
                                params[j, k, m, methodInd, :] = numpy.array(
                                    [bestSVM.getC(), bestSVM.getKernelParams()]
                                )
                                errorGrids[j, k, m, methodInd, :, :] = cvGrid

                            # v fold penalisation
                            if runVfpen:
                                logging.debug("Running penalisation")
                                # BIC penalisation
                                Cv = float((folds - 1) * numpy.log(validX.shape[0]) / 2)
                                tempCvScalings = cvScalings * (folds - 1)
                                tempCvScalings = numpy.insert(tempCvScalings, 0, Cv)

                                # Use cross validation
                                idx = sampleMethod(folds, validY.shape[0])
                                svmGridResults = svm.parallelVfPenRbf(validX, validY, idx, tempCvScalings)

                                for n in range(len(tempCvScalings)):
                                    bestSVM, trainErrors, approxGrid = svmGridResults[n]
                                    # Index 0 is reserved for cross validation
                                    methodInd = n + 1
                                    predY, decisionsY = bestSVM.predict(testX, True)
                                    decisionGrid = numpy.reshape(decisionsY, gridShape, order="F")
                                    errors[j, k, m, methodInd] = ModelSelectUtils.bayesError(
                                        gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X
                                    )
                                    params[j, k, m, methodInd, :] = numpy.array(
                                        [bestSVM.getC(), bestSVM.getKernelParams()]
                                    )
                                    errorGrids[j, k, m, methodInd, :, :] = trainErrors + approxGrid
                                    approxGrids[j, k, m, methodInd, :, :] = approxGrid

                # Aggregate over realisations (axis 0)
                meanErrors = numpy.mean(errors, 0)
                print(meanErrors)

                meanParams = numpy.mean(params, 0)
                print(meanParams)

                meanErrorGrids = numpy.mean(errorGrids, 0)
                stdErrorGrids = numpy.std(errorGrids, 0)

                meanIdealGrids = numpy.mean(idealGrids, 0)
                stdIdealGrids = numpy.std(idealGrids, 0)

                meanApproxGrids = numpy.mean(approxGrids, 0)
                stdApproxGrids = numpy.std(approxGrids, 0)

                # Positional arrays are stored by numpy.savez as arr_0 ... arr_7
                numpy.savez(
                    outfileName,
                    errors,
                    params,
                    meanErrorGrids,
                    stdErrorGrids,
                    meanIdealGrids,
                    stdIdealGrids,
                    meanApproxGrids,
                    stdApproxGrids,
                )
                logging.debug("Saved results as file " + outfileName + ".npz")
            finally:
                # Release the lock even if loading, training or saving failed,
                # so that a crash does not leave the result permanently locked.
                fileLock.unlock()

    logging.debug("All done!")