# Note: numpy, logging and the helper classes used below (LibSVM, DecisionTreeLearner,
# ModelSelectUtils, PathDefaults, Util, Sampling, FileLock, parallelPenaltyGridRbf)
# are assumed to be imported at the top of the full script.

def getSetup(learnerName, dataDir, outputDir, numProcesses):
    # Return the learner, dataset loading function, adjusted data/output
    # directories and the parameter grid searched over for the given learner.
    if learnerName == "SVM":
        # Gaussian-kernel SVM classification on the Ratsch benchmark datasets
        learner = LibSVM(kernel='gaussian', type="C_SVC", processes=numProcesses)
        loadMethod = ModelSelectUtils.loadRatschDataset
        dataDir += "benchmark/"
        outputDir += "classification/" + learnerName + "/"

        paramDict = {}
        paramDict["setC"] = learner.getCs()
        paramDict["setGamma"] = learner.getGammas()
    elif learnerName == "SVR":
        # Epsilon-SVR with a Gaussian kernel on the regression datasets
        learner = LibSVM(kernel='gaussian', type="Epsilon_SVR", processes=numProcesses)
        learner.normModelSelect = True
        loadMethod = ModelSelectUtils.loadRegressDataset
        dataDir += "regression/"
        outputDir += "regression/" + learnerName + "/"

        paramDict = {}
        paramDict["setC"] = 2.0**numpy.arange(-10, 14, 2, dtype=numpy.float)
        paramDict["setGamma"] = 2.0**numpy.arange(-10, 4, 2, dtype=numpy.float)
        paramDict["setEpsilon"] = learner.getEpsilons()
    elif learnerName == "CART":
        # CART regression trees; the grid of values is searched via setGamma
        learner = DecisionTreeLearner(criterion="mse", maxDepth=30, minSplit=1, pruneType="CART", processes=numProcesses)
        learner.setChunkSize(2)
        loadMethod = ModelSelectUtils.loadRegressDataset
        dataDir += "regression/"
        outputDir += "regression/" + learnerName + "/"

        paramDict = {}
        paramDict["setGamma"] = numpy.array(numpy.round(2**numpy.arange(1, 7.5, 0.5)-1), dtype=numpy.int)
    else:
        raise ValueError("Unknown learnerName: " + learnerName)

    return learner, loadMethod, dataDir, outputDir, paramDict
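
# Hypothetical usage sketch (the learner name, directories and process count
# below are assumptions for illustration, not values taken from this file):
# getSetup returns everything needed to drive a grid search for one learner.
#
#   learner, loadMethod, dataDir, outputDir, paramDict = getSetup(
#       "SVR", PathDefaults.getDataDir(), PathDefaults.getOutputDir(), numProcesses=8)
#   for paramName, paramValues in paramDict.items():
#       logging.debug(paramName + ": " + str(paramValues))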
def runToyExp(datasetNames, sampleSizes, foldsSet, cvScalings, sampleMethods, numProcesses, fileNameSuffix):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/"

    svm = LibSVM()
    numCs = svm.getCs().shape[0]
    numGammas = svm.getGammas().shape[0]
    numMethods = 1 + (1 + cvScalings.shape[0])
    numParams = 2

    runIdeal = True
    runCv = True
    runVfpen = True

    for i in range(len(datasetNames)):
        datasetName = datasetNames[i][0]
        numRealisations = datasetNames[i][1]
        logging.debug("Learning using dataset " + datasetName)

        for s in range(len(sampleMethods)):
            sampleMethod = sampleMethods[s][1]
            outfileName = outputDir + datasetName + sampleMethods[s][0] + fileNameSuffix

            fileLock = FileLock(outfileName + ".npz")
            if not fileLock.isLocked() and not fileLock.fileExists():
                fileLock.lock()

                errors = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods))
                params = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods, numParams))
                errorGrids = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods, numCs, numGammas))
                approxGrids = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods, numCs, numGammas))
                idealGrids = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numCs, numGammas))

                data = numpy.load(dataDir + datasetName + ".npz")
                gridPoints, trainX, trainY, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

                # We form a test set from the grid points
                testX = numpy.zeros((gridPoints.shape[0]**2, 2))
                for m in range(gridPoints.shape[0]):
                    testX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
                    testX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

                for j in range(numRealisations):
                    Util.printIteration(j, 1, numRealisations, "Realisation: ")

                    for k in range(sampleSizes.shape[0]):
                        sampleSize = sampleSizes[k]
                        for m in range(foldsSet.shape[0]):
                            folds = foldsSet[m]
                            logging.debug("Using sample size " + str(sampleSize) + " and " + str(folds) + " folds")
                            perm = numpy.random.permutation(trainX.shape[0])
                            trainInds = perm[0:sampleSize]
                            validX = trainX[trainInds, :]
                            validY = trainY[trainInds]

                            svm = LibSVM(processes=numProcesses)

                            # Find ideal penalties
                            if runIdeal:
                                logging.debug("Finding ideal grid of penalties")
                                idealGrids[j, k, m, :, :] = parallelPenaltyGridRbf(svm, validX, validY, testX, gridPoints, pdfX, pdfY1X, pdfYminus1X)

                            # Cross validation
                            if runCv:
                                logging.debug("Running V-fold cross validation")
                                methodInd = 0
                                idx = sampleMethod(folds, validY.shape[0])
                                if sampleMethod == Sampling.bootstrap:
                                    bootstrap = True
                                else:
                                    bootstrap = False
                                bestSVM, cvGrid = svm.parallelVfcvRbf(validX, validY, idx, True, bootstrap)
                                predY, decisionsY = bestSVM.predict(testX, True)
                                decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
                                errors[j, k, m, methodInd] = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)
                                params[j, k, m, methodInd, :] = numpy.array([bestSVM.getC(), bestSVM.getKernelParams()])
                                errorGrids[j, k, m, methodInd, :, :] = cvGrid

                            # V-fold penalisation
                            if runVfpen:
                                logging.debug("Running penalisation")
                                # BIC penalisation
                                Cv = float((folds - 1) * numpy.log(validX.shape[0]) / 2)
                                tempCvScalings = cvScalings * (folds - 1)
                                tempCvScalings = numpy.insert(tempCvScalings, 0, Cv)

                                # Use cross validation
                                idx = sampleMethod(folds, validY.shape[0])
                                svmGridResults = svm.parallelVfPenRbf(validX, validY, idx, tempCvScalings)

                                for n in range(len(tempCvScalings)):
                                    bestSVM, trainErrors, approxGrid = svmGridResults[n]
                                    methodInd = n + 1
                                    predY, decisionsY = bestSVM.predict(testX, True)
                                    decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
                                    errors[j, k, m, methodInd] = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)
                                    params[j, k, m, methodInd, :] = numpy.array([bestSVM.getC(), bestSVM.getKernelParams()])
                                    errorGrids[j, k, m, methodInd, :, :] = trainErrors + approxGrid
                                    approxGrids[j, k, m, methodInd, :, :] = approxGrid

                meanErrors = numpy.mean(errors, 0)
                print(meanErrors)

                meanParams = numpy.mean(params, 0)
                print(meanParams)

                meanErrorGrids = numpy.mean(errorGrids, 0)
                stdErrorGrids = numpy.std(errorGrids, 0)
                meanIdealGrids = numpy.mean(idealGrids, 0)
                stdIdealGrids = numpy.std(idealGrids, 0)
                meanApproxGrids = numpy.mean(approxGrids, 0)
                stdApproxGrids = numpy.std(approxGrids, 0)

                numpy.savez(outfileName, errors, params, meanErrorGrids, stdErrorGrids, meanIdealGrids, stdIdealGrids, meanApproxGrids, stdApproxGrids)
                logging.debug("Saved results as file " + outfileName + ".npz")
                fileLock.unlock()
            else:
                logging.debug("Results already computed")

    logging.debug("All done!")
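
# Minimal invocation sketch for runToyExp (all argument values below are
# illustrative assumptions, not the settings used in the original experiments;
# the dataset name must match an .npz file under the toy data directory).
# Sampling.bootstrap is the only sampling function referenced in this file,
# so it is used here as the example sample method.
#
#   datasetNames = [("toyData", 10)]                    # (name, numRealisations)
#   sampleSizes = numpy.array([50, 100, 200])
#   foldsSet = numpy.arange(2, 12, 2)
#   cvScalings = numpy.arange(0.6, 1.61, 0.2)
#   sampleMethods = [("BS", Sampling.bootstrap)]        # (label, sampling function)
#   runToyExp(datasetNames, sampleSizes, foldsSet, cvScalings, sampleMethods,
#             numProcesses=8, fileNameSuffix="Results")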