def testUnlock(self):
    fileLock = FileLock(self.fileName)
    fileLock.lock()
    self.assertTrue(fileLock.isLocked())
    fileLock.unlock()
    self.assertFalse(fileLock.isLocked())
def saveWeightVectorResults(self, X, Y, learner, paramDict, fileName):
    """
    Save the results of the variable importance
    """
    filelock = FileLock(fileName)
    gc.collect()

    if not filelock.isLocked() and not filelock.fileExists():
        filelock.lock()
        try:
            logging.debug("Computing weights file " + fileName)
            logging.debug("Shape of examples: " + str(X.shape) + ", number of +1: " + str(numpy.sum(Y == 1)) + ", -1: " + str(numpy.sum(Y == -1)))

            tempLearner = learner.copy()
            logging.debug("Initial learner is " + str(tempLearner))

            idx = StratifiedKFold(Y, self.innerFolds)
            tempLearner.processes = self.numProcesses
            bestLearner, cvGrid = tempLearner.parallelModelSelect(X, Y, idx, paramDict)
            bestLearner = tempLearner.getBestLearner(cvGrid, paramDict, X, Y, idx, best="max")
            logging.debug("Best learner is " + str(bestLearner))

            bestLearner.learnModel(X, Y)
            weightVector = bestLearner.variableImportance(X, Y)
            numpy.save(fileName, weightVector)
            logging.debug("Saved results as : " + fileName)
        finally:
            filelock.unlock()
    else:
        logging.debug("File exists, or is locked: " + fileName)
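# The isLocked()/fileExists() check followed by lock() and a try/finally unlock()
# recurs throughout this module. A minimal sketch of a context manager that could
# factor it out (guardedLock is an illustrative name, not part of the existing API;
# it only uses the FileLock methods already exercised above):
from contextlib import contextmanager

@contextmanager
def guardedLock(fileName):
    """ Yield True while holding the lock if fileName is neither locked nor already computed, else yield False. """
    filelock = FileLock(fileName)
    if not filelock.isLocked() and not filelock.fileExists():
        filelock.lock()
        try:
            yield True
        finally:
            filelock.unlock()
    else:
        yield False

# Usage sketch:
#     with guardedLock(fileName) as acquired:
#         if acquired:
#             ...compute and save results...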
def saveStats(args):
    i, theta = args
    resultsFileName = outputDir + "SimStats" + str(i) + ".pkl"
    lock = FileLock(resultsFileName)

    if not lock.fileExists() and not lock.isLocked():
        lock.lock()
        try:
            model = HIVModelUtils.createModel(targetGraph, startDate, endDate, recordStep, M, matchAlpha, breakSize, matchAlg, theta=theta)
            times, infectedIndices, removedIndices, graph, compTimes, graphMetrics = HIVModelUtils.simulate(model)
            # Replace the simulated times with a regular grid for the statistics
            times = numpy.arange(startDate, endDate + 1, recordStep)
            vertexArray, infectedIndices, removedIndices, contactGraphStats, removedGraphStats, finalRemovedDegrees = HIVModelUtils.generateStatistics(graph, times)
            stats = times, vertexArray, infectedIndices, removedGraphStats, finalRemovedDegrees, graphMetrics.objectives, compTimes

            Util.savePickle(stats, resultsFileName)
        finally:
            lock.unlock()
    else:
        logging.debug("Results already computed: " + str(resultsFileName))
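# saveStats takes a single (i, theta) tuple, which suggests it is mapped over a
# list of parameter vectors by the surrounding script. A hedged sketch of the
# assumed calling pattern (thetaArray and the pool size are assumptions for
# illustration, not taken from the original):
import multiprocessing

if __name__ == "__main__":
    # enumerate(thetaArray) yields the (i, theta) tuples saveStats unpacks
    pool = multiprocessing.Pool(processes=4)
    pool.map(saveStats, list(enumerate(thetaArray)))
    pool.close()
    pool.join()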
def saveResult(self, X, Y, learner, paramDict, fileName):
    """
    Save a single result to file, checking if the results have already been computed
    """
    filelock = FileLock(fileName)
    gc.collect()

    if not filelock.isLocked() and not filelock.fileExists():
        filelock.lock()
        try:
            logging.debug("Computing file " + fileName)
            logging.debug("Shape of examples: " + str(X.shape) + ", number of +1: " + str(numpy.sum(Y == 1)) + ", -1: " + str(numpy.sum(Y == -1)))

            #idxFull = Sampling.crossValidation(self.outerFolds, X.shape[0])
            idxFull = StratifiedKFold(Y, self.outerFolds)
            errors = numpy.zeros(self.outerFolds)

            for i, (trainInds, testInds) in enumerate(idxFull):
                logging.debug("Outer fold: " + str(i))

                trainX, trainY = X[trainInds, :], Y[trainInds]
                testX, testY = X[testInds, :], Y[testInds]

                #idx = Sampling.crossValidation(self.innerFolds, trainX.shape[0])
                idx = StratifiedKFold(trainY, self.innerFolds)
                logging.debug("Initial learner is " + str(learner))
                bestLearner, cvGrid = learner.parallelModelSelect(trainX, trainY, idx, paramDict)
                bestLearner = learner.getBestLearner(cvGrid, paramDict, trainX, trainY, idx, best="max")
                logging.debug("Best learner is " + str(bestLearner))

                bestLearner.learnModel(trainX, trainY)
                predY = bestLearner.predict(testX)
                errors[i] = Evaluator.auc(predY, testY)

            logging.debug("Mean auc: " + str(numpy.mean(errors)))
            numpy.save(fileName, errors)
            logging.debug("Saved results as : " + fileName)
        finally:
            filelock.unlock()
    else:
        logging.debug("File exists, or is locked: " + fileName)
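# Note: StratifiedKFold is called here with the pre-0.18 scikit-learn signature
# StratifiedKFold(y, n_folds), which returns an object directly iterable as
# (trainInds, testInds) pairs. Assuming this is scikit-learn's class, a rough
# equivalent under the modern API (if the code were ported) would be:
#
#     from sklearn.model_selection import StratifiedKFold
#     idxFull = StratifiedKFold(n_splits=self.outerFolds).split(X, Y)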
def computeLearningRates(datasetNames, numProcesses, fileNameSuffix, learnerName, sampleSizes, foldsSet):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/"
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/"

    learner, loadMethod, dataDir, outputDir, paramDict = getSetup(learnerName, dataDir, outputDir, numProcesses)

    for i in range(len(datasetNames)):
        logging.debug("Learning using dataset " + datasetNames[i][0])
        outfileName = outputDir + datasetNames[i][0] + fileNameSuffix

        fileLock = FileLock(outfileName + ".npz")
        if not fileLock.isLocked() and not fileLock.fileExists():
            fileLock.lock()

            numRealisations = datasetNames[i][1]
            gridShape = [numRealisations, sampleSizes.shape[0]]
            gridShape.extend(list(learner.gridShape(paramDict)))
            gridShape = tuple(gridShape)

            betaGrids = numpy.zeros(gridShape)

            for k in range(sampleSizes.shape[0]):
                sampleSize = sampleSizes[k]
                logging.debug("Using sample size " + str(sampleSize))
                for j in range(numRealisations):
                    Util.printIteration(j, 1, numRealisations, "Realisation: ")
                    trainX, trainY, testX, testY = loadMethod(dataDir, datasetNames[i][0], j)
                    numpy.random.seed(21)
                    trainInds = numpy.random.permutation(trainX.shape[0])[0:sampleSize]
                    validX = trainX[trainInds, :]
                    validY = trainY[trainInds]

                    betaGrids[j, k, :] = learner.learningRate(validX, validY, foldsSet, paramDict)

            numpy.savez(outfileName, betaGrids)
            logging.debug("Saved results as file " + outfileName + ".npz")
            fileLock.unlock()
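# numpy.savez called with positional arguments stores each array under the
# default keys "arr_0", "arr_1", ...; runBenchmarkExp below relies on this when
# it reloads the learning-rate grid:
#
#     numpy.savez(outfileName, betaGrids)                  # stored as "arr_0"
#     betaGrids = numpy.load(betafileName)["arr_0"]        # retrieved by default key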
def testLock(self):
    fileLock = FileLock(self.fileName)
    fileLock.lock()
    self.assertTrue(fileLock.isLocked())
    fileLock.unlock()
def runToyExp(datasetNames, sampleSizes, foldsSet, cvScalings, sampleMethods, numProcesses, fileNameSuffix):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/toy/"
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/"

    svm = LibSVM()
    numCs = svm.getCs().shape[0]
    numGammas = svm.getGammas().shape[0]
    numMethods = 1 + (1 + cvScalings.shape[0])
    numParams = 2

    runIdeal = True
    runCv = True
    runVfpen = True

    for i in range(len(datasetNames)):
        datasetName = datasetNames[i][0]
        numRealisations = datasetNames[i][1]
        logging.debug("Learning using dataset " + datasetName)

        for s in range(len(sampleMethods)):
            sampleMethod = sampleMethods[s][1]
            outfileName = outputDir + datasetName + sampleMethods[s][0] + fileNameSuffix

            fileLock = FileLock(outfileName + ".npz")
            if not fileLock.isLocked() and not fileLock.fileExists():
                fileLock.lock()

                errors = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods))
                params = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods, numParams))
                errorGrids = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods, numCs, numGammas))
                approxGrids = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods, numCs, numGammas))
                idealGrids = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numCs, numGammas))

                data = numpy.load(dataDir + datasetName + ".npz")
                gridPoints, trainX, trainY, pdfX, pdfY1X, pdfYminus1X = data["arr_0"], data["arr_1"], data["arr_2"], data["arr_3"], data["arr_4"], data["arr_5"]

                # We form a test set from the grid points
                testX = numpy.zeros((gridPoints.shape[0]**2, 2))
                for m in range(gridPoints.shape[0]):
                    testX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 0] = gridPoints
                    testX[m*gridPoints.shape[0]:(m+1)*gridPoints.shape[0], 1] = gridPoints[m]

                for j in range(numRealisations):
                    Util.printIteration(j, 1, numRealisations, "Realisation: ")

                    for k in range(sampleSizes.shape[0]):
                        sampleSize = sampleSizes[k]
                        for m in range(foldsSet.shape[0]):
                            folds = foldsSet[m]
                            logging.debug("Using sample size " + str(sampleSize) + " and " + str(folds) + " folds")
                            perm = numpy.random.permutation(trainX.shape[0])
                            trainInds = perm[0:sampleSize]
                            validX = trainX[trainInds, :]
                            validY = trainY[trainInds]

                            svm = LibSVM(processes=numProcesses)

                            # Find ideal penalties
                            if runIdeal:
                                logging.debug("Finding ideal grid of penalties")
                                idealGrids[j, k, m, :, :] = parallelPenaltyGridRbf(svm, validX, validY, testX, gridPoints, pdfX, pdfY1X, pdfYminus1X)

                            # Cross validation
                            if runCv:
                                logging.debug("Running V-fold cross validation")
                                methodInd = 0
                                idx = sampleMethod(folds, validY.shape[0])
                                bootstrap = sampleMethod == Sampling.bootstrap
                                bestSVM, cvGrid = svm.parallelVfcvRbf(validX, validY, idx, True, bootstrap)
                                predY, decisionsY = bestSVM.predict(testX, True)
                                decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
                                errors[j, k, m, methodInd] = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)
                                params[j, k, m, methodInd, :] = numpy.array([bestSVM.getC(), bestSVM.getKernelParams()])
                                errorGrids[j, k, m, methodInd, :, :] = cvGrid

                            # V-fold penalisation
                            if runVfpen:
                                logging.debug("Running penalisation")
                                # BIC penalisation
                                Cv = float((folds - 1) * numpy.log(validX.shape[0]) / 2)
                                tempCvScalings = cvScalings * (folds - 1)
                                tempCvScalings = numpy.insert(tempCvScalings, 0, Cv)

                                # Use cross validation
                                idx = sampleMethod(folds, validY.shape[0])
                                svmGridResults = svm.parallelVfPenRbf(validX, validY, idx, tempCvScalings)

                                for n in range(len(tempCvScalings)):
                                    bestSVM, trainErrors, approxGrid = svmGridResults[n]
                                    methodInd = n + 1
                                    predY, decisionsY = bestSVM.predict(testX, True)
                                    decisionGrid = numpy.reshape(decisionsY, (gridPoints.shape[0], gridPoints.shape[0]), order="F")
                                    errors[j, k, m, methodInd] = ModelSelectUtils.bayesError(gridPoints, decisionGrid, pdfX, pdfY1X, pdfYminus1X)
                                    params[j, k, m, methodInd, :] = numpy.array([bestSVM.getC(), bestSVM.getKernelParams()])
                                    errorGrids[j, k, m, methodInd, :, :] = trainErrors + approxGrid
                                    approxGrids[j, k, m, methodInd, :, :] = approxGrid

                meanErrors = numpy.mean(errors, 0)
                print(meanErrors)

                meanParams = numpy.mean(params, 0)
                print(meanParams)

                meanErrorGrids = numpy.mean(errorGrids, 0)
                stdErrorGrids = numpy.std(errorGrids, 0)
                meanIdealGrids = numpy.mean(idealGrids, 0)
                stdIdealGrids = numpy.std(idealGrids, 0)
                meanApproxGrids = numpy.mean(approxGrids, 0)
                stdApproxGrids = numpy.std(approxGrids, 0)

                numpy.savez(outfileName, errors, params, meanErrorGrids, stdErrorGrids, meanIdealGrids, stdIdealGrids, meanApproxGrids, stdApproxGrids)
                logging.debug("Saved results as file " + outfileName + ".npz")
                fileLock.unlock()
            else:
                logging.debug("Results already computed")

    logging.debug("All done!")
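# Worked example of the BIC penalisation scaling in runToyExp (illustrative
# numbers, not taken from the original): with folds = 5 and a validation set of
# n = validX.shape[0] = 100 examples,
#
#     Cv = (5 - 1) * ln(100) / 2 ≈ 2 * 4.605 ≈ 9.21,
#
# while each user-supplied entry of cvScalings is multiplied by (folds - 1) = 4
# before Cv is inserted at position 0 of tempCvScalings.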
def runBenchmarkExp(datasetNames, sampleSizes, foldsSet, cvScalings, sampleMethods, numProcesses, fileNameSuffix, learnerName, betaNameSuffix):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/"
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/"

    learner, loadMethod, dataDir, outputDir, paramDict = getSetup(learnerName, dataDir, outputDir, numProcesses)
    numParams = len(paramDict.keys())
    numMethods = 1 + (cvScalings.shape[0] + 1)

    runCv = True
    runVfpen = True

    for i in range(len(datasetNames)):
        datasetName = datasetNames[i][0]
        numRealisations = datasetNames[i][1]
        logging.debug("Learning using dataset " + datasetName)

        # Load learning rates for penalisation
        betafileName = outputDir + datasetNames[i][0] + betaNameSuffix + ".npz"
        betaGrids = numpy.load(betafileName)["arr_0"]
        betaGrids = numpy.clip(betaGrids, 0, 1)

        for s in range(len(sampleMethods)):
            sampleMethod = sampleMethods[s][1]
            outfileName = outputDir + datasetName + sampleMethods[s][0] + fileNameSuffix

            fileLock = FileLock(outfileName + ".npz")
            if not fileLock.isLocked() and not fileLock.fileExists():
                fileLock.lock()

                errors = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods))
                params = numpy.zeros((numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods, numParams))

                errorShape = [numRealisations, len(sampleSizes), foldsSet.shape[0], numMethods]
                errorShape.extend(list(learner.gridShape(paramDict)))
                errorShape = tuple(errorShape)

                errorGrids = numpy.zeros(errorShape)
                approxGrids = numpy.zeros(errorShape)

                for j in range(numRealisations):
                    Util.printIteration(j, 1, numRealisations, "Realisation: ")
                    trainX, trainY, testX, testY = loadMethod(dataDir, datasetName, j)

                    for k in range(sampleSizes.shape[0]):
                        sampleSize = sampleSizes[k]
                        for m in range(foldsSet.shape[0]):
                            # Cap the number of folds at the sample size
                            folds = min(foldsSet[m], sampleSize)
                            logging.debug("Using sample size " + str(sampleSize) + " and " + str(folds) + " folds")
                            numpy.random.seed(21)
                            trainInds = numpy.random.permutation(trainX.shape[0])[0:sampleSize]
                            validX = trainX[trainInds, :]
                            validY = trainY[trainInds]

                            # Cross validation
                            if runCv:
                                logging.debug("Running simple sampling using " + str(sampleMethod))
                                methodInd = 0
                                idx = sampleMethod(folds, validY.shape[0])
                                bestLearner, cvGrid = learner.parallelModelSelect(validX, validY, idx, paramDict)
                                predY = bestLearner.predict(testX)
                                errors[j, k, m, methodInd] = bestLearner.getMetricMethod()(testY, predY)
                                params[j, k, m, methodInd, :] = bestLearner.getParamsArray(paramDict)
                                errorGrids[j, k, m, methodInd, :] = cvGrid

                            # V-fold penalisation
                            if runVfpen:
                                logging.debug("Running penalisation using " + str(sampleMethod))
                                # Corrected penalisation given by using the learning rate
                                tempCvScalings = list(cvScalings * (folds - 1))
                                tempCvScalings.insert(0, betaGrids[j, k, :])

                                idx = sampleMethod(folds, validY.shape[0])
                                learnerGridResults = learner.parallelPen(validX, validY, idx, paramDict, tempCvScalings)

                                for n in range(len(tempCvScalings)):
                                    bestLearner, trainErrors, approxGrid = learnerGridResults[n]
                                    predY = bestLearner.predict(testX)
                                    methodInd = n + 1
                                    errors[j, k, m, methodInd] = bestLearner.getMetricMethod()(testY, predY)
                                    params[j, k, m, methodInd, :] = bestLearner.getParamsArray(paramDict)
                                    errorGrids[j, k, m, methodInd, :] = trainErrors + approxGrid
                                    approxGrids[j, k, m, methodInd, :] = approxGrid

                meanErrors = numpy.mean(errors, 0)
                print(meanErrors)

                meanParams = numpy.mean(params, 0)
                print(meanParams)

                # When using CART trees the penalty can be inf, in which case the std
                # is undefined; in that case we set any infinite values to zero.
                meanErrorGrids = numpy.mean(errorGrids, 0)
                try:
                    stdErrorGrids = numpy.std(errorGrids, 0)
                except FloatingPointError:
                    errorGrids[numpy.isinf(errorGrids)] = 0
                    stdErrorGrids = numpy.std(errorGrids, 0)

                meanApproxGrids = numpy.mean(approxGrids, 0)
                try:
                    stdApproxGrids = numpy.std(approxGrids, 0)
                except FloatingPointError:
                    approxGrids[numpy.isinf(approxGrids)] = 0
                    stdApproxGrids = numpy.std(approxGrids, 0)

                numpy.savez(outfileName, errors, params, meanErrorGrids, stdErrorGrids, meanApproxGrids, stdApproxGrids)
                logging.debug("Saved results as file " + outfileName + ".npz")
                fileLock.unlock()
            else:
                logging.debug("Results already computed")

    logging.debug("All done!")
def findErrorGrid(datasetNames, numProcesses, fileNameSuffix, learnerName, sampleSizes):
    dataDir = PathDefaults.getDataDir() + "modelPenalisation/"
    outputDir = PathDefaults.getOutputDir() + "modelPenalisation/"

    learner, loadMethod, dataDir, outputDir, paramDict = getSetup(learnerName, dataDir, outputDir, numProcesses)
    numParams = len(paramDict.keys())

    runIdeal = True
    runTest = True

    for i in range(len(datasetNames)):
        logging.debug("Learning using dataset " + datasetNames[i][0])
        outfileName = outputDir + datasetNames[i][0] + fileNameSuffix

        fileLock = FileLock(outfileName + ".npz")
        if not fileLock.isLocked() and not fileLock.fileExists():
            fileLock.lock()

            numRealisations = datasetNames[i][1]
            gridShape = [numRealisations, sampleSizes.shape[0]]
            gridShape.extend(list(learner.gridShape(paramDict)))
            gridShape = tuple(gridShape)

            idealPenGrids = numpy.zeros(gridShape)
            idealErrorGrids = numpy.zeros(gridShape)
            idealErrors = numpy.zeros((numRealisations, sampleSizes.shape[0]))
            params = numpy.zeros((numRealisations, len(sampleSizes), numParams))

            for k in range(sampleSizes.shape[0]):
                sampleSize = sampleSizes[k]
                logging.debug("Using sample size " + str(sampleSize))
                for j in range(numRealisations):
                    Util.printIteration(j, 1, numRealisations, "Realisation: ")
                    trainX, trainY, testX, testY = loadMethod(dataDir, datasetNames[i][0], j)
                    numpy.random.seed(21)
                    trainInds = numpy.random.permutation(trainX.shape[0])[0:sampleSize]
                    validX = trainX[trainInds, :]
                    validY = trainY[trainInds]

                    # Find ideal penalties
                    if runIdeal:
                        logging.debug("Finding ideal grid of penalties")
                        idealPenGrids[j, k, :] = learner.parallelPenaltyGrid(validX, validY, testX, testY, paramDict)

                    # Find the ideal model using the test set
                    if runTest:
                        logging.debug("Running test set sampling")
                        cvGrid = learner.parallelSplitGrid(validX, validY, testX, testY, paramDict)
                        bestLearner = learner.getBestLearner(cvGrid, paramDict, validX, validY)
                        predY = bestLearner.predict(testX)
                        idealErrors[j, k] = bestLearner.getMetricMethod()(testY, predY)
                        params[j, k, :] = bestLearner.getParamsArray(paramDict)
                        idealErrorGrids[j, k, :] = cvGrid

            meanIdealPenGrids = idealPenGrids.mean(0)
            stdIdealPenGrids = idealPenGrids.std(0)
            meanIdealErrorGrids = idealErrorGrids.mean(0)
            stdIdealErrorGrids = idealErrorGrids.std(0)

            numpy.savez(outfileName, idealErrors, params, meanIdealErrorGrids, stdIdealErrorGrids, meanIdealPenGrids, stdIdealPenGrids)
            logging.debug("Saved results as file " + outfileName + ".npz")
            fileLock.unlock()
        else:
            logging.debug("Results already computed")
def runExperiment(self):
    """
    Run the selected clustering experiments and save results
    """
    if self.algoArgs.runSoftImpute:
        logging.debug("Running soft impute")

        for svdAlg in self.algoArgs.svdAlgs:
            if svdAlg in ("rsvd", "rsvdUpdate", "rsvdUpdate2"):
                resultsFileName = self.resultsDir + "ResultsSoftImpute_alg=" + svdAlg + "_p=" + str(self.algoArgs.p) + "_q=" + str(self.algoArgs.q) + "_updateAlg=" + self.algoArgs.updateAlg + ".npz"
            else:
                resultsFileName = self.resultsDir + "ResultsSoftImpute_alg=" + svdAlg + "_updateAlg=" + self.algoArgs.updateAlg + ".npz"

            fileLock = FileLock(resultsFileName)
            if not fileLock.isLocked() and not fileLock.fileExists():
                fileLock.lock()

                try:
                    learner = IterativeSoftImpute(svdAlg=svdAlg, logStep=self.logStep, kmax=self.algoArgs.kmax, postProcess=self.algoArgs.postProcess, weighted=self.algoArgs.weighted, p=self.algoArgs.p, q=self.algoArgs.q, verbose=self.algoArgs.verbose, updateAlg=self.algoArgs.updateAlg)

                    if self.algoArgs.modelSelect:
                        trainIterator = self.getTrainIterator()
                        # Let's find the optimal lambda using the first matrix
                        X = next(trainIterator)

                        logging.debug("Performing model selection, taking subsample of entries of size " + str(self.sampleSize))
                        X = SparseUtils.submatrix(X, self.sampleSize)

                        cvInds = Sampling.randCrossValidation(self.algoArgs.folds, X.nnz)
                        meanErrors, stdErrors = learner.modelSelect(X, self.algoArgs.rhos, self.algoArgs.ks, cvInds)

                        logging.debug("Mean errors = " + str(meanErrors))
                        logging.debug("Std errors = " + str(stdErrors))

                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanErrors, stdErrors)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                        rho = self.algoArgs.rhos[numpy.unravel_index(numpy.argmin(meanErrors), meanErrors.shape)[0]]
                        k = self.algoArgs.ks[numpy.unravel_index(numpy.argmin(meanErrors), meanErrors.shape)[1]]
                    else:
                        rho = self.algoArgs.rhos[0]
                        k = self.algoArgs.ks[0]

                    learner.setK(k)
                    learner.setRho(rho)
                    logging.debug(learner)

                    trainIterator = self.getTrainIterator()
                    ZIter = learner.learnModel(trainIterator)

                    self.recordResults(ZIter, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runSgdMf:
        logging.debug("Running SGD MF")

        resultsFileName = self.resultsDir + "ResultsSgdMf.npz"
        fileLock = FileLock(resultsFileName)

        if not fileLock.isLocked() and not fileLock.fileExists():
            fileLock.lock()

            try:
                learner = IterativeSGDNorm2Reg(k=self.algoArgs.ks[0], lmbda=self.algoArgs.lmbdas[0], gamma=self.algoArgs.gammas[0], eps=self.algoArgs.eps)

                if self.algoArgs.modelSelect:
                    # Let's find optimal parameters using the first matrix
                    learner.modelSelect(next(self.getTrainIterator()), self.algoArgs.ks, self.algoArgs.lmbdas, self.algoArgs.gammas, self.algoArgs.folds)

                trainIterator = self.getTrainIterator()
                ZIter = learner.learnModel(trainIterator)

                self.recordResults(ZIter, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    logging.info("All done: see you around!")
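# Example of the unravel_index selection used in runExperiment above: if
# meanErrors has shape (len(rhos), len(ks)) = (3, 4) and its minimum sits at
# flat position 7, then numpy.unravel_index(7, (3, 4)) returns (1, 3), so
# rhos[1] and ks[3] are selected.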
def testInit(self):
    # Construction alone should not raise
    fileLock = FileLock(self.fileName)
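# A minimal sketch of the scaffolding these test methods appear to assume: a
# unittest.TestCase subclass whose setUp defines self.fileName (the temporary
# path below is an assumption for illustration, not taken from the original):
import tempfile
import unittest

class FileLockTestSketch(unittest.TestCase):
    def setUp(self):
        # Any writable path works; FileLock derives its lock file from this name
        self.fileName = tempfile.NamedTemporaryFile(delete=False).name

    def tearDown(self):
        # Release any lock a failing test left behind
        fileLock = FileLock(self.fileName)
        if fileLock.isLocked():
            fileLock.unlock()

if __name__ == "__main__":
    unittest.main()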