def profileModelSelect(self):
    lmbdas = numpy.linspace(1.0, 0.01, 5)
    softImpute = IterativeSoftImpute(k=500)

    folds = 5
    cvInds = Sampling.randCrossValidation(folds, self.X.nnz)
    ProfileUtils.profile('softImpute.modelSelect(self.X, lmbdas, cvInds)', globals(), locals())
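# The snippet above assumes a ProfileUtils.profile helper. A minimal sketch of what
# such a helper could look like, built on the standard library's cProfile.runctx
# (the helper body below is an assumption, not the library's confirmed implementation):
import cProfile
import os
import pstats
import tempfile

def profile(command, globs, locs, numStats=30):
    # Run `command` under the profiler, dump stats to a temp file, then
    # print the calls with the largest cumulative time
    fd, profileFileName = tempfile.mkstemp()
    os.close(fd)
    cProfile.runctx(command, globs, locs, profileFileName)
    stats = pstats.Stats(profileFileName)
    stats.strip_dirs().sort_stats("cumulative").print_stats(numStats)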
def testRandCrossValidation(self):
    numExamples = 10
    folds = 3

    indices = Sampling.randCrossValidation(folds, numExamples)

    for i in range(folds):
        self.assertTrue((numpy.union1d(indices[i][0], indices[i][1]) == numpy.arange(numExamples)).all())
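# The test above pins down the contract of Sampling.randCrossValidation: it returns
# `folds` (trainInds, testInds) pairs whose union covers range(numExamples). A minimal
# sketch of a compatible implementation (an assumption, not the library's actual code):
import numpy

def randCrossValidation(folds, numExamples):
    # Shuffle all indices, split them into `folds` test chunks, and pair each
    # test chunk with the remaining indices as its training set
    inds = numpy.random.permutation(numExamples)
    testChunks = numpy.array_split(inds, folds)
    return [(numpy.setdiff1d(inds, test), numpy.sort(test)) for test in testChunks]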
def modelSelect(self, X):
    """
    Perform model selection on X and return the best parameters.
    """
    m, n = X.shape
    cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    localAucs = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0], len(cvInds)))

    logging.debug("Performing model selection")
    paramList = []

    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, self.folds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        testOmegaList = SparseUtils.getOmegaList(testX)

        for i, k in enumerate(self.ks):
            maxLocalAuc = self.copy()
            maxLocalAuc.k = k
            paramList.append((trainX, testX, testOmegaList, maxLocalAuc))

    pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
    resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize)
    #import itertools
    #resultsIterator = itertools.imap(localAucsLmbdas, paramList)

    for icv, (trainInds, testInds) in enumerate(cvInds):
        for i, k in enumerate(self.ks):
            tempAucs = next(resultsIterator)
            localAucs[i, :, icv] = tempAucs

    pool.terminate()

    meanLocalAucs = numpy.mean(localAucs, 2)
    stdLocalAucs = numpy.std(localAucs, 2)

    logging.debug(meanLocalAucs)

    k = self.ks[numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)[0]]
    lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)[1]]

    logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda))

    self.k = k
    self.lmbda = lmbda

    return meanLocalAucs, stdLocalAucs
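# The (k, lmbda) lookup above relies on numpy.unravel_index to turn the flat argmax of
# the fold-averaged AUC grid back into row/column indices. A small standalone example:
import numpy

meanAucs = numpy.array([[0.61, 0.72], [0.68, 0.65]])   # rows index k, columns index lmbda
row, col = numpy.unravel_index(numpy.argmax(meanAucs), meanAucs.shape)
assert (row, col) == (0, 1)   # the best score, 0.72, sits at k index 0, lmbda index 1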
def testParallelModelSelect(self):
    X = scipy.sparse.rand(10, 10, 0.5)
    X = X.tocsr()

    numExamples = X.getnnz()
    paramDict = {}
    paramDict["setRank"] = numpy.array([5, 10, 20])

    folds = 3
    idx = Sampling.randCrossValidation(folds, numExamples)

    method = "lsnmf"
    nimfaFactorise = NimfaFactorise(method)
    learner, meanErrors = nimfaFactorise.parallelModelSelect(X, idx, paramDict)
def modelSelect(self, X):
    """
    Perform model selection on X and return the best parameters.
    """
    m, n = X.shape
    cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    precisions = numpy.zeros((self.ks.shape[0], len(cvInds)))

    logging.debug("Performing model selection")
    paramList = []

    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, self.folds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        testOmegaList = SparseUtils.getOmegaList(testX)

        for i, k in enumerate(self.ks):
            learner = self.copy()
            learner.k = k
            paramList.append((trainX, testX, testOmegaList, learner))

    #pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
    #resultsIterator = pool.imap(computePrecision, paramList, self.chunkSize)
    #Serial fallback: iter(map(...)) yields the same next()-driven interface as pool.imap
    resultsIterator = iter(map(computePrecision, paramList))

    for icv, (trainInds, testInds) in enumerate(cvInds):
        for i, k in enumerate(self.ks):
            tempPrecision = next(resultsIterator)
            precisions[i, icv] = tempPrecision

    #pool.terminate()

    meanPrecisions = numpy.mean(precisions, 1)
    stdPrecisions = numpy.std(precisions, 1)

    logging.debug(meanPrecisions)

    k = self.ks[numpy.argmax(meanPrecisions)]

    logging.debug("Model parameters: k=" + str(k))

    self.k = k

    return meanPrecisions, stdPrecisions
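# Both execution paths above hand back an iterator consumed with next(), which is what
# makes swapping the multiprocessing pool for the serial fallback a two-line change.
# A self-contained illustration of that interchangeability (square is a stand-in worker):
import multiprocessing

def square(x):
    return x * x

if __name__ == "__main__":
    params = [1, 2, 3]
    serial = iter(map(square, params))
    pool = multiprocessing.Pool(processes=2)
    parallel = pool.imap(square, params)
    assert [next(serial) for _ in params] == [next(parallel) for _ in params]
    pool.terminate()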
def testModelSelect(self):
    lmbda = 0.1
    shape = (20, 20)
    r = 20
    numInds = 100
    noise = 0.2
    X = ExpSU.SparseUtils.generateSparseLowRank(shape, r, numInds, noise)

    U, s, V = numpy.linalg.svd(X.todense())

    k = 15
    iterativeSoftImpute = IterativeSoftImpute(lmbda, k=None, svdAlg="propack", updateAlg="zero")
    iterativeSoftImpute.numProcesses = 1
    rhos = numpy.linspace(0.5, 0.001, 20)
    ks = numpy.array([k], int)
    folds = 3
    cvInds = Sampling.randCrossValidation(folds, X.nnz)
    meanTestErrors, meanTrainErrors = iterativeSoftImpute.modelSelect(X, rhos, ks, cvInds)

    #Now do model selection manually
    (rowInds, colInds) = X.nonzero()
    trainErrors = numpy.zeros((rhos.shape[0], len(cvInds)))
    testErrors = numpy.zeros((rhos.shape[0], len(cvInds)))

    for i, rho in enumerate(rhos):
        for j, (trainInds, testInds) in enumerate(cvInds):
            trainX = scipy.sparse.csc_matrix(X.shape)
            testX = scipy.sparse.csc_matrix(X.shape)

            for p in trainInds:
                trainX[rowInds[p], colInds[p]] = X[rowInds[p], colInds[p]]

            for p in testInds:
                testX[rowInds[p], colInds[p]] = X[rowInds[p], colInds[p]]

            softImpute = SoftImpute(numpy.array([rho]), k=ks[0])
            ZList = [softImpute.learnModel(trainX, fullMatrices=False)]

            predTrainX = softImpute.predict(ZList, trainX.nonzero())[0]
            predX = softImpute.predict(ZList, testX.nonzero())[0]

            testErrors[i, j] = MCEvaluator.rootMeanSqError(testX, predX)
            trainErrors[i, j] = MCEvaluator.rootMeanSqError(trainX, predTrainX)

    meanTestErrors2 = testErrors.mean(1)
    meanTrainErrors2 = trainErrors.mean(1)

    nptst.assert_array_almost_equal(meanTestErrors.ravel(), meanTestErrors2, 1)
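# The per-entry assignment loops above are the slow-but-obvious way to split a sparse
# matrix by nonzero positions. An equivalent vectorised sketch using the COO-style
# constructor (same result, no Python-level loop over indices; splitByInds is a
# hypothetical helper name):
import numpy
import scipy.sparse

def splitByInds(X, trainInds, testInds):
    # Partition the nonzeros of X into two matrices of the same shape
    rowInds, colInds = X.nonzero()
    data = numpy.asarray(X[rowInds, colInds]).ravel()
    trainX = scipy.sparse.csc_matrix((data[trainInds], (rowInds[trainInds], colInds[trainInds])), shape=X.shape)
    testX = scipy.sparse.csc_matrix((data[testInds], (rowInds[testInds], colInds[testInds])), shape=X.shape)
    return trainX, testX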
def modelSelect(self, X, ks, lmbdas, gammas, nFolds, maxNTry=5):
    """
    Choose parameters based on a single matrix X. We do cross validation within,
    and set parameters according to the mean squared error. Returns nothing.
    """
    logging.debug("Performing model selection")

    # useful
    X = X.tocoo()
    gc.collect()
    nK = len(ks)
    nLmbda = len(lmbdas)
    nGamma = len(gammas)
    nLG = nLmbda * nGamma
    errors = scipy.zeros((nK, nLmbda, nGamma, nFolds))

    # generate cross validation sets
    cvInds = Sampling.randCrossValidation(nFolds, X.nnz)

    # compute error for each fold / setting
    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, nFolds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)

        assert trainX.nnz == trainInds.shape[0]
        assert testX.nnz == testInds.shape[0]
        nptst.assert_array_almost_equal((testX + trainX).data, X.data)

        paramList = []
        for ik, k in enumerate(ks):
            for ilmbda, lmbda in enumerate(lmbdas):
                for igamma, gamma in enumerate(gammas):
                    paramList.append((trainX, testX, k, lmbda, gamma, maxNTry))

        # ! Remark !
        # we can easily parallelise over parameter settings; parallelising over
        # cv-folds is not done as it is much more memory-consuming

        # parallel version (copied from IterativeSoftImpute, but not tested)
        #pool = multiprocessing.Pool(processes=multiprocessing.cpu_count()/2, maxtasksperchild=10)
        #results = pool.imap(self.learnPredict, paramList)
        #pool.terminate()

        # non-parallel version
        results = scipy.array(list(itertools.starmap(self.learnPredict, paramList)))

        errors[:, :, :, icv] = scipy.array(results).reshape((nK, nLmbda, nGamma))

    # compute cross validation error for each setting
    errors[errors == float("inf")] = errors[errors != float("inf")].max()
    errors[numpy.isnan(errors)] = numpy.max(errors[numpy.logical_not(numpy.isnan(errors))])
    meanErrors = errors.mean(3)
    stdErrors = errors.std(3)

    logging.debug("Mean errors given (k, lambda, gamma):")
    logging.debug(meanErrors)
    logging.debug("... with standard deviation:")
    logging.debug(stdErrors)

    # keep the best
    iMin = meanErrors.argmin()
    kMin = ks[int(scipy.floor(iMin / nLG))]
    lmbdaMin = lmbdas[int(scipy.floor((iMin % nLG) / nGamma))]
    gammaMin = gammas[int(scipy.floor(iMin % nGamma))]

    logging.debug("argmin: (k, lambda, gamma) = (" + str(kMin) + ", " + str(lmbdaMin) + ", " + str(gammaMin) + ")")
    logging.debug("min = " + str(meanErrors[int(scipy.floor(iMin / nLG)), int(scipy.floor((iMin % nLG) / nGamma)), int(scipy.floor(iMin % nGamma))]))

    self.baseLearner.k = kMin
    self.baseLearner.lmbda = lmbdaMin
    self.baseLearner.gamma = gammaMin

    return
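# The floor-division arithmetic above decodes a flat argmin over a (nK, nLmbda, nGamma)
# grid by hand. numpy.unravel_index performs the same decoding and is easier to verify:
import numpy

meanErrors = numpy.arange(24.0).reshape((2, 3, 4))   # stand-in (nK, nLmbda, nGamma) grid
meanErrors[1, 2, 0] = -1.0                           # plant a known minimum
ik, ilmbda, igamma = numpy.unravel_index(meanErrors.argmin(), meanErrors.shape)
assert (ik, ilmbda, igamma) == (1, 2, 0)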
if not (fileLock.isLocked() or fileLock.fileExists()) or overwrite:
    fileLock.lock()

    logging.debug(learner)

    try:
        #Do some recommendation
        if type(learner) == IterativeSoftImpute:
            trainX = X.toScipyCsc()
            trainIterator = iter([trainX])

            if modelSelect:
                modelSelectX, userInds = Sampling.sampleUsers2(X, modelSelectSamples)
                modelSelectX = modelSelectX.toScipyCsc()
                cvInds = Sampling.randCrossValidation(folds, modelSelectX.nnz)
                meanMetrics, stdMetrics = learner.modelSelect2(modelSelectX, rhosSi, ks, cvInds)

            ZList = learner.learnModel(trainIterator)
            U, s, V = next(ZList)
            U = U * s
        elif type(learner) == WeightedMf:
            trainX = X.toScipyCsr()

            if modelSelect:
                modelSelectX, userInds = Sampling.sampleUsers2(X, modelSelectSamples)
                modelSelectX = modelSelectX.toScipyCsc()
                meanMetrics, stdMetrics = learner.modelSelect(modelSelectX)

            learner.learnModel(trainX)
            U = learner.U
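# Sampling.sampleUsers2 above subsamples users so model selection runs on a smaller
# matrix than the full training data. A minimal sketch of that idea for a SciPy CSR
# matrix (the real function also prunes and returns remapped indices; this is only
# an assumed approximation, and sampleUsers is a hypothetical name):
import numpy
import scipy.sparse

def sampleUsers(X, maxUsers):
    # Keep at most maxUsers randomly chosen rows (users) of X
    m = X.shape[0]
    userInds = numpy.random.choice(m, min(maxUsers, m), replace=False)
    return X[userInds, :], userInds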
def runExperiment(self, X):
    """
    Run the selected ranking experiments and save results
    """
    logging.debug("Splitting into train and test sets")
    #Make sure different runs get the same train/test split
    numpy.random.seed(21)
    m, n = X.shape
    #colProbs = (X.sum(0)+1)/float(m+1)
    #colProbs = colProbs**-self.algoArgs.itemExp
    #colProbs = numpy.ones(n)/float(n)
    trainTestXs = Sampling.shuffleSplitRows(X, 1, self.algoArgs.testSize)
    trainX, testX = trainTestXs[0]
    logging.debug("Train X shape and nnz: " + str(trainX.shape) + " " + str(trainX.nnz))
    logging.debug("Test X shape and nnz: " + str(testX.shape) + " " + str(testX.nnz))

    #Have scipy versions of each array
    trainXScipy = trainX.toScipyCsc()
    testXScipy = testX.toScipyCsc()

    if self.algoArgs.runSoftImpute:
        logging.debug("Running soft impute")
        resultsFileName = self.resultsDir + "ResultsSoftImpute.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
            modelSelectX, userInds = Sampling.sampleUsers2(trainXScipy, self.algoArgs.modelSelectSamples, prune=True)

            try:
                learner = IterativeSoftImpute(self.algoArgs.rhoSi, eps=self.algoArgs.epsSi, k=self.algoArgs.k, svdAlg=self.algoArgs.svdAlg, postProcess=self.algoArgs.postProcess, p=self.algoArgs.pSi, q=self.algoArgs.qSi)
                learner.folds = self.algoArgs.folds
                learner.metric = self.algoArgs.metric
                learner.numProcesses = self.algoArgs.processes
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    cvInds = Sampling.randCrossValidation(self.algoArgs.folds, modelSelectX.nnz)
                    meanErrors, stdErrors = learner.modelSelect2(modelSelectX, self.algoArgs.rhosSi, self.algoArgs.ks, cvInds)

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanErrors, stdErrors)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)

                self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runMaxLocalAuc:
        logging.debug("Running max local AUC")

        if self.algoArgs.loss != "tanh":
            resultsFileName = self.resultsDir + "ResultsMaxLocalAUC_loss=" + self.algoArgs.loss + ".npz"
        else:
            resultsFileName = self.resultsDir + "ResultsMaxLocalAUC_loss=" + self.algoArgs.loss + "_rho=" + str(self.algoArgs.rhoMlauc) + ".npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                learner = MaxLocalAUC(self.algoArgs.k, 1 - self.algoArgs.u, lmbdaU=self.algoArgs.lmbdaUMlauc, lmbdaV=self.algoArgs.lmbdaVMlauc, eps=self.algoArgs.epsMlauc, stochastic=not self.algoArgs.fullGradient)

                learner.alpha = self.algoArgs.alpha
                learner.alphas = self.algoArgs.alphas
                learner.eta = self.algoArgs.eta
                learner.folds = self.algoArgs.folds
                learner.initialAlg = self.algoArgs.initialAlg
                learner.itemExpP = self.algoArgs.itemExpP
                learner.itemExpQ = self.algoArgs.itemExpQ
                learner.ks = self.algoArgs.ks
                learner.lmbdas = self.algoArgs.lmbdasMlauc
                learner.loss = self.algoArgs.loss
                learner.maxIterations = self.algoArgs.maxIterations
                learner.maxNorms = self.algoArgs.maxNorms
                learner.maxNormU = self.algoArgs.maxNorm
                learner.maxNormV = self.algoArgs.maxNorm
                learner.metric = self.algoArgs.metric
                learner.normalise = self.algoArgs.normalise
                learner.numAucSamples = self.algoArgs.numAucSamples
                learner.numProcesses = self.algoArgs.processes
                learner.numRowSamples = self.algoArgs.numRowSamples
                learner.rate = self.algoArgs.rate
                learner.recommendSize = self.algoArgs.recommendSize
                learner.recordStep = self.algoArgs.recordStep
                learner.rho = self.algoArgs.rhoMlauc
                learner.rhos = self.algoArgs.rhosMlauc
                learner.startAverage = self.algoArgs.startAverage
                learner.t0 = self.algoArgs.t0
                learner.t0s = self.algoArgs.t0s
                learner.validationSize = self.algoArgs.validationSize
                learner.validationUsers = self.algoArgs.validationUsers

                modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")

                if self.algoArgs.modelSelect and not os.path.isfile(modelSelectFileName):
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                    #learningRateSelect must run here: meanMetricsLR is saved below
                    meanMetricsLR, paramDictLR = learner.learningRateSelect(modelSelectX)
                    meanMetricsMS, paramDictMS = learner.modelSelectLmbda(modelSelectX)

                    numpy.savez(modelSelectFileName, meanMetricsLR, meanMetricsMS)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)
                elif self.algoArgs.modelSelect:
                    data = numpy.load(modelSelectFileName)
                    logging.debug("Read model selection file " + modelSelectFileName)
                    meanMetricsLR = data["arr_0"]
                    meanMetricsMS = data["arr_1"]

                    learner.learningRateSelect(meanMetrics=meanMetricsLR)
                    learner.modelSelectLmbda(meanMetrics=meanMetricsMS)

                #Turn on (optionally) parallel SGD only at the final learning stage
                learner.parallelSGD = self.algoArgs.parallelSGD
                learner.maxIterations *= 2
                logging.debug(learner)

                self.recordResults(X, trainX, testX, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runWarpMf:
        logging.debug("Running WARP loss MF")
        resultsFileName = self.resultsDir + "ResultsWarpMf.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                learner = WarpMf(self.algoArgs.k, self.algoArgs.lmbdas[0], u=self.algoArgs.u)
                learner.ks = self.algoArgs.ks
                learner.numProcesses = self.algoArgs.processes
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                    meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                    logging.debug("Mean local AUCs = " + str(meanAucs))
                    logging.debug("Std local AUCs = " + str(stdAucs))

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)

                self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runWrMf:
        logging.debug("Running Weighted Regularized Matrix Factorization")
        resultsFileName = self.resultsDir + "ResultsWrMf.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            trainXScipy = trainXScipy.tocsr()
            testXScipy = testXScipy.tocsr()

            try:
                learner = WeightedMf(self.algoArgs.k, alpha=self.algoArgs.alphaWrMf, lmbda=self.algoArgs.lmbdasWrMf[0], maxIterations=self.algoArgs.maxIterationsWrMf)
                learner.folds = self.algoArgs.folds
                learner.ks = self.algoArgs.ks
                learner.lmbdas = self.algoArgs.lmbdasWrMf
                learner.metric = self.algoArgs.metric
                learner.numProcesses = self.algoArgs.processes
                learner.numRecordAucSamples = self.algoArgs.numRecordAucSamples
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                    meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)

                self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runBpr:
        logging.debug("Running Bayesian Personalised Recommendation")
        resultsFileName = self.resultsDir + "ResultsBpr.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                #trainX = trainX.toScipyCsr()
                #testX = testX.toScipyCsr()

                learner = BprRecommender(self.algoArgs.k, lmbdaUser=self.algoArgs.lmbdaUserBpr, lmbdaPos=self.algoArgs.lmbdaItemBpr, lmbdaNeg=self.algoArgs.lmbdaItemBpr, gamma=self.algoArgs.gammaBpr)
                learner.folds = self.algoArgs.folds
                learner.gammas = self.algoArgs.gammasBpr
                learner.ks = self.algoArgs.ks
                learner.lmbdaItems = self.algoArgs.lmbdaItems
                learner.lmbdaUsers = self.algoArgs.lmbdaUsers
                learner.maxIterations = self.algoArgs.maxIterationsBpr
                learner.metric = self.algoArgs.metric
                #learner.numAucSamples = self.algoArgs.numAucSamples
                learner.numProcesses = self.algoArgs.processes
                learner.recommendSize = self.algoArgs.recommendSize
                learner.recordStep = self.algoArgs.recordStep
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                    meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)

                self.recordResults(X, trainX, testX, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runKnn:
        logging.debug("Running kNN")
        resultsFileName = self.resultsDir + "ResultsKnn.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                trainX = trainX.toScipyCsr()
                testX = testX.toScipyCsr()

                learner = KNNRecommender(self.algoArgs.kns[0])
                learner.numProcesses = self.algoArgs.processes

                logging.debug(learner)

                self.recordResults(X, trainX, testX, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runCLiMF:   # !!!! no model selection
        logging.debug("Running CLiMF")
        resultsFileName = self.resultsDir + "ResultsCLiMF.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)
                modelSelectX = scipy.sparse.csr_matrix(modelSelectX.toScipyCsr(), dtype=numpy.float64)

                trainX = scipy.sparse.csr_matrix(trainX.toScipyCsr(), dtype=numpy.float64)
                testX = testX.toScipyCsr()

                learner = CLiMF(self.algoArgs.k, self.algoArgs.lmbdaCLiMF, self.algoArgs.gammaCLiMF)
                learner.folds = self.algoArgs.folds
                learner.gammas = self.algoArgs.gammasCLiMF
                learner.ks = self.algoArgs.ks
                learner.lmbdas = self.algoArgs.lmbdasCLiMF
                learner.max_iters = self.algoArgs.maxIterCLiMF
                learner.metric = self.algoArgs.metric
                learner.numProcesses = self.algoArgs.processes
                learner.numRecordAucSamples = self.algoArgs.numRecordAucSamples
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize
                learner.verbose = self.algoArgs.verbose

                if self.algoArgs.modelSelect:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))

                    meanObjs, stdObjs = learner.modelSelect(modelSelectX)

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanObjs, stdObjs)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)

                self.recordResults(X, trainX, testX, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    logging.info("All done: see you around!")
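# Every experiment block above follows the same guard: skip if locked or already
# computed, otherwise lock, work inside try, and unlock in finally so a crash never
# leaves a stale lock behind. A minimal sketch of a FileLock with that interface
# (an assumption about the class, not its actual implementation):
import os

class FileLock(object):
    def __init__(self, fileName):
        self.fileName = fileName
        self.lockFileName = fileName + ".lock"

    def isLocked(self):
        # Another process holds the lock if the lock file exists
        return os.path.exists(self.lockFileName)

    def fileExists(self):
        # Results were already computed if the target file exists
        return os.path.exists(self.fileName)

    def lock(self):
        open(self.lockFileName, "w").close()

    def unlock(self):
        if os.path.exists(self.lockFileName):
            os.remove(self.lockFileName)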