def initUV(self, X):
    """
    Build initial factor matrices U (m x k) and V (n x k) for X according to
    self.initialAlg, which must be one of "rand", "svd", "softimpute" or
    "wrmf".

    :param X: the data matrix; only its shape is read for "rand", otherwise
        it is handed to the chosen initialisation learner.
    :return: a pair (U, V) of C-contiguous numpy arrays.
    :raise ValueError: if self.initialAlg is not a recognised algorithm name.
    """
    numRows = X.shape[0]
    numCols = X.shape[1]

    if self.initialAlg == "rand":
        # Small Gaussian entries; scaled down so the initial product is close to zero
        U = numpy.random.randn(numRows, self.k) * 0.1
        V = numpy.random.randn(numCols, self.k) * 0.1
    elif self.initialAlg == "svd":
        logging.debug("Initialising with Randomised SVD")
        U, s, V = RandomisedSVD.svd(X, self.k, self.p, self.q)
        # Fold the singular values into U so U.dot(V.T) approximates X
        U = U * s
    elif self.initialAlg == "softimpute":
        logging.debug("Initialising with softimpute")
        singleMatrixIter = iter([X.toScipyCsc()])
        softImputeLearner = IterativeSoftImpute(0.01, k=self.k, svdAlg="propack", postProcess=True)
        resultIter = softImputeLearner.learnModel(singleMatrixIter)
        U, s, V = resultIter.next()
        U = U * s
    elif self.initialAlg == "wrmf":
        logging.debug("Initialising with wrmf")
        wrmfLearner = WeightedMf(self.k, w=self.w)
        U, V = wrmfLearner.learnModel(X.toScipyCsr())
    else:
        raise ValueError("Unknown initialisation: " + str(self.initialAlg))

    return numpy.ascontiguousarray(U), numpy.ascontiguousarray(V)
def testPredict(self):
    """
    Check IterativeSoftImpute.predict: with lmbda=0 the predictions at the
    requested indices should reproduce the input matrices exactly, and with a
    moderate lmbda each predicted entry should match the corresponding entry
    of the dense reconstruction (U*s).dot(V.T).
    """
    #Create a set of indices
    lmbda = 0.0
    iterativeSoftImpute = IterativeSoftImpute(lmbda, k=10)
    matrixIterator = iter(self.matrixList)
    ZList = iterativeSoftImpute.learnModel(matrixIterator)
    XhatList = iterativeSoftImpute.predict(ZList, self.indsList)

    #Check we get the exact matrices returned
    for i, Xhat in enumerate(XhatList):
        nptst.assert_array_almost_equal(numpy.array(Xhat.todense()), self.matrixList[i].todense())
        #One predicted value per requested index
        self.assertEquals(Xhat.nnz, self.indsList[i].shape[0])
        self.assertAlmostEquals(MCEvaluator.meanSqError(Xhat, self.matrixList[i]), 0)
        self.assertAlmostEquals(MCEvaluator.rootMeanSqError(Xhat, self.matrixList[i]), 0)

    #Try moderate lambda
    lmbda = 0.1
    iterativeSoftImpute = IterativeSoftImpute(lmbda, k=10)
    matrixIterator = iter(self.matrixList)
    #Materialise the results so they can be both indexed (ZList[i]) and iterated
    ZList = list(iterativeSoftImpute.learnModel(matrixIterator))
    XhatList = iterativeSoftImpute.predict(iter(ZList), self.indsList)

    for i, Xhat in enumerate(XhatList):
        for ind in self.indsList[i]:
            #NOTE(review): the dense reconstruction only depends on i and could
            #be hoisted out of the inner loop; left as-is (test-only cost)
            U, s, V = ZList[i]
            Z = (U*s).dot(V.T)
            #indsList holds flat indices, hence unravel_index
            self.assertEquals(Xhat[numpy.unravel_index(ind, Xhat.shape)], Z[numpy.unravel_index(ind, Xhat.shape)])
        self.assertEquals(Xhat.nnz, self.indsList[i].shape[0])
def testPostProcess(self):
    """
    Check that post-processing the SVD in IterativeSoftImpute still recovers
    the input matrices: exactly when every entry is used, approximately when
    only a subsample of entries is used, and that learning still runs with a
    nonzero rho.
    """
    lmbda = 0.0
    eps = 0.1
    k = 20

    matrixIterator = iter(self.matrixList)
    iterativeSoftImpute = IterativeSoftImpute(lmbda, k=k, eps=eps, svdAlg="rsvd", postProcess=True)
    ZList = iterativeSoftImpute.learnModel(matrixIterator)

    #With lmbda=0 and full post-processing we should recover each matrix exactly
    for i, Z in enumerate(ZList):
        U, s, V = Z
        Xhat = (U*s).dot(V.T)
        nptst.assert_array_almost_equal(Xhat, numpy.array(self.matrixList[i].todense()))

    #Try case with iterativeSoftImpute.postProcessSamples < X.nnz
    matrixIterator = iter(self.matrixList)
    iterativeSoftImpute.postProcessSamples = int(self.matrixList[0].nnz/2)
    ZList = iterativeSoftImpute.learnModel(matrixIterator)

    for i, Z in enumerate(ZList):
        U, s, V = Z
        Xhat = (U*s).dot(V.T)
        #Subsampled post-processing is approximate, so compare to 2 decimal places
        nptst.assert_array_almost_equal(Xhat, self.matrixList[i].todense(), 2)

    #Try for larger lambda. Bug fix: the exhausted iterator from the previous
    #pass was reused here, so the final loop never executed; recreate it.
    matrixIterator = iter(self.matrixList)
    iterativeSoftImpute.setRho(0.2)
    ZList = iterativeSoftImpute.learnModel(matrixIterator)

    for i, Z in enumerate(ZList):
        U, s, V = Z
        #With rho > 0 the reconstruction is shrunk, so no exact comparison is
        #possible; just check the factorisation has a usable shape
        Xhat = (U*s).dot(V.T)
        self.assertEquals(Xhat.shape, self.matrixList[i].shape)
def testModelSelect(self):
    """
    Compare IterativeSoftImpute.modelSelect against a manual cross-validation
    over the same rho grid using the plain SoftImpute learner, checking the
    mean test errors agree to 1 decimal place.
    """
    lmbda = 0.1
    shape = (20, 20)
    r = 20
    numInds = 100
    noise = 0.2
    X = ExpSU.SparseUtils.generateSparseLowRank(shape, r, numInds, noise)

    U, s, V = numpy.linalg.svd(X.todense())

    k = 15
    iterativeSoftImpute = IterativeSoftImpute(lmbda, k=None, svdAlg="propack", updateAlg="zero")
    #Single process so results are reproducible against the manual loop below
    iterativeSoftImpute.numProcesses = 1
    rhos = numpy.linspace(0.5, 0.001, 20)
    ks = numpy.array([k], numpy.int)
    folds = 3
    cvInds = Sampling.randCrossValidation(folds, X.nnz)
    meanTestErrors, meanTrainErrors = iterativeSoftImpute.modelSelect(X, rhos, ks, cvInds)

    #Now do model selection manually
    (rowInds, colInds) = X.nonzero()
    trainErrors = numpy.zeros((rhos.shape[0], len(cvInds)))
    testErrors = numpy.zeros((rhos.shape[0], len(cvInds)))

    for i, rho in enumerate(rhos):
        for j, (trainInds, testInds) in enumerate(cvInds):
            #Scatter the held-in/held-out nonzero entries into fresh sparse matrices
            trainX = scipy.sparse.csc_matrix(X.shape)
            testX = scipy.sparse.csc_matrix(X.shape)

            for p in trainInds:
                trainX[rowInds[p], colInds[p]] = X[rowInds[p], colInds[p]]

            for p in testInds:
                testX[rowInds[p], colInds[p]] = X[rowInds[p], colInds[p]]

            softImpute = SoftImpute(numpy.array([rho]), k=ks[0])
            ZList = [softImpute.learnModel(trainX, fullMatrices=False)]

            predTrainX = softImpute.predict(ZList, trainX.nonzero())[0]
            predX = softImpute.predict(ZList, testX.nonzero())[0]

            testErrors[i, j] = MCEvaluator.rootMeanSqError(testX, predX)
            trainErrors[i, j] = MCEvaluator.rootMeanSqError(trainX, predTrainX)

    #Average errors over the folds
    meanTestErrors2 = testErrors.mean(1)
    meanTrainErrors2 = trainErrors.mean(1)

    #Loose (1 d.p.) comparison since the two code paths may differ numerically
    nptst.assert_array_almost_equal(meanTestErrors.ravel(), meanTestErrors2, 1)
def testModelSelect2(self):
    """
    Check modelSelect over a (rho, k) grid in the degenerate case where every
    fold trains and tests on all the nonzero entries: the per-fold standard
    deviation must be ~0 and the mean errors must match errors computed by
    fitting on the full matrix directly.
    """
    rho = 0.1
    shape = (20, 20)
    r = 20
    numInds = 100
    noise = 0.2
    X = ExpSU.SparseUtils.generateSparseLowRank(shape, r, numInds, noise)
    X = X.tocsc()

    U, s, V = numpy.linalg.svd(X.todense())

    k = 15
    iterativeSoftImpute = IterativeSoftImpute(rho, k=None, svdAlg="propack", updateAlg="initial")
    rhos = numpy.linspace(0.5, 0.001, 5)
    ks = numpy.array([5, 10, 15], numpy.int)
    folds = 3
    #Every fold uses all entries for both train and test, so folds are identical
    cvInds = []
    for i in range(folds):
        cvInds.append((numpy.arange(X.nnz), numpy.arange(X.nnz)))

    meanTestErrors, stdTestErrors = iterativeSoftImpute.modelSelect(X, rhos, ks, cvInds)

    #Identical folds imply zero variance across folds
    self.assertAlmostEquals(numpy.linalg.norm(stdTestErrors), 0, 3)

    meanTestErrors2 = numpy.zeros((rhos.shape[0], ks.shape[0]))

    #Now compute errors manually
    for j, k in enumerate(ks):
        iterativeSoftImpute.setK(k)
        for i, rho in enumerate(rhos):
            iterativeSoftImpute.setRho(rho)
            ZIter = iterativeSoftImpute.learnModel(iter([X]))
            indList = [X.nonzero()]
            outIterator = iterativeSoftImpute.predict(ZIter, indList)
            Xhat = outIterator.next()
            meanTestErrors2[i, j] = MCEvaluator.rootMeanSqError(X, Xhat)

    nptst.assert_array_almost_equal(meanTestErrors, meanTestErrors2, 2)
def testLearnModel2(self):
    """
    Check IterativeSoftImpute with the randomised SVD backend: exact recovery
    when lmbda=0, agreement with the batch SoftImpute class for several rho
    values, and that each solution approximately satisfies the soft-impute
    fixed point Z = S_lambda(X + Z^\bot_\omega).
    """
    #Test the SVD updating solution in the case where we get an exact solution
    lmbda = 0.0
    eps = 0.1
    k = 20

    matrixIterator = iter(self.matrixList)
    iterativeSoftImpute = IterativeSoftImpute(lmbda, k=k, eps=eps, svdAlg="rsvd")
    ZList = iterativeSoftImpute.learnModel(matrixIterator)

    #Check that ZList is the same as XList
    for i, Z in enumerate(ZList):
        U, s, V = Z
        Xhat = (U*s).dot(V.T)
        nptst.assert_array_almost_equal(Xhat, self.matrixList[i].todense())

    #Compare solution with that of SoftImpute class
    rhoList = [0.1, 0.2, 0.5, 1.0]

    for rho in rhoList:
        iterativeSoftImpute = IterativeSoftImpute(rho, k=k, eps=eps, svdAlg="rsvd", updateAlg="zero")
        matrixIterator = iter(self.matrixList)
        ZList = iterativeSoftImpute.learnModel(matrixIterator)

        rhos = numpy.array([rho])
        softImpute = SoftImpute(rhos, k=k, eps=eps)
        #Batch reference solutions, one per matrix in the fixture list
        Z1 = softImpute.learnModel(self.matrixList[0])
        Z2 = softImpute.learnModel(self.matrixList[1])
        Z3 = softImpute.learnModel(self.matrixList[2])
        ZList2 = [Z1, Z2, Z3]

        for j, Zhat in enumerate(ZList):
            U, s, V = Zhat
            Z = (U*s).dot(V.T)
            nptst.assert_array_almost_equal(Z, ZList2[j].todense())

            #Also test with true solution Z = S_lambda(X + Z^\bot_\omega)
            #Zomega holds Z restricted to the observed entries of X
            Zomega = numpy.zeros(self.matrixList[j].shape)
            rowInds, colInds = self.matrixList[j].nonzero()
            for i in range(self.matrixList[j].nonzero()[0].shape[0]):
                Zomega[rowInds[i], colInds[i]] = Z[rowInds[i], colInds[i]]

            #rho is relative: scale by the largest singular value of X
            U, s, V = ExpSU.SparseUtils.svdArpack(self.matrixList[j], 1, kmax=20)
            lmbda = rho*numpy.max(s)

            U, s, V = ExpSU.SparseUtils.svdSoft(numpy.array(self.matrixList[j]-Zomega+Z), lmbda)
            tol = 0.1
            self.assertTrue(numpy.linalg.norm(Z -(U*s).dot(V.T))**2 < tol)
def testLearnModel(self):
    """
    Check IterativeSoftImpute with the propack SVD backend: exact recovery
    when lmbda=0, agreement with the batch SoftImpute class for several
    lambda values, and that each solution approximately satisfies the
    soft-impute fixed point Z = S_lambda(X + Z^\bot_\omega).
    """
    lmbda = 0.0
    eps = 0.1
    k = 10

    matrixIterator = iter(self.matrixList)
    iterativeSoftImpute = IterativeSoftImpute(lmbda, k=k, eps=eps, svdAlg="propack")
    ZList = iterativeSoftImpute.learnModel(matrixIterator)

    #Check that ZList is the same as XList
    for i, Z in enumerate(ZList):
        U, s, V = Z
        Xhat = (U*s).dot(V.T)
        nptst.assert_array_almost_equal(Xhat, numpy.array(self.matrixList[i].todense()))

    #Compare solution with that of SoftImpute class
    lmbdaList = [0.1, 0.2, 0.5, 1.0]

    for lmbda in lmbdaList:
        iterativeSoftImpute = IterativeSoftImpute(lmbda, k=k, eps=eps, svdAlg="propack", updateAlg="zero")
        matrixIterator = iter(self.matrixList)
        ZList = iterativeSoftImpute.learnModel(matrixIterator)

        lmbdas = numpy.array([lmbda])
        softImpute = SoftImpute(lmbdas, k=k, eps=eps)
        #Batch reference solutions, one per matrix in the fixture list
        Z1 = softImpute.learnModel(self.matrixList[0])
        Z2 = softImpute.learnModel(self.matrixList[1])
        Z3 = softImpute.learnModel(self.matrixList[2])
        ZList2 = [Z1, Z2, Z3]

        for j, Zhat in enumerate(ZList):
            U, s, V = Zhat
            Z = (U*s).dot(V.T)
            nptst.assert_array_almost_equal(Z, ZList2[j].todense())

            #Also test with true solution Z = S_lambda(X + Z^\bot_\omega)
            #Zomega holds Z restricted to the observed entries of X
            Zomega = numpy.zeros(self.matrixList[j].shape)
            rowInds, colInds = self.matrixList[j].nonzero()
            for i in range(self.matrixList[j].nonzero()[0].shape[0]):
                Zomega[rowInds[i], colInds[i]] = Z[rowInds[i], colInds[i]]

            U, s, V = ExpSU.SparseUtils.svdSoft(numpy.array(self.matrixList[j]-Zomega+Z), lmbda)
            tol = 0.1
            self.assertTrue(numpy.linalg.norm(Z -(U*s).dot(V.T))**2 < tol)
#Experiment configuration: model-selection grids, file locations and the
#recommenders (soft impute, weighted MF) to run on the coauthor data.
#NOTE(review): k, maxIterations and args are assumed to be defined earlier in
#this script - confirm against the surrounding file.
modelSelect = args.modelSelect
folds = 3
ks = numpy.array([64, 128, 256])
rhosSi = numpy.linspace(1.0, 0.0, 5)
overwrite = args.overwrite
datasets = ["Keyword", "Document"]

resultsDir = PathDefaults.getOutputDir() + "coauthors/"
contactsFilename = PathDefaults.getDataDir() + "reference/contacts_anonymised.tsv"
interestsFilename = PathDefaults.getDataDir() + "reference/author_interest"

#Create all the recommendation algorithms
softImpute = IterativeSoftImpute(k=k, postProcess=True, svdAlg="rsvd")
softImpute.maxIterations = maxIterations
softImpute.metric = "f1"
#q and p are the randomised SVD power iteration / oversampling parameters
softImpute.q = 3
softImpute.p = 10
softImpute.rho = 0.1
softImpute.eps = 10**-4
softImpute.numProcesses = args.processes

wrmf = WeightedMf(k=k, maxIterations=maxIterations, alpha=1.0)
wrmf.ks = ks
wrmf.folds = folds
#Regularisation grid: decreasing powers of two
wrmf.lmbdas = 2.0**-numpy.arange(-1, 12, 2)
wrmf.metric = "f1"
wrmf.numProcesses = args.processes
def testWeightedLearning(self):
    """
    Compare weighted and unweighted IterativeSoftImpute. With rho=0 the two
    must coincide; with a non-uniform sparsity pattern and rho=0.5 the test
    errors of both variants are computed for comparison.
    """
    #See if the weighted learning has any effect
    shape = (20, 20)
    r = 20
    numInds = 100
    noise = 0.2
    X = ExpSU.SparseUtils.generateSparseLowRank(shape, r, numInds, noise)

    rho = 0.0
    iterativeSoftImpute = IterativeSoftImpute(rho, k=10, weighted=True)
    iterX = iter([X])
    resultIter = iterativeSoftImpute.learnModel(iterX)
    Z = resultIter.next()

    iterativeSoftImpute = IterativeSoftImpute(rho, k=10, weighted=False)
    iterX = iter([X])
    resultIter = iterativeSoftImpute.learnModel(iterX)
    Z2 = resultIter.next()

    #Check results when rho=0: reconstructions and singular values must agree
    nptst.assert_array_almost_equal((Z[0]*Z[1]).dot(Z[2].T), (Z2[0]*Z2[1]).dot(Z2[2].T))
    nptst.assert_array_almost_equal(Z[1], Z2[1])

    #Then check non-uniform matrix - entries clustered around middle indices
    shape = (20, 15)
    numInds = 200
    maxInd = (shape[0]*shape[1]-1)
    #Flat indices drawn from a Gaussian centred on the middle of the matrix
    nzInds = numpy.array(numpy.random.randn(numInds)*maxInd/4 + maxInd/2, numpy.int)
    trainInds = nzInds[0:int(nzInds.shape[0]/2)]
    testInds = nzInds[int(nzInds.shape[0]/2):]
    trainInds = numpy.unique(numpy.clip(trainInds, 0, maxInd))
    testInds = numpy.unique(numpy.clip(testInds, 0, maxInd))

    trainX = ExpSU.SparseUtils.generateSparseLowRank(shape, r, trainInds, noise)
    testX = ExpSU.SparseUtils.generateSparseLowRank(shape, r, testInds, noise)

    #Error using weighted soft impute
    #print("Running weighted soft impute")
    rho = 0.5
    iterativeSoftImpute = IterativeSoftImpute(rho, k=10, weighted=True)
    iterX = iter([trainX])
    resultIter = iterativeSoftImpute.learnModel(iterX)
    Z = resultIter.next()

    iterTestX = iter([testX])
    predX = iterativeSoftImpute.predictOne(Z, testX.nonzero())
    error = MCEvaluator.rootMeanSqError(testX, predX)
    #print(error)

    iterativeSoftImpute = IterativeSoftImpute(rho, k=10, weighted=False)
    iterX = iter([trainX])
    resultIter = iterativeSoftImpute.learnModel(iterX)
    Z = resultIter.next()

    iterTestX = iter([testX])
    predX = iterativeSoftImpute.predictOne(Z, testX.nonzero())
    #NOTE(review): both errors are computed but never asserted against each
    #other, so this half of the test only checks the code runs - confirm
    #whether a comparison was intended
    error = MCEvaluator.rootMeanSqError(testX, predX)
def runExperiment(self):
    """
    Run the selected clustering experiments and save results.

    For each enabled learner (soft impute over the configured SVD algorithms,
    and SGD matrix factorisation) this: builds a results file name, takes a
    file lock so concurrent runs don't duplicate work, optionally performs
    model selection on the first training matrix, then learns on the full
    training iterator and records the results.
    """
    if self.algoArgs.runSoftImpute:
        logging.debug("Running soft impute")

        for svdAlg in self.algoArgs.svdAlgs:
            #Randomised SVD variants record their p/q parameters in the file name
            if svdAlg == "rsvd" or svdAlg == "rsvdUpdate" or svdAlg == "rsvdUpdate2":
                resultsFileName = self.resultsDir + "ResultsSoftImpute_alg=" + svdAlg + "_p=" + str(self.algoArgs.p)+ "_q=" + str(self.algoArgs.q) + "_updateAlg=" + self.algoArgs.updateAlg + ".npz"
            else:
                resultsFileName = self.resultsDir + "ResultsSoftImpute_alg=" + svdAlg + "_updateAlg=" + self.algoArgs.updateAlg + ".npz"

            fileLock = FileLock(resultsFileName)

            if not fileLock.isLocked() and not fileLock.fileExists():
                fileLock.lock()

                try:
                    learner = IterativeSoftImpute(svdAlg=svdAlg, logStep=self.logStep, kmax=self.algoArgs.kmax, postProcess=self.algoArgs.postProcess, weighted=self.algoArgs.weighted, p=self.algoArgs.p, q=self.algoArgs.q, verbose=self.algoArgs.verbose, updateAlg=self.algoArgs.updateAlg)

                    if self.algoArgs.modelSelect:
                        trainIterator = self.getTrainIterator()
                        #Let's find the optimal lambda using the first matrix
                        X = trainIterator.next()

                        logging.debug("Performing model selection, taking subsample of entries of size " + str(self.sampleSize))
                        X = SparseUtils.submatrix(X, self.sampleSize)

                        cvInds = Sampling.randCrossValidation(self.algoArgs.folds, X.nnz)
                        meanErrors, stdErrors = learner.modelSelect(X, self.algoArgs.rhos, self.algoArgs.ks, cvInds)

                        logging.debug("Mean errors = " + str(meanErrors))
                        logging.debug("Std errors = " + str(stdErrors))

                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanErrors, stdErrors)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                        #Pick the (rho, k) pair minimising the mean error; compute the argmin once
                        bestInds = numpy.unravel_index(numpy.argmin(meanErrors), meanErrors.shape)
                        rho = self.algoArgs.rhos[bestInds[0]]
                        k = self.algoArgs.ks[bestInds[1]]
                    else:
                        rho = self.algoArgs.rhos[0]
                        k = self.algoArgs.ks[0]

                    learner.setK(k)
                    learner.setRho(rho)
                    logging.debug(learner)

                    trainIterator = self.getTrainIterator()
                    ZIter = learner.learnModel(trainIterator)

                    self.recordResults(ZIter, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runSgdMf:
        logging.debug("Running SGD MF")

        resultsFileName = self.resultsDir + "ResultsSgdMf.npz"
        fileLock = FileLock(resultsFileName)

        if not fileLock.isLocked() and not fileLock.fileExists():
            fileLock.lock()

            try:
                learner = IterativeSGDNorm2Reg(k=self.algoArgs.ks[0], lmbda=self.algoArgs.lmbdas[0], gamma=self.algoArgs.gammas[0], eps=self.algoArgs.eps)

                if self.algoArgs.modelSelect:
                    # Let's find optimal parameters using the first matrix
                    learner.modelSelect(self.getTrainIterator().next(), self.algoArgs.ks, self.algoArgs.lmbdas, self.algoArgs.gammas, self.algoArgs.folds)

                #Fixed: the train iterator was previously created twice in a row
                trainIterator = self.getTrainIterator()
                ZIter = learner.learnModel(trainIterator)

                self.recordResults(ZIter, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    logging.info("All done: see you around!")
#MaxLocalAUC hyperparameter configuration for the training run
maxLocalAuc.maxNormV = 100
maxLocalAuc.metric = "f1"
maxLocalAuc.normalise = True
maxLocalAuc.numAucSamples = 10
maxLocalAuc.numProcesses = 1
maxLocalAuc.numRecordAucSamples = 100
maxLocalAuc.numRowSamples = 30
maxLocalAuc.rate = "constant"
maxLocalAuc.recordStep = 10
maxLocalAuc.rho = 1.0
maxLocalAuc.t0 = 1.0
#Learning-rate t0 candidate grid: decreasing powers of two
maxLocalAuc.t0s = 2.0**-numpy.arange(7, 12, 1)
maxLocalAuc.validationSize = 3
maxLocalAuc.validationUsers = 0

#NOTE(review): k2 is assumed to be defined earlier in this script
softImpute = IterativeSoftImpute(k=k2, postProcess=True)

numProcesses = multiprocessing.cpu_count()
#Reset the CPU affinity of this process so all cores can be used
os.system('taskset -p 0xffffffff %d' % os.getpid())

logging.debug("Starting training")

def computeTestAuc(args):
    #Learn U, V on the training matrix (with the given warm start) and compute
    #ROC curves; fixed seed so parallel workers are reproducible.
    #NOTE(review): this function appears to continue beyond this chunk.
    trainX, testX, maxLocalAuc, U, V = args
    numpy.random.seed(21)
    logging.debug(maxLocalAuc)

    #maxLocalAuc.learningRateSelect(trainX)
    U, V, trainMeasures, testMeasures, iterations, time = maxLocalAuc.learnModel(trainX, U=U, V=V, verbose=True)

    fprTrain, tprTrain = MCEvaluator.averageRocCurve(trainX, U, V)
def runExperiment(self, X):
    """
    Run the selected ranking experiments and save results

    Splits X into train/test rows once (fixed seed), then for each enabled
    recommender (soft impute, max local AUC, WARP MF, weighted regularised MF,
    BPR, kNN, CLiMF): takes a file lock, optionally performs model selection
    on a subsample of users, learns the model and records the results.

    :param X: the full user-item data matrix to split and evaluate on.
    """
    logging.debug("Splitting into train and test sets")
    #Make sure different runs get the same train/test split
    numpy.random.seed(21)
    m, n = X.shape
    #colProbs = (X.sum(0)+1)/float(m+1)
    #colProbs = colProbs**-self.algoArgs.itemExp
    #colProbs = numpy.ones(n)/float(n)
    trainTestXs = Sampling.shuffleSplitRows(X, 1, self.algoArgs.testSize)
    trainX, testX = trainTestXs[0]
    logging.debug("Train X shape and nnz: " + str(trainX.shape) + " " + str(trainX.nnz))
    logging.debug("Test X shape and nnz: " + str(testX.shape) + " " + str(testX.nnz))

    #Have scipy versions of each array
    trainXScipy = trainX.toScipyCsc()
    testXScipy = testX.toScipyCsc()

    if self.algoArgs.runSoftImpute:
        logging.debug("Running soft impute")
        resultsFileName = self.resultsDir + "ResultsSoftImpute.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            #NOTE(review): this subsample is taken (and logged) even when
            #modelSelect is off, in which case it is unused - confirm intent
            logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
            modelSelectX, userInds = Sampling.sampleUsers2(trainXScipy, self.algoArgs.modelSelectSamples, prune=True)

            try:
                learner = IterativeSoftImpute(self.algoArgs.rhoSi, eps=self.algoArgs.epsSi, k=self.algoArgs.k, svdAlg=self.algoArgs.svdAlg, postProcess=self.algoArgs.postProcess, p=self.algoArgs.pSi, q=self.algoArgs.qSi)
                learner.folds = self.algoArgs.folds
                learner.metric = self.algoArgs.metric
                learner.numProcesses = self.algoArgs.processes
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    cvInds = Sampling.randCrossValidation(self.algoArgs.folds, modelSelectX.nnz)
                    meanErrors, stdErrors = learner.modelSelect2(modelSelectX, self.algoArgs.rhosSi, self.algoArgs.ks, cvInds)

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanErrors, stdErrors)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)

                self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runMaxLocalAuc:
        logging.debug("Running max local AUC")

        #The tanh loss records its rho parameter in the results file name
        if self.algoArgs.loss != "tanh":
            resultsFileName = self.resultsDir + "ResultsMaxLocalAUC_loss=" + self.algoArgs.loss + ".npz"
        else:
            resultsFileName = self.resultsDir + "ResultsMaxLocalAUC_loss=" + self.algoArgs.loss + "_rho=" + str(self.algoArgs.rhoMlauc) + ".npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                learner = MaxLocalAUC(self.algoArgs.k, 1-self.algoArgs.u, lmbdaU=self.algoArgs.lmbdaUMlauc, lmbdaV=self.algoArgs.lmbdaVMlauc, eps=self.algoArgs.epsMlauc, stochastic=not self.algoArgs.fullGradient)

                learner.alpha = self.algoArgs.alpha
                learner.alphas = self.algoArgs.alphas
                learner.eta = self.algoArgs.eta
                learner.folds = self.algoArgs.folds
                learner.initialAlg = self.algoArgs.initialAlg
                learner.itemExpP = self.algoArgs.itemExpP
                learner.itemExpQ = self.algoArgs.itemExpQ
                learner.ks = self.algoArgs.ks
                learner.lmbdas = self.algoArgs.lmbdasMlauc
                learner.loss = self.algoArgs.loss
                learner.maxIterations = self.algoArgs.maxIterations
                learner.maxNorms = self.algoArgs.maxNorms
                learner.maxNormU = self.algoArgs.maxNorm
                learner.maxNormV = self.algoArgs.maxNorm
                learner.metric = self.algoArgs.metric
                learner.normalise = self.algoArgs.normalise
                learner.numAucSamples = self.algoArgs.numAucSamples
                learner.numProcesses = self.algoArgs.processes
                learner.numRowSamples = self.algoArgs.numRowSamples
                learner.rate = self.algoArgs.rate
                learner.recommendSize = self.algoArgs.recommendSize
                learner.recordStep = self.algoArgs.recordStep
                learner.rho = self.algoArgs.rhoMlauc
                learner.rhos = self.algoArgs.rhosMlauc
                learner.startAverage = self.algoArgs.startAverage
                learner.t0 = self.algoArgs.t0
                learner.t0s = self.algoArgs.t0s
                learner.validationSize = self.algoArgs.validationSize
                learner.validationUsers = self.algoArgs.validationUsers

                modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")

                #Model selection results are cached in modelSelectFileName and
                #reloaded on subsequent runs instead of being recomputed
                if self.algoArgs.modelSelect and not os.path.isfile(modelSelectFileName):
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                    # meanMetricsLR, paramDictLR = learner.learningRateSelect(modelSelectX)
                    meanMetricsMS, paramDictMS = learner.modelSelectLmbda(modelSelectX)

                    #NOTE(review): meanMetricsLR is only assigned by the
                    #commented-out learningRateSelect call above, so this savez
                    #raises NameError if this branch runs - confirm whether the
                    #call should be restored
                    numpy.savez(modelSelectFileName, meanMetricsLR, meanMetricsMS)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)
                elif self.algoArgs.modelSelect:
                    data = numpy.load(modelSelectFileName)
                    logging.debug("Read model selection file " + modelSelectFileName)
                    meanMetricsLR = data["arr_0"]
                    meanMetricsMS = data["arr_1"]
                    learner.learningRateSelect(meanMetrics=meanMetricsLR)
                    learner.modelSelectLmbda(meanMetrics=meanMetricsMS)

                #Turn on (optionally) parallel SGD only at the final learning stage
                learner.parallelSGD = self.algoArgs.parallelSGD
                learner.maxIterations *= 2
                logging.debug(learner)

                self.recordResults(X, trainX, testX, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runWarpMf:
        logging.debug("Running WARP loss MF")
        resultsFileName = self.resultsDir + "ResultsWarpMf.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                learner = WarpMf(self.algoArgs.k, self.algoArgs.lmbdas[0], u=self.algoArgs.u)
                learner.ks = self.algoArgs.ks
                learner.numProcesses = self.algoArgs.processes
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                    meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                    logging.debug("Mean local AUCs = " + str(meanAucs))
                    logging.debug("Std local AUCs = " + str(stdAucs))

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)

                self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runWrMf:
        logging.debug("Running Weighted Regularized Matrix Factorization")

        resultsFileName = self.resultsDir + "ResultsWrMf.npz"
        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            #WeightedMf works on CSR matrices
            trainXScipy = trainXScipy.tocsr()
            testXScipy = testXScipy.tocsr()

            try:
                learner = WeightedMf(self.algoArgs.k, alpha=self.algoArgs.alphaWrMf, lmbda=self.algoArgs.lmbdasWrMf[0], maxIterations=self.algoArgs.maxIterationsWrMf)
                learner.folds = self.algoArgs.folds
                learner.ks = self.algoArgs.ks
                learner.lmbdas = self.algoArgs.lmbdasWrMf
                learner.metric = self.algoArgs.metric
                learner.numProcesses = self.algoArgs.processes
                learner.numRecordAucSamples = self.algoArgs.numRecordAucSamples
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                    meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)

                self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runBpr:
        logging.debug("Running Bayesian Personalised Recommendation")
        resultsFileName = self.resultsDir + "ResultsBpr.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                #trainX = trainX.toScipyCsr()
                #testX = testX.toScipyCsr()

                #NOTE(review): lmbdaItemBpr is used for both the positive and
                #negative item regularisers - confirm that is intended
                learner = BprRecommender(self.algoArgs.k, lmbdaUser=self.algoArgs.lmbdaUserBpr, lmbdaPos=self.algoArgs.lmbdaItemBpr, lmbdaNeg=self.algoArgs.lmbdaItemBpr, gamma=self.algoArgs.gammaBpr)
                learner.folds = self.algoArgs.folds
                learner.gammas = self.algoArgs.gammasBpr
                learner.ks = self.algoArgs.ks
                learner.lmbdaItems = self.algoArgs.lmbdaItems
                learner.lmbdaUsers = self.algoArgs.lmbdaUsers
                learner.maxIterations = self.algoArgs.maxIterationsBpr
                learner.metric = self.algoArgs.metric
                #learner.numAucSamples = self.algoArgs.numAucSamples
                learner.numProcesses = self.algoArgs.processes
                learner.recommendSize = self.algoArgs.recommendSize
                learner.recordStep = self.algoArgs.recordStep
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                    meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)

                self.recordResults(X, trainX, testX, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runKnn:
        logging.debug("Running kNN")
        resultsFileName = self.resultsDir + "ResultsKnn.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                trainX = trainX.toScipyCsr()
                testX = testX.toScipyCsr()

                learner = KNNRecommender(self.algoArgs.kns[0])
                learner.numProcesses = self.algoArgs.processes

                logging.debug(learner)

                self.recordResults(X, trainX, testX, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runCLiMF:
        # !!!! no model selection
        logging.debug("Running CLiMF")
        resultsFileName = self.resultsDir + "ResultsCLiMF.npz"

        fileLock = FileLock(resultsFileName)

        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                #CLiMF requires float64 CSR input
                logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)
                modelSelectX = scipy.sparse.csr_matrix(modelSelectX.toScipyCsr(), dtype=numpy.float64)
                trainX = scipy.sparse.csr_matrix(trainX.toScipyCsr(), dtype=numpy.float64)
                testX = testX.toScipyCsr()

                learner = CLiMF(self.algoArgs.k, self.algoArgs.lmbdaCLiMF, self.algoArgs.gammaCLiMF)
                learner.folds = self.algoArgs.folds
                learner.gammas = self.algoArgs.gammasCLiMF
                learner.ks = self.algoArgs.ks
                learner.lmbdas = self.algoArgs.lmbdasCLiMF
                learner.max_iters = self.algoArgs.maxIterCLiMF
                learner.metric = self.algoArgs.metric
                learner.numProcesses = self.algoArgs.processes
                learner.numRecordAucSamples = self.algoArgs.numRecordAucSamples
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize
                learner.verbose = self.algoArgs.verbose

                if self.algoArgs.modelSelect:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    meanObjs, stdObjs = learner.modelSelect(modelSelectX)

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanObjs, stdObjs)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)

                self.recordResults(X, trainX, testX, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    logging.info("All done: see you around!")