# Top-level experiment configuration: output/data paths plus default
# hyper-parameters for each recommender used in the co-author experiments.
# NOTE(review): `args`, `k`, `ks`, `folds` and `maxIterations` are not defined
# in this chunk — presumably set earlier in the script (argparse etc.); confirm.
overwrite = args.overwrite
datasets = ["Keyword", "Document"]

# Output directory for results and input files for the contact/interest data.
resultsDir = PathDefaults.getOutputDir() + "coauthors/"
contactsFilename = PathDefaults.getDataDir() + "reference/contacts_anonymised.tsv"
interestsFilename = PathDefaults.getDataDir() + "reference/author_interest"

#Create all the recommendation algorithms
# Iterative soft-impute matrix completion, randomised SVD backend.
softImpute = IterativeSoftImpute(k=k, postProcess=True, svdAlg="rsvd")
softImpute.maxIterations = maxIterations
softImpute.metric = "f1"
softImpute.q = 3
softImpute.p = 10
softImpute.rho = 0.1
softImpute.eps = 10**-4
softImpute.numProcesses = args.processes

# Weighted regularised matrix factorisation; lmbdas form a log-spaced grid
# 2^1, 2^-1, ..., 2^-11 for model selection.
wrmf = WeightedMf(k=k, maxIterations=maxIterations, alpha=1.0)
wrmf.ks = ks
wrmf.folds = folds
wrmf.lmbdas = 2.0**-numpy.arange(-1, 12, 2)
wrmf.metric = "f1"
wrmf.numProcesses = args.processes

# Local-AUC maximisation via (parallel) stochastic gradient descent.
maxLocalAuc = MaxLocalAUC(k=k, w=0.9, maxIterations=50, lmbdaU=0.1, lmbdaV=0.1, stochastic=True)
maxLocalAuc.numRowSamples = 10
maxLocalAuc.parallelSGD = True
maxLocalAuc.initialAlg = "rand"
maxLocalAuc.ks = ks
def runExperiment(self, X):
    """
    Run the selected ranking experiments on the data matrix X and save results.

    X is split once (with a fixed seed) into train/test sets, then each
    algorithm enabled in self.algoArgs (soft impute, MaxLocalAUC, WARP MF,
    WRMF, BPR, kNN, CLiMF) is optionally model-selected, trained, and its
    results written to an .npz file under self.resultsDir. A FileLock per
    results file prevents concurrent/duplicate computation unless
    self.algoArgs.overwrite is set.
    """
    logging.debug("Splitting into train and test sets")
    #Make sure different runs get the same train/test split
    numpy.random.seed(21)
    m, n = X.shape
    #colProbs = (X.sum(0)+1)/float(m+1)
    #colProbs = colProbs**-self.algoArgs.itemExp
    #colProbs = numpy.ones(n)/float(n)
    trainTestXs = Sampling.shuffleSplitRows(X, 1, self.algoArgs.testSize)
    trainX, testX = trainTestXs[0]
    logging.debug("Train X shape and nnz: " + str(trainX.shape) + " " + str(trainX.nnz))
    logging.debug("Test X shape and nnz: " + str(testX.shape) + " " + str(testX.nnz))

    #Have scipy versions of each array
    trainXScipy = trainX.toScipyCsc()
    testXScipy = testX.toScipyCsc()

    if self.algoArgs.runSoftImpute:
        logging.debug("Running soft impute")
        resultsFileName = self.resultsDir + "ResultsSoftImpute.npz"

        fileLock = FileLock(resultsFileName)
        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()
            logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
            modelSelectX, userInds = Sampling.sampleUsers2(trainXScipy, self.algoArgs.modelSelectSamples, prune=True)

            try:
                learner = IterativeSoftImpute(self.algoArgs.rhoSi, eps=self.algoArgs.epsSi, k=self.algoArgs.k, svdAlg=self.algoArgs.svdAlg, postProcess=self.algoArgs.postProcess, p=self.algoArgs.pSi, q=self.algoArgs.qSi)
                learner.folds = self.algoArgs.folds
                learner.metric = self.algoArgs.metric
                learner.numProcesses = self.algoArgs.processes
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    cvInds = Sampling.randCrossValidation(self.algoArgs.folds, modelSelectX.nnz)
                    meanErrors, stdErrors = learner.modelSelect2(modelSelectX, self.algoArgs.rhosSi, self.algoArgs.ks, cvInds)

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanErrors, stdErrors)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)
                self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runMaxLocalAuc:
        logging.debug("Running max local AUC")

        # The tanh loss is additionally parameterised by rho, so include it in
        # the results filename to keep runs distinguishable.
        if self.algoArgs.loss != "tanh":
            resultsFileName = self.resultsDir + "ResultsMaxLocalAUC_loss=" + self.algoArgs.loss + ".npz"
        else:
            resultsFileName = self.resultsDir + "ResultsMaxLocalAUC_loss=" + self.algoArgs.loss + "_rho=" + str(self.algoArgs.rhoMlauc) + ".npz"

        fileLock = FileLock(resultsFileName)
        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                learner = MaxLocalAUC(self.algoArgs.k, 1-self.algoArgs.u, lmbdaU=self.algoArgs.lmbdaUMlauc, lmbdaV=self.algoArgs.lmbdaVMlauc, eps=self.algoArgs.epsMlauc, stochastic=not self.algoArgs.fullGradient)

                learner.alpha = self.algoArgs.alpha
                learner.alphas = self.algoArgs.alphas
                learner.eta = self.algoArgs.eta
                learner.folds = self.algoArgs.folds
                learner.initialAlg = self.algoArgs.initialAlg
                learner.itemExpP = self.algoArgs.itemExpP
                learner.itemExpQ = self.algoArgs.itemExpQ
                learner.ks = self.algoArgs.ks
                learner.lmbdas = self.algoArgs.lmbdasMlauc
                learner.loss = self.algoArgs.loss
                learner.maxIterations = self.algoArgs.maxIterations
                learner.maxNorms = self.algoArgs.maxNorms
                learner.maxNormU = self.algoArgs.maxNorm
                learner.maxNormV = self.algoArgs.maxNorm
                learner.metric = self.algoArgs.metric
                learner.normalise = self.algoArgs.normalise
                learner.numAucSamples = self.algoArgs.numAucSamples
                learner.numProcesses = self.algoArgs.processes
                learner.numRowSamples = self.algoArgs.numRowSamples
                learner.rate = self.algoArgs.rate
                learner.recommendSize = self.algoArgs.recommendSize
                learner.recordStep = self.algoArgs.recordStep
                learner.rho = self.algoArgs.rhoMlauc
                learner.rhos = self.algoArgs.rhosMlauc
                learner.startAverage = self.algoArgs.startAverage
                learner.t0 = self.algoArgs.t0
                learner.t0s = self.algoArgs.t0s
                learner.validationSize = self.algoArgs.validationSize
                learner.validationUsers = self.algoArgs.validationUsers

                modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")

                if self.algoArgs.modelSelect and not os.path.isfile(modelSelectFileName):
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                    # BUGFIX: this call was commented out, leaving meanMetricsLR
                    # undefined in the savez() below (NameError); the reload
                    # branch further down also expects both arrays in the file.
                    meanMetricsLR, paramDictLR = learner.learningRateSelect(modelSelectX)
                    meanMetricsMS, paramDictMS = learner.modelSelectLmbda(modelSelectX)

                    numpy.savez(modelSelectFileName, meanMetricsLR, meanMetricsMS)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)
                elif self.algoArgs.modelSelect:
                    # Reuse a previously-saved model selection grid.
                    data = numpy.load(modelSelectFileName)
                    logging.debug("Read model selection file " + modelSelectFileName)
                    meanMetricsLR = data["arr_0"]
                    meanMetricsMS = data["arr_1"]
                    learner.learningRateSelect(meanMetrics=meanMetricsLR)
                    learner.modelSelectLmbda(meanMetrics=meanMetricsMS)

                #Turn on (optionally) parallel SGD only at the final learning stage
                learner.parallelSGD = self.algoArgs.parallelSGD
                learner.maxIterations *= 2
                logging.debug(learner)
                self.recordResults(X, trainX, testX, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runWarpMf:
        logging.debug("Running WARP loss MF")
        resultsFileName = self.resultsDir + "ResultsWarpMf.npz"

        fileLock = FileLock(resultsFileName)
        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                learner = WarpMf(self.algoArgs.k, self.algoArgs.lmbdas[0], u=self.algoArgs.u)
                learner.ks = self.algoArgs.ks
                learner.numProcesses = self.algoArgs.processes
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)
                    meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                    logging.debug("Mean local AUCs = " + str(meanAucs))
                    logging.debug("Std local AUCs = " + str(stdAucs))

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)
                self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runWrMf:
        logging.debug("Running Weighted Regularized Matrix Factorization")
        resultsFileName = self.resultsDir + "ResultsWrMf.npz"

        fileLock = FileLock(resultsFileName)
        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()
            # WRMF wants CSR matrices rather than CSC.
            trainXScipy = trainXScipy.tocsr()
            testXScipy = testXScipy.tocsr()

            try:
                learner = WeightedMf(self.algoArgs.k, alpha=self.algoArgs.alphaWrMf, lmbda=self.algoArgs.lmbdasWrMf[0], maxIterations=self.algoArgs.maxIterationsWrMf)
                learner.folds = self.algoArgs.folds
                learner.ks = self.algoArgs.ks
                learner.lmbdas = self.algoArgs.lmbdasWrMf
                learner.metric = self.algoArgs.metric
                learner.numProcesses = self.algoArgs.processes
                learner.numRecordAucSamples = self.algoArgs.numRecordAucSamples
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)
                    meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)
                self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runBpr:
        logging.debug("Running Bayesian Personalised Recommendation")
        resultsFileName = self.resultsDir + "ResultsBpr.npz"

        fileLock = FileLock(resultsFileName)
        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                #trainX = trainX.toScipyCsr()
                #testX = testX.toScipyCsr()

                learner = BprRecommender(self.algoArgs.k, lmbdaUser=self.algoArgs.lmbdaUserBpr, lmbdaPos=self.algoArgs.lmbdaItemBpr, lmbdaNeg=self.algoArgs.lmbdaItemBpr, gamma=self.algoArgs.gammaBpr)
                learner.folds = self.algoArgs.folds
                learner.gammas = self.algoArgs.gammasBpr
                learner.ks = self.algoArgs.ks
                learner.lmbdaItems = self.algoArgs.lmbdaItems
                learner.lmbdaUsers = self.algoArgs.lmbdaUsers
                learner.maxIterations = self.algoArgs.maxIterationsBpr
                learner.metric = self.algoArgs.metric
                #learner.numAucSamples = self.algoArgs.numAucSamples
                learner.numProcesses = self.algoArgs.processes
                learner.recommendSize = self.algoArgs.recommendSize
                learner.recordStep = self.algoArgs.recordStep
                learner.validationSize = self.algoArgs.validationSize

                if self.algoArgs.modelSelect:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)
                    meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)
                self.recordResults(X, trainX, testX, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runKnn:
        logging.debug("Running kNN")
        resultsFileName = self.resultsDir + "ResultsKnn.npz"

        fileLock = FileLock(resultsFileName)
        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                # BUGFIX: use branch-local names instead of rebinding
                # trainX/testX — the original clobbered them with scipy
                # matrices, breaking a subsequent runCLiMF branch which calls
                # trainX.toScipyCsr().
                trainXKnn = trainX.toScipyCsr()
                testXKnn = testX.toScipyCsr()

                learner = KNNRecommender(self.algoArgs.kns[0])
                learner.numProcesses = self.algoArgs.processes

                logging.debug(learner)
                self.recordResults(X, trainXKnn, testXKnn, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    if self.algoArgs.runCLiMF:
        # !!!! no model selection
        logging.debug("Running CLiMF")
        resultsFileName = self.resultsDir + "ResultsCLiMF.npz"

        fileLock = FileLock(resultsFileName)
        if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
            fileLock.lock()

            try:
                logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)
                # CLiMF requires float64 CSR matrices.
                modelSelectX = scipy.sparse.csr_matrix(modelSelectX.toScipyCsr(), dtype=numpy.float64)
                trainX = scipy.sparse.csr_matrix(trainX.toScipyCsr(), dtype=numpy.float64)
                testX = testX.toScipyCsr()

                learner = CLiMF(self.algoArgs.k, self.algoArgs.lmbdaCLiMF, self.algoArgs.gammaCLiMF)
                learner.folds = self.algoArgs.folds
                learner.gammas = self.algoArgs.gammasCLiMF
                learner.ks = self.algoArgs.ks
                learner.lmbdas = self.algoArgs.lmbdasCLiMF
                learner.max_iters = self.algoArgs.maxIterCLiMF
                learner.metric = self.algoArgs.metric
                learner.numProcesses = self.algoArgs.processes
                learner.numRecordAucSamples = self.algoArgs.numRecordAucSamples
                learner.recommendSize = self.algoArgs.recommendSize
                learner.validationSize = self.algoArgs.validationSize
                learner.verbose = self.algoArgs.verbose

                if self.algoArgs.modelSelect:
                    # (duplicate "Performing model selection" log removed — the
                    # sampling above already logged it)
                    meanObjs, stdObjs = learner.modelSelect(modelSelectX)

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                    numpy.savez(modelSelectFileName, meanObjs, stdObjs)
                    logging.debug("Saved model selection grid as " + modelSelectFileName)

                logging.debug(learner)
                self.recordResults(X, trainX, testX, learner, resultsFileName)
            finally:
                fileLock.unlock()
        else:
            logging.debug("File is locked or already computed: " + resultsFileName)

    logging.info("All done: see you around!")