Example #1
# Fragment of an experiment script: args (parsed command-line options), k, maxIterations and the
# imports (numpy, IterativeSoftImpute, WeightedMf, MaxLocalAUC, PathDefaults) are defined earlier.
modelSelect = args.modelSelect
folds = 3
ks = numpy.array([64, 128, 256])
rhosSi = numpy.linspace(1.0, 0.0, 5)

overwrite = args.overwrite
datasets = ["Keyword", "Document"]

resultsDir = PathDefaults.getOutputDir() + "coauthors/"
contactsFilename = PathDefaults.getDataDir() + "reference/contacts_anonymised.tsv"
interestsFilename = PathDefaults.getDataDir() + "reference/author_interest"


#Create all the recommendation algorithms
softImpute = IterativeSoftImpute(k=k, postProcess=True, svdAlg="rsvd")
softImpute.maxIterations = maxIterations
softImpute.metric = "f1" 
softImpute.q = 3
softImpute.p = 10
softImpute.rho = 0.1
softImpute.eps = 10**-4 
softImpute.numProcesses = args.processes

wrmf = WeightedMf(k=k, maxIterations=maxIterations, alpha=1.0)
wrmf.ks = ks
wrmf.folds = folds 
wrmf.lmbdas = 2.0**-numpy.arange(-1, 12, 2)
wrmf.metric = "f1" 
wrmf.numProcesses = args.processes

maxLocalAuc = MaxLocalAUC(k=k, w=0.9, maxIterations=50, lmbdaU=0.1, lmbdaV=0.1, stochastic=True)
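
For context, IterativeSoftImpute follows the SoftImpute approach to matrix completion: repeatedly fill the unobserved entries from the current low-rank estimate and soft-threshold the singular values by rho. The snippet below is a minimal, self-contained numpy sketch of that idea only; it does not use the library API configured above, and the function name and arguments are illustrative assumptions.

import numpy

def soft_impute_sketch(X, mask, rho=0.1, maxIterations=100, eps=1e-4):
    # X: dense matrix with zeros at unobserved entries; mask: boolean array of observed entries
    Z = numpy.zeros_like(X)
    for _ in range(maxIterations):
        filled = numpy.where(mask, X, Z)               # impute missing entries from the current estimate
        U, s, Vt = numpy.linalg.svd(filled, full_matrices=False)
        s = numpy.maximum(s - rho, 0.0)                # soft-threshold the singular values by rho
        Znew = (U * s) @ Vt
        if numpy.linalg.norm(Znew - Z) <= eps * max(numpy.linalg.norm(Z), 1.0):
            Z = Znew
            break
        Z = Znew
    return Z
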
Example #2
    def runExperiment(self, X):
        """
        Run the selected ranking experiments and save results
        """
        logging.debug("Splitting into train and test sets")
        #Make sure different runs get the same train/test split
        numpy.random.seed(21)
        m, n = X.shape
        #colProbs = (X.sum(0)+1)/float(m+1)
        #colProbs = colProbs**-self.algoArgs.itemExp
        #colProbs = numpy.ones(n)/float(n)
        trainTestXs = Sampling.shuffleSplitRows(X, 1, self.algoArgs.testSize)
        trainX, testX = trainTestXs[0]
        logging.debug("Train X shape and nnz: " + str(trainX.shape) + " " + str(trainX.nnz))
        logging.debug("Test X shape and nnz: " + str(testX.shape) + " " + str(testX.nnz))

        #Have scipy versions of each array
        trainXScipy = trainX.toScipyCsc()
        testXScipy = testX.toScipyCsc()

        if self.algoArgs.runSoftImpute:
            logging.debug("Running soft impute")
            resultsFileName = self.resultsDir + "ResultsSoftImpute.npz"

            fileLock = FileLock(resultsFileName)

            if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
                fileLock.lock()
                logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                modelSelectX, userInds = Sampling.sampleUsers2(trainXScipy, self.algoArgs.modelSelectSamples, prune=True)

                try:
                    learner = IterativeSoftImpute(self.algoArgs.rhoSi, eps=self.algoArgs.epsSi, k=self.algoArgs.k, svdAlg=self.algoArgs.svdAlg, postProcess=self.algoArgs.postProcess, p=self.algoArgs.pSi, q=self.algoArgs.qSi)
                    learner.folds = self.algoArgs.folds
                    learner.metric = self.algoArgs.metric
                    learner.numProcesses = self.algoArgs.processes
                    learner.recommendSize = self.algoArgs.recommendSize
                    learner.validationSize = self.algoArgs.validationSize

                    if self.algoArgs.modelSelect:
                        cvInds = Sampling.randCrossValidation(self.algoArgs.folds, modelSelectX.nnz)
                        meanErrors, stdErrors = learner.modelSelect2(modelSelectX, self.algoArgs.rhosSi, self.algoArgs.ks, cvInds)

                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanErrors, stdErrors)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                    logging.debug(learner)

                    self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

        if self.algoArgs.runMaxLocalAuc:
            logging.debug("Running max local AUC")

            if self.algoArgs.loss != "tanh":
                resultsFileName = self.resultsDir + "ResultsMaxLocalAUC_loss=" + self.algoArgs.loss + ".npz"
            else:
                resultsFileName = self.resultsDir + "ResultsMaxLocalAUC_loss=" + self.algoArgs.loss + "_rho=" + str(self.algoArgs.rhoMlauc) + ".npz"

            fileLock = FileLock(resultsFileName)

            if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
                fileLock.lock()

                try:
                    learner = MaxLocalAUC(self.algoArgs.k, 1-self.algoArgs.u, lmbdaU=self.algoArgs.lmbdaUMlauc, lmbdaV=self.algoArgs.lmbdaVMlauc, eps=self.algoArgs.epsMlauc, stochastic=not self.algoArgs.fullGradient)

                    learner.alpha = self.algoArgs.alpha
                    learner.alphas = self.algoArgs.alphas
                    learner.eta = self.algoArgs.eta
                    learner.folds = self.algoArgs.folds
                    learner.initialAlg = self.algoArgs.initialAlg
                    learner.itemExpP = self.algoArgs.itemExpP
                    learner.itemExpQ = self.algoArgs.itemExpQ
                    learner.ks = self.algoArgs.ks
                    learner.lmbdas = self.algoArgs.lmbdasMlauc
                    learner.loss = self.algoArgs.loss
                    learner.maxIterations = self.algoArgs.maxIterations
                    learner.maxNorms = self.algoArgs.maxNorms
                    learner.maxNormU = self.algoArgs.maxNorm
                    learner.maxNormV = self.algoArgs.maxNorm
                    learner.metric = self.algoArgs.metric
                    learner.normalise = self.algoArgs.normalise
                    learner.numAucSamples = self.algoArgs.numAucSamples
                    learner.numProcesses = self.algoArgs.processes
                    learner.numRowSamples = self.algoArgs.numRowSamples
                    learner.rate = self.algoArgs.rate
                    learner.recommendSize = self.algoArgs.recommendSize
                    learner.recordStep = self.algoArgs.recordStep
                    learner.rho = self.algoArgs.rhoMlauc
                    learner.rhos = self.algoArgs.rhosMlauc
                    learner.startAverage = self.algoArgs.startAverage
                    learner.t0 = self.algoArgs.t0
                    learner.t0s = self.algoArgs.t0s
                    learner.validationSize = self.algoArgs.validationSize
                    learner.validationUsers = self.algoArgs.validationUsers

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")

                    if self.algoArgs.modelSelect and not os.path.isfile(modelSelectFileName):
                        logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                        modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)
                        meanMetricsLR, paramDictLR = learner.learningRateSelect(modelSelectX)
                        meanMetricsMS, paramDictMS = learner.modelSelectLmbda(modelSelectX)

                        numpy.savez(modelSelectFileName, meanMetricsLR, meanMetricsMS)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)
                    elif self.algoArgs.modelSelect:
                        data = numpy.load(modelSelectFileName)
                        logging.debug("Read model selection file " + modelSelectFileName)
                        meanMetricsLR = data["arr_0"]
                        meanMetricsMS = data["arr_1"]

                        learner.learningRateSelect(meanMetrics=meanMetricsLR)
                        learner.modelSelectLmbda(meanMetrics=meanMetricsMS)

                    #Turn on (optionally) parallel SGD only at the final learning stage
                    learner.parallelSGD = self.algoArgs.parallelSGD
                    learner.maxIterations *= 2
                    logging.debug(learner)

                    self.recordResults(X, trainX, testX, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

        if self.algoArgs.runWarpMf:
            logging.debug("Running WARP loss MF")
            resultsFileName = self.resultsDir + "ResultsWarpMf.npz"

            fileLock = FileLock(resultsFileName)

            if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
                fileLock.lock()

                try:
                    learner = WarpMf(self.algoArgs.k, self.algoArgs.lmbdas[0], u=self.algoArgs.u)
                    learner.ks = self.algoArgs.ks
                    learner.numProcesses = self.algoArgs.processes
                    learner.recommendSize = self.algoArgs.recommendSize
                    learner.validationSize = self.algoArgs.validationSize

                    if self.algoArgs.modelSelect:
                        logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                        modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                        meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                        logging.debug("Mean local AUCs = " + str(meanAucs))
                        logging.debug("Std local AUCs = " + str(stdAucs))

                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                    logging.debug(learner)

                    self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

        if self.algoArgs.runWrMf:
            logging.debug("Running Weighted Regularized Matrix Factorization")
            resultsFileName = self.resultsDir + "ResultsWrMf.npz"

            fileLock = FileLock(resultsFileName)

            if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
                fileLock.lock()

                trainXScipy = trainXScipy.tocsr()
                testXScipy = testXScipy.tocsr()

                try:
                    learner = WeightedMf(self.algoArgs.k, alpha=self.algoArgs.alphaWrMf, lmbda=self.algoArgs.lmbdasWrMf[0], maxIterations=self.algoArgs.maxIterationsWrMf)
                    learner.folds = self.algoArgs.folds
                    learner.ks = self.algoArgs.ks
                    learner.lmbdas = self.algoArgs.lmbdasWrMf
                    learner.metric = self.algoArgs.metric
                    learner.numProcesses = self.algoArgs.processes
                    learner.numRecordAucSamples = self.algoArgs.numRecordAucSamples
                    learner.recommendSize = self.algoArgs.recommendSize
                    learner.validationSize = self.algoArgs.validationSize

                    if self.algoArgs.modelSelect:
                        logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                        modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                        meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                    logging.debug(learner)

                    self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

        if self.algoArgs.runBpr:
            logging.debug("Running Bayesian Personalised Recommendation")
            resultsFileName = self.resultsDir + "ResultsBpr.npz"

            fileLock = FileLock(resultsFileName)

            if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
                fileLock.lock()

                try:
                    #trainX = trainX.toScipyCsr()
                    #testX = testX.toScipyCsr()

                    learner = BprRecommender(self.algoArgs.k, lmbdaUser=self.algoArgs.lmbdaUserBpr, lmbdaPos=self.algoArgs.lmbdaItemBpr, lmbdaNeg=self.algoArgs.lmbdaItemBpr, gamma=self.algoArgs.gammaBpr)
                    learner.folds = self.algoArgs.folds
                    learner.gammas = self.algoArgs.gammasBpr
                    learner.ks = self.algoArgs.ks
                    learner.lmbdaItems = self.algoArgs.lmbdaItems
                    learner.lmbdaUsers = self.algoArgs.lmbdaUsers
                    learner.maxIterations = self.algoArgs.maxIterationsBpr
                    learner.metric = self.algoArgs.metric
                    #learner.numAucSamples = self.algoArgs.numAucSamples
                    learner.numProcesses = self.algoArgs.processes
                    learner.recommendSize = self.algoArgs.recommendSize
                    learner.recordStep = self.algoArgs.recordStep
                    learner.validationSize = self.algoArgs.validationSize

                    if self.algoArgs.modelSelect:
                        logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                        modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                        meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                    logging.debug(learner)

                    self.recordResults(X, trainX, testX, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

        if self.algoArgs.runKnn:
            logging.debug("Running kNN")
            resultsFileName = self.resultsDir + "ResultsKnn.npz"

            fileLock = FileLock(resultsFileName)

            if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
                fileLock.lock()

                try:
                    trainX = trainX.toScipyCsr()
                    testX = testX.toScipyCsr()

                    learner = KNNRecommender(self.algoArgs.kns[0])
                    learner.numProcesses = self.algoArgs.processes

                    logging.debug(learner)

                    self.recordResults(X, trainX, testX, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

        if self.algoArgs.runCLiMF:
            # Note: CLiMF samples users up front; model selection below runs only when algoArgs.modelSelect is set
            logging.debug("Running CLiMF")
            resultsFileName = self.resultsDir + "ResultsCLiMF.npz"

            fileLock = FileLock(resultsFileName)

            if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
                fileLock.lock()

                try:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)
                    modelSelectX = scipy.sparse.csr_matrix(modelSelectX.toScipyCsr(), dtype=numpy.float64)
                    trainX = scipy.sparse.csr_matrix(trainX.toScipyCsr(), dtype=numpy.float64)
                    testX = testX.toScipyCsr()

                    learner = CLiMF(self.algoArgs.k, self.algoArgs.lmbdaCLiMF, self.algoArgs.gammaCLiMF)
                    learner.folds = self.algoArgs.folds
                    learner.gammas = self.algoArgs.gammasCLiMF
                    learner.ks = self.algoArgs.ks
                    learner.lmbdas = self.algoArgs.lmbdasCLiMF
                    learner.max_iters = self.algoArgs.maxIterCLiMF
                    learner.metric = self.algoArgs.metric
                    learner.numProcesses = self.algoArgs.processes
                    learner.numRecordAucSamples = self.algoArgs.numRecordAucSamples
                    learner.recommendSize = self.algoArgs.recommendSize
                    learner.validationSize = self.algoArgs.validationSize
                    learner.verbose = self.algoArgs.verbose

                    if self.algoArgs.modelSelect:
                        logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))

                        meanObjs, stdObjs = learner.modelSelect(modelSelectX)

                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanObjs, stdObjs)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                    logging.debug(learner)

                    self.recordResults(X, trainX, testX, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

        logging.info("All done: see you around!")