Example #1
    def testAverageRocCurve(self):
        m = 50
        n = 20
        k = 8
        u = 20.0 / m
        w = 1 - u
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix(
            (m, n), k, w, csarray=True, verbose=True, indsPerRow=200
        )

        fpr, tpr = MCEvaluator.averageRocCurve(X, U, V)

        import matplotlib

        matplotlib.use("GTK3Agg")
        import matplotlib.pyplot as plt

        # plt.plot(fpr, tpr)
        # plt.show()

        # Now try case where we have a training set
        folds = 1
        testSize = 5
        trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize)
        trainX, testX = trainTestXs[0]

        fpr, tpr = MCEvaluator.averageRocCurve(testX, U, V, trainX=trainX)
Example #2
    def parallelGridSearch(self,
                           X,
                           paramDict,
                           evaluationMethod,
                           testX=None,
                           minVal=True):
        """
        Perform parallel model selection using any learner. 
        """
        logging.debug("Parallel grid search with params: " + str(paramDict))

        m, n = X.shape
        if testX is None:
            trainTestXs = Sampling.shuffleSplitRows(X, self.folds,
                                                    self.validationSize)
        else:
            trainTestXs = [[X, testX]]

        gridSize = []
        gridInds = []
        for key in paramDict.keys():
            gridSize.append(paramDict[key].shape[0])
            gridInds.append(numpy.arange(paramDict[key].shape[0]))

        meanMetrics = numpy.zeros(tuple(gridSize))
        paramList = []

        for icv, (trainX, testX) in enumerate(trainTestXs):
            indexIter = itertools.product(*gridInds)

            for inds in indexIter:
                learner = self.copy()

                for i, (key, val) in enumerate(paramDict.items()):
                    setattr(learner, key, val[inds[i]])

                paramList.append((trainX, testX, learner))

        if self.numProcesses != 1:
            pool = multiprocessing.Pool(processes=self.numProcesses,
                                        maxtasksperchild=100)
            resultsIterator = pool.imap(evaluationMethod, paramList,
                                        self.chunkSize)
        else:
            resultsIterator = itertools.imap(evaluationMethod, paramList)

        for icv, (trainX, testX) in enumerate(trainTestXs):
            indexIter = itertools.product(*gridInds)
            for inds in indexIter:
                metric = resultsIterator.next()
                meanMetrics[inds] += metric / float(self.folds)

        if self.numProcesses != 1:
            pool.terminate()

        resultDict, bestMetric = self.setBestLearner(meanMetrics, paramDict,
                                                     minVal)

        return meanMetrics
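
For orientation, `parallelGridSearch` expects every value in `paramDict` to be a numpy array of candidate settings whose keys name learner attributes, and the returned `meanMetrics` array has one axis per key (in the dictionary's iteration order). Below is a minimal sketch of a call, assuming `X` is a sparse binary csarray as in the other examples and that a learner such as MaxLocalAUC (used in later examples) exposes this method; the scoring line is an illustrative stand-in, not the library's own evaluation helper:

import numpy

# Hypothetical grid: two candidate ranks and three regularisation strengths.
paramDict = {}
paramDict["k"] = numpy.array([8, 16])
paramDict["lmbda"] = numpy.array([0.1, 0.5, 1.0])

def evaluationMethod(args):
    # parallelGridSearch passes one (trainX, testX, learner) tuple per grid cell and fold.
    trainX, testX, learner = args
    learner.learnModel(trainX)
    # Illustrative stand-in metric; the real callable returns whatever scalar is being tuned.
    return MCEvaluator.precisionAtK(testX, learner.U, learner.V, 5)

learner = MaxLocalAUC(8, 0.9, stochastic=True)
# meanMetrics has shape (2, 3): one entry per (k, lmbda) pair, averaged over the folds.
meanMetrics = learner.parallelGridSearch(X, paramDict, evaluationMethod)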
Example #3
    def modelSelect(self, X, colProbs=None): 
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape
        #cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
        trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, colProbs=colProbs)
        testMetrics = numpy.zeros((self.ks.shape[0], self.lmbdas.shape[0], len(trainTestXs)))
        
        if self.metric == "mrr":
            evaluationMethod = computeTestMRR
        elif self.metric == "f1": 
            evaluationMethod = computeTestF1
        else: 
            raise ValueError("Invalid metric: " + self.metric)        
        
        logging.debug("Performing model selection")
        paramList = []        
        
        for i, k in enumerate(self.ks): 
            for j, lmbda in enumerate(self.lmbdas): 
                for icv, (trainX, testX) in enumerate(trainTestXs):                
                    learner = self.copy()
                    learner.k = k
                    learner.lmbda = lmbda 
                
                    paramList.append((trainX.toScipyCsr(), testX.toScipyCsr(), learner))
            
        if self.numProcesses != 1: 
            pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
            resultsIterator = pool.imap(evaluationMethod, paramList, self.chunkSize)
        else: 
            import itertools
            resultsIterator = itertools.imap(evaluationMethod, paramList)
        
        for i, k in enumerate(self.ks):
            for j, lmbda in enumerate(self.lmbdas):
                for icv in range(len(trainTestXs)):             
                    testMetrics[i, j, icv] = resultsIterator.next()
        
        if self.numProcesses != 1: 
            pool.terminate()
            
        meanTestMetrics = numpy.mean(testMetrics, 2)
        stdTestMetrics = numpy.std(testMetrics, 2)
        
        logging.debug("ks=" + str(self.ks)) 
        logging.debug("lmbdas=" + str(self.lmbdas)) 
        logging.debug("Mean metrics=" + str(meanTestMetrics))
        
        self.k = self.ks[numpy.unravel_index(numpy.argmax(meanTestMetrics), meanTestMetrics.shape)[0]]
        self.lmbda = self.lmbdas[numpy.unravel_index(numpy.argmax(meanTestMetrics), meanTestMetrics.shape)[1]]

        logging.debug("Model parameters: k=" + str(self.k) + " lmbda=" + str(self.lmbda))
         
        return meanTestMetrics, stdTestMetrics
Example #4
def main():
    import sys
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    matrixFileName = PathDefaults.getDataDir() + "movielens/ml-100k/u.data" 
    data = numpy.loadtxt(matrixFileName)
    X = sppy.csarray((numpy.max(data[:, 0]), numpy.max(data[:, 1])), storagetype="row")
    X[data[:, 0]-1, data[:, 1]-1] = numpy.array(data[:, 2]>3, numpy.int)
    logging.debug("Read file: " + matrixFileName)
    logging.debug("Shape of data: " + str(X.shape))
    logging.debug("Number of non zeros " + str(X.nnz))
    
    u = 0.1 
    w = 1-u
    (m, n) = X.shape

    validationSize = 5
    trainTestXs = Sampling.shuffleSplitRows(X, 1, validationSize)
    trainX, testX = trainTestXs[0]
    trainX = trainX.toScipyCsr()

    learner = CLiMF(k=20, lmbda=0.001, gamma=0.0001)
    learner.learnModel(trainX)
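
The script ends after fitting; as a possible follow-up, and assuming the fitted CLiMF factors are exposed as `learner.U` and `learner.V` (attribute names not shown above, so treat them as an assumption), the ROC evaluation from Example #1 could be reused on the held-out rows:

# Hypothetical check of the fit on the validation split; learner.U / learner.V
# are assumed names for the learned user and item factor matrices.
fpr, tpr = MCEvaluator.averageRocCurve(testX, learner.U, learner.V)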
Example #5
    def profileLearnModel2(self):
        #Profile stochastic case
        #X = DatasetUtils.flixster()
        #X = Sampling.sampleUsers(X, 1000)
        X, U, V = DatasetUtils.syntheticDataset1(u=0.001, m=10000, n=1000)

        rho = 0.00
        u = 0.2
        w = 1 - u
        eps = 10**-6
        alpha = 0.5
        k = self.k
        maxLocalAuc = MaxLocalAUC(k, w, alpha=alpha, eps=eps, stochastic=True)
        maxLocalAuc.numRowSamples = 2
        maxLocalAuc.numAucSamples = 10
        maxLocalAuc.maxIterations = 1
        maxLocalAuc.numRecordAucSamples = 100
        maxLocalAuc.recordStep = 10
        maxLocalAuc.initialAlg = "rand"
        maxLocalAuc.rate = "optimal"
        #maxLocalAuc.parallelSGD = True

        trainTestX = Sampling.shuffleSplitRows(X, maxLocalAuc.folds, 5)
        trainX, testX = trainTestX[0]

        def run():
            U, V, trainMeasures, testMeasures, iterations, time = maxLocalAuc.learnModel(
                trainX, True)
            #logging.debug("Train Precision@5=" + str(MCEvaluator.precisionAtK(trainX, U, V, 5)))
            #logging.debug("Train Precision@10=" + str(MCEvaluator.precisionAtK(trainX, U, V, 10)))
            #logging.debug("Train Precision@20=" + str(MCEvaluator.precisionAtK(trainX, U, V, 20)))
            #logging.debug("Train Precision@50=" + str(MCEvaluator.precisionAtK(trainX, U, V, 50)))

            #logging.debug("Test Precision@5=" + str(MCEvaluator.precisionAtK(testX, U, V, 5)))
            #logging.debug("Test Precision@10=" + str(MCEvaluator.precisionAtK(testX, U, V, 10)))
            #logging.debug("Test Precision@20=" + str(MCEvaluator.precisionAtK(testX, U, V, 20)))
            #logging.debug("Test Precision@50=" + str(MCEvaluator.precisionAtK(testX, U, V, 50)))

        ProfileUtils.profile('run()', globals(), locals())
Example #6
    def testShuffleSplitRows(self): 
        m = 10
        n = 16
        k = 5 
        u = 0.5
        w = 1-u
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m,n), k, w, csarray=True, verbose=True, indsPerRow=200)
        
        #print(X.toarray())
        
        k2 = 5 
        testSize = 2
        trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, rowMajor=True)
        
        for i in range(k2): 
            trainX = trainTestXs[i][0]
            testX = trainTestXs[i][1]
                        
            self.assertEquals(trainX.storagetype, "row")
            self.assertEquals(testX.storagetype, "row")
            nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
            nptst.assert_array_equal(testX.sum(1), testSize*numpy.ones(m))
            self.assertEquals(X.nnz, trainX.nnz + testX.nnz)
        
        trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, rowMajor=False)
        
        for i in range(k2): 
            trainX = trainTestXs[i][0]
            testX = trainTestXs[i][1]
                       
            self.assertEquals(trainX.storagetype, "col")
            self.assertEquals(testX.storagetype, "col")                       
            nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
            nptst.assert_array_equal(testX.sum(1), testSize*numpy.ones(m))
            self.assertEquals(X.nnz, trainX.nnz + testX.nnz)        
        
        trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, csarray=False)
        for i in range(k2): 
            trainX = trainTestXs[i][0]
            testX = trainTestXs[i][1]
                        
            nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
            
            nptst.assert_array_equal(numpy.ravel(testX.sum(1)), testSize*numpy.ones(m))
            self.assertEquals(X.nnz, trainX.nnz + testX.nnz)

        testSize = 0
        trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize)
        
        for i in range(k2): 
            trainX = trainTestXs[i][0]
            testX = trainTestXs[i][1]
                        
            nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
            nptst.assert_array_equal(testX.sum(1), testSize*numpy.ones(m))
            self.assertEquals(X.nnz, trainX.nnz + testX.nnz)
            self.assertEquals(testX.nnz, 0)
            
        #Test sampling a subset of the rows 
        testSize = 2
        numRows = 5
        trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, numRows=numRows, rowMajor=False)

        for i in range(k2): 
            trainX = trainTestXs[i][0]
            testX = trainTestXs[i][1]
            
            nptst.assert_array_almost_equal(X.toarray(), (trainX+testX).toarray())
            self.assertEquals(numpy.nonzero(testX.sum(1))[0].shape[0], numRows)
            self.assertEquals(X.nnz, trainX.nnz + testX.nnz)
            self.assertEquals(testX.nnz, testSize*numRows)
            
        #Make sure column probabilities are correct 
        w = 0.0            
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m,n), k, w, csarray=True, verbose=True, indsPerRow=200)            
            
        testSize = 5
        k2 = 500
        colProbs = numpy.arange(0, n, dtype=numpy.float)+1
        colProbs /= colProbs.sum() 
        trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, colProbs=colProbs)
        
        colProbs2 = numpy.zeros(n)        
        
        for i in range(k2): 
            trainX = trainTestXs[i][0]
            testX = trainTestXs[i][1]
            
            colProbs2 += testX.sum(0)
        
        colProbs2 /= colProbs2.sum() 
        nptst.assert_array_almost_equal(colProbs, colProbs2, 2)
        
        #Now test when probabilities are uniform 
        colProbs = numpy.ones(n)/float(n)        
        trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, colProbs=colProbs)
        
        colProbs = None
        trainTestXs2 = Sampling.shuffleSplitRows(X, k2, testSize, colProbs=colProbs)
        
        colProbs2 = numpy.zeros(n)       
        colProbs3 = numpy.zeros(n) 
        
        for i in range(k2): 
            trainX = trainTestXs[i][0]
            testX = trainTestXs[i][1]
            colProbs2 += testX.sum(0)
            
            trainX = trainTestXs2[i][0]
            testX = trainTestXs2[i][1]
            colProbs3 += testX.sum(0)
        
        colProbs2 /= colProbs2.sum() 
        colProbs3 /= colProbs3.sum()
        nptst.assert_array_almost_equal(colProbs2, colProbs3, 2)
        
        #Test when numRows=m
        numpy.random.seed(21)
        trainTestXs = Sampling.shuffleSplitRows(X, k2, testSize, numRows=m)
        numpy.random.seed(21)
        trainTestXs2 = Sampling.shuffleSplitRows(X, k2, testSize)

        nptst.assert_array_equal(trainTestXs[0][0].toarray(), trainTestXs2[0][0].toarray())
        nptst.assert_array_equal(trainTestXs[0][1].toarray(), trainTestXs2[0][1].toarray())
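
Read together, the assertions above pin down the contract of `Sampling.shuffleSplitRows`: it returns a list of independent (trainX, testX) pairs, the test matrix of each pair holds `testSize` nonzeros from every sampled row, and train plus test always reassemble the original matrix. A condensed sketch of that contract, reusing the same binary `X` as in the test:

import numpy

folds = 3
testSize = 2
for trainX, testX in Sampling.shuffleSplitRows(X, folds, testSize):
    # The split partitions the nonzeros of X between train and test...
    assert numpy.allclose((trainX + testX).toarray(), X.toarray())
    assert trainX.nnz + testX.nnz == X.nnz
    # ...and every row contributes exactly testSize held-out entries.
    assert (numpy.ravel(testX.sum(1)) == testSize).all()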
Example #7
    def parallelLearnModel(self, X, verbose=False, U=None, V=None):
        """
        Max local AUC with Frobenius norm penalty on V. Solve with parallel (stochastic) gradient descent. 
        The input is a sparse array. 
        """
        # Convert to a csarray for faster access
        if scipy.sparse.issparse(X):
            logging.debug("Converting to csarray")
            X2 = sppy.csarray(X, storagetype="row")
            X = X2

        m, n = X.shape

        # We keep a validation set in order to determine when to stop
        if self.validationUsers != 0:
            numValidationUsers = int(m * self.validationUsers)
            trainX, testX, rowSamples = Sampling.shuffleSplitRows(
                X, 1, self.validationSize, numRows=numValidationUsers
            )[0]
            testIndPtr, testColInds = SparseUtils.getOmegaListPtr(testX)
        else:
            trainX = X
            testX = None
            rowSamples = None
            testIndPtr, testColInds = None, None

        # Note that to compute the test AUC we pick i \in X and j \notin X \cup testX
        indPtr, colInds = SparseUtils.getOmegaListPtr(trainX)
        allIndPtr, allColInds = SparseUtils.getOmegaListPtr(X)

        if U is None or V is None:
            U, V = self.initUV(trainX)

        if self.metric == "f1":
            metricInd = 2
        elif self.metric == "mrr":
            metricInd = 3
        else:
            raise ValueError("Unknown metric: " + self.metric)

        bestMetric = 0
        bestU = 0
        bestV = 0
        trainMeasures = []
        testMeasures = []
        loopInd = 0
        lastObj = 0
        currentObj = lastObj - 2 * self.eps

        numBlocks = self.numProcesses + 1
        gi, gp, gq = self.computeGipq(X)
        normGp, normGq = self.computeNormGpq(indPtr, colInds, gp, gq, m)

        # Some shared variables
        rowIsFree = sharedmem.ones(numBlocks, dtype=numpy.bool)
        colIsFree = sharedmem.ones(numBlocks, dtype=numpy.bool)

        # Create shared factors
        U2 = sharedmem.zeros((m, self.k))
        V2 = sharedmem.zeros((n, self.k))
        muU2 = sharedmem.zeros((m, self.k))
        muV2 = sharedmem.zeros((n, self.k))

        U2[:] = U[:]
        V2[:] = V[:]
        muU2[:] = U[:]
        muV2[:] = V[:]
        del U, V

        rowBlockSize = int(numpy.ceil(float(m) / numBlocks))
        colBlockSize = int(numpy.ceil(float(n) / numBlocks))

        lock = multiprocessing.Lock()
        startTime = time.time()
        loopInd = 0
        iterationsPerBlock = sharedmem.zeros((numBlocks, numBlocks))

        self.learnerCython = self.getCythonLearner()
        nextRecord = 0

        while loopInd < self.maxIterations and abs(lastObj - currentObj) > self.eps:
            if loopInd >= nextRecord:
                if loopInd != 0:
                    print("")

                printStr = self.recordResults(
                    muU2,
                    muV2,
                    trainMeasures,
                    testMeasures,
                    loopInd,
                    rowSamples,
                    indPtr,
                    colInds,
                    testIndPtr,
                    testColInds,
                    allIndPtr,
                    allColInds,
                    gi,
                    gp,
                    gq,
                    trainX,
                    startTime,
                )
                logging.debug(printStr)

                if testIndPtr is not None and testMeasures[-1][metricInd] >= bestMetric:
                    bestMetric = testMeasures[-1][metricInd]
                    bestU = muU2.copy()
                    bestV = muV2.copy()
                elif testIndPtr is None:
                    bestU = muU2.copy()
                    bestV = muV2.copy()

                # Compute objective averaged over last 5 recorded steps
                trainMeasuresArr = numpy.array(trainMeasures)
                lastObj = currentObj
                currentObj = numpy.mean(trainMeasuresArr[-5:, 0])

                nextRecord += self.recordStep

            iterationsPerBlock = sharedmem.zeros((numBlocks, numBlocks))
            self.parallelUpdateUV(
                X,
                U2,
                V2,
                muU2,
                muV2,
                numBlocks,
                rowBlockSize,
                colBlockSize,
                rowIsFree,
                colIsFree,
                indPtr,
                colInds,
                lock,
                gi,
                gp,
                gq,
                normGp,
                normGq,
                iterationsPerBlock,
                loopInd,
            )
            loopInd += numpy.floor(iterationsPerBlock.mean())

        totalTime = time.time() - startTime

        # Compute quantities for last U and V
        print("")
        totalTime = time.time() - startTime
        printStr = "Finished, time=" + str("%.1f" % totalTime) + " "
        printStr += self.recordResults(
            muU2,
            muV2,
            trainMeasures,
            testMeasures,
            loopInd,
            rowSamples,
            indPtr,
            colInds,
            testIndPtr,
            testColInds,
            allIndPtr,
            allColInds,
            gi,
            gp,
            gq,
            trainX,
            startTime,
        )
        printStr += " delta obj" + "%.3e" % abs(lastObj - currentObj)
        logging.debug(printStr)

        self.U = bestU
        self.V = bestV
        self.gi = gi
        self.gp = gp
        self.gq = gq

        if verbose:
            return self.U, self.V, numpy.array(trainMeasures), numpy.array(testMeasures), loopInd, totalTime
        else:
            return self.U, self.V
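
The parallel update above appears to follow a blocked SGD scheme: rows and columns are split into `numBlocks` contiguous blocks, and the `rowIsFree` / `colIsFree` flags let a worker claim a (row block, column block) cell only when neither block is in use, so concurrent updates never touch the same rows of U or columns of V. A small sketch of the partitioning arithmetic the code uses (the loop itself is only illustrative):

import numpy

m, n, numProcesses = 10, 7, 2
numBlocks = numProcesses + 1                          # as in parallelLearnModel
rowBlockSize = int(numpy.ceil(float(m) / numBlocks))
colBlockSize = int(numpy.ceil(float(n) / numBlocks))

for i in range(numBlocks):
    for j in range(numBlocks):
        # Row/column index ranges covered by block cell (i, j); cells sharing
        # neither i nor j can be updated by different processes in parallel.
        rows = numpy.arange(i * rowBlockSize, min((i + 1) * rowBlockSize, m))
        cols = numpy.arange(j * colBlockSize, min((j + 1) * colBlockSize, n))
        print(i, j, rows, cols)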
Example #8
    def singleLearnModel(self, X, verbose=False, U=None, V=None):
        """
        Max local AUC with Frobenius norm penalty on V. Solve with (stochastic) gradient descent. 
        The input is a sparse array. 
        """
        # Convert to a csarray for faster access
        if scipy.sparse.issparse(X):
            logging.debug("Converting to csarray")
            X2 = sppy.csarray(X, storagetype="row")
            X = X2

        m, n = X.shape

        # We keep a validation set in order to determine when to stop
        if self.validationUsers != 0:
            numValidationUsers = int(m * self.validationUsers)
            trainX, testX, rowSamples = Sampling.shuffleSplitRows(
                X, 1, self.validationSize, numRows=numValidationUsers
            )[0]

            testIndPtr, testColInds = SparseUtils.getOmegaListPtr(testX)

            logging.debug("Train X shape and nnz: " + str(trainX.shape) + " " + str(trainX.nnz))
            logging.debug("Validation X shape and nnz: " + str(testX.shape) + " " + str(testX.nnz))
        else:
            trainX = X
            testX = None
            rowSamples = None
            testIndPtr, testColInds = None, None

        # Note that to compute the test AUC we pick i \in X and j \notin X \cup testX
        indPtr, colInds = SparseUtils.getOmegaListPtr(trainX)
        allIndPtr, allColInds = SparseUtils.getOmegaListPtr(X)

        if type(U) != numpy.ndarray and type(V) != numpy.ndarray:
            U, V = self.initUV(trainX)

        if self.metric == "f1":
            metricInd = 2
        elif self.metric == "mrr":
            metricInd = 3
        else:
            raise ValueError("Unknown metric: " + self.metric)

        muU = U.copy()
        muV = V.copy()
        bestMetric = 0
        bestU = 0
        bestV = 0
        trainMeasures = []
        testMeasures = []
        loopInd = 0
        lastObj = 0
        currentObj = lastObj - 2 * self.eps

        # Try alternative number of iterations
        # numIterations = trainX.nnz/self.numAucSamples
        numIterations = max(m, n)

        self.learnerCython = self.getCythonLearner()

        # Set up order of indices for stochastic methods
        permutedRowInds = numpy.array(numpy.random.permutation(m), numpy.uint32)
        permutedColInds = numpy.array(numpy.random.permutation(n), numpy.uint32)

        startTime = time.time()

        gi, gp, gq = self.computeGipq(X)
        normGp, normGq = self.computeNormGpq(indPtr, colInds, gp, gq, m)

        while loopInd < self.maxIterations and abs(lastObj - currentObj) > self.eps:
            sigmaU = self.getSigma(loopInd, self.alpha, m)
            sigmaV = self.getSigma(loopInd, self.alpha, m)

            if loopInd % self.recordStep == 0:
                if loopInd != 0 and self.stochastic:
                    print("")

                printStr = self.recordResults(
                    muU,
                    muV,
                    trainMeasures,
                    testMeasures,
                    loopInd,
                    rowSamples,
                    indPtr,
                    colInds,
                    testIndPtr,
                    testColInds,
                    allIndPtr,
                    allColInds,
                    gi,
                    gp,
                    gq,
                    trainX,
                    startTime,
                )
                logging.debug(printStr)

                if testIndPtr is not None and testMeasures[-1][metricInd] >= bestMetric:
                    bestMetric = testMeasures[-1][metricInd]
                    logging.debug("Current best metric=" + str(bestMetric))
                    bestU = muU.copy()
                    bestV = muV.copy()
                elif testIndPtr is None:
                    bestU = muU.copy()
                    bestV = muV.copy()

                # Compute objective averaged over last 5 recorded steps
                trainMeasuresArr = numpy.array(trainMeasures)
                lastObj = currentObj
                currentObj = numpy.mean(trainMeasuresArr[-5:, 0])

            U = numpy.ascontiguousarray(U)
            self.updateUV(
                indPtr,
                colInds,
                U,
                V,
                muU,
                muV,
                permutedRowInds,
                permutedColInds,
                gp,
                gq,
                normGp,
                normGq,
                loopInd,
                sigmaU,
                sigmaV,
                numIterations,
            )
            loopInd += 1

        # Compute quantities for last U and V
        totalTime = time.time() - startTime
        printStr = "\nFinished, time=" + str("%.1f" % totalTime) + " "
        printStr += self.recordResults(
            muU,
            muV,
            trainMeasures,
            testMeasures,
            loopInd,
            rowSamples,
            indPtr,
            colInds,
            testIndPtr,
            testColInds,
            allIndPtr,
            allColInds,
            gi,
            gp,
            gq,
            trainX,
            startTime,
        )
        printStr += " delta obj=" + "%.3e" % abs(lastObj - currentObj)
        logging.debug(printStr)

        self.U = bestU
        self.V = bestV
        self.gi = gi
        self.gp = gp
        self.gq = gq

        trainMeasures = numpy.array(trainMeasures)
        testMeasures = numpy.array(testMeasures)

        if verbose:
            return self.U, self.V, trainMeasures, testMeasures, loopInd, totalTime
        else:
            return self.U, self.V
Example #9
    def modelSelect(self, X, colProbs=None):
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape
        trainTestXs = Sampling.shuffleSplitRows(X,
                                                self.folds,
                                                self.validationSize,
                                                csarray=True,
                                                colProbs=colProbs)
        testMetrics = numpy.zeros(
            (self.ks.shape[0], self.lmbdaUsers.shape[0],
             self.lmbdaItems.shape[0], self.gammas.shape[0], len(trainTestXs)))

        logging.debug(
            "Performing model selection with test leave out per row of " +
            str(self.validationSize))
        paramList = []

        for i, k in enumerate(self.ks):
            for j, lmbdaUser in enumerate(self.lmbdaUsers):
                for s, lmbdaItem in enumerate(self.lmbdaItems):
                    for t, gamma in enumerate(self.gammas):
                        for icv, (trainX, testX) in enumerate(trainTestXs):
                            learner = self.copy()
                            learner.k = k
                            learner.lmbdaUser = lmbdaUser
                            learner.lmbdaPos = lmbdaItem
                            learner.lmbdaNeg = lmbdaItem
                            learner.gamma = gamma

                            paramList.append((trainX, testX, learner))

        if self.numProcesses != 1:
            pool = multiprocessing.Pool(processes=self.numProcesses,
                                        maxtasksperchild=100)
            resultsIterator = pool.imap(computeTestF1, paramList,
                                        self.chunkSize)
        else:
            import itertools
            resultsIterator = itertools.imap(computeTestF1, paramList)

        for i, k in enumerate(self.ks):
            for j, lmbdaUser in enumerate(self.lmbdaUsers):
                for s, lmbdaPos in enumerate(self.lmbdaItems):
                    for t, gamma in enumerate(self.gammas):
                        for icv, (trainX, testX) in enumerate(trainTestXs):
                            testMetrics[i, j, s, t,
                                        icv] = resultsIterator.next()

        if self.numProcesses != 1:
            pool.terminate()

        meanTestMetrics = numpy.mean(testMetrics, 4)
        stdTestMetrics = numpy.std(testMetrics, 4)

        logging.debug("ks=" + str(self.ks))
        logging.debug("lmbdaUsers=" + str(self.lmbdaUsers))
        logging.debug("lmbdaItems=" + str(self.lmbdaItems))
        logging.debug("gammas=" + str(self.gammas))
        logging.debug("Mean metrics=" + str(meanTestMetrics))

        indK, indLmbdaUser, indLmbdaItem, indGamma = numpy.unravel_index(
            meanTestMetrics.argmax(), meanTestMetrics.shape)
        self.k = self.ks[indK]
        self.lmbdaUser = self.lmbdaUsers[indLmbdaUser]
        self.lmbdaPos = self.lmbdaItems[indLmbdaItem]
        self.lmbdaNeg = self.lmbdaItems[indLmbdaItem]
        self.gamma = self.gammas[indGamma]

        logging.debug("Model parameters: " + str(self))

        return meanTestMetrics, stdTestMetrics
Example #10
    def runExperiment(self, X):
        """
        Run the selected ranking experiments and save results
        """
        logging.debug("Splitting into train and test sets")
        #Make sure different runs get the same train/test split
        numpy.random.seed(21)
        m, n = X.shape
        #colProbs = (X.sum(0)+1)/float(m+1)
        #colProbs = colProbs**-self.algoArgs.itemExp
        #colProbs = numpy.ones(n)/float(n)
        trainTestXs = Sampling.shuffleSplitRows(X, 1, self.algoArgs.testSize)
        trainX, testX = trainTestXs[0]
        logging.debug("Train X shape and nnz: " + str(trainX.shape) + " " + str(trainX.nnz))
        logging.debug("Test X shape and nnz: " + str(testX.shape) + " " + str(testX.nnz))

        #Have scipy versions of each array
        trainXScipy = trainX.toScipyCsc()
        testXScipy = testX.toScipyCsc()

        if self.algoArgs.runSoftImpute:
            logging.debug("Running soft impute")
            resultsFileName = self.resultsDir + "ResultsSoftImpute.npz"

            fileLock = FileLock(resultsFileName)

            if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
                fileLock.lock()
                logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                modelSelectX, userInds = Sampling.sampleUsers2(trainXScipy, self.algoArgs.modelSelectSamples, prune=True)

                try:
                    learner = IterativeSoftImpute(self.algoArgs.rhoSi, eps=self.algoArgs.epsSi, k=self.algoArgs.k, svdAlg=self.algoArgs.svdAlg, postProcess=self.algoArgs.postProcess, p=self.algoArgs.pSi, q=self.algoArgs.qSi)
                    learner.folds = self.algoArgs.folds
                    learner.metric = self.algoArgs.metric
                    learner.numProcesses = self.algoArgs.processes
                    learner.recommendSize = self.algoArgs.recommendSize
                    learner.validationSize = self.algoArgs.validationSize

                    if self.algoArgs.modelSelect:
                        cvInds = Sampling.randCrossValidation(self.algoArgs.folds, modelSelectX.nnz)
                        meanErrors, stdErrors = learner.modelSelect2(modelSelectX, self.algoArgs.rhosSi, self.algoArgs.ks, cvInds)

                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanErrors, stdErrors)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                    logging.debug(learner)

                    self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

        if self.algoArgs.runMaxLocalAuc:
            logging.debug("Running max local AUC")

            if self.algoArgs.loss != "tanh":
                resultsFileName = self.resultsDir + "ResultsMaxLocalAUC_loss=" + self.algoArgs.loss + ".npz"
            else:
                resultsFileName = self.resultsDir + "ResultsMaxLocalAUC_loss=" + self.algoArgs.loss + "_rho=" + str(self.algoArgs.rhoMlauc) + ".npz"

            fileLock = FileLock(resultsFileName)

            if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
                fileLock.lock()

                try:
                    learner = MaxLocalAUC(self.algoArgs.k, 1-self.algoArgs.u, lmbdaU=self.algoArgs.lmbdaUMlauc, lmbdaV=self.algoArgs.lmbdaVMlauc, eps=self.algoArgs.epsMlauc, stochastic=not self.algoArgs.fullGradient)

                    learner.alpha = self.algoArgs.alpha
                    learner.alphas = self.algoArgs.alphas
                    learner.eta = self.algoArgs.eta
                    learner.folds = self.algoArgs.folds
                    learner.initialAlg = self.algoArgs.initialAlg
                    learner.itemExpP = self.algoArgs.itemExpP
                    learner.itemExpQ = self.algoArgs.itemExpQ
                    learner.ks = self.algoArgs.ks
                    learner.lmbdas = self.algoArgs.lmbdasMlauc
                    learner.loss = self.algoArgs.loss
                    learner.maxIterations = self.algoArgs.maxIterations
                    learner.maxNorms = self.algoArgs.maxNorms
                    learner.maxNormU = self.algoArgs.maxNorm
                    learner.maxNormV = self.algoArgs.maxNorm
                    learner.metric = self.algoArgs.metric
                    learner.normalise = self.algoArgs.normalise
                    learner.numAucSamples = self.algoArgs.numAucSamples
                    learner.numProcesses = self.algoArgs.processes
                    learner.numRowSamples = self.algoArgs.numRowSamples
                    learner.rate = self.algoArgs.rate
                    learner.recommendSize = self.algoArgs.recommendSize
                    learner.recordStep = self.algoArgs.recordStep
                    learner.rho = self.algoArgs.rhoMlauc
                    learner.rhos = self.algoArgs.rhosMlauc
                    learner.startAverage = self.algoArgs.startAverage
                    learner.t0 = self.algoArgs.t0
                    learner.t0s = self.algoArgs.t0s
                    learner.validationSize = self.algoArgs.validationSize
                    learner.validationUsers = self.algoArgs.validationUsers

                    modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")

                    if self.algoArgs.modelSelect and not os.path.isfile(modelSelectFileName):
                        logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                        modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)
                        #
                        meanMetricsLR, paramDictLR = learner.learningRateSelect(modelSelectX)
                        meanMetricsMS, paramDictMS = learner.modelSelectLmbda(modelSelectX)

                        numpy.savez(modelSelectFileName, meanMetricsLR, meanMetricsMS)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)
                    elif self.algoArgs.modelSelect:
                        data = numpy.load(modelSelectFileName)
                        logging.debug("Read model selection file " + modelSelectFileName)
                        meanMetricsLR = data["arr_0"]
                        meanMetricsMS = data["arr_1"]

                        learner.learningRateSelect(meanMetrics=meanMetricsLR)
                        learner.modelSelectLmbda(meanMetrics=meanMetricsMS)

                    #Turn on (optionally) parallel SGD only at the final learning stage
                    learner.parallelSGD = self.algoArgs.parallelSGD
                    learner.maxIterations *= 2
                    logging.debug(learner)

                    self.recordResults(X, trainX, testX, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

        if self.algoArgs.runWarpMf:
            logging.debug("Running WARP loss MF")
            resultsFileName = self.resultsDir + "ResultsWarpMf.npz"

            fileLock = FileLock(resultsFileName)

            if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
                fileLock.lock()

                try:
                    learner = WarpMf(self.algoArgs.k, self.algoArgs.lmbdas[0], u=self.algoArgs.u)
                    learner.ks = self.algoArgs.ks
                    learner.numProcesses = self.algoArgs.processes
                    learner.recommendSize = self.algoArgs.recommendSize
                    learner.validationSize = self.algoArgs.validationSize

                    if self.algoArgs.modelSelect:
                        logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                        modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                        meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                        logging.debug("Mean local AUCs = " + str(meanAucs))
                        logging.debug("Std local AUCs = " + str(stdAucs))

                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                    logging.debug(learner)

                    self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

        if self.algoArgs.runWrMf:
            logging.debug("Running Weighted Regularized Matrix Factorization")
            resultsFileName = self.resultsDir + "ResultsWrMf.npz"

            fileLock = FileLock(resultsFileName)

            if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
                fileLock.lock()

                trainXScipy = trainXScipy.tocsr()
                testXScipy = testXScipy.tocsr()

                try:
                    learner = WeightedMf(self.algoArgs.k, alpha=self.algoArgs.alphaWrMf, lmbda=self.algoArgs.lmbdasWrMf[0], maxIterations=self.algoArgs.maxIterationsWrMf)
                    learner.folds = self.algoArgs.folds
                    learner.ks = self.algoArgs.ks
                    learner.lmbdas = self.algoArgs.lmbdasWrMf
                    learner.metric = self.algoArgs.metric
                    learner.numProcesses = self.algoArgs.processes
                    learner.numRecordAucSamples = self.algoArgs.numRecordAucSamples
                    learner.recommendSize = self.algoArgs.recommendSize
                    learner.validationSize = self.algoArgs.validationSize

                    if self.algoArgs.modelSelect:
                        logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                        modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                        meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                    logging.debug(learner)

                    self.recordResults(X, trainXScipy, testXScipy, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

        if self.algoArgs.runBpr:
            logging.debug("Running Bayesian Personalised Recommendation")
            resultsFileName = self.resultsDir + "ResultsBpr.npz"

            fileLock = FileLock(resultsFileName)

            if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
                fileLock.lock()

                try:
                    #trainX = trainX.toScipyCsr()
                    #testX = testX.toScipyCsr()

                    learner = BprRecommender(self.algoArgs.k, lmbdaUser=self.algoArgs.lmbdaUserBpr, lmbdaPos=self.algoArgs.lmbdaItemBpr, lmbdaNeg=self.algoArgs.lmbdaItemBpr, gamma=self.algoArgs.gammaBpr)
                    learner.folds = self.algoArgs.folds
                    learner.gammas = self.algoArgs.gammasBpr
                    learner.ks = self.algoArgs.ks
                    learner.lmbdaItems = self.algoArgs.lmbdaItems
                    learner.lmbdaUsers = self.algoArgs.lmbdaUsers
                    learner.maxIterations = self.algoArgs.maxIterationsBpr
                    learner.metric = self.algoArgs.metric
                    #learner.numAucSamples = self.algoArgs.numAucSamples
                    learner.numProcesses = self.algoArgs.processes
                    learner.recommendSize = self.algoArgs.recommendSize
                    learner.recordStep = self.algoArgs.recordStep
                    learner.validationSize = self.algoArgs.validationSize

                    if self.algoArgs.modelSelect:
                        logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                        modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)

                        meanAucs, stdAucs = learner.modelSelect(modelSelectX)

                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanAucs, stdAucs)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                    logging.debug(learner)

                    self.recordResults(X, trainX, testX, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

        if self.algoArgs.runKnn:
            logging.debug("Running kNN")
            resultsFileName = self.resultsDir + "ResultsKnn.npz"

            fileLock = FileLock(resultsFileName)

            if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
                fileLock.lock()

                try:
                    trainX = trainX.toScipyCsr()
                    testX = testX.toScipyCsr()

                    learner = KNNRecommender(self.algoArgs.kns[0])
                    learner.numProcesses = self.algoArgs.processes

                    logging.debug(learner)

                    self.recordResults(X, trainX, testX, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

        if self.algoArgs.runCLiMF:
            # !!!! no model selection
            logging.debug("Running CLiMF")
            resultsFileName = self.resultsDir + "ResultsCLiMF.npz"

            fileLock = FileLock(resultsFileName)

            if not (fileLock.isLocked() or fileLock.fileExists()) or self.algoArgs.overwrite:
                fileLock.lock()

                try:
                    logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))
                    modelSelectX, userInds = Sampling.sampleUsers2(trainX, self.algoArgs.modelSelectSamples, prune=True)
                    modelSelectX = scipy.sparse.csr_matrix(modelSelectX.toScipyCsr(), dtype=numpy.float64)
                    trainX = scipy.sparse.csr_matrix(trainX.toScipyCsr(), dtype=numpy.float64)
                    testX = testX.toScipyCsr()

                    learner = CLiMF(self.algoArgs.k, self.algoArgs.lmbdaCLiMF, self.algoArgs.gammaCLiMF)
                    learner.folds = self.algoArgs.folds
                    learner.gammas = self.algoArgs.gammasCLiMF
                    learner.ks = self.algoArgs.ks
                    learner.lmbdas = self.algoArgs.lmbdasCLiMF
                    learner.max_iters = self.algoArgs.maxIterCLiMF
                    learner.metric = self.algoArgs.metric
                    learner.numProcesses = self.algoArgs.processes
                    learner.numRecordAucSamples = self.algoArgs.numRecordAucSamples
                    learner.recommendSize = self.algoArgs.recommendSize
                    learner.validationSize = self.algoArgs.validationSize
                    learner.verbose = self.algoArgs.verbose

                    if self.algoArgs.modelSelect:
                        logging.debug("Performing model selection, taking sample size " + str(self.algoArgs.modelSelectSamples))

                        meanObjs, stdObjs = learner.modelSelect(modelSelectX)

                        modelSelectFileName = resultsFileName.replace("Results", "ModelSelect")
                        numpy.savez(modelSelectFileName, meanObjs, stdObjs)
                        logging.debug("Saved model selection grid as " + modelSelectFileName)

                    logging.debug(learner)

                    self.recordResults(X, trainX, testX, learner, resultsFileName)
                finally:
                    fileLock.unlock()
            else:
                logging.debug("File is locked or already computed: " + resultsFileName)

        logging.info("All done: see you around!")
Beispiel #16
0
    def modelSelect(self, X, colProbs=None):
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape

        trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=False, colProbs=colProbs)
        datas = []
        for (trainX, testX) in trainTestXs:
            testOmegaList = SparseUtils.getOmegaList(testX)
            #testX = trainX+testX
            datas.append((trainX, testX, testOmegaList))
        testAucs = numpy.zeros((len(self.ks), len(self.lmbdas), len(self.gammas), len(trainTestXs)))
        
        logging.debug("Performing model selection")
        paramList = []        
        
        for i, k in enumerate(self.ks): 
            U, V = self.initUV(X, k)
            for lmbda in self.lmbdas:
                for gamma in self.gammas:
                    for (trainX, testX, testOmegaList) in datas:
                        learner = self.copy()
                        learner.k = k
                        learner.U = U.copy()
                        learner.V = V.copy()
                        learner.lmbda = lmbda
                        learner.gamma = gamma
                    
                        paramList.append((scipy.sparse.csr_matrix(trainX, dtype=numpy.float64), scipy.sparse.csr_matrix(testX, dtype=numpy.float64), learner))
            
        if self.numProcesses != 1: 
            pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
            resultsIterator = pool.imap(computeTestF1, paramList, self.chunkSize)
        else: 
            resultsIterator = itertools.imap(computeTestF1, paramList)
        
        for i_k in range(len(self.ks)):
            for i_lmbda in range(len(self.lmbdas)):
                for i_gamma in range(len(self.gammas)):
                    for i_cv in range(len(trainTestXs)):             
                        testAucs[i_k, i_lmbda, i_gamma, i_cv] = resultsIterator.next()
        
        if self.numProcesses != 1: 
            pool.terminate()
        
        meanTestMetrics = numpy.mean(testAucs, 3)
        stdTestMetrics = numpy.std(testAucs, 3)
        
        logging.debug("ks=" + str(self.ks))
        logging.debug("lmbdas=" + str(self.lmbdas))
        logging.debug("gammas=" + str(self.gammas))
        logging.debug("Mean metrics=" + str(meanTestMetrics))
        
        i_k, i_lmbda, i_gamma = numpy.unravel_index(meanTestMetrics.argmax(), meanTestMetrics.shape)
        self.k = self.ks[i_k]
        self.lmbda = self.lmbdas[i_lmbda]
        self.gamma = self.gammas[i_gamma]

        logging.debug("Model parameters: k=" + str(self.k) + " lmbda=" + str(self.lmbda) + " gamma=" + str(self.gamma))
         
        return meanTestMetrics, stdTestMetrics
Beispiel #17
0
    def modelSelect(self, X, colProbs=None):
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape

        trainTestXs = Sampling.shuffleSplitRows(X,
                                                self.folds,
                                                self.validationSize,
                                                csarray=False,
                                                colProbs=colProbs)
        datas = []
        for (trainX, testX) in trainTestXs:
            testOmegaList = SparseUtils.getOmegaList(testX)
            #testX = trainX+testX
            datas.append((trainX, testX, testOmegaList))
        testAucs = numpy.zeros((len(self.ks), len(self.lmbdas),
                                len(self.gammas), len(trainTestXs)))

        logging.debug("Performing model selection")
        paramList = []

        for i, k in enumerate(self.ks):
            U, V = self.initUV(X, k)
            for lmbda in self.lmbdas:
                for gamma in self.gammas:
                    for (trainX, testX, testOmegaList) in datas:
                        learner = self.copy()
                        learner.k = k
                        learner.U = U.copy()
                        learner.V = V.copy()
                        learner.lmbda = lmbda
                        learner.gamma = gamma

                        paramList.append(
                            (scipy.sparse.csr_matrix(trainX,
                                                     dtype=numpy.float64),
                             scipy.sparse.csr_matrix(testX,
                                                     dtype=numpy.float64),
                             learner))

        if self.numProcesses != 1:
            pool = multiprocessing.Pool(processes=self.numProcesses,
                                        maxtasksperchild=100)
            resultsIterator = pool.imap(computeTestF1, paramList,
                                        self.chunkSize)
        else:
            resultsIterator = itertools.imap(computeTestF1, paramList)

        for i_k in range(len(self.ks)):
            for i_lmbda in range(len(self.lmbdas)):
                for i_gamma in range(len(self.gammas)):
                    for i_cv in range(len(trainTestXs)):
                        testAucs[i_k, i_lmbda, i_gamma,
                                 i_cv] = resultsIterator.next()

        if self.numProcesses != 1:
            pool.terminate()

        meanTestMetrics = numpy.mean(testAucs, 3)
        stdTestMetrics = numpy.std(testAucs, 3)

        logging.debug("ks=" + str(self.ks))
        logging.debug("lmbdas=" + str(self.lmbdas))
        logging.debug("gammas=" + str(self.gammas))
        logging.debug("Mean metrics=" + str(meanTestMetrics))

        i_k, i_lmbda, i_gamma = numpy.unravel_index(meanTestMetrics.argmax(),
                                                    meanTestMetrics.shape)
        self.k = self.ks[i_k]
        self.lmbda = self.lmbdas[i_lmbda]
        self.gamma = self.gammas[i_gamma]

        logging.debug("Model parameters: k=" + str(self.k) + " lmbda=" +
                      str(self.lmbda) + " gamma=" + str(self.gamma))

        return meanTestMetrics, stdTestMetrics
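A usage sketch for the grid search above, assuming a learner that implements this modelSelect and exposes the attributes it reads (ks, lmbdas, gammas, folds, validationSize, numProcesses); the CLiMF constructor arguments follow the call shown in the runner earlier and X stands for the sparse user-item matrix:

import numpy

# X: the sparse user-item matrix (sppy csarray or scipy sparse, per the project's conventions)
learner = CLiMF(8, 0.001, 0.0001)        # k, lmbda, gamma as in the runner above
learner.ks = numpy.array([8, 16, 32])
learner.lmbdas = 2.0**-numpy.arange(1, 6)
learner.gammas = numpy.array([1e-4, 1e-3])
learner.folds = 3
learner.validationSize = 5
learner.numProcesses = 1

# meanMetrics has shape (len(ks), len(lmbdas), len(gammas)); the learner's k,
# lmbda and gamma are set to the combination with the largest mean metric.
meanMetrics, stdMetrics = learner.modelSelect(X)
print(learner.k, learner.lmbda, learner.gamma)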
Beispiel #18
0
    def singleLearnModel(self, X, verbose=False, U=None, V=None):
        """
        Max local AUC with Frobenius norm penalty on V. Solve with (stochastic) gradient descent. 
        The input is a sparse array. 
        """
        #Convert to a csarray for faster access
        if scipy.sparse.issparse(X):
            logging.debug("Converting to csarray")
            X2 = sppy.csarray(X, storagetype="row")
            X = X2

        m, n = X.shape

        #We keep a validation set in order to determine when to stop
        if self.validationUsers != 0:
            numValidationUsers = int(m * self.validationUsers)
            trainX, testX, rowSamples = Sampling.shuffleSplitRows(
                X, 1, self.validationSize, numRows=numValidationUsers)[0]

            testIndPtr, testColInds = SparseUtils.getOmegaListPtr(testX)

            logging.debug("Train X shape and nnz: " + str(trainX.shape) + " " +
                          str(trainX.nnz))
            logging.debug("Validation X shape and nnz: " + str(testX.shape) +
                          " " + str(testX.nnz))
        else:
            trainX = X
            testX = None
            rowSamples = None
            testIndPtr, testColInds = None, None

        #Note that to compute the test AUC we pick i \in X and j \notin X \cup testX
        indPtr, colInds = SparseUtils.getOmegaListPtr(trainX)
        allIndPtr, allColInds = SparseUtils.getOmegaListPtr(X)

        if type(U) != numpy.ndarray and type(V) != numpy.ndarray:
            U, V = self.initUV(trainX)

        if self.metric == "f1":
            metricInd = 2
        elif self.metric == "mrr":
            metricInd = 3
        else:
            raise ValueError("Unknown metric: " + self.metric)

        muU = U.copy()
        muV = V.copy()
        bestMetric = 0
        bestU = 0
        bestV = 0
        trainMeasures = []
        testMeasures = []
        loopInd = 0
        lastObj = 0
        currentObj = lastObj - 2 * self.eps

        #Try alternative number of iterations
        #numIterations = trainX.nnz/self.numAucSamples
        numIterations = max(m, n)

        self.learnerCython = self.getCythonLearner()

        #Set up order of indices for stochastic methods
        permutedRowInds = numpy.array(numpy.random.permutation(m),
                                      numpy.uint32)
        permutedColInds = numpy.array(numpy.random.permutation(n),
                                      numpy.uint32)

        startTime = time.time()

        gi, gp, gq = self.computeGipq(X)
        normGp, normGq = self.computeNormGpq(indPtr, colInds, gp, gq, m)

        while loopInd < self.maxIterations and abs(lastObj -
                                                   currentObj) > self.eps:
            sigmaU = self.getSigma(loopInd, self.alpha, m)
            sigmaV = self.getSigma(loopInd, self.alpha, m)

            if loopInd % self.recordStep == 0:
                if loopInd != 0 and self.stochastic:
                    print("")

                printStr = self.recordResults(muU, muV, trainMeasures,
                                              testMeasures, loopInd,
                                              rowSamples, indPtr, colInds,
                                              testIndPtr, testColInds,
                                              allIndPtr, allColInds, gi, gp,
                                              gq, trainX, startTime)
                logging.debug(printStr)

                if testIndPtr is not None and testMeasures[-1][
                        metricInd] >= bestMetric:
                    bestMetric = testMeasures[-1][metricInd]
                    logging.debug("Current best metric=" + str(bestMetric))
                    bestU = muU.copy()
                    bestV = muV.copy()
                elif testIndPtr is None:
                    bestU = muU.copy()
                    bestV = muV.copy()

                #Compute objective averaged over last 5 recorded steps
                trainMeasuresArr = numpy.array(trainMeasures)
                lastObj = currentObj
                currentObj = numpy.mean(trainMeasuresArr[-5:, 0])

            U = numpy.ascontiguousarray(U)
            self.updateUV(indPtr, colInds, U, V, muU, muV, permutedRowInds,
                          permutedColInds, gp, gq, normGp, normGq, loopInd,
                          sigmaU, sigmaV, numIterations)
            loopInd += 1

        #Compute quantities for last U and V
        totalTime = time.time() - startTime
        printStr = "\nFinished, time=" + str('%.1f' % totalTime) + " "
        printStr += self.recordResults(muU, muV, trainMeasures, testMeasures,
                                       loopInd, rowSamples, indPtr, colInds,
                                       testIndPtr, testColInds, allIndPtr,
                                       allColInds, gi, gp, gq, trainX,
                                       startTime)
        printStr += " delta obj=" + "%.3e" % abs(lastObj - currentObj)
        logging.debug(printStr)

        self.U = bestU
        self.V = bestV
        self.gi = gi
        self.gp = gp
        self.gq = gq

        trainMeasures = numpy.array(trainMeasures)
        testMeasures = numpy.array(testMeasures)

        if verbose:
            return self.U, self.V, trainMeasures, testMeasures, loopInd, totalTime
        else:
            return self.U, self.V
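The loop above stops once the objective, averaged over the last five recorded steps, changes by less than eps, and the factors with the best validation metric are kept. A minimal usage sketch, assuming the MaxLocalAUC constructor shown in the next example and the attribute names read by singleLearnModel (metric, maxIterations, validationUsers, validationSize); the toy matrix is a hypothetical placeholder:

import scipy.sparse

# Toy implicit-feedback matrix; singleLearnModel converts scipy sparse input
# to a csarray internally (see above).
X = scipy.sparse.rand(50, 20, density=0.2, format="csr")
X.data[:] = 1

learner = MaxLocalAUC(8, 0.5, eps=10**-4, lmbdaU=0.0, lmbdaV=0.0, stochastic=True)
learner.metric = "f1"          # singleLearnModel accepts "f1" or "mrr"
learner.maxIterations = 100
learner.validationUsers = 0.1  # fraction of rows held out to pick the best U, V
learner.validationSize = 3     # non zeros removed per validation row

# verbose=True also returns the recorded train/validation measures and timing.
U, V, trainMeasures, testMeasures, iters, totalTime = learner.singleLearnModel(
    X, verbose=True)
scores = U.dot(V.T)            # predicted scores for every user-item pair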
Beispiel #19
0
if len(sys.argv) > 1:
    dataset = sys.argv[1]
else: 
    dataset = "synthetic"

saveResults = True
prefix = "LossROC"
outputFile = PathDefaults.getOutputDir() + "ranking/" + prefix + dataset.title() + "Results.npz" 
X = DatasetUtils.getDataset(dataset, nnz=20000)

m, n = X.shape
u = 0.1 
w = 1-u

testSize = 5
folds = 5
trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize)

numRecordAucSamples = 200

k2 = 8
u2 = 0.5
w2 = 1-u2
eps = 10**-4
lmbda = 0.0
maxLocalAuc = MaxLocalAUC(k2, w2, eps=eps, lmbdaU=lmbda, lmbdaV=lmbda, stochastic=True)
maxLocalAuc.alpha = 0.05
maxLocalAuc.alphas = 2.0**-numpy.arange(0, 5, 1)
maxLocalAuc.folds = 1
maxLocalAuc.initialAlg = "rand"
maxLocalAuc.itemExpP = 0.0
maxLocalAuc.itemExpQ = 0.0
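The script is cut off at this point; a hedged sketch of a typical continuation, training on each of the splits prepared above and saving the recorded measures to outputFile, assuming the singleLearnModel method from Beispiel #18 (the original continuation is not shown here):

maxLocalAuc.metric = "f1"      # singleLearnModel expects "f1" or "mrr"

if saveResults:
    allTrainMeasures = []
    allTestMeasures = []

    for trainX, testX in trainTestXs:
        U, V, trainMeasures, testMeasures, iters, totalTime = maxLocalAuc.singleLearnModel(
            trainX, verbose=True)
        allTrainMeasures.append(trainMeasures)
        allTestMeasures.append(testMeasures)

    numpy.savez(outputFile, allTrainMeasures, allTestMeasures)
    logging.debug("Saved results as " + outputFile)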
Beispiel #20
0
    def modelSelect2(self, X, rhos, ks, cvInds, colProbs=None):
        """
        Pick a value of rho based on a single matrix X. We do cross validation
        within, and return the best value of lambda (according to the mean
        squared error). The rhos must be in decreasing order and we use 
        warm restarts. In this case we remove a few non zeros from each row 
        to form the test set. 
        """
        #Use any() so a single out-of-order element triggers the error
        if (numpy.flipud(numpy.sort(rhos)) != rhos).any(): 
            raise ValueError("rhos must be in descending order")

        trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=False, rowMajor=False, colProbs=colProbs)
        metrics = numpy.zeros((rhos.shape[0], ks.shape[0], len(cvInds)))
        
        if self.metric == "mse": 
            metricFunction = learnPredictMSE
        elif self.metric == "f1" or self.metric == "mrr": 
            metricFunction = learnPredictRanking
        else: 
            raise ValueError("Unknown metric: " + self.metric)

        paramList = []
        
        for i, (trainX, testX) in enumerate(trainTestXs):
            Util.printIteration(i, 1, len(cvInds), "Fold: ")

            for m, k in enumerate(ks): 
                learner = self.copy()
                learner.updateAlg = "initial"
                learner.setK(k)
                paramList.append((learner, trainX, testX, rhos)) 
            
        if self.numProcesses != 1: 
            pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=10)
            resultsIter = pool.imap(metricFunction, paramList)
        else: 
            resultsIter = itertools.imap(metricFunction, paramList)
        
        for i, (trainX, testX) in enumerate(trainTestXs):
            for m, k in enumerate(ks):
                metrics[:, m, i] = resultsIter.next()
        
        if self.numProcesses != 1: 
            pool.terminate()

        meanMetrics = metrics.mean(2)
        stdMetrics = metrics.std(2)
        
        logging.debug("ks=" + str(ks))
        logging.debug("rhos=" + str(rhos))
        logging.debug(meanMetrics)
        
        #Set the parameters 
        if self.metric == "mse": 
            self.setRho(rhos[numpy.unravel_index(numpy.argmin(meanMetrics), meanMetrics.shape)[0]]) 
            self.setK(ks[numpy.unravel_index(numpy.argmin(meanMetrics), meanMetrics.shape)[1]])
        elif self.metric == "f1" or self.metric == "mrr": 
            self.setRho(rhos[numpy.unravel_index(numpy.argmax(meanMetrics), meanMetrics.shape)[0]]) 
            self.setK(ks[numpy.unravel_index(numpy.argmax(meanMetrics), meanMetrics.shape)[1]])
            

        logging.debug("Model parameters: k=" + str(self.k) + " rho=" + str(self.rho))

        return meanMetrics, stdMetrics
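A usage sketch for modelSelect2, assuming a learner that implements it and exposes setK/setRho plus the attributes read above (folds, validationSize, metric, numProcesses); learner and X are hypothetical placeholders and rhos must be in descending order:

import numpy

# X: the sparse user-item matrix; learner: an instance of the recommender defining modelSelect2
rhos = numpy.flipud(numpy.linspace(0.01, 0.5, 5))   # descending, as required
ks = numpy.array([8, 16, 32])
cvInds = range(3)        # only its length is used; keep it equal to learner.folds

learner.folds = 3
learner.validationSize = 5
learner.metric = "mse"   # "mse" is minimised, "f1"/"mrr" are maximised
learner.numProcesses = 1

# meanMetrics has shape (len(rhos), len(ks)); the best rho and k are set on the learner.
meanMetrics, stdMetrics = learner.modelSelect2(X, rhos, ks, cvInds)
print(learner.k, learner.rho)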
Beispiel #21
0
    def modelSelect(self, X, colProbs=None):
        """
        Perform model selection on X and return the best parameters. 
        """
        m, n = X.shape
        #cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
        trainTestXs = Sampling.shuffleSplitRows(X,
                                                self.folds,
                                                self.validationSize,
                                                colProbs=colProbs)
        testMetrics = numpy.zeros(
            (self.ks.shape[0], self.lmbdas.shape[0], len(trainTestXs)))

        if self.metric == "mrr":
            evaluationMethod = computeTestMRR
        elif self.metric == "f1":
            evaluationMethod = computeTestF1
        else:
            raise ValueError("Invalid metric: " + self.metric)

        logging.debug("Performing model selection")
        paramList = []

        for i, k in enumerate(self.ks):
            for j, lmbda in enumerate(self.lmbdas):
                for icv, (trainX, testX) in enumerate(trainTestXs):
                    learner = self.copy()
                    learner.k = k
                    learner.lmbda = lmbda

                    paramList.append(
                        (trainX.toScipyCsr(), testX.toScipyCsr(), learner))

        if self.numProcesses != 1:
            pool = multiprocessing.Pool(processes=self.numProcesses,
                                        maxtasksperchild=100)
            resultsIterator = pool.imap(evaluationMethod, paramList,
                                        self.chunkSize)
        else:
            import itertools
            resultsIterator = itertools.imap(evaluationMethod, paramList)

        for i, k in enumerate(self.ks):
            for j, lmbda in enumerate(self.lmbdas):
                for icv in range(len(trainTestXs)):
                    testMetrics[i, j, icv] = resultsIterator.next()

        if self.numProcesses != 1:
            pool.terminate()

        meanTestMetrics = numpy.mean(testMetrics, 2)
        stdTestMetrics = numpy.std(testMetrics, 2)

        logging.debug("ks=" + str(self.ks))
        logging.debug("lmbdas=" + str(self.lmbdas))
        logging.debug("Mean metrics=" + str(meanTestMetrics))

        self.k = self.ks[numpy.unravel_index(numpy.argmax(meanTestMetrics),
                                             meanTestMetrics.shape)[0]]
        self.lmbda = self.lmbdas[numpy.unravel_index(
            numpy.argmax(meanTestMetrics), meanTestMetrics.shape)[1]]

        logging.debug("Model parameters: k=" + str(self.k) + " lmbda=" +
                      str(self.lmbda))

        return meanTestMetrics, stdTestMetrics
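A usage sketch for the grid search above, assuming a learner that implements this modelSelect and exposes ks, lmbdas, metric, folds, validationSize, numProcesses and chunkSize; the sppy conversion mirrors the one used elsewhere in these examples and the matrix itself is a toy placeholder:

import numpy
import scipy.sparse
import sppy

# Toy binary matrix converted to an sppy csarray, whose splits provide the
# toScipyCsr method used above.
Xs = scipy.sparse.rand(50, 20, density=0.2, format="csr")
Xs.data[:] = 1
X = sppy.csarray(Xs, storagetype="row")

# learner: an instance of the recommender defining this modelSelect (hypothetical here)
learner.ks = numpy.array([8, 16, 32])
learner.lmbdas = 2.0**-numpy.arange(1, 6)
learner.metric = "f1"    # or "mrr"
learner.folds = 3
learner.validationSize = 5
learner.numProcesses = 1

# meanTestMetrics has shape (len(ks), len(lmbdas)); the best k and lmbda are
# set on the learner.
meanTestMetrics, stdTestMetrics = learner.modelSelect(X)
print(learner.k, learner.lmbda)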
Beispiel #22
0
    def modelSelect2(self, X, rhos, ks, cvInds, colProbs=None):
        """
        Pick a value of rho based on a single matrix X. We do cross validation
        within, and return the best value of lambda (according to the mean
        squared error). The rhos must be in decreasing order and we use 
        warm restarts. In this case we remove a few non zeros from each row 
        to form the test set. 
        """
        #Use any() so a single out-of-order element triggers the error
        if (numpy.flipud(numpy.sort(rhos)) != rhos).any():
            raise ValueError("rhos must be in descending order")

        trainTestXs = Sampling.shuffleSplitRows(X,
                                                self.folds,
                                                self.validationSize,
                                                csarray=False,
                                                rowMajor=False,
                                                colProbs=colProbs)
        metrics = numpy.zeros((rhos.shape[0], ks.shape[0], len(cvInds)))

        if self.metric == "mse":
            metricFunction = learnPredictMSE
        elif self.metric == "f1" or self.metric == "mrr":
            metricFunction = learnPredictRanking
        else:
            raise ValueError("Unknown metric: " + self.metric)

        paramList = []

        for i, (trainX, testX) in enumerate(trainTestXs):
            Util.printIteration(i, 1, len(cvInds), "Fold: ")

            for m, k in enumerate(ks):
                learner = self.copy()
                learner.updateAlg = "initial"
                learner.setK(k)
                paramList.append((learner, trainX, testX, rhos))

        if self.numProcesses != 1:
            pool = multiprocessing.Pool(processes=self.numProcesses,
                                        maxtasksperchild=10)
            resultsIter = pool.imap(metricFunction, paramList)
        else:
            resultsIter = itertools.imap(metricFunction, paramList)

        for i, (trainX, testX) in enumerate(trainTestXs):
            for m, k in enumerate(ks):
                metrics[:, m, i] = resultsIter.next()

        if self.numProcesses != 1:
            pool.terminate()

        meanMetrics = metrics.mean(2)
        stdMetrics = metrics.std(2)

        logging.debug("ks=" + str(ks))
        logging.debug("rhos=" + str(rhos))
        logging.debug(meanMetrics)

        #Set the parameters
        if self.metric == "mse":
            self.setRho(rhos[numpy.unravel_index(numpy.argmin(meanMetrics),
                                                 meanMetrics.shape)[0]])
            self.setK(ks[numpy.unravel_index(numpy.argmin(meanMetrics),
                                             meanMetrics.shape)[1]])
        elif self.metric == "f1" or self.metric == "mrr":
            self.setRho(rhos[numpy.unravel_index(numpy.argmax(meanMetrics),
                                                 meanMetrics.shape)[0]])
            self.setK(ks[numpy.unravel_index(numpy.argmax(meanMetrics),
                                             meanMetrics.shape)[1]])

        logging.debug("Model parameters: k=" + str(self.k) + " rho=" +
                      str(self.rho))

        return meanMetrics, stdMetrics