def testGetOmegaList(self):
    """
    Check that SparseUtils.getOmegaList returns, for every row, the column
    indices of the non-zero entries, both for a scipy CSR matrix and for
    the sppy.csarray built from it.
    """
    import sppy

    numRows, numCols = 10, 5
    X = scipy.sparse.rand(numRows, numCols, 0.1).tocsr()

    # Dense copy used as the reference for both matrix types.
    dense = X.toarray()

    def checkAgainstDense(matrix):
        rowIndices = SparseUtils.getOmegaList(matrix)
        for row in range(numRows):
            expected = dense[row, :].nonzero()[0]
            nptst.assert_array_almost_equal(rowIndices[row], expected)

    checkAgainstDense(X)
    checkAgainstDense(sppy.csarray(X))
def modelSelect(self, X):
    """
    Perform model selection on X over every (k, lmbda) pair taken from
    self.ks and self.lmbdas, and store the best pair in self.k and
    self.lmbda.

    The non-zero elements of X are split into folds with
    Sampling.randCrossValidation. For every fold and every k, a copy of
    this learner is handed to localAucsLmbdas through a multiprocessing
    pool, and each result fills the lmbda axis of the score array. The pair
    with the largest mean score over the folds is selected.

    :param X: matrix exposing nnz, accepted by SparseUtils.submatrix.

    :return: (meanLocalAucs, stdLocalAucs), the mean and standard deviation
        over the folds, each of shape (len(self.ks), len(self.lmbdas)).
    """
    cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    numFolds = len(cvInds)
    numKs = self.ks.shape[0]
    localAucs = numpy.zeros((numKs, self.lmbdas.shape[0], numFolds))

    logging.debug("Performing model selection")
    paramList = []

    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, self.folds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)
        testOmegaList = SparseUtils.getOmegaList(testX)

        for k in self.ks:
            maxLocalAuc = self.copy()
            maxLocalAuc.k = k
            paramList.append((trainX, testX, testOmegaList, maxLocalAuc))

    pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
    try:
        resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize)

        # imap yields results in paramList order: folds outermost, ks innermost.
        for icv in range(numFolds):
            for i in range(numKs):
                localAucs[i, :, icv] = next(resultsIterator)
    finally:
        # Always release the worker processes, even if a task raised.
        pool.terminate()

    meanLocalAucs = numpy.mean(localAucs, 2)
    stdLocalAucs = numpy.std(localAucs, 2)

    logging.debug(meanLocalAucs)

    bestKInd, bestLmbdaInd = numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)
    k = self.ks[bestKInd]
    lmbda = self.lmbdas[bestLmbdaInd]

    logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda))

    self.k = k
    self.lmbda = lmbda

    return meanLocalAucs, stdLocalAucs
def modelSelect(self, X):
    """
    Perform model selection on X over every (k, lmbda) pair taken from
    self.ks and self.lmbdas, and store the best pair in self.k and
    self.lmbda.

    The non-zero elements of X are split into folds with
    Sampling.randCrossValidation. For every fold and every k, a copy of
    this learner is handed to localAucsLmbdas through a multiprocessing
    pool, and each result fills the lmbda axis of the score array. The pair
    with the largest mean score over the folds is selected.

    :param X: matrix exposing nnz, accepted by SparseUtils.submatrix.

    :return: (meanLocalAucs, stdLocalAucs), the mean and standard deviation
        over the folds, each of shape (len(self.ks), len(self.lmbdas)).
    """
    cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    numFolds = len(cvInds)
    numKs = self.ks.shape[0]
    localAucs = numpy.zeros((numKs, self.lmbdas.shape[0], numFolds))

    logging.debug("Performing model selection")
    paramList = []

    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, self.folds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)
        testOmegaList = SparseUtils.getOmegaList(testX)

        for k in self.ks:
            maxLocalAuc = self.copy()
            maxLocalAuc.k = k
            paramList.append((trainX, testX, testOmegaList, maxLocalAuc))

    pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
    try:
        resultsIterator = pool.imap(localAucsLmbdas, paramList, self.chunkSize)

        # imap yields results in paramList order: folds outermost, ks innermost.
        for icv in range(numFolds):
            for i in range(numKs):
                localAucs[i, :, icv] = next(resultsIterator)
    finally:
        # Always release the worker processes, even if a task raised.
        pool.terminate()

    meanLocalAucs = numpy.mean(localAucs, 2)
    stdLocalAucs = numpy.std(localAucs, 2)

    logging.debug(meanLocalAucs)

    bestKInd, bestLmbdaInd = numpy.unravel_index(numpy.argmax(meanLocalAucs), meanLocalAucs.shape)
    k = self.ks[bestKInd]
    lmbda = self.lmbdas[bestLmbdaInd]

    logging.debug("Model parameters: k=" + str(k) + " lmbda=" + str(lmbda))

    self.k = k
    self.lmbda = lmbda

    return meanLocalAucs, stdLocalAucs
def modelSelect(self, X):
    """
    Perform model selection on X over the values in self.ks and store the
    best one in self.k.

    The non-zero elements of X are split into folds with
    Sampling.randCrossValidation. For every fold and every k, a copy of
    this learner is scored by computePrecision, and the k with the largest
    mean score over the folds is selected.

    :param X: matrix exposing nnz, accepted by SparseUtils.submatrix.

    :return: (meanPrecisions, stdPrecisions), the mean and standard
        deviation over the folds, each of length len(self.ks).
    """
    cvInds = Sampling.randCrossValidation(self.folds, X.nnz)
    numFolds = len(cvInds)
    numKs = self.ks.shape[0]
    precisions = numpy.zeros((numKs, numFolds))

    logging.debug("Performing model selection")
    paramList = []

    for icv, (trainInds, testInds) in enumerate(cvInds):
        Util.printIteration(icv, 1, self.folds, "Fold: ")

        trainX = SparseUtils.submatrix(X, trainInds)
        testX = SparseUtils.submatrix(X, testInds)
        testOmegaList = SparseUtils.getOmegaList(testX)

        for k in self.ks:
            learner = self.copy()
            learner.k = k
            paramList.append((trainX, testX, testOmegaList, learner))

    # NOTE: evaluation runs serially in this process; no worker pool is
    # created here. The generator keeps the evaluation lazy, one parameter
    # set per next() call.
    resultsIterator = (computePrecision(params) for params in paramList)

    # Results come back in paramList order: folds outermost, ks innermost.
    for icv in range(numFolds):
        for i in range(numKs):
            precisions[i, icv] = next(resultsIterator)

    meanPrecisions = numpy.mean(precisions, 1)
    stdPrecisions = numpy.std(precisions, 1)

    logging.debug(meanPrecisions)

    k = self.ks[numpy.argmax(meanPrecisions)]

    logging.debug("Model parameters: k=" + str(k))

    self.k = k

    return meanPrecisions, stdPrecisions
def localAUCApprox2(X, U, V, w, numAucSamples=50, omegaList=None):
    """
    Compute the estimated local AUC for the score functions UV^T relative to X
    with quantile w.

    For every row i, numAucSamples pairs (p, q) are drawn uniformly at random
    with p taken from omegaList[i] and q from the remaining column indices.
    A pair is counted when Z[i, p] exceeds both Z[i, q] and r[i], where
    Z = UV^T and r is returned by SparseUtilsCython.computeR. Rows for which
    either index set is empty contribute 0 to the average.

    :param X: matrix whose shape gives the number of rows and columns; it is
        only passed to SparseUtils.getOmegaList when omegaList is None.
    :param U: row factors, multiplied with V.T to form the scores.
    :param V: column factors.
    :param w: quantile forwarded to SparseUtilsCython.computeR.
    :param numAucSamples: number of pairs sampled per row.
    :param omegaList: optional per-row arrays of column indices, assumed to
        be unique since setdiff1d is called with assume_unique=True. It is
        computed from X when None.

    :return: the mean over all rows of the fraction of counted pairs.
    """
    # For now let's compute the full matrix
    Z = U.dot(V.T)

    numRows = X.shape[0]
    localAuc = numpy.zeros(numRows)
    allInds = numpy.arange(X.shape[1])

    U = numpy.ascontiguousarray(U)
    V = numpy.ascontiguousarray(V)
    r = SparseUtilsCython.computeR(U, V, w, numAucSamples)

    # Identity test: "== None" is evaluated elementwise if an array is passed.
    if omegaList is None:
        omegaList = SparseUtils.getOmegaList(X)

    for i in range(numRows):
        omegai = omegaList[i]
        omegaBari = numpy.setdiff1d(allInds, omegai, assume_unique=True)
        numOmega = omegai.shape[0]
        numOmegaBar = omegaBari.shape[0]

        if numOmega == 0 or numOmegaBar == 0:
            # No pair can be formed for this row; its entry stays at 0.
            continue

        numPairs = numOmega * numOmegaBar
        partialAuc = 0

        for _ in range(numAucSamples):
            # One draw encodes the pair: quotient indexes omegai and
            # remainder indexes omegaBari. Integer arithmetic avoids the
            # float round trip of int(ind / numOmegaBar).
            ind = numpy.random.randint(numPairs)
            omegaPos, omegaBarPos = divmod(ind, numOmegaBar)
            p = omegai[omegaPos]
            q = omegaBari[omegaBarPos]

            if Z[i, p] > Z[i, q] and Z[i, p] > r[i]:
                partialAuc += 1

        localAuc[i] = partialAuc / float(numAucSamples)

    localAuc = localAuc.mean()

    return localAuc
def localAUCApprox2(X, U, V, w, numAucSamples=50, omegaList=None):
    """
    Compute the estimated local AUC for the score functions UV^T relative to X
    with quantile w.

    For every row i, numAucSamples pairs (p, q) are drawn uniformly at random
    with p taken from omegaList[i] and q from the remaining column indices.
    A pair is counted when Z[i, p] exceeds both Z[i, q] and r[i], where
    Z = UV^T and r is returned by SparseUtilsCython.computeR. Rows for which
    either index set is empty contribute 0 to the average.

    :param X: matrix whose shape gives the number of rows and columns; it is
        only passed to SparseUtils.getOmegaList when omegaList is None.
    :param U: row factors, multiplied with V.T to form the scores.
    :param V: column factors.
    :param w: quantile forwarded to SparseUtilsCython.computeR.
    :param numAucSamples: number of pairs sampled per row.
    :param omegaList: optional per-row arrays of column indices, assumed to
        be unique since setdiff1d is called with assume_unique=True. It is
        computed from X when None.

    :return: the mean over all rows of the fraction of counted pairs.
    """
    # For now let's compute the full matrix
    Z = U.dot(V.T)

    numRows = X.shape[0]
    localAuc = numpy.zeros(numRows)
    allInds = numpy.arange(X.shape[1])

    U = numpy.ascontiguousarray(U)
    V = numpy.ascontiguousarray(V)
    r = SparseUtilsCython.computeR(U, V, w, numAucSamples)

    # Identity test: "== None" is evaluated elementwise if an array is passed.
    if omegaList is None:
        omegaList = SparseUtils.getOmegaList(X)

    for i in range(numRows):
        omegai = omegaList[i]
        omegaBari = numpy.setdiff1d(allInds, omegai, assume_unique=True)
        numOmega = omegai.shape[0]
        numOmegaBar = omegaBari.shape[0]

        if numOmega == 0 or numOmegaBar == 0:
            # No pair can be formed for this row; its entry stays at 0.
            continue

        numPairs = numOmega * numOmegaBar
        partialAuc = 0

        for _ in range(numAucSamples):
            # One draw encodes the pair: quotient indexes omegai and
            # remainder indexes omegaBari. Integer arithmetic avoids the
            # float round trip of int(ind / numOmegaBar).
            ind = numpy.random.randint(numPairs)
            omegaPos, omegaBarPos = divmod(ind, numOmegaBar)
            p = omegai[omegaPos]
            q = omegaBari[omegaBarPos]

            if Z[i, p] > Z[i, q] and Z[i, p] > r[i]:
                partialAuc += 1

        localAuc[i] = partialAuc / float(numAucSamples)

    localAuc = localAuc.mean()

    return localAuc
def profileLocalAucApprox(self):
    """
    Profile repeated calls to MCEvaluator.localAUCApprox on a generated
    500 x 1000 matrix, using precomputed omegaList and r.
    """
    # Matrix dimensions and the k argument handed to the generator.
    m, n, k = 500, 1000, 10
    X, U, s, V = SparseUtils.generateSparseBinaryMatrix(
        (m, n), k, csarray=True, verbose=True)

    u = 0.1
    w = 1 - u
    numAucSamples = 200

    # Inputs computed once, outside the profiled calls.
    omegaList = SparseUtils.getOmegaList(X)
    r = SparseUtilsCython.computeR(U, V, w, numAucSamples)

    numRuns = 10

    def run():
        # The name "run" must match the 'run()' string given to the profiler.
        completed = 0
        while completed < numRuns:
            MCEvaluator.localAUCApprox(X, U, V, omegaList, numAucSamples, r)
            completed += 1

    ProfileUtils.profile('run()', globals(), locals())
def modelSelect(self, X, colProbs=None):
    """
    Perform model selection on X over every (k, lmbda, gamma) triple taken
    from self.ks, self.lmbdas and self.gammas, and store the best triple in
    self.k, self.lmbda and self.gamma.

    Train/test splits come from Sampling.shuffleSplitRows. For every triple
    and every split, a copy of this learner (with U and V copied from
    self.initUV(X, k)) is scored by computeTestF1, in a multiprocessing
    pool when self.numProcesses != 1 and serially otherwise. The triple
    with the largest mean score over the splits is selected.

    :param X: matrix passed to Sampling.shuffleSplitRows and self.initUV.
    :param colProbs: forwarded unchanged to Sampling.shuffleSplitRows.

    :return: (meanTestMetrics, stdTestMetrics), the mean and standard
        deviation over the splits, each of shape
        (len(self.ks), len(self.lmbdas), len(self.gammas)).
    """
    trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=False, colProbs=colProbs)

    testMetrics = numpy.zeros((len(self.ks), len(self.lmbdas), len(self.gammas), len(trainTestXs)))

    logging.debug("Performing model selection")
    paramList = []

    for k in self.ks:
        # One initialisation per k; every learner gets its own copy.
        U, V = self.initUV(X, k)

        for lmbda in self.lmbdas:
            for gamma in self.gammas:
                for (trainX, testX) in trainTestXs:
                    learner = self.copy()
                    learner.k = k
                    learner.U = U.copy()
                    learner.V = V.copy()
                    learner.lmbda = lmbda
                    learner.gamma = gamma

                    paramList.append((
                        scipy.sparse.csr_matrix(trainX, dtype=numpy.float64),
                        scipy.sparse.csr_matrix(testX, dtype=numpy.float64),
                        learner))

    pool = None
    try:
        if self.numProcesses != 1:
            pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
            resultsIterator = pool.imap(computeTestF1, paramList, self.chunkSize)
        else:
            # Lazy serial evaluation, one parameter set per next() call.
            resultsIterator = (computeTestF1(params) for params in paramList)

        # ndindex walks (k, lmbda, gamma, split) with the last index varying
        # fastest, which is the order in which paramList was built.
        for index in numpy.ndindex(*testMetrics.shape):
            testMetrics[index] = next(resultsIterator)
    finally:
        # Always release the worker processes, even if a task raised.
        if pool is not None:
            pool.terminate()

    meanTestMetrics = numpy.mean(testMetrics, 3)
    stdTestMetrics = numpy.std(testMetrics, 3)

    logging.debug("ks=" + str(self.ks))
    logging.debug("lmbdas=" + str(self.lmbdas))
    logging.debug("gammas=" + str(self.gammas))
    logging.debug("Mean metrics=" + str(meanTestMetrics))

    i_k, i_lmbda, i_gamma = numpy.unravel_index(meanTestMetrics.argmax(), meanTestMetrics.shape)
    self.k = self.ks[i_k]
    self.lmbda = self.lmbdas[i_lmbda]
    self.gamma = self.gammas[i_gamma]

    logging.debug("Model parameters: k=" + str(self.k) + " lmbda=" + str(self.lmbda) + " gamma=" + str(self.gamma))

    return meanTestMetrics, stdTestMetrics
def shuffleSplitRows(X, k, testSize, numRows=None, csarray=True, rowMajor=True, colProbs=None):
    """
    Take a sparse binary matrix and create k number of train-test splits
    in which the test split contains at most testSize elements and the train
    split contains the remaining elements from X for each row. The splits are
    computed randomly. Returns sppy.csarray objects by default.

    :param X: matrix exposing shape and nnz, accepted by
        SparseUtils.getOmegaList.
    :param k: the number of splits to generate.
    :param testSize: the maximum number of test elements per sampled row.
    :param numRows: the number of rows, drawn without replacement, that
        contribute test elements. If None every row is used.
    :param csarray: selects the "csarray" matrix type when True and "scipy"
        otherwise.
    :param rowMajor: selects "row" storage when True and "col" otherwise.
    :param colProbs: This is the probability of choosing the corresponding
        column/item. If None, we assume uniform probabilities.

    :return: a list with k entries, each (trainX, testX) when numRows is
        None and (trainX, testX, rowSample) otherwise, where rowSample is
        the sorted array of sampled row indices.
    """
    mattype = "csarray" if csarray else "scipy"
    storagetype = "row" if rowMajor else "col"

    # Identity tests: "== None" is evaluated elementwise when an array is
    # passed, which makes the truth value of the comparison ambiguous.
    outputRows = numRows is not None
    if numRows is None:
        numRows = X.shape[0]

    trainTestXList = []
    omegaList = SparseUtils.getOmegaList(X)
    m = X.shape[0]

    for _ in range(k):
        trainInd = 0
        testInd = 0

        # Buffers sized for the worst case and trimmed after the row loop.
        trainRowInds = numpy.zeros(X.nnz, numpy.int32)
        trainColInds = numpy.zeros(X.nnz, numpy.int32)
        testRowInds = numpy.zeros(m * testSize, numpy.int32)
        testColInds = numpy.zeros(m * testSize, numpy.int32)

        rowSample = numpy.sort(numpy.random.choice(m, numRows, replace=False))

        # Boolean mask so the per-row membership test does not scan rowSample.
        isSampled = numpy.zeros(m, dtype=bool)
        isSampled[rowSample] = True

        for j in range(m):
            omegaj = omegaList[j]
            numElements = omegaj.shape[0]

            if isSampled[j]:
                if colProbs is None:
                    inds = numpy.random.permutation(numElements)
                elif numElements == 0:
                    # Nothing to sample, and normalising would divide by zero.
                    inds = numpy.arange(0)
                else:
                    probs = colProbs[omegaj]
                    # Out-of-place division also works for integer colProbs.
                    probs = probs / probs.sum()
                    inds = numpy.random.choice(numElements, numElements, p=probs, replace=False)

                trainInds = inds[testSize:]
                testInds = inds[0:testSize]
            else:
                trainInds = numpy.arange(numElements)
                testInds = numpy.array([], int)

            numTrain = trainInds.shape[0]
            trainRowInds[trainInd:trainInd + numTrain] = j
            trainColInds[trainInd:trainInd + numTrain] = omegaj[trainInds]
            trainInd += numTrain

            numTest = testInds.shape[0]
            testRowInds[testInd:testInd + numTest] = j
            testColInds[testInd:testInd + numTest] = omegaj[testInds]
            testInd += numTest

        trainRowInds = trainRowInds[0:trainInd]
        trainColInds = trainColInds[0:trainInd]

        testRowInds = testRowInds[0:testInd]
        testColInds = testColInds[0:testInd]

        # numpy.int was only an alias of the builtin int and has been removed
        # from NumPy.
        trainX = SparseUtils.sparseMatrix(numpy.ones(trainRowInds.shape[0], int), trainRowInds, trainColInds, X.shape, mattype, storagetype)
        testX = SparseUtils.sparseMatrix(numpy.ones(testRowInds.shape[0], int), testRowInds, testColInds, X.shape, mattype, storagetype)

        if not outputRows:
            trainTestXList.append((trainX, testX))
        else:
            trainTestXList.append((trainX, testX, rowSample))

    return trainTestXList
def shuffleSplitRows(X, k, testSize, numRows=None, csarray=True, rowMajor=True, colProbs=None):
    """
    Take a sparse binary matrix and create k number of train-test splits
    in which the test split contains at most testSize elements and the train
    split contains the remaining elements from X for each row. The splits are
    computed randomly. Returns sppy.csarray objects by default.

    :param X: matrix exposing shape and nnz, accepted by
        SparseUtils.getOmegaList.
    :param k: the number of splits to generate.
    :param testSize: the maximum number of test elements per sampled row.
    :param numRows: the number of rows, drawn without replacement, that
        contribute test elements. If None every row is used.
    :param csarray: selects the "csarray" matrix type when True and "scipy"
        otherwise.
    :param rowMajor: selects "row" storage when True and "col" otherwise.
    :param colProbs: This is the probability of choosing the corresponding
        column/item. If None, we assume uniform probabilities.

    :return: a list with k entries, each (trainX, testX) when numRows is
        None and (trainX, testX, rowSample) otherwise, where rowSample is
        the sorted array of sampled row indices.
    """
    mattype = "csarray" if csarray else "scipy"
    storagetype = "row" if rowMajor else "col"

    # Identity tests: "== None" is evaluated elementwise when an array is
    # passed, which makes the truth value of the comparison ambiguous.
    outputRows = numRows is not None
    if numRows is None:
        numRows = X.shape[0]

    trainTestXList = []
    omegaList = SparseUtils.getOmegaList(X)
    m = X.shape[0]

    for _ in range(k):
        trainInd = 0
        testInd = 0

        # Buffers sized for the worst case and trimmed after the row loop.
        trainRowInds = numpy.zeros(X.nnz, numpy.int32)
        trainColInds = numpy.zeros(X.nnz, numpy.int32)
        testRowInds = numpy.zeros(m * testSize, numpy.int32)
        testColInds = numpy.zeros(m * testSize, numpy.int32)

        rowSample = numpy.sort(numpy.random.choice(m, numRows, replace=False))

        # Boolean mask so the per-row membership test does not scan rowSample.
        isSampled = numpy.zeros(m, dtype=bool)
        isSampled[rowSample] = True

        for j in range(m):
            omegaj = omegaList[j]
            numElements = omegaj.shape[0]

            if isSampled[j]:
                if colProbs is None:
                    inds = numpy.random.permutation(numElements)
                elif numElements == 0:
                    # Nothing to sample, and normalising would divide by zero.
                    inds = numpy.arange(0)
                else:
                    probs = colProbs[omegaj]
                    # Out-of-place division also works for integer colProbs.
                    probs = probs / probs.sum()
                    inds = numpy.random.choice(numElements, numElements, p=probs, replace=False)

                trainInds = inds[testSize:]
                testInds = inds[0:testSize]
            else:
                trainInds = numpy.arange(numElements)
                testInds = numpy.array([], int)

            numTrain = trainInds.shape[0]
            trainRowInds[trainInd:trainInd + numTrain] = j
            trainColInds[trainInd:trainInd + numTrain] = omegaj[trainInds]
            trainInd += numTrain

            numTest = testInds.shape[0]
            testRowInds[testInd:testInd + numTest] = j
            testColInds[testInd:testInd + numTest] = omegaj[testInds]
            testInd += numTest

        trainRowInds = trainRowInds[0:trainInd]
        trainColInds = trainColInds[0:trainInd]

        testRowInds = testRowInds[0:testInd]
        testColInds = testColInds[0:testInd]

        # numpy.int was only an alias of the builtin int and has been removed
        # from NumPy.
        trainX = SparseUtils.sparseMatrix(numpy.ones(trainRowInds.shape[0], int), trainRowInds, trainColInds, X.shape, mattype, storagetype)
        testX = SparseUtils.sparseMatrix(numpy.ones(testRowInds.shape[0], int), testRowInds, testColInds, X.shape, mattype, storagetype)

        if not outputRows:
            trainTestXList.append((trainX, testX))
        else:
            trainTestXList.append((trainX, testX, rowSample))

    return trainTestXList
#Create a low rank matrix m = 500 n = 1000 k = 20 X = SparseUtils.generateSparseBinaryMatrix((m,n), k, 0.95) logging.debug("Number of non zero elements: " + str(X.nnz)) lmbda = 0.0 numAucSamples = 1000 u = 0.1 sigma = 1 nu = 1 nuBar = 1 project = False omegaList = SparseUtils.getOmegaList(X) U = numpy.random.rand(m, k) V = numpy.random.rand(n, k) r = SparseUtilsCython.computeR(U, V, 1-u, numAucSamples) numPoints = 50 sampleSize = 10 numAucSamplesList = numpy.linspace(1, 50, numPoints) norms = numpy.zeros(numPoints) originalU = U.copy() for s in range(sampleSize): print(s) i = numpy.random.randint(m)
def modelSelect(self, X, colProbs=None):
    """
    Perform model selection on X over every (k, lmbda, gamma) triple taken
    from self.ks, self.lmbdas and self.gammas, and store the best triple in
    self.k, self.lmbda and self.gamma.

    Train/test splits come from Sampling.shuffleSplitRows. For every triple
    and every split, a copy of this learner (with U and V copied from
    self.initUV(X, k)) is scored by computeTestF1, in a multiprocessing
    pool when self.numProcesses != 1 and serially otherwise. The triple
    with the largest mean score over the splits is selected.

    :param X: matrix passed to Sampling.shuffleSplitRows and self.initUV.
    :param colProbs: forwarded unchanged to Sampling.shuffleSplitRows.

    :return: (meanTestMetrics, stdTestMetrics), the mean and standard
        deviation over the splits, each of shape
        (len(self.ks), len(self.lmbdas), len(self.gammas)).
    """
    trainTestXs = Sampling.shuffleSplitRows(X, self.folds, self.validationSize, csarray=False, colProbs=colProbs)

    testMetrics = numpy.zeros((len(self.ks), len(self.lmbdas), len(self.gammas), len(trainTestXs)))

    logging.debug("Performing model selection")
    paramList = []

    for k in self.ks:
        # One initialisation per k; every learner gets its own copy.
        U, V = self.initUV(X, k)

        for lmbda in self.lmbdas:
            for gamma in self.gammas:
                for (trainX, testX) in trainTestXs:
                    learner = self.copy()
                    learner.k = k
                    learner.U = U.copy()
                    learner.V = V.copy()
                    learner.lmbda = lmbda
                    learner.gamma = gamma

                    paramList.append((
                        scipy.sparse.csr_matrix(trainX, dtype=numpy.float64),
                        scipy.sparse.csr_matrix(testX, dtype=numpy.float64),
                        learner))

    pool = None
    try:
        if self.numProcesses != 1:
            pool = multiprocessing.Pool(processes=self.numProcesses, maxtasksperchild=100)
            resultsIterator = pool.imap(computeTestF1, paramList, self.chunkSize)
        else:
            # Lazy serial evaluation, one parameter set per next() call.
            resultsIterator = (computeTestF1(params) for params in paramList)

        # ndindex walks (k, lmbda, gamma, split) with the last index varying
        # fastest, which is the order in which paramList was built.
        for index in numpy.ndindex(*testMetrics.shape):
            testMetrics[index] = next(resultsIterator)
    finally:
        # Always release the worker processes, even if a task raised.
        if pool is not None:
            pool.terminate()

    meanTestMetrics = numpy.mean(testMetrics, 3)
    stdTestMetrics = numpy.std(testMetrics, 3)

    logging.debug("ks=" + str(self.ks))
    logging.debug("lmbdas=" + str(self.lmbdas))
    logging.debug("gammas=" + str(self.gammas))
    logging.debug("Mean metrics=" + str(meanTestMetrics))

    i_k, i_lmbda, i_gamma = numpy.unravel_index(meanTestMetrics.argmax(), meanTestMetrics.shape)
    self.k = self.ks[i_k]
    self.lmbda = self.lmbdas[i_lmbda]
    self.gamma = self.gammas[i_gamma]

    logging.debug("Model parameters: k=" + str(self.k) + " lmbda=" + str(self.lmbda) + " gamma=" + str(self.gamma))

    return meanTestMetrics, stdTestMetrics
dataset = "epinions" #Create a low rank matrix saveResults = True prefix = "Convergence2" outputFile = PathDefaults.getOutputDir() + "ranking/" + prefix + dataset.title() + "Results.npz" X = DatasetUtils.getDataset(dataset, nnz=20000) m, n = X.shape folds = 3 testSize = 5 trainTestXs = Sampling.shuffleSplitRows(X, folds, testSize) trainX, testX = trainTestXs[0] trainOmegaList = SparseUtils.getOmegaList(trainX) trainOmegaPtr = SparseUtils.getOmegaListPtr(trainX) testOmegaList = SparseUtils.getOmegaList(testX) testOmegaPtr = SparseUtils.getOmegaListPtr(testX) allOmegaPtr = SparseUtils.getOmegaListPtr(X) numRecordAucSamples = 200 logging.debug("Number of non-zero elements: " + str((trainX.nnz, testX.nnz))) k2 = 64 u2 = 5/float(n) w2 = 1-u2 eps = 10**-8 lmbda = 0.01 maxLocalAuc = MaxLocalAUC(k2, w2, eps=eps, lmbdaU=0.1, lmbdaV=0.1, stochastic=True) maxLocalAuc.alpha = 32