def testComputeR(self):
    """
    Compare SparseUtilsCython.computeR with exact row statistics of
    Z = U V^T: the row maxima for u=1, the row minima for u=0 and
    numpy.percentile for u=0.3, first on a 10x15 product and then on a
    100x105 one.
    """
    def relativeError(reference, other):
        #Norm of the difference scaled by the norm of the first argument
        return numpy.linalg.norm(reference - other)/numpy.linalg.norm(reference)

    tol = 0.1
    U = numpy.random.rand(10, 5)
    V = numpy.random.rand(15, 5)
    Z = U.dot(V.T)

    #The largest quantile should be close to the row maxima
    u = 1.0
    r = SparseUtilsCython.computeR(U, V, u, indsPerRow=1000)
    rowMax = Z.max(1)
    self.assertTrue(relativeError(rowMax, r) < tol)

    #The smallest quantile should be close to the row minima
    u = 0.0
    r = SparseUtilsCython.computeR(U, V, u, indsPerRow=1000)
    rowMin = Z.min(1)
    self.assertTrue(relativeError(rowMin, r) < tol)

    #An intermediate quantile is compared with numpy.percentile
    u = 0.3
    r = SparseUtilsCython.computeR(U, V, u, indsPerRow=1000)
    r2 = numpy.percentile(Z, u*100.0, 1)
    self.assertTrue(relativeError(r, r2) < tol)

    #Try a larger matrix
    U = numpy.random.rand(100, 5)
    V = numpy.random.rand(105, 5)
    Z = U.dot(V.T)

    r = SparseUtilsCython.computeR(U, V, u)
    r2 = numpy.percentile(Z, u*100.0, 1)
    absoluteError = numpy.linalg.norm(r - r2)
    self.assertTrue(absoluteError < 0.5)
def testPartialOuterProduct(self):
    """
    Check SparseUtilsCython.partialOuterProduct against numpy.outer, first on
    every nonzero entry of the outer product and then on the nonzero pattern
    of a random sparse matrix.
    """
    m = 15
    n = 10

    u = numpy.random.rand(m)
    v = numpy.random.rand(n)
    Y = numpy.outer(u, v)

    inds = numpy.nonzero(Y)
    rowInds = numpy.array(inds[0], numpy.int32)
    colInds = numpy.array(inds[1], numpy.int32)
    vals = SparseUtilsCython.partialOuterProduct(rowInds, colInds, u, v)
    #The reshape relies on every entry of Y being nonzero
    X = numpy.reshape(vals, Y.shape)

    nptst.assert_almost_equal(X, Y)

    #Try just some indices
    density = 0.2
    A = scipy.sparse.rand(n, n, density)
    inds = A.nonzero()
    rowInds = numpy.array(inds[0], numpy.int32)
    colInds = numpy.array(inds[1], numpy.int32)

    vals = SparseUtilsCython.partialOuterProduct(rowInds, colInds, u, v)

    #assertAlmostEqual/assertEqual replace the aliases removed in Python 3.12
    for i, (j, k) in enumerate(zip(inds[0], inds[1])):
        self.assertAlmostEqual(vals[i], Y[j, k])

    self.assertEqual(A.nnz, inds[0].shape[0])
def centerRows(X, mu=None, inds=None):
    """
    Subtract from each nonzero element of X the mean of the nonzero elements
    of its row. X is modified in place.

    :param X: The matrix to center; it must support nonzero(), sum(1) and
        assignment through index arrays.

    :param mu: The row means to subtract. If None they are computed from X.

    :param inds: An optional (rowInds, colInds) tuple used in place of
        X.nonzero(). The subtracted values are aligned with these indices,
        so they are expected to enumerate the same entries as X.nonzero().

    :return: The tuple (X, mu).
    """
    #Use identity tests: comparing an array with == None is elementwise
    if inds is None:
        rowInds, colInds = X.nonzero()
    else:
        rowInds, colInds = inds

    rowInds = numpy.array(rowInds, numpy.int32)
    colInds = numpy.array(colInds, numpy.int32)

    if mu is None:
        #This is the mean of the nonzero values in each row
        nonZeroCounts = numpy.bincount(rowInds, minlength=X.shape[0])
        emptyRows = nonZeroCounts == 0
        #Avoid a division by zero for rows without nonzero elements
        nonZeroCounts += emptyRows
        #This is required because when we do X.sum(1) for centering it uses the same
        #dtype as X to store the sum, and this can result in overflow for e.g. uint8
        if X.dtype == numpy.uint8:
            sumCol = SparseUtilsCython.sumCols(rowInds, numpy.array(X[rowInds, colInds]).flatten(), X.shape[0])
        else:
            sumCol = numpy.array(X.sum(1)).flatten()
        mu = sumCol/nonZeroCounts
        mu[emptyRows] = 0

    #The builtin float replaces numpy.float, which NumPy 1.24 removed
    vals = SparseUtilsCython.partialOuterProduct(rowInds, colInds, numpy.array(mu, float), numpy.ones(X.shape[1]))
    X[X.nonzero()] = numpy.array(X[X.nonzero()] - vals, float)

    return X, mu
def centerRows(X, mu=None, inds=None):
    """
    Subtract from each nonzero element of X the mean of the nonzero elements
    of its row. X is modified in place.

    :param X: The matrix to center; it must support nonzero(), sum(1) and
        assignment through index arrays.

    :param mu: The row means to subtract. If None they are computed from X.

    :param inds: An optional (rowInds, colInds) tuple used in place of
        X.nonzero(). The subtracted values are aligned with these indices,
        so they are expected to enumerate the same entries as X.nonzero().

    :return: The tuple (X, mu).
    """
    #Use identity tests: comparing an array with == None is elementwise
    if inds is None:
        rowInds, colInds = X.nonzero()
    else:
        rowInds, colInds = inds

    rowInds = numpy.array(rowInds, numpy.int32)
    colInds = numpy.array(colInds, numpy.int32)

    if mu is None:
        #This is the mean of the nonzero values in each row
        nonZeroCounts = numpy.bincount(rowInds, minlength=X.shape[0])
        emptyRows = nonZeroCounts == 0
        #Avoid a division by zero for rows without nonzero elements
        nonZeroCounts += emptyRows
        #This is required because when we do X.sum(1) for centering it uses the same
        #dtype as X to store the sum, and this can result in overflow for e.g. uint8
        if X.dtype == numpy.uint8:
            sumCol = SparseUtilsCython.sumCols(
                rowInds,
                numpy.array(X[rowInds, colInds]).flatten(), X.shape[0])
        else:
            sumCol = numpy.array(X.sum(1)).flatten()
        mu = sumCol / nonZeroCounts
        mu[emptyRows] = 0

    #The builtin float replaces numpy.float, which NumPy 1.24 removed
    vals = SparseUtilsCython.partialOuterProduct(
        rowInds, colInds, numpy.array(mu, float), numpy.ones(X.shape[1]))
    X[X.nonzero()] = numpy.array(X[X.nonzero()] - vals, float)

    return X, mu
def testPartialReconstructValsPQ(self):
    """
    Check that SparseUtilsCython.partialReconstructValsPQ applied to the SVD
    factors of a random square matrix Y recovers Y, first on all of its
    nonzero entries and then on the nonzero pattern of a random sparse
    matrix.
    """
    n = 10
    Y = numpy.random.rand(n, n)

    U, s, V = numpy.linalg.svd(Y)
    V = V.T

    V = numpy.ascontiguousarray(V)

    rowInds, colInds = numpy.nonzero(Y)
    rowInds = numpy.array(rowInds, numpy.int32)
    colInds = numpy.array(colInds, numpy.int32)
    vals = SparseUtilsCython.partialReconstructValsPQ(rowInds, colInds, numpy.ascontiguousarray(U*s), V)
    #The reshape relies on every entry of Y being nonzero
    X = numpy.reshape(vals, Y.shape)

    nptst.assert_almost_equal(X, Y)

    #Try just some indices
    density = 0.2
    A = scipy.sparse.rand(n, n, density)
    inds = A.nonzero()
    rowInds = numpy.array(inds[0], numpy.int32)
    colInds = numpy.array(inds[1], numpy.int32)

    vals = SparseUtilsCython.partialReconstructValsPQ(rowInds, colInds, numpy.ascontiguousarray(U*s), V)

    #assertAlmostEqual/assertEqual replace the aliases removed in Python 3.12
    for i, (j, k) in enumerate(zip(inds[0], inds[1])):
        self.assertAlmostEqual(vals[i], Y[j, k])

    self.assertEqual(A.nnz, inds[0].shape[0])
def testGenerateSparseBinaryMatrix(self):
    """
    Exercise SparseUtils.generateSparseBinaryMatrix: check the row sums for
    quantile 0.7, that quantile 0 gives an (almost) all-ones matrix, that the
    scipy and csarray outputs agree for the same seed, and that with a
    nonzero sd the matrix and thresholds match a direct computation using
    numpy.percentile.
    """
    m = 5
    n = 10
    k = 3
    shape = (m, n)
    allOnes = numpy.ones(shape)
    expectedRowSums = numpy.ones(m)*3

    def thresholdRows(Z, wv, compare):
        #Threshold each row of Z at the percentile given by wv
        thresholds = numpy.zeros(m)
        binary = numpy.zeros((m, n))
        for i in range(m):
            thresholds[i] = numpy.percentile(numpy.sort(Z[i, :]), wv[i]*100)
            binary[i, compare(Z[i, :], thresholds[i])] = 1
        return binary, thresholds

    #scipy output
    numpy.random.seed(21)
    X = SparseUtils.generateSparseBinaryMatrix(shape, k, 0.7)
    Xscipy = numpy.array(X.todense())
    nptst.assert_array_equal(numpy.array(X.sum(1)).flatten(), expectedRowSums)

    X = SparseUtils.generateSparseBinaryMatrix(shape, k, 0.0)
    self.assertTrue(numpy.linalg.norm(X - allOnes) < 1.1)

    #csarray output, generated with the same seed
    numpy.random.seed(21)
    X = SparseUtils.generateSparseBinaryMatrix(shape, k, 0.7, csarray=True)
    Xcsarray = X.toarray()
    nptst.assert_array_equal(numpy.array(X.sum(1)).flatten(), expectedRowSums)

    X = SparseUtils.generateSparseBinaryMatrix(shape, k, 0.0, csarray=True)
    self.assertTrue(numpy.linalg.norm(X.toarray() - allOnes) < 1.1)

    nptst.assert_array_equal(Xcsarray, Xscipy)

    #Test variation in the quantiles, then a larger standard deviation. The
    #first case thresholds with > and the second with >=
    w = 0.7
    for sd, compare in ((0.1, numpy.greater), (0.5, numpy.greater_equal)):
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix(shape, k, w, sd=sd, csarray=True, verbose=True)

        Z = (U*s).dot(V.T)
        X2, r2 = thresholdRows(Z, wv, compare)

        r = SparseUtilsCython.computeR2(U*s, V, wv)
        nptst.assert_array_almost_equal(X.toarray(), X2)
        nptst.assert_array_almost_equal(r, r2)
def uncenter(X, mu1, mu2):
    """
    Uncenter a matrix with mu1 and mu2, the row and columns means of the
    original matrix. X is the centered matrix. The value mu1[i] + mu2[j] is
    added to every nonzero entry X[i, j]; X is modified in place and
    returned.
    """
    rowInds, colInds = X.nonzero()
    rowInds = numpy.array(rowInds, numpy.int32)
    colInds = numpy.array(colInds, numpy.int32)

    #The builtin float replaces numpy.float, which NumPy 1.24 removed
    rowMeans = numpy.array(mu1, float)
    colMeans = numpy.array(mu2, float)

    vals1 = SparseUtilsCython.partialOuterProduct(rowInds, colInds, rowMeans, numpy.ones(X.shape[1]))
    vals2 = SparseUtilsCython.partialOuterProduct(rowInds, colInds, numpy.ones(X.shape[0]), colMeans)
    X[rowInds, colInds] = X[rowInds, colInds] + vals1 + vals2

    return X
def testCenterRowsCsarray(self):
    """
    After SparseUtilsCython.centerRowsCsarray has been applied to a random
    sppy matrix, every row should sum to (almost) zero. Repeated for 10
    random shapes and densities.
    """
    for _ in range(10):
        density = numpy.random.rand()
        numRows = numpy.random.randint(10, 100)
        numCols = numpy.random.randint(10, 100)
        X = sppy.rand((numRows, numCols), density)

        SparseUtilsCython.centerRowsCsarray(X)

        expectedSums = numpy.zeros(numRows)
        nptst.assert_array_almost_equal(X.sum(1), expectedSums)
def unshrink(self, X, U, V):
    """
    Perform post-processing on a factorisation of a matrix X use factor
    vectors U and V. A random subset of at most self.postProcessSamples
    nonzero entries of X is chosen and the weights s minimising the squared
    error between these entries and sum_i s[i]*U[:, i]*V[:, i] are returned.
    """
    logging.debug("Post processing singular values")

    rowInds, colInds = X.nonzero()
    #Sample among the entries returned by nonzero(): for scipy.sparse
    #matrices X.data can also hold explicitly stored zeros, in which case
    #indices drawn from X.data.shape[0] could run past the end
    numNonzeros = rowInds.shape[0]

    #Fix for versions of numpy < 1.7
    inds = numpy.unique(
        numpy.random.randint(
            0, numNonzeros,
            numpy.min([self.postProcessSamples, numNonzeros])))
    a = numpy.array(X[rowInds, colInds]).ravel()[inds]

    B = numpy.zeros((a.shape[0], U.shape[1]))

    rowInds = numpy.array(rowInds[inds], numpy.int32)
    colInds = numpy.array(colInds[inds], numpy.int32)

    #Populate B
    for i in range(U.shape[1]):
        B[:, i] = SparseUtilsCython.partialOuterProduct(
            rowInds, colInds, U[:, i], V[:, i])

    #Least squares solution through the pseudo-inverse of B^T B
    s = numpy.linalg.pinv(B.T.dot(B)).dot(B.T).dot(a)

    return s
def reconstructLowRank(U, s, V, k):
    """
    Take the SVD of a low rank matrix and partially compute it with at most
    k values. If k is an array of flat indices in
    [0, U.shape[0]*V.shape[0]-1] then these indices are used for
    reconstruction, and if k is a tuple it is taken as (rowInds, colInds).
    Otherwise k indices are drawn at random and duplicates are discarded.
    """
    (m, n) = (U.shape[0], V.shape[0])

    if isinstance(k, tuple):
        rowInds, colInds = k
    else:
        if isinstance(k, numpy.ndarray):
            inds = k
        else:
            inds = numpy.random.randint(0, n * m, k)
        inds = numpy.unique(inds)
        rowInds, colInds = numpy.unravel_index(inds, (m, n))

    U = numpy.ascontiguousarray(U)
    V = numpy.ascontiguousarray(V)
    X = SparseUtilsCython.partialReconstructPQ((rowInds, colInds), U * s, V)

    return X
def localAUCApprox(positiveArray, U, V, w, numAucSamples=50, r=None, allArray=None):
    """
    Compute the estimated local AUC for the score functions UV^T relative to
    X with quantile w. The AUC is computed using positiveArray which is a
    tuple (indPtr, colInds) assuming allArray is None. If allArray is not
    None then positive items are chosen from positiveArray and negative ones
    are chosen to complement allArray. If positiveArray is not a tuple it is
    converted with SparseUtils.getOmegaListPtr, and if r is None it is
    computed with SparseUtilsCython.computeR (w is only used in that case).
    """
    if not isinstance(positiveArray, tuple):
        positiveArray = SparseUtils.getOmegaListPtr(positiveArray)

    indPtr, colInds = positiveArray
    U = numpy.ascontiguousarray(U)
    V = numpy.ascontiguousarray(V)

    if r is None:
        r = SparseUtilsCython.computeR(U, V, w, numAucSamples)

    #Without allArray the positive items also play the role of "all" items
    if allArray is None:
        allIndPtr, allColInd = indPtr, colInds
    else:
        allIndPtr, allColInd = allArray

    return MCEvaluatorCython.localAUCApprox(indPtr, colInds, allIndPtr, allColInd, U, V, numAucSamples, r)
def generateSparseBinaryMatrix(shape, p, w=0.9, sd=0, csarray=False, verbose=False, indsPerRow=50):
    """
    Create an underlying matrix Z = UsV.T of rank p and then go through each
    row and threshold so that a proportion quantile numbers are kept. The
    final matrix is a 0/1 matrix. Each row i has its own quantile wv[i],
    drawn from a normal distribution with mean w and standard deviation sd
    and clipped to [0, 1]; the entries of the row which are at least the
    threshold r[i] returned by SparseUtilsCython.computeR2 are set to 1 and
    the others to 0. In other words w=0 keeps all numbers and w=1.0 keeps
    none.

    :param csarray: If True return a sppy.csarray, otherwise a scipy csr_matrix.

    :param verbose: If True return (X, U, s, V, wv) rather than just X.
    """
    m, n = shape
    U, s, V = SparseUtils.generateLowRank(shape, p)

    X = (U*s).dot(V.T)

    wv = numpy.random.randn(m)*sd + w
    wv = numpy.clip(wv, 0, 1)
    r = SparseUtilsCython.computeR2((U*s), V, wv, indsPerRow=indsPerRow)

    for i in range(m):
        #Compute the mask once from the original scores. Thresholding in
        #two passes would compare the ones just written with r[i] and
        #wipe the whole row whenever r[i] > 1
        keep = X[i, :] >= r[i]
        X[i, :] = keep

    if csarray:
        import sppy
        X = sppy.csarray(X, storagetype="row")
    else:
        X = scipy.sparse.csr_matrix(X)

    if verbose:
        return X, U, s, V, wv
    else:
        return X
def centerCols(X, mu=None, inds=None):
    """
    Subtract from each nonzero element of X the mean of the nonzero elements
    of its column. X is modified in place.

    :param X: The matrix to center; it must support nonzero(), sum(0) and
        assignment through index arrays.

    :param mu: The column means to subtract. If None they are computed from X.

    :param inds: An optional (rowInds, colInds) tuple used in place of
        X.nonzero(). The subtracted values are aligned with these indices,
        so they are expected to enumerate the same entries as X.nonzero().

    :return: The tuple (X, mu).
    """
    #Use identity tests: comparing an array with == None is elementwise
    if inds is None:
        rowInds, colInds = X.nonzero()
    else:
        rowInds, colInds = inds

    rowInds = numpy.array(rowInds, numpy.int32)
    colInds = numpy.array(colInds, numpy.int32)

    if mu is None:
        #This is the mean of the nonzero values in each col
        nonZeroCounts = numpy.bincount(colInds, minlength=X.shape[1])
        emptyCols = nonZeroCounts == 0
        #Avoid a division by zero for columns without nonzero elements
        nonZeroCounts += emptyCols
        mu = numpy.array(X.sum(0), float).ravel() / nonZeroCounts
        mu[emptyCols] = 0

    #The builtin float replaces numpy.float, which NumPy 1.24 removed
    vals = SparseUtilsCython.partialOuterProduct(
        rowInds, colInds, numpy.ones(X.shape[0]), numpy.array(mu, float))
    X[X.nonzero()] = numpy.array(X[X.nonzero()] - vals, float)

    return X, mu
def uncenter(X, mu1, mu2):
    """
    Uncenter a matrix with mu1 and mu2, the row and columns means of the
    original matrix. X is the centered matrix. The value mu1[i] + mu2[j] is
    added to every nonzero entry X[i, j]; X is modified in place and
    returned.
    """
    rowInds, colInds = X.nonzero()
    rowInds = numpy.array(rowInds, numpy.int32)
    colInds = numpy.array(colInds, numpy.int32)

    #The builtin float replaces numpy.float, which NumPy 1.24 removed
    rowMeans = numpy.array(mu1, float)
    colMeans = numpy.array(mu2, float)

    vals1 = SparseUtilsCython.partialOuterProduct(
        rowInds, colInds, rowMeans, numpy.ones(X.shape[1]))
    vals2 = SparseUtilsCython.partialOuterProduct(
        rowInds, colInds, numpy.ones(X.shape[0]), colMeans)
    X[rowInds, colInds] = X[rowInds, colInds] + vals1 + vals2

    return X
def testSumCols(self):
    """
    SparseUtilsCython.sumCols applied to the nonzero entries of a random
    uint8 matrix should reproduce the sums given by A.sum(1).
    """
    A = scipy.sparse.csc_matrix(scipy.sparse.rand(10, 15, 0.5)*10, dtype=numpy.uint8)

    nonzeroRows, nonzeroCols = A.nonzero()
    nonzeroRows = numpy.array(nonzeroRows, numpy.int32)
    nonzeroCols = numpy.array(nonzeroCols, numpy.int32)

    entries = numpy.array(A[nonzeroRows, nonzeroCols]).flatten()
    sumCol = SparseUtilsCython.sumCols(nonzeroRows, entries, A.shape[0])

    expected = numpy.array(A.sum(1)).flatten()
    nptst.assert_array_equal(expected, sumCol)
def testStratifiedRecallAtk(self):
    """
    Compare MCEvaluatorCython.stratifiedRecallAtk with a direct computation
    of the stratified recall on random recommendations, then check that with
    unit item counts it agrees with MCEvaluatorCython.recallAtk.
    """
    numRows, numCols = 20, 50
    rank = 3
    alpha = 1
    beta = 0.5
    k = 5

    X, U, V = SparseUtilsCython.generateSparseBinaryMatrixPL(
        (numRows, numCols), rank, density=0.2, alpha=alpha, csarray=True)
    itemCounts = numpy.array(X.sum(0) + 1, numpy.int32)

    indPtr, colInds = X.nonzeroRowsPtr()
    indPtr = numpy.array(indPtr, numpy.uint32)
    colInds = numpy.array(colInds, numpy.uint32)

    #Random recommendations: k items for each row
    orderedItems = numpy.random.randint(0, numCols, numRows * k)
    orderedItems = numpy.array(
        numpy.reshape(orderedItems, (numRows, k)), numpy.int32)

    recalls, denominators = MCEvaluatorCython.stratifiedRecallAtk(
        indPtr, colInds, orderedItems, itemCounts, beta)

    #Now compute recalls from scratch
    expectedRecalls = numpy.zeros(numRows)
    for i in range(numRows):
        omegai = colInds[indPtr[i]:indPtr[i + 1]]

        #Weighted count of the recommended items which are relevant
        numerator = 0
        for item in orderedItems[i, :]:
            if item in omegai:
                numerator += 1 / itemCounts[item]**beta

        #Total weight of the relevant items
        denominator = 0
        for item in omegai:
            denominator += 1 / itemCounts[item]**beta

        expectedRecalls[i] = numerator / denominator

    nptst.assert_array_equal(recalls, expectedRecalls)

    #Now try to match with normal recall
    itemCounts = numpy.ones(numCols, numpy.int32)
    recalls, denominators = MCEvaluatorCython.stratifiedRecallAtk(
        indPtr, colInds, orderedItems, itemCounts, beta)
    plainRecalls = MCEvaluatorCython.recallAtk(indPtr, colInds, orderedItems)

    nptst.assert_array_equal(recalls, plainRecalls)
def testStratifiedRecallAtk(self):
    """
    Compare MCEvaluatorCython.stratifiedRecallAtk with a direct computation
    of the stratified recall on random recommendations, then check that with
    unit item counts it agrees with MCEvaluatorCython.recallAtk.
    """
    numRows, numCols = 20, 50
    rank = 3
    alpha = 1
    beta = 0.5
    k = 5

    X, U, V = SparseUtilsCython.generateSparseBinaryMatrixPL((numRows, numCols), rank, density=0.2, alpha=alpha, csarray=True)
    itemCounts = numpy.array(X.sum(0)+1, numpy.int32)

    indPtr, colInds = X.nonzeroRowsPtr()
    indPtr = numpy.array(indPtr, numpy.uint32)
    colInds = numpy.array(colInds, numpy.uint32)

    #Random recommendations: k items for each row
    orderedItems = numpy.random.randint(0, numCols, numRows*k)
    orderedItems = numpy.array(numpy.reshape(orderedItems, (numRows, k)), numpy.int32)

    recalls, denominators = MCEvaluatorCython.stratifiedRecallAtk(indPtr, colInds, orderedItems, itemCounts, beta)

    #Now compute recalls from scratch
    expectedRecalls = numpy.zeros(numRows)
    for i in range(numRows):
        omegai = colInds[indPtr[i]:indPtr[i+1]]

        #Weighted count of the recommended items which are relevant
        numerator = 0
        for item in orderedItems[i, :]:
            if item in omegai:
                numerator += 1/itemCounts[item]**beta

        #Total weight of the relevant items
        denominator = 0
        for item in omegai:
            denominator += 1/itemCounts[item]**beta

        expectedRecalls[i] = numerator/denominator

    nptst.assert_array_equal(recalls, expectedRecalls)

    #Now try to match with normal recall
    itemCounts = numpy.ones(numCols, numpy.int32)
    recalls, denominators = MCEvaluatorCython.stratifiedRecallAtk(indPtr, colInds, orderedItems, itemCounts, beta)
    plainRecalls = MCEvaluatorCython.recallAtk(indPtr, colInds, orderedItems)

    nptst.assert_array_equal(recalls, plainRecalls)
def testGenerateSparseBinaryMatrixPL(self):
    """
    Generate a 200x100 matrix with SparseUtilsCython.generateSparseBinaryMatrixPL
    and check its shape and that its density matches the requested one to
    2 decimal places. The histograms of the row and column sums are printed
    for inspection only.
    """
    m = 200
    n = 100
    k = 3
    density = 0.1
    numpy.random.seed(21)
    X, U, V = SparseUtilsCython.generateSparseBinaryMatrixPL((m, n), k, density=density, csarray=True)

    #Just check that the distributions are roughly power law
    print(numpy.histogram(X.sum(0)))
    print(numpy.histogram(X.sum(1)))

    self.assertAlmostEqual(X.nnz/float(m*n), density, 2)
    #assertEqual replaces the assertEquals alias removed in Python 3.12
    self.assertEqual(X.shape, (m, n))
def uncenterRows(X, mu):
    """
    Take a matrix with rows centered using mu, and return them to their
    original state by adding mu[i] to every nonzero entry of row i. X is
    modified in place and returned. Note that one should call
    X.eliminate_zeros() beforehand.

    :raises ValueError: If mu does not have one entry per row of X.
    """
    if X.shape[0] != mu.shape[0]:
        raise ValueError("Invalid number of rows")

    rowInds, colInds = X.nonzero()
    rowInds = numpy.array(rowInds, numpy.int32)
    colInds = numpy.array(colInds, numpy.int32)

    #The builtin float replaces numpy.float, which NumPy 1.24 removed
    vals = SparseUtilsCython.partialOuterProduct(rowInds, colInds, numpy.array(mu, float), numpy.ones(X.shape[1]))
    X[rowInds, colInds] = numpy.array(X[rowInds, colInds] + vals, float)

    return X
def reconstructLowRankPQ(P, Q, inds):
    """
    Given an array of unique indices inds in [0, P.shape[0]*Q.shape[0]-1],
    partially reconstruct $P*Q^T$. Alternatively inds can be a tuple
    (rowInds, colInds). The returned matrix is a scipy csc_matrix.
    """
    (m, n) = (P.shape[0], Q.shape[0])

    if isinstance(inds, tuple):
        rowInds, colInds = inds
        #The builtin int replaces numpy.int, which NumPy 1.24 removed
        rowInds = numpy.array(rowInds, int)
        colInds = numpy.array(colInds, int)
    else:
        rowInds, colInds = numpy.unravel_index(inds, (m, n))

    X = SparseUtilsCython.partialReconstructPQ((rowInds, colInds), P, Q)

    return X
def localAucsLmbdas(args):
    """
    Train the learner on trainX for every value in learner.lmbdas and
    evaluate the approximate local AUC on testX each time. The argument is
    the tuple (trainX, testX, testOmegaList, learner) and the result is an
    array with one local AUC per value of lmbda. Note that learner.lmbda is
    overwritten.
    """
    trainX, testX, testOmegaList, learner = args
    numRows, numCols = trainX.shape

    numLmbdas = learner.lmbdas.shape[0]
    localAucs = numpy.zeros(numLmbdas)

    for j, lmbda in enumerate(learner.lmbdas):
        learner.lmbda = lmbda
        U, V = learner.learnModel(trainX)

        quantile = 1-learner.u
        r = SparseUtilsCython.computeR(U, V, quantile, learner.numAucSamples)
        localAucs[j] = MCEvaluator.localAUCApprox(testX, U, V, testOmegaList, learner.numAucSamples, r)

        message = "Local AUC: " + str(localAucs[j])
        message = message + " with k = " + str(learner.k)
        message = message + " and lmbda= " + str(learner.lmbda)
        logging.debug(message)

    return localAucs
def reconstructLowRankPQ(P, Q, inds):
    """
    Given an array of unique indices inds in [0, P.shape[0]*Q.shape[0]-1],
    partially reconstruct $P*Q^T$. Alternatively inds can be a tuple
    (rowInds, colInds). The returned matrix is a scipy csc_matrix.
    """
    (m, n) = (P.shape[0], Q.shape[0])

    if isinstance(inds, tuple):
        rowInds, colInds = inds
        #The builtin int replaces numpy.int, which NumPy 1.24 removed
        rowInds = numpy.array(rowInds, int)
        colInds = numpy.array(colInds, int)
    else:
        rowInds, colInds = numpy.unravel_index(inds, (m, n))

    X = SparseUtilsCython.partialReconstructPQ((rowInds, colInds), P, Q)

    return X
def testComputeR2(self):
    """
    Compare SparseUtilsCython.computeR2, which takes one quantile per row,
    with exact row statistics of Z = U V^T: row maxima for w=1, row minima
    for w=0, a mixture of both, and numpy.percentile for other quantiles.
    """
    def relativeError(expected, actual):
        #Norm of the difference scaled by the norm of the expected values
        return numpy.linalg.norm(expected - actual)/numpy.linalg.norm(expected)

    def exactQuantiles(Z, w):
        #Row-wise percentiles of Z, with the quantile of row i given by w[i]
        r2 = numpy.zeros(Z.shape[0])
        for i in range(Z.shape[0]):
            r2[i] = numpy.percentile(Z[i, :], w[i]*100.0)
        return r2

    m = 10
    n = 15
    U = numpy.random.rand(m, 5)
    V = numpy.random.rand(n, 5)

    Z = U.dot(V.T)
    tol = 0.1

    w = numpy.ones(m)*1.0
    r = SparseUtilsCython.computeR2(U, V, w, indsPerRow=1000)
    self.assertLess(relativeError(Z.max(1), r), tol)

    w = numpy.zeros(m)
    r = SparseUtilsCython.computeR2(U, V, w, indsPerRow=1000)
    self.assertLess(relativeError(Z.min(1), r), tol)

    #Minimum for the first 5 rows and maximum for the others
    w = numpy.zeros(m)
    w[5:10] = 1
    r = SparseUtilsCython.computeR2(U, V, w, indsPerRow=1000)
    self.assertLess(relativeError(Z[0:5, :].min(1), r[0:5]), tol)
    #The error on the maxima is scaled by the norm of the maxima; it used
    #to be divided by the norm of the minima of these rows
    self.assertLess(relativeError(Z[5:, :].max(1), r[5:]), tol)

    w = numpy.ones(m)*0.3
    r = SparseUtilsCython.computeR2(U, V, w, indsPerRow=1000)
    r2 = exactQuantiles(Z, w)
    self.assertLess(relativeError(r2, r), tol)

    w = numpy.random.rand(m)
    r = SparseUtilsCython.computeR2(U, V, w)
    r2 = exactQuantiles(Z, w)
    self.assertLess(relativeError(r2, r), tol)

    #Try a larger matrix
    m = 100
    n = 105
    U = numpy.random.rand(m, 5)
    V = numpy.random.rand(n, 5)

    Z = U.dot(V.T)
    w = numpy.random.rand(m)
    r = SparseUtilsCython.computeR2(U, V, w, indsPerRow=10000)
    r2 = exactQuantiles(Z, w)

    self.assertLess(numpy.linalg.norm(r-r2), 0.4)
def uncenterRows(X, mu):
    """
    Take a matrix with rows centered using mu, and return them to their
    original state by adding mu[i] to every nonzero entry of row i. X is
    modified in place and returned. Note that one should call
    X.eliminate_zeros() beforehand.

    :raises ValueError: If mu does not have one entry per row of X.
    """
    if X.shape[0] != mu.shape[0]:
        raise ValueError("Invalid number of rows")

    rowInds, colInds = X.nonzero()
    rowInds = numpy.array(rowInds, numpy.int32)
    colInds = numpy.array(colInds, numpy.int32)

    #The builtin float replaces numpy.float, which NumPy 1.24 removed
    vals = SparseUtilsCython.partialOuterProduct(
        rowInds, colInds, numpy.array(mu, float), numpy.ones(X.shape[1]))
    X[rowInds, colInds] = numpy.array(X[rowInds, colInds] + vals, float)

    return X
def localAucsLmbdas(args):
    """
    Train the learner on trainX for every value in learner.lmbdas and
    evaluate the approximate local AUC on testX each time. The argument is
    the tuple (trainX, testX, testOmegaList, learner) and the result is an
    array with one local AUC per value of lmbda. Note that learner.lmbda is
    overwritten.
    """
    trainX, testX, testOmegaList, learner = args
    numRows, numCols = trainX.shape

    numLmbdas = learner.lmbdas.shape[0]
    localAucs = numpy.zeros(numLmbdas)

    for j, lmbda in enumerate(learner.lmbdas):
        learner.lmbda = lmbda
        U, V = learner.learnModel(trainX)

        quantile = 1 - learner.u
        r = SparseUtilsCython.computeR(U, V, quantile, learner.numAucSamples)
        localAucs[j] = MCEvaluator.localAUCApprox(
            testX, U, V, testOmegaList, learner.numAucSamples, r)

        message = "Local AUC: " + str(localAucs[j])
        message = message + " with k = " + str(learner.k)
        message = message + " and lmbda= " + str(learner.lmbda)
        logging.debug(message)

    return localAucs
def testPartialReconstructValsPQ2(self):
    """
    For 10 random rectangular matrices Y, check that
    SparseUtilsCython.partialReconstructValsPQ applied to the factors of the
    reduced SVD of Y recovers Y on all of its nonzero entries.
    """
    numRuns = 10

    for _ in range(numRuns):
        m = numpy.random.randint(5, 50)
        n = numpy.random.randint(5, 50)
        Y = numpy.random.rand(m, n)

        U, s, Vt = numpy.linalg.svd(Y, full_matrices=0)
        P = numpy.ascontiguousarray(U*s)
        Q = numpy.ascontiguousarray(Vt.T)

        nonzeroRows, nonzeroCols = numpy.nonzero(Y)
        nonzeroRows = numpy.array(nonzeroRows, numpy.int32)
        nonzeroCols = numpy.array(nonzeroCols, numpy.int32)

        vals = SparseUtilsCython.partialReconstructValsPQ(nonzeroRows, nonzeroCols, P, Q)
        reconstructed = numpy.reshape(vals, Y.shape)

        nptst.assert_almost_equal(reconstructed, Y)
def localAUCApprox2(X, U, V, w, numAucSamples=50, omegaList=None):
    """
    Compute the estimated local AUC for the score functions UV^T relative to
    X with quantile w. For each row, numAucSamples pairs made of one item
    from omegaList[i] and one item outside of it are sampled, and a pair
    counts when the first item scores higher than both the second one and
    the row threshold r[i]. The mean over all rows is returned. If omegaList
    is None it is computed with SparseUtils.getOmegaList(X).
    """
    #For now let's compute the full matrix
    Z = U.dot(V.T)

    localAuc = numpy.zeros(X.shape[0])
    allInds = numpy.arange(X.shape[1])

    U = numpy.ascontiguousarray(U)
    V = numpy.ascontiguousarray(V)

    r = SparseUtilsCython.computeR(U, V, w, numAucSamples)

    #Identity test: comparing an array with == None is elementwise
    if omegaList is None:
        omegaList = SparseUtils.getOmegaList(X)

    for i in range(X.shape[0]):
        omegai = omegaList[i]
        omegaBari = numpy.setdiff1d(allInds, omegai, assume_unique=True)
        numPairs = omegai.shape[0] * omegaBari.shape[0]

        #Rows without any valid pair keep a local AUC of zero
        if numPairs != 0:
            partialAuc = 0

            for j in range(numAucSamples):
                #A flat pair index is split using integer arithmetic
                ind = numpy.random.randint(numPairs)
                pInd, qInd = divmod(ind, omegaBari.shape[0])
                p = omegai[pInd]
                q = omegaBari[qInd]

                if Z[i, p] > Z[i, q] and Z[i, p] > r[i]:
                    partialAuc += 1

            localAuc[i] = partialAuc / float(numAucSamples)

    localAuc = localAuc.mean()

    return localAuc
def localAUCApprox2(X, U, V, w, numAucSamples=50, omegaList=None):
    """
    Compute the estimated local AUC for the score functions UV^T relative to
    X with quantile w. For each row, numAucSamples pairs made of one item
    from omegaList[i] and one item outside of it are sampled, and a pair
    counts when the first item scores higher than both the second one and
    the row threshold r[i]. The mean over all rows is returned. If omegaList
    is None it is computed with SparseUtils.getOmegaList(X).
    """
    #For now let's compute the full matrix
    Z = U.dot(V.T)

    localAuc = numpy.zeros(X.shape[0])
    allInds = numpy.arange(X.shape[1])

    U = numpy.ascontiguousarray(U)
    V = numpy.ascontiguousarray(V)

    r = SparseUtilsCython.computeR(U, V, w, numAucSamples)

    #Identity test: comparing an array with == None is elementwise
    if omegaList is None:
        omegaList = SparseUtils.getOmegaList(X)

    for i in range(X.shape[0]):
        omegai = omegaList[i]
        omegaBari = numpy.setdiff1d(allInds, omegai, assume_unique=True)
        numPairs = omegai.shape[0]*omegaBari.shape[0]

        #Rows without any valid pair keep a local AUC of zero
        if numPairs != 0:
            partialAuc = 0

            for j in range(numAucSamples):
                #A flat pair index is split using integer arithmetic
                ind = numpy.random.randint(numPairs)
                pInd, qInd = divmod(ind, omegaBari.shape[0])
                p = omegai[pInd]
                q = omegaBari[qInd]

                if Z[i, p] > Z[i, q] and Z[i, p] > r[i]:
                    partialAuc += 1

            localAuc[i] = partialAuc/float(numAucSamples)

    localAuc = localAuc.mean()

    return localAuc
def localAUC(positiveArray, U, V, w, numRowInds=None):
    """
    Compute the local AUC for the score functions UV^T relative to X with
    quantile w. For each row, all pairs made of one positive item and one
    other item are considered, and a pair counts when the positive item
    scores higher than both the other item and the row threshold r[i]. The
    mean over all rows of the proportion of such pairs is returned.

    :param positiveArray: A tuple (indPtr, colInds) giving the positive
        items of each row; anything else is converted with
        SparseUtils.getOmegaListPtr.

    :param numRowInds: Passed on to SparseUtilsCython.computeR; defaults to
        V.shape[0].
    """
    if numRowInds is None:
        numRowInds = V.shape[0]

    if not isinstance(positiveArray, tuple):
        positiveArray = SparseUtils.getOmegaListPtr(positiveArray)

    #For now let's compute the full matrix
    Z = U.dot(V.T)

    r = SparseUtilsCython.computeR(U, V, w, numRowInds)

    localAuc = numpy.zeros(U.shape[0])
    allInds = numpy.arange(V.shape[0])
    indPtr, colInds = positiveArray

    for i in range(U.shape[0]):
        omegai = colInds[indPtr[i]:indPtr[i + 1]]
        omegaBari = numpy.setdiff1d(allInds, omegai, assume_unique=True)
        numPairs = omegai.shape[0] * omegaBari.shape[0]

        #Rows without any valid pair keep a local AUC of zero
        if numPairs != 0:
            positiveScores = Z[i, omegai]
            otherScores = Z[i, omegaBari]

            #Only positive items above the threshold can contribute
            positiveScores = positiveScores[positiveScores > r[i]]
            #Count the pairs (p, q) with Z[i, p] > Z[i, q] in one comparison
            wins = positiveScores[:, numpy.newaxis] > otherScores[numpy.newaxis, :]
            partialAuc = wins.sum()

            localAuc[i] = partialAuc / float(numPairs)

    localAuc = localAuc.mean()

    return localAuc
def localAUCApprox(positiveArray, U, V, w, numAucSamples=50, r=None, allArray=None):
    """
    Compute the estimated local AUC for the score functions UV^T relative to
    X with quantile w. The AUC is computed using positiveArray which is a
    tuple (indPtr, colInds) assuming allArray is None. If allArray is not
    None then positive items are chosen from positiveArray and negative ones
    are chosen to complement allArray. If positiveArray is not a tuple it is
    converted with SparseUtils.getOmegaListPtr, and if r is None it is
    computed with SparseUtilsCython.computeR (w is only used in that case).
    """
    if not isinstance(positiveArray, tuple):
        positiveArray = SparseUtils.getOmegaListPtr(positiveArray)

    indPtr, colInds = positiveArray
    U = numpy.ascontiguousarray(U)
    V = numpy.ascontiguousarray(V)

    if r is None:
        r = SparseUtilsCython.computeR(U, V, w, numAucSamples)

    #Without allArray the positive items also play the role of "all" items
    if allArray is None:
        allIndPtr, allColInd = indPtr, colInds
    else:
        allIndPtr, allColInd = allArray

    return MCEvaluatorCython.localAUCApprox(
        indPtr, colInds, allIndPtr, allColInd, U, V, numAucSamples, r)
def centerCols(X, mu=None, inds=None):
    """
    Subtract from each nonzero element of X the mean of the nonzero elements
    of its column. X is modified in place.

    :param X: The matrix to center; it must support nonzero(), sum(0) and
        assignment through index arrays.

    :param mu: The column means to subtract. If None they are computed from X.

    :param inds: An optional (rowInds, colInds) tuple used in place of
        X.nonzero(). The subtracted values are aligned with these indices,
        so they are expected to enumerate the same entries as X.nonzero().

    :return: The tuple (X, mu).
    """
    #Use identity tests: comparing an array with == None is elementwise
    if inds is None:
        rowInds, colInds = X.nonzero()
    else:
        rowInds, colInds = inds

    rowInds = numpy.array(rowInds, numpy.int32)
    colInds = numpy.array(colInds, numpy.int32)

    if mu is None:
        #This is the mean of the nonzero values in each col
        nonZeroCounts = numpy.bincount(colInds, minlength=X.shape[1])
        emptyCols = nonZeroCounts == 0
        #Avoid a division by zero for columns without nonzero elements
        nonZeroCounts += emptyCols
        mu = numpy.array(X.sum(0), float).ravel()/nonZeroCounts
        mu[emptyCols] = 0

    #The builtin float replaces numpy.float, which NumPy 1.24 removed
    vals = SparseUtilsCython.partialOuterProduct(rowInds, colInds, numpy.ones(X.shape[0]), numpy.array(mu, float))
    X[X.nonzero()] = numpy.array(X[X.nonzero()] - vals, float)

    return X, mu
def localAUC(positiveArray, U, V, w, numRowInds=None):
    """
    Compute the local AUC for the score functions UV^T relative to X with
    quantile w. For each row, all pairs made of one positive item and one
    other item are considered, and a pair counts when the positive item
    scores higher than both the other item and the row threshold r[i]. The
    mean over all rows of the proportion of such pairs is returned.

    :param positiveArray: A tuple (indPtr, colInds) giving the positive
        items of each row; anything else is converted with
        SparseUtils.getOmegaListPtr.

    :param numRowInds: Passed on to SparseUtilsCython.computeR; defaults to
        V.shape[0].
    """
    if numRowInds is None:
        numRowInds = V.shape[0]

    if not isinstance(positiveArray, tuple):
        positiveArray = SparseUtils.getOmegaListPtr(positiveArray)

    #For now let's compute the full matrix
    Z = U.dot(V.T)

    r = SparseUtilsCython.computeR(U, V, w, numRowInds)

    localAuc = numpy.zeros(U.shape[0])
    allInds = numpy.arange(V.shape[0])
    indPtr, colInds = positiveArray

    for i in range(U.shape[0]):
        omegai = colInds[indPtr[i]:indPtr[i+1]]
        omegaBari = numpy.setdiff1d(allInds, omegai, assume_unique=True)
        numPairs = omegai.shape[0] * omegaBari.shape[0]

        #Rows without any valid pair keep a local AUC of zero
        if numPairs != 0:
            positiveScores = Z[i, omegai]
            otherScores = Z[i, omegaBari]

            #Only positive items above the threshold can contribute
            positiveScores = positiveScores[positiveScores > r[i]]
            #Count the pairs (p, q) with Z[i, p] > Z[i, q] in one comparison
            wins = positiveScores[:, numpy.newaxis] > otherScores[numpy.newaxis, :]
            partialAuc = wins.sum()

            localAuc[i] = partialAuc/float(numPairs)

    localAuc = localAuc.mean()

    return localAuc
def generateSparseBinaryMatrix(shape, p, w=0.9, sd=0, csarray=False, verbose=False, indsPerRow=50):
    """
    Create an underlying matrix Z = UsV.T of rank p and then go through each
    row and threshold so that a proportion quantile numbers are kept. The
    final matrix is a 0/1 matrix. Each row i has its own quantile wv[i],
    drawn from a normal distribution with mean w and standard deviation sd
    and clipped to [0, 1]; the entries of the row which are at least the
    threshold r[i] returned by SparseUtilsCython.computeR2 are set to 1 and
    the others to 0. In other words w=0 keeps all numbers and w=1.0 keeps
    none.

    :param csarray: If True return a sppy.csarray, otherwise a scipy csr_matrix.

    :param verbose: If True return (X, U, s, V, wv) rather than just X.
    """
    m, n = shape
    U, s, V = SparseUtils.generateLowRank(shape, p)

    X = (U * s).dot(V.T)

    wv = numpy.random.randn(m) * sd + w
    wv = numpy.clip(wv, 0, 1)
    r = SparseUtilsCython.computeR2((U * s), V, wv, indsPerRow=indsPerRow)

    for i in range(m):
        #Compute the mask once from the original scores. Thresholding in
        #two passes would compare the ones just written with r[i] and
        #wipe the whole row whenever r[i] > 1
        keep = X[i, :] >= r[i]
        X[i, :] = keep

    if csarray:
        import sppy
        X = sppy.csarray(X, storagetype="row")
    else:
        X = scipy.sparse.csr_matrix(X)

    if verbose:
        return X, U, s, V, wv
    else:
        return X
def profileLocalAucApprox(self):
    """
    Profile 10 calls of MCEvaluator.localAUCApprox on a random 500x1000
    binary matrix of rank 10, with the thresholds r computed beforehand.
    """
    m = 500
    n = 1000
    k = 10
    #With verbose=True the per-row quantiles are returned as a fifth value,
    #which is not needed here
    X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, csarray=True, verbose=True)

    u = 0.1
    w = 1 - u
    numAucSamples = 200

    omegaList = SparseUtils.getOmegaList(X)
    r = SparseUtilsCython.computeR(U, V, w, numAucSamples)

    numRuns = 10

    def run():
        for i in range(numRuns):
            MCEvaluator.localAUCApprox(X, U, V, omegaList, numAucSamples, r)

    ProfileUtils.profile('run()', globals(), locals())
def reconstructLowRank(U, s, V, k):
    """
    Take the SVD of a low rank matrix and partially compute it with at most
    k values. If k is an array of flat indices in
    [0, U.shape[0]*V.shape[0]-1] then these indices are used for
    reconstruction, and if k is a tuple it is taken as (rowInds, colInds).
    Otherwise k indices are drawn at random and duplicates are discarded.
    """
    (m, n) = (U.shape[0], V.shape[0])

    if isinstance(k, tuple):
        rowInds, colInds = k
    else:
        if isinstance(k, numpy.ndarray):
            inds = k
        else:
            inds = numpy.random.randint(0, n*m, k)
        inds = numpy.unique(inds)
        rowInds, colInds = numpy.unravel_index(inds, (m, n))

    U = numpy.ascontiguousarray(U)
    V = numpy.ascontiguousarray(V)
    X = SparseUtilsCython.partialReconstructPQ((rowInds, colInds), U*s, V)

    return X
def unshrink(self, X, U, V):
    """
    Perform post-processing on a factorisation of a matrix X use factor
    vectors U and V. A random subset of at most self.postProcessSamples
    nonzero entries of X is chosen and the weights s minimising the squared
    error between these entries and sum_i s[i]*U[:, i]*V[:, i] are returned.
    """
    logging.debug("Post processing singular values")

    rowInds, colInds = X.nonzero()
    #Sample among the entries returned by nonzero(): for scipy.sparse
    #matrices X.data can also hold explicitly stored zeros, in which case
    #indices drawn from X.data.shape[0] could run past the end
    numNonzeros = rowInds.shape[0]

    #Fix for versions of numpy < 1.7
    inds = numpy.unique(numpy.random.randint(0, numNonzeros, numpy.min([self.postProcessSamples, numNonzeros])))
    a = numpy.array(X[rowInds, colInds]).ravel()[inds]

    B = numpy.zeros((a.shape[0], U.shape[1]))

    rowInds = numpy.array(rowInds[inds], numpy.int32)
    colInds = numpy.array(colInds[inds], numpy.int32)

    #Populate B
    for i in range(U.shape[1]):
        B[:, i] = SparseUtilsCython.partialOuterProduct(rowInds, colInds, U[:, i], V[:, i])

    #Least squares solution through the pseudo-inverse of B^T B
    s = numpy.linalg.pinv(B.T.dot(B)).dot(B.T).dot(a)

    return s
def learnModel(self, X, fullMatrices=True):
    """
    Learn the matrix completion using a sparse matrix X. This is the simple
    version of the soft impute algorithm in which we store the entire
    matrices, newZ and oldZ.

    :param X: The matrix to complete, as a scipy csc_matrix.

    :param fullMatrices: If True each solution is returned as a lil_matrix,
        otherwise as the tuple of factors (U, s, V).

    :return: A list with one solution per value in self.rhos, or the
        solution itself when self.rhos has a single value.

    :raises ValueError: If X is not a csc_matrix.
    """
    if not scipy.sparse.isspmatrix_csc(X):
        raise ValueError("Input matrix must be csc_matrix")

    (n, m) = X.shape
    oldU = numpy.zeros((n, 1))
    oldS = numpy.zeros(1)
    oldV = numpy.zeros((m, 1))
    omega = X.nonzero()
    tol = 10**-6

    #The builtin int/float replace numpy.int/numpy.float, removed in NumPy 1.24
    rowInds = numpy.array(omega[0], int)
    colInds = numpy.array(omega[1], int)

    ZList = []

    #The factors of the previous rho are the starting point for the next one
    for rho in self.rhos:
        gamma = self.eps + 1
        i = 0

        #The threshold is a proportion rho of the largest singular value of X
        Y = scipy.sparse.csc_matrix(X, dtype=float)
        U, s, V = ExpSU.SparseUtils.svdArpack(Y, 1, kmax=20)
        lmbda = rho*numpy.max(s)

        while gamma > self.eps:
            ZOmega = SparseUtilsCython.partialReconstructPQ((rowInds, colInds), oldU*oldS, oldV)
            Y = X - ZOmega
            Y = Y.tocsc()
            newU, newS, newV = ExpSU.SparseUtils.svdSparseLowRank(Y, oldU, oldS, oldV)

            #Soft threshold. numpy.maximum is used rather than clipping to
            #[0, max(newS)], which yields negative values when every
            #thresholded singular value is negative
            newS = newS - lmbda
            newS = numpy.maximum(newS, 0)

            #Squared Frobenius norms computed from the factors only
            normOldZ = (oldS**2).sum()
            normNewZmOldZ = (oldS**2).sum() + (newS**2).sum() - 2*numpy.trace((oldV.T.dot(newV*newS)).dot(newU.T.dot(oldU*oldS)))

            #We can get newZ == oldZ in which case we break
            if normNewZmOldZ < tol:
                gamma = 0
            elif abs(normOldZ) < tol:
                gamma = self.eps + 1
            else:
                gamma = normNewZmOldZ/normOldZ

            oldU = newU.copy()
            oldS = newS.copy()
            oldV = newV.copy()

            logging.debug("Iteration " + str(i) + " gamma="+str(gamma))
            i += 1

        logging.debug("Number of iterations for lambda="+str(rho) + ": " + str(i))

        if fullMatrices:
            newZ = scipy.sparse.lil_matrix((newU*newS).dot(newV.T))
            ZList.append(newZ)
        else:
            ZList.append((newU, newS, newV))

    if self.rhos.shape[0] != 1:
        return ZList
    else:
        return ZList[0]
def generateMatrices(self):
    """
    Generate train/test matrices for incremental collaborative filtering
    from a random low rank matrix with noise. In the first 10 matrices the
    size is fixed at (self.startM, self.startN) and the number of nonzero
    entries grows; in the last 10 all entries are used and the size grows up
    to (self.endM, self.endN).

    :return: The tuple (trainXList, testXList) of two lists with 20 matrices
        each, where trainXList[i] and testXList[i] go together.
    """
    numpy.random.seed(21)
    r = 50

    U, s, V = SparseUtils.generateLowRank((self.endM, self.endN), r, normalise=False)

    self.startNumInds = self.pnz*self.startM*self.startN
    self.endNumInds = self.pnz*self.endM*self.endN

    #Sizes must be integers for numpy.random
    numInds = int(self.endNumInds)

    if not self.nonUniform:
        inds = numpy.random.randint(0, self.endM*self.endN-1, numInds)
    else:
        logging.debug("Using non uniform dataset")
        #The builtin int replaces numpy.int, which NumPy 1.24 removed
        inds = numpy.array(numpy.random.randn(numInds)*(self.endM*self.endN-1)/4 + (self.endM*self.endN-1)/2, int)
        inds = numpy.clip(inds, 0, (self.endM*self.endN-1))

    inds = numpy.unique(inds)
    numpy.random.shuffle(inds)
    self.endNumInds = inds.shape[0]

    rowInds, colInds = numpy.unravel_index(inds, (self.endM, self.endN))
    rowInds = numpy.array(rowInds, numpy.int32)
    colInds = numpy.array(colInds, numpy.int32)
    vals = SparseUtilsCython.partialReconstructValsPQ(rowInds, colInds, U*s, V)
    vals /= vals.std()
    vals += numpy.random.randn(vals.shape[0])*self.noise

    isTrainInd = numpy.array(numpy.random.rand(inds.shape[0]) <= self.trainSplit, bool)

    #NOTE(review): there used to be an assert on the truth value of
    #(trainSplit - observed train proportion), which could only fail when the
    #split was exactly as requested. It is dropped since the intended
    #tolerance cannot be recovered from the code - confirm if a check is needed

    XMaskTrain = scipy.sparse.csc_matrix((isTrainInd, (rowInds, colInds)), dtype=bool, shape=(self.endM, self.endN))
    XMaskTest = scipy.sparse.csc_matrix((numpy.logical_not(isTrainInd), (rowInds, colInds)), dtype=bool, shape=(self.endM, self.endN))

    #In the first phase, the matrices stay the same size but there are more nonzero
    #entries
    numMatrices = 10
    #Slice bounds must be integers, so the steps are truncated
    stepList = numpy.array(numpy.linspace(self.startNumInds, self.endNumInds, numMatrices), int)
    trainXList = []
    testXList = []

    for numVals in stepList:
        currentVals = vals[0:numVals]
        currentRowInds = rowInds[0:numVals]
        currentColInds = colInds[0:numVals]

        X = scipy.sparse.csc_matrix((currentVals, (currentRowInds, currentColInds)), dtype=float, shape=(self.endM, self.endN))

        trainX = X.multiply(XMaskTrain)[0:self.startM, 0:self.startN]
        trainX.eliminate_zeros()
        trainX.prune()

        testX = X.multiply(XMaskTest)[0:self.startM, 0:self.startN]
        testX.eliminate_zeros()
        testX.prune()

        trainXList.append(trainX)
        testXList.append(testX)

    #Now we increase the size of matrix
    numMatrices = 10
    mStepList = numpy.array(numpy.linspace(self.startM, self.endM, numMatrices), int)
    nStepList = numpy.array(numpy.linspace(self.startN, self.endN, numMatrices), int)

    X = scipy.sparse.csc_matrix((vals, (rowInds, colInds)), dtype=float, shape=(self.endM, self.endN))

    for numRows, numCols in zip(mStepList, nStepList):
        trainX = X.multiply(XMaskTrain)[0:numRows, :][:, 0:numCols]
        trainX.eliminate_zeros()
        trainX.prune()

        testX = X.multiply(XMaskTest)[0:numRows, :][:, 0:numCols]
        testX.eliminate_zeros()
        testX.prune()

        trainXList.append(trainX)
        testXList.append(testX)

    return trainXList, testXList
# Train the model on trainX and log ranking quality on the train and test sets.
logging.debug("Starting training")
logging.debug(maxLocalAuc)

#modelSelectX = trainX[0:100, :]
#maxLocalAuc.learningRateSelect(trainX)
#maxLocalAuc.modelSelect(trainX)
#ProfileUtils.profile('U, V, trainObjs, trainAucs, testObjs, testAucs, iterations, time = maxLocalAuc.learnModel(trainX, testX=testX, verbose=True)', globals(), locals())

U, V, trainMeasures, testMeasures, iterations, time = maxLocalAuc.learnModel(
    trainX, verbose=True)

# Top-p recommendations; the Cython variant is additionally given trainX
p = 10
trainOrderedItems = MCEvaluator.recommendAtk(U, V, p)
testOrderedItems = MCEvaluatorCython.recommendAtk(U, V, p, trainX)

# Per-row objective values on the train and test entries
r = SparseUtilsCython.computeR(U, V, maxLocalAuc.w, maxLocalAuc.numRecordAucSamples)
trainObjVec = maxLocalAuc.objectiveApprox(
    trainOmegaPtr, U, V, r,
    maxLocalAuc.gi, maxLocalAuc.gp, maxLocalAuc.gq,
    full=True)
testObjVec = maxLocalAuc.objectiveApprox(
    testOmegaPtr, U, V, r,
    maxLocalAuc.gi, maxLocalAuc.gp, maxLocalAuc.gq,
    allArray=allOmegaPtr, full=True)

# Column sums of X plus one, used by the stratified recall
itemCounts = numpy.array(X.sum(0)+1, numpy.int32)
beta = 0.5

for p in (1, 3, 5, 10):
    trainPrecision = MCEvaluator.precisionAtK(trainOmegaPtr, trainOrderedItems, p)
    testPrecision = MCEvaluator.precisionAtK(testOmegaPtr, testOrderedItems, p)
    precisionMsg = "Train/test precision@" + str(p) + "="
    precisionMsg += str(trainPrecision) + "/" + str(testPrecision)
    logging.debug(precisionMsg)

for p in (1, 3, 5, 10):
    trainRecall = MCEvaluator.stratifiedRecallAtK(
        trainOmegaPtr, trainOrderedItems, p, itemCounts, beta)
    testRecall = MCEvaluator.stratifiedRecallAtK(
        testOmegaPtr, testOrderedItems, p, itemCounts, beta)
    recallMsg = "Train/test stratified recall@" + str(p) + "="
    recallMsg += str(trainRecall) + "/" + str(testRecall)
    logging.debug(recallMsg)
# NOTE(review): this fragment starts part-way through a computation. The
# statements that define Z, U, V, w, aucSamples, errors, the indices i and j
# and the first value of r are not visible here, and neither are the loops
# that presumably enclose the errors[...] assignments - confirm the nesting
# against the full file before relying on this layout.

# Row 0 of errors: distance between r and the exact row means of Z
rReal = numpy.mean(Z, 1)
errors[0, i, j] = numpy.linalg.norm(rReal - r)

# Row 1: r from computeR with numpy.median against the exact row medians
r = computeR(U, V, aucSamples, numpy.median)
rReal = numpy.median(Z, 1)
errors[1, i, j] = numpy.linalg.norm(rReal - r)

# Row 2: r from computeR with numpy.min against the exact row minima
r = computeR(U, V, aucSamples, numpy.min, 1)
rReal = numpy.min(Z, 1)
errors[2, i, j] = numpy.linalg.norm(rReal - r)

# Row 3: r from computeR with numpy.max against the exact row maxima
r = computeR(U, V, aucSamples, numpy.max, 1)
rReal = numpy.max(Z, 1)
errors[3, i, j] = numpy.linalg.norm(rReal - r)

# Row 4: SparseUtilsCython.computeR against the exact w*100 row percentiles
r = SparseUtilsCython.computeR(U, V, w, aucSamples)
rReal = numpy.percentile(Z, w*100.0, 1)
errors[4, i, j] = numpy.linalg.norm(rReal - r)

# Average over the last axis of errors (the one indexed by j above)
meanErrors = numpy.mean(errors, 2)
print(meanErrors)

# One curve per comparison, plotted against numAucSamples
plt.plot(numAucSamples, meanErrors[0, :], label="mean")
plt.plot(numAucSamples, meanErrors[1, :], label="median")
plt.plot(numAucSamples, meanErrors[2, :], label="min")
plt.plot(numAucSamples, meanErrors[3, :], label="max")
plt.plot(numAucSamples, meanErrors[4, :], label="u=0.1")
plt.legend()
plt.show()
def run():
    """
    Call SparseUtilsCython.computeR(U, V, w, indsPerRow) numRuns times and
    discard the results. All arguments are read from the enclosing scope.
    """
    for _ in range(numRuns):
        SparseUtilsCython.computeR(U, V, w, indsPerRow)
def testGenerateSparseBinaryMatrix(self):
    """
    Check SparseUtils.generateSparseBinaryMatrix for both output formats
    (scipy and csarray), and check the per-row quantile thresholds when the
    quantiles vary between rows (sd > 0).
    """
    m = 5
    n = 10
    k = 3
    shape = (m, n)
    expectedRowSums = numpy.ones(m) * 3
    allOnes = numpy.ones(shape)

    #scipy output: with quantile 0.7 each row has 3 nonzeros
    numpy.random.seed(21)
    X = SparseUtils.generateSparseBinaryMatrix(shape, k, 0.7)
    Xscipy = numpy.array(X.todense())
    nptst.assert_array_equal(numpy.array(X.sum(1)).flatten(), expectedRowSums)

    #With quantile 0 the matrix is (almost) all ones
    X = SparseUtils.generateSparseBinaryMatrix(shape, k, 0.0)
    self.assertTrue(numpy.linalg.norm(X - allOnes) < 1.1)
    #nptst.assert_array_almost_equal(X.todense(), numpy.ones((m,n)))

    #csarray output with the same seed must agree with the scipy output
    numpy.random.seed(21)
    X = SparseUtils.generateSparseBinaryMatrix(shape, k, 0.7, csarray=True)
    Xcsarray = X.toarray()
    nptst.assert_array_equal(numpy.array(X.sum(1)).flatten(), expectedRowSums)

    X = SparseUtils.generateSparseBinaryMatrix(shape, k, 0.0, csarray=True)
    self.assertTrue(numpy.linalg.norm(X.toarray() - allOnes) < 1.1)
    #nptst.assert_array_almost_equal(X.toarray(), numpy.ones((m,n)))

    nptst.assert_array_equal(Xcsarray, Xscipy)

    #Test variation in the quantiles, first with a small and then with a
    #larger standard deviation. The first case thresholds with a strict
    #inequality and the second with a non-strict one
    w = 0.7
    for sd, exceeds in ((0.1, numpy.greater), (0.5, numpy.greater_equal)):
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix(
            shape, k, w, sd=sd, csarray=True, verbose=True)

        Z = (U * s).dot(V.T)
        X2 = numpy.zeros(shape)
        r2 = numpy.zeros(m)
        for row in range(m):
            scores = Z[row, :]
            r2[row] = numpy.percentile(numpy.sort(scores), wv[row] * 100)
            X2[row, exceeds(scores, r2[row])] = 1

        r = SparseUtilsCython.computeR2(U * s, V, wv)
        nptst.assert_array_almost_equal(X.toarray(), X2)
        nptst.assert_array_almost_equal(r, r2)
def next(self):
    """
    Take the next matrix from self.XIterator and run soft impute on it until
    the relative change gamma between successive solutions is at most eps, or
    maxIterations is reached.

    The factors of the previous call (self.oldU, self.oldS, self.oldV) are
    reused as the starting point according to the updateAlg setting, and the
    SVD at each iteration is computed with the algorithm named by svdAlg.

    :return: The tuple (newU, newS, newV). With postProcess enabled the
        factors gain one extra column holding the row/column means of X.

    :raises ValueError: If X is not a csc_matrix, or updateAlg/svdAlg is not
        a known algorithm name.
    """
    X = self.XIterator.next()
    learner = self.iterativeSoftImpute
    logging.debug("Learning on matrix with shape: " + str(X.shape) + " and " + str(X.nnz) + " non-zeros")

    if learner.weighted:
        #Compute row and col probabilities
        up, vp = SparseUtils.nonzeroRowColsProbs(X)
        nzuInds = up==0
        nzvInds = vp==0
        #Adding the zero indicator avoids division by zero; those entries are
        #set to 0 afterwards. (numpy.int was removed in NumPy 1.24)
        u = numpy.sqrt(1/(up + numpy.array(nzuInds, int)))
        v = numpy.sqrt(1/(vp + numpy.array(nzvInds, int)))
        u[nzuInds] = 0
        v[nzvInds] = 0

    if self.rhos is not None:
        learner.setRho(self.rhos.next())

    if not scipy.sparse.isspmatrix_csc(X):
        raise ValueError("X must be a csc_matrix not " + str(type(X)))

    #Figure out what lambda should be
    #PROPACK has problems with convergence
    Y = scipy.sparse.csc_matrix(X, dtype=float)
    U, s, V = ExpSU.SparseUtils.svdArpack(Y, 1, kmax=20)
    del Y
    #U, s, V = SparseUtils.svdPropack(X, 1, kmax=20)
    maxS = s[0]
    logging.debug("Largest singular value : " + str(maxS))

    (n, m) = X.shape

    if self.j == 0:
        self.oldU = numpy.zeros((n, 1))
        self.oldS = numpy.zeros(1)
        self.oldV = numpy.zeros((m, 1))
    else:
        oldN = self.oldU.shape[0]
        oldM = self.oldV.shape[0]

        if learner.updateAlg == "initial":
            #Resize the previous factors so they match the new shape of X
            if n > oldN:
                self.oldU = Util.extendArray(self.oldU, (n, self.oldU.shape[1]))
            elif n < oldN:
                self.oldU = self.oldU[0:n, :]

            if m > oldM:
                self.oldV = Util.extendArray(self.oldV, (m, self.oldV.shape[1]))
            elif m < oldM:
                #The column count must be compared with the rows of oldV
                #(previously this compared against oldN)
                self.oldV = self.oldV[0:m, :]
        elif learner.updateAlg == "zero":
            self.oldU = numpy.zeros((n, 1))
            self.oldS = numpy.zeros(1)
            self.oldV = numpy.zeros((m, 1))
        else:
            raise ValueError("Unknown SVD update algorithm: " + str(learner.updateAlg))

    rowInds, colInds = X.nonzero()

    gamma = learner.eps + 1
    i = 0

    learner.measures = numpy.zeros((learner.maxIterations, 4))

    while gamma > learner.eps:
        if i == learner.maxIterations:
            logging.debug("Maximum number of iterations reached")
            break

        ZOmega = SparseUtilsCython.partialReconstructPQ((rowInds, colInds), self.oldU*self.oldS, self.oldV)
        Y = X - ZOmega
        #Y = Y.tocsc()
        #del ZOmega
        Y = csarray(Y, storagetype="row")
        gc.collect()

        #os.system('taskset -p 0xffffffff %d' % os.getpid())

        if learner.svdAlg=="propack":
            L = LinOperatorUtils.sparseLowRankOp(Y, self.oldU, self.oldS, self.oldV, parallel=False)
            newU, newS, newV = SparseUtils.svdPropack(L, k=learner.k, kmax=learner.kmax)
        elif learner.svdAlg=="arpack":
            L = LinOperatorUtils.sparseLowRankOp(Y, self.oldU, self.oldS, self.oldV, parallel=False)
            newU, newS, newV = SparseUtils.svdArpack(L, k=learner.k, kmax=learner.kmax)
        elif learner.svdAlg=="svdUpdate":
            newU, newS, newV = SVDUpdate.addSparseProjected(self.oldU, self.oldS, self.oldV, Y, learner.k)
        elif learner.svdAlg=="rsvd":
            L = LinOperatorUtils.sparseLowRankOp(Y, self.oldU, self.oldS, self.oldV, parallel=True)
            newU, newS, newV = RandomisedSVD.svd(L, learner.k, p=learner.p, q=learner.q)
        elif learner.svdAlg=="rsvdUpdate":
            L = LinOperatorUtils.sparseLowRankOp(Y, self.oldU, self.oldS, self.oldV, parallel=True)
            if self.j == 0:
                newU, newS, newV = RandomisedSVD.svd(L, learner.k, p=learner.p, q=learner.q)
            else:
                newU, newS, newV = RandomisedSVD.svd(L, learner.k, p=learner.p, q=learner.qu, omega=self.oldV)
        elif learner.svdAlg=="rsvdUpdate2":
            if self.j == 0:
                L = LinOperatorUtils.sparseLowRankOp(Y, self.oldU, self.oldS, self.oldV, parallel=True)
                newU, newS, newV = RandomisedSVD.svd(L, learner.k, p=learner.p, q=learner.q)
            else:
                #Need linear operator which is U s V
                L = LinOperatorUtils.lowRankOp(self.oldU, self.oldS, self.oldV)
                Y = GeneralLinearOperator.asLinearOperator(Y, parallel=True)
                newU, newS, newV = RandomisedSVD.updateSvd(L, self.oldU, self.oldS, self.oldV, Y, learner.k, p=learner.p)
        else:
            raise ValueError("Unknown SVD algorithm: " + learner.svdAlg)

        #In the weighted case the thresholds are computed on the first
        #iteration only and reused afterwards
        if learner.weighted and i==0:
            delta = numpy.diag((u*newU.T).dot(newU))
            pi = numpy.diag((v*newV.T).dot(newV))
            lmbda = (maxS/numpy.max(delta*pi))*learner.rho
            lmbdav = lmbda*delta*pi
        elif not learner.weighted:
            lmbda = maxS*learner.rho
            if i==0:
                logging.debug("lambda: " + str(lmbda))
            lmbdav = lmbda

        #Soft threshold. numpy.maximum is used rather than
        #clip(newS, 0, max(newS)) since the latter returns negative values
        #when every shrunk singular value is negative
        newS = numpy.maximum(newS - lmbdav, 0)

        #||newZ - oldZ||_F^2 computed from the factors only
        normOldZ = (self.oldS**2).sum()
        normNewZmOldZ = (self.oldS**2).sum() + (newS**2).sum() - 2*numpy.trace((self.oldV.T.dot(newV*newS)).dot(newU.T.dot(self.oldU*self.oldS)))

        #We can get newZ == oldZ in which case we break
        if normNewZmOldZ < self.tol:
            gamma = 0
        elif abs(normOldZ) < self.tol:
            gamma = learner.eps + 1
        else:
            gamma = normNewZmOldZ/normOldZ

        if learner.verbose:
            theta1 = (learner.k - numpy.linalg.norm(self.oldU.T.dot(newU), 'fro')**2)/learner.k
            theta2 = (learner.k - numpy.linalg.norm(self.oldV.T.dot(newV), 'fro')**2)/learner.k
            thetaS = numpy.linalg.norm(newS - self.oldS)**2/numpy.linalg.norm(newS)**2
            learner.measures[i, :] = numpy.array([gamma, theta1, theta2, thetaS])

        self.oldU = newU.copy()
        self.oldS = newS.copy()
        self.oldV = newV.copy()

        logging.debug("Iteration " + str(i) + " gamma="+str(gamma))
        i += 1

    if learner.postProcess:
        #Add the mean vectors
        previousS = newS
        newU = numpy.c_[newU, numpy.array(X.mean(1)).ravel()]
        newV = numpy.c_[newV, numpy.array(X.mean(0)).ravel()]
        newS = learner.unshrink(X, newU, newV)

        #Note that this increases the rank of U and V by 1
        #print("Difference in s after postprocessing: " + str(numpy.linalg.norm(previousS - newS[0:-1])))
        logging.debug("Difference in s after postprocessing: " + str(numpy.linalg.norm(previousS - newS[0:-1])))

    logging.debug("Number of iterations for rho="+str(learner.rho) + ": " + str(i))

    self.j += 1
    return (newU, newS, newV)
def recordResults(self, muU, muV, trainMeasures, testMeasures, loopInd, rowSamples, indPtr, colInds, testIndPtr, testColInds, allIndPtr, allColInds, gi, gp, gq, trainX, startTime):
    """
    Compute the current train (and optionally validation) measures for the
    factors muU, muV, append them to the measure lists and return a one-line
    summary string.

    A row [objective, local AUC, elapsed seconds, loopInd] is appended to
    trainMeasures, followed by the expectation bound when self.bound is set.
    When testIndPtr is not None a row [objective, local AUC, f1 per
    recommend size..., mrr per recommend size...] is appended to testMeasures.

    self.recommendSize may be a single size or an iterable of sizes.

    :return: The summary string describing this iteration.
    """
    sigmaU = self.getSigma(loopInd, self.alpha, muU.shape[0])
    #NOTE(review): sigmaV is computed with muU.shape[0], the same as sigmaU -
    #confirm whether muV.shape[0] was intended. It only affects the summary
    sigmaV = self.getSigma(loopInd, self.alpha, muU.shape[0])
    r = SparseUtilsCython.computeR(muU, muV, self.w, self.numRecordAucSamples)
    objArr = self.objectiveApprox((indPtr, colInds), muU, muV, r, gi, gp, gq, full=True)

    #NOTE(review): a list created here is not visible to the caller
    if trainMeasures is None:
        trainMeasures = []
    trainMeasures.append([
        objArr.sum(),
        MCEvaluator.localAUCApprox((indPtr, colInds), muU, muV, self.w, self.numRecordAucSamples, r),
        time.time() - startTime,
        loopInd
    ])

    printStr = "iter " + str(loopInd) + ":"
    printStr += " sigmaU=" + str('%.4f' % sigmaU)
    printStr += " sigmaV=" + str('%.4f' % sigmaV)
    printStr += " train: obj~" + str('%.4f' % trainMeasures[-1][0])
    printStr += " LAUC~" + str('%.4f' % trainMeasures[-1][1])

    if testIndPtr is not None:
        testMeasuresRow = []
        testMeasuresRow.append(
            self.objectiveApprox((testIndPtr, testColInds), muU, muV, r, gi, gp, gq, allArray=(allIndPtr, allColInds)))
        testMeasuresRow.append(
            MCEvaluator.localAUCApprox((testIndPtr, testColInds), muU, muV, self.w, self.numRecordAucSamples, r, allArray=(allIndPtr, allColInds)))
        testOrderedItems = MCEvaluatorCython.recommendAtk(
            muU, muV, numpy.max(self.recommendSize), trainX)

        printStr += " validation: obj~" + str('%.4f' % testMeasuresRow[0])
        printStr += " LAUC~" + str('%.4f' % testMeasuresRow[1])

        #Accept a scalar or an iterable of sizes. Only the iteration itself is
        #guarded, so errors raised by the evaluators are no longer swallowed
        try:
            recommendSizes = list(self.recommendSize)
        except TypeError:
            recommendSizes = [self.recommendSize]

        for p in recommendSizes:
            f1Array, orderedItems = MCEvaluator.f1AtK(
                (testIndPtr, testColInds), testOrderedItems, p, verbose=True)
            testMeasuresRow.append(f1Array[rowSamples].mean())

        #Only the value for the last size is shown in the summary
        printStr += " f1@" + str(self.recommendSize) + "=" + str(
            '%.4f' % testMeasuresRow[-1])

        for p in recommendSizes:
            mrr, orderedItems = MCEvaluator.mrrAtK(
                (testIndPtr, testColInds), testOrderedItems, p, verbose=True)
            testMeasuresRow.append(mrr[rowSamples].mean())

        printStr += " mrr@" + str(self.recommendSize) + "=" + str(
            '%.4f' % testMeasuresRow[-1])
        testMeasures.append(testMeasuresRow)

    printStr += " ||U||=" + str('%.3f' % numpy.linalg.norm(muU))
    printStr += " ||V||=" + str('%.3f' % numpy.linalg.norm(muV))

    if self.bound:
        trainObj = objArr.sum()
        expectationBound = self.computeBound(trainX, muU, muV, trainObj, self.delta)
        printStr += " bound=" + str('%.3f' % expectationBound)
        trainMeasures[-1].append(expectationBound)

    return printStr
def recordResults(
    self,
    muU,
    muV,
    trainMeasures,
    testMeasures,
    loopInd,
    rowSamples,
    indPtr,
    colInds,
    testIndPtr,
    testColInds,
    allIndPtr,
    allColInds,
    gi,
    gp,
    gq,
    trainX,
    startTime,
):
    """
    Compute the current train (and optionally validation) measures for the
    factors muU, muV, append them to the measure lists and return a one-line
    summary string.

    A row [objective, local AUC, elapsed seconds, loopInd] is appended to
    trainMeasures, followed by the expectation bound when self.bound is set.
    When testIndPtr is not None a row [objective, local AUC, f1 per
    recommend size..., mrr per recommend size...] is appended to testMeasures.

    self.recommendSize may be a single size or an iterable of sizes.

    :return: The summary string describing this iteration.
    """
    sigmaU = self.getSigma(loopInd, self.alpha, muU.shape[0])
    # NOTE(review): sigmaV is computed with muU.shape[0], the same as sigmaU -
    # confirm whether muV.shape[0] was intended. It only affects the summary
    sigmaV = self.getSigma(loopInd, self.alpha, muU.shape[0])
    r = SparseUtilsCython.computeR(muU, muV, self.w, self.numRecordAucSamples)
    objArr = self.objectiveApprox((indPtr, colInds), muU, muV, r, gi, gp, gq, full=True)

    # NOTE(review): a list created here is not visible to the caller
    if trainMeasures is None:
        trainMeasures = []
    trainMeasures.append(
        [
            objArr.sum(),
            MCEvaluator.localAUCApprox((indPtr, colInds), muU, muV, self.w, self.numRecordAucSamples, r),
            time.time() - startTime,
            loopInd,
        ]
    )

    printStr = "iter " + str(loopInd) + ":"
    printStr += " sigmaU=" + str("%.4f" % sigmaU)
    printStr += " sigmaV=" + str("%.4f" % sigmaV)
    printStr += " train: obj~" + str("%.4f" % trainMeasures[-1][0])
    printStr += " LAUC~" + str("%.4f" % trainMeasures[-1][1])

    if testIndPtr is not None:
        testMeasuresRow = []
        testMeasuresRow.append(
            self.objectiveApprox(
                (testIndPtr, testColInds), muU, muV, r, gi, gp, gq, allArray=(allIndPtr, allColInds)
            )
        )
        testMeasuresRow.append(
            MCEvaluator.localAUCApprox(
                (testIndPtr, testColInds),
                muU,
                muV,
                self.w,
                self.numRecordAucSamples,
                r,
                allArray=(allIndPtr, allColInds),
            )
        )
        testOrderedItems = MCEvaluatorCython.recommendAtk(muU, muV, numpy.max(self.recommendSize), trainX)

        printStr += " validation: obj~" + str("%.4f" % testMeasuresRow[0])
        printStr += " LAUC~" + str("%.4f" % testMeasuresRow[1])

        # Accept a scalar or an iterable of sizes. Only the iteration itself
        # is guarded, so errors raised by the evaluators are no longer swallowed
        try:
            recommendSizes = list(self.recommendSize)
        except TypeError:
            recommendSizes = [self.recommendSize]

        for p in recommendSizes:
            f1Array, orderedItems = MCEvaluator.f1AtK(
                (testIndPtr, testColInds), testOrderedItems, p, verbose=True
            )
            testMeasuresRow.append(f1Array[rowSamples].mean())

        # Only the value for the last size is shown in the summary
        printStr += " f1@" + str(self.recommendSize) + "=" + str("%.4f" % testMeasuresRow[-1])

        for p in recommendSizes:
            mrr, orderedItems = MCEvaluator.mrrAtK(
                (testIndPtr, testColInds), testOrderedItems, p, verbose=True
            )
            testMeasuresRow.append(mrr[rowSamples].mean())

        printStr += " mrr@" + str(self.recommendSize) + "=" + str("%.4f" % testMeasuresRow[-1])
        testMeasures.append(testMeasuresRow)

    printStr += " ||U||=" + str("%.3f" % numpy.linalg.norm(muU))
    printStr += " ||V||=" + str("%.3f" % numpy.linalg.norm(muV))

    if self.bound:
        trainObj = objArr.sum()
        expectationBound = self.computeBound(trainX, muU, muV, trainObj, self.delta)
        printStr += " bound=" + str("%.3f" % expectationBound)
        trainMeasures[-1].append(expectationBound)

    return printStr
def next(self):
    """
    Take the next matrix from self.XIterator and run soft impute on it until
    the relative change gamma between successive solutions is at most eps, or
    maxIterations is reached.

    The factors of the previous call (self.oldU, self.oldS, self.oldV) are
    reused as the starting point according to the updateAlg setting, and the
    SVD at each iteration is computed with the algorithm named by svdAlg.

    :return: The tuple (newU, newS, newV). With postProcess enabled the
        factors gain one extra column holding the row/column means of X.

    :raises ValueError: If X is not a csc_matrix, or updateAlg/svdAlg is not
        a known algorithm name.
    """
    X = self.XIterator.next()
    softImpute = self.iterativeSoftImpute
    logging.debug("Learning on matrix with shape: " + str(X.shape) +
                  " and " + str(X.nnz) + " non-zeros")

    if softImpute.weighted:
        #Compute row and col probabilities
        up, vp = SparseUtils.nonzeroRowColsProbs(X)
        nzuInds = up == 0
        nzvInds = vp == 0
        #Adding the zero indicator avoids division by zero; those entries are
        #set to 0 afterwards. (numpy.int was removed in NumPy 1.24)
        u = numpy.sqrt(1 / (up + numpy.array(nzuInds, int)))
        v = numpy.sqrt(1 / (vp + numpy.array(nzvInds, int)))
        u[nzuInds] = 0
        v[nzvInds] = 0

    if self.rhos is not None:
        softImpute.setRho(self.rhos.next())

    if not scipy.sparse.isspmatrix_csc(X):
        raise ValueError("X must be a csc_matrix not " + str(type(X)))

    #Figure out what lambda should be
    #PROPACK has problems with convergence
    Y = scipy.sparse.csc_matrix(X, dtype=float)
    U, s, V = ExpSU.SparseUtils.svdArpack(Y, 1, kmax=20)
    del Y
    #U, s, V = SparseUtils.svdPropack(X, 1, kmax=20)
    maxS = s[0]
    logging.debug("Largest singular value : " + str(maxS))

    (n, m) = X.shape

    if self.j == 0:
        self.oldU = numpy.zeros((n, 1))
        self.oldS = numpy.zeros(1)
        self.oldV = numpy.zeros((m, 1))
    else:
        oldN = self.oldU.shape[0]
        oldM = self.oldV.shape[0]

        if softImpute.updateAlg == "initial":
            #Resize the previous factors so they match the new shape of X
            if n > oldN:
                self.oldU = Util.extendArray(
                    self.oldU, (n, self.oldU.shape[1]))
            elif n < oldN:
                self.oldU = self.oldU[0:n, :]

            if m > oldM:
                self.oldV = Util.extendArray(
                    self.oldV, (m, self.oldV.shape[1]))
            elif m < oldM:
                #The column count must be compared with the rows of oldV
                #(previously this compared against oldN)
                self.oldV = self.oldV[0:m, :]
        elif softImpute.updateAlg == "zero":
            self.oldU = numpy.zeros((n, 1))
            self.oldS = numpy.zeros(1)
            self.oldV = numpy.zeros((m, 1))
        else:
            raise ValueError("Unknown SVD update algorithm: " +
                             str(softImpute.updateAlg))

    rowInds, colInds = X.nonzero()

    gamma = softImpute.eps + 1
    i = 0

    softImpute.measures = numpy.zeros((softImpute.maxIterations, 4))

    while gamma > softImpute.eps:
        if i == softImpute.maxIterations:
            logging.debug("Maximum number of iterations reached")
            break

        ZOmega = SparseUtilsCython.partialReconstructPQ(
            (rowInds, colInds), self.oldU * self.oldS, self.oldV)
        Y = X - ZOmega
        #Y = Y.tocsc()
        #del ZOmega
        Y = csarray(Y, storagetype="row")
        gc.collect()

        #os.system('taskset -p 0xffffffff %d' % os.getpid())

        if softImpute.svdAlg == "propack":
            L = LinOperatorUtils.sparseLowRankOp(Y, self.oldU, self.oldS,
                                                 self.oldV, parallel=False)
            newU, newS, newV = SparseUtils.svdPropack(
                L, k=softImpute.k, kmax=softImpute.kmax)
        elif softImpute.svdAlg == "arpack":
            L = LinOperatorUtils.sparseLowRankOp(Y, self.oldU, self.oldS,
                                                 self.oldV, parallel=False)
            newU, newS, newV = SparseUtils.svdArpack(
                L, k=softImpute.k, kmax=softImpute.kmax)
        elif softImpute.svdAlg == "svdUpdate":
            newU, newS, newV = SVDUpdate.addSparseProjected(
                self.oldU, self.oldS, self.oldV, Y, softImpute.k)
        elif softImpute.svdAlg == "rsvd":
            L = LinOperatorUtils.sparseLowRankOp(Y, self.oldU, self.oldS,
                                                 self.oldV, parallel=True)
            newU, newS, newV = RandomisedSVD.svd(
                L, softImpute.k, p=softImpute.p, q=softImpute.q)
        elif softImpute.svdAlg == "rsvdUpdate":
            L = LinOperatorUtils.sparseLowRankOp(Y, self.oldU, self.oldS,
                                                 self.oldV, parallel=True)
            if self.j == 0:
                newU, newS, newV = RandomisedSVD.svd(
                    L, softImpute.k, p=softImpute.p, q=softImpute.q)
            else:
                newU, newS, newV = RandomisedSVD.svd(
                    L, softImpute.k, p=softImpute.p, q=softImpute.qu,
                    omega=self.oldV)
        elif softImpute.svdAlg == "rsvdUpdate2":
            if self.j == 0:
                L = LinOperatorUtils.sparseLowRankOp(Y, self.oldU, self.oldS,
                                                     self.oldV, parallel=True)
                newU, newS, newV = RandomisedSVD.svd(
                    L, softImpute.k, p=softImpute.p, q=softImpute.q)
            else:
                #Need linear operator which is U s V
                L = LinOperatorUtils.lowRankOp(
                    self.oldU, self.oldS, self.oldV)
                Y = GeneralLinearOperator.asLinearOperator(
                    Y, parallel=True)
                newU, newS, newV = RandomisedSVD.updateSvd(
                    L, self.oldU, self.oldS, self.oldV, Y,
                    softImpute.k, p=softImpute.p)
        else:
            raise ValueError("Unknown SVD algorithm: " + softImpute.svdAlg)

        #In the weighted case the thresholds are computed on the first
        #iteration only and reused afterwards
        if softImpute.weighted and i == 0:
            delta = numpy.diag((u * newU.T).dot(newU))
            pi = numpy.diag((v * newV.T).dot(newV))
            lmbda = (maxS / numpy.max(delta * pi)) * softImpute.rho
            lmbdav = lmbda * delta * pi
        elif not softImpute.weighted:
            lmbda = maxS * softImpute.rho
            if i == 0:
                logging.debug("lambda: " + str(lmbda))
            lmbdav = lmbda

        #Soft threshold. numpy.maximum is used rather than
        #clip(newS, 0, max(newS)) since the latter returns negative values
        #when every shrunk singular value is negative
        newS = numpy.maximum(newS - lmbdav, 0)

        #||newZ - oldZ||_F^2 computed from the factors only
        normOldZ = (self.oldS**2).sum()
        normNewZmOldZ = (self.oldS**2).sum() + (newS**2).sum() - 2 * numpy.trace(
            (self.oldV.T.dot(newV * newS)).dot(
                newU.T.dot(self.oldU * self.oldS)))

        #We can get newZ == oldZ in which case we break
        if normNewZmOldZ < self.tol:
            gamma = 0
        elif abs(normOldZ) < self.tol:
            gamma = softImpute.eps + 1
        else:
            gamma = normNewZmOldZ / normOldZ

        if softImpute.verbose:
            theta1 = (softImpute.k - numpy.linalg.norm(
                self.oldU.T.dot(newU), 'fro')**2) / softImpute.k
            theta2 = (softImpute.k - numpy.linalg.norm(
                self.oldV.T.dot(newV), 'fro')**2) / softImpute.k
            thetaS = numpy.linalg.norm(
                newS - self.oldS)**2 / numpy.linalg.norm(newS)**2
            softImpute.measures[i, :] = numpy.array(
                [gamma, theta1, theta2, thetaS])

        self.oldU = newU.copy()
        self.oldS = newS.copy()
        self.oldV = newV.copy()

        logging.debug("Iteration " + str(i) + " gamma=" + str(gamma))
        i += 1

    if softImpute.postProcess:
        #Add the mean vectors
        previousS = newS
        newU = numpy.c_[newU, numpy.array(X.mean(1)).ravel()]
        newV = numpy.c_[newV, numpy.array(X.mean(0)).ravel()]
        newS = softImpute.unshrink(X, newU, newV)

        #Note that this increases the rank of U and V by 1
        #print("Difference in s after postprocessing: " + str(numpy.linalg.norm(previousS - newS[0:-1])))
        logging.debug("Difference in s after postprocessing: " +
                      str(numpy.linalg.norm(previousS - newS[0:-1])))

    logging.debug("Number of iterations for rho=" +
                  str(softImpute.rho) + ": " + str(i))

    self.j += 1
    return (newU, newS, newV)
import logging
import os
import sys

import numpy
import sppy.io

from sandbox.util.PathDefaults import PathDefaults
from sandbox.util.SparseUtils import SparseUtils
from sandbox.util.SparseUtilsCython import SparseUtilsCython

# Script: generate a synthetic sparse binary ranking dataset with a fixed
# seed, prune it with SparseUtils.pruneMatrixRows(minNnzRows=10), and save it
# in Matrix Market format under the project data directory.

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
numpy.random.seed(21)

# Matrix shape (m, n), k as passed to the generator, and target density
m, n, k = 600, 300, 8
density = 0.1

X, U, V = SparseUtilsCython.generateSparseBinaryMatrixPL(
    (m, n), k, density=density, alpha=1, csarray=True)
X = SparseUtils.pruneMatrixRows(X, minNnzRows=10)

# Create the output directory on first use
resultsDir = PathDefaults.getDataDir() + "syntheticRanking/"
if not os.path.exists(resultsDir):
    os.mkdir(resultsDir)

matrixFileName = resultsDir + "dataset1.mtx"
sppy.io.mmwrite(matrixFileName, X)

logging.debug("Non-zero elements: " + str(X.nnz) + " shape: " + str(X.shape))
logging.debug("Saved file: " + matrixFileName)