def testGenerateSparseBinaryMatrix(self):
    """
    Check SparseUtils.generateSparseBinaryMatrix for both output formats
    (scipy csr and csarray) and for per-row quantiles with added noise.
    """
    m = 5
    n = 10
    k = 3

    # Same seed for both storage formats so the dense contents can be
    # compared afterwards.
    dense = {}
    for useCsarray in (False, True):
        numpy.random.seed(21)
        X = SparseUtils.generateSparseBinaryMatrix((m, n), k, 0.7, csarray=useCsarray)
        dense[useCsarray] = X.toarray()
        nptst.assert_array_equal(numpy.array(X.sum(1)).flatten(), numpy.ones(m)*3)

        # A quantile of 0 should keep (almost) every entry; the norm bound
        # tolerates a single differing element.
        X = SparseUtils.generateSparseBinaryMatrix((m, n), k, 0.0, csarray=useCsarray)
        self.assertTrue(numpy.linalg.norm(X.toarray() - numpy.ones((m, n))) < 1.1)

    nptst.assert_array_equal(dense[True], dense[False])

    # Test variation in the quantiles, first with a small and then with a
    # larger standard deviation.
    w = 0.7
    for sd in (0.1, 0.5):
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix((m, n), k, w, sd=sd, csarray=True, verbose=True)

        Z = (U*s).dot(V.T)
        X2 = numpy.zeros((m, n))
        r2 = numpy.zeros(m)
        for i in range(m):
            r2[i] = numpy.percentile(Z[i, :], wv[i]*100)
            # Use >= to mirror the thresholding in generateSparseBinaryMatrix;
            # a strict > disagrees when the threshold equals a row entry
            # (e.g. when the noisy quantile is clipped to 0 or 1).
            X2[i, Z[i, :] >= r2[i]] = 1
        r = SparseUtilsCython.computeR2(U*s, V, wv)
        nptst.assert_array_almost_equal(X.toarray(), X2)
        nptst.assert_array_almost_equal(r, r2)
def generateSparseBinaryMatrix(shape, p, w=0.9, sd=0, csarray=False, verbose=False, indsPerRow=50):
    """
    Create an underlying matrix Z = UsV.T of rank p and then threshold each
    row so that only the entries at or above that row's quantile are kept.
    The final matrix is a 0/1 matrix.

    Each row i gets its own quantile wv[i], drawn as w plus Gaussian noise of
    standard deviation sd and clipped to [0, 1]. The row threshold r[i] is
    computed by SparseUtilsCython.computeR2 and entries >= r[i] become 1, the
    rest 0. In other words a quantile of 0 keeps (nearly) all entries and a
    quantile close to 1 keeps almost none.

    :param shape: The (m, n) shape of the matrix.
    :param p: The rank passed to SparseUtils.generateLowRank.
    :param w: The mean quantile used for each row.
    :param sd: The standard deviation of the noise added to w for each row.
    :param csarray: If True return an sppy.csarray with row storage,
        otherwise a scipy.sparse.csr_matrix.
    :param verbose: If True return (X, U, s, V, wv) rather than just X.
    :param indsPerRow: Forwarded to SparseUtilsCython.computeR2.
    :return: The sparse binary matrix X, or (X, U, s, V, wv) if verbose.
    """
    m, _ = shape
    U, s, V = SparseUtils.generateLowRank(shape, p)

    X = (U*s).dot(V.T)

    wv = numpy.random.randn(m)*sd + w
    wv = numpy.clip(wv, 0, 1)
    r = SparseUtilsCython.computeR2((U*s), V, wv, indsPerRow=indsPerRow)

    for i in range(m):
        # Compute the mask from the original values and write it in one go.
        # Thresholding in two passes re-tests the ones written by the first
        # pass, which wipes the whole row whenever r[i] > 1.
        X[i, :] = X[i, :] >= r[i]

    if csarray:
        import sppy
        X = sppy.csarray(X, storagetype="row")
    else:
        X = scipy.sparse.csr_matrix(X)

    if verbose:
        return X, U, s, V, wv
    else:
        return X
def testComputeR2(self):
    """
    Check that SparseUtilsCython.computeR2 returns, for each row of U V^T,
    a value close to the percentile given by the corresponding entry of w
    (w=0 giving the row minimum and w=1 the row maximum).
    """
    def relativeError(expected, actual):
        # Error of actual measured relative to the norm of the reference.
        return numpy.linalg.norm(expected - actual)/numpy.linalg.norm(expected)

    def rowPercentiles(Z, w):
        # Reference thresholds: the w[i]-quantile of row i of Z.
        return numpy.array([numpy.percentile(Z[i, :], w[i]*100.0) for i in range(Z.shape[0])])

    m = 10
    n = 15
    U = numpy.random.rand(m, 5)
    V = numpy.random.rand(n, 5)

    Z = U.dot(V.T)
    tol = 0.1

    # w = 1 everywhere: expect the row maxima
    w = numpy.ones(m)*1.0
    r = SparseUtilsCython.computeR2(U, V, w, indsPerRow=1000)
    self.assertTrue(relativeError(Z.max(1), r) < tol)

    # w = 0 everywhere: expect the row minima
    w = numpy.zeros(m)
    r = SparseUtilsCython.computeR2(U, V, w, indsPerRow=1000)
    self.assertTrue(relativeError(Z.min(1), r) < tol)

    # Mixed: minima for the first five rows, maxima for the rest
    w = numpy.zeros(m)
    w[5:10] = 1
    r = SparseUtilsCython.computeR2(U, V, w, indsPerRow=1000)
    self.assertTrue(relativeError(Z[0:5, :].min(1), r[0:5]) < tol)
    # Normalise by the row maxima, the quantity being compared against;
    # dividing by the norm of the minima mixed up two references.
    self.assertTrue(relativeError(Z[5:, :].max(1), r[5:]) < tol)

    # A constant interior quantile
    w = numpy.ones(m)*0.3
    r = SparseUtilsCython.computeR2(U, V, w, indsPerRow=1000)
    r2 = rowPercentiles(Z, w)
    self.assertTrue(relativeError(r2, r) < tol)

    # Random quantiles with the default indsPerRow
    w = numpy.random.rand(m)
    r = SparseUtilsCython.computeR2(U, V, w)
    r2 = rowPercentiles(Z, w)
    self.assertTrue(relativeError(r2, r) < tol)

    #Try a larger matrix
    m = 100
    n = 105
    U = numpy.random.rand(m, 5)
    V = numpy.random.rand(n, 5)

    Z = U.dot(V.T)
    w = numpy.random.rand(m)
    r = SparseUtilsCython.computeR2(U, V, w, indsPerRow=10000)
    r2 = rowPercentiles(Z, w)

    # Absolute (not relative) error bound for the larger problem
    self.assertTrue(numpy.linalg.norm(r - r2) < 0.4)
def generateSparseBinaryMatrix(shape, p, w=0.9, sd=0, csarray=False, verbose=False, indsPerRow=50):
    """
    Create an underlying matrix Z = UsV.T of rank p and then threshold each
    row so that only the entries at or above that row's quantile are kept.
    The final matrix is a 0/1 matrix.

    Each row i gets its own quantile wv[i], drawn as w plus Gaussian noise of
    standard deviation sd and clipped to [0, 1]. The row threshold r[i] is
    computed by SparseUtilsCython.computeR2 and entries >= r[i] become 1, the
    rest 0. In other words a quantile of 0 keeps (nearly) all entries and a
    quantile close to 1 keeps almost none.

    :param shape: The (m, n) shape of the matrix.
    :param p: The rank passed to SparseUtils.generateLowRank.
    :param w: The mean quantile used for each row.
    :param sd: The standard deviation of the noise added to w for each row.
    :param csarray: If True return an sppy.csarray with row storage,
        otherwise a scipy.sparse.csr_matrix.
    :param verbose: If True return (X, U, s, V, wv) rather than just X.
    :param indsPerRow: Forwarded to SparseUtilsCython.computeR2.
    :return: The sparse binary matrix X, or (X, U, s, V, wv) if verbose.
    """
    m, _ = shape
    U, s, V = SparseUtils.generateLowRank(shape, p)

    X = (U * s).dot(V.T)

    wv = numpy.random.randn(m) * sd + w
    wv = numpy.clip(wv, 0, 1)
    r = SparseUtilsCython.computeR2((U * s), V, wv, indsPerRow=indsPerRow)

    for i in range(m):
        # Compute the mask from the original values and write it in one go.
        # Thresholding in two passes re-tests the ones written by the first
        # pass, which wipes the whole row whenever r[i] > 1.
        X[i, :] = X[i, :] >= r[i]

    if csarray:
        import sppy
        X = sppy.csarray(X, storagetype="row")
    else:
        X = scipy.sparse.csr_matrix(X)

    if verbose:
        return X, U, s, V, wv
    else:
        return X
def testGenerateSparseBinaryMatrix(self):
    """
    Check SparseUtils.generateSparseBinaryMatrix for both output formats
    (scipy csr and csarray) and for per-row quantiles with added noise.
    """
    m = 5
    n = 10
    k = 3

    # Same seed for both storage formats so the dense contents can be
    # compared afterwards.
    dense = {}
    for useCsarray in (False, True):
        numpy.random.seed(21)
        X = SparseUtils.generateSparseBinaryMatrix(
            (m, n), k, 0.7, csarray=useCsarray)
        dense[useCsarray] = X.toarray()
        nptst.assert_array_equal(
            numpy.array(X.sum(1)).flatten(), numpy.ones(m) * 3)

        # A quantile of 0 should keep (almost) every entry; the norm bound
        # tolerates a single differing element.
        X = SparseUtils.generateSparseBinaryMatrix(
            (m, n), k, 0.0, csarray=useCsarray)
        self.assertTrue(
            numpy.linalg.norm(X.toarray() - numpy.ones((m, n))) < 1.1)

    nptst.assert_array_equal(dense[True], dense[False])

    # Test variation in the quantiles, first with a small and then with a
    # larger standard deviation.
    w = 0.7
    for sd in (0.1, 0.5):
        X, U, s, V, wv = SparseUtils.generateSparseBinaryMatrix(
            (m, n), k, w, sd=sd, csarray=True, verbose=True)

        Z = (U * s).dot(V.T)
        X2 = numpy.zeros((m, n))
        r2 = numpy.zeros(m)
        for i in range(m):
            r2[i] = numpy.percentile(Z[i, :], wv[i] * 100)
            # Use >= to mirror the thresholding in generateSparseBinaryMatrix;
            # a strict > disagrees when the threshold equals a row entry
            # (e.g. when the noisy quantile is clipped to 0 or 1).
            X2[i, Z[i, :] >= r2[i]] = 1
        r = SparseUtilsCython.computeR2(U * s, V, wv)
        nptst.assert_array_almost_equal(X.toarray(), X2)
        nptst.assert_array_almost_equal(r, r2)