def test_large_mult_vec():
    """Multiply a CSR whose stored-entry count exceeds the 32-bit range.

    Every row holds the same number of entries, so the row pointers are an
    evenly spaced range.  The test is skipped when the index or value
    arrays cannot be allocated.
    """
    # 10M rows * 250 entries per row = 2.5B stored entries >= INT_MAX
    n_rows = 10_000_000
    n_cols = 500
    per_row = 250
    total = n_rows * per_row

    row_ptrs = np.arange(0, total + 1, per_row, dtype=np.int64)
    assert len(row_ptrs) == n_rows + 1
    assert row_ptrs[-1] == total

    try:
        _log.info('allocating indexes')
        col_inds = np.empty(total, dtype=np.intc)
        _log.info('allocating values')
        vals = np.zeros(total)
    except MemoryError:
        skip('insufficient memory')

    _log.info('randomizing array contents')
    fill_rows(vals, col_inds, n_rows, n_cols, per_row)

    big = CSR(n_rows, n_cols, total, row_ptrs, col_inds, vals)
    vec = np.random.randn(n_cols)
    product = big.mult_vec(vec)

    # one output per row, and none of them NaN
    assert product.shape == (n_rows,)
    assert not np.any(np.isnan(product))
def _train_matrix_cd(mat: CSR, this: np.ndarray, other: np.ndarray, reg: float):
    """
    One half of an explicit ALS training round using coordinate descent.

    Each row of ``mat`` that has stored entries is solved on a copy of the
    matching row of ``this``; the copy then replaces that row.  Rows of
    ``mat`` without stored entries leave ``this`` untouched.

    Args:
        mat: the :math:`m \\times n` matrix of ratings
        this: the :math:`m \\times k` matrix to train (rows are overwritten)
        other: the :math:`n \\times k` matrix of sample features
        reg: the regularization term; it is multiplied by the number of
            stored entries in a row before being handed to the solver

    Returns:
        The square root of the summed squared differences between the old
        and new rows of ``this``.
    """
    nr = mat.nrows
    nf = other.shape[1]
    # shape agreement between the rating matrix and both feature matrices
    assert mat.ncols == other.shape[0]
    assert mat.nrows == this.shape[0]
    assert this.shape[1] == nf

    # running sum of squared row changes
    frob = 0.0

    # NOTE(review): prange is presumably numba's parallel range, which would
    # make frob a reduction variable -- confirm against the file's imports
    for i in prange(nr):
        cols = mat.row_cs(i)
        if len(cols) == 0:
            # nothing stored for this row, so there is nothing to fit
            continue

        vals = mat.row_vs(i)

        # work on a copy so the old row is still available for the delta
        w = this[i, :].copy()
        # the code below relies on _rr_solve updating w in place; the final
        # argument (2) is presumably an iteration count -- TODO confirm
        _rr_solve(other, cols, vals, w, reg * len(cols), 2)
        delta = this[i, :] - w
        frob += np.dot(delta, delta)
        this[i, :] = w

    return np.sqrt(frob)
def test_mult_ab_by_size(kernel, benchmark, size):
    """Benchmark the product of two random square CSR matrices.

    Both operands are ``size`` by ``size`` with density 0.1.
    """
    left_sp = sps.random(size, size, 0.1, format='csr')
    right_sp = sps.random(size, size, 0.1, format='csr')
    left = CSR.from_scipy(left_sp)
    right = CSR.from_scipy(right_sp)

    def multiply():
        left.multiply(right)

    # one untimed call first, so any compilation is not part of the timing
    multiply()

    benchmark(multiply)
def test_mult_abt_by_density(kernel, benchmark, density):
    """Benchmark ``multiply(..., transpose=True)`` on 100x100 random matrices.

    Both operands are generated with the given ``density``.
    """
    left_sp = sps.random(100, 100, density, format='csr')
    right_sp = sps.random(100, 100, density, format='csr')
    left = CSR.from_scipy(left_sp)
    right = CSR.from_scipy(right_sp)

    def multiply_transposed():
        left.multiply(right, transpose=True)

    # one untimed call first, so any compilation is not part of the timing
    multiply_transposed()

    benchmark(multiply_transposed)
def test_mult_ab(kernel, benchmark):
    """Benchmark the product of a 100x500 and a 500x200 random CSR matrix."""
    left_sp = sps.random(100, 500, 0.1, format='csr')
    right_sp = sps.random(500, 200, 0.2, format='csr')
    left = CSR.from_scipy(left_sp)
    right = CSR.from_scipy(right_sp)

    def multiply():
        left.multiply(right)

    # one untimed call first, so any compilation is not part of the timing
    multiply()

    benchmark(multiply)
def test_csr_from_coo_novals(data, nrows, ncols):
    """Build a CSR from COO coordinates without values and check each row.

    Unique flat cell indices are drawn and then split into row and column
    coordinates, so no cell is repeated.
    """
    n_cells = nrows * ncols
    nnz = data.draw(st.integers(0, int(n_cells * 0.75)))
    _log.info('testing %d×%d (%d nnz) with no values', nrows, ncols, nnz)

    cell_strategy = st.integers(0, max(n_cells - 1, 0))
    cells = data.draw(
        nph.arrays(np.int32, nnz, elements=cell_strategy, unique=True))

    # flat index -> (row, column)
    rows = np.mod(cells, nrows, dtype=np.int32)
    cols = np.floor_divide(cells, nrows, dtype=np.int32)

    csr = CSR.from_coo(rows, cols, None, (nrows, ncols))
    rowinds = csr.rowinds()

    assert csr.nrows == nrows
    assert csr.ncols == ncols
    assert csr.nnz == nnz

    for r in range(nrows):
        start = csr.rowptrs[r]
        stop = csr.rowptrs[r + 1]
        width = stop - start

        in_row = rows == r
        assert width == np.sum(in_row)

        hits, = np.nonzero(in_row)
        assert len(hits) == width

        # the input columns of this row, in ascending order
        expected_cols = np.sort(cols[hits])
        assert all(np.sort(csr.colinds[start:stop]) == expected_cols)
        assert all(np.sort(csr.row_cs(r)) == expected_cols)
        assert all(rowinds[start:stop] == r)

        # the dense row must sum to the number of stored entries
        dense_row = csr.row(r)
        assert np.sum(dense_row) == width
def test_shard(csr):
    """Split a CSR into row shards, check them, and reassemble the matrix."""
    SHARD_SIZE = 1000
    pieces = csr._shard_rows(SHARD_SIZE)

    # nothing was lost and no shard is over the limit
    assert sum(p.nnz for p in pieces) == csr.nnz
    assert all(p.nnz <= SHARD_SIZE for p in pieces)

    # shard contents, laid end to end, reproduce the original arrays
    joined_nnzs = np.concatenate([p.row_nnzs() for p in pieces])
    assert np.all(joined_nnzs == csr.row_nnzs())

    joined_cols = np.concatenate([p.colinds for p in pieces])
    assert np.all(joined_cols == csr.colinds)

    joined_vals = np.concatenate([p.values for p in pieces])
    assert np.all(joined_vals == csr.values)

    # reassembly gives back a matrix equal to the one we started with
    rebuilt = CSR._assemble_shards(pieces)
    assert rebuilt.nrows == csr.nrows
    assert rebuilt.ncols == csr.ncols
    assert rebuilt.nnz == csr.nnz
    assert np.all(rebuilt.rowptrs == csr.rowptrs)
    assert np.all(rebuilt.colinds == csr.colinds)
    assert np.all(rebuilt.values == csr.values)
def mult_ab(a_h, b_h):
    """
    Multiply matrices A and B.

    Args:
        a_h: matrix A; its column count must equal B's row count
        b_h: matrix B

    Returns:
        a new CSR holding the product, with A's row count and B's column
        count.
    """
    assert a_h.ncols == b_h.nrows

    n_rows = a_h.nrows
    row_ptrs = np.zeros(n_rows + 1, np.intc)

    # symbolic pass: yields the column indices; the total entry count is
    # read back from the last row pointer afterwards
    col_inds = _sym_mm(a_h, b_h, row_ptrs)
    nnz = row_ptrs[n_rows]

    # numeric pass: the values for the structure found above
    vals = _num_mm(a_h, b_h, row_ptrs, col_inds)

    return CSR(n_rows, b_h.ncols, nnz, row_ptrs, col_inds, vals)
def test_csr_str():
    """The string form of a 4x3 CSR mentions its dimensions."""
    rows = np.array([0, 0, 1, 3], dtype=np.int32)
    cols = np.array([1, 2, 0, 1], dtype=np.int32)
    # np.float64 rather than np.float_: the alias was removed in NumPy 2.0
    vals = np.arange(4, dtype=np.float64)

    csr = CSR.from_coo(rows, cols, vals)

    assert '4x3' in str(csr)
def mkh(csr):
    """Build a handle for ``csr`` after converting its values to float64.

    A matrix with no stored entries gets an ``mkl_h`` constructed directly
    with 0 as its first argument; any other matrix goes through
    ``_make_handle``.
    """
    values = csr._required_values().astype(np.float64)
    as_f64 = CSR(csr.nrows, csr.ncols, csr.nnz,
                 csr.rowptrs, csr.colinds, values)

    if csr.nnz != 0:
        return _make_handle(as_f64)

    return mkl_h(0, csr.nrows, csr.ncols, as_f64)
def test_csr_rowinds():
    """``rowinds()`` gives back the row coordinate of every stored entry."""
    rows = np.array([0, 0, 1, 3], dtype=np.int32)
    cols = np.array([1, 2, 0, 1], dtype=np.int32)
    # np.float64 rather than np.float_: the alias was removed in NumPy 2.0
    vals = np.arange(4, dtype=np.float64)

    csr = CSR.from_coo(rows, cols, vals)

    ris = csr.rowinds()
    # the input rows are already in ascending order, so they compare directly
    assert all(ris == rows)
def test_empty(nrows, ncols):
    """An empty CSR has the requested shape and no stored entries."""
    blank = CSR.empty(nrows, ncols)

    # dimensions and entry count
    assert blank.nrows == nrows
    assert blank.ncols == ncols
    assert blank.nnz == 0

    # storage: one all-zero row pointer per row plus one, and no columns
    assert len(blank.rowptrs) == nrows + 1
    assert all(blank.rowptrs == 0)
    assert len(blank.colinds) == 0
def test_csr_row_extent_fixed():
    """``row_extent`` gives the (start, end) storage range of each row."""
    rows = np.array([0, 0, 1, 3], dtype=np.int32)
    cols = np.array([1, 2, 0, 1], dtype=np.int32)
    # np.float64 rather than np.float_: the alias was removed in NumPy 2.0
    vals = np.arange(4, dtype=np.float64) + 1

    csr = CSR.from_coo(rows, cols, vals)

    assert csr.row_extent(0) == (0, 2)
    assert csr.row_extent(1) == (2, 3)
    # row 2 has no entries, so its extent is empty
    assert csr.row_extent(2) == (3, 3)
    assert csr.row_extent(3) == (3, 4)
def test_csr_row_fixed():
    """``row`` gives each row as a dense vector, zero where nothing is stored."""
    rows = np.array([0, 0, 1, 3], dtype=np.int32)
    cols = np.array([1, 2, 0, 1], dtype=np.int32)
    # np.float64 rather than np.float_: the alias was removed in NumPy 2.0
    vals = np.arange(4, dtype=np.float64) + 1

    csr = CSR.from_coo(rows, cols, vals)

    assert all(csr.row(0) == np.array([0, 1, 2], dtype=np.float64))
    assert all(csr.row(1) == np.array([3, 0, 0], dtype=np.float64))
    # row 2 has no stored entries
    assert all(csr.row(2) == np.array([0, 0, 0], dtype=np.float64))
    assert all(csr.row(3) == np.array([0, 4, 0], dtype=np.float64))
def test_csr_from_coo_fixed():
    """Make a CSR from COO data and check its shape, count and values.

    No explicit shape is passed to ``from_coo``; the test expects a 4x3
    result for these coordinates.
    """
    rows = np.array([0, 0, 1, 3], dtype=np.int32)
    cols = np.array([1, 2, 0, 1], dtype=np.int32)
    # np.float64 rather than np.float_: the alias was removed in NumPy 2.0
    vals = np.arange(4, dtype=np.float64)

    csr = CSR.from_coo(rows, cols, vals)

    assert csr.nrows == 4
    assert csr.ncols == 3
    assert csr.nnz == 4
    assert csr.values == approx(vals)
def test_unit_norm(csr: CSR):
    """Unit-normalize rows and compare against a copy taken beforehand.

    ``normalize_rows('unit')`` is expected to return one norm per row.
    Rows whose norm is positive end up with unit length and can be scaled
    back to the original values; stored rows whose norm is not positive
    are expected to hold NaN.
    """
    original = csr.copy()
    norms = csr.normalize_rows('unit')

    assert len(norms) == csr.nrows
    assert norms.dtype == csr.values.dtype

    for r in range(csr.nrows):
        scaled = csr.row_vs(r)
        before = original.row_vs(r)

        if len(scaled) == 0:
            # empty rows report a zero norm
            assert norms[r] == 0.0
            continue

        assert norms[r] == approx(np.linalg.norm(before))

        if norms[r] > 0:
            assert np.linalg.norm(scaled) == approx(1.0)
            # multiplying the norm back in recovers the original values
            assert scaled * norms[r] == approx(original.row_vs(r))
        else:
            assert all(np.isnan(scaled))
def test_csr_set_values():
    """Assigning a same-length array to ``values`` replaces the stored values."""
    rows = np.array([0, 0, 1, 3], dtype=np.int32)
    cols = np.array([1, 2, 0, 1], dtype=np.int32)
    # np.float64 rather than np.float_: the alias was removed in NumPy 2.0
    vals = np.arange(4, dtype=np.float64)

    csr = CSR.from_coo(rows, cols, vals)

    v2 = 10 - vals
    csr.values = v2

    assert all(csr.values == v2)
def to_handle(csr: CSR) -> mkl_h:
    """Create a handle for ``csr``.

    Args:
        csr: the matrix to wrap.

    Returns:
        For a matrix with no stored entries, an ``mkl_h`` built directly
        with 0 and ``None`` as its first and last arguments.  Otherwise the
        result of ``_make_handle`` on the matrix normalized to float64
        values and ``intc`` indices.

    Raises:
        ValueError: if the matrix has more stored entries than
            ``_hpkg.max_nnz``.
    """
    nnz = csr.nnz
    limit = _hpkg.max_nnz
    if nnz > limit:
        raise ValueError('CSR size {} exceeds max nnz {}'.format(nnz, limit))

    if nnz == 0:
        # empty matrices don't really work
        return mkl_h(0, csr.nrows, csr.ncols, None)

    normalized = csr._normalize(np.float64, np.intc)
    return _make_handle(normalized)
def mult_vec(h: CSR, v):
    """Multiply the matrix ``h`` by the vector ``v``.

    Walks the stored entries in storage order and keeps a row cursor that
    moves forward whenever the entry position reaches the next row pointer.

    Args:
        h: the matrix; read through ``nrows``, ``nnz``, ``rowptrs``,
            ``colinds`` and ``_e_value``.
        v: the vector, indexed by column.

    Returns:
        An array with one entry per row of ``h``.
    """
    out = np.zeros(h.nrows)

    r = 0
    for pos in range(h.nnz):
        # skip ahead past any rows that end at this position
        while h.rowptrs[r + 1] == pos:
            r += 1

        c = h.colinds[pos]
        out[r] += v[c] * h._e_value(pos)

    return out
def test_csr_set_values_none():
    """Setting ``values`` to None leaves a structure-only matrix.

    After the values are cleared, dense rows read as 1 at every stored
    position and 0 elsewhere.
    """
    rows = np.array([0, 0, 1, 3], dtype=np.int32)
    cols = np.array([1, 2, 0, 1], dtype=np.int32)
    # np.float64 rather than np.float_: the alias was removed in NumPy 2.0
    vals = np.arange(4, dtype=np.float64)

    csr = CSR.from_coo(rows, cols, vals)
    csr.values = None

    assert csr.values is None
    assert all(csr.row(0) == [0, 1, 1])
    assert all(csr.row(1) == [1, 0, 0])
    assert all(csr.row(3) == [0, 1, 0])
def test_csr_set_values_oversize():
    """Assigning a too-long array to ``values`` keeps only the leading entries."""
    rows = np.array([0, 0, 1, 3], dtype=np.int32)
    cols = np.array([1, 2, 0, 1], dtype=np.int32)
    # np.float64 rather than np.float_: the alias was removed in NumPy 2.0
    vals = np.arange(4, dtype=np.float64)

    csr = CSR.from_coo(rows, cols, vals)

    # six values for a matrix with four stored entries
    v2 = np.arange(6, dtype=np.float64) + 10
    csr.values = v2

    assert csr.values is not None
    assert all(csr.values == v2[:4])
def test_mult_vec(kernel, benchmark):
    """Benchmark matrix-vector multiplication on a 100x100 random CSR."""
    mat = CSR.from_scipy(sps.random(100, 100, 0.1, format='csr'))
    vec = np.random.randn(100)

    # one untimed call first, so any compilation is not part of the timing;
    # it doubles as a check on the output length
    warmup = mat.mult_vec(vec)
    assert len(warmup) == mat.nrows

    def multiply():
        mat.mult_vec(vec)

    benchmark(multiply)
def test_sps_to_csr(data, format):
    """Convert a drawn SciPy sparse matrix and compare with SciPy's own CSR."""
    source = data.draw(sparse_matrices(format=format))
    n_rows, n_cols = source.shape
    reference: sps.csr_matrix = source.tocsr()

    converted = CSR.from_scipy(source)

    assert converted.ncols == n_cols
    assert converted.nrows == n_rows
    assert converted.nnz == source.nnz

    # storage arrays must match SciPy's CSR form element for element
    assert np.all(converted.rowptrs == reference.indptr)
    assert np.all(converted.colinds == reference.indices)
    assert np.all(converted.values == reference.data)
def test_csr_set_values_undersize():
    """Assigning a too-short array to ``values`` raises and changes nothing."""
    rows = np.array([0, 0, 1, 3], dtype=np.int32)
    cols = np.array([1, 2, 0, 1], dtype=np.int32)
    # np.float64 rather than np.float_: the alias was removed in NumPy 2.0
    vals = np.arange(4, dtype=np.float64)

    csr = CSR.from_coo(rows, cols, vals)

    # three values for a matrix with four stored entries
    v2 = np.arange(3, dtype=np.float64) + 5

    with raises(ValueError):
        csr.values = v2

    # the failed assignment must leave the original values in place
    assert all(csr.values == vals)
def test_csr_from_sps_csr(smat, copy):
    """Test creating a CSR from a SciPy CSR matrix.

    Checks the dimensions, the storage arrays against the SciPy matrix,
    and that the storage arrays are NumPy arrays.
    """
    converted = CSR.from_scipy(smat, copy=copy)

    n_rows, n_cols = smat.shape[0], smat.shape[1]
    assert converted.nnz == smat.nnz
    assert converted.nrows == n_rows
    assert converted.ncols == n_cols

    assert all(converted.rowptrs == smat.indptr)
    assert all(converted.colinds == smat.indices)
    assert all(converted.values == smat.data)

    assert isinstance(converted.rowptrs, np.ndarray)
    assert isinstance(converted.colinds, np.ndarray)
    # the value array's type is only checked when there are stored entries
    if converted.nnz > 0:
        assert isinstance(converted.values, np.ndarray)
def _normalize(self, rmat):
    """Scale every column of the rating matrix by the reciprocal of its 2-norm.

    Columns whose norm is not positive keep their norm as the scale factor
    instead of a reciprocal.  Any NaN left in the result is replaced by 0.

    Args:
        rmat: the rating matrix; converted with ``to_scipy()`` first.

    Returns:
        a CSR built from the column-scaled SciPy matrix.
    """
    sp_mat = rmat.to_scipy()

    # per-column 2-norms
    col_norms = spla.norm(sp_mat, 2, axis=0)

    # invert only the positive norms; everything else is left as it is
    scale = col_norms.copy()
    positive = scale > 0
    scale[positive] = np.reciprocal(scale[positive])

    # right-multiplying by a diagonal matrix scales the columns
    scaled = sp_mat @ sps.diags(scale)
    assert scaled.shape[1] == sp_mat.shape[1]

    # and reset NaN
    nan_mask = np.isnan(scaled.data)
    scaled.data[nan_mask] = 0

    _logger.info('[%s] normalized rating matrix columns', self._timer)
    return CSR.from_scipy(scaled, False)
def test_csr_sparse_row():
    """Check the column indices and values reported for each row.

    Row 2 has no stored entries.  Its checks compare lengths, because
    ``all()`` over an empty element-wise comparison is true whatever the
    accessor returned.
    """
    rows = np.array([0, 0, 1, 3], dtype=np.int32)
    cols = np.array([1, 2, 0, 1], dtype=np.int32)
    # np.float64 rather than np.float_: the alias was removed in NumPy 2.0
    vals = np.arange(4, dtype=np.float64)

    csr = CSR.from_coo(rows, cols, vals)

    assert all(csr.row_cs(0) == np.array([1, 2], dtype=np.int32))
    assert all(csr.row_cs(1) == np.array([0], dtype=np.int32))
    assert len(csr.row_cs(2)) == 0
    assert all(csr.row_cs(3) == np.array([1], dtype=np.int32))

    assert all(csr.row_vs(0) == np.array([0, 1], dtype=np.float64))
    assert all(csr.row_vs(1) == np.array([2], dtype=np.float64))
    assert len(csr.row_vs(2)) == 0
    assert all(csr.row_vs(3) == np.array([3], dtype=np.float64))
def test_csr_row_nnzs(mat):
    """``row_nnzs`` matches the number of positive entries in each dense row.

    Non-positive entries of ``mat`` are zeroed in place before the sparse
    matrix is built.
    """
    n_rows, _n_cols = mat.shape

    # sparsify the matrix
    mat[mat <= 0] = 0
    sparse = sps.csr_matrix(mat)
    # only continue when the sparse form stores exactly the positive entries
    assume(sparse.nnz == np.sum(mat > 0))

    csr = CSR.from_scipy(sparse)
    counts = csr.row_nnzs()

    assert counts.sum() == csr.nnz
    for r in range(n_rows):
        dense_row = mat[r, :]
        assert counts[r] == np.sum(dense_row > 0)
def _compute_similarities(self, rmat):
    """Compute the similarity matrix block by block and stitch it together.

    The transposed rating matrix is cut into row blocks, ``_sim_blocks``
    produces one result block per input block, and the results are copied
    into a single square CSR which is then passed to ``_sort_nbrs``.

    Args:
        rmat: the rating matrix; its transpose supplies the rows that are
            compared.

    Returns:
        a square CSR with one row per row of the transposed matrix.
    """
    item_mat = rmat.transpose()
    n_items = item_mat.nrows

    # a missing or negative neighbor limit is passed on as 0
    max_nbrs = self.save_nbrs
    if max_nbrs is None or max_nbrs < 0:
        max_nbrs = 0

    bounds = _make_blocks(n_items, 1000)
    _logger.info('[%s] splitting %d items (%d ratings) into %d blocks',
                 self._timer, n_items, item_mat.nnz, len(bounds))
    blocks = [item_mat.subset_rows(lo, hi) for (lo, hi) in bounds]

    _logger.info('[%s] computing similarities', self._timer)
    ptrs = List(bounds)
    nbs = List(blocks)
    if not nbs:
        # oops, this is the bad place
        # in non-JIT node, List doesn't actually make the list
        nbs = blocks
        ptrs = bounds
    sim_blocks = _sim_blocks(item_mat, nbs, ptrs, self.min_sim, max_nbrs)

    total_nnz = sum(blk.nnz for blk in sim_blocks)
    total_rows = sum(blk.nrows for blk in sim_blocks)
    _logger.info('[%s] computed %d similarities for %d items in %d blocks',
                 self._timer, total_nnz, total_rows, len(sim_blocks))

    # per-row entry counts across all blocks size the combined matrix
    row_nnzs = np.concatenate([blk.row_nnzs() for blk in sim_blocks])
    assert len(row_nnzs) == n_items, \
        'only have {} rows for {} items'.format(len(row_nnzs), n_items)

    smat = CSR.empty(n_items, n_items, row_nnzs)

    # copy each block's entries into its slice of the combined storage
    first_row = 0
    for idx, blk in enumerate(sim_blocks):
        last_row = first_row + blk.nrows
        dst_lo = smat.rowptrs[first_row]
        dst_hi = smat.rowptrs[last_row]
        _logger.debug('block %d (%d:%d) has %d entries, storing in %d:%d',
                      idx, first_row, last_row, blk.nnz, dst_lo, dst_hi)
        smat.colinds[dst_lo:dst_hi] = blk.colinds
        smat.values[dst_lo:dst_hi] = blk.values
        first_row = last_row

    _logger.info('[%s] sorting similarity matrix with %d entries',
                 self._timer, smat.nnz)
    _sort_nbrs(smat)

    return smat
def test_subset_rows(data):
    """``subset_rows(beg, end)`` keeps rows ``beg`` up to, not including, ``end``."""
    n_rows = data.draw(st.integers(5, 100))
    n_cols = data.draw(st.integers(1, 100))
    density = data.draw(st.floats(0, 1))
    first = data.draw(st.integers(0, n_rows - 1))
    last = data.draw(st.integers(first, n_rows - 1))

    full = CSR.from_scipy(sps.random(n_rows, n_cols, density, format='csr'))
    part = full.subset_rows(first, last)

    assert part.nrows == last - first
    # every row of the subset must equal the matching row of the original
    for offset in range(part.nrows):
        src = first + offset
        assert all(part.row_cs(offset) == full.row_cs(src))
        assert all(part.row_vs(offset) == full.row_vs(src))