def dotc(x, y, out=None):
    """Return the conjugated dot product ``x.conj() . y``.

    Real single/double precision inputs are delegated to :func:`dot`;
    complex inputs are dispatched to cuBLAS ``cdotc`` / ``zdotc``.

    Args:
        x: First vector; its dtype selects the cuBLAS routine.
        y: Second vector (validated against ``x`` by
            ``_check_two_vectors``).
        out: Optional array receiving the result.  When omitted, the
            buffer prepared by ``_setup_result_ptr`` is returned.

    Returns:
        ``out`` if it was given, otherwise the freshly prepared result.

    Raises:
        TypeError: If ``x`` has a dtype other than 'f', 'd', 'F' or 'D'.
    """
    type_char = x.dtype.char
    if type_char in 'fd':
        # Conjugation is a no-op for real data.
        return dot(x, y, out=out)

    routine_name = {'F': 'cdotc', 'D': 'zdotc'}.get(type_char)
    if routine_name is None:
        raise TypeError('invalid dtype')
    routine = getattr(cublas, routine_name)

    _check_two_vectors(x, y)
    handle = device.get_cublas_handle()
    result_dtype = type_char
    result_ptr, result, saved_mode = _setup_result_ptr(
        handle, out, result_dtype)
    try:
        routine(handle, x.size, x.data.ptr, 1, y.data.ptr, 1, result_ptr)
    finally:
        # Always hand the handle back in the pointer mode we found it in.
        cublas.setPointerMode(handle, saved_mode)

    if out is None:
        return result
    if out.dtype != result_dtype:
        # The routine wrote into a temporary of the native dtype.
        _core.elementwise_copy(result, out)
    return out
def nrm2(x, out=None):
    """Return the Euclidean norm of the 1-D vector ``x``.

    Args:
        x: One-dimensional array of dtype 'f', 'd', 'F' or 'D'.
        out: Optional array receiving the result.

    Returns:
        ``out`` if it was given, otherwise the freshly prepared result.
        The native result dtype is the real counterpart of ``x``'s dtype
        (the dtype character lower-cased).

    Raises:
        ValueError: If ``x`` is not one-dimensional.
        TypeError: If the dtype of ``x`` is not supported.
    """
    if x.ndim != 1:
        raise ValueError('x must be a 1D array (actual: {})'.format(x.ndim))

    type_char = x.dtype.char
    routine_name = {
        'f': 'snrm2',
        'd': 'dnrm2',
        'F': 'scnrm2',
        'D': 'dznrm2',
    }.get(type_char)
    if routine_name is None:
        raise TypeError('invalid dtype')
    routine = getattr(cublas, routine_name)

    handle = device.get_cublas_handle()
    # The result is real even for complex input.
    result_dtype = type_char.lower()
    result_ptr, result, saved_mode = _setup_result_ptr(
        handle, out, result_dtype)
    try:
        routine(handle, x.size, x.data.ptr, 1, result_ptr)
    finally:
        cublas.setPointerMode(handle, saved_mode)

    if out is None:
        return result
    if out.dtype != result_dtype:
        _core.elementwise_copy(result, out)
    return out
def _iamaxmin(x, out, name):
    """Shared driver for the cuBLAS ``i?<name>`` index-search routines.

    The routine is looked up on ``cublas`` as ``'i' + <type letter> + name``
    where the type letter is s/d/c/z depending on the dtype of ``x``.

    Args:
        x: One-dimensional array of dtype 'f', 'd', 'F' or 'D'.
        out: Optional array receiving the result, or ``None``.
        name: Routine suffix appended after the type letter.

    Returns:
        ``out`` if it was given, otherwise the freshly prepared result
        (native dtype ``'i'``).

    Raises:
        ValueError: If ``x`` is not one-dimensional.
        TypeError: If the dtype of ``x`` is not supported.
    """
    if x.ndim != 1:
        raise ValueError('x must be a 1D array (actual: {})'.format(x.ndim))

    type_letter = {'f': 's', 'd': 'd', 'F': 'c', 'D': 'z'}.get(x.dtype.char)
    if type_letter is None:
        raise TypeError('invalid dtype')
    routine = getattr(cublas, 'i' + type_letter + name)

    handle = device.get_cublas_handle()
    result_dtype = 'i'
    result_ptr, result, saved_mode = _setup_result_ptr(
        handle, out, result_dtype)
    try:
        routine(handle, x.size, x.data.ptr, 1, result_ptr)
    finally:
        cublas.setPointerMode(handle, saved_mode)

    if out is None:
        return result
    if out.dtype != result_dtype:
        _core.elementwise_copy(result, out)
    return out
def axpy(a, x, y):
    """Compute ``y += a * x`` in place.

    Args:
        a: Scalar multiplier; converted by ``_setup_scalar_ptr`` to the
            dtype of ``x``.
        x: Input vector; its dtype selects the cuBLAS routine.
        y: Vector that is updated in place.

    Raises:
        TypeError: If the dtype of ``x`` is not 'f', 'd', 'F' or 'D'.
    """
    _check_two_vectors(x, y)

    type_char = x.dtype.char
    routine_name = {
        'f': 'saxpy',
        'd': 'daxpy',
        'F': 'caxpy',
        'D': 'zaxpy',
    }.get(type_char)
    if routine_name is None:
        raise TypeError('invalid dtype')
    routine = getattr(cublas, routine_name)

    handle = device.get_cublas_handle()
    # Keep the converted scalar referenced while its pointer is in use.
    a, scalar_ptr, saved_mode = _setup_scalar_ptr(handle, a, type_char)
    try:
        routine(handle, x.size, scalar_ptr, x.data.ptr, 1, y.data.ptr, 1)
    finally:
        cublas.setPointerMode(handle, saved_mode)
def gerc(alpha, x, y, a):
    """Compute the rank-1 update ``a += alpha * x @ y.T.conj()`` in place.

    Real dtypes are delegated to :func:`ger`; complex dtypes use cuBLAS
    ``cgerc`` / ``zgerc``.

    Args:
        alpha: Scalar multiplier.
        x: Vector of length ``a.shape[0]``.
        y: Vector of length ``a.shape[1]``.
        a: 2-D array that is updated in place.

    Raises:
        TypeError: If the dtype of ``a`` is not 'f', 'd', 'F' or 'D'.
    """
    type_char = a.dtype.char
    if type_char in 'fd':
        return ger(alpha, x, y, a)

    routine_name = {'F': 'cgerc', 'D': 'zgerc'}.get(type_char)
    if routine_name is None:
        raise TypeError('invalid dtype')
    routine = getattr(cublas, routine_name)

    assert a.ndim == 2
    assert x.ndim == y.ndim == 1
    assert a.dtype == x.dtype == y.dtype
    m, n = a.shape
    assert x.shape[0] == m
    assert y.shape[0] == n

    handle = device.get_cublas_handle()
    alpha, alpha_ptr, saved_mode = _setup_scalar_ptr(handle, alpha, type_char)
    x_ptr, y_ptr = x.data.ptr, y.data.ptr
    try:
        # cuBLAS works on column-major data: operate on ``a`` directly when
        # it is Fortran-contiguous, otherwise on an F-ordered scratch copy.
        target = a if a._f_contiguous else a.copy(order='F')
        routine(handle, m, n, alpha_ptr, x_ptr, 1, y_ptr, 1,
                target.data.ptr, m)
        if target is not a:
            _core.elementwise_copy(target, a)
    finally:
        cublas.setPointerMode(handle, saved_mode)
def dot(x, y, out=None):
    """Computes the dot product of x and y.

    Only real single/double precision is handled here; complex inputs must
    go through ``dotu()`` or ``dotc()``.

    Args:
        x: First vector; its dtype selects the cuBLAS routine.
        y: Second vector (validated against ``x`` by
            ``_check_two_vectors``).
        out: Optional array receiving the result.

    Returns:
        ``out`` if it was given, otherwise the freshly prepared result.

    Raises:
        TypeError: If ``x`` is complex or has an unsupported dtype.
    """
    dtype = x.dtype.char
    if dtype == 'f':
        func = cublas.sdot
    elif dtype == 'd':
        func = cublas.ddot
    elif dtype in 'FD':
        raise TypeError('Use dotu() or dotc() for complex dtype')
    else:
        raise TypeError('invalid dtype')
    _check_two_vectors(x, y)

    handle = device.get_cublas_handle()
    result_dtype = dtype
    result_ptr, result, orig_mode = _setup_result_ptr(
        handle, out, result_dtype)
    try:
        func(handle, x.size, x.data.ptr, 1, y.data.ptr, 1, result_ptr)
    finally:
        # The handle's pointer mode was changed by _setup_result_ptr;
        # restore it even when the cuBLAS call raises so the handle is not
        # left in an unexpected mode for later callers.
        cublas.setPointerMode(handle, orig_mode)

    if out is None:
        out = result
    elif out.dtype != result_dtype:
        # The routine wrote into a temporary of the native dtype.
        out[...] = result
    return out
def asum(x, out=None):
    """Computes the sum of the absolute of x.

    Args:
        x: One-dimensional array of dtype 'f', 'd', 'F' or 'D'.
        out: Optional array receiving the result.

    Returns:
        ``out`` if it was given, otherwise the freshly prepared result.
        The native result dtype is the real counterpart of ``x``'s dtype
        (the dtype character lower-cased).

    Raises:
        ValueError: If ``x`` is not one-dimensional.
        TypeError: If the dtype of ``x`` is not supported.
    """
    if x.ndim != 1:
        raise ValueError('x must be a 1D array (actual: {})'.format(x.ndim))
    dtype = x.dtype.char
    if dtype == 'f':
        func = cublas.sasum
    elif dtype == 'd':
        func = cublas.dasum
    elif dtype == 'F':
        func = cublas.scasum
    elif dtype == 'D':
        func = cublas.dzasum
    else:
        raise TypeError('invalid dtype')

    handle = device.get_cublas_handle()
    result_dtype = dtype.lower()
    result_ptr, result, orig_mode = _setup_result_ptr(
        handle, out, result_dtype)
    try:
        func(handle, x.size, x.data.ptr, 1, result_ptr)
    finally:
        # The handle's pointer mode was changed by _setup_result_ptr;
        # restore it even when the cuBLAS call raises so the handle is not
        # left in an unexpected mode for later callers.
        cublas.setPointerMode(handle, orig_mode)

    if out is None:
        out = result
    elif out.dtype != result_dtype:
        # The routine wrote into a temporary of the native dtype.
        out[...] = result
    return out
def scal(a, x):
    """Compute ``x *= a`` in place.

    Args:
        a: Scalar multiplier; converted by ``_setup_scalar_ptr`` to the
            dtype of ``x``.
        x: One-dimensional array that is scaled in place.

    Raises:
        ValueError: If ``x`` is not one-dimensional.
        TypeError: If the dtype of ``x`` is not 'f', 'd', 'F' or 'D'.
    """
    if x.ndim != 1:
        raise ValueError('x must be a 1D array (actual: {})'.format(x.ndim))

    type_char = x.dtype.char
    routine_name = {
        'f': 'sscal',
        'd': 'dscal',
        'F': 'cscal',
        'D': 'zscal',
    }.get(type_char)
    if routine_name is None:
        raise TypeError('invalid dtype')
    routine = getattr(cublas, routine_name)

    handle = device.get_cublas_handle()
    # Keep the converted scalar referenced while its pointer is in use.
    a, scalar_ptr, saved_mode = _setup_scalar_ptr(handle, a, type_char)
    try:
        routine(handle, x.size, scalar_ptr, x.data.ptr, 1)
    finally:
        cublas.setPointerMode(handle, saved_mode)
def _setup_scalar_ptr(handle, a, dtype):
    """Prepare a scalar argument and switch the handle's pointer mode.

    The scalar is normalised by ``_get_scalar_ptr``; the handle is then put
    in device pointer mode when the normalised scalar is a ``cupy.ndarray``
    and in host pointer mode otherwise.

    Args:
        handle: cuBLAS handle whose pointer mode is changed.
        a: Scalar value or array.
        dtype: Target dtype forwarded to ``_get_scalar_ptr``.

    Returns:
        tuple: ``(a, a_ptr, mode)`` - the normalised scalar, its pointer,
        and the pointer mode the handle had *before* this call, which the
        caller is expected to restore.
    """
    a, a_ptr = _get_scalar_ptr(a, dtype)
    previous_mode = cublas.getPointerMode(handle)
    wanted_mode = (
        cublas.CUBLAS_POINTER_MODE_DEVICE
        if isinstance(a, cupy.ndarray)
        else cublas.CUBLAS_POINTER_MODE_HOST
    )
    cublas.setPointerMode(handle, wanted_mode)
    return a, a_ptr, previous_mode
def _norm(self, i):
    """Run ``self.nrm2`` on ``self.u`` and store the result in ``self.beta[i]``.

    The result is written straight to the address of element ``i`` of
    ``self.beta``, so the handle is temporarily switched to device pointer
    mode and afterwards reset to ``self.cublas_pointer_mode``.
    """
    handle = self.cublas_handle
    _cublas.setPointerMode(handle, _cublas.CUBLAS_POINTER_MODE_DEVICE)
    try:
        norm_routine = self.nrm2
        length = self.n
        vec_ptr = self.u.data.ptr
        # Address of beta[i]: base pointer plus i elements.
        dest_ptr = self.beta.data.ptr + i * self.beta.itemsize
        norm_routine(handle, length, vec_ptr, 1, dest_ptr)
    finally:
        _cublas.setPointerMode(self.cublas_handle, self.cublas_pointer_mode)
def _setup_scalar_ptr(handle, a, dtype):
    """Resolve a scalar to a raw pointer and set the matching pointer mode.

    A ``cupy.ndarray`` scalar is used through its device pointer (after a
    dtype conversion if needed) and the handle is put in device pointer
    mode.  Anything else is turned into a ``numpy.ndarray`` of ``dtype``
    (unless it already is one) and used through its host pointer, with the
    handle put in host pointer mode.

    Args:
        handle: cuBLAS handle whose pointer mode is changed.
        a: Scalar value, ``numpy.ndarray`` or ``cupy.ndarray``.
        dtype: dtype the cuBLAS routine expects for the scalar.

    Returns:
        tuple: ``(a_ptr, mode)`` - the raw pointer to the scalar and the
        pointer mode the handle had *before* this call, which the caller is
        expected to restore.
    """
    # NOTE(review): when a conversion creates a new array below, only its
    # raw pointer is returned, so nothing visible here keeps that buffer
    # alive after this function returns - confirm callers cannot observe a
    # dangling pointer.
    previous_mode = cublas.getPointerMode(handle)
    if isinstance(a, cupy.ndarray):
        if a.dtype != dtype:
            a = cupy.array(a, dtype=dtype)
        scalar_ptr = a.data.ptr
        wanted_mode = cublas.CUBLAS_POINTER_MODE_DEVICE
    else:
        usable_as_is = isinstance(a, numpy.ndarray) and a.dtype == dtype
        if not usable_as_is:
            a = numpy.array(a, dtype=dtype)
        scalar_ptr = a.ctypes.data
        wanted_mode = cublas.CUBLAS_POINTER_MODE_HOST
    cublas.setPointerMode(handle, wanted_mode)
    return scalar_ptr, previous_mode
def _setup_result_ptr(handle, out, dtype):
    """Pick a result buffer and set the handle's pointer mode to match it.

    * ``out`` is ``None`` or a ``cupy.ndarray``: the result lives on the
      device.  ``out`` itself is used when its dtype matches ``dtype``,
      otherwise a new 0-dim ``cupy`` array of ``dtype`` is allocated.
    * ``out`` is a ``numpy.ndarray``: the result lives on the host, again
      either in ``out`` itself or in a new 0-dim ``numpy`` array.

    Args:
        handle: cuBLAS handle whose pointer mode is changed.
        out: ``None``, ``cupy.ndarray`` or ``numpy.ndarray``.
        dtype: Native result dtype of the cuBLAS routine.

    Returns:
        tuple: ``(result_ptr, result, mode)`` - raw pointer to the result
        buffer, the buffer itself, and the pointer mode the handle had
        *before* this call, which the caller is expected to restore.

    Raises:
        TypeError: If ``out`` is neither ``None`` nor a cupy/numpy ndarray.
    """
    previous_mode = cublas.getPointerMode(handle)

    if out is None or isinstance(out, cupy.ndarray):
        reuse_out = out is not None and not (out.dtype != dtype)
        result = out if reuse_out else cupy.empty([], dtype=dtype)
        result_ptr = result.data.ptr
        wanted_mode = cublas.CUBLAS_POINTER_MODE_DEVICE
    elif isinstance(out, numpy.ndarray):
        reuse_out = not (out.dtype != dtype)
        result = out if reuse_out else numpy.empty([], dtype=dtype)
        result_ptr = result.ctypes.data
        wanted_mode = cublas.CUBLAS_POINTER_MODE_HOST
    else:
        raise TypeError('out must be either cupy or numpy ndarray')

    cublas.setPointerMode(handle, wanted_mode)
    return result_ptr, result, previous_mode
def ger(alpha, x, y, a):
    """Compute the rank-1 update ``a += alpha * x @ y.T`` in place.

    Only real single/double precision is handled here; complex inputs must
    use ``geru`` or ``gerc``.

    Args:
        alpha: Scalar multiplier.
        x: Vector of length ``a.shape[0]``.
        y: Vector of length ``a.shape[1]``.
        a: 2-D array that is updated in place.

    Raises:
        TypeError: If ``a`` is complex or has an unsupported dtype.
    """
    type_char = a.dtype.char
    if type_char == 'f':
        routine = cublas.sger
    elif type_char == 'd':
        routine = cublas.dger
    elif type_char in 'FD':
        raise TypeError('Use geru or gerc for complex dtypes')
    else:
        raise TypeError('invalid dtype')

    assert a.ndim == 2
    assert x.ndim == y.ndim == 1
    assert a.dtype == x.dtype == y.dtype
    rows, cols = a.shape
    assert x.shape[0] == rows
    assert y.shape[0] == cols

    handle = device.get_cublas_handle()
    alpha, alpha_ptr, saved_mode = _setup_scalar_ptr(handle, alpha, type_char)
    x_ptr, y_ptr = x.data.ptr, y.data.ptr
    try:
        if a._f_contiguous:
            routine(handle, rows, cols, alpha_ptr, x_ptr, 1, y_ptr, 1,
                    a.data.ptr, rows)
        elif a._c_contiguous:
            # C-contiguous ``a`` is the column-major layout of ``a.T`` and
            # (x y^T)^T == y x^T, so swap the vectors and the dimensions
            # instead of copying.
            routine(handle, cols, rows, alpha_ptr, y_ptr, 1, x_ptr, 1,
                    a.data.ptr, cols)
        else:
            # Arbitrary strides: update an F-ordered copy and write it back.
            scratch = a.copy(order='F')
            routine(handle, rows, cols, alpha_ptr, x_ptr, 1, y_ptr, 1,
                    scratch.data.ptr, rows)
            a[...] = scratch
    finally:
        cublas.setPointerMode(handle, saved_mode)
def sbmv(k, alpha, a, x, beta, y, lower=False):
    """Computes y = alpha*A @ x + beta * y

    Wraps cuBLAS ``ssbmv`` / ``dsbmv``.

    Args:
        k: Passed through as the ``k`` argument of the cuBLAS routine.
        alpha: Scalar multiplier of ``A @ x``.
        a: 2-D array of shape ``(m, n)`` holding the matrix data; ``m`` is
            used as the leading dimension.  A non Fortran-contiguous
            array is copied to F order first.
            (presumably banded storage as described in the cuBLAS sbmv
            documentation - confirm against callers)
        x: Vector of length ``n``.
        beta: Scalar multiplier of ``y``.
        y: Vector of length ``n``; updated in place.
        lower: Selects ``CUBLAS_FILL_MODE_LOWER`` when true, otherwise
            ``CUBLAS_FILL_MODE_UPPER``.

    Returns:
        ``y``.

    Raises:
        TypeError: If the dtype of ``a`` is not 'f' or 'd'.
    """
    dtype = a.dtype.char
    if dtype == 'f':
        func = cublas.ssbmv
    elif dtype == 'd':
        func = cublas.dsbmv
    else:
        raise TypeError('Complex dtypes not supported')

    assert a.ndim == 2
    assert x.ndim == y.ndim == 1
    assert a.dtype == x.dtype == y.dtype
    m, n = a.shape
    assert x.shape[0] == n
    assert y.shape[0] == n

    if not a._f_contiguous:
        a = a.copy(order='F')

    if lower:
        uplo = cublas.CUBLAS_FILL_MODE_LOWER
    else:
        uplo = cublas.CUBLAS_FILL_MODE_UPPER

    alpha, alpha_ptr = _get_scalar_ptr(alpha, a.dtype)
    beta, beta_ptr = _get_scalar_ptr(beta, a.dtype)

    # Fetch the handle exactly once: the pointer mode saved and changed
    # below must belong to the same handle that is used for the call and
    # restored afterwards.
    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    if isinstance(alpha, cupy.ndarray) or isinstance(beta, cupy.ndarray):
        # cuBLAS has a single pointer mode for all scalars, so if either
        # one is on the device the other is moved there as well.
        if not isinstance(alpha, cupy.ndarray):
            alpha = cupy.array(alpha)
            alpha_ptr = alpha.data.ptr
        if not isinstance(beta, cupy.ndarray):
            beta = cupy.array(beta)
            beta_ptr = beta.data.ptr
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_DEVICE)
    else:
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)

    try:
        func(handle, uplo, n, k,
             alpha_ptr, a.data.ptr, m, x.data.ptr, 1,
             beta_ptr, y.data.ptr, 1)
    finally:
        cublas.setPointerMode(handle, orig_mode)
    return y
def aux(A, V, u, alpha, beta, i_start, i_end):
    """Run iterations ``i_start`` .. ``i_end - 1`` of the update loop.

    For every index ``i`` the loop

    1. computes ``u = A @ v`` (through cuSPARSE ``spMV`` when
       ``cusparse_handle`` is set, otherwise with the ``@`` operator),
    2. calls ``dotc`` on ``v`` and ``u`` with the result written to the
       address of ``alpha[i]``,
    3. updates ``u`` with two ``gemm`` calls against the first ``i + 1``
       vectors stored in ``V`` (labelled "Orthogonalize" below),
    4. calls ``nrm2`` on ``u`` with the result written to the address of
       ``beta[i]``,
    5. stops early on the last index or when ``beta[i]`` drops below the
       tolerance, otherwise calls ``_kernel_normalize``.

    ``v``, ``uu``, ``n``, ``one``, ``zero``, ``mone``, the cuBLAS/cuSPARSE
    handles, ``dotc``, ``nrm2``, ``gemm`` and the ``spmv_*`` values are not
    parameters; they come from the enclosing scope.
    NOTE(review): the names suggest this is the inner loop of a Lanczos
    iteration and that ``dotc``/``nrm2``/``gemm`` are raw cuBLAS routines
    selected for the working dtype - confirm in the enclosing function.

    Args:
        A: Operator being applied; must be the very object ``outer_A``.
        V: Array holding the basis vectors; row ``i_start`` seeds ``v`` and
            rows ``i + 1 .. i_end - 1`` are zeroed on early termination.
        u: Work vector overwritten on every iteration.
        alpha: Array receiving the ``dotc`` results, one per index.
        beta: Array receiving the ``nrm2`` results, one per index.
        i_start: First index to process.
        i_end: One past the last index to process.
    """
    assert A is outer_A

    beta_eps = inversion_eps(A.dtype)

    # Get ready for spmv if enabled
    if cusparse_handle is not None:
        # Note: I would like to reuse descriptors and working buffer
        # on the next update, but I gave it up because it sometimes
        # caused illegal memory access error.
        spmv_desc_A = cusparse.SpMatDescriptor.create(A)
        spmv_desc_v = cusparse.DnVecDescriptor.create(v)
        spmv_desc_u = cusparse.DnVecDescriptor.create(u)
        buff_size = _cusparse.spMV_bufferSize(
            cusparse_handle, spmv_op_a, spmv_alpha.ctypes.data,
            spmv_desc_A.desc, spmv_desc_v.desc, spmv_beta.ctypes.data,
            spmv_desc_u.desc, spmv_cuda_dtype, spmv_alg)
        spmv_buff = cupy.empty(buff_size, cupy.int8)

    # Seed the current vector from the stored basis.
    v[...] = V[i_start]
    for i in range(i_start, i_end):
        # Matrix-vector multiplication
        if cusparse_handle is None:
            u[...] = A @ v
        else:
            _cusparse.spMV(
                cusparse_handle, spmv_op_a, spmv_alpha.ctypes.data,
                spmv_desc_A.desc, spmv_desc_v.desc,
                spmv_beta.ctypes.data, spmv_desc_u.desc,
                spmv_cuda_dtype, spmv_alg, spmv_buff.data.ptr)

        # Call dotc
        # The result pointer is the address of alpha[i], so the handle is
        # switched to device pointer mode for the duration of the call.
        _cublas.setPointerMode(
            cublas_handle, _cublas.CUBLAS_POINTER_MODE_DEVICE)
        try:
            dotc(cublas_handle, n, v.data.ptr, 1, u.data.ptr, 1,
                 alpha.data.ptr + i * alpha.itemsize)
        finally:
            _cublas.setPointerMode(cublas_handle, cublas_pointer_mode)

        # Orthogonalize
        # First gemm fills ``uu`` (1 x (i + 1)) from ``u`` and ``V``; the
        # second adds ``mone`` times the product of ``V`` and ``uu`` to
        # ``u``.  The scalars are passed by host pointer (``.ctypes.data``).
        # NOTE(review): presumably ``mone`` is -1 so this subtracts the
        # projection onto the first i + 1 stored vectors - confirm.
        gemm(cublas_handle, _cublas.CUBLAS_OP_C, _cublas.CUBLAS_OP_N,
             1, i + 1, n,
             one.ctypes.data, u.data.ptr, n, V.data.ptr, n,
             zero.ctypes.data, uu.data.ptr, 1)
        gemm(cublas_handle, _cublas.CUBLAS_OP_N, _cublas.CUBLAS_OP_C,
             n, 1, i + 1,
             mone.ctypes.data, V.data.ptr, n, uu.data.ptr, 1,
             one.ctypes.data, u.data.ptr, n)

        # Call nrm2
        # As with dotc, the result goes straight to the address of beta[i].
        _cublas.setPointerMode(
            cublas_handle, _cublas.CUBLAS_POINTER_MODE_DEVICE)
        try:
            nrm2(cublas_handle, n, u.data.ptr, 1,
                 beta.data.ptr + i * beta.itemsize)
        finally:
            _cublas.setPointerMode(cublas_handle, cublas_pointer_mode)

        # Break here as the normalization below touches V[i+1]
        if i >= i_end - 1:
            break

        # beta[i] below the tolerance: zero the remaining basis rows and
        # the work vectors, then stop.
        if beta[i] < beta_eps:
            V[i + 1:i_end, :] = 0
            u[...] = 0
            v[...] = 0
            break
        if i == i_start:
            beta_eps *= beta[i]  # scale eps to largest beta

        # Normalize
        _kernel_normalize(u, beta, i, n, v, V)
def syrk(trans, a, out=None, alpha=1.0, beta=0.0, lower=False):
    """Computes out := alpha*op1(a)*op2(a) + beta*out

    op1(a) = a if trans is 'N', op2(a) = a.T if trans is 'N'
    op1(a) = a.T if trans is 'T', op2(a) = a if trans is 'T'
    lower specifies whether the upper or lower triangular
    part of the array out is to be referenced

    Args:
        trans: Operation selector, converted by ``_trans_to_cublas_op``.
        a: 2-D input array of dtype 'f', 'd', 'F' or 'D'.
        out: Optional ``(n, n)`` array of the same dtype that is updated in
            place.  When omitted, a zero-initialised F-ordered array is
            allocated and ``beta`` is forced to 0.
        alpha: Scalar multiplier of the product.
        beta: Scalar multiplier of the previous contents of ``out``.
        lower: Selects the lower triangle when true, otherwise the upper.

    Returns:
        ``out`` (the given array, or the newly allocated one).

    Raises:
        TypeError: If the dtype of ``a`` is not supported.
    """
    assert a.ndim == 2
    dtype = a.dtype.char
    if dtype == 'f':
        func = cublas.ssyrk
    elif dtype == 'd':
        func = cublas.dsyrk
    elif dtype == 'F':
        func = cublas.csyrk
    elif dtype == 'D':
        func = cublas.zsyrk
    else:
        raise TypeError('invalid dtype')

    trans = _trans_to_cublas_op(trans)
    if trans == cublas.CUBLAS_OP_N:
        n, k = a.shape
    else:
        k, n = a.shape
    if out is None:
        out = cupy.zeros((n, n), dtype=dtype, order='F')
        beta = 0.0
    else:
        assert out.ndim == 2
        assert out.shape == (n, n)
        assert out.dtype == dtype

    if lower:
        uplo = cublas.CUBLAS_FILL_MODE_LOWER
    else:
        uplo = cublas.CUBLAS_FILL_MODE_UPPER

    alpha, alpha_ptr = _get_scalar_ptr(alpha, a.dtype)
    beta, beta_ptr = _get_scalar_ptr(beta, a.dtype)
    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    if isinstance(alpha, cupy.ndarray) or isinstance(beta, cupy.ndarray):
        # cuBLAS has a single pointer mode for all scalars, so if either
        # one is on the device the other is moved there as well.
        if not isinstance(alpha, cupy.ndarray):
            alpha = cupy.array(alpha)
            alpha_ptr = alpha.data.ptr
        if not isinstance(beta, cupy.ndarray):
            beta = cupy.array(beta)
            beta_ptr = beta.data.ptr
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_DEVICE)
    else:
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)

    # ``c`` is the buffer cuBLAS writes to: ``out`` itself, or an F-ordered
    # scratch copy when ``out`` has neither C nor F layout.
    c = out
    # Everything from here on runs with the modified pointer mode, so the
    # layout decisions and copies (which may raise) are inside the try too.
    try:
        lda, op = _decide_ld_and_trans(a, trans)
        ldo, _ = _decide_ld_and_trans(out, op)
        if out._c_contiguous:
            # A C-contiguous ``out`` is the column-major layout of
            # ``out.T``; the result is symmetric, so only the referenced
            # triangle has to be flipped.
            if not a._c_contiguous:
                a = a.copy(order='C')
                # A C-ordered ``a`` is column-major ``a.T``: the cuBLAS
                # operation is the opposite of the requested one.
                op = 1 - trans
                lda = a.shape[1]
            func(handle, 1 - uplo, op, n, k,
                 alpha_ptr, a.data.ptr, lda,
                 beta_ptr, out.data.ptr, ldo)
        else:
            if not a._f_contiguous:
                a = a.copy(order='F')
                # An F-ordered ``a`` is already column-major, so the
                # requested operation applies unchanged.  Deriving it from
                # ``trans`` (rather than flipping the helper's output)
                # keeps it right whether or not the helper flipped it.
                op = trans
                lda = a.shape[0]
            if not out._f_contiguous:
                # ``out`` cannot be described by a leading dimension:
                # compute into an F-ordered copy and write it back below.
                c = out.copy(order='F')
                ldo = c.shape[0]
            # Write to ``c`` with its own leading dimension.  Passing
            # ``out`` here would bypass the scratch copy and the write-back
            # below would then overwrite ``out`` with stale data.
            func(handle, uplo, op, n, k,
                 alpha_ptr, a.data.ptr, lda,
                 beta_ptr, c.data.ptr, ldo)
    finally:
        cublas.setPointerMode(handle, orig_mode)

    if c is not out:
        out[...] = c
    return out
def geam(transa, transb, alpha, a, beta, b, out=None):
    """Computes alpha * op(a) + beta * op(b)

    op(a) = a if transa is 'N', op(a) = a.T if transa is 'T',
    op(a) = a.T.conj() if transa is 'H'.
    op(b) = b if transb is 'N', op(b) = b.T if transb is 'T',
    op(b) = b.T.conj() if transb is 'H'.

    Args:
        transa: Operation applied to ``a``.
        transb: Operation applied to ``b``.
        alpha: Scalar multiplier of ``op(a)``.
        a: 2-D array of dtype 'f', 'd', 'F' or 'D'.
        beta: Scalar multiplier of ``op(b)``.
        b: 2-D array of the same dtype as ``a`` whose ``op(b)`` has the
            shape of ``op(a)``.
        out: Optional ``(m, n)`` array of the same dtype receiving the
            result; an F-ordered array is allocated when omitted.

    Returns:
        ``out`` (the given array, or the newly allocated one).

    Raises:
        TypeError: If the dtype of ``a`` is not supported.
    """
    assert a.ndim == b.ndim == 2
    assert a.dtype == b.dtype
    dtype = a.dtype.char
    if dtype == 'f':
        func = cublas.sgeam
    elif dtype == 'd':
        func = cublas.dgeam
    elif dtype == 'F':
        func = cublas.cgeam
    elif dtype == 'D':
        func = cublas.zgeam
    else:
        raise TypeError('invalid dtype')

    transa = _trans_to_cublas_op(transa)
    transb = _trans_to_cublas_op(transb)
    if transa == cublas.CUBLAS_OP_N:
        m, n = a.shape
    else:
        n, m = a.shape
    if transb == cublas.CUBLAS_OP_N:
        assert b.shape == (m, n)
    else:
        assert b.shape == (n, m)
    if out is None:
        out = cupy.empty((m, n), dtype=dtype, order='F')
    else:
        assert out.ndim == 2
        assert out.shape == (m, n)
        assert out.dtype == dtype

    alpha, alpha_ptr = _get_scalar_ptr(alpha, a.dtype)
    beta, beta_ptr = _get_scalar_ptr(beta, a.dtype)
    handle = device.get_cublas_handle()
    orig_mode = cublas.getPointerMode(handle)
    if isinstance(alpha, cupy.ndarray) or isinstance(beta, cupy.ndarray):
        # cuBLAS has a single pointer mode for all scalars, so if either
        # one is on the device the other is moved there as well.
        if not isinstance(alpha, cupy.ndarray):
            alpha = cupy.array(alpha)
            alpha_ptr = alpha.data.ptr
        if not isinstance(beta, cupy.ndarray):
            beta = cupy.array(beta)
            beta_ptr = beta.data.ptr
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_DEVICE)
    else:
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)

    # ``c`` is the buffer cuBLAS writes to in the general path: ``out``
    # itself, or an F-ordered scratch copy when ``out`` is not F-contiguous.
    c = out
    # A single try/finally covers the layout helpers and copies as well as
    # the cuBLAS calls, so the pointer mode is restored whichever of them
    # raises and on every return path.
    try:
        lda, transa = _decide_ld_and_trans(a, transa)
        ldb, transb = _decide_ld_and_trans(b, transb)
        if not (lda is None or ldb is None):
            # Both inputs can be used in place.
            if out._f_contiguous:
                func(handle, transa, transb, m, n,
                     alpha_ptr, a.data.ptr, lda,
                     beta_ptr, b.data.ptr, ldb, out.data.ptr, m)
                return out
            elif out._c_contiguous:
                # Computes alpha * a.T + beta * b.T
                func(handle, 1-transa, 1-transb, n, m,
                     alpha_ptr, a.data.ptr, lda,
                     beta_ptr, b.data.ptr, ldb, out.data.ptr, n)
                return out

        # General path: fix up the input layouts where the helper deems it
        # necessary and compute into an F-ordered buffer.
        a, lda = _change_order_if_necessary(a, lda)
        b, ldb = _change_order_if_necessary(b, ldb)
        if not out._f_contiguous:
            c = out.copy(order='F')
        func(handle, transa, transb, m, n,
             alpha_ptr, a.data.ptr, lda,
             beta_ptr, b.data.ptr, ldb, c.data.ptr, m)
    finally:
        cublas.setPointerMode(handle, orig_mode)

    if c is not out:
        _core.elementwise_copy(c, out)
    return out
def gemv(transa, alpha, a, x, beta, y):
    """Compute ``y = alpha * op(a) @ x + beta * y`` in place.

    ``op(a)`` is ``a`` for transa 'N', ``a.T`` for 'T' and ``a.T.conj()``
    for 'H'.

    Args:
        transa: Operation applied to ``a``.
        alpha: Scalar multiplier of ``op(a) @ x``.
        a: 2-D array of dtype 'f', 'd', 'F' or 'D'.
        x: Vector whose length matches the columns of ``op(a)``.
        beta: Scalar multiplier of ``y``.
        y: Vector whose length matches the rows of ``op(a)``; updated in
            place.

    Raises:
        TypeError: If the dtype of ``a`` is not supported.
    """
    routine_name = {
        'f': 'sgemv',
        'd': 'dgemv',
        'F': 'cgemv',
        'D': 'zgemv',
    }.get(a.dtype.char)
    if routine_name is None:
        raise TypeError('invalid dtype')
    routine = getattr(cublas, routine_name)

    assert a.ndim == 2
    assert x.ndim == y.ndim == 1
    assert a.dtype == x.dtype == y.dtype
    m, n = a.shape
    transa = _trans_to_cublas_op(transa)
    xlen, ylen = (n, m) if transa == cublas.CUBLAS_OP_N else (m, n)
    assert x.shape[0] == xlen
    assert y.shape[0] == ylen

    alpha, alpha_ptr = _get_scalar_ptr(alpha, a.dtype)
    beta, beta_ptr = _get_scalar_ptr(beta, a.dtype)
    handle = device.get_cublas_handle()
    saved_mode = cublas.getPointerMode(handle)

    alpha_on_device = isinstance(alpha, cupy.ndarray)
    beta_on_device = isinstance(beta, cupy.ndarray)
    if alpha_on_device or beta_on_device:
        # cuBLAS has a single pointer mode for all scalars, so if either
        # one is on the device the other is moved there as well.
        if not alpha_on_device:
            alpha = cupy.array(alpha)
            alpha_ptr = alpha.data.ptr
        if not beta_on_device:
            beta = cupy.array(beta)
            beta_ptr = beta.data.ptr
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_DEVICE)
    else:
        cublas.setPointerMode(handle, cublas.CUBLAS_POINTER_MODE_HOST)

    x_ptr, y_ptr = None, None
    try:
        if a._f_contiguous:
            routine(handle, transa, m, n, alpha_ptr, a.data.ptr, m,
                    x.data.ptr, 1, beta_ptr, y.data.ptr, 1)
        elif a._c_contiguous and transa != cublas.CUBLAS_OP_C:
            # C-contiguous ``a`` is the column-major layout of ``a.T``:
            # swap N <-> T and the dimensions instead of copying.  The
            # conjugate-transpose case cannot be expressed this way.
            transa = (
                cublas.CUBLAS_OP_T
                if transa == cublas.CUBLAS_OP_N
                else cublas.CUBLAS_OP_N
            )
            routine(handle, transa, n, m, alpha_ptr, a.data.ptr, n,
                    x.data.ptr, 1, beta_ptr, y.data.ptr, 1)
        else:
            # Fall back to an F-ordered copy of ``a``.
            a = a.copy(order='F')
            routine(handle, transa, m, n, alpha_ptr, a.data.ptr, m,
                    x.data.ptr, 1, beta_ptr, y.data.ptr, 1)
    finally:
        cublas.setPointerMode(handle, saved_mode)