def main():
    # Module-level imports assumed by this snippet (not shown here):
    #   import pycuda.autoinit
    #   import pycuda.driver as drv
    #   import pycuda.gpuarray as gpuarray
    #   import numpy
    #   import numpy.linalg as la
    # `options` and `args` are expected to come from an OptionParser set up at
    # module scope; see main_cg() below for the full argument handling.
    from pycuda.tools import DeviceMemoryPool, PageLockedMemoryPool
    dev_pool = DeviceMemoryPool()
    pagelocked_pool = PageLockedMemoryPool()

    from scipy.io import mmread
    csr_mat = mmread(args[0]).tocsr().astype(numpy.float32)

    inv_mat_diag = 1 / csr_mat.diagonal()

    print("building...")
    from pycuda.sparse.packeted import PacketedSpMV
    spmv = PacketedSpMV(csr_mat, options.is_symmetric, csr_mat.dtype)
    rhs = numpy.random.rand(spmv.shape[0]).astype(spmv.dtype)

    from pycuda.sparse.operator import DiagonalPreconditioner
    if True:
        precon = DiagonalPreconditioner(
            spmv.permute(gpuarray.to_gpu(
                inv_mat_diag, allocator=dev_pool.allocate)))
    else:
        precon = None

    from pycuda.sparse.cg import solve_pkt_with_cg

    print("start solve")
    for i in range(4):
        start = drv.Event()
        stop = drv.Event()
        start.record()

        rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate)
        res_gpu, it_count, res_count = \
            solve_pkt_with_cg(spmv, rhs_gpu, precon,
                              tol=1e-7 if spmv.dtype == numpy.float64 else 5e-5,
                              pagelocked_allocator=pagelocked_pool.allocate)
        res = res_gpu.get()

        stop.record()
        stop.synchronize()

        elapsed = stop.time_since(start) * 1e-3
        est_flops = (csr_mat.nnz * 2 * (it_count + res_count)
                     + csr_mat.shape[0] * (2 + 2 + 2 + 2 + 2) * it_count)

        if precon is not None:
            est_flops += csr_mat.shape[0] * it_count

        print("residual norm: %g"
              % (la.norm(csr_mat * res - rhs) / la.norm(rhs)))
        print("size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, "
              "%g gflops/s" % (
                  csr_mat.shape[0], elapsed, it_count, res_count,
                  it_count / elapsed, est_flops / elapsed / 1e9))

    # TODO: mixed precision
    # TODO: benchmark

    pagelocked_pool.stop_holding()
    dev_pool.stop_holding()

def _solve_cuda(lap_sparse, B, return_full_prob=False, maxiter=100, tol=5e-5):
    """
    Solves lap_sparse X_i = B_i for each phase i, using the conjugate
    gradient method. For each pixel, the label i corresponding to the
    maximal X_i is returned.
    """
    # Module-level imports assumed by this snippet (not shown here):
    #   import numpy as np
    #   import pycuda.driver as drv
    #   import pycuda.gpuarray as gpuarray
    #   from pycuda.tools import DeviceMemoryPool, PageLockedMemoryPool
    #   from pycuda.sparse.packeted import PacketedSpMV
    #   from pycuda.sparse.operator import DiagonalPreconditioner
    #   from pycuda.sparse.cg import solve_pkt_with_cg
    print("using gpu mode")
    dev_pool = DeviceMemoryPool()
    pagelocked_pool = PageLockedMemoryPool()

    csr_mat = lap_sparse.astype(np.float32)
    inv_mat_diag = 1 / csr_mat.diagonal()

    spmv = PacketedSpMV(csr_mat, True, csr_mat.dtype)

    X = []
    for i in range(len(B)):
        rhs = -B[i].astype(spmv.dtype)

        if True:
            precon = DiagonalPreconditioner(
                spmv.permute(gpuarray.to_gpu(
                    inv_mat_diag, allocator=dev_pool.allocate)))
        else:
            precon = None

        print("start solve")
        start = drv.Event()
        stop = drv.Event()
        start.record()

        rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate)
        tol = 1e-7 if spmv.dtype == np.float64 else tol
        res_gpu, it_count, res_count = solve_pkt_with_cg(
            spmv, rhs_gpu, precon, tol=tol,
            pagelocked_allocator=pagelocked_pool.allocate)
        res = res_gpu.get()

        stop.record()
        stop.synchronize()

        elapsed = stop.time_since(start) * 1e-3
        est_flops = (csr_mat.nnz * 2 * (it_count + res_count)
                     + csr_mat.shape[0] * (2 + 2 + 2 + 2 + 2) * it_count)

        if precon is not None:
            est_flops += csr_mat.shape[0] * it_count

        print("size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, "
              "%g gflops/s" % (
                  csr_mat.shape[0], elapsed, it_count, res_count,
                  it_count / elapsed, est_flops / elapsed / 1e9))

        # solve_pkt_with_cg already returns the solution vector itself
        # (unlike scipy's cg, which returns an (x, info) tuple), so append
        # the whole result rather than its first element.
        X.append(res)

    pagelocked_pool.stop_holding()
    dev_pool.stop_holding()

    if not return_full_prob:
        X = np.array(X)
        X = np.argmax(X, axis=0)
    return X

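
# Illustrative calling sketch for _solve_cuda. The synthetic Laplacian and the
# sizes below are made up for illustration; in the real code lap_sparse and B
# come from the random-walker graph construction. lap_sparse should be a SciPy
# CSR matrix with a nonzero diagonal, and B a list of dense right-hand sides,
# one per label.
import numpy as np
import scipy.sparse

n = 1000
A = scipy.sparse.random(n, n, density=0.01, format="csr", dtype=np.float32)
lap_sparse = (A + A.T + n * scipy.sparse.eye(n, dtype=np.float32)).tocsr()
B = [np.random.rand(n).astype(np.float32) for _ in range(2)]   # two labels

labels = _solve_cuda(lap_sparse, B, return_full_prob=False)
print(labels.shape)    # (n,): the winning label index for each pixel
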
def __init__(self, stream, nx, ny, x_halo, y_halo, cpu_data=None, dtype=np.float32):
    self.logger = logging.getLogger(__name__)
    self.nx = nx
    self.ny = ny
    self.x_halo = x_halo
    self.y_halo = y_halo

    nx_halo = nx + 2 * x_halo
    ny_halo = ny + 2 * y_halo

    #self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
    #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
    self.data = pycuda.gpuarray.zeros((ny_halo, nx_halo), dtype)

    #For returning to download
    self.memorypool = PageLockedMemoryPool()

    #If we don't have any data, just allocate and return
    if cpu_data is None:
        return

    #Make sure data is in proper format
    assert cpu_data.shape == (ny_halo, nx_halo) or cpu_data.shape == (self.ny, self.nx), \
        "Wrong shape of data %s vs %s / %s" % (
            str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
    assert cpu_data.itemsize == 4, "Wrong size of data type"
    assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"

    #Create copy object from host to device
    x = (nx_halo - cpu_data.shape[1]) // 2
    y = (ny_halo - cpu_data.shape[0]) // 2
    self.upload(stream, cpu_data,
                extent=[x, y, cpu_data.shape[1], cpu_data.shape[0]])

def main_cg():
    # Module-level imports assumed by this snippet (not shown here):
    #   import pycuda.autoinit
    #   import pycuda.driver as drv
    #   import pycuda.gpuarray as gpuarray
    #   import numpy
    #   import numpy.linalg as la
    from optparse import OptionParser

    parser = OptionParser(usage="%prog [options] MATRIX-MARKET-FILE")
    parser.add_option("-s", "--is-symmetric", action="store_true",
                      help="Specify that the input matrix is already symmetric")
    options, args = parser.parse_args()

    from pycuda.tools import DeviceMemoryPool, PageLockedMemoryPool
    dev_pool = DeviceMemoryPool()
    pagelocked_pool = PageLockedMemoryPool()

    from scipy.io import mmread
    csr_mat = mmread(args[0]).tocsr().astype(numpy.float32)

    inv_mat_diag = 1 / csr_mat.diagonal()

    print("building...")
    from pycuda.sparse.packeted import PacketedSpMV
    spmv = PacketedSpMV(csr_mat, options.is_symmetric, csr_mat.dtype)
    rhs = numpy.random.rand(spmv.shape[0]).astype(spmv.dtype)

    from pycuda.sparse.operator import DiagonalPreconditioner
    if True:
        precon = DiagonalPreconditioner(
            spmv.permute(gpuarray.to_gpu(
                inv_mat_diag, allocator=dev_pool.allocate)))
    else:
        precon = None

    from pycuda.sparse.cg import solve_pkt_with_cg

    print("start solve")
    for i in range(4):
        start = drv.Event()
        stop = drv.Event()
        start.record()

        rhs_gpu = gpuarray.to_gpu(rhs, dev_pool.allocate)
        res_gpu, it_count, res_count = \
            solve_pkt_with_cg(spmv, rhs_gpu, precon,
                              tol=1e-7 if spmv.dtype == numpy.float64 else 5e-5,
                              pagelocked_allocator=pagelocked_pool.allocate)
        res = res_gpu.get()

        stop.record()
        stop.synchronize()

        elapsed = stop.time_since(start) * 1e-3
        est_flops = (csr_mat.nnz * 2 * (it_count + res_count)
                     + csr_mat.shape[0] * (2 + 2 + 2 + 2 + 2) * it_count)

        if precon is not None:
            est_flops += csr_mat.shape[0] * it_count

        print("residual norm: %g"
              % (la.norm(csr_mat * res - rhs) / la.norm(rhs)))
        print("size: %d, elapsed: %g s, %d it, %d residual, it/second: %g, "
              "%g gflops/s" % (
                  csr_mat.shape[0], elapsed, it_count, res_count,
                  it_count / elapsed, est_flops / elapsed / 1e9))

    # TODO: mixed precision
    # TODO: benchmark

    pagelocked_pool.stop_holding()
    dev_pool.stop_holding()

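
# Minimal driver sketch for the example above, assuming it is saved as a
# stand-alone script (the file name demo_cg.py is an assumption, not taken
# from the source). Any Matrix Market file can be passed in; -s marks the
# matrix as symmetric.
#
#     if __name__ == "__main__":
#         main_cg()
#
#     $ python demo_cg.py -s my_matrix.mtx
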
def __init__(self, stream, nx, ny, nz, x_halo, y_halo, z_halo, cpu_data=None, dtype=np.float32):
    self.logger = logging.getLogger(__name__)
    self.nx = nx
    self.ny = ny
    self.nz = nz
    self.x_halo = x_halo
    self.y_halo = y_halo
    self.z_halo = z_halo

    nx_halo = nx + 2 * x_halo
    ny_halo = ny + 2 * y_halo
    nz_halo = nz + 2 * z_halo

    #self.logger.debug("Allocating [%dx%dx%d] buffer", self.nx, self.ny, self.nz)
    #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
    self.data = pycuda.gpuarray.zeros((nz_halo, ny_halo, nx_halo), dtype)

    #For returning to download
    self.memorypool = PageLockedMemoryPool()

    #If we don't have any data, just allocate and return
    if cpu_data is None:
        return

    #Make sure data is in proper format
    assert cpu_data.shape == (nz_halo, ny_halo, nx_halo) or cpu_data.shape == (self.nz, self.ny, self.nx), \
        "Wrong shape of data %s vs %s / %s" % (
            str(cpu_data.shape), str((self.nz, self.ny, self.nx)), str((nz_halo, ny_halo, nx_halo)))
    assert cpu_data.itemsize == 4, "Wrong size of data type"
    assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"

    #Create copy object from host to device
    copy = cuda.Memcpy3D()
    copy.set_src_host(cpu_data)
    copy.set_dst_device(self.data.gpudata)

    #Set offsets of destination
    x_offset = (nx_halo - cpu_data.shape[2]) // 2
    y_offset = (ny_halo - cpu_data.shape[1]) // 2
    z_offset = (nz_halo - cpu_data.shape[0]) // 2
    copy.dst_x_in_bytes = x_offset * cpu_data.itemsize
    copy.dst_y = y_offset
    copy.dst_z = z_offset

    #Set row pitch and slice height of source (packed host array) and destination
    copy.src_pitch = cpu_data.strides[1]
    copy.src_height = cpu_data.shape[1]
    copy.dst_pitch = self.data.strides[1]
    copy.dst_height = ny_halo

    #Set width in bytes to copy for each row, the number of rows,
    #and the number of slices to copy
    width = max(self.nx, cpu_data.shape[2])
    height = max(self.ny, cpu_data.shape[1])
    depth = max(self.nz, cpu_data.shape[0])
    copy.width_in_bytes = width * cpu_data.itemsize
    copy.height = height
    copy.depth = depth

    #Perform the copy
    copy(stream)

class CudaArray3D:
    """
    Uploads initial data to the CUDA device
    """
    def __init__(self, stream, nx, ny, nz, x_halo, y_halo, z_halo, cpu_data=None, dtype=np.float32):
        self.logger = logging.getLogger(__name__)
        self.nx = nx
        self.ny = ny
        self.nz = nz
        self.x_halo = x_halo
        self.y_halo = y_halo
        self.z_halo = z_halo

        nx_halo = nx + 2 * x_halo
        ny_halo = ny + 2 * y_halo
        nz_halo = nz + 2 * z_halo

        #self.logger.debug("Allocating [%dx%dx%d] buffer", self.nx, self.ny, self.nz)
        #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
        self.data = pycuda.gpuarray.zeros((nz_halo, ny_halo, nx_halo), dtype)

        #For returning to download
        self.memorypool = PageLockedMemoryPool()

        #If we don't have any data, just allocate and return
        if cpu_data is None:
            return

        #Make sure data is in proper format
        assert cpu_data.shape == (nz_halo, ny_halo, nx_halo) or cpu_data.shape == (self.nz, self.ny, self.nx), \
            "Wrong shape of data %s vs %s / %s" % (
                str(cpu_data.shape), str((self.nz, self.ny, self.nx)), str((nz_halo, ny_halo, nx_halo)))
        assert cpu_data.itemsize == 4, "Wrong size of data type"
        assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"

        #Create copy object from host to device
        copy = cuda.Memcpy3D()
        copy.set_src_host(cpu_data)
        copy.set_dst_device(self.data.gpudata)

        #Set offsets of destination
        x_offset = (nx_halo - cpu_data.shape[2]) // 2
        y_offset = (ny_halo - cpu_data.shape[1]) // 2
        z_offset = (nz_halo - cpu_data.shape[0]) // 2
        copy.dst_x_in_bytes = x_offset * cpu_data.itemsize
        copy.dst_y = y_offset
        copy.dst_z = z_offset

        #Set row pitch and slice height of source (packed host array) and destination
        copy.src_pitch = cpu_data.strides[1]
        copy.src_height = cpu_data.shape[1]
        copy.dst_pitch = self.data.strides[1]
        copy.dst_height = ny_halo

        #Set width in bytes to copy for each row, the number of rows,
        #and the number of slices to copy
        width = max(self.nx, cpu_data.shape[2])
        height = max(self.ny, cpu_data.shape[1])
        depth = max(self.nz, cpu_data.shape[0])
        copy.width_in_bytes = width * cpu_data.itemsize
        copy.height = height
        copy.depth = depth

        #Perform the copy
        copy(stream)

        #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)

    def __del__(self, *args):
        #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
        self.data.gpudata.free()
        self.data = None

    """
    Enables downloading data from GPU to Python
    """
    def download(self, stream, asynch=False):
        #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
        #Allocate host memory
        #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
        #cpu_data = np.empty((self.nz, self.ny, self.nx), dtype=np.float32)
        cpu_data = self.memorypool.allocate((self.nz, self.ny, self.nx), dtype=np.float32)

        #Create copy object from device to host
        copy = cuda.Memcpy3D()
        copy.set_src_device(self.data.gpudata)
        copy.set_dst_host(cpu_data)

        #Set offsets, row pitch and slice height of source
        copy.src_x_in_bytes = self.x_halo * cpu_data.itemsize
        copy.src_y = self.y_halo
        copy.src_z = self.z_halo
        copy.src_pitch = self.data.strides[1]
        copy.src_height = self.ny + 2 * self.y_halo

        #Destination is a tightly packed (nz, ny, nx) host array
        copy.dst_pitch = cpu_data.strides[1]
        copy.dst_height = self.ny

        #Set width in bytes to copy for each row, the number of rows,
        #and the number of slices to copy
        copy.width_in_bytes = self.nx * cpu_data.itemsize
        copy.height = self.ny
        copy.depth = self.nz

        copy(stream)

        if asynch == False:
            stream.synchronize()

        return cpu_data

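
# A minimal usage sketch for CudaArray3D (assumptions: pycuda.autoinit has
# been imported, the class above is in scope, and the array sizes are
# illustrative, not from the source).
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

stream = cuda.Stream()
h_in = np.random.rand(16, 32, 64).astype(np.float32)   # (nz, ny, nx), C-ordered

# 64x32x16 interior with a one-cell halo in every direction; the constructor
# uploads the interior for us.
arr = CudaArray3D(stream, nx=64, ny=32, nz=16,
                  x_halo=1, y_halo=1, z_halo=1, cpu_data=h_in)

# ... kernels reading/writing arr.data (a pycuda GPUArray) would run here ...

h_out = arr.download(stream)    # blocking download of the interior
assert h_out.shape == (16, 32, 64)
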
class CudaArray2D:
    """
    Uploads initial data to the CUDA device
    """
    def __init__(self, stream, nx, ny, x_halo, y_halo, cpu_data=None, dtype=np.float32):
        self.logger = logging.getLogger(__name__)
        self.nx = nx
        self.ny = ny
        self.x_halo = x_halo
        self.y_halo = y_halo

        nx_halo = nx + 2 * x_halo
        ny_halo = ny + 2 * y_halo

        #self.logger.debug("Allocating [%dx%d] buffer", self.nx, self.ny)
        #Should perhaps use pycuda.driver.mem_alloc_data.pitch() here
        self.data = pycuda.gpuarray.zeros((ny_halo, nx_halo), dtype)

        #For returning to download
        self.memorypool = PageLockedMemoryPool()

        #If we don't have any data, just allocate and return
        if cpu_data is None:
            return

        #Make sure data is in proper format
        assert cpu_data.shape == (ny_halo, nx_halo) or cpu_data.shape == (self.ny, self.nx), \
            "Wrong shape of data %s vs %s / %s" % (
                str(cpu_data.shape), str((self.ny, self.nx)), str((ny_halo, nx_halo)))
        assert cpu_data.itemsize == 4, "Wrong size of data type"
        assert not np.isfortran(cpu_data), "Wrong datatype (Fortran, expected C)"

        #Create copy object from host to device
        x = (nx_halo - cpu_data.shape[1]) // 2
        y = (ny_halo - cpu_data.shape[0]) // 2
        self.upload(stream, cpu_data,
                    extent=[x, y, cpu_data.shape[1], cpu_data.shape[0]])

        #self.logger.debug("Buffer <%s> [%dx%d]: Allocated ", int(self.data.gpudata), self.nx, self.ny)

    def __del__(self, *args):
        #self.logger.debug("Buffer <%s> [%dx%d]: Releasing ", int(self.data.gpudata), self.nx, self.ny)
        self.data.gpudata.free()
        self.data = None

    """
    Enables downloading data from GPU to Python
    """
    def download(self, stream, cpu_data=None, asynch=False, extent=None):
        if extent is None:
            x = self.x_halo
            y = self.y_halo
            nx = self.nx
            ny = self.ny
        else:
            x, y, nx, ny = extent

        if cpu_data is None:
            #self.logger.debug("Downloading [%dx%d] buffer", self.nx, self.ny)
            #Allocate host memory
            #The following fails, don't know why (crashes python)
            #cpu_data = cuda.pagelocked_empty((self.ny, self.nx), np.float32)
            #Non-pagelocked: cpu_data = np.empty((ny, nx), dtype=np.float32)
            cpu_data = self.memorypool.allocate((ny, nx), dtype=np.float32)

        assert nx == cpu_data.shape[1]
        assert ny == cpu_data.shape[0]
        assert x + nx <= self.nx + 2 * self.x_halo
        assert y + ny <= self.ny + 2 * self.y_halo

        #Create copy object from device to host
        copy = cuda.Memcpy2D()
        copy.set_src_device(self.data.gpudata)
        copy.set_dst_host(cpu_data)

        #Set offsets and pitch of source
        copy.src_x_in_bytes = int(x) * self.data.strides[1]
        copy.src_y = int(y)
        copy.src_pitch = self.data.strides[0]

        #Set pitch of destination (the host array is tightly packed)
        copy.dst_pitch = cpu_data.strides[0]

        #Set width in bytes to copy for each row and
        #number of rows to copy
        copy.width_in_bytes = int(nx) * cpu_data.itemsize
        copy.height = int(ny)

        copy(stream)

        if asynch == False:
            stream.synchronize()

        return cpu_data

    def upload(self, stream, cpu_data, extent=None):
        if extent is None:
            x = self.x_halo
            y = self.y_halo
            nx = self.nx
            ny = self.ny
        else:
            x, y, nx, ny = extent

        assert nx == cpu_data.shape[1]
        assert ny == cpu_data.shape[0]
        assert x + nx <= self.nx + 2 * self.x_halo
        assert y + ny <= self.ny + 2 * self.y_halo

        #Create copy object from host to device
        copy = cuda.Memcpy2D()
        copy.set_dst_device(self.data.gpudata)
        copy.set_src_host(cpu_data)

        #Set offsets and pitch of destination
        copy.dst_x_in_bytes = int(x) * self.data.strides[1]
        copy.dst_y = int(y)
        copy.dst_pitch = self.data.strides[0]

        #Set pitch of source (the host array is tightly packed)
        copy.src_pitch = cpu_data.strides[0]

        #Set width in bytes to copy for each row and
        #number of rows to copy
        copy.width_in_bytes = int(nx) * cpu_data.itemsize
        copy.height = int(ny)

        copy(stream)

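
# A minimal usage sketch for CudaArray2D (assumptions: pycuda.autoinit has
# been imported, the class above is in scope, and the array sizes are
# illustrative, not from the source).
import numpy as np
import pycuda.autoinit
import pycuda.driver as cuda

stream = cuda.Stream()
h_in = np.random.rand(64, 128).astype(np.float32)   # (ny, nx), C-ordered

# 128x64 interior with a 2-cell halo on every side; the constructor uploads
# the interior for us.
arr = CudaArray2D(stream, nx=128, ny=64, x_halo=2, y_halo=2, cpu_data=h_in)

# ... kernels operating on arr.data (a pycuda GPUArray) would run here ...

h_out = arr.download(stream)    # blocking download of the interior
assert h_out.shape == (64, 128)
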