def test_module(self): logging.debug("ENTER: test_module") ctx = cu.Devices().create_some_context() module = cu.Module(ctx, source_file="%s/test.cu" % self.path) self.assertIsNotNone(module.handle) self.assertIsNotNone(ctx.handle) logging.debug("nvcc compilation succeeded") logging.debug("Resulted ptx code is:\n%s", module.ptx.decode("utf-8")) logging.debug("Will try Context.create_module") module = ctx.create_module(source_file="%s/test.cu" % self.path) self.assertIsNotNone(module.handle) self.assertIsNotNone(ctx.handle) logging.debug("Succeeded") logging.debug("Will try to compile with includes") module = cu.Module(ctx, source_file="%s/inc.cu" % self.path, include_dirs=("", self.path, "")) self.assertIsNotNone(module.handle) self.assertIsNotNone(ctx.handle) logging.debug("Succeeded") logging.debug("Will try to compile with source") module = cu.Module(ctx, source="#include \"inc.cu\"", include_dirs=(self.path,)) self.assertIsNotNone(module.handle) self.assertIsNotNone(ctx.handle) logging.debug("Succeeded") logging.debug("Testing get_func, get_global") with ctx: self.assertIsNotNone(module.get_func("test")) ptr, size = module.get_global("g_a") self.assertEqual(ptr, int(ptr)) self.assertEqual(size, 4) logging.debug("Succeeded") logging.debug("EXIT: test_module")
def test_mem_alloc_managed(self): logging.debug("ENTER: test_mem_alloc_managed") ctx = cu.Devices().create_some_context() self._test_alloc(lambda a: cu.MemAllocManaged(ctx, a)) self._test_alloc(ctx.mem_alloc_managed) logging.debug("MemAllocManaged succeeded") logging.debug("EXIT: test_mem_alloc_managed")
def _get_some_device(self, **kwargs): """Gets some device from the available CUDA devices. Returns True if any device was selected, otherwise, False. """ device = self.parse_device(**kwargs) try: devices = cu.Devices() except (OSError, cu.CUDARuntimeError): devices = None if devices is None or not len(devices): raise DeviceNotFoundError("No CUDA devices were found") self._id = device if device == "": context = devices.create_some_context() else: try: device = devices[int(device)] except IndexError: raise from_none( DeviceNotFoundError("CUDA device %s was not found." % device)) context = device.create_context() self._context_ = context device = self.context.device self.device_info = DeviceInfo( desc=device.name, memsize=device.total_mem, memalign=4096, version=device.compute_capability, device_type="CUDA", max_work_group_size=device.max_grid_dims, max_work_item_sizes=device.max_block_dims, local_memsize=device.max_shared_memory_per_block) return True
def test_memcpy(self): logging.debug("ENTER: test_memcpy") ctx = cu.Devices().create_some_context() a = cu.MemAlloc(ctx, 4096) a.memset32_async(123) b = cu.MemAlloc(ctx, 4096) b.memset32_async(456) test = numpy.zeros(a.size // 4, dtype=numpy.int32) a.from_device_async(b) a.to_host(test) for x in test: self.assertEqual(x, 456) a.memset32_async(123) a.from_device_async(b, 12) a.to_host(test) for x in test[:3]: self.assertEqual(x, 123) for x in test[3:]: self.assertEqual(x, 456) a.memset32_async(123) a.from_device_async(b, 12, 64) a.to_host(test) for x in test[:3]: self.assertEqual(x, 123) for x in test[3:19]: self.assertEqual(x, 456) for x in test[19:]: self.assertEqual(x, 123) logging.debug("EXIT: test_memcpy")
def test_memcpy_3d_async(self): logging.debug("ENTER: test_memcpy_3d_async") p_copy = cu.get_ffi().new("CUDA_MEMCPY3D *") self.assertEqual(cu.get_ffi().sizeof(p_copy[0]), 200) ctx = cu.Devices().create_some_context() logging.debug("Context created") # Create arrays with some values for testing a = numpy.arange(35 * 25 * 15, dtype=numpy.float32).reshape(35, 25, 15) b = numpy.arange(37 * 27 * 17, dtype=numpy.float32).reshape(37, 27, 17) b *= 0.5 c = numpy.empty_like(b) c[:] = 1.0e30 # Create buffers a_ = cu.MemAlloc(ctx, a) b_ = cu.MemAlloc(ctx, b) # Copy 3D rect from one device buffer to another logging.debug("Testing device -> device memcpy_3d_async") sz = a.itemsize a_.memcpy_3d_async( (3 * sz, 4, 5), (6 * sz, 7, 8), (5 * sz, 10, 20), a.shape[2] * sz, a.shape[1], b.shape[2] * sz, b.shape[1], dst=b_) b_.to_host(c) diff = numpy.fabs(c[8:28, 7:17, 6:11] - a[5:25, 4:14, 3:8]).max() self.assertEqual(diff, 0) # Copy 3D rect from host buffer to device buffer logging.debug("Testing host -> device memcpy_3d_async") sz = a.itemsize b_.memset32_async() b_.memcpy_3d_async( (3 * sz, 4, 5), (6 * sz, 7, 8), (5 * sz, 10, 20), a.shape[2] * sz, a.shape[1], b.shape[2] * sz, b.shape[1], src=a) c[:] = 1.0e30 b_.to_host(c) diff = numpy.fabs(c[8:28, 7:17, 6:11] - a[5:25, 4:14, 3:8]).max() self.assertEqual(diff, 0) # Copy 3D rect from device buffer to host buffer logging.debug("Testing device -> host memcpy_3d_async") sz = a.itemsize c[:] = 1.0e30 a_.memcpy_3d_async( (3 * sz, 4, 5), (6 * sz, 7, 8), (5 * sz, 10, 20), a.shape[2] * sz, a.shape[1], b.shape[2] * sz, b.shape[1], dst=c) ctx.synchronize() diff = numpy.fabs(c[8:28, 7:17, 6:11] - a[5:25, 4:14, 3:8]).max() self.assertEqual(diff, 0) logging.debug("EXIT: test_memcpy_3d_async")
def setUp(self): logging.basicConfig(level=logging.DEBUG) self.old_env = os.environ.get("CUDA_DEVICE") if self.old_env is None: os.environ["CUDA_DEVICE"] = "0" self.ctx = cu.Devices().create_some_context() self.path = os.path.dirname(__file__) if not len(self.path): self.path = "."
def test_occupancy(self): logging.debug("ENTER: test_occupancy") ctx = cu.Devices().create_some_context() logging.debug("Context created") module = cu.Module(ctx, source_file="%s/test.cu" % self.path) logging.debug("Program builded") f = module.get_func("test") logging.debug("Got function pointer") num_blocks = f.max_active_blocks_per_multiprocessor(1) self.assertEqual(num_blocks, int(num_blocks)) self.assertGreater(num_blocks, 0) logging.debug("num_blocks = %d", num_blocks) logging.debug("Testing dynamic_smem_size parameter") num_blocks = f.max_active_blocks_per_multiprocessor( 128, dynamic_smem_size=8192) self.assertEqual(num_blocks, int(num_blocks)) self.assertGreater(num_blocks, 0) logging.debug("num_blocks = %d", num_blocks) min_grid_size, block_size = f.max_potential_block_size() self.assertEqual(min_grid_size, int(min_grid_size)) self.assertEqual(block_size, int(block_size)) self.assertGreater(min_grid_size, 0) self.assertGreater(block_size, 0) logging.debug("min_grid_size, block_size = %d, %d", min_grid_size, block_size) logging.debug("Trying callback") min_grid_size, block_size = f.max_potential_block_size( lambda x: x ** 2) self.assertEqual(min_grid_size, int(min_grid_size)) self.assertEqual(block_size, int(block_size)) self.assertGreater(min_grid_size, 0) self.assertGreater(block_size, 0) logging.debug("min_grid_size, block_size = %d, %d", min_grid_size, block_size) logging.debug("Testing block_size_limit parameter") min_grid_size, block_size = f.max_potential_block_size( block_size_limit=16) self.assertEqual(min_grid_size, int(min_grid_size)) self.assertEqual(block_size, int(block_size)) self.assertGreater(min_grid_size, 0) self.assertGreater(block_size, 0) self.assertLessEqual(block_size, 16) logging.debug("min_grid_size, block_size = %d, %d", min_grid_size, block_size) logging.debug("Testing dynamic_smem_size parameter") min_grid_size, block_size = f.max_potential_block_size( dynamic_smem_size=8192) self.assertEqual(min_grid_size, int(min_grid_size)) self.assertEqual(block_size, int(block_size)) self.assertGreater(min_grid_size, 0) self.assertGreater(block_size, 0) logging.debug("min_grid_size, block_size = %d, %d", min_grid_size, block_size) logging.debug("EXIT: test_occupancy")
def arg_completer(prefix, **kwargs): def format_device(dev): return "%d: %s - %s, %dMb, compute_%d%d, pci %s" % ( (dev.handle, dev.name, dev.total_mem) + dev.compute_capability + (dev.pci_bus_id, )) devices = cu.Devices() if len(devices) == 1: return ["0"] result = [] for device in devices: result.append(format_device(device)) return result
def _test_good(self): ctx = cu.Devices().create_some_context() a = Container() a.ctx = ctx b = Container() b.mem = cu.MemAlloc(ctx, 4096) b.module = cu.Module(ctx, source=""" __global__ void test(float *a) { a[blockIdx.x * blockDim.x + threadIdx.x] *= 1.1f; }""") b.blas = blas.CUBLAS(ctx) logging.debug("Remaining context count: %d", cu.Context.context_count) # self.assertEqual(cu.Context.context_count, 1) self.assertIsNotNone(ctx) # to hold ctx up to this point
def test_mem_host_alloc(self): logging.debug("ENTER: test_mem_host_alloc") ctx = cu.Devices().create_some_context() def test(mem): devptr = mem.device_pointer self.assertEqual(devptr, int(devptr)) if ctx.device.unified_addressing: self.assertEqual(devptr, mem.handle) self.assertIsNotNone(mem.buffer) self._test_alloc(lambda a: cu.MemHostAlloc(ctx, a), test) self._test_alloc(ctx.mem_host_alloc, test) logging.debug("MemHostAlloc succeeded") logging.debug("EXIT: test_mem_host_alloc")
def test_memset(self): logging.debug("ENTER: test_memset") ctx = cu.Devices().create_some_context() mem = cu.MemAlloc(ctx, 4096) mem.memset32_async(123) mem.memset32_async(456, 1) mem.memset32_async(789, 2, 3) a = numpy.zeros(mem.size // 4, dtype=numpy.int32) mem.to_host(a) self.assertEqual(a[0], 123) self.assertEqual(a[1], 456) for i in range(2, 2 + 3): self.assertEqual(a[i], 789) for i in range(2 + 3, a.size): self.assertEqual(a[i], 456) logging.debug("EXIT: test_memset")
def test_context(self): logging.debug("ENTER: test_context") ctx = cu.Devices().create_some_context() logging.debug("Context created") self.assertEqual(ctx.handle, cu.Context.get_current()) h, h0 = self._run_on_thread(self._check_push_pop, (ctx,)) self.assertEqual(h, ctx.handle) self.assertEqual(h0, 0) logging.debug("push/pop succeeded") h, h0 = self._run_on_thread(self._check_with, (ctx,)) self.assertEqual(h, ctx.handle) self.assertEqual(h0, 0) logging.debug("with succeeded") self.assertEqual( self._run_on_thread(self._check_set_current, (ctx,))[0], ctx.handle) logging.debug("set_current succeeded") logging.debug("EXIT: test_context")
def test_attributes(self): d = cu.Devices()[0] self.assertIsInstance(d.unified_addressing, bool) self.assertGreater(d.warp_size, 0) self.assertGreater(d.max_threads_per_block, 0) self.assertGreaterEqual(d.max_shared_memory_per_block, 0) xyz = d.max_block_dims self.assertIsInstance(xyz, tuple) self.assertEqual(len(xyz), 3) for x in xyz: self.assertGreater(x, 0) xyz = d.max_grid_dims self.assertIsInstance(xyz, tuple) self.assertEqual(len(xyz), 3) for x in xyz: self.assertGreater(x, 0) self.assertGreater(d.max_registers_per_block, 0) self.assertGreater(d.clock_rate, 0) self.assertGreater(d.memory_clock_rate, 0) self.assertGreaterEqual(d.total_constant_memory, 0) self.assertGreater(d.multiprocessor_count, 0) self.assertGreaterEqual(d.kernel_exec_timeout, 0) self.assertIsInstance(d.integrated, bool) self.assertIsInstance(d.can_map_host_memory, bool) self.assertIsInstance(d.concurrent_kernels, bool) self.assertIsInstance(d.ecc_enabled, bool) self.assertGreater(d.memory_bus_width, 0) self.assertGreaterEqual(d.l2_cache_size, 0) self.assertGreater(d.max_threads_per_multiprocessor, 0) self.assertGreaterEqual(d.async_engine_count, 0) self.assertIsInstance(d.stream_priorities_supported, bool) self.assertIsInstance(d.global_l1_cache_supported, bool) self.assertIsInstance(d.local_l1_cache_supported, bool) self.assertGreaterEqual(d.max_shared_memory_per_multiprocessor, 0) self.assertGreater(d.max_registers_per_multiprocessor, 0) self.assertIsInstance(d.managed_memory, bool) self.assertIsInstance(d.multi_gpu_board, bool) self.assertGreaterEqual(d.multi_gpu_board_group_id, 0) self.assertGreaterEqual(d.max_pitch, 0)
def test_devices(self): logging.debug("ENTER: test_devices") devices = cu.Devices() logging.debug("Found %d CUDA device%s", len(devices), "" if len(devices) <= 1 else "s") for i, device in enumerate(devices): logging.debug("%d: %s", i, device.name) if not len(devices): return logging.debug("Selecting device 0") d = devices[0] self.assertEqual(d.handle, int(d.handle)) logging.debug("It's name is %s", d.name) logging.debug("It's total mem is %d", d.total_mem) logging.debug("It's compute capability is %d_%d", *d.compute_capability) logging.debug("It's pci bus id: %s", d.pci_bus_id) logging.debug("Trying to get device by it's pci id") d2 = cu.Device(d.pci_bus_id) self.assertEqual(d2.handle, d.handle) logging.debug("Succeeded") logging.debug("EXIT: test_devices")
def test_launch_kernel(self): logging.debug("ENTER: test_launch_kernel") ctx = cu.Devices().create_some_context() logging.debug("Context created") N = 1024 C = 0.75 a = cu.MemHostAlloc(ctx, N * 4) b = cu.MemHostAlloc(ctx, N * 4) logging.debug("Memory allocated") module = cu.Module(ctx, source_file="%s/test.cu" % self.path) logging.debug("Program builded") f = module.get_func("test") logging.debug("Got function pointer") f.set_args(a, b, numpy.array([C], dtype=numpy.float32)) f.set_args(a, cu.skip, numpy.array([C], dtype=numpy.float32)) f.set_args(cu.skip(2), numpy.array([C], dtype=numpy.float32)) f.set_args(a, b, cu.skip(1)) f.set_args(cu.skip(3)) f.set_arg(0, None) f.set_arg(0, a) logging.debug("Args set") a_host = numpy.random.rand(N).astype(numpy.float32) b_host = numpy.random.rand(N).astype(numpy.float32) gold = a_host.copy() for _ in range(10): gold += b_host * C a.to_device(a_host) b.to_device_async(b_host) for _ in range(10): f((N, 1, 1)) logging.debug("Scheduled for execution") c_host = numpy.zeros(N, dtype=numpy.float32) a.to_host(c_host) logging.debug("Got results back") max_diff = numpy.fabs(c_host - gold).max() self.assertLess(max_diff, 0.0001) logging.debug("test_launch_kernel() succeeded") logging.debug("EXIT: test_launch_kernel")
def test_kernel(self): path = os.path.dirname(__file__) if not len(path): path = "." with cu.Devices().create_some_context() as ctx: cap = ctx.device.compute_capability if cap < (3, 5): return logging.info("Compiling...") m = ctx.create_module(source_file=(path + "/cublas_perf.cu"), nvcc_options2=cu.Module.OPTIONS_CUBLAS, compute_capability=(cap[0], 0) if cap >= (6, 0) else cap) # minor version of compute has to be set to 0 # to work on Pascal with CUDA 8.0 create_cublas = m.create_function("create_cublas") destroy_cublas = m.create_function("destroy_cublas") test = m.create_function("test") test_full = m.create_function("test_full") dummy = m.create_function("dummy") logging.info("Done") blas = numpy.zeros(1, dtype=numpy.uint64) blas_ = ctx.mem_alloc(blas) create_cublas.set_args(blas_) create_cublas((1, 1, 1)) blas_.to_host(blas) n = 256 one_ = ctx.mem_alloc(numpy.ones(1, dtype=numpy.float32)) zero_ = ctx.mem_alloc(numpy.zeros(1, dtype=numpy.float32)) a = numpy.random.rand(n, n).astype(numpy.float32) b = numpy.random.rand(n, n).astype(numpy.float32) c = numpy.zeros_like(a) c_gold = numpy.dot(a.transpose(), b.transpose()).transpose() a_ = ctx.mem_alloc(a) b_ = ctx.mem_alloc(b) c_ = ctx.mem_alloc(c) N = 10000 test.set_args(numpy.array([N], dtype=numpy.int64), blas, numpy.array([n], dtype=numpy.int64), one_, a_, b_, zero_, c_) ctx.synchronize() t0 = time.time() test((1, 1, 1)) ctx.synchronize() dt = time.time() - t0 logging.info("With external blas handle completed in %.6f sec", dt) destroy_cublas.set_args(blas) destroy_cublas((1, 1, 1)) c_.to_host(c) max_diff = numpy.fabs(c - c_gold).max() logging.info("max_diff = %.6f", max_diff) self.assertLess(max_diff, 1.0e-3) test_full.set_args(numpy.array([N], dtype=numpy.int64), numpy.array([n], dtype=numpy.int64), one_, a_, b_, zero_, c_) ctx.synchronize() t0 = time.time() test_full((1, 1, 1)) ctx.synchronize() dt = time.time() - t0 logging.info("With local blas handle completed in %.6f sec", dt) c_.to_host(c) max_diff = numpy.fabs(c - c_gold).max() logging.info("max_diff = %.6f", max_diff) self.assertLess(max_diff, 1.0e-3) blas = cublas.CUBLAS(ctx) ctx.synchronize() one = numpy.ones(1, dtype=numpy.float32) zero = numpy.zeros(1, dtype=numpy.float32) dummy.set_args(c_, c_) t0 = time.time() for i in range(N): blas.sgemm(cublas.CUBLAS_OP_N, cublas.CUBLAS_OP_N, n, n, n, one, a_ if i & 1 else b_, b_ if i & 1 else a_, zero, c_, n, n, n) dummy((1, 1, 1)) # interleave with some dummy kernel ctx.synchronize() dt = time.time() - t0 logging.info("With shared library blas completed in %.6f sec", dt) c_.to_host(c) max_diff = numpy.fabs(c - c_gold).max() logging.info("max_diff = %.6f", max_diff) self.assertLess(max_diff, 1.0e-3) logging.info("Succeeded")
def available(): try: return len(cu.Devices()) > 0 except: return False
def test_dump_devices(self): logging.debug("ENTER: test_dump_devices") logging.debug("Available CUDA devices:\n%s", cu.Devices().dump_devices()) logging.debug("EXIT: test_dump_devices")