def test_module(self):
    """Build cu.Module objects in several ways and query the last one.

    Modules are built from a source file (directly and through
    ``Context.create_module``), from a source file with include
    directories, and from an in-memory source string.  After every build
    the module and context handles are checked for being set.  Finally the
    "test" function and the "g_a" global are looked up in the last module.
    """
    logging.debug("ENTER: test_module")
    ctx = cu.Devices().create_some_context()

    def check_handles(built):
        # A build counts as successful when both handles are populated.
        self.assertIsNotNone(built.handle)
        self.assertIsNotNone(ctx.handle)

    # Plain build from a file on disk.
    module = cu.Module(ctx, source_file="%s/test.cu" % self.path)
    check_handles(module)
    logging.debug("nvcc compilation succeeded")
    logging.debug("Resulted ptx code is:\n%s", module.ptx.decode("utf-8"))

    # Same file, but through the context's factory method.
    logging.debug("Will try Context.create_module")
    module = ctx.create_module(source_file="%s/test.cu" % self.path)
    check_handles(module)
    logging.debug("Succeeded")

    # File build with include directories (empty entries included).
    logging.debug("Will try to compile with includes")
    module = cu.Module(
        ctx,
        source_file="%s/inc.cu" % self.path,
        include_dirs=("", self.path, ""),
    )
    check_handles(module)
    logging.debug("Succeeded")

    # Build from a source string that pulls the real code in via #include.
    logging.debug("Will try to compile with source")
    module = cu.Module(
        ctx,
        source="#include \"inc.cu\"",
        include_dirs=(self.path,),
    )
    check_handles(module)
    logging.debug("Succeeded")

    logging.debug("Testing get_func, get_global")
    # NOTE(review): the extent of this ``with`` block was reconstructed
    # from whitespace-collapsed source - confirm against the repository.
    with ctx:
        self.assertIsNotNone(module.get_func("test"))
        ptr, size = module.get_global("g_a")
        # The pointer must compare equal to its own int() conversion.
        self.assertEqual(ptr, int(ptr))
        self.assertEqual(size, 4)
    logging.debug("Succeeded")
    logging.debug("EXIT: test_module")
def test_occupancy(self):
    """Exercise the occupancy queries of a compiled function.

    Calls ``max_active_blocks_per_multiprocessor`` and
    ``max_potential_block_size`` on the "test" function with the argument
    combinations below, checking that every returned value compares equal
    to its int() conversion and is strictly positive.
    """
    def check_block_count(count):
        # Returned block count must be integral and positive.
        self.assertEqual(count, int(count))
        self.assertGreater(count, 0)
        logging.debug("num_blocks = %d", count)

    def check_launch_config(grid, block, limit=None):
        # Both values must be integral and positive; the block size must
        # additionally respect ``limit`` when one was requested.
        self.assertEqual(grid, int(grid))
        self.assertEqual(block, int(block))
        self.assertGreater(grid, 0)
        self.assertGreater(block, 0)
        if limit is not None:
            self.assertLessEqual(block, limit)
        logging.debug("min_grid_size, block_size = %d, %d", grid, block)

    def squared(x):
        # Callback handed to max_potential_block_size.
        return x ** 2

    logging.debug("ENTER: test_occupancy")
    ctx = cu.Devices().create_some_context()
    logging.debug("Context created")
    module = cu.Module(ctx, source_file="%s/test.cu" % self.path)
    logging.debug("Program builded")
    f = module.get_func("test")
    logging.debug("Got function pointer")

    # --- max_active_blocks_per_multiprocessor ---
    check_block_count(f.max_active_blocks_per_multiprocessor(1))

    logging.debug("Testing dynamic_smem_size parameter")
    check_block_count(
        f.max_active_blocks_per_multiprocessor(128, dynamic_smem_size=8192))

    # --- max_potential_block_size ---
    grid, block = f.max_potential_block_size()
    check_launch_config(grid, block)

    logging.debug("Trying callback")
    grid, block = f.max_potential_block_size(squared)
    check_launch_config(grid, block)

    logging.debug("Testing block_size_limit parameter")
    grid, block = f.max_potential_block_size(block_size_limit=16)
    check_launch_config(grid, block, limit=16)

    logging.debug("Testing dynamic_smem_size parameter")
    grid, block = f.max_potential_block_size(dynamic_smem_size=8192)
    check_launch_config(grid, block)

    logging.debug("EXIT: test_occupancy")
def _test_good(self):
    """Create a context plus objects that reference it, then log the count.

    A context is stored on one ``Container``; a memory allocation, a
    compiled module and a CUBLAS instance built on that context are stored
    on a second one.  The method then logs ``cu.Context.context_count``.

    NOTE(review): this looks like an object-lifetime check, in which case
    the statement order below is significant - confirm before reordering.
    The leading underscore means unittest's default loader will not collect
    this method by itself; presumably another test invokes it.
    """
    ctx = cu.Devices().create_some_context()
    # First holder keeps only the context.
    a = Container()
    a.ctx = ctx
    # Second holder keeps the objects that were created on that context.
    b = Container()
    b.mem = cu.MemAlloc(ctx, 4096)
    b.module = cu.Module(ctx, source=""" __global__ void test(float *a) { a[blockIdx.x * blockDim.x + threadIdx.x] *= 1.1f; }""")
    b.blas = blas.CUBLAS(ctx)
    logging.debug("Remaining context count: %d", cu.Context.context_count)
    # Exact-count assertion is disabled; the reason is not visible here.
    # self.assertEqual(cu.Context.context_count, 1)
    self.assertIsNotNone(ctx)  # to hold ctx up to this point
def test_kernel(self):
    """Run the "test" kernel from cublas.cu and compare with numpy.dot.

    Two random 256x256 float32 matrices are uploaded, the kernel is
    launched with a (1, 1, 1) grid and a (1, 1, 1) block, and the result is
    required to differ from the host reference by less than 1.0e-3.

    The test is reported as *skipped* (instead of silently passing) when
    the device compute capability is below (3, 5).
    """
    logging.debug("ENTER: test_kernel")
    cap = self.ctx.device.compute_capability
    if cap < (3, 5):
        logging.debug("Requires compute capability >= (3, 5), got %s", cap)
        logging.debug("EXIT: test_kernel")
        # Raise SkipTest so the runner does not count this as a pass.
        # ``cap`` is wrapped in a tuple because it is itself a tuple and
        # would otherwise be unpacked by the % operator.
        self.skipTest(
            "Requires compute capability >= (3, 5), got %s" % (cap,))

    # NOTE(review): the extent of this ``with`` block was reconstructed
    # from whitespace-collapsed source - confirm against the repository.
    with self.ctx:
        # minor version of compute has to be set to 0
        # to work on Pascal with CUDA 8.0
        module = cu.Module(
            self.ctx,
            source_file=("%s/cublas.cu" % self.path),
            nvcc_options2=cu.Module.OPTIONS_CUBLAS,
            compute_capability=(cap[0], 0) if cap >= (6, 0) else cap)
        logging.debug("Compiled")
        f = module.create_function("test")
        logging.debug("Got function")

        n = 256
        a = numpy.random.rand(n, n).astype(numpy.float32)
        b = numpy.random.rand(n, n).astype(numpy.float32)
        c = numpy.zeros_like(a)
        # Host reference: (a^T . b^T)^T.  Presumably written this way to
        # mirror a column-major device computation - TODO confirm.
        c_gold = numpy.dot(a.transpose(), b.transpose()).transpose()

        a_ = cu.MemAlloc(self.ctx, a)
        b_ = cu.MemAlloc(self.ctx, b)
        c_ = cu.MemAlloc(self.ctx, c)
        # Single-element device buffers holding 0.0 and 1.0.
        zero_ = cu.MemAlloc(self.ctx, numpy.zeros(1, dtype=numpy.float32))
        one_ = cu.MemAlloc(self.ctx, numpy.ones(1, dtype=numpy.float32))
        logging.debug("Allocated arrays")

        f.set_args(numpy.array([n], dtype=numpy.int64),
                   one_, a_, b_, zero_, c_)
        logging.debug("Set args")
        f((1, 1, 1), (1, 1, 1))
        logging.debug("Executed")

        c_.to_host(c)
        max_diff = numpy.fabs(c - c_gold).max()
        logging.debug("Maximum difference is %.6f", max_diff)
        self.assertLess(max_diff, 1.0e-3)
    logging.debug("EXIT: test_kernel")
def test_launch_kernel(self):
    """Launch the "test" kernel ten times and compare with a host result.

    Also exercises the argument-setting API of the function object:
    ``set_args`` with full argument lists and with ``cu.skip``
    placeholders, and ``set_arg`` on a single index.  The host reference
    adds ``b * 0.75`` to ``a`` ten times; the device result read back from
    ``a`` must match it to within 0.0001.
    """
    logging.debug("ENTER: test_launch_kernel")
    ctx = cu.Devices().create_some_context()
    logging.debug("Context created")

    count = 1024
    coeff = 0.75
    launches = 10
    nbytes = count * 4  # host arrays below are float32: 4 bytes each

    def coeff_arg():
        # Fresh one-element float32 array for every call, as each
        # set_args invocation originally received its own array.
        return numpy.array([coeff], dtype=numpy.float32)

    a = cu.MemHostAlloc(ctx, nbytes)
    b = cu.MemHostAlloc(ctx, nbytes)
    logging.debug("Memory allocated")

    module = cu.Module(ctx, source_file="%s/test.cu" % self.path)
    logging.debug("Program builded")
    f = module.get_func("test")
    logging.debug("Got function pointer")

    # Walk through the supported ways of (partially) setting arguments.
    f.set_args(a, b, coeff_arg())
    f.set_args(a, cu.skip, coeff_arg())
    f.set_args(cu.skip(2), coeff_arg())
    f.set_args(a, b, cu.skip(1))
    f.set_args(cu.skip(3))
    f.set_arg(0, None)
    f.set_arg(0, a)
    logging.debug("Args set")

    a_host = numpy.random.rand(count).astype(numpy.float32)
    b_host = numpy.random.rand(count).astype(numpy.float32)

    # Host reference: accumulate the same increment once per launch.
    gold = a_host.copy()
    increment = b_host * coeff
    for _ in range(launches):
        gold += increment

    a.to_device(a_host)
    b.to_device_async(b_host)
    for _ in range(launches):
        f((count, 1, 1))
    logging.debug("Scheduled for execution")

    c_host = numpy.zeros(count, dtype=numpy.float32)
    a.to_host(c_host)
    logging.debug("Got results back")

    max_diff = numpy.fabs(c_host - gold).max()
    self.assertLess(max_diff, 0.0001)
    logging.debug("test_launch_kernel() succeeded")
    logging.debug("EXIT: test_launch_kernel")