Python examples of cuda4py.Module usage

Example #1

0

Show file

File: test_api.py Project: ra2003/cuda4py

    def test_module(self):
        """Build CUDA modules in several ways, then query a function and a global.

        Exercises, in order: cu.Module with a source file, Context.create_module,
        compilation with include directories, compilation from an inline source
        string, and finally get_func / get_global inside the context.
        """
        def expect_handles(built):
            # After a build, both the module and its context must expose a
            # non-None handle.
            self.assertIsNotNone(built.handle)
            self.assertIsNotNone(ctx.handle)

        logging.debug("ENTER: test_module")
        ctx = cu.Devices().create_some_context()
        test_source = "%s/test.cu" % self.path

        module = cu.Module(ctx, source_file=test_source)
        expect_handles(module)
        logging.debug("nvcc compilation succeeded")
        logging.debug("Resulted ptx code is:\n%s", module.ptx.decode("utf-8"))

        logging.debug("Will try Context.create_module")
        module = ctx.create_module(source_file=test_source)
        expect_handles(module)
        logging.debug("Succeeded")

        logging.debug("Will try to compile with includes")
        # The include list deliberately carries empty entries around the
        # real directory.
        module = cu.Module(ctx, source_file="%s/inc.cu" % self.path,
                           include_dirs=("", self.path, ""))
        expect_handles(module)
        logging.debug("Succeeded")

        logging.debug("Will try to compile with source")
        module = cu.Module(ctx, source='#include "inc.cu"',
                           include_dirs=(self.path,))
        expect_handles(module)
        logging.debug("Succeeded")

        logging.debug("Testing get_func, get_global")
        with ctx:
            self.assertIsNotNone(module.get_func("test"))
            address, nbytes = module.get_global("g_a")
            # The address must compare equal to its own int() conversion.
            self.assertEqual(address, int(address))
            self.assertEqual(nbytes, 4)
        logging.debug("Succeeded")
        logging.debug("EXIT: test_module")

Example #2

0

Show file

File: test_api.py Project: ra2003/cuda4py

    def test_occupancy(self):
        """Check the occupancy queries of the "test" kernel from test.cu.

        Calls max_active_blocks_per_multiprocessor and max_potential_block_size
        with several argument combinations and verifies that every returned
        value equals its int() conversion and is greater than zero.
        """
        logging.debug("ENTER: test_occupancy")
        ctx = cu.Devices().create_some_context()
        logging.debug("Context created")
        module = cu.Module(ctx, source_file="%s/test.cu" % self.path)
        logging.debug("Program builded")
        kernel = module.get_func("test")
        logging.debug("Got function pointer")

        def check_block_count(count):
            # Shared validation for max_active_blocks_per_multiprocessor.
            self.assertEqual(count, int(count))
            self.assertGreater(count, 0)
            logging.debug("num_blocks = %d", count)

        def check_launch_config(config, block_limit=None):
            # Shared validation for max_potential_block_size, which yields a
            # (min_grid_size, block_size) pair.
            grid, block = config
            self.assertEqual(grid, int(grid))
            self.assertEqual(block, int(block))
            self.assertGreater(grid, 0)
            self.assertGreater(block, 0)
            if block_limit is not None:
                self.assertLessEqual(block, block_limit)
            logging.debug("min_grid_size, block_size = %d, %d", grid, block)

        check_block_count(kernel.max_active_blocks_per_multiprocessor(1))
        logging.debug("Testing dynamic_smem_size parameter")
        check_block_count(kernel.max_active_blocks_per_multiprocessor(
            128, dynamic_smem_size=8192))

        check_launch_config(kernel.max_potential_block_size())
        logging.debug("Trying callback")
        check_launch_config(kernel.max_potential_block_size(lambda x: x ** 2))
        logging.debug("Testing block_size_limit parameter")
        check_launch_config(
            kernel.max_potential_block_size(block_size_limit=16),
            block_limit=16)
        logging.debug("Testing dynamic_smem_size parameter")
        check_launch_config(
            kernel.max_potential_block_size(dynamic_smem_size=8192))
        logging.debug("EXIT: test_occupancy")

Example #3

0

Show file

File: test_del.py Project: ra2003/cuda4py

    def _test_good(self):
        """Create a context plus context-bound resources held by Containers.

        One Container holds the context itself; a second one holds a memory
        allocation, a compiled module and a CUBLAS handle created on that
        context. The method only logs cu.Context.context_count and asserts
        that the context object is not None.
        """
        ctx = cu.Devices().create_some_context()
        # Container is defined outside this excerpt -- presumably a plain
        # attribute holder; verify against its definition.
        a = Container()
        a.ctx = ctx
        b = Container()
        # Second argument is presumably the allocation size in bytes -- confirm.
        b.mem = cu.MemAlloc(ctx, 4096)
        b.module = cu.Module(ctx, source="""
            __global__ void test(float *a) {
                a[blockIdx.x * blockDim.x + threadIdx.x] *= 1.1f;
            }""")
        b.blas = blas.CUBLAS(ctx)

        logging.debug("Remaining context count: %d", cu.Context.context_count)
        # NOTE(review): the exact-count assertion below is disabled in the
        # original; the reason is not visible from this excerpt.
        # self.assertEqual(cu.Context.context_count, 1)
        self.assertIsNotNone(ctx)  # to hold ctx up to this point

Example #4

0

Show file

    def test_kernel(self):
        """Run the "test" kernel from cublas.cu and compare with numpy.

        Returns early (after logging) when the device compute capability is
        below (3, 5). Otherwise compiles cublas.cu with
        cu.Module.OPTIONS_CUBLAS, launches the kernel on two random 256x256
        float32 matrices and requires the maximum absolute difference from
        the numpy reference to be below 1.0e-3.
        """
        logging.debug("ENTER: test_kernel")
        cap = self.ctx.device.compute_capability
        if cap < (3, 5):
            logging.debug("Requires compute capability >= (3, 5), got %s", cap)
            logging.debug("EXIT: test_kernel")
            return

        def upload(host_array):
            # Create a device allocation initialised from a host array.
            return cu.MemAlloc(self.ctx, host_array)

        with self.ctx:
            # minor version of compute has to be set to 0
            # to work on Pascal with CUDA 8.0
            if cap >= (6, 0):
                target_cap = (cap[0], 0)
            else:
                target_cap = cap
            module = cu.Module(
                self.ctx,
                source_file=("%s/cublas.cu" % self.path),
                nvcc_options2=cu.Module.OPTIONS_CUBLAS,
                compute_capability=target_cap)
            logging.debug("Compiled")
            kernel = module.create_function("test")
            logging.debug("Got function")

            n = 256
            lhs = numpy.random.rand(n, n).astype(numpy.float32)
            rhs = numpy.random.rand(n, n).astype(numpy.float32)
            result = numpy.zeros_like(lhs)
            # Reference product, computed on transposed operands and
            # transposed back.
            expected = numpy.dot(lhs.T, rhs.T).T

            lhs_dev = upload(lhs)
            rhs_dev = upload(rhs)
            result_dev = upload(result)
            zero_dev = upload(numpy.zeros(1, dtype=numpy.float32))
            one_dev = upload(numpy.ones(1, dtype=numpy.float32))
            logging.debug("Allocated arrays")

            size_arg = numpy.array([n], dtype=numpy.int64)
            kernel.set_args(size_arg, one_dev, lhs_dev, rhs_dev,
                            zero_dev, result_dev)
            logging.debug("Set args")

            kernel((1, 1, 1), (1, 1, 1))
            logging.debug("Executed")

            result_dev.to_host(result)
            max_diff = numpy.fabs(result - expected).max()
            logging.debug("Maximum difference is %.6f", max_diff)
            self.assertLess(max_diff, 1.0e-3)
        logging.debug("EXIT: test_kernel")

Example #5

0

Show file

File: test_api.py Project: ra2003/cuda4py

 def test_launch_kernel(self):
     """Launch the "test" kernel from test.cu repeatedly and verify the result.

     Exercises several set_args / set_arg call forms (including cu.skip),
     runs the kernel ten times and compares the buffer read back from the
     device with a host-side reference that adds b * C to a ten times.
     """
     logging.debug("ENTER: test_launch_kernel")
     ctx = cu.Devices().create_some_context()
     logging.debug("Context created")

     n_items = 1024
     coeff = 0.75
     iterations = 10

     def coeff_arg():
         # Build a fresh one-element float32 array for every call.
         return numpy.array([coeff], dtype=numpy.float32)

     a = cu.MemHostAlloc(ctx, n_items * 4)
     b = cu.MemHostAlloc(ctx, n_items * 4)
     logging.debug("Memory allocated")
     module = cu.Module(ctx, source_file="%s/test.cu" % self.path)
     logging.debug("Program builded")
     kernel = module.get_func("test")
     logging.debug("Got function pointer")

     # Try the different ways of supplying and skipping arguments.
     kernel.set_args(a, b, coeff_arg())
     kernel.set_args(a, cu.skip, coeff_arg())
     kernel.set_args(cu.skip(2), coeff_arg())
     kernel.set_args(a, b, cu.skip(1))
     kernel.set_args(cu.skip(3))
     kernel.set_arg(0, None)
     kernel.set_arg(0, a)
     logging.debug("Args set")

     a_host = numpy.random.rand(n_items).astype(numpy.float32)
     b_host = numpy.random.rand(n_items).astype(numpy.float32)
     # Host reference: accumulate step by step, once per kernel launch.
     expected = a_host.copy()
     for _ in range(iterations):
         expected += b_host * coeff

     a.to_device(a_host)
     b.to_device_async(b_host)
     for _ in range(iterations):
         kernel((n_items, 1, 1))
     logging.debug("Scheduled for execution")

     readback = numpy.zeros(n_items, dtype=numpy.float32)
     a.to_host(readback)
     logging.debug("Got results back")
     max_diff = numpy.fabs(readback - expected).max()
     self.assertLess(max_diff, 0.0001)
     logging.debug("test_launch_kernel() succeeded")
     logging.debug("EXIT: test_launch_kernel")