Example #1
0
    def test_copy_buffer(self):
        import numpy
        # Create platform, context and queue
        platforms = cl.Platforms()
        ctx = platforms.create_some_context()
        queue = ctx.create_queue(ctx.devices[0])

        # Create arrays with some values for testing
        a = numpy.arange(10000, dtype=numpy.float32)
        b = a * 0.5
        c = numpy.empty_like(b)
        c[:] = 1.0e30

        # Create buffers
        a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR,
                               a)
        b_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR,
                               b)

        # Copy some data from one buffer to another
        sz = a.itemsize
        queue.copy_buffer(a_, b_, 1000 * sz, 2000 * sz, 3000 * sz).wait()

        queue.read_buffer(b_, c)
        diff = numpy.fabs(c[2000:5000] - a[1000:4000]).max()
        self.assertEqual(diff, 0)
Example #2
0
    def test_set_kernel_args(self):
        import numpy

        platforms = cl.Platforms()
        ctx = platforms.create_some_context()
        prg = ctx.create_program(self.src_test, self.include_dirs)
        krn = prg.get_kernel("test")
        queue = ctx.create_queue(ctx.devices[0])
        global_size = [12345]
        local_size = None

        krn.set_args(cl.skip(3))
        self.assertRaises(cl.CLRuntimeError, queue.execute_kernel, krn,
                          global_size, local_size)
        krn.set_args(cl.skip, cl.skip, cl.skip)
        self.assertRaises(cl.CLRuntimeError, queue.execute_kernel, krn,
                          global_size, local_size)
        krn.set_args(cl.skip(1), cl.skip(1), cl.skip(1))
        self.assertRaises(cl.CLRuntimeError, queue.execute_kernel, krn,
                          global_size, local_size)
        krn.set_args(cl.skip(1000))
        self.assertRaises(cl.CLRuntimeError, queue.execute_kernel, krn,
                          global_size, local_size)
        self.assertRaises(ValueError, cl.skip, 0)
        self.assertRaises(ValueError, cl.skip, -1)

        c = numpy.array([1.2345], dtype=numpy.float32)
        krn.set_args(cl.skip(2), c)
        self.assertRaises(cl.CLRuntimeError, queue.execute_kernel, krn,
                          global_size, local_size)
        krn.set_args(cl.skip, cl.skip, c)
        self.assertRaises(cl.CLRuntimeError, queue.execute_kernel, krn,
                          global_size, local_size)
Example #3
0
    def test_fill_buffer(self):
        # Create platform, context and queue
        platforms = cl.Platforms()
        ctx = platforms.create_some_context()
        if ctx.devices[0].version < 1.2:
            return
        queue = ctx.create_queue(ctx.devices[0])

        import numpy

        # Create array
        a = numpy.zeros(4096, dtype=numpy.int32)

        # Create buffer
        a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR,
                               a)

        # Fill the buffer
        pattern = numpy.array([1, 2, 3, 4], dtype=numpy.int32)
        queue.fill_buffer(a_, pattern, pattern.nbytes, a.nbytes).wait()

        queue.read_buffer(a_, a)
        diff = 0
        for i in range(0, a.size, pattern.size):
            diff += numpy.fabs(a[i:i + pattern.size] - pattern).sum()
        self.assertEqual(diff, 0)
Example #4
0
 def test1():
     logging.basicConfig(level=logging.DEBUG)
     platforms = cl.Platforms()
     print("OpenCL devices:\n\n%s\n" % platforms.dump_devices())
     ctx = platforms.create_some_context()
     queue = ctx.create_queue(ctx.devices[0])
     prg = ctx.create_program("""
     __kernel void test(__global const float *a, __global const float *b,
     __global float *c, const float k) {
     size_t i = get_global_id(0);
     c[i] = (a[i] + b[i]) * k;
     }
     """)
     krn = prg.get_kernel("test")
     a = np.arange(1000000, dtype=np.float32)
     b = np.arange(1000000, dtype=np.float32)
     c = np.empty(1000000, dtype=np.float32)
     k = np.array([0.5], dtype=np.float32)
     a_buf = ctx.create_buffer(
         cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, a)
     b_buf = ctx.create_buffer(
         cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, b)
     c_buf = ctx.create_buffer(cl.CL_MEM_WRITE_ONLY
                               | cl.CL_MEM_ALLOC_HOST_PTR,
                               size=c.nbytes)
     krn.set_arg(0, a_buf)
     krn.set_arg(1, b_buf)
     krn.set_arg(2, c_buf)
     krn.set_arg(3, k[0:1])
     queue.execute_kernel(krn, [a.size], None)
     queue.read_buffer(c_buf, c)
     diff = np.fabs(c - (a + b) * k[0])
     print(diff)
Example #5
0
    def test_copy_buffer_rect(self):
        import numpy
        # Create platform, context and queue
        platforms = cl.Platforms()
        ctx = platforms.create_some_context()
        queue = ctx.create_queue(ctx.devices[0])

        # Create arrays with some values for testing
        a = numpy.arange(35 * 25 * 15, dtype=numpy.float32).reshape(35, 25, 15)
        b = numpy.arange(37 * 27 * 17, dtype=numpy.float32).reshape(37, 27, 17)
        b *= 0.5
        c = numpy.empty_like(b)
        c[:] = 1.0e30

        # Create buffers
        a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR,
                               a)
        b_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR,
                               b)

        # Copy 3D rect from one buffer to another
        sz = a.itemsize
        queue.copy_buffer_rect(a_, b_, (3 * sz, 4, 5), (6 * sz, 7, 8),
                               (5 * sz, 10, 20), a.shape[2] * sz,
                               a.shape[1] * a.shape[2] * sz, b.shape[2] * sz,
                               b.shape[1] * b.shape[2] * sz).wait()

        queue.read_buffer(b_, c)
        diff = numpy.fabs(c[8:28, 7:17, 6:11] - a[5:25, 4:14, 3:8]).max()
        self.assertEqual(diff, 0)
Example #6
0
 def test_binary(self):
     platforms = cl.Platforms()
     ctx = platforms.create_some_context()
     prg = ctx.create_program(self.src_test, self.include_dirs)
     binary = prg.binaries[0]
     prg = ctx.create_program([binary], binary=True)
     krn = prg.get_kernel("test")
     del krn
Example #7
0
    def test_set_arg_None(self):
        import numpy
        # Create platform, context, program, kernel and queue
        platforms = cl.Platforms()
        ctx = platforms.create_some_context()
        src = """
        __kernel void test(__global float *a, __global const float *b,
                           __global const float *c) {
            int idx = get_global_id(0);
            a[idx] += b[idx] + (c ? c[idx] : 0);
        }
        """
        prg = ctx.create_program(src)
        krn = prg.get_kernel("test")
        queue = ctx.create_queue(ctx.devices[0])

        # Create arrays with some values for testing
        a = numpy.array([1, 2, 3, 4, 5], dtype=numpy.float32)
        b = numpy.array([6, 7, 8, 9, 10], dtype=numpy.float32)
        c = numpy.array([11, 12, 13, 14, 15], dtype=numpy.float32)

        # Create buffers
        a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR,
                               a)
        b_ = ctx.create_buffer(cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR,
                               b)
        c_ = ctx.create_buffer(cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR,
                               c)

        # Set kernel arguments
        krn.set_arg(0, a_)
        krn.set_arg(1, b_)
        krn.set_arg(2, c_)

        # Execute kernel
        queue.execute_kernel(krn, [a.size], None).wait()

        # Get results back
        d = numpy.zeros_like(a)
        queue.read_buffer(a_, d)
        t = a + b + c
        diff = numpy.fabs(d - t).max()
        self.assertEqual(diff, 0)

        # Set arg to None
        krn.set_arg(2, None)

        # Execute kernel
        queue.execute_kernel(krn, [a.size], None).wait()

        # Get results back
        queue.read_buffer(a_, d)
        t += b
        diff = numpy.fabs(d - t).max()
        self.assertEqual(diff, 0)
Example #8
0
 def test_kernel_info(self):
     platforms = cl.Platforms()
     ctx = platforms.create_some_context()
     prg = ctx.create_program(self.src_test, self.include_dirs)
     krn = prg.get_kernel("test")
     self.assertGreater(krn.reference_count, 0)
     self.assertEqual(krn.num_args, 3)
     try:
         self.assertEqual(krn.attributes, "vec_type_hint(float4)")
     except cl.CLRuntimeError as e:
         self.assertEqual(e.code, -30)
Example #9
0
    def arg_completer(prefix, **kwargs):
        def format_device(plf, dev):
            return "%s - %s on %s" % (dev.path, dev.name.strip(), plf.name)

        if prefix.strip() == "":
            platforms = cl.Platforms().platforms
            if len(platforms) == 1 and len(platforms[0].devices) == 1:
                return ["0:0"]
            result = []
            for platform in platforms:
                for device in platform:
                    result.append(format_device(platform, device))
            return result
        parsed = [p for p in prefix.split(':') if p.strip() != ""]
        platform = cl.Platforms().platforms[int(parsed[0].strip())]
        if len(parsed) == 1:
            if len(platform.devices) == 1:
                return [platform.devices[0].path]
            result = []
            for device in platform:
                result.append(format_device(platform, device))
            return result
Example #10
0
 def testadd():
     os.environ["PYOPENCL_CTX"] = "0:0"
     platforms = cl.Platforms()
     print("OpenCL devices:\n%s" % platforms.dump_devices())
     #ctx = platforms.create_some_context()
     ctx = cl.Context(platforms.platforms[0],
                      platforms.platforms[0].devices[0:1])
     dev = ctx.devices[0]
     print("version=%s,\ngroup_size=%s" %
           (dev.version, dev.max_work_group_size))
     #prg = ctx.create_program(src_test, include_dirs)
     prg = ctx.create_program(testopencl.readoclfile("test.cl"))
     bins = prg.binaries[0]
     print(prg.kernel_names)
     #print(bins)
     krn = prg.get_kernel("testadd")
     print(krn.attributes)
     queue = ctx.create_queue(ctx.devices[0])
     a = np.arange(100, dtype=np.float32)
     b = np.arange(100, dtype=np.float32)
     c = np.empty(100, dtype=np.float32)
     k = np.array([0.5], dtype=np.float32)
     a_buf = ctx.create_buffer(
         cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, a)
     b_buf = ctx.create_buffer(
         cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, b)
     c_buf = ctx.create_buffer(cl.CL_MEM_WRITE_ONLY
                               | cl.CL_MEM_ALLOC_HOST_PTR,
                               size=c.nbytes)
     '''
     krn.set_arg(0, a_buf)
     krn.set_arg(1, b_buf)
     krn.set_arg(2, c_buf)
     krn.set_arg(3, k[0:1])
     '''
     krn.set_args(a_buf, b_buf, c_buf, k[0:1])
     ev = queue.execute_kernel(krn, [a.size], None)
     queue.read_buffer(c_buf, c)
     diff = c - (a * k[0] + b * k[0]) * k[0]
     #print(a)
     #print(c)
     print(diff)
     del queue
     del ctx
     del krn
     del prg
     gc.collect()
Example #11
0
    def _get_some_device(self, **kwargs):
        """Gets some device from the available OpenCL devices.
        Returns True if any device was selected, otherwise, False.
        """
        device = self.parse_device(**kwargs)
        try:
            platforms = cl.Platforms()
        except cl.CLRuntimeError:
            platforms = None
        if platforms is None or len(platforms.platforms) == 0:
            raise DeviceNotFoundError("No OpenCL devices were found")
        self._id = device
        if device == "":
            context = platforms.create_some_context()
        else:
            platfnum, devnums = device.split(':')
            try:
                platform = platforms.platforms[int(platfnum)]
            except IndexError:
                raise from_none(
                    DeviceNotFoundError("Device %s was not found." % device))
            context = platform.create_context([
                platform.devices[int(devnum)] for devnum in devnums.split(',')
            ])
        if "NVIDIA" in context.platform.name:

            def fail(*args, **kwargs):
                raise RuntimeError("fork() breaks NVIDIA OpenCL")

            os.fork = fail
            import subprocess

            subprocess.Popen = fail
        device = context.devices[0]
        desc = "%s/%s" % (device.vendor.strip(), device.name.strip())
        self.queue_ = context.create_queue(device)
        self.device_info = DeviceInfo(
            desc=desc,
            memsize=device.memsize,
            memalign=device.memalign,
            version=device.version,
            device_type=device.type,
            max_work_group_size=self.queue_.device.max_work_group_size,
            max_work_item_sizes=self.queue_.device.max_work_item_sizes,
            local_memsize=self.queue_.device.local_memsize)
        return True
Example #12
0
 def test_svm_memcpy(self):
     ctx = cl.Platforms().create_some_context()
     if ctx.devices[0].version < 2.0:
         return
     svm = ctx.svm_alloc(cl.CL_MEM_READ_WRITE, 4096)
     import numpy
     a = numpy.frombuffer(svm.buffer, dtype=numpy.int32)
     queue = ctx.create_queue(ctx.devices[0])
     queue.svm_map(svm, cl.CL_MAP_WRITE_INVALIDATE_REGION, svm.size)
     a[:] = numpy.arange(a.size, dtype=a.dtype)
     queue.svm_unmap(svm)
     n = a.size // 2
     queue.svm_memcpy(a[n:], a, n * a.itemsize)
     queue.svm_map(svm, cl.CL_MAP_READ, svm.size)
     self.assertEqual(numpy.fabs(a[n:] - a[:n]).max(), 0)
     queue.svm_unmap(svm).wait()
     del svm
Example #13
0
 def test_svm_memfill(self):
     ctx = cl.Platforms().create_some_context()
     if ctx.devices[0].version < 2.0:
         return
     svm = ctx.svm_alloc(cl.CL_MEM_READ_WRITE, 4096)
     import numpy
     a = numpy.frombuffer(svm.buffer, dtype=numpy.int32)
     queue = ctx.create_queue(ctx.devices[0])
     pattern = numpy.array([1, 2, 3, 4], dtype=numpy.int32)
     queue.svm_memfill(a, pattern, pattern.nbytes, a.nbytes)
     queue.svm_map(svm, cl.CL_MAP_READ, svm.size)
     diff = 0
     for i in range(0, a.size, pattern.size):
         diff += numpy.fabs(a[i:i + pattern.size] - pattern).sum()
     self.assertEqual(diff, 0)
     queue.svm_unmap(svm).wait()
     del svm
Example #14
0
 def test_create_pipe(self):
     ctx = cl.Platforms().create_some_context()
     if ctx.devices[0].version < 2.0:
         return
     pipe = ctx.create_pipe(0, 8, 16)
     del pipe
     pipe = ctx.create_pipe(cl.CL_MEM_READ_WRITE, 8, 16)
     prg = ctx.create_program("""
         __kernel void test(__write_only pipe int p) {
             int x = 0;
             write_pipe(p, &x);
         }
         """, options="-cl-std=CL2.0")
     krn = prg.get_kernel("test")
     krn.set_arg(0, pipe)
     del krn
     del prg
     del pipe
Example #15
0
 def testmul():
     os.environ["PYOPENCL_CTX"] = "0:0"
     platforms = cl.Platforms()
     print("OpenCL devices:\n%s" % platforms.dump_devices())
     ctx = platforms.create_some_context()
     prg = ctx.create_program(testopencl.readoclfile("test.cl"))
     print(prg.kernel_names)
     krn = prg.get_kernel("matmul")
     print(krn.attributes)
     queue = ctx.create_queue(ctx.devices[0])
     a = np.arange(10, dtype=np.float32)
     b = np.arange(10, dtype=np.float32)
     c = np.empty(1000, dtype=np.float32)
     m = np.array([10], dtype=np.float32)
     p = np.array([10], dtype=np.float32)
     n = np.array([10], dtype=np.float32)
     a_buf = ctx.create_buffer(
         cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, a)
     b_buf = ctx.create_buffer(
         cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, b)
     c_buf = ctx.create_buffer(cl.CL_MEM_WRITE_ONLY
                               | cl.CL_MEM_ALLOC_HOST_PTR,
                               size=c.nbytes)
     '''
     krn.set_arg(0, a_buf)
     krn.set_arg(1, b_buf)
     krn.set_arg(2, c_buf)
     krn.set_arg(3, m[0:1])
     krn.set_arg(4, p[0:1])
     krn.set_arg(5, n[0:1])
     '''
     krn.set_args(a_buf, b_buf, c_buf, m[0:1], p[0:1], c[0:1])
     #queue.execute_kernel(krn, [a.size], None)
     #queue.read_buffer(c_buf, c)
     #diff = np.fabs(c - (a * k[0]+ b * k[0]) * k[0])
     print(a)
     print(b)
     #print(diff)
     del queue
     del ctx
     del krn
     del prg
     gc.collect()
Example #16
0
 def test_program_info(self):
     platforms = cl.Platforms()
     ctx = platforms.create_some_context()
     prg = ctx.create_program(self.src_test, self.include_dirs)
     self.assertGreater(prg.reference_count, 0)
     try:
         self.assertEqual(prg.num_kernels, 1)
         names = prg.kernel_names
         self.assertIsInstance(names, list)
         self.assertEqual(len(names), 1)
         self.assertEqual(names[0], "test")
     except cl.CLRuntimeError as e:
         if prg.devices[0].version >= 1.2:
             raise
         self.assertEqual(e.code, -30)
     bins = prg.binaries
     self.assertEqual(len(bins), 1)
     self.assertIsInstance(bins[0], bytes)
     self.assertGreater(len(bins[0]), 0)
Example #17
0
def load_kernels():
    global cl_image_processing_kernel, cl_queue, cl_context

    platforms = cl.Platforms()
    cuda_platform = None

    for p in platforms:
        # It is hard to determine the device with the most power.
        # As my machines only have nvidia graphics cards, just filter for the nvidia CUDA platform
        if 'cuda' in p.name.lower():
            cuda_platform = p
            break

    if cuda_platform is None:
        print('No suitable device found. Exiting.')
        exit(0)

    device = cuda_platform.devices[0]
    print('Chosen OpenCL device: {0}'.format(cuda_platform.devices[0].name))
    cl_context = cuda_platform.create_context([device])

    cl_queue = cl_context.create_queue(device)
    program = cl_context.create_program(
         """
        __kernel void imageProcessing(__global const uchar* rgbImage, __global const float* parameters, __global float* grayImage) {{
            size_t imgIdx = get_global_id(0);
            size_t rgbIdx = {0} * imgIdx; 
            size_t grayIdx = {1} * imgIdx;
            size_t paramIdx = 2 * imgIdx;
            for (int y = 0; y < {2}; y++) {{
                for (int x = 0; x < {3}; x++) {{
                    float gray = 0.21 * rgbImage[rgbIdx] + 0.72 * rgbImage[rgbIdx + 1] + 0.07 * rgbImage[rgbIdx + 2];
                    float grayEqualized = (gray - parameters[paramIdx]) * 255.0 / parameters[paramIdx + 1];
                    grayImage[grayIdx] = (grayEqualized - 128.0) / 128.0;
                    grayIdx++;
                    rgbIdx += 3;
                }}
            }}
        }}
        """.format(3*IMG_WIDTH*IMG_HEIGHT, IMG_WIDTH*IMG_HEIGHT, IMG_WIDTH, IMG_HEIGHT))

    cl_image_processing_kernel = program.get_kernel('imageProcessing')
Example #18
0
    def test_work_group_info(self):
        ctx = cl.Platforms().create_some_context()
        prg = ctx.create_program(self.src_test, self.include_dirs)
        krn = prg.get_kernel("test")
        info = krn.get_work_group_info(ctx.devices[0])

        self.assertRaises(cl.CLRuntimeError, getattr, info, "global_work_size")

        for vle in (info.compile_work_group_size, ):
            self.assertIsInstance(vle, tuple)
            self.assertEqual(len(vle), 3)
            for x in vle:
                self.assertIsInstance(x, int)
                self.assertGreaterEqual(x, 0)

        for vle in (info.work_group_size, info.local_mem_size,
                    info.preferred_work_group_size_multiple,
                    info.private_mem_size):
            self.assertIsInstance(vle, int)
            self.assertGreaterEqual(vle, 0)
Example #19
0
 def test_create_queue_with_properties(self):
     ctx = cl.Platforms().create_some_context()
     try:
         queue = ctx.create_queue(
             ctx.devices[0],
             cl.CL_QUEUE_ON_DEVICE |
             cl.CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE,
             properties={cl.CL_QUEUE_SIZE: 64})
         del queue
     except cl.CLRuntimeError:
         if ctx.devices[0].version >= 2.0:
             raise
         return
     queue = ctx.create_queue(
         ctx.devices[0],
         properties={cl.CL_QUEUE_SIZE: 64,
                     cl.CL_QUEUE_PROPERTIES:
                     cl.CL_QUEUE_ON_DEVICE |
                     cl.CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE})
     del queue
Example #20
0
    def test_event_profiling(self):
        import numpy
        # Create platform, context, program, kernel and queue
        platforms = cl.Platforms()
        ctx = platforms.create_some_context()
        prg = ctx.create_program(self.src_test, self.include_dirs)
        krn = prg.get_kernel("test")
        queue = ctx.create_queue(ctx.devices[0], cl.CL_QUEUE_PROFILING_ENABLE)

        # Create arrays with some values for testing
        a = numpy.arange(100000, dtype=numpy.float32)
        b = numpy.cos(a)
        a = numpy.sin(a)
        c = numpy.array([1.2345], dtype=numpy.float32)

        # Create buffers
        a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR,
                               a)
        b_ = ctx.create_buffer(cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR,
                               b)

        # Set kernel arguments
        krn.set_arg(0, a_)
        krn.set_arg(1, b_)
        krn.set_arg(2, c[0:1])

        # Execute kernel
        ev = queue.execute_kernel(krn, [a.size], None)
        ev.wait()

        try:
            vles, errs = ev.get_profiling_info()
            self.assertEqual(vles, ev.profiling_values)
            self.assertEqual(errs, ev.profiling_errors)
        except cl.CLRuntimeError:
            pass
        for name, vle in ev.profiling_values.items():
            err = ev.profiling_errors[name]
            self.assertTrue((vle and not err) or (not vle and err))
            self.assertEqual(type(vle), float)
            self.assertEqual(type(err), int)
Example #21
0
 def test_svm_alloc(self):
     ctx = cl.Platforms().create_some_context()
     if ctx.devices[0].version < 2.0:
         return
     svm = ctx.svm_alloc(cl.CL_MEM_READ_WRITE, 4096)
     svm.release()
     self.assertIsNone(svm.handle)
     del svm
     svm = ctx.svm_alloc(cl.CL_MEM_READ_WRITE, 4096)
     prg = ctx.create_program("""
         __kernel void test(__global void *p) {
             __global int *ptr = (__global int*)p;
             *ptr += 1;
         }
         """,
                              options="-cl-std=CL2.0")
     krn = prg.get_kernel("test")
     krn.set_arg(0, svm)
     krn.set_arg_svm(0, svm)
     queue = ctx.create_queue(ctx.devices[0])
     queue.svm_map(svm, cl.CL_MAP_WRITE_INVALIDATE_REGION, 4)
     p = cl.get_ffi().cast("int*", svm.handle)
     p[0] = 2
     queue.svm_unmap(svm)
     queue.execute_kernel(krn, [1], None)
     queue.svm_map(svm, cl.CL_MAP_READ, 4)
     self.assertEqual(p[0], 3)
     # always ensure that the last unmap had completed before
     # the svm destructor
     queue.svm_unmap(svm).wait()
     try:
         import numpy
         a = numpy.frombuffer(svm.buffer, dtype=numpy.int32)
         queue.execute_kernel(krn, [1], None)
         queue.svm_map(svm, cl.CL_MAP_READ, 4)
         self.assertEqual(a[0], 4)
         queue.svm_unmap(svm).wait()
     except ImportError:
         pass
     del svm  # svm destructor here
Example #22
0
 def test_device_info(self):
     platforms = cl.Platforms()
     ctx = platforms.create_some_context()
     dev = ctx.devices[0]
     self.assertGreater(dev.max_work_item_dimensions, 0)
     self.assertEqual(len(dev.max_work_item_sizes),
                      dev.max_work_item_dimensions)
     for size in dev.max_work_item_sizes:
         self.assertGreater(size, 0)
     self.assertIsInstance(dev.driver_version.encode("utf-8"), bytes)
     self.assertGreater(len(dev.driver_version), 0)
     try:
         self.assertIsInstance(dev.built_in_kernels, list)
         for krn in dev.built_in_kernels:
             self.assertIsInstance(krn, str)
             self.assertGreater(len(krn), 0)
     except cl.CLRuntimeError as e:
         if dev.version >= 1.2:
             raise
         self.assertEqual(e.code, -30)
     self.assertIsInstance(dev.extensions, list)
     for ext in dev.extensions:
         self.assertIsInstance(ext.encode("utf-8"), bytes)
         self.assertGreater(len(ext), 0)
     self.assertGreater(dev.preferred_vector_width_int, 0)
     self.assertGreater(dev.max_work_group_size, 1)
     self.assertTrue(dev.available)
     try:
         self.assertTrue(type(dev.pipe_max_active_reservations) == int)
         self.assertTrue(type(dev.pipe_max_packet_size) == int)
         self.assertTrue(type(dev.svm_capabilities) == int)
         self.assertTrue(
             type(dev.preferred_platform_atomic_alignment) == int)
         self.assertTrue(type(dev.preferred_global_atomic_alignment) == int)
         self.assertTrue(type(dev.preferred_local_atomic_alignment) == int)
     except cl.CLRuntimeError as e:
         if dev.version >= 2.0:
             raise
         self.assertEqual(e.code, -30)
Example #23
0
 def _test_gemm(self, gemm, dtype):
     ctx = cl.Platforms().create_some_context()
     queue = ctx.create_queue(ctx.devices[0])
     a = numpy.zeros([127, 353], dtype=dtype)
     b = numpy.zeros([135, a.shape[1]], dtype=dtype)
     c = numpy.zeros([a.shape[0], b.shape[0]], dtype=dtype)
     numpy.random.seed(numpy.array([123], dtype=numpy.int32)[0])
     a[:] = numpy.random.rand(a.size).astype(dtype).reshape(a.shape)
     b[:] = numpy.random.rand(b.size).astype(dtype).reshape(b.shape)
     gold_c = numpy.dot(a, b.transpose())
     a_buf = ctx.create_buffer(
         cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR, a)
     b_buf = ctx.create_buffer(
         cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR, b)
     c_buf = ctx.create_buffer(
         cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR, c)
     gemm([queue], blas.clblasRowMajor, blas.clblasNoTrans,
          blas.clblasTrans, a.shape[0], b.shape[0], a.shape[1], 1.0, a_buf,
          b_buf, 0.0, c_buf)
     queue.flush()
     queue.read_buffer(c_buf, c)
     max_diff = numpy.fabs(c - gold_c).max()
     self.assertLess(max_diff,
                     0.00001 if dtype == numpy.float64 else 0.00015)
'''
This module simply prints your OpenCL platforms and devices to screen.
'''

import os
import opencl4py as cl
os.environ["PYOPENCL_CTX"] = "0:0"  # This is where you choose a device number

# Print all platforms and devices
platforms = cl.Platforms()
print(platforms.dump_devices())
Example #25
0
    def test():
        print(os.environ.get("PYOPENCL_CTX"))
        os.environ["PYOPENCL_CTX"] = "0:0"
        # Create platform, context, program, kernel and queue
        platforms = cl.Platforms()
        print("OpenCL devices:\n%s" % platforms.dump_devices())
        ctx = platforms.create_some_context()
        queue = ctx.create_queue(ctx.devices[0], cl.CL_QUEUE_PROFILING_ENABLE)
        '''
        prg = ctx.create_program(
        """
        __kernel void test(
                __global float *a, 
                __global float *b, 
                const float c) 
        {
          size_t i = get_global_id(0);
          a[i] = (a[i] + b[i]) * c;
        }
        """)
        '''
        prg = ctx.create_program(testopencl.readoclfile("test.cl"))
        krn = prg.get_kernel("test")

        # Create arrays with some values for testing
        a = np.arange(100000, dtype=np.float32)
        b = np.cos(a)
        a = np.sin(a)
        a_copy = a.copy()
        # Prepare arrays for use with map_buffer
        a = cl.realign_array(a, queue.device.memalign, np)
        b = cl.realign_array(b, queue.device.memalign, np)
        c = np.array([0.1], dtype=np.float32)
        d = (a + b) * c[0]
        # Create buffers
        a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR,
                               a)
        b_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR,
                               b)
        # Set kernel arguments
        krn.set_args(a_, b_, c[0:1])
        # Execute kernel
        global_size = [a.size]
        local_size = None
        queue.execute_kernel(krn, global_size, local_size, need_event=False)
        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a.nbytes)
        del ev
        queue.unmap_buffer(a_, ptr).wait()
        print(a - d)
        aa = np.zeros(a.shape, dtype=a.dtype)
        queue.read_buffer(a_, aa)
        print(aa - d)
        # Refill buffer with stored copy by write_buffer
        ev = queue.write_buffer(a_, a_copy, blocking=False, need_event=True)
        # Execute kernel
        ev = queue.execute_kernel(krn,
                                  global_size,
                                  local_size,
                                  wait_for=(ev, ))
        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_,
                                   cl.CL_MAP_READ,
                                   a.nbytes,
                                   wait_for=(ev, ),
                                   need_event=True)
        ev.wait()
        queue.unmap_buffer(a_, ptr).wait()
        print(a - d)
        bb = np.zeros(a.shape, dtype=a.dtype)
        queue.read_buffer(a_, bb)
        print(bb - d)
        del queue
        del ctx
        del krn
        del prg
        gc.collect()
Example #26
0
    def matrixmul():
        os.environ["PYOPENCL_CTX"] = "0:0"
        platforms = cl.Platforms()
        print("OpenCL devices:\n%s" % platforms.dump_devices())
        #ctx = platforms.create_some_context()
        ctx = cl.Context(platforms.platforms[0],
                         platforms.platforms[0].devices[0:1])
        prg = ctx.create_program(testopencl.readoclfile("test.cl"))
        print(prg.kernel_names)
        krn = prg.get_kernel("MatrixMul")
        print(krn.attributes)
        queue = ctx.create_queue(ctx.devices[0])

        iHeightA = np.array([800], dtype=np.int32)
        iWidthA = np.array([500], dtype=np.int32)
        pInMatA = np.arange(iHeightA[0] * iWidthA[0], dtype=np.float32)

        iHeightB = np.array([500], dtype=np.int32)
        iWidthB = np.array([800], dtype=np.int32)
        pInMatB = np.arange(iHeightB[0] * iWidthB[0], dtype=np.float32)

        pOutMat = np.empty(iHeightA[0] * iWidthB[0], dtype=np.float32)

        pInMatA_buf = ctx.create_buffer(
            cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, pInMatA)
        pInMatB_buf = ctx.create_buffer(
            cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, pInMatB)
        pOutMat_buf = ctx.create_buffer(cl.CL_MEM_READ_WRITE
                                        | cl.CL_MEM_ALLOC_HOST_PTR,
                                        size=pOutMat.nbytes)

        krn.set_args(iHeightA[0:1], iWidthA[0:1], pInMatA_buf, iHeightB[0:1],
                     iWidthB[0:1], pInMatB_buf, pOutMat_buf)
        global_size = [pInMatA.size, pInMatB.size]
        local_size = None
        for i in range(10):
            start = time.time()
            ev = queue.execute_kernel(krn,
                                      global_size,
                                      local_size,
                                      need_event=True)
            t1 = time.time() - start
            #ev, ptr = queue.map_buffer(pOutMat_buf, cl.CL_MAP_READ, pOutMat.nbytes)

            #queue.unmap_buffer(pOutMat_buf, ptr).wait()
            queue.read_buffer(pOutMat_buf, pOutMat)

            data1 = np.reshape(pOutMat, (iHeightA[0], iWidthB[0]))
            print(data1[0][1:5])
            start = time.time()
            data2 = np.dot(np.reshape(pInMatA, (iHeightA[0], iWidthA[0])),
                           np.reshape(pInMatB, (iHeightB[0], iWidthB[0])))
            t2 = time.time() - start
            pInMatA += pInMatA
            ev = queue.write_buffer(pInMatA_buf,
                                    pInMatA,
                                    blocking=False,
                                    need_event=True)
            ev = queue.write_buffer(pInMatB_buf,
                                    pInMatB,
                                    blocking=False,
                                    need_event=True)
            print(data2[0][1:5])
            print(t1, t2)
        del queue
        del ctx
        del krn
        del prg
        gc.collect()
Example #27
0
    def test_api_numpy(self):
        import numpy
        # Create platform, context, program, kernel and queue
        platforms = cl.Platforms()
        ctx = platforms.create_some_context()
        prg = ctx.create_program(self.src_test, self.include_dirs)
        krn = prg.get_kernel("test")
        queue = ctx.create_queue(ctx.devices[0])

        # Create arrays with some values for testing
        a = numpy.arange(100000, dtype=numpy.float32)
        b = numpy.cos(a)
        a = numpy.sin(a)
        a_copy = a.copy()

        # Prepare arrays for use with map_buffer
        a = cl.realign_array(a, queue.device.memalign, numpy)
        b = cl.realign_array(b, queue.device.memalign, numpy)
        c = numpy.array([1.2345], dtype=numpy.float32)
        d = a + b * c[0]

        # Create buffers
        a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR,
                               a)
        b_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR,
                               b)

        # Set kernel arguments
        krn.set_args(a_, b_, c[0:1])

        # Execute kernel
        global_size = [a.size]
        local_size = None
        queue.execute_kernel(krn, global_size, local_size, need_event=False)

        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a.nbytes)
        del ev
        queue.unmap_buffer(a_, ptr).wait()
        self.assertLess(
            numpy.fabs(a - d).max(), 0.0001,
            "Incorrect result after map_buffer")

        # Get results back from the device by read_buffer
        aa = numpy.zeros(a.shape, dtype=a.dtype)
        queue.read_buffer(a_, aa)
        self.assertLess(
            numpy.fabs(aa - d).max(), 0.0001,
            "Incorrect result after read_buffer")

        # Refill buffer with stored copy by map_buffer with event
        ev, ptr = queue.map_buffer(
            a_,
            cl.CL_MAP_WRITE if queue.device.version < 1.1999 else
            cl.CL_MAP_WRITE_INVALIDATE_REGION,
            a.nbytes,
            blocking=False,
            need_event=True)
        ev.wait()
        a[:] = a_copy[:]
        ev = queue.unmap_buffer(a_, ptr)

        # Execute kernel
        ev = queue.execute_kernel(krn,
                                  global_size,
                                  local_size,
                                  wait_for=(ev, ))
        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_,
                                   cl.CL_MAP_READ,
                                   a.nbytes,
                                   wait_for=(ev, ),
                                   need_event=True)
        ev.wait()
        queue.unmap_buffer(a_, ptr).wait()
        self.assertLess(
            numpy.fabs(a - d).max(), 0.0001,
            "Incorrect result after map_buffer")

        # Refill buffer with stored copy by write_buffer
        ev = queue.write_buffer(a_, a_copy, blocking=False, need_event=True)

        # Execute kernel
        ev = queue.execute_kernel(krn,
                                  global_size,
                                  local_size,
                                  wait_for=(ev, ))
        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_,
                                   cl.CL_MAP_READ,
                                   a.nbytes,
                                   wait_for=(ev, ),
                                   need_event=True)
        ev.wait()
        queue.unmap_buffer(a_, ptr).wait()
        self.assertLess(
            numpy.fabs(a - d).max(), 0.0001,
            "Incorrect result after map_buffer")
Example #28
0
    def test_api_nonumpy(self):
        import math

        # Create platform, context, program, kernel and queue
        platforms = cl.Platforms()
        ctx = platforms.create_some_context()
        prg = ctx.create_program(self.src_test, self.include_dirs)
        krn = prg.get_kernel("test")
        # Create command queue
        queue = ctx.create_queue(ctx.devices[0])

        # Create arrays with some values for testing
        N = 100000
        ffi = cl.get_ffi()
        _a = ffi.new("float[]", N + queue.device.memalign)
        sz = int(ffi.cast("size_t", _a))
        if sz % queue.device.memalign != 0:
            sz += queue.device.memalign - (sz % queue.device.memalign)
            a = ffi.cast("float*", sz)
        else:
            a = _a
        _b = ffi.new("float[]", N + queue.device.memalign)
        sz = int(ffi.cast("size_t", _b))
        if sz % queue.device.memalign != 0:
            sz += queue.device.memalign - (sz % queue.device.memalign)
            b = ffi.cast("float*", sz)
        else:
            b = _b
        c = ffi.new("float[]", 1)
        c[0] = 1.2345
        d = ffi.new("float[]", N)
        sz = ffi.sizeof(d)
        for i, t in enumerate(d):
            a[i] = math.sin(i)
            b[i] = math.cos(i)
            d[i] = a[i] + b[i] * c[0]
        a_copy = ffi.new("float[]", N)
        a_copy[0:N] = a[0:N]

        # Create buffers
        a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR,
                               a,
                               size=sz)
        b_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR,
                               b,
                               size=sz)

        # Set kernel arguments
        krn.set_arg(0, a_)
        krn.set_arg(1, b_)
        krn.set_arg(2, ffi.cast("const void*", c), ffi.sizeof(c))

        # Execute kernel
        global_size = [N]
        local_size = None
        queue.execute_kernel(krn, global_size, local_size, need_event=False)

        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, sz)
        del ev
        queue.unmap_buffer(a_, ptr).wait()
        mx = 0
        for i, t in enumerate(d):
            mx = max(mx, math.fabs(a[i] - t))
        self.assertLess(mx, 0.0001, "Incorrect result after map_buffer")

        # Get results back from the device by read_buffer
        aa = ffi.new("float[]", N)
        queue.read_buffer(a_, aa, size=sz)
        mx = 0
        for i, t in enumerate(d):
            mx = max(mx, math.fabs(aa[i] - t))
        self.assertLess(mx, 0.0001, "Incorrect result after read_buffer")

        # Refill buffer with stored copy by map_buffer with event
        ev, ptr = queue.map_buffer(
            a_,
            cl.CL_MAP_WRITE if queue.device.version < 1.1999 else
            cl.CL_MAP_WRITE_INVALIDATE_REGION,
            sz,
            blocking=False,
            need_event=True)
        ev.wait()
        a[0:N] = a_copy[0:N]
        ev = queue.unmap_buffer(a_, ptr)

        # Execute kernel
        ev = queue.execute_kernel(krn,
                                  global_size,
                                  local_size,
                                  wait_for=(ev, ))
        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_,
                                   cl.CL_MAP_READ,
                                   sz,
                                   wait_for=(ev, ),
                                   need_event=True)
        ev.wait()
        queue.unmap_buffer(a_, ptr).wait()
        mx = 0
        for i, t in enumerate(d):
            mx = max(mx, math.fabs(a[i] - t))
        self.assertLess(mx, 0.0001, "Incorrect result after map_buffer")

        # Refill buffer with stored copy by write_buffer
        ev = queue.write_buffer(a_,
                                a_copy,
                                size=sz,
                                blocking=False,
                                need_event=True)

        # Execute kernel
        ev = queue.execute_kernel(krn,
                                  global_size,
                                  local_size,
                                  wait_for=(ev, ))
        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_,
                                   cl.CL_MAP_READ,
                                   sz,
                                   wait_for=(ev, ),
                                   need_event=True)
        ev.wait()
        queue.unmap_buffer(a_, ptr).wait()
        mx = 0
        for i, t in enumerate(d):
            mx = max(mx, math.fabs(a[i] - t))
        self.assertLess(mx, 0.0001, "Incorrect result after map_buffer")

        del _b
        del _a
Example #29
0
 def test_create_some_context(self):
     platforms = cl.Platforms()
     ctx = platforms.create_some_context()
     del ctx
Example #30
0
    def test_create_sub_buffer(self):
        import numpy
        # Create platform, context, program, kernel and queue
        platforms = cl.Platforms()
        ctx = platforms.create_some_context()
        prg = ctx.create_program(self.src_test, self.include_dirs)
        krn = prg.get_kernel("test")
        queue = ctx.create_queue(ctx.devices[0])

        # Create arrays with some values for testing
        a = numpy.arange(100000, dtype=numpy.float32)
        b = numpy.cos(a)
        a = numpy.sin(a)

        # Prepare arrays for use with map_buffer
        a = cl.realign_array(a, queue.device.memalign, numpy)
        b = cl.realign_array(b, queue.device.memalign, numpy)
        c = numpy.array([1.2345], dtype=numpy.float32)
        d = a[1024:1024 + 4096] + b[2048:2048 + 4096] * c[0]

        # Create buffers
        a_parent_ = ctx.create_buffer(
            cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, a)
        self.assertEqual(a_parent_._n_refs, 1)
        a_ = a_parent_.create_sub_buffer(4096, 16384)
        self.assertEqual(a_parent_._n_refs, 2)
        self.assertEqual(a_._n_refs, 1)
        b_parent_ = ctx.create_buffer(
            cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, b)
        self.assertEqual(b_parent_._n_refs, 1)
        b_ = b_parent_.create_sub_buffer(8192, 16384)
        self.assertEqual(b_parent_._n_refs, 2)
        self.assertEqual(b_._n_refs, 1)

        # Set kernel arguments
        krn.set_args(a_, b_, c[0:1])

        # Execute kernel
        global_size = [4096]
        local_size = None
        queue.execute_kernel(krn, global_size, local_size, need_event=False)

        # Get results back from the device by map_buffer
        ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a_.size)
        del ev
        queue.unmap_buffer(a_, ptr).wait()
        self.assertLess(
            numpy.fabs(a[1024:1024 + 4096] - d).max(), 0.0001,
            "Incorrect result after map_buffer")

        # Get results back from the device by read_buffer
        aa = numpy.zeros(4096, dtype=numpy.float32)
        queue.read_buffer(a_, aa)
        self.assertLess(
            numpy.fabs(aa - d).max(), 0.0001,
            "Incorrect result after read_buffer")

        del b_
        self.assertIn(b_parent_._n_refs, (1, 2))
        logging.info(
            "test_create_sub_buffer: "
            "b_parent_._n_refs = %d (expected 1 or 2)", b_parent_._n_refs)
        del a_
        self.assertIn(a_parent_._n_refs, (1, 2))
        logging.info(
            "test_create_sub_buffer: "
            "a_parent_._n_refs = %d (expected 1 or 2)", a_parent_._n_refs)