def test_copy_buffer(self): import numpy # Create platform, context and queue platforms = cl.Platforms() ctx = platforms.create_some_context() queue = ctx.create_queue(ctx.devices[0]) # Create arrays with some values for testing a = numpy.arange(10000, dtype=numpy.float32) b = a * 0.5 c = numpy.empty_like(b) c[:] = 1.0e30 # Create buffers a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR, a) b_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR, b) # Copy some data from one buffer to another sz = a.itemsize queue.copy_buffer(a_, b_, 1000 * sz, 2000 * sz, 3000 * sz).wait() queue.read_buffer(b_, c) diff = numpy.fabs(c[2000:5000] - a[1000:4000]).max() self.assertEqual(diff, 0)
def test_set_kernel_args(self): import numpy platforms = cl.Platforms() ctx = platforms.create_some_context() prg = ctx.create_program(self.src_test, self.include_dirs) krn = prg.get_kernel("test") queue = ctx.create_queue(ctx.devices[0]) global_size = [12345] local_size = None krn.set_args(cl.skip(3)) self.assertRaises(cl.CLRuntimeError, queue.execute_kernel, krn, global_size, local_size) krn.set_args(cl.skip, cl.skip, cl.skip) self.assertRaises(cl.CLRuntimeError, queue.execute_kernel, krn, global_size, local_size) krn.set_args(cl.skip(1), cl.skip(1), cl.skip(1)) self.assertRaises(cl.CLRuntimeError, queue.execute_kernel, krn, global_size, local_size) krn.set_args(cl.skip(1000)) self.assertRaises(cl.CLRuntimeError, queue.execute_kernel, krn, global_size, local_size) self.assertRaises(ValueError, cl.skip, 0) self.assertRaises(ValueError, cl.skip, -1) c = numpy.array([1.2345], dtype=numpy.float32) krn.set_args(cl.skip(2), c) self.assertRaises(cl.CLRuntimeError, queue.execute_kernel, krn, global_size, local_size) krn.set_args(cl.skip, cl.skip, c) self.assertRaises(cl.CLRuntimeError, queue.execute_kernel, krn, global_size, local_size)
def test_fill_buffer(self): # Create platform, context and queue platforms = cl.Platforms() ctx = platforms.create_some_context() if ctx.devices[0].version < 1.2: return queue = ctx.create_queue(ctx.devices[0]) import numpy # Create array a = numpy.zeros(4096, dtype=numpy.int32) # Create buffer a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR, a) # Fill the buffer pattern = numpy.array([1, 2, 3, 4], dtype=numpy.int32) queue.fill_buffer(a_, pattern, pattern.nbytes, a.nbytes).wait() queue.read_buffer(a_, a) diff = 0 for i in range(0, a.size, pattern.size): diff += numpy.fabs(a[i:i + pattern.size] - pattern).sum() self.assertEqual(diff, 0)
def test1(): logging.basicConfig(level=logging.DEBUG) platforms = cl.Platforms() print("OpenCL devices:\n\n%s\n" % platforms.dump_devices()) ctx = platforms.create_some_context() queue = ctx.create_queue(ctx.devices[0]) prg = ctx.create_program(""" __kernel void test(__global const float *a, __global const float *b, __global float *c, const float k) { size_t i = get_global_id(0); c[i] = (a[i] + b[i]) * k; } """) krn = prg.get_kernel("test") a = np.arange(1000000, dtype=np.float32) b = np.arange(1000000, dtype=np.float32) c = np.empty(1000000, dtype=np.float32) k = np.array([0.5], dtype=np.float32) a_buf = ctx.create_buffer( cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, a) b_buf = ctx.create_buffer( cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, b) c_buf = ctx.create_buffer(cl.CL_MEM_WRITE_ONLY | cl.CL_MEM_ALLOC_HOST_PTR, size=c.nbytes) krn.set_arg(0, a_buf) krn.set_arg(1, b_buf) krn.set_arg(2, c_buf) krn.set_arg(3, k[0:1]) queue.execute_kernel(krn, [a.size], None) queue.read_buffer(c_buf, c) diff = np.fabs(c - (a + b) * k[0]) print(diff)
def test_copy_buffer_rect(self): import numpy # Create platform, context and queue platforms = cl.Platforms() ctx = platforms.create_some_context() queue = ctx.create_queue(ctx.devices[0]) # Create arrays with some values for testing a = numpy.arange(35 * 25 * 15, dtype=numpy.float32).reshape(35, 25, 15) b = numpy.arange(37 * 27 * 17, dtype=numpy.float32).reshape(37, 27, 17) b *= 0.5 c = numpy.empty_like(b) c[:] = 1.0e30 # Create buffers a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR, a) b_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR, b) # Copy 3D rect from one buffer to another sz = a.itemsize queue.copy_buffer_rect(a_, b_, (3 * sz, 4, 5), (6 * sz, 7, 8), (5 * sz, 10, 20), a.shape[2] * sz, a.shape[1] * a.shape[2] * sz, b.shape[2] * sz, b.shape[1] * b.shape[2] * sz).wait() queue.read_buffer(b_, c) diff = numpy.fabs(c[8:28, 7:17, 6:11] - a[5:25, 4:14, 3:8]).max() self.assertEqual(diff, 0)
def test_binary(self): platforms = cl.Platforms() ctx = platforms.create_some_context() prg = ctx.create_program(self.src_test, self.include_dirs) binary = prg.binaries[0] prg = ctx.create_program([binary], binary=True) krn = prg.get_kernel("test") del krn
def test_set_arg_None(self): import numpy # Create platform, context, program, kernel and queue platforms = cl.Platforms() ctx = platforms.create_some_context() src = """ __kernel void test(__global float *a, __global const float *b, __global const float *c) { int idx = get_global_id(0); a[idx] += b[idx] + (c ? c[idx] : 0); } """ prg = ctx.create_program(src) krn = prg.get_kernel("test") queue = ctx.create_queue(ctx.devices[0]) # Create arrays with some values for testing a = numpy.array([1, 2, 3, 4, 5], dtype=numpy.float32) b = numpy.array([6, 7, 8, 9, 10], dtype=numpy.float32) c = numpy.array([11, 12, 13, 14, 15], dtype=numpy.float32) # Create buffers a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR, a) b_ = ctx.create_buffer(cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, b) c_ = ctx.create_buffer(cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, c) # Set kernel arguments krn.set_arg(0, a_) krn.set_arg(1, b_) krn.set_arg(2, c_) # Execute kernel queue.execute_kernel(krn, [a.size], None).wait() # Get results back d = numpy.zeros_like(a) queue.read_buffer(a_, d) t = a + b + c diff = numpy.fabs(d - t).max() self.assertEqual(diff, 0) # Set arg to None krn.set_arg(2, None) # Execute kernel queue.execute_kernel(krn, [a.size], None).wait() # Get results back queue.read_buffer(a_, d) t += b diff = numpy.fabs(d - t).max() self.assertEqual(diff, 0)
def test_kernel_info(self): platforms = cl.Platforms() ctx = platforms.create_some_context() prg = ctx.create_program(self.src_test, self.include_dirs) krn = prg.get_kernel("test") self.assertGreater(krn.reference_count, 0) self.assertEqual(krn.num_args, 3) try: self.assertEqual(krn.attributes, "vec_type_hint(float4)") except cl.CLRuntimeError as e: self.assertEqual(e.code, -30)
def arg_completer(prefix, **kwargs): def format_device(plf, dev): return "%s - %s on %s" % (dev.path, dev.name.strip(), plf.name) if prefix.strip() == "": platforms = cl.Platforms().platforms if len(platforms) == 1 and len(platforms[0].devices) == 1: return ["0:0"] result = [] for platform in platforms: for device in platform: result.append(format_device(platform, device)) return result parsed = [p for p in prefix.split(':') if p.strip() != ""] platform = cl.Platforms().platforms[int(parsed[0].strip())] if len(parsed) == 1: if len(platform.devices) == 1: return [platform.devices[0].path] result = [] for device in platform: result.append(format_device(platform, device)) return result
def testadd(): os.environ["PYOPENCL_CTX"] = "0:0" platforms = cl.Platforms() print("OpenCL devices:\n%s" % platforms.dump_devices()) #ctx = platforms.create_some_context() ctx = cl.Context(platforms.platforms[0], platforms.platforms[0].devices[0:1]) dev = ctx.devices[0] print("version=%s,\ngroup_size=%s" % (dev.version, dev.max_work_group_size)) #prg = ctx.create_program(src_test, include_dirs) prg = ctx.create_program(testopencl.readoclfile("test.cl")) bins = prg.binaries[0] print(prg.kernel_names) #print(bins) krn = prg.get_kernel("testadd") print(krn.attributes) queue = ctx.create_queue(ctx.devices[0]) a = np.arange(100, dtype=np.float32) b = np.arange(100, dtype=np.float32) c = np.empty(100, dtype=np.float32) k = np.array([0.5], dtype=np.float32) a_buf = ctx.create_buffer( cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, a) b_buf = ctx.create_buffer( cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, b) c_buf = ctx.create_buffer(cl.CL_MEM_WRITE_ONLY | cl.CL_MEM_ALLOC_HOST_PTR, size=c.nbytes) ''' krn.set_arg(0, a_buf) krn.set_arg(1, b_buf) krn.set_arg(2, c_buf) krn.set_arg(3, k[0:1]) ''' krn.set_args(a_buf, b_buf, c_buf, k[0:1]) ev = queue.execute_kernel(krn, [a.size], None) queue.read_buffer(c_buf, c) diff = c - (a * k[0] + b * k[0]) * k[0] #print(a) #print(c) print(diff) del queue del ctx del krn del prg gc.collect()
def _get_some_device(self, **kwargs): """Gets some device from the available OpenCL devices. Returns True if any device was selected, otherwise, False. """ device = self.parse_device(**kwargs) try: platforms = cl.Platforms() except cl.CLRuntimeError: platforms = None if platforms is None or len(platforms.platforms) == 0: raise DeviceNotFoundError("No OpenCL devices were found") self._id = device if device == "": context = platforms.create_some_context() else: platfnum, devnums = device.split(':') try: platform = platforms.platforms[int(platfnum)] except IndexError: raise from_none( DeviceNotFoundError("Device %s was not found." % device)) context = platform.create_context([ platform.devices[int(devnum)] for devnum in devnums.split(',') ]) if "NVIDIA" in context.platform.name: def fail(*args, **kwargs): raise RuntimeError("fork() breaks NVIDIA OpenCL") os.fork = fail import subprocess subprocess.Popen = fail device = context.devices[0] desc = "%s/%s" % (device.vendor.strip(), device.name.strip()) self.queue_ = context.create_queue(device) self.device_info = DeviceInfo( desc=desc, memsize=device.memsize, memalign=device.memalign, version=device.version, device_type=device.type, max_work_group_size=self.queue_.device.max_work_group_size, max_work_item_sizes=self.queue_.device.max_work_item_sizes, local_memsize=self.queue_.device.local_memsize) return True
def test_svm_memcpy(self): ctx = cl.Platforms().create_some_context() if ctx.devices[0].version < 2.0: return svm = ctx.svm_alloc(cl.CL_MEM_READ_WRITE, 4096) import numpy a = numpy.frombuffer(svm.buffer, dtype=numpy.int32) queue = ctx.create_queue(ctx.devices[0]) queue.svm_map(svm, cl.CL_MAP_WRITE_INVALIDATE_REGION, svm.size) a[:] = numpy.arange(a.size, dtype=a.dtype) queue.svm_unmap(svm) n = a.size // 2 queue.svm_memcpy(a[n:], a, n * a.itemsize) queue.svm_map(svm, cl.CL_MAP_READ, svm.size) self.assertEqual(numpy.fabs(a[n:] - a[:n]).max(), 0) queue.svm_unmap(svm).wait() del svm
def test_svm_memfill(self): ctx = cl.Platforms().create_some_context() if ctx.devices[0].version < 2.0: return svm = ctx.svm_alloc(cl.CL_MEM_READ_WRITE, 4096) import numpy a = numpy.frombuffer(svm.buffer, dtype=numpy.int32) queue = ctx.create_queue(ctx.devices[0]) pattern = numpy.array([1, 2, 3, 4], dtype=numpy.int32) queue.svm_memfill(a, pattern, pattern.nbytes, a.nbytes) queue.svm_map(svm, cl.CL_MAP_READ, svm.size) diff = 0 for i in range(0, a.size, pattern.size): diff += numpy.fabs(a[i:i + pattern.size] - pattern).sum() self.assertEqual(diff, 0) queue.svm_unmap(svm).wait() del svm
def test_create_pipe(self): ctx = cl.Platforms().create_some_context() if ctx.devices[0].version < 2.0: return pipe = ctx.create_pipe(0, 8, 16) del pipe pipe = ctx.create_pipe(cl.CL_MEM_READ_WRITE, 8, 16) prg = ctx.create_program(""" __kernel void test(__write_only pipe int p) { int x = 0; write_pipe(p, &x); } """, options="-cl-std=CL2.0") krn = prg.get_kernel("test") krn.set_arg(0, pipe) del krn del prg del pipe
def testmul(): os.environ["PYOPENCL_CTX"] = "0:0" platforms = cl.Platforms() print("OpenCL devices:\n%s" % platforms.dump_devices()) ctx = platforms.create_some_context() prg = ctx.create_program(testopencl.readoclfile("test.cl")) print(prg.kernel_names) krn = prg.get_kernel("matmul") print(krn.attributes) queue = ctx.create_queue(ctx.devices[0]) a = np.arange(10, dtype=np.float32) b = np.arange(10, dtype=np.float32) c = np.empty(1000, dtype=np.float32) m = np.array([10], dtype=np.float32) p = np.array([10], dtype=np.float32) n = np.array([10], dtype=np.float32) a_buf = ctx.create_buffer( cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, a) b_buf = ctx.create_buffer( cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, b) c_buf = ctx.create_buffer(cl.CL_MEM_WRITE_ONLY | cl.CL_MEM_ALLOC_HOST_PTR, size=c.nbytes) ''' krn.set_arg(0, a_buf) krn.set_arg(1, b_buf) krn.set_arg(2, c_buf) krn.set_arg(3, m[0:1]) krn.set_arg(4, p[0:1]) krn.set_arg(5, n[0:1]) ''' krn.set_args(a_buf, b_buf, c_buf, m[0:1], p[0:1], c[0:1]) #queue.execute_kernel(krn, [a.size], None) #queue.read_buffer(c_buf, c) #diff = np.fabs(c - (a * k[0]+ b * k[0]) * k[0]) print(a) print(b) #print(diff) del queue del ctx del krn del prg gc.collect()
def test_program_info(self): platforms = cl.Platforms() ctx = platforms.create_some_context() prg = ctx.create_program(self.src_test, self.include_dirs) self.assertGreater(prg.reference_count, 0) try: self.assertEqual(prg.num_kernels, 1) names = prg.kernel_names self.assertIsInstance(names, list) self.assertEqual(len(names), 1) self.assertEqual(names[0], "test") except cl.CLRuntimeError as e: if prg.devices[0].version >= 1.2: raise self.assertEqual(e.code, -30) bins = prg.binaries self.assertEqual(len(bins), 1) self.assertIsInstance(bins[0], bytes) self.assertGreater(len(bins[0]), 0)
def load_kernels(): global cl_image_processing_kernel, cl_queue, cl_context platforms = cl.Platforms() cuda_platform = None for p in platforms: # It is hard to determine the device with the most power. # As my machines only have nvidia graphics cards, just filter for the nvidia CUDA platform if 'cuda' in p.name.lower(): cuda_platform = p break if cuda_platform is None: print('No suitable device found. Exiting.') exit(0) device = cuda_platform.devices[0] print('Chosen OpenCL device: {0}'.format(cuda_platform.devices[0].name)) cl_context = cuda_platform.create_context([device]) cl_queue = cl_context.create_queue(device) program = cl_context.create_program( """ __kernel void imageProcessing(__global const uchar* rgbImage, __global const float* parameters, __global float* grayImage) {{ size_t imgIdx = get_global_id(0); size_t rgbIdx = {0} * imgIdx; size_t grayIdx = {1} * imgIdx; size_t paramIdx = 2 * imgIdx; for (int y = 0; y < {2}; y++) {{ for (int x = 0; x < {3}; x++) {{ float gray = 0.21 * rgbImage[rgbIdx] + 0.72 * rgbImage[rgbIdx + 1] + 0.07 * rgbImage[rgbIdx + 2]; float grayEqualized = (gray - parameters[paramIdx]) * 255.0 / parameters[paramIdx + 1]; grayImage[grayIdx] = (grayEqualized - 128.0) / 128.0; grayIdx++; rgbIdx += 3; }} }} }} """.format(3*IMG_WIDTH*IMG_HEIGHT, IMG_WIDTH*IMG_HEIGHT, IMG_WIDTH, IMG_HEIGHT)) cl_image_processing_kernel = program.get_kernel('imageProcessing')
def test_work_group_info(self): ctx = cl.Platforms().create_some_context() prg = ctx.create_program(self.src_test, self.include_dirs) krn = prg.get_kernel("test") info = krn.get_work_group_info(ctx.devices[0]) self.assertRaises(cl.CLRuntimeError, getattr, info, "global_work_size") for vle in (info.compile_work_group_size, ): self.assertIsInstance(vle, tuple) self.assertEqual(len(vle), 3) for x in vle: self.assertIsInstance(x, int) self.assertGreaterEqual(x, 0) for vle in (info.work_group_size, info.local_mem_size, info.preferred_work_group_size_multiple, info.private_mem_size): self.assertIsInstance(vle, int) self.assertGreaterEqual(vle, 0)
def test_create_queue_with_properties(self): ctx = cl.Platforms().create_some_context() try: queue = ctx.create_queue( ctx.devices[0], cl.CL_QUEUE_ON_DEVICE | cl.CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, properties={cl.CL_QUEUE_SIZE: 64}) del queue except cl.CLRuntimeError: if ctx.devices[0].version >= 2.0: raise return queue = ctx.create_queue( ctx.devices[0], properties={cl.CL_QUEUE_SIZE: 64, cl.CL_QUEUE_PROPERTIES: cl.CL_QUEUE_ON_DEVICE | cl.CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE}) del queue
def test_event_profiling(self): import numpy # Create platform, context, program, kernel and queue platforms = cl.Platforms() ctx = platforms.create_some_context() prg = ctx.create_program(self.src_test, self.include_dirs) krn = prg.get_kernel("test") queue = ctx.create_queue(ctx.devices[0], cl.CL_QUEUE_PROFILING_ENABLE) # Create arrays with some values for testing a = numpy.arange(100000, dtype=numpy.float32) b = numpy.cos(a) a = numpy.sin(a) c = numpy.array([1.2345], dtype=numpy.float32) # Create buffers a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR, a) b_ = ctx.create_buffer(cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, b) # Set kernel arguments krn.set_arg(0, a_) krn.set_arg(1, b_) krn.set_arg(2, c[0:1]) # Execute kernel ev = queue.execute_kernel(krn, [a.size], None) ev.wait() try: vles, errs = ev.get_profiling_info() self.assertEqual(vles, ev.profiling_values) self.assertEqual(errs, ev.profiling_errors) except cl.CLRuntimeError: pass for name, vle in ev.profiling_values.items(): err = ev.profiling_errors[name] self.assertTrue((vle and not err) or (not vle and err)) self.assertEqual(type(vle), float) self.assertEqual(type(err), int)
def test_svm_alloc(self): ctx = cl.Platforms().create_some_context() if ctx.devices[0].version < 2.0: return svm = ctx.svm_alloc(cl.CL_MEM_READ_WRITE, 4096) svm.release() self.assertIsNone(svm.handle) del svm svm = ctx.svm_alloc(cl.CL_MEM_READ_WRITE, 4096) prg = ctx.create_program(""" __kernel void test(__global void *p) { __global int *ptr = (__global int*)p; *ptr += 1; } """, options="-cl-std=CL2.0") krn = prg.get_kernel("test") krn.set_arg(0, svm) krn.set_arg_svm(0, svm) queue = ctx.create_queue(ctx.devices[0]) queue.svm_map(svm, cl.CL_MAP_WRITE_INVALIDATE_REGION, 4) p = cl.get_ffi().cast("int*", svm.handle) p[0] = 2 queue.svm_unmap(svm) queue.execute_kernel(krn, [1], None) queue.svm_map(svm, cl.CL_MAP_READ, 4) self.assertEqual(p[0], 3) # always ensure that the last unmap had completed before # the svm destructor queue.svm_unmap(svm).wait() try: import numpy a = numpy.frombuffer(svm.buffer, dtype=numpy.int32) queue.execute_kernel(krn, [1], None) queue.svm_map(svm, cl.CL_MAP_READ, 4) self.assertEqual(a[0], 4) queue.svm_unmap(svm).wait() except ImportError: pass del svm # svm destructor here
def test_device_info(self): platforms = cl.Platforms() ctx = platforms.create_some_context() dev = ctx.devices[0] self.assertGreater(dev.max_work_item_dimensions, 0) self.assertEqual(len(dev.max_work_item_sizes), dev.max_work_item_dimensions) for size in dev.max_work_item_sizes: self.assertGreater(size, 0) self.assertIsInstance(dev.driver_version.encode("utf-8"), bytes) self.assertGreater(len(dev.driver_version), 0) try: self.assertIsInstance(dev.built_in_kernels, list) for krn in dev.built_in_kernels: self.assertIsInstance(krn, str) self.assertGreater(len(krn), 0) except cl.CLRuntimeError as e: if dev.version >= 1.2: raise self.assertEqual(e.code, -30) self.assertIsInstance(dev.extensions, list) for ext in dev.extensions: self.assertIsInstance(ext.encode("utf-8"), bytes) self.assertGreater(len(ext), 0) self.assertGreater(dev.preferred_vector_width_int, 0) self.assertGreater(dev.max_work_group_size, 1) self.assertTrue(dev.available) try: self.assertTrue(type(dev.pipe_max_active_reservations) == int) self.assertTrue(type(dev.pipe_max_packet_size) == int) self.assertTrue(type(dev.svm_capabilities) == int) self.assertTrue( type(dev.preferred_platform_atomic_alignment) == int) self.assertTrue(type(dev.preferred_global_atomic_alignment) == int) self.assertTrue(type(dev.preferred_local_atomic_alignment) == int) except cl.CLRuntimeError as e: if dev.version >= 2.0: raise self.assertEqual(e.code, -30)
def _test_gemm(self, gemm, dtype): ctx = cl.Platforms().create_some_context() queue = ctx.create_queue(ctx.devices[0]) a = numpy.zeros([127, 353], dtype=dtype) b = numpy.zeros([135, a.shape[1]], dtype=dtype) c = numpy.zeros([a.shape[0], b.shape[0]], dtype=dtype) numpy.random.seed(numpy.array([123], dtype=numpy.int32)[0]) a[:] = numpy.random.rand(a.size).astype(dtype).reshape(a.shape) b[:] = numpy.random.rand(b.size).astype(dtype).reshape(b.shape) gold_c = numpy.dot(a, b.transpose()) a_buf = ctx.create_buffer( cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR, a) b_buf = ctx.create_buffer( cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR, b) c_buf = ctx.create_buffer( cl.CL_MEM_READ_WRITE | cl.CL_MEM_COPY_HOST_PTR, c) gemm([queue], blas.clblasRowMajor, blas.clblasNoTrans, blas.clblasTrans, a.shape[0], b.shape[0], a.shape[1], 1.0, a_buf, b_buf, 0.0, c_buf) queue.flush() queue.read_buffer(c_buf, c) max_diff = numpy.fabs(c - gold_c).max() self.assertLess(max_diff, 0.00001 if dtype == numpy.float64 else 0.00015)
''' This module simply prints your OpenCL platforms and devices to screen. ''' import os import opencl4py as cl os.environ["PYOPENCL_CTX"] = "0:0" # This is where you choose a device number # Print all platforms and devices platforms = cl.Platforms() print(platforms.dump_devices())
def test(): print(os.environ.get("PYOPENCL_CTX")) os.environ["PYOPENCL_CTX"] = "0:0" # Create platform, context, program, kernel and queue platforms = cl.Platforms() print("OpenCL devices:\n%s" % platforms.dump_devices()) ctx = platforms.create_some_context() queue = ctx.create_queue(ctx.devices[0], cl.CL_QUEUE_PROFILING_ENABLE) ''' prg = ctx.create_program( """ __kernel void test( __global float *a, __global float *b, const float c) { size_t i = get_global_id(0); a[i] = (a[i] + b[i]) * c; } """) ''' prg = ctx.create_program(testopencl.readoclfile("test.cl")) krn = prg.get_kernel("test") # Create arrays with some values for testing a = np.arange(100000, dtype=np.float32) b = np.cos(a) a = np.sin(a) a_copy = a.copy() # Prepare arrays for use with map_buffer a = cl.realign_array(a, queue.device.memalign, np) b = cl.realign_array(b, queue.device.memalign, np) c = np.array([0.1], dtype=np.float32) d = (a + b) * c[0] # Create buffers a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, a) b_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, b) # Set kernel arguments krn.set_args(a_, b_, c[0:1]) # Execute kernel global_size = [a.size] local_size = None queue.execute_kernel(krn, global_size, local_size, need_event=False) # Get results back from the device by map_buffer ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a.nbytes) del ev queue.unmap_buffer(a_, ptr).wait() print(a - d) aa = np.zeros(a.shape, dtype=a.dtype) queue.read_buffer(a_, aa) print(aa - d) # Refill buffer with stored copy by write_buffer ev = queue.write_buffer(a_, a_copy, blocking=False, need_event=True) # Execute kernel ev = queue.execute_kernel(krn, global_size, local_size, wait_for=(ev, )) # Get results back from the device by map_buffer ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a.nbytes, wait_for=(ev, ), need_event=True) ev.wait() queue.unmap_buffer(a_, ptr).wait() print(a - d) bb = np.zeros(a.shape, dtype=a.dtype) queue.read_buffer(a_, bb) print(bb - d) del queue del ctx del krn del prg gc.collect()
def matrixmul(): os.environ["PYOPENCL_CTX"] = "0:0" platforms = cl.Platforms() print("OpenCL devices:\n%s" % platforms.dump_devices()) #ctx = platforms.create_some_context() ctx = cl.Context(platforms.platforms[0], platforms.platforms[0].devices[0:1]) prg = ctx.create_program(testopencl.readoclfile("test.cl")) print(prg.kernel_names) krn = prg.get_kernel("MatrixMul") print(krn.attributes) queue = ctx.create_queue(ctx.devices[0]) iHeightA = np.array([800], dtype=np.int32) iWidthA = np.array([500], dtype=np.int32) pInMatA = np.arange(iHeightA[0] * iWidthA[0], dtype=np.float32) iHeightB = np.array([500], dtype=np.int32) iWidthB = np.array([800], dtype=np.int32) pInMatB = np.arange(iHeightB[0] * iWidthB[0], dtype=np.float32) pOutMat = np.empty(iHeightA[0] * iWidthB[0], dtype=np.float32) pInMatA_buf = ctx.create_buffer( cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, pInMatA) pInMatB_buf = ctx.create_buffer( cl.CL_MEM_READ_ONLY | cl.CL_MEM_COPY_HOST_PTR, pInMatB) pOutMat_buf = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_ALLOC_HOST_PTR, size=pOutMat.nbytes) krn.set_args(iHeightA[0:1], iWidthA[0:1], pInMatA_buf, iHeightB[0:1], iWidthB[0:1], pInMatB_buf, pOutMat_buf) global_size = [pInMatA.size, pInMatB.size] local_size = None for i in range(10): start = time.time() ev = queue.execute_kernel(krn, global_size, local_size, need_event=True) t1 = time.time() - start #ev, ptr = queue.map_buffer(pOutMat_buf, cl.CL_MAP_READ, pOutMat.nbytes) #queue.unmap_buffer(pOutMat_buf, ptr).wait() queue.read_buffer(pOutMat_buf, pOutMat) data1 = np.reshape(pOutMat, (iHeightA[0], iWidthB[0])) print(data1[0][1:5]) start = time.time() data2 = np.dot(np.reshape(pInMatA, (iHeightA[0], iWidthA[0])), np.reshape(pInMatB, (iHeightB[0], iWidthB[0]))) t2 = time.time() - start pInMatA += pInMatA ev = queue.write_buffer(pInMatA_buf, pInMatA, blocking=False, need_event=True) ev = queue.write_buffer(pInMatB_buf, pInMatB, blocking=False, need_event=True) print(data2[0][1:5]) print(t1, t2) del queue del ctx del krn del prg gc.collect()
def test_api_numpy(self): import numpy # Create platform, context, program, kernel and queue platforms = cl.Platforms() ctx = platforms.create_some_context() prg = ctx.create_program(self.src_test, self.include_dirs) krn = prg.get_kernel("test") queue = ctx.create_queue(ctx.devices[0]) # Create arrays with some values for testing a = numpy.arange(100000, dtype=numpy.float32) b = numpy.cos(a) a = numpy.sin(a) a_copy = a.copy() # Prepare arrays for use with map_buffer a = cl.realign_array(a, queue.device.memalign, numpy) b = cl.realign_array(b, queue.device.memalign, numpy) c = numpy.array([1.2345], dtype=numpy.float32) d = a + b * c[0] # Create buffers a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, a) b_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, b) # Set kernel arguments krn.set_args(a_, b_, c[0:1]) # Execute kernel global_size = [a.size] local_size = None queue.execute_kernel(krn, global_size, local_size, need_event=False) # Get results back from the device by map_buffer ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a.nbytes) del ev queue.unmap_buffer(a_, ptr).wait() self.assertLess( numpy.fabs(a - d).max(), 0.0001, "Incorrect result after map_buffer") # Get results back from the device by read_buffer aa = numpy.zeros(a.shape, dtype=a.dtype) queue.read_buffer(a_, aa) self.assertLess( numpy.fabs(aa - d).max(), 0.0001, "Incorrect result after read_buffer") # Refill buffer with stored copy by map_buffer with event ev, ptr = queue.map_buffer( a_, cl.CL_MAP_WRITE if queue.device.version < 1.1999 else cl.CL_MAP_WRITE_INVALIDATE_REGION, a.nbytes, blocking=False, need_event=True) ev.wait() a[:] = a_copy[:] ev = queue.unmap_buffer(a_, ptr) # Execute kernel ev = queue.execute_kernel(krn, global_size, local_size, wait_for=(ev, )) # Get results back from the device by map_buffer ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a.nbytes, wait_for=(ev, ), need_event=True) ev.wait() queue.unmap_buffer(a_, ptr).wait() self.assertLess( numpy.fabs(a - d).max(), 0.0001, "Incorrect result after map_buffer") # Refill buffer with stored copy by write_buffer ev = queue.write_buffer(a_, a_copy, blocking=False, need_event=True) # Execute kernel ev = queue.execute_kernel(krn, global_size, local_size, wait_for=(ev, )) # Get results back from the device by map_buffer ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a.nbytes, wait_for=(ev, ), need_event=True) ev.wait() queue.unmap_buffer(a_, ptr).wait() self.assertLess( numpy.fabs(a - d).max(), 0.0001, "Incorrect result after map_buffer")
def test_api_nonumpy(self): import math # Create platform, context, program, kernel and queue platforms = cl.Platforms() ctx = platforms.create_some_context() prg = ctx.create_program(self.src_test, self.include_dirs) krn = prg.get_kernel("test") # Create command queue queue = ctx.create_queue(ctx.devices[0]) # Create arrays with some values for testing N = 100000 ffi = cl.get_ffi() _a = ffi.new("float[]", N + queue.device.memalign) sz = int(ffi.cast("size_t", _a)) if sz % queue.device.memalign != 0: sz += queue.device.memalign - (sz % queue.device.memalign) a = ffi.cast("float*", sz) else: a = _a _b = ffi.new("float[]", N + queue.device.memalign) sz = int(ffi.cast("size_t", _b)) if sz % queue.device.memalign != 0: sz += queue.device.memalign - (sz % queue.device.memalign) b = ffi.cast("float*", sz) else: b = _b c = ffi.new("float[]", 1) c[0] = 1.2345 d = ffi.new("float[]", N) sz = ffi.sizeof(d) for i, t in enumerate(d): a[i] = math.sin(i) b[i] = math.cos(i) d[i] = a[i] + b[i] * c[0] a_copy = ffi.new("float[]", N) a_copy[0:N] = a[0:N] # Create buffers a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, a, size=sz) b_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, b, size=sz) # Set kernel arguments krn.set_arg(0, a_) krn.set_arg(1, b_) krn.set_arg(2, ffi.cast("const void*", c), ffi.sizeof(c)) # Execute kernel global_size = [N] local_size = None queue.execute_kernel(krn, global_size, local_size, need_event=False) # Get results back from the device by map_buffer ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, sz) del ev queue.unmap_buffer(a_, ptr).wait() mx = 0 for i, t in enumerate(d): mx = max(mx, math.fabs(a[i] - t)) self.assertLess(mx, 0.0001, "Incorrect result after map_buffer") # Get results back from the device by read_buffer aa = ffi.new("float[]", N) queue.read_buffer(a_, aa, size=sz) mx = 0 for i, t in enumerate(d): mx = max(mx, math.fabs(aa[i] - t)) self.assertLess(mx, 0.0001, "Incorrect result after read_buffer") # Refill buffer with stored copy by map_buffer with event ev, ptr = queue.map_buffer( a_, cl.CL_MAP_WRITE if queue.device.version < 1.1999 else cl.CL_MAP_WRITE_INVALIDATE_REGION, sz, blocking=False, need_event=True) ev.wait() a[0:N] = a_copy[0:N] ev = queue.unmap_buffer(a_, ptr) # Execute kernel ev = queue.execute_kernel(krn, global_size, local_size, wait_for=(ev, )) # Get results back from the device by map_buffer ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, sz, wait_for=(ev, ), need_event=True) ev.wait() queue.unmap_buffer(a_, ptr).wait() mx = 0 for i, t in enumerate(d): mx = max(mx, math.fabs(a[i] - t)) self.assertLess(mx, 0.0001, "Incorrect result after map_buffer") # Refill buffer with stored copy by write_buffer ev = queue.write_buffer(a_, a_copy, size=sz, blocking=False, need_event=True) # Execute kernel ev = queue.execute_kernel(krn, global_size, local_size, wait_for=(ev, )) # Get results back from the device by map_buffer ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, sz, wait_for=(ev, ), need_event=True) ev.wait() queue.unmap_buffer(a_, ptr).wait() mx = 0 for i, t in enumerate(d): mx = max(mx, math.fabs(a[i] - t)) self.assertLess(mx, 0.0001, "Incorrect result after map_buffer") del _b del _a
def test_create_some_context(self): platforms = cl.Platforms() ctx = platforms.create_some_context() del ctx
def test_create_sub_buffer(self): import numpy # Create platform, context, program, kernel and queue platforms = cl.Platforms() ctx = platforms.create_some_context() prg = ctx.create_program(self.src_test, self.include_dirs) krn = prg.get_kernel("test") queue = ctx.create_queue(ctx.devices[0]) # Create arrays with some values for testing a = numpy.arange(100000, dtype=numpy.float32) b = numpy.cos(a) a = numpy.sin(a) # Prepare arrays for use with map_buffer a = cl.realign_array(a, queue.device.memalign, numpy) b = cl.realign_array(b, queue.device.memalign, numpy) c = numpy.array([1.2345], dtype=numpy.float32) d = a[1024:1024 + 4096] + b[2048:2048 + 4096] * c[0] # Create buffers a_parent_ = ctx.create_buffer( cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, a) self.assertEqual(a_parent_._n_refs, 1) a_ = a_parent_.create_sub_buffer(4096, 16384) self.assertEqual(a_parent_._n_refs, 2) self.assertEqual(a_._n_refs, 1) b_parent_ = ctx.create_buffer( cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, b) self.assertEqual(b_parent_._n_refs, 1) b_ = b_parent_.create_sub_buffer(8192, 16384) self.assertEqual(b_parent_._n_refs, 2) self.assertEqual(b_._n_refs, 1) # Set kernel arguments krn.set_args(a_, b_, c[0:1]) # Execute kernel global_size = [4096] local_size = None queue.execute_kernel(krn, global_size, local_size, need_event=False) # Get results back from the device by map_buffer ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, a_.size) del ev queue.unmap_buffer(a_, ptr).wait() self.assertLess( numpy.fabs(a[1024:1024 + 4096] - d).max(), 0.0001, "Incorrect result after map_buffer") # Get results back from the device by read_buffer aa = numpy.zeros(4096, dtype=numpy.float32) queue.read_buffer(a_, aa) self.assertLess( numpy.fabs(aa - d).max(), 0.0001, "Incorrect result after read_buffer") del b_ self.assertIn(b_parent_._n_refs, (1, 2)) logging.info( "test_create_sub_buffer: " "b_parent_._n_refs = %d (expected 1 or 2)", b_parent_._n_refs) del a_ self.assertIn(a_parent_._n_refs, (1, 2)) logging.info( "test_create_sub_buffer: " "a_parent_._n_refs = %d (expected 1 or 2)", a_parent_._n_refs)