def ocl_map(self, flags): if isinstance(self.device, NumpyDevice): return if self._map_arr_ is not None: # already mapped properly, nothing to do if self.map_flags != cl.CL_MAP_READ or flags == cl.CL_MAP_READ: return self.ocl_unmap() if (flags == cl.CL_MAP_WRITE_INVALIDATE_REGION and self.device.device_info.version < 1.1999): # 'cause available only starting with 1.2 flags = cl.CL_MAP_WRITE assert self.devmem is not None try: ev, self._map_arr_ = self.device.queue_.map_buffer( self.devmem, flags, self._mem.nbytes) del ev except cl.CLRuntimeError as err: self.error("Failed to map %d OpenCL bytes: %s(%d)", self._mem.nbytes, str(err), err.code) raise if (int(cl.get_ffi().cast("size_t", self._map_arr_)) != self._mem.__array_interface__["data"][0]): raise RuntimeError("map_buffer returned different pointer") self.map_flags = flags
def test_svm_alloc(self): ctx = cl.Platforms().create_some_context() if ctx.devices[0].version < 2.0: return svm = ctx.svm_alloc(cl.CL_MEM_READ_WRITE, 4096) svm.release() self.assertIsNone(svm.handle) del svm svm = ctx.svm_alloc(cl.CL_MEM_READ_WRITE, 4096) prg = ctx.create_program(""" __kernel void test(__global void *p) { __global int *ptr = (__global int*)p; *ptr += 1; } """, options="-cl-std=CL2.0") krn = prg.get_kernel("test") krn.set_arg(0, svm) krn.set_arg_svm(0, svm) queue = ctx.create_queue(ctx.devices[0]) queue.svm_map(svm, cl.CL_MAP_WRITE_INVALIDATE_REGION, 4) p = cl.get_ffi().cast("int*", svm.handle) p[0] = 2 queue.svm_unmap(svm) queue.execute_kernel(krn, [1], None) queue.svm_map(svm, cl.CL_MAP_READ, 4) self.assertEqual(p[0], 3) # always ensure that the last unmap had completed before # the svm destructor queue.svm_unmap(svm).wait() try: import numpy a = numpy.frombuffer(svm.buffer, dtype=numpy.int32) queue.execute_kernel(krn, [1], None) queue.svm_map(svm, cl.CL_MAP_READ, 4) self.assertEqual(a[0], 4) queue.svm_unmap(svm).wait() except ImportError: pass del svm # svm destructor here
def test_api_nonumpy(self): import math # Create platform, context, program, kernel and queue platforms = cl.Platforms() ctx = platforms.create_some_context() prg = ctx.create_program(self.src_test, self.include_dirs) krn = prg.get_kernel("test") # Create command queue queue = ctx.create_queue(ctx.devices[0]) # Create arrays with some values for testing N = 100000 ffi = cl.get_ffi() _a = ffi.new("float[]", N + queue.device.memalign) sz = int(ffi.cast("size_t", _a)) if sz % queue.device.memalign != 0: sz += queue.device.memalign - (sz % queue.device.memalign) a = ffi.cast("float*", sz) else: a = _a _b = ffi.new("float[]", N + queue.device.memalign) sz = int(ffi.cast("size_t", _b)) if sz % queue.device.memalign != 0: sz += queue.device.memalign - (sz % queue.device.memalign) b = ffi.cast("float*", sz) else: b = _b c = ffi.new("float[]", 1) c[0] = 1.2345 d = ffi.new("float[]", N) sz = ffi.sizeof(d) for i, t in enumerate(d): a[i] = math.sin(i) b[i] = math.cos(i) d[i] = a[i] + b[i] * c[0] a_copy = ffi.new("float[]", N) a_copy[0:N] = a[0:N] # Create buffers a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, a, size=sz) b_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, b, size=sz) # Set kernel arguments krn.set_arg(0, a_) krn.set_arg(1, b_) krn.set_arg(2, ffi.cast("const void*", c), ffi.sizeof(c)) # Execute kernel global_size = [N] local_size = None queue.execute_kernel(krn, global_size, local_size, need_event=False) # Get results back from the device by map_buffer ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, sz) del ev queue.unmap_buffer(a_, ptr).wait() mx = 0 for i, t in enumerate(d): mx = max(mx, math.fabs(a[i] - t)) self.assertLess(mx, 0.0001, "Incorrect result after map_buffer") # Get results back from the device by read_buffer aa = ffi.new("float[]", N) queue.read_buffer(a_, aa, size=sz) mx = 0 for i, t in enumerate(d): mx = max(mx, math.fabs(aa[i] - t)) self.assertLess(mx, 0.0001, "Incorrect result after read_buffer") # Refill buffer with stored copy by map_buffer with event ev, ptr = queue.map_buffer( a_, cl.CL_MAP_WRITE if queue.device.version < 1.1999 else cl.CL_MAP_WRITE_INVALIDATE_REGION, sz, blocking=False, need_event=True) ev.wait() a[0:N] = a_copy[0:N] ev = queue.unmap_buffer(a_, ptr) # Execute kernel ev = queue.execute_kernel(krn, global_size, local_size, wait_for=(ev, )) # Get results back from the device by map_buffer ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, sz, wait_for=(ev, ), need_event=True) ev.wait() queue.unmap_buffer(a_, ptr).wait() mx = 0 for i, t in enumerate(d): mx = max(mx, math.fabs(a[i] - t)) self.assertLess(mx, 0.0001, "Incorrect result after map_buffer") # Refill buffer with stored copy by write_buffer ev = queue.write_buffer(a_, a_copy, size=sz, blocking=False, need_event=True) # Execute kernel ev = queue.execute_kernel(krn, global_size, local_size, wait_for=(ev, )) # Get results back from the device by map_buffer ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, sz, wait_for=(ev, ), need_event=True) ev.wait() queue.unmap_buffer(a_, ptr).wait() mx = 0 for i, t in enumerate(d): mx = max(mx, math.fabs(a[i] - t)) self.assertLess(mx, 0.0001, "Incorrect result after map_buffer") del _b del _a
def test_api_nonumpy(self): import math # Create platform, context, program, kernel and queue platforms = cl.Platforms() ctx = platforms.create_some_context() prg = ctx.create_program(self.src_test, self.include_dirs) krn = prg.get_kernel("test") # Create command queue queue = ctx.create_queue(ctx.devices[0]) # Create arrays with some values for testing N = 100000 ffi = cl.get_ffi() _a = ffi.new("float[]", N + queue.device.memalign) sz = int(ffi.cast("size_t", _a)) if sz % queue.device.memalign != 0: sz += queue.device.memalign - (sz % queue.device.memalign) a = ffi.cast("float*", sz) else: a = _a _b = ffi.new("float[]", N + queue.device.memalign) sz = int(ffi.cast("size_t", _b)) if sz % queue.device.memalign != 0: sz += queue.device.memalign - (sz % queue.device.memalign) b = ffi.cast("float*", sz) else: b = _b c = ffi.new("float[]", 1) c[0] = 1.2345 d = ffi.new("float[]", N) sz = ffi.sizeof(d) for i, t in enumerate(d): a[i] = math.sin(i) b[i] = math.cos(i) d[i] = a[i] + b[i] * c[0] a_copy = ffi.new("float[]", N) a_copy[0:N] = a[0:N] # Create buffers a_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, a, size=sz) b_ = ctx.create_buffer(cl.CL_MEM_READ_WRITE | cl.CL_MEM_USE_HOST_PTR, b, size=sz) # Set kernel arguments krn.set_arg(0, a_) krn.set_arg(1, b_) krn.set_arg(2, ffi.cast("const void*", c), ffi.sizeof(c)) # Execute kernel global_size = [N] local_size = None queue.execute_kernel(krn, global_size, local_size, need_event=False) # Get results back from the device by map_buffer ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, sz) del ev queue.unmap_buffer(a_, ptr).wait() mx = 0 for i, t in enumerate(d): mx = max(mx, math.fabs(a[i] - t)) self.assertLess(mx, 0.0001, "Incorrect result after map_buffer") # Get results back from the device by read_buffer aa = ffi.new("float[]", N) queue.read_buffer(a_, aa, size=sz) mx = 0 for i, t in enumerate(d): mx = max(mx, math.fabs(aa[i] - t)) self.assertLess(mx, 0.0001, "Incorrect result after read_buffer") # Refill buffer with stored copy by map_buffer with event ev, ptr = queue.map_buffer( a_, cl.CL_MAP_WRITE if queue.device.version < 1.1999 else cl.CL_MAP_WRITE_INVALIDATE_REGION, sz, blocking=False, need_event=True) ev.wait() a[0:N] = a_copy[0:N] ev = queue.unmap_buffer(a_, ptr) # Execute kernel ev = queue.execute_kernel(krn, global_size, local_size, wait_for=(ev,)) # Get results back from the device by map_buffer ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, sz, wait_for=(ev,), need_event=True) ev.wait() queue.unmap_buffer(a_, ptr).wait() mx = 0 for i, t in enumerate(d): mx = max(mx, math.fabs(a[i] - t)) self.assertLess(mx, 0.0001, "Incorrect result after map_buffer") # Refill buffer with stored copy by write_buffer ev = queue.write_buffer(a_, a_copy, size=sz, blocking=False, need_event=True) # Execute kernel ev = queue.execute_kernel(krn, global_size, local_size, wait_for=(ev,)) # Get results back from the device by map_buffer ev, ptr = queue.map_buffer(a_, cl.CL_MAP_READ, sz, wait_for=(ev,), need_event=True) ev.wait() queue.unmap_buffer(a_, ptr).wait() mx = 0 for i, t in enumerate(d): mx = max(mx, math.fabs(a[i] - t)) self.assertLess(mx, 0.0001, "Incorrect result after map_buffer") del _b del _a