import os
import subprocess
from os.path import join

import numpy as np
import pyopencl as cl

import test_common

# Assumptions: CLANG_HOME is set in the environment and points at the LLVM/Clang
# installation; the generated kernels declare their offsets as long, so we pass int64.
CLANG_HOME = os.environ['CLANG_HOME']
offset_type = np.int64


def test_singlebuffer_sqrt_cocl(context, queue):
    """
    Use a single large buffer for both source and destination, with the data just offset a bit.
    """
    code = """
__global__ void myKernel(float *data0, float *data1, int N) {
    if(threadIdx.x < N) {
        data0[threadIdx.x] = sqrt(data1[threadIdx.x]);
    }
}
"""
    mangledName = '_Z8myKernelPfS_i'
    kernel = test_common.compile_code_v3(cl, context, code, mangledName, num_clmems=2)['kernel']

    N = 10

    src_host = np.random.uniform(0, 1, size=(N,)).astype(np.float32) + 1.0
    dst_host = np.zeros(N, dtype=np.float32)

    src_offset = 128
    dst_offset = 256

    huge_buf_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)
    test_common.enqueue_write_buffer_ext(
        cl, queue, huge_buf_gpu, src_host, device_offset=src_offset, size=N * 4)

    global_size = 256
    workgroup_size = 256
    # scratch = workgroup_size * 4

    kernel(
        queue, (global_size,), (workgroup_size,),
        huge_buf_gpu, offset_type(0),
        huge_buf_gpu, offset_type(0),
        offset_type(dst_offset), offset_type(src_offset),
        np.int32(N),
        cl.LocalMemory(4))
    queue.finish()

    test_common.enqueue_read_buffer_ext(
        cl, queue, huge_buf_gpu, dst_host, device_offset=dst_offset, size=N * 4)
    queue.finish()

    print('src_host', src_host)
    print('dst_host', dst_host)
    print('np.sqrt(src_host)', np.sqrt(src_host))
    assert np.abs(np.sqrt(src_host) - dst_host).max() <= 1e-4

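# The kernel call above follows coriander's expanded-argument convention: each CUDA
# pointer parameter becomes a (cl.Buffer, placeholder virtual pointer) pair, followed
# by one byte offset per pointer, then the remaining scalar arguments, then a trailing
# cl.LocalMemory scratch slot. The helper below is a sketch of that layout;
# build_cocl_args is hypothetical, not part of test_common:
def build_cocl_args(clmems, offsets_bytes, scalars, scratch_bytes=4):
    args = []
    for mem in clmems:
        args += [mem, offset_type(0)]  # buffer plus placeholder virtual pointer
    args += [offset_type(off) for off in offsets_bytes]  # per-pointer byte offsets
    args += list(scalars)
    args.append(cl.LocalMemory(scratch_bytes))
    return args

# equivalent to the explicit argument list in test_singlebuffer_sqrt_cocl:
# kernel(queue, (global_size,), (workgroup_size,),
#        *build_cocl_args([huge_buf_gpu, huge_buf_gpu], [dst_offset, src_offset], [np.int32(N)]))
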
def test_singlebuffer_sqrt_opencl_1(context, queue):
    """
    Same single-buffer test, written directly in OpenCL; byte offsets are passed in
    and converted to float indices inside the kernel.
    """
    code = """
kernel void myKernel(global float *data0, long offset0, global float *data1, long offset1, int N) {
    data0 += (offset0 >> 2);
    data1 += (offset1 >> 2);
    if(get_global_id(0) < N) {
        data0[get_global_id(0)] = sqrt(data1[get_global_id(0)]);
    }
}
"""
    prog = cl.Program(context, code).build()

    N = 10

    src_host = np.random.uniform(0, 1, size=(N,)).astype(np.float32) + 1.0
    dst_host = np.zeros(N, dtype=np.float32)

    src_offset = 128
    dst_offset = 256

    huge_buf_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)

    global_size = 256
    workgroup_size = 256

    test_common.enqueue_write_buffer_ext(
        cl, queue, huge_buf_gpu, src_host, device_offset=src_offset, size=N * 4)
    queue.finish()

    prog.myKernel(
        queue, (global_size,), (workgroup_size,),
        huge_buf_gpu, np.int64(dst_offset),
        huge_buf_gpu, np.int64(src_offset),
        np.int32(N))
    queue.finish()

    test_common.enqueue_read_buffer_ext(
        cl, queue, huge_buf_gpu, dst_host, device_offset=dst_offset, size=N * 4)
    queue.finish()

    print('src_host', src_host)
    print('dst_host', dst_host)
    print('np.sqrt(src_host)', np.sqrt(src_host))
    assert np.abs(np.sqrt(src_host) - dst_host).max() <= 1e-4

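# Host-side mirror of the `offset >> 2` arithmetic in the kernel above (a sketch;
# assumption: offsets are byte offsets and must be 4-byte aligned so they convert
# cleanly to float indices):
def byte_to_float_index(offset_bytes):
    assert offset_bytes % 4 == 0, 'offset must be a multiple of sizeof(float)'
    return offset_bytes >> 2

assert byte_to_float_index(128) == 32
assert byte_to_float_index(256) == 64
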
def test_singlebuffer_sqrt_opencl_2(context, queue):
    """
    Same single-buffer test, but the offsets are passed as element (float) offsets,
    pre-divided by sizeof(float) on the host.
    """
    code = """
kernel void _Z8myKernelPfS_i(global float* data0, long data0_offset, global float* data1, long data1_offset, int N) {
    //data1 = (global float*)((global char *)data1 + data1_offset);
    //data0 = (global float*)((global char *)data0 + data0_offset);

    //data1 = data1 + (data1_offset >> 2);
    //data0 = data0 + (data0_offset >> 2);

    data1 = data1 + data1_offset;
    data0 = data0 + data0_offset;
    if(get_local_id(0) < N) {
        data0[get_local_id(0)] = (float)sqrt(data1[get_local_id(0)]);
    }
}
"""
    prog = cl.Program(context, code).build()

    N = 10
    bufsize = 64 * 1024 * 1024

    np.random.seed(444)
    src_host = np.random.uniform(0, 1, size=(N,)).astype(np.float32) + 1.0
    dst_host = np.zeros(N, dtype=np.float32)

    src_offset = 128
    dst_offset = 256

    huge_buf_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=bufsize)

    global_size = 256
    workgroup_size = 256

    test_common.enqueue_write_buffer_ext(
        cl, queue, huge_buf_gpu, src_host, device_offset=src_offset, size=N * 4)
    queue.finish()

    prog._Z8myKernelPfS_i(
        queue, (global_size,), (workgroup_size,),
        huge_buf_gpu, np.int64(dst_offset // 4),
        huge_buf_gpu, np.int64(src_offset // 4),
        np.int32(N))
    queue.finish()

    test_common.enqueue_read_buffer_ext(
        cl, queue, huge_buf_gpu, dst_host, device_offset=dst_offset, size=N * 4)
    queue.finish()

    print('src_host', src_host)
    print('dst_host', dst_host)
    print('np.sqrt(src_host)', np.sqrt(src_host))
    assert np.abs(np.sqrt(src_host) - dst_host).max() <= 1e-4

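# A NumPy emulation of the element-offset kernel above (a sketch): treat the device
# buffer as a flat float32 array; here the kernel receives offsets already divided by
# sizeof(float), unlike test_singlebuffer_sqrt_opencl_1, which shifts inside the kernel.
def emulate_sqrt_kernel(buf, dst_elem_offset, src_elem_offset, n):
    buf[dst_elem_offset:dst_elem_offset + n] = np.sqrt(buf[src_elem_offset:src_elem_offset + n])

_host_buf = np.zeros(1024, dtype=np.float32)
_host_buf[32:42] = 2.0
emulate_sqrt_kernel(_host_buf, 256 // 4, 128 // 4, 10)
assert np.allclose(_host_buf[64:74], np.sqrt(np.float32(2.0)))
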
def test_cwise_sqrt_singlebuffer_manual(context, queue, float_data, float_data_gpu):
    # (suffixed _manual: the ll_to_cl-based version below would otherwise shadow it)
    options = test_common.cocl_options()
    i = 0
    opt_options = []
    iropencl_options = []
    while i < len(options):
        if options[i] == '--devicell-opt':
            opt_options.append('-' + options[i + 1])
            i += 2
            continue
        if options[i] in ['--run_branching_transforms', '--branches_as_switch']:
            iropencl_options.append(options[i])
            i += 1
            continue
        raise Exception('unknown option ', options[i])
    print('opt_options', opt_options)
    print('iropencl_options', iropencl_options)

    if 'NOREBUILD' not in os.environ:
        res = subprocess.run(
            [join(CLANG_HOME, 'bin/opt')] + opt_options + [
                '-S', 'test/tf/samples/cwise_op_gpu_sqrt-device-noopt.ll',
                '-o', '/tmp/test-opt.ll'],
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        print(' '.join(res.args))
        print(res.stdout.decode('utf-8'))
        assert res.returncode == 0

        res = subprocess.run(
            ['build/ir-to-opencl'] + iropencl_options + [
                '--inputfile', '/tmp/test-opt.ll',
                '--outputfile', '/tmp/test-device.cl',
                '--cmem-indexes', '0,1,2',
                '--kernelname', '_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_'],
            stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        print(' '.join(res.args))
        print(res.stdout.decode('utf-8'))
        assert res.returncode == 0

    with open('/tmp/test-device.cl', 'r') as f:
        cl_sourcecode = f.read()

    prog = cl.Program(context, cl_sourcecode).build()

    N = 10

    # generated kernel signature:
    #     global struct Eigen__TensorEvaluator_nopointers* eval_nopointers,
    #     global float* eval_ptr0, long eval_ptr0_offset,
    #     global float* eval_ptr1, long eval_ptr1_offset,
    #     int size, local int *scratch
    #
    # what we need:
    # eval_nopointers => none of the values we copy across are actually used, so we can
    #     just create a sufficiently large buffer
    # eval_ptr0 => this will receive the result.
    #     just create a sufficiently large buffer
    # eval_ptr0_offset => 0
    # eval_ptr1 => will contain the input data
    # eval_ptr1_offset => 0
    # size => eg 10, to process 10 values
    # scratch => set to workgroup_size * sizeof(float)
    #
    # by comparison with the earlier tests, we create a single buffer containing both
    # ptr0 and ptr1, and just use offsets into it

    src_host = np.random.uniform(0, 1, size=(N,)).astype(np.float32) + 1.0
    dst_host = np.zeros(N, dtype=np.float32)

    src_offset_bytes = 128
    dst_offset_bytes = 256

    huge_buf_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)
    # huge_buf_gpu_spare = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)

    eval_nopointers_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)

    eval_ptr0_gpu = huge_buf_gpu
    eval_ptr0_offset = dst_offset_bytes

    eval_ptr1_gpu = huge_buf_gpu
    eval_ptr1_offset = src_offset_bytes

    size = N

    # copy our host memory across
    # cl.enqueue_copy(q, huge_buf_gpu_spare, src_host, device_offset=256, size=N * 4)
    test_common.enqueue_write_buffer_ext(
        cl, queue, huge_buf_gpu, src_host, device_offset=src_offset_bytes, size=N * 4)

    global_size = 256
    workgroup_size = 256
    scratch = workgroup_size * 4

    kernel = getattr(prog, '_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_')
    kernel(
        queue, (global_size,), (workgroup_size,),
        eval_nopointers_gpu, offset_type(0),
        eval_ptr0_gpu, offset_type(0),
        eval_ptr1_gpu, offset_type(0),
        offset_type(0),
        offset_type(eval_ptr0_offset),
        offset_type(eval_ptr1_offset),
        np.int32(size),
        cl.LocalMemory(scratch))

    # check for errors
    queue.finish()

    test_common.enqueue_read_buffer_ext(
        cl, queue, huge_buf_gpu, dst_host, device_offset=dst_offset_bytes, size=N * 4)
    # cl.enqueue_copy(queue, dst_host, huge_buf_gpu, device_offset=128, size=N * 4)
    queue.finish()

    print('dst_host[:N]', dst_host[:N])

    expected = np.sqrt(src_host)
    print('expected[:N]', expected[:N])
    assert np.abs(expected[:N] - dst_host[:N]).max() < 1e-4

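# The Eigen launch above uses the same expanded layout sketched in build_cocl_args
# (three clmems here, hence three byte offsets). A hypothetical wrapper, purely
# illustrative, that would reproduce the explicit argument list:
def eigen_sqrt_args(nopointers_gpu, ptr0_gpu, ptr1_gpu, ptr0_offset, ptr1_offset, size, scratch):
    return build_cocl_args(
        [nopointers_gpu, ptr0_gpu, ptr1_gpu],
        [0, ptr0_offset, ptr1_offset],
        [np.int32(size)],
        scratch_bytes=scratch)
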
def test_cwise_sqrt_singlebuffer(context, queue, float_data, float_data_gpu):
    with open('test/tf/samples/cwise_op_gpu_sqrt-device-noopt.ll', 'r') as f:
        ll_code = f.read()
    cl_sourcecode = test_common.ll_to_cl(
        ll_code,
        '_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_',
        num_clmems=3)

    prog = cl.Program(context, cl_sourcecode).build()

    N = 10

    # generated kernel signature:
    #     global struct Eigen__TensorEvaluator_nopointers* eval_nopointers,
    #     global float* eval_ptr0, long eval_ptr0_offset,
    #     global float* eval_ptr1, long eval_ptr1_offset,
    #     int size, local int *scratch
    #
    # what we need:
    # eval_nopointers => none of the values we copy across are actually used, so we can
    #     just create a sufficiently large buffer
    # eval_ptr0 => this will receive the result. just create a sufficiently large buffer
    # eval_ptr0_offset => 0
    # eval_ptr1 => will contain the input data
    # eval_ptr1_offset => 0
    # size => eg 10, to process 10 values
    # scratch => set to workgroup_size * sizeof(float)
    #
    # by comparison with the earlier tests, we create a single buffer containing both
    # ptr0 and ptr1, and just use offsets into it

    src_host = np.random.uniform(0, 1, size=(N,)).astype(np.float32) + 1.0
    dst_host = np.zeros(N, dtype=np.float32)

    src_offset_bytes = 128
    dst_offset_bytes = 256

    huge_buf_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)
    # huge_buf_gpu_spare = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)

    eval_nopointers_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)

    eval_ptr0_gpu = huge_buf_gpu
    eval_ptr0_offset = dst_offset_bytes

    eval_ptr1_gpu = huge_buf_gpu
    eval_ptr1_offset = src_offset_bytes

    size = N

    # copy our host memory across
    # cl.enqueue_copy(q, huge_buf_gpu_spare, src_host, device_offset=256, size=N * 4)
    test_common.enqueue_write_buffer_ext(
        cl, queue, huge_buf_gpu, src_host, device_offset=src_offset_bytes, size=N * 4)

    global_size = 256
    workgroup_size = 256
    scratch = workgroup_size * 4

    kernel = getattr(prog, '_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_')
    kernel(
        queue, (global_size,), (workgroup_size,),
        eval_nopointers_gpu, offset_type(0),
        eval_ptr0_gpu, offset_type(0),
        eval_ptr1_gpu, offset_type(0),
        offset_type(0),
        offset_type(eval_ptr0_offset),
        offset_type(eval_ptr1_offset),
        np.int32(size),
        cl.LocalMemory(scratch))

    # check for errors
    queue.finish()

    test_common.enqueue_read_buffer_ext(
        cl, queue, huge_buf_gpu, dst_host, device_offset=dst_offset_bytes, size=N * 4)
    # cl.enqueue_copy(queue, dst_host, huge_buf_gpu, device_offset=128, size=N * 4)
    queue.finish()

    print('dst_host[:N]', dst_host[:N])

    expected = np.sqrt(src_host)
    print('expected[:N]', expected[:N])
    assert np.abs(expected[:N] - dst_host[:N]).max() < 1e-4