def test_sext(context, q, int_data, int_data_gpu): ll_code = """ define void @mykernel(i32* %data) { %1 = load i32, i32* %data %2 = sext i32 %1 to i64 %3 = lshr i64 %2, 32 %4 = trunc i64 %3 to i32 store i32 %4, i32* %data ret void } """ cl_code = test_common.ll_to_cl(ll_code, 'mykernel', 1) print('cl_code', cl_code) for experiment in [{'in': 23, 'out': 0}, {'in': -1, 'out': -1}]: int_data[0] = experiment['in'] cl.enqueue_copy(q, int_data_gpu, int_data) kernel = test_common.build_kernel(context, cl_code, 'mykernel') kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(int_data) cl.enqueue_copy(q, from_gpu, int_data_gpu) q.finish() # expected = (np.uint32(int_data[1]) * np.uint32(int_data[2])) >> 32 expected = experiment['out'] print('expected', expected) print('from_gpu[0]', from_gpu[0]) assert expected == from_gpu[0].item() split_cl = cl_code.split('\n') found_long_cast = False for line in split_cl: if ' >> 32' in line and '(long)' in line: found_long_cast = True assert found_long_cast
def test_umulhi(context, q, int_data, int_data_gpu): ll_code = """ declare i32 @_Z8__umulhiii(i32, i32) define void @test_umulhi(i32* %data) { %1 = load i32, i32* %data %2 = getelementptr i32, i32* %data, i32 1 %3 = load i32, i32* %2 %4 = getelementptr i32, i32* %data, i32 2 %5 = load i32, i32* %4 %6 = call i32 @_Z8__umulhiii(i32 %3, i32 %5) store i32 %6, i32* %data ret void } """ cl_code = test_common.ll_to_cl(ll_code, 'test_umulhi', 1) print('cl_code', cl_code) int_data[0] = 0 int_data[1] = -50 int_data[2] = 2523123 cl.enqueue_copy(q, int_data_gpu, int_data) kernel = test_common.build_kernel(context, cl_code, 'test_umulhi') kernel(q, (32,), (32,), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(int_data) cl.enqueue_copy(q, from_gpu, int_data_gpu) q.finish() expected = (np.uint64(np.uint32(2523123)) * np.uint64(np.uint32(-50))) // 2**32 print('expected', expected) print('from_gpu[0]', from_gpu[0]) assert expected == from_gpu[0].item()
def test_memcpy(context, q, int_data, int_data_gpu): ll_code = """ declare void @_Z6memcpyPvPKvm(i8*, i8*, i64) define void @mykernel(i32* %data) { %1 = bitcast i32* %data to i8* %2 = getelementptr i32, i32* %data, i32 8 %3 = bitcast i32* %2 to i8* call void @_Z6memcpyPvPKvm(i8 *%3, i8 *%1, i64 32) ret void } """ cl_code = test_common.ll_to_cl(ll_code, 'mykernel', num_clmems=1) print('cl_code', cl_code) for i in range(8): int_data[i] = 3 + i cl.enqueue_copy(q, int_data_gpu, int_data) kernel = test_common.build_kernel(context, cl_code, 'mykernel') kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(int_data) cl.enqueue_copy(q, from_gpu, int_data_gpu) q.finish() for i in range(8): print(i, from_gpu[8 + i]) assert from_gpu[8 + i] == 3 + i
def test_internal_struct(context, q, float_data, float_data_gpu): cu_code = """ struct MyStruct { float afloat; int anint; float *floatpointer; // float **floatstarstart; }; __attribute__((device)) void processStruct(MyStruct *myStruct) { myStruct->afloat = myStruct->floatpointer[0]; } __attribute__((global)) void mykernel(float *data) { float afloat = data[0]; float float2 = data[1]; struct MyStruct myStruct = { afloat, 3, &float2 }; processStruct(&myStruct); data[2] = myStruct.afloat; } """ ll_code = test_common.cu_to_devicell_noopt(cu_code) print('ll_code', 'define ' + ll_code.split('define ')[1].split('}')[0] + '}') cl_code = test_common.ll_to_cl(ll_code, '_Z8mykernelPf', num_clmems=1) print('cl_code', cl_code)
def extract_value_cl(): # lets check it's compileable ll first, using llvm ll_filepath = 'test/extract_value.ll' with open(ll_filepath, 'r') as f: ll_sourcecode = f.read() cl_sourcecode = test_common.ll_to_cl(ll_sourcecode=ll_sourcecode, kernelName=kernelname, num_clmems=1) return cl_sourcecode
def test_load_globalfloatstar(context, q, float_data, float_data_gpu): ll_code = """define void @mykernel(float * %p1) { %1 = load float, float* %p1 ret void } """ cl_sourcecode = test_common.ll_to_cl(ll_code, "mykernel", num_clmems=1) print('cl_sourcecode', cl_sourcecode) assert len([ l for l in cl_sourcecode.split('\n') if l.strip() == 'float v2;' ]) == 1
def test_kernelparam_ll(context, q, float_data, float_data_gpu): ll_code = """define void @mykernel(float * %p1) { ret void } """ cl_sourcecode = test_common.ll_to_cl(ll_code, "mykernel", num_clmems=1) print('cl_sourcecode', cl_sourcecode) assert len([ l for l in cl_sourcecode.split('\n') if l.strip().startswith('global float* p1') ]) == 1
def dotdotdot_cl(): # lets check it's compileable ll first, using llvm ll_filepath = 'test/dotdotdot.ll' with open(ll_filepath, 'r') as f: ll_sourcecode = f.read() kernelName = test_common.mangle('test_si', ['float *']) cl_sourcecode = test_common.ll_to_cl(ll_sourcecode, kernelName, num_clmems=1) print('cl_sourcecode', cl_sourcecode) return cl_sourcecode
def test_addr_of_float(context, q, float_data, float_data_gpu): cu_code = """ __attribute__((global)) void mykernel(float *data) { float v = data[0]; float *v1 = &v; *v1 = 5.0f; } """ ll_code = test_common.cu_to_devicell_noopt(cu_code) print('ll_code', 'define ' + ll_code.split('define ')[1].split('}')[0] + '}') cl_code = test_common.ll_to_cl(ll_code, '_Z8mykernelPf', num_clmems=1) print('cl_code', cl_code)
def test_float_constants_from_ll(context, q, float_data, float_data_gpu): ll_code = """ define void @kernel_float_constants(float* nocapture %data) #1 { store float 0x3E7AD7F2A0000000, float* %data ret void } """ cl_code = test_common.ll_to_cl(ll_code, 'kernel_float_constants', 1) print('cl_code', cl_code) # try compiling it, just to be sure... kernel = test_common.build_kernel(context, cl_code, 'kernel_float_constants') kernel(q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(float_data) cl.enqueue_copy(q, from_gpu, float_data_gpu) q.finish() print('from_gpu[0]', from_gpu[0]) print(type(from_gpu[0]), type(1e-7)) assert abs(float(from_gpu[0]) - 1e-7) <= 1e-10 assert 'data[0] = 1e-07f' in cl_code
def test_struct_byval(context, q, float_data, float_data_gpu): cu_code = """ struct MyStruct { float afloat; int anint; float *floatpointer; float **floatstarstar; }; __attribute__((global)) void mykernel(struct MyStruct myStruct) { } """ ll_code = test_common.cu_to_devicell_noopt(cu_code) print('ll_code', 'define ' + ll_code.split('define ')[1].split('}')[0] + '}') cl_code = test_common.ll_to_cl(ll_code, '_Z8mykernel8MyStruct', num_clmems=1) print('cl_code', cl_code)
def test_atomic_add_floats(context, q, float_data, float_data_gpu): ll_code = """ declare float @_Z9atomicAddIfET_PS0_S0_(float *, float) define void @mykernel(float* nocapture %data) #1 { %1 = call float @_Z9atomicAddIfET_PS0_S0_(float * %data, float 3.25) ret void } """ cl_code = test_common.ll_to_cl(ll_code, 'mykernel', 1) print('cl_code', cl_code) # try compiling it, just to be sure... float_data[0] = 0 kernel = test_common.build_kernel(context, cl_code, 'mykernel') cl.enqueue_copy(q, float_data_gpu, float_data) kernel(q, (128,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(float_data) cl.enqueue_copy(q, from_gpu, float_data_gpu) q.finish() print('from_gpu[0]', from_gpu[0]) assert from_gpu[0] == 3.25 * 128
def try_build(context, ll_filepath, kernelname): with open(ll_filepath, 'r') as f: llcode = f.read() clcode = test_common.ll_to_cl(llcode, kernelname) test_common.build_kernel(context, clcode, kernelname)
def test_cwise_sqrt_singlebuffer(context, queue, float_data, float_data_gpu): with open('test/tf/samples/cwise_op_gpu_sqrt-device-noopt.ll', 'r') as f: ll_code = f.read() cl_sourcecode = test_common.ll_to_cl( ll_code, '_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_', num_clmems=3) prog = cl.Program(context, cl_sourcecode).build() N = 10 # global struct Eigen__TensorEvaluator_nopointers* eval_nopointers, global float* eval_ptr0, long eval_ptr0_offset, global float* eval_ptr1, long eval_ptr1_offset, int size, local int *scratch # what we need: # struct Eigen__TensorEvaluator_nopointers Note that none of the values we copy across are actually use, so we can just create a sufficiently large buffer... # global float *eval_ptr0 => this will receive the result. just create a sufficiently large buffer # ptr0_offset => 0 # eval_ptr1 => will contian the data we want to reduce # eval_ptr1_offset=> 0 # size => eg 10, to reduce 10 values # scratch => set to workgroupsize * sizeof(float) # by compariosn to the earlier test, we create a sigle buffer, containing both ptr0 and ptr1, and just use # offset into this src_host = np.random.uniform(0, 1, size=(N,)).astype(np.float32) + 1.0 dst_host = np.zeros(N, dtype=np.float32) src_offset_bytes = 128 dst_offset_bytes = 256 huge_buf_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096) # huge_buf_gpu_spare = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096) eval_nopointers_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096) eval_ptr0_gpu = huge_buf_gpu eval_ptr0_offset = dst_offset_bytes eval_ptr1_gpu = huge_buf_gpu eval_ptr1_offset = src_offset_bytes size = N # copy our host memory across # cl.enqueue_copy(q, huge_buf_gpu_spare, src_host, device_offset=256, size=N * 4) test_common.enqueue_write_buffer_ext(cl, queue, huge_buf_gpu, src_host, device_offset=src_offset_bytes, size=N * 4) global_size = 256 workgroup_size = 256 scratch = workgroup_size * 4 prog.__getattr__('_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_')( queue, (global_size,), (workgroup_size,), eval_nopointers_gpu, offset_type(0), eval_ptr0_gpu, offset_type(0), eval_ptr1_gpu, offset_type(0), offset_type(0), offset_type(eval_ptr0_offset), offset_type(eval_ptr1_offset), np.int32(size), cl.LocalMemory(scratch) ) # check for errors queue.finish() test_common.enqueue_read_buffer_ext(cl, queue, huge_buf_gpu, dst_host, device_offset=dst_offset_bytes, size=N * 4) # cl.enqueue_copy(queue, dst_host, huge_buf_gpu, device_offset=128, size=N * 4) queue.finish() print('dst_host[:N]', dst_host[:N]) expected = np.sqrt(src_host) print('expected[:10]', expected[:N]) assert np.abs(expected[:N] - dst_host[:N]).max() < 1e-4
def test_cwise_sqrt(context, q, float_data, float_data_gpu): with open('test/tf/samples/cwise_op_gpu_sqrt-device-noopt.ll', 'r') as f: ll_code = f.read() cl_sourcecode = test_common.ll_to_cl( ll_code, '_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_', num_clmems=3) print('creating program...') prog_unbuilt = cl.Program(context, cl_sourcecode) print('building kernel...') prog = prog_unbuilt.build() N = 10 # global struct Eigen__TensorEvaluator_nopointers* eval_nopointers, global float* eval_ptr0, long eval_ptr0_offset, global float* eval_ptr1, long eval_ptr1_offset, int size, local int *scratch # what we need: # struct Eigen__TensorEvaluator_nopointers Note that none of the values we copy across are actually use, so we can just create a sufficiently large buffer... # global float *eval_ptr0 => this will receive the result. just create a sufficiently large buffer # ptr0_offset => 0 # eval_ptr1 => will contian the data we want to reduce # eval_ptr1_offset=> 0 # size => eg 10, to reduce 10 values # scratch => set to workgroupsize * sizeof(float) eval_nopointers_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096) eval_ptr0 = np.zeros(1024, dtype=np.float32) eval_ptr0_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=eval_ptr0) eval_ptr0_offset = 0 eval_ptr1 = np.random.uniform(0, 1, size=(1024,)).astype(np.float32) + 1.0 eval_ptr1_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=eval_ptr1) eval_ptr1_offset = 0 size = N global_size = 256 workgroup_size = 256 scratch = workgroup_size * 4 print('running kernel...') prog.__getattr__('_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_')( q, (global_size,), (workgroup_size,), eval_nopointers_gpu, offset_type(0), eval_ptr0_gpu, offset_type(0), eval_ptr1_gpu, offset_type(0), offset_type(0), offset_type(eval_ptr0_offset), offset_type(eval_ptr1_offset), np.int32(size), cl.LocalMemory(scratch) ) # check for errors q.finish() # copy eval_ptr0 back, and check the results... cl.enqueue_copy(q, eval_ptr0, eval_ptr0_gpu) q.finish() print('eval_ptr0[:N]', eval_ptr0[:N]) expected = np.sqrt(eval_ptr1) print('expected[:10]', expected[:N]) assert np.abs(expected[:N] - eval_ptr0[:N]).max() < 1e-4