def test_atomic_inc_int(context, q, int_data, int_data_gpu): # atomicCAS api based on usage in Eigen unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h cu_code = """ __global__ void mykernel(int *data, int limit) { atomicInc((unsigned int *)data, limit); } """ cl_code = test_common.cu_to_cl(cu_code, '_Z8mykernelPii', 1) print('cl_code', cl_code) int_data[0] = 0 int_data[1] = 0 kernel = test_common.build_kernel(context, cl_code, '_Z8mykernelPii') cl.enqueue_copy(q, int_data_gpu, int_data) num_blocks = 4 threads_per_block = 4 modulus = 11 kernel(q, (num_blocks * threads_per_block,), (threads_per_block,), int_data_gpu, offset_type(0), offset_type(0), np.int32(256), cl.LocalMemory(32)) kernel(q, (num_blocks * threads_per_block,), (threads_per_block,), int_data_gpu, offset_type(0), offset_type(4), np.int32(modulus - 1), cl.LocalMemory(32)) from_gpu = np.copy(int_data) cl.enqueue_copy(q, from_gpu, int_data_gpu) q.finish() print('from_gpu', from_gpu[:2]) assert from_gpu[0] == num_blocks * threads_per_block assert from_gpu[1] == num_blocks * threads_per_block % modulus
def test_alloca(context, q, float_data, float_data_gpu): code = """ __global__ void mykernel(float *data) { float *foo = data; foo[0] = data[1] + data[2] + data[3]; } """ kernelName = test_common.mangle('mykernel', ['float *']) try: dict = test_common.compile_code_v2(cl, context, code, kernelName, num_clmems=1) prog = dict['prog'] cl_sourcecode = dict['cl_sourcecode'] print('cl_sourcecode', cl_sourcecode) except Exception as e: with open('/tmp/testprog-device.cl', 'r') as f: print(f.read()) raise e print('type(offset_type(0))', type(offset_type(0))) prog.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4))
def test_sitofp(context, q, float_data, float_data_gpu, int_data, int_data_gpu): code = """ __global__ void myKernel(float *float_data, int *int_data) { float_data[0] = (float)int_data[0]; } """ kernel = test_common.compile_code_v3(cl, context, code, test_common.mangle('myKernel', ['float *', 'int *']), num_clmems=2)['kernel'] int_data[0] = 5 int_data[1] = 2 int_data[2] = 4 cl.enqueue_copy(q, int_data_gpu, int_data) kernel( q, (32,), (32,), float_data_gpu, int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) cl.enqueue_copy(q, int_data, int_data_gpu) q.finish() print('float_data[0]', float_data[0]) # expected = pow(float_data[1], float_data[2]) assert float_data[0] == 5
def test_inlining(context, q, float_data, float_data_gpu): cu_source = """ __global__ void myKernel(float *data) { data[0] = (data[3] * (data[1] + data[2])) / data[4]; data[7] = (data[3] / (data[1] - data[2])) * data[4]; } """ kernelName = test_common.mangle('myKernel', ['float *']) cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=1) print('cl_sourcecode', cl_sourcecode) kernel = test_common.build_kernel(context, cl_sourcecode, kernelName) for i in range(10): float_data[i] = i + 3 cl.enqueue_copy(q, float_data_gpu, float_data) q.finish() # prog = cl.Program(context, sourcecode).build() # prog.__getattr__(kernelName)( kernel(q, (32, ), (32, ), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) q.finish() float_data2 = np.zeros((1024, ), dtype=np.float32) cl.enqueue_copy(q, float_data2, float_data_gpu) q.finish() print('float_data2[0]', float_data2[0]) d = float_data d2 = float_data2 expect = (d[3] * (d[1] + d[2])) / d[4] assert abs(d2[0] - expect) < 1e-5
def test_sincos(context, q, float_data, float_data_gpu): cu_code = """ __global__ void mykernel(float *data) { sincosf(0.1, &data[0], &data[1]); sincosf(data[2], &data[3], &data[4]); } """ kernel_name = test_common.mangle('mykernel', ['float*']) cl_code = test_common.cu_to_cl(cu_code, kernel_name, num_clmems=1) print('cl_code', cl_code) float_data[2] = -0.3 float_data_orig = np.copy(float_data) cl.enqueue_copy(q, float_data_gpu, float_data) kernel = test_common.build_kernel(context, cl_code, kernel_name) kernel( q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print(float_data[:5]) assert abs(float_data[0] - math.sin(0.1)) < 1e-4 assert abs(float_data[1] - math.cos(0.1)) < 1e-4 assert abs(float_data[3] - math.sin(float_data_orig[2])) < 1e-4 assert abs(float_data[4] - math.cos(float_data_orig[2])) < 1e-4
def test_use_template1(context, q, int_data, int_data_gpu, float_data, float_data_gpu): code = """ template< typename T > __device__ T addNumbers(T one, T two) { return one + two; } __global__ void use_template1(float *data, int *intdata) { if(threadIdx.x == 0 && blockIdx.x == 0) { data[0] = addNumbers(data[1], data[2]); intdata[0] = addNumbers(intdata[1], intdata[2]); } } """ kernelName = test_common.mangle('use_template1', ['float *', 'int *']) prog = compile_code(cl, context, code, kernelName, num_clmems=2) float_data_orig = np.copy(float_data) int_data_orig = np.copy(int_data) prog.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu, int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) cl.enqueue_copy(q, float_data, float_data_gpu) cl.enqueue_copy(q, int_data, int_data_gpu) q.finish() assert float_data[0] == float_data_orig[1] + float_data_orig[2] assert int_data[0] == int_data_orig[1] + int_data_orig[2]
def test_sqrt(context, q, float_data, float_data_gpu): code = """ __global__ void myKernel(float *data) { data[threadIdx.x] = sqrt(data[threadIdx.x]); } """ kernel = test_common.compile_code_v3(cl, context, code, test_common.mangle('myKernel', ['float *']), num_clmems=1)['kernel'] float_data[0] = 1.5 float_data[1] = 4.6 float_data[2] = -1.5 float_data[3] = 0 float_data_orig = np.copy(float_data) cl.enqueue_copy(q, float_data_gpu, float_data) kernel( q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print('float_data[:4]', float_data[:4]) for i in range(4): if float_data_orig[i] >= 0: assert abs(float_data[i] - math.sqrt(float_data_orig[i])) <= 1e-4 else: assert math.isnan(float_data[i])
def test_int_expressions(context, q, int_data, int_data_gpu): cu_source = """ __global__ void myKernel(int *data) { data[0] = (data[10] | data[11]) == data[12]; } """ kernelName = test_common.mangle('myKernel', ['int *']) cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=1) print('cl_sourcecode', cl_sourcecode) kernel = test_common.build_kernel(context, cl_sourcecode, kernelName) for i in range(50): int_data[i] = 0 int_data[10] = 2 int_data[11] = 1 int_data[12] = 1 cl.enqueue_copy(q, int_data_gpu, int_data) q.finish() # prog = cl.Program(context, sourcecode).build() # prog.__getattr__(kernelName)( kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) q.finish() gpu_data = np.zeros((1024, ), dtype=np.int32) cl.enqueue_copy(q, gpu_data, int_data_gpu) q.finish() data = int_data actual = gpu_data[0] expected = int((data[10] | data[11]) == data[12]) print('actual', actual, 'expected', expected) assert actual == expected
def test_double_ieeefloats(context, q, float_data, float_data_gpu): cu_code = """ __global__ void mykernel(double *data) { double d_neginfinity = -INFINITY; double d_posinfinity = INFINITY; float f_neginfinity = -INFINITY; float f_posinfinity = INFINITY; data[0] = INFINITY; data[1] = -INFINITY; data[2] = f_neginfinity; data[3] = f_posinfinity; } """ kernel_name = test_common.mangle('mykernel', ['double*']) cl_code = test_common.cu_to_cl(cu_code, kernel_name, num_clmems=1) kernel = test_common.build_kernel(context, cl_code, kernel_name) kernel( q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print(float_data[:4]) assert float_data[0] == np.inf assert float_data[1] == - np.inf assert float_data[2] == - np.inf assert float_data[3] == np.inf
def test_test_if_else(context, q, float_data, float_data_gpu): sourcecode = """ __global__ void testIfElse(float *data, int N) { int tid = threadIdx.x; if(tid < N) { data[tid] *= 2; } else { data[tid] += 5; } } """ kernelName = test_common.mangle('testIfElse', ['float *', 'int']) prog = compile_code(cl, context, sourcecode, kernelName, num_clmems=1) float_data_orig = np.copy(float_data) N = 2 prog.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu, offset_type(0), offset_type(0), np.int32(N), cl.LocalMemory(4)) cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() with open('/tmp/testprog-device.cl', 'r') as f: cl_code = f.read() print('cl_code', cl_code) for i in range(10): if i < N: assert float_data[i] == float_data_orig[i] * 2 else: assert abs(float_data[i] - float_data_orig[i] - 5) <= 1e-4
def test_copy_float(extract_value, q, float_data, float_data_gpu): extract_value.__getattr__(kernelname)(q, (32, ), (32, ), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() assert float_data[0] == float_data[1]
def test_test_for(context, q, float_data, float_data_gpu): sourcecode = """ __global__ void testFor(float *data, int N) { if(threadIdx.x == 0) { float sum = 0.0f; for(int i = 0; i < N; i++) { sum += data[i]; } data[0] = sum; } } """ kernelName = test_common.mangle('testFor', ['float *', 'int']) prog = compile_code(cl, context, sourcecode, kernelName, num_clmems=1) float_data_orig = np.copy(float_data) N = 4 prog.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu, offset_type(0), offset_type(0), np.int32(N), cl.LocalMemory(4)) cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() with open('/tmp/testprog-device.cl', 'r') as f: cl_code = f.read() print('cl_code', cl_code) sum = 0 for i in range(N): sum += float_data_orig[i] assert abs(float_data[0] - sum) <= 1e-4
def test_memcpy(context, q, int_data, int_data_gpu): ll_code = """ declare void @_Z6memcpyPvPKvm(i8*, i8*, i64) define void @mykernel(i32* %data) { %1 = bitcast i32* %data to i8* %2 = getelementptr i32, i32* %data, i32 8 %3 = bitcast i32* %2 to i8* call void @_Z6memcpyPvPKvm(i8 *%3, i8 *%1, i64 32) ret void } """ cl_code = test_common.ll_to_cl(ll_code, 'mykernel', num_clmems=1) print('cl_code', cl_code) for i in range(8): int_data[i] = 3 + i cl.enqueue_copy(q, int_data_gpu, int_data) kernel = test_common.build_kernel(context, cl_code, 'mykernel') kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(int_data) cl.enqueue_copy(q, from_gpu, int_data_gpu) q.finish() for i in range(8): print(i, from_gpu[8 + i]) assert from_gpu[8 + i] == 3 + i
def test_test_do_while(context, q, float_data, float_data_gpu): sourcecode = """ __global__ void testIfElse(float *data, int N) { int tid = threadIdx.x; int i = 0; float sum = 0; do { sum += data[i]; i++; } while(sum < 15); data[0] = sum; } """ kernelName = test_common.mangle('testIfElse', ['float *', 'int']) prog = compile_code(cl, context, sourcecode, kernelName, num_clmems=1) float_data_orig = np.copy(float_data) N = 2 prog.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu, offset_type(0), offset_type(0), np.int32(N), cl.LocalMemory(4)) cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() with open('/tmp/testprog-device.cl', 'r') as f: cl_code = f.read() print('cl_code', cl_code) print(float_data[0])
def test_sext(context, q, int_data, int_data_gpu): ll_code = """ define void @mykernel(i32* %data) { %1 = load i32, i32* %data %2 = sext i32 %1 to i64 %3 = lshr i64 %2, 32 %4 = trunc i64 %3 to i32 store i32 %4, i32* %data ret void } """ cl_code = test_common.ll_to_cl(ll_code, 'mykernel', 1) print('cl_code', cl_code) for experiment in [{'in': 23, 'out': 0}, {'in': -1, 'out': -1}]: int_data[0] = experiment['in'] cl.enqueue_copy(q, int_data_gpu, int_data) kernel = test_common.build_kernel(context, cl_code, 'mykernel') kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(int_data) cl.enqueue_copy(q, from_gpu, int_data_gpu) q.finish() # expected = (np.uint32(int_data[1]) * np.uint32(int_data[2])) >> 32 expected = experiment['out'] print('expected', expected) print('from_gpu[0]', from_gpu[0]) assert expected == from_gpu[0].item() split_cl = cl_code.split('\n') found_long_cast = False for line in split_cl: if ' >> 32' in line and '(long)' in line: found_long_cast = True assert found_long_cast
def test_umulhi(context, q, int_data, int_data_gpu): ll_code = """ declare i32 @_Z8__umulhiii(i32, i32) define void @test_umulhi(i32* %data) { %1 = load i32, i32* %data %2 = getelementptr i32, i32* %data, i32 1 %3 = load i32, i32* %2 %4 = getelementptr i32, i32* %data, i32 2 %5 = load i32, i32* %4 %6 = call i32 @_Z8__umulhiii(i32 %3, i32 %5) store i32 %6, i32* %data ret void } """ cl_code = test_common.ll_to_cl(ll_code, 'test_umulhi', 1) print('cl_code', cl_code) int_data[0] = 0 int_data[1] = -50 int_data[2] = 2523123 cl.enqueue_copy(q, int_data_gpu, int_data) kernel = test_common.build_kernel(context, cl_code, 'test_umulhi') kernel(q, (32,), (32,), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(int_data) cl.enqueue_copy(q, from_gpu, int_data_gpu) q.finish() expected = (np.uint64(np.uint32(2523123)) * np.uint64(np.uint32(-50))) // 2**32 print('expected', expected) print('from_gpu[0]', from_gpu[0]) assert expected == from_gpu[0].item()
def test_pow(context, q, float_data, float_data_gpu): code = """ __global__ void myKernel(float *data) { data[0] = pow(data[1], data[2]); data[3] = pow(data[4], data[5]); data[5] = pow(data[7], data[8]); } """ kernel = test_common.compile_code_v3(cl, context, code, test_common.mangle('myKernel', ['float *']), num_clmems=1)['kernel'] float_data[1] = 1.5 float_data[2] = 4.6 float_data[4] = -1.5 float_data[5] = 4.6 float_data[7] = 1.5 float_data[8] = -4.6 cl.enqueue_copy(q, float_data_gpu, float_data) kernel( q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print('float_data[0]', float_data[0]) print('float_data[3]', float_data[3]) print('float_data[6]', float_data[6]) expected = pow(float_data[1], float_data[2]) assert abs(float_data[0] - expected) <= 1e-4
def test_ternary(context, q, float_data, float_data_gpu): kernelSource = """ __global__ void setValue(float *data, int idx, float value) { if(threadIdx.x == 0) { data[idx] = value; } } __global__ void testTernary(float *data) { data[0] = data[1] > 0 ? data[2] : data[3]; } """ setValueKernelName = test_common.mangle('setValue', ['float *', 'int', 'float']) setValueProg = compile_code(cl, context, kernelSource, setValueKernelName, num_clmems=1) testTernaryName = test_common.mangle('testTernary', ['float *']) testTernaryProg = compile_code(cl, context, kernelSource, testTernaryName, num_clmems=1) float_data_orig = np.copy(float_data) def set_float_value(gpu_buffer, idx, value): setValueProg.__getattr__(setValueKernelName)(q, (32, ), (32, ), float_data_gpu, offset_type(0), np.int32(idx), np.float32(value), cl.LocalMemory(4)) cl.enqueue_copy(q, float_data_gpu, float_data) print('float_data[:8]', float_data[:8]) set_float_value(float_data_gpu, 1, 10) testTernaryProg.__getattr__(testTernaryName)(q, (32, ), (32, ), float_data_gpu, offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print('float_data[:8]', float_data[:8]) assert float_data[0] == float_data_orig[2] set_float_value(float_data_gpu, 1, -2) testTernaryProg.__getattr__(testTernaryName)(q, (32, ), (32, ), float_data_gpu, offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print('float_data[:8]', float_data[:8]) assert float_data[0] == float_data_orig[3]
def set_float_value(gpu_buffer, idx, value): setValueProg.__getattr__(setValueKernelName)(q, (32, ), (32, ), float_data_gpu, offset_type(0), offset_type(0), np.int32(idx), np.float32(value), cl.LocalMemory(4))
def test_singlebuffer_sqrt_cocl(context, queue): """ Test doing stuff with one single large buffer for destination and source, just offset a bit """ code = """ __global__ void myKernel(float *data0, float *data1, int N) { if(threadIdx.x < N) { data0[threadIdx.x] = sqrt(data1[threadIdx.x]); } } """ mangledName = '_Z8myKernelPfS_i' kernel = test_common.compile_code_v3(cl, context, code, mangledName, num_clmems=2)['kernel'] N = 10 src_host = np.random.uniform(0, 1, size=(N, )).astype(np.float32) + 1.0 dst_host = np.zeros(N, dtype=np.float32) src_offset = 128 dst_offset = 256 huge_buf_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096) test_common.enqueue_write_buffer_ext(cl, queue, huge_buf_gpu, src_host, device_offset=src_offset, size=N * 4) global_size = 256 workgroup_size = 256 # scratch = workgroup_size * 4 kernel(queue, (global_size, ), (workgroup_size, ), huge_buf_gpu, offset_type(0), huge_buf_gpu, offset_type(0), offset_type(dst_offset), offset_type(src_offset), np.int32(N), cl.LocalMemory(4)) queue.finish() test_common.enqueue_read_buffer_ext(cl, queue, huge_buf_gpu, dst_host, device_offset=dst_offset, size=N * 4) queue.finish() print('src_host', src_host) print('dst_host', dst_host) print('np.sqrt(src_host)', np.sqrt(src_host)) assert np.abs(np.sqrt(src_host) - dst_host).max() <= 1e-4
def test_foo(context, q, float_data, float_data_gpu, cuSourcecode): kernelName = test_common.mangle('foo', ['float *']) testcudakernel1 = compile_code(cl, context, cuSourcecode, kernelName, num_clmems=1) testcudakernel1.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() assert float_data[0] == 123
def test_structs(context, q, float_data, float_data_gpu, int_data, int_data_gpu): code = """ struct MyStruct { int myint; float myfloat; }; __global__ void testStructs(MyStruct *structs, float *float_data, int *int_data) { int_data[0] = structs[0].myint; float_data[0] = structs[0].myfloat; float_data[1] = structs[1].myfloat; } """ kernel = test_common.compile_code_v3( cl, context, code, test_common.mangle('testStructs', ['MyStruct *', 'float *', 'int *']), num_clmems=3)['kernel'] # my_struct = np.dtype([("myfloat", np.float32), ("myint", np.int32)]) # I dont know why, but seems these are back to front... my_struct = np.dtype([ ("myint", np.int32), ("myfloat", np.float32) ]) # seems these are wrong way around on HD5500. Works ok on 940M my_struct, my_struct_c_decl = pyopencl.tools.match_dtype_to_c_struct( context.devices[0], "MyStruct", my_struct) my_struct = cl.tools.get_or_register_dtype("MyStruct", my_struct) structs = np.empty(2, my_struct) structs[0]['myint'] = 123 structs[1]['myint'] = 33 structs[0]['myfloat'] = 567 structs[1]['myfloat'] = 44 structs_gpu = cl.array.to_device(q, structs) # p = structs_gpu.map_to_host(q) # print('p', p) # q.finish() kernel(q, (32, ), (32, ), structs_gpu.data, offset_type(0), float_data_gpu, offset_type(0), int_data_gpu, offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) cl.enqueue_copy(q, int_data, int_data_gpu) q.finish() print('int_data[0]', int_data[0]) print('int_data[1]', int_data[1]) print('float_data[0]', float_data[0]) print('float_data[1]', float_data[1]) assert int_data[0] == 123 assert float_data[0] == 567 assert float_data[1] == 44
def test_copy_float(cuSourcecode, context, q, float_data, float_data_gpu): argTypes = ['float *'] kernelName = test_common.mangle('copy_float', argTypes) testcudakernel1 = compile_code(cl, context, cuSourcecode, kernelName, num_clmems=1) testcudakernel1.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() assert float_data[0] == float_data[1]
def test_use_tid2(cuSourcecode, context, q, int_data, int_data_gpu): int_data_orig = np.copy(int_data) kernelName = test_common.mangle('use_tid2', ['int *']) testcudakernel1 = compile_code(cl, context, cuSourcecode, kernelName, num_clmems=1) testcudakernel1.__getattr__(kernelName)(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) cl.enqueue_copy(q, int_data, int_data_gpu) q.finish() assert int_data[0] == int_data_orig[0] + 0 assert int_data[10] == int_data_orig[10] + 10 assert int_data[31] == int_data_orig[31] + 31
def test_float4(cuSourcecode, context, ctx, q, float_data, float_data_gpu): float_data_orig = np.copy(float_data) kernelName = test_common.mangle('testFloat4', ['float4 *']) testcudakernel1 = compile_code(cl, context, cuSourcecode, kernelName, num_clmems=1) testcudakernel1.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print('float_data_orig[:8]', float_data_orig[:8]) print('float_data[:8]', float_data[:8]) assert float_data[1] == float_data_orig[4 + 2] * float_data_orig[4 + 3]
def test_nested_for(context, q, float_data, float_data_gpu): source = """ __device__ void myfunc(float *data, int a, int b) { data[1] = data[2]; float sum = 0; for(int i = 0; i < a; i++) { for(int j = 0; j < b; j++) { sum += data[i * 17 + j * 7]; } } data[0] = sum; } __global__ void mykernel(float *data, int a, int b) { myfunc(data, a, b); } """ kernelName = test_common.mangle('mykernel', ['float *', 'int', 'int']) kernel = test_common.compile_code_v3(cl, context, source, kernelName, num_clmems=1)['kernel'] float_data_orig = np.copy(float_data) a = 2 b = 3 kernel(q, (32, ), (32, ), float_data_gpu, offset_type(0), offset_type(0), np.int32(a), np.int32(b), cl.LocalMemory(4)) cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() with open('/tmp/testprog-device.cl', 'r') as f: cl_code = f.read() print('cl_code', cl_code) sum = 0 for i in range(a): for j in range(b): sum += float_data_orig[i * 17 + j * 7] print('float_data_orig', float_data_orig[:3]) print('float_data', float_data[:3]) assert float_data[1] == float_data_orig[2] assert abs(float_data[0] - sum) <= 1e-4
def test_clz(context, q, float_data, float_data_gpu, int_data, int_data_gpu): code = """ __global__ void myKernel(int *int_data) { int_data[0] = __clz(int_data[1]); } """ kernel = test_common.compile_code_v3(cl, context, code, test_common.mangle('myKernel', ['int *']), num_clmems=1)['kernel'] int_data[1] = 15 cl.enqueue_copy(q, int_data_gpu, int_data) kernel( q, (32,), (32,), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, int_data, int_data_gpu) q.finish() print('int_data[:2]', int_data[:2])
def test_floatconstants(context, q, float_data, float_data_gpu): code = """ __device__ float4 getvals() { return make_float4(0xFFF0000000000000, 0x7FF0000000000000, INFINITY, -INFINITY); } __global__ void myKernel(float *data) { data[0] = 18442240474082181120.0f; // 0xFFF0000000000000 data[1] = 9218868437227405312.0f; // 0x7FF0000000000000 float4 vals = getvals(); data[2] = vals.x; data[3] = vals.y; data[4] = vals.w; data[5] = vals.z; data[6] = INFINITY; data[7] = -INFINITY; // data[8] = 0xFFEFFFFFFFFFFFFF; } """ kernel = test_common.compile_code_v3(cl, context, code, test_common.mangle('myKernel', ['float *']), num_clmems=1)['kernel'] kernel( q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print('float_data[0]', float_data[0]) print('float_data[1]', float_data[1]) print('float_data[2]', float_data[2]) print('float_data[3]', float_data[3]) print('float_data[4]', float_data[4]) print('float_data[5]', float_data[5]) print('float_data[6]', float_data[6]) print('float_data[7]', float_data[7]) # print('float_data[8]', float_data[8]) assert float_data[0] > 100000000 assert float_data[1] > 100000000 assert float_data[2] > 100000000 assert float_data[3] > 100000000 assert float_data[4] == - np.inf assert float_data[5] == np.inf assert float_data[6] == np.inf assert float_data[7] == - np.inf
def test_fabs_double(context, q, float_data, float_data_gpu): cu_code = """ __global__ void mykernel(float *data) { data[0] = fabs(data[0]); } """ cl_code = test_common.cu_to_cl(cu_code, '_Z8mykernelPf', 1) print('cl_code', cl_code) float_data[0] = -0.123 cl.enqueue_copy(q, float_data_gpu, float_data) kernel = test_common.build_kernel(context, cl_code, '_Z8mykernelPf') kernel(q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(float_data) cl.enqueue_copy(q, from_gpu, float_data_gpu) q.finish() expected = 0.123 print('expected', expected) print('from_gpu[0]', from_gpu[0]) assert abs(expected - from_gpu[0].item()) < 1e-4
def test_float_constants_from_ll(context, q, float_data, float_data_gpu): ll_code = """ define void @kernel_float_constants(float* nocapture %data) #1 { store float 0x3E7AD7F2A0000000, float* %data ret void } """ cl_code = test_common.ll_to_cl(ll_code, 'kernel_float_constants', 1) print('cl_code', cl_code) # try compiling it, just to be sure... kernel = test_common.build_kernel(context, cl_code, 'kernel_float_constants') kernel(q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(float_data) cl.enqueue_copy(q, from_gpu, float_data_gpu) q.finish() print('from_gpu[0]', from_gpu[0]) print(type(from_gpu[0]), type(1e-7)) assert abs(float(from_gpu[0]) - 1e-7) <= 1e-10 assert 'data[0] = 1e-07f' in cl_code