def test_long_conflicting_names(context, q): cu_source = """ __device__ void mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionname(float *d) { d[1] = 1.0f; } __device__ void mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnameb(float *d) { d[2] = 3.0f; } __global__ void mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamec(float *data) { data[0] = 123.0f; mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionname(data); mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnameb(data); } """ mangled_name = test_common.mangle('mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamec', ['float *']) cl_source = test_common.cu_to_cl(cu_source, mangled_name) print('cl_source', cl_source) for line in cl_source.split("\n"): if line.strip().startswith('/*'): continue if not line.strip().replace('kernel ', '').strip().startswith('void'): continue name = line.replace('kernel ', '').replace('void ', '').split('(')[0] if name != '': print('name', name) assert len(name) <= 32 test_common.build_kernel(context, cl_source, mangled_name[:31])
def test_short_names(context): cu_source = """ __device__ void funca(float *d); __device__ void funca(float *d) { d[1] = 1.0f; } __device__ void funcb(float *d, int c) { d[2] = 3.0f + 5 - d[c]; } __global__ void funck(float *data) { data[0] = 123.0f; funca(data); funcb(data, (int)data[6]); for(int i = 0; i < 1000; i++) { funcb(data + i, (int)data[i + 100]); } } """ mangled_name = test_common.mangle('funck', ['float *']) cl_source = test_common.cu_to_cl(cu_source, mangled_name) print('cl_source', cl_source) test_common.build_kernel(context, cl_source, mangled_name[:31])
def test_atomic_inc_int(context, q, int_data, int_data_gpu): # atomicCAS api based on usage in Eigen unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h cu_code = """ __global__ void mykernel(int *data, int limit) { atomicInc((unsigned int *)data, limit); } """ cl_code = test_common.cu_to_cl(cu_code, '_Z8mykernelPii', 1) print('cl_code', cl_code) int_data[0] = 0 int_data[1] = 0 kernel = test_common.build_kernel(context, cl_code, '_Z8mykernelPii') cl.enqueue_copy(q, int_data_gpu, int_data) num_blocks = 4 threads_per_block = 4 modulus = 11 kernel(q, (num_blocks * threads_per_block,), (threads_per_block,), int_data_gpu, offset_type(0), offset_type(0), np.int32(256), cl.LocalMemory(32)) kernel(q, (num_blocks * threads_per_block,), (threads_per_block,), int_data_gpu, offset_type(0), offset_type(4), np.int32(modulus - 1), cl.LocalMemory(32)) from_gpu = np.copy(int_data) cl.enqueue_copy(q, from_gpu, int_data_gpu) q.finish() print('from_gpu', from_gpu[:2]) assert from_gpu[0] == num_blocks * threads_per_block assert from_gpu[1] == num_blocks * threads_per_block % modulus
def test_umulhi(context, q, int_data, int_data_gpu): ll_code = """ declare i32 @_Z8__umulhiii(i32, i32) define void @test_umulhi(i32* %data) { %1 = load i32, i32* %data %2 = getelementptr i32, i32* %data, i32 1 %3 = load i32, i32* %2 %4 = getelementptr i32, i32* %data, i32 2 %5 = load i32, i32* %4 %6 = call i32 @_Z8__umulhiii(i32 %3, i32 %5) store i32 %6, i32* %data ret void } """ cl_code = test_common.ll_to_cl(ll_code, 'test_umulhi', 1) print('cl_code', cl_code) int_data[0] = 0 int_data[1] = -50 int_data[2] = 2523123 cl.enqueue_copy(q, int_data_gpu, int_data) kernel = test_common.build_kernel(context, cl_code, 'test_umulhi') kernel(q, (32,), (32,), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(int_data) cl.enqueue_copy(q, from_gpu, int_data_gpu) q.finish() expected = (np.uint64(np.uint32(2523123)) * np.uint64(np.uint32(-50))) // 2**32 print('expected', expected) print('from_gpu[0]', from_gpu[0]) assert expected == from_gpu[0].item()
def test_insertvalue(context, q, float_data, float_data_gpu): sourcecode = """ struct mystruct { int f0; float f1; }; __device__ struct mystruct doSomething(struct mystruct foo, int somevalue); __device__ struct mystruct doSomething(struct mystruct foo, int somevalue) { foo.f0 = somevalue; foo.f1 = 4.5f; return foo; } __global__ void somekernel(float *data) { struct mystruct foo; foo.f0 = 3; foo.f1 = 4.5; foo = doSomething(foo, data[2]); data[0] = (int)foo.f0; data[1] = foo.f1; } """ mangledname = test_common.mangle('somekernel', ['float *']) cl_code = test_common.cu_to_cl(sourcecode, mangledname) kernel = test_common.build_kernel(context, cl_code, mangledname)
def test_memcpy(context, q, int_data, int_data_gpu): ll_code = """ declare void @_Z6memcpyPvPKvm(i8*, i8*, i64) define void @mykernel(i32* %data) { %1 = bitcast i32* %data to i8* %2 = getelementptr i32, i32* %data, i32 8 %3 = bitcast i32* %2 to i8* call void @_Z6memcpyPvPKvm(i8 *%3, i8 *%1, i64 32) ret void } """ cl_code = test_common.ll_to_cl(ll_code, 'mykernel', num_clmems=1) print('cl_code', cl_code) for i in range(8): int_data[i] = 3 + i cl.enqueue_copy(q, int_data_gpu, int_data) kernel = test_common.build_kernel(context, cl_code, 'mykernel') kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(int_data) cl.enqueue_copy(q, from_gpu, int_data_gpu) q.finish() for i in range(8): print(i, from_gpu[8 + i]) assert from_gpu[8 + i] == 3 + i
def test_inlining(context, q, float_data, float_data_gpu): cu_source = """ __global__ void myKernel(float *data) { data[0] = (data[3] * (data[1] + data[2])) / data[4]; data[7] = (data[3] / (data[1] - data[2])) * data[4]; } """ kernelName = test_common.mangle('myKernel', ['float *']) cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=1) print('cl_sourcecode', cl_sourcecode) kernel = test_common.build_kernel(context, cl_sourcecode, kernelName) for i in range(10): float_data[i] = i + 3 cl.enqueue_copy(q, float_data_gpu, float_data) q.finish() # prog = cl.Program(context, sourcecode).build() # prog.__getattr__(kernelName)( kernel(q, (32, ), (32, ), float_data_gpu, offset_type(0), cl.LocalMemory(4)) q.finish() float_data2 = np.zeros((1024, ), dtype=np.float32) cl.enqueue_copy(q, float_data2, float_data_gpu) q.finish() print('float_data2[0]', float_data2[0]) d = float_data d2 = float_data2 expect = (d[3] * (d[1] + d[2])) / d[4] assert abs(d2[0] - expect) < 1e-5
def test_int_expressions(context, q, int_data, int_data_gpu): cu_source = """ __global__ void myKernel(int *data) { data[0] = (data[10] | data[11]) == data[12]; } """ kernelName = test_common.mangle('myKernel', ['int *']) cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=1) print('cl_sourcecode', cl_sourcecode) kernel = test_common.build_kernel(context, cl_sourcecode, kernelName) for i in range(50): int_data[i] = 0 int_data[10] = 2 int_data[11] = 1 int_data[12] = 1 cl.enqueue_copy(q, int_data_gpu, int_data) q.finish() # prog = cl.Program(context, sourcecode).build() # prog.__getattr__(kernelName)( kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) q.finish() gpu_data = np.zeros((1024, ), dtype=np.int32) cl.enqueue_copy(q, gpu_data, int_data_gpu) q.finish() data = int_data actual = gpu_data[0] expected = int((data[10] | data[11]) == data[12]) print('actual', actual, 'expected', expected) assert actual == expected
def test_sincos(context, q, float_data, float_data_gpu): cu_code = """ __global__ void mykernel(float *data) { sincosf(0.1, &data[0], &data[1]); sincosf(data[2], &data[3], &data[4]); } """ kernel_name = test_common.mangle('mykernel', ['float*']) cl_code = test_common.cu_to_cl(cu_code, kernel_name, num_clmems=1) print('cl_code', cl_code) float_data[2] = -0.3 float_data_orig = np.copy(float_data) cl.enqueue_copy(q, float_data_gpu, float_data) kernel = test_common.build_kernel(context, cl_code, kernel_name) kernel( q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print(float_data[:5]) assert abs(float_data[0] - math.sin(0.1)) < 1e-4 assert abs(float_data[1] - math.cos(0.1)) < 1e-4 assert abs(float_data[3] - math.sin(float_data_orig[2])) < 1e-4 assert abs(float_data[4] - math.cos(float_data_orig[2])) < 1e-4
def test_ieeefloats(context, q, float_data, float_data_gpu): cu_code = """ __global__ void mykernel(double *data) { double d_neginfinity = -INFINITY; double d_posinfinity = INFINITY; float f_neginfinity = -INFINITY; float f_posinfinity = INFINITY; data[0] = INFINITY; data[1] = -INFINITY; data[2] = f_neginfinity; data[3] = f_posinfinity; } """ kernel_name = test_common.mangle('mykernel', ['double*']) cl_code = test_common.cu_to_cl(cu_code, kernel_name, num_clmems=1) kernel = test_common.build_kernel(context, cl_code, kernel_name) kernel( q, (32,), (32,), float_data_gpu, offset_type(0), cl.LocalMemory(4)) q.finish() cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print(float_data[:4]) assert float_data[0] == np.inf assert float_data[1] == - np.inf assert float_data[2] == - np.inf assert float_data[3] == np.inf
def test_sext(context, q, int_data, int_data_gpu): ll_code = """ define void @mykernel(i32* %data) { %1 = load i32, i32* %data %2 = sext i32 %1 to i64 %3 = lshr i64 %2, 32 %4 = trunc i64 %3 to i32 store i32 %4, i32* %data ret void } """ cl_code = test_common.ll_to_cl(ll_code, 'mykernel', 1) print('cl_code', cl_code) for experiment in [{'in': 23, 'out': 0}, {'in': -1, 'out': -1}]: int_data[0] = experiment['in'] cl.enqueue_copy(q, int_data_gpu, int_data) kernel = test_common.build_kernel(context, cl_code, 'mykernel') kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(int_data) cl.enqueue_copy(q, from_gpu, int_data_gpu) q.finish() # expected = (np.uint32(int_data[1]) * np.uint32(int_data[2])) >> 32 expected = experiment['out'] print('expected', expected) print('from_gpu[0]', from_gpu[0]) assert expected == from_gpu[0].item() split_cl = cl_code.split('\n') found_long_cast = False for line in split_cl: if ' >> 32' in line and '(long)' in line: found_long_cast = True assert found_long_cast
def test_compile(context, cu_filepath, kernelname, num_clmems): with open(cu_filepath, 'r') as f: cu_code = f.read() ll_sourcecode = test_common.cu_to_ll(cu_code) for line in ll_sourcecode.split('\n'): if line.startswith('define') and kernelname in line: mangledname = line.split('@')[1].split('(')[0] break print('mangledname', mangledname) cl_code = test_common.cu_to_cl(cu_code, mangledname, num_clmems=num_clmems) print('got cl_code') test_common.build_kernel(context, cl_code, mangledname) print('after build kernel')
def test_compile(context, cu_filepath, kernelname): with open(cu_filepath, 'r') as f: cu_code = f.read() try: cl_code = test_common.cu_to_cl(cu_code, '') except: pass with open('/tmp/testprog-device.ll') as f: ll_code = f.read() for line in ll_code.split('\n'): if line.startswith('define') and kernelname in line: mangledname = line.split('@')[1].split('(')[0] break print('mangledname', mangledname) cl_code = test_common.cu_to_cl(cu_code, mangledname) test_common.build_kernel(context, cl_code, mangledname[:31])
def test_getelementptr_struct_local(context, q, float_data, float_data_gpu): cu_source = """ struct MyStruct { float* f0; float* f1; }; __global__ void foo(float *data) { struct MyStruct astruct; float *floats = astruct.f0; } """ kernelName = test_common.mangle('foo', ['float *']) cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=1) print('cl_sourcecode', cl_sourcecode) kernel = test_common.build_kernel(context, cl_sourcecode, kernelName)
def test_getelementptr_struct_global(context, q, float_data, float_data_gpu): cu_source = """ struct MyStruct { float* f0; float* f1; }; __global__ void foo(struct MyStruct mystruct) { float *floats = mystruct.f0; } """ # kernelName = test_common.mangle('foo', ['float *']) kernelName = '_Z3foo8MyStruct' cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=3) print('cl_sourcecode', cl_sourcecode) kernel = test_common.build_kernel(context, cl_sourcecode, kernelName)
def test_atomic_cas_unsignedint(context, q, int_data, int_data_gpu): # atomicCAS api based on usage in Eigen unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h cu_code = """ __global__ void mykernel(unsigned int *data) { int tid = threadIdx.x; int gid = blockIdx.x * blockDim.x + threadIdx.x; int numAttempts = 0; while(true) { numAttempts += 1; unsigned int oldValue = data[0]; unsigned int newValue = oldValue + 1; unsigned int returnedOld = atomicCAS(data, oldValue, newValue); if(returnedOld == oldValue) { break; } } data[1 + gid] = numAttempts; } """ cl_code = test_common.cu_to_cl(cu_code, '_Z8mykernelPj', 1) print('cl_code', cl_code) int_data[0] = 0 kernel = test_common.build_kernel(context, cl_code, '_Z8mykernelPj') cl.enqueue_copy(q, int_data_gpu, int_data) num_blocks = 4 threads_per_block = 4 kernel(q, (num_blocks * threads_per_block,), (threads_per_block,), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(int_data) cl.enqueue_copy(q, from_gpu, int_data_gpu) q.finish() print('from_gpu', from_gpu[:17]) # check two things: # - final value of data[0] should equal num_blocks * threads_per_block # - the number of attempts should be unique, for each thread assert from_gpu[0] == num_blocks * threads_per_block seen_num_attempts = set() for i in range(num_blocks * threads_per_block): num_attempts = from_gpu[i + 1] if num_attempts in seen_num_attempts: raise Exception('already saw num_attempts %s' % num_attempts) seen_num_attempts.add(num_attempts)
def test_fabs_double(context, q, float_data, float_data_gpu): cu_code = """ __global__ void mykernel(float *data) { data[0] = fabs(data[0]); } """ cl_code = test_common.cu_to_cl(cu_code, '_Z8mykernelPf', 1) print('cl_code', cl_code) float_data[0] = -0.123 cl.enqueue_copy(q, float_data_gpu, float_data) kernel = test_common.build_kernel(context, cl_code, '_Z8mykernelPf') kernel(q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(float_data) cl.enqueue_copy(q, from_gpu, float_data_gpu) q.finish() expected = 0.123 print('expected', expected) print('from_gpu[0]', from_gpu[0]) assert abs(expected - from_gpu[0].item()) < 1e-4
def test_float_constants_from_ll(context, q, float_data, float_data_gpu): ll_code = """ define void @kernel_float_constants(float* nocapture %data) #1 { store float 0x3E7AD7F2A0000000, float* %data ret void } """ cl_code = test_common.ll_to_cl(ll_code, 'kernel_float_constants', 1) print('cl_code', cl_code) # try compiling it, just to be sure... kernel = test_common.build_kernel(context, cl_code, 'kernel_float_constants') kernel(q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(float_data) cl.enqueue_copy(q, from_gpu, float_data_gpu) q.finish() print('from_gpu[0]', from_gpu[0]) print(type(from_gpu[0]), type(1e-7)) assert abs(float(from_gpu[0]) - 1e-7) <= 1e-10 assert 'data[0] = 1e-07f' in cl_code
def test_local(context, q, float_data, float_data_gpu): cu_source = """ __global__ void foo(float *data) { __shared__ float localmem[33]; int tid = threadIdx.x; int warpId = tid % 32; localmem[warpId] = data[warpId]; data[warpId] = localmem[warpId + 1]; } """ kernelName = test_common.mangle('foo', ['float *']) cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName) print('cl_sourcecode', cl_sourcecode) kernel = test_common.build_kernel(context, cl_sourcecode, kernelName) float_data_orig = np.copy(float_data) kernel(q, (32,), (32,), float_data_gpu, offset_type(0), cl.LocalMemory(4)) cl.enqueue_copy(q, float_data, float_data_gpu) q.finish() print('before', float_data_orig[:5]) print('after', float_data[:5]) assert np.abs(float_data_orig[1:32] - float_data[0:31]).max() <= 1e-4
def test_atomic_add_floats(context, q, float_data, float_data_gpu): ll_code = """ declare float @_Z9atomicAddIfET_PS0_S0_(float *, float) define void @mykernel(float* nocapture %data) #1 { %1 = call float @_Z9atomicAddIfET_PS0_S0_(float * %data, float 3.25) ret void } """ cl_code = test_common.ll_to_cl(ll_code, 'mykernel', 1) print('cl_code', cl_code) # try compiling it, just to be sure... float_data[0] = 0 kernel = test_common.build_kernel(context, cl_code, 'mykernel') cl.enqueue_copy(q, float_data_gpu, float_data) kernel(q, (128,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32)) from_gpu = np.copy(float_data) cl.enqueue_copy(q, from_gpu, float_data_gpu) q.finish() print('from_gpu[0]', from_gpu[0]) assert from_gpu[0] == 3.25 * 128
def try_build(context, filepath, kernelname): with open(filepath, 'r') as f: cucode = f.read() clcode = test_common.cu_to_cl(cucode, kernelname) test_common.build_kernel(context, clcode, kernelname[:31])
def try_build(context, ll_filepath, kernelname): with open(ll_filepath, 'r') as f: llcode = f.read() clcode = test_common.ll_to_cl(llcode, kernelname) test_common.build_kernel(context, clcode, kernelname)
def dotdotdot(context, dotdotdot_cl): kernelName = test_common.mangle('test_si', ['float *']) kernel = test_common.build_kernel(context, dotdotdot_cl, kernelName) return kernel