def test_int_expressions(context, q, int_data, int_data_gpu):
    """A compound integer expression ((a | b) == c) must survive CUDA->OpenCL
    translation; the GPU result is compared against the same expression on host."""
    cu_source = """
    __global__ void myKernel(int *data) {
        data[0] = (data[10] | data[11]) == data[12];
    }
    """
    mangled = test_common.mangle('myKernel', ['int *'])
    cl_src = test_common.cu_to_cl(cu_source, mangled, num_clmems=1)
    print('cl_sourcecode', cl_src)
    kernel = test_common.build_kernel(context, cl_src, mangled)
    # seed the inputs the kernel reads; everything else zeroed
    int_data[:50] = 0
    int_data[10] = 2
    int_data[11] = 1
    int_data[12] = 1
    cl.enqueue_copy(q, int_data_gpu, int_data)
    q.finish()
    kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0),
           cl.LocalMemory(4))
    q.finish()
    gpu_out = np.zeros((1024, ), dtype=np.int32)
    cl.enqueue_copy(q, gpu_out, int_data_gpu)
    q.finish()
    actual = gpu_out[0]
    expected = int((int_data[10] | int_data[11]) == int_data[12])
    print('actual', actual, 'expected', expected)
    assert actual == expected
def test_ieeefloats(context, q, float_data, float_data_gpu):
    """INFINITY / -INFINITY literals, both as double and via float locals, should
    round-trip through the transpiled kernel unchanged."""
    cu_code = """
    __global__ void mykernel(double *data) {
        double d_neginfinity = -INFINITY;
        double d_posinfinity = INFINITY;
        float f_neginfinity = -INFINITY;
        float f_posinfinity = INFINITY;
        data[0] = INFINITY;
        data[1] = -INFINITY;
        data[2] = f_neginfinity;
        data[3] = f_posinfinity;
    }
    """
    mangled = test_common.mangle('mykernel', ['double*'])
    cl_code = test_common.cu_to_cl(cu_code, mangled, num_clmems=1)
    built = test_common.build_kernel(context, cl_code, mangled)
    built(q, (32,), (32,), float_data_gpu, offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print(float_data[:4])
    # expected: [+inf, -inf, -inf, +inf]
    for idx, want in ((0, np.inf), (1, -np.inf), (2, -np.inf), (3, np.inf)):
        assert float_data[idx] == want
def test_sincos(context, q, float_data, float_data_gpu):
    """sincosf, with both a literal and a loaded argument, should agree with
    Python's math.sin/math.cos to ~1e-4."""
    cu_code = """
    __global__ void mykernel(float *data) {
        sincosf(0.1, &data[0], &data[1]);
        sincosf(data[2], &data[3], &data[4]);
    }
    """
    mangled = test_common.mangle('mykernel', ['float*'])
    cl_code = test_common.cu_to_cl(cu_code, mangled, num_clmems=1)
    print('cl_code', cl_code)
    float_data[2] = -0.3
    host_before = np.copy(float_data)
    cl.enqueue_copy(q, float_data_gpu, float_data)
    kernel = test_common.build_kernel(context, cl_code, mangled)
    kernel(q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0),
           cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print(float_data[:5])
    checks = [
        (float_data[0], math.sin(0.1)),
        (float_data[1], math.cos(0.1)),
        (float_data[3], math.sin(host_before[2])),
        (float_data[4], math.cos(host_before[2])),
    ]
    for got, want in checks:
        assert abs(got - want) < 1e-4
def test_inlining(context, q, float_data, float_data_gpu):
    """Kernel computes two inlinable arithmetic expressions into data[0] and data[7].

    Fix: the original test launched the kernel (which writes both data[0] and
    data[7]) but only asserted data[0]; data[7] is now verified as well.
    """
    cu_source = """
    __global__ void myKernel(float *data) {
        data[0] = (data[3] * (data[1] + data[2])) / data[4];
        data[7] = (data[3] / (data[1] - data[2])) * data[4];
    }
    """
    kernelName = test_common.mangle('myKernel', ['float *'])
    cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=1)
    print('cl_sourcecode', cl_sourcecode)
    kernel = test_common.build_kernel(context, cl_sourcecode, kernelName)
    # inputs: data[i] = i + 3 for the first ten slots
    for i in range(10):
        float_data[i] = i + 3
    cl.enqueue_copy(q, float_data_gpu, float_data)
    q.finish()
    kernel(q, (32, ), (32, ), float_data_gpu, offset_type(0), cl.LocalMemory(4))
    q.finish()
    float_data2 = np.zeros((1024, ), dtype=np.float32)
    cl.enqueue_copy(q, float_data2, float_data_gpu)
    q.finish()
    print('float_data2[0]', float_data2[0])
    d = float_data
    d2 = float_data2
    expect0 = (d[3] * (d[1] + d[2])) / d[4]
    expect7 = (d[3] / (d[1] - d[2])) * d[4]
    assert abs(d2[0] - expect0) < 1e-5
    # second expression was previously computed but never checked
    assert abs(d2[7] - expect7) < 1e-4
def test_long_conflicting_names(context, q):
    """Very long device-function names that differ only at the tail must not
    collide after shortening; every generated name must fit in 32 chars."""
    # 8 repetitions of the base name, exactly as in the original CUDA source
    base = 'mysuperlongfunctionname' * 8
    cu_source = """
    __device__ void LONGNAME(float *d) {
        d[1] = 1.0f;
    }
    __device__ void LONGNAMEb(float *d) {
        d[2] = 3.0f;
    }
    __global__ void LONGNAMEc(float *data) {
        data[0] = 123.0f;
        LONGNAME(data);
        LONGNAMEb(data);
    }
    """.replace('LONGNAME', base)
    mangled_name = test_common.mangle(base + 'c', ['float *'])
    cl_source = test_common.cu_to_cl(cu_source, mangled_name)
    print('cl_source', cl_source)
    for line in cl_source.split("\n"):
        stripped = line.strip()
        if stripped.startswith('/*'):
            continue
        if not stripped.replace('kernel ', '').strip().startswith('void'):
            continue
        name = line.replace('kernel ', '').replace('void ', '').split('(')[0]
        if name != '':
            print('name', name)
            assert len(name) <= 32
    test_common.build_kernel(context, cl_source, mangled_name[:31])
def test_atomic_inc_int(context, q, int_data, int_data_gpu):
    """atomicInc: counting without wrap at data[0], and with wrap (modulus) at data[1].

    (atomic api based on usage in Eigen
    unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h)
    """
    cu_code = """
    __global__ void mykernel(int *data, int limit) {
        atomicInc((unsigned int *)data, limit);
    }
    """
    mangled = '_Z8mykernelPii'
    cl_code = test_common.cu_to_cl(cu_code, mangled, 1)
    print('cl_code', cl_code)
    int_data[0] = 0
    int_data[1] = 0
    kernel = test_common.build_kernel(context, cl_code, mangled)
    cl.enqueue_copy(q, int_data_gpu, int_data)
    blocks, threads = 4, 4
    modulus = 11
    grid = (blocks * threads,)
    block = (threads,)
    # first launch: limit high enough that no wrap occurs
    kernel(q, grid, block, int_data_gpu, offset_type(0), offset_type(0),
           np.int32(256), cl.LocalMemory(32))
    # second launch: limit = modulus - 1, so the counter wraps mod `modulus`
    kernel(q, grid, block, int_data_gpu, offset_type(0), offset_type(4),
           np.int32(modulus - 1), cl.LocalMemory(32))
    from_gpu = np.copy(int_data)
    cl.enqueue_copy(q, from_gpu, int_data_gpu)
    q.finish()
    print('from_gpu', from_gpu[:2])
    assert from_gpu[0] == blocks * threads
    assert from_gpu[1] == blocks * threads % modulus
def test_short_names(context):
    """Short device-function names (with a forward declaration and calls inside a
    loop) should translate and build; compile-only, no launch."""
    cu_source = """
    __device__ void funca(float *d);
    __device__ void funca(float *d) {
        d[1] = 1.0f;
    }
    __device__ void funcb(float *d, int c) {
        d[2] = 3.0f + 5 - d[c];
    }
    __global__ void funck(float *data) {
        data[0] = 123.0f;
        funca(data);
        funcb(data, (int)data[6]);
        for(int i = 0; i < 1000; i++) {
            funcb(data + i, (int)data[i + 100]);
        }
    }
    """
    mangled_name = test_common.mangle('funck', ['float *'])
    cl_source = test_common.cu_to_cl(cu_source, mangled_name)
    print('cl_source', cl_source)
    test_common.build_kernel(context, cl_source, mangled_name[:31])
def test_insertvalue(context, q, float_data, float_data_gpu):
    """Returning a struct by value from a device function exercises LLVM
    insertvalue handling.  Compile-only: the kernel is built but never launched,
    so the float_data fixtures are unused here."""
    sourcecode = """
    struct mystruct {
        int f0;
        float f1;
    };
    __device__ struct mystruct doSomething(struct mystruct foo, int somevalue);
    __device__ struct mystruct doSomething(struct mystruct foo, int somevalue) {
        foo.f0 = somevalue;
        foo.f1 = 4.5f;
        return foo;
    }
    __global__ void somekernel(float *data) {
        struct mystruct foo;
        foo.f0 = 3;
        foo.f1 = 4.5;
        foo = doSomething(foo, data[2]);
        data[0] = (int)foo.f0;
        data[1] = foo.f1;
    }
    """
    mangledname = test_common.mangle('somekernel', ['float *'])
    cl_code = test_common.cu_to_cl(sourcecode, mangledname)
    test_common.build_kernel(context, cl_code, mangledname)
def test_no_pointer_struct_ointer(context):
    """A struct parameter containing pointers should appear in the generated
    kernel signature as its pointer-stripped '_nopointers' clone."""
    with open("test/pointerpointer.cu", 'r') as f:
        cu_code = f.read()
    cl_code = test_common.cu_to_cl(
        cu_code, '_Z11myte6kernelP16TensorEvaluator6PfP9GpuDeviceiii')
    kernel_lines = [ln for ln in cl_code.split('\n') if ln.startswith('kernel')]
    # same as the original scan: only the last matching line is inspected
    assert kernel_lines != []
    assert 'class_GpuDevice_nopointers' in kernel_lines[-1]
def test_compile(context, cu_filepath, kernelname):
    """Compile a .cu file: a throwaway cu_to_cl pass dumps the device IR to
    /tmp/testprog-device.ll, from which the mangled kernel name is recovered,
    then the file is translated and built for real.

    Fixes: the bare `except:` (which also swallowed KeyboardInterrupt/SystemExit)
    is narrowed to `except Exception`, and `mangledname` is no longer referenced
    unbound when no matching `define` line exists in the IR.

    NOTE(review): this function is redefined later in the file with an extra
    `num_clmems` parameter; pytest only collects the later definition.
    """
    with open(cu_filepath, 'r') as f:
        cu_code = f.read()
    try:
        # expected to possibly fail (empty kernel name); we only need the
        # side effect of writing /tmp/testprog-device.ll
        cl_code = test_common.cu_to_cl(cu_code, '')
    except Exception:
        pass
    with open('/tmp/testprog-device.ll') as f:
        ll_code = f.read()
    mangledname = None
    for line in ll_code.split('\n'):
        if line.startswith('define') and kernelname in line:
            mangledname = line.split('@')[1].split('(')[0]
            break
    assert mangledname is not None, 'kernel %s not found in device ll' % kernelname
    print('mangledname', mangledname)
    cl_code = test_common.cu_to_cl(cu_code, mangledname)
    test_common.build_kernel(context, cl_code, mangledname[:31])
def test_getelementptr_struct_global(context, q, float_data, float_data_gpu):
    """getelementptr on a by-value struct kernel parameter that contains
    pointers.  Compile-only: build the kernel, never launch it."""
    cu_source = """
    struct MyStruct {
        float* f0;
        float* f1;
    };
    __global__ void foo(struct MyStruct mystruct) {
        float *floats = mystruct.f0;
    }
    """
    # mangled name written by hand; mangle() is not used for the struct param
    kernelName = '_Z3foo8MyStruct'
    cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=3)
    print('cl_sourcecode', cl_sourcecode)
    test_common.build_kernel(context, cl_sourcecode, kernelName)
def test_getelementptr_struct_local(context, q, float_data, float_data_gpu):
    """getelementptr on a stack-local struct containing pointers.  Compile-only:
    build the kernel, never launch it."""
    cu_source = """
    struct MyStruct {
        float* f0;
        float* f1;
    };
    __global__ void foo(float *data) {
        struct MyStruct astruct;
        float *floats = astruct.f0;
    }
    """
    mangled = test_common.mangle('foo', ['float *'])
    cl_sourcecode = test_common.cu_to_cl(cu_source, mangled, num_clmems=1)
    print('cl_sourcecode', cl_sourcecode)
    test_common.build_kernel(context, cl_sourcecode, mangled)
def test_compile(context, cu_filepath, kernelname, num_clmems):
    """Full pipeline for a .cu file: cu -> ll (to recover the mangled kernel
    name) -> cl -> built OpenCL kernel.

    Fix: `mangledname` was referenced unbound (NameError) whenever no matching
    `define` line was found in the generated IR; it now fails with a clear
    assertion message instead.
    """
    with open(cu_filepath, 'r') as f:
        cu_code = f.read()
    ll_sourcecode = test_common.cu_to_ll(cu_code)
    mangledname = None
    for line in ll_sourcecode.split('\n'):
        if line.startswith('define') and kernelname in line:
            mangledname = line.split('@')[1].split('(')[0]
            break
    assert mangledname is not None, 'kernel %s not found in ll' % kernelname
    print('mangledname', mangledname)
    cl_code = test_common.cu_to_cl(cu_code, mangledname, num_clmems=num_clmems)
    print('got cl_code')
    test_common.build_kernel(context, cl_code, mangledname)
    print('after build kernel')
def test_atomic_cas_unsignedint(context, q, int_data, int_data_gpu):
    """atomicCAS retry loop: each thread increments data[0] exactly once, and
    because CAS serializes the updates, every thread's attempt count is unique.

    (atomicCAS api based on usage in Eigen
    unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h)
    """
    cu_code = """
    __global__ void mykernel(unsigned int *data) {
        int tid = threadIdx.x;
        int gid = blockIdx.x * blockDim.x + threadIdx.x;
        int numAttempts = 0;
        while(true) {
            numAttempts += 1;
            unsigned int oldValue = data[0];
            unsigned int newValue = oldValue + 1;
            unsigned int returnedOld = atomicCAS(data, oldValue, newValue);
            if(returnedOld == oldValue) {
                break;
            }
        }
        data[1 + gid] = numAttempts;
    }
    """
    mangled = '_Z8mykernelPj'
    cl_code = test_common.cu_to_cl(cu_code, mangled, 1)
    print('cl_code', cl_code)
    int_data[0] = 0
    kernel = test_common.build_kernel(context, cl_code, mangled)
    cl.enqueue_copy(q, int_data_gpu, int_data)
    num_blocks, threads_per_block = 4, 4
    total = num_blocks * threads_per_block
    kernel(q, (total,), (threads_per_block,), int_data_gpu,
           offset_type(0), offset_type(0), cl.LocalMemory(32))
    from_gpu = np.copy(int_data)
    cl.enqueue_copy(q, from_gpu, int_data_gpu)
    q.finish()
    print('from_gpu', from_gpu[:17])
    # check two things:
    # - final value of data[0] equals the thread count
    # - every per-thread attempt count is unique
    assert from_gpu[0] == total
    seen_num_attempts = set()
    for i in range(total):
        num_attempts = from_gpu[i + 1]
        if num_attempts in seen_num_attempts:
            raise Exception('already saw num_attempts %s' % num_attempts)
        seen_num_attempts.add(num_attempts)
def test_fabs_double(context, q, float_data, float_data_gpu):
    """fabs() on a negative value should come back positive.

    NOTE(review): despite the `_double` suffix, the kernel operates on
    `float *` data — the name looks stale; confirm intent.
    """
    cu_code = """
    __global__ void mykernel(float *data) {
        data[0] = fabs(data[0]);
    }
    """
    mangled = '_Z8mykernelPf'
    cl_code = test_common.cu_to_cl(cu_code, mangled, 1)
    print('cl_code', cl_code)
    float_data[0] = -0.123
    cl.enqueue_copy(q, float_data_gpu, float_data)
    kernel = test_common.build_kernel(context, cl_code, mangled)
    kernel(q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0),
           cl.LocalMemory(32))
    from_gpu = np.copy(float_data)
    cl.enqueue_copy(q, from_gpu, float_data_gpu)
    q.finish()
    expected = 0.123
    print('expected', expected)
    print('from_gpu[0]', from_gpu[0])
    assert abs(expected - from_gpu[0].item()) < 1e-4
def test_local(context, q, float_data, float_data_gpu):
    """__shared__ round-trip: lane i writes localmem[i] then reads localmem[i+1],
    so after the launch data[0:31] should equal the original data[1:32].

    NOTE(review): the CUDA source has no barrier between the shared write and
    read — correctness appears to rely on single-warp execution or a barrier
    inserted by the transpiler; confirm.
    """
    cu_source = """
    __global__ void foo(float *data) {
        __shared__ float localmem[33];
        int tid = threadIdx.x;
        int warpId = tid % 32;
        localmem[warpId] = data[warpId];
        data[warpId] = localmem[warpId + 1];
    }
    """
    mangled = test_common.mangle('foo', ['float *'])
    cl_sourcecode = test_common.cu_to_cl(cu_source, mangled)
    print('cl_sourcecode', cl_sourcecode)
    kernel = test_common.build_kernel(context, cl_sourcecode, mangled)
    snapshot = np.copy(float_data)
    kernel(q, (32,), (32,), float_data_gpu, offset_type(0), cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('before', snapshot[:5])
    print('after', float_data[:5])
    assert np.abs(snapshot[1:32] - float_data[0:31]).max() <= 1e-4
def x_test_play(context, q, float_data, float_data_gpu):
    """Scratch experiment (disabled via the `x_` prefix, so pytest skips it):
    pointer-to-local and pass-by-pointer through device functions.  Only prints
    the generated OpenCL; no build, launch, or assertions."""
    cu_source = """
    __device__ void process(float *data) {
        *data = 5.0f;
    }
    __device__ float process2(float value) {
        process(&value);
        return value;
    }
    __global__ void mykernel(float *data) {
        float v = data[0];
        float *v1 = &v;
        *v1 = 5.0f;
        data[0] = v;
        data[0] = process2(data[0]);
    }
    """
    mangled = test_common.mangle('mykernel', ['float *'])
    cl_sourcecode = test_common.cu_to_cl(cu_source, mangled, num_clmems=1)
    print('cl_sourcecode', cl_sourcecode)
def try_build(context, filepath, kernelname):
    """Helper: translate the CUDA file at `filepath` to OpenCL and build
    `kernelname` (truncated to the 31-character kernel-name limit)."""
    with open(filepath, 'r') as source_file:
        cuda_code = source_file.read()
    opencl_code = test_common.cu_to_cl(cuda_code, kernelname)
    test_common.build_kernel(context, opencl_code, kernelname[:31])