def test_ternary(context, q, float_data, float_data_gpu):
    """Check the CUDA ternary operator: data[0] = data[1] > 0 ? data[2] : data[3].

    Runs the kernel twice — once with data[1] set positive (expects data[2])
    and once negative (expects data[3]).
    """
    kernelSource = """
__global__ void setValue(float *data, int idx, float value) {
    if(threadIdx.x == 0) {
        data[idx] = value;
    }
}

__global__ void testTernary(float *data) {
    data[0] = data[1] > 0 ? data[2] : data[3];
}
"""
    setValueKernelName = test_common.mangle('setValue', ['float *', 'int', 'float'])
    setValueProg = compile_code(cl, context, kernelSource, setValueKernelName, num_clmems=1)
    testTernaryName = test_common.mangle('testTernary', ['float *'])
    testTernaryProg = compile_code(cl, context, kernelSource, testTernaryName, num_clmems=1)

    float_data_orig = np.copy(float_data)

    def set_float_value(gpu_buffer, idx, value):
        # Write `value` into gpu_buffer[idx] on-device via the setValue kernel.
        # BUG FIX: this helper previously ignored `gpu_buffer` and always
        # targeted the captured `float_data_gpu`; it now uses its parameter
        # (identical behavior for the existing call sites below).
        getattr(setValueProg, setValueKernelName)(
            q, (32,), (32,),
            gpu_buffer, offset_type(0), np.int32(idx), np.float32(value),
            cl.LocalMemory(4))

    cl.enqueue_copy(q, float_data_gpu, float_data)
    print('float_data[:8]', float_data[:8])

    # Case 1: data[1] > 0 — ternary should select data[2].
    set_float_value(float_data_gpu, 1, 10)
    getattr(testTernaryProg, testTernaryName)(
        q, (32,), (32,), float_data_gpu, offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[:8]', float_data[:8])
    assert float_data[0] == float_data_orig[2]

    # Case 2: data[1] < 0 — ternary should select data[3].
    set_float_value(float_data_gpu, 1, -2)
    getattr(testTernaryProg, testTernaryName)(
        q, (32,), (32,), float_data_gpu, offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[:8]', float_data[:8])
    assert float_data[0] == float_data_orig[3]
def test_use_template1(context, q, int_data, int_data_gpu, float_data, float_data_gpu):
    """A templated __device__ addNumbers must instantiate for both float and int."""
    code = """
template< typename T >
__device__ T addNumbers(T one, T two) {
    return one + two;
}

__global__ void use_template1(float *data, int *intdata) {
    if(threadIdx.x == 0 && blockIdx.x == 0) {
        data[0] = addNumbers(data[1], data[2]);
        intdata[0] = addNumbers(intdata[1], intdata[2]);
    }
}
"""
    kernelName = test_common.mangle('use_template1', ['float *', 'int *'])
    prog = compile_code(cl, context, code, kernelName)

    # Snapshot host buffers before the kernel mutates slot 0.
    float_data_orig = np.copy(float_data)
    int_data_orig = np.copy(int_data)

    kernel = getattr(prog, kernelName)
    kernel(q, (32,), (32,),
           float_data_gpu, offset_type(0),
           int_data_gpu, offset_type(0),
           cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()

    assert float_data[0] == float_data_orig[1] + float_data_orig[2]
    assert int_data[0] == int_data_orig[1] + int_data_orig[2]
def test_copy_float(cuSourcecode, context, q, float_data, float_data_gpu):
    """The copy_float kernel should end with data[0] equal to data[1]."""
    kernelName = test_common.mangle('copy_float', ['float *'])
    prog = compile_code(cl, context, cuSourcecode, kernelName)
    kernel = getattr(prog, kernelName)
    kernel(q, (32,), (32,), float_data_gpu, offset_type(0), cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    assert float_data[0] == float_data[1]
def test_test_if_else(context, q, float_data, float_data_gpu):
    """if/else on thread id: tid < N doubles the slot, otherwise adds 5."""
    sourcecode = """
__global__ void testIfElse(float *data, int N) {
    int tid = threadIdx.x;
    if(tid < N) {
        data[tid] *= 2;
    } else {
        data[tid] += 5;
    }
}
"""
    kernelName = test_common.mangle('testIfElse', ['float *', 'int'])
    prog = compile_code(cl, context, sourcecode, kernelName, num_clmems=1)
    float_data_orig = np.copy(float_data)
    N = 2
    kernel = getattr(prog, kernelName)
    kernel(q, (32,), (32,),
           float_data_gpu, offset_type(0), np.int32(N), cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()

    # Dump the generated OpenCL for debugging.
    with open('/tmp/testprog-device.cl', 'r') as f:
        cl_code = f.read()
    print('cl_code', cl_code)

    for i in range(10):
        if i < N:
            assert float_data[i] == float_data_orig[i] * 2
        else:
            assert abs(float_data[i] - float_data_orig[i] - 5) <= 1e-4
def test_test_for(context, q, float_data, float_data_gpu):
    """A simple for-loop kernel: thread 0 sums data[0..N) into data[0]."""
    sourcecode = """
__global__ void testFor(float *data, int N) {
    if(threadIdx.x == 0) {
        float sum = 0.0f;
        for(int i = 0; i < N; i++) {
            sum += data[i];
        }
        data[0] = sum;
    }
}
"""
    kernelName = test_common.mangle('testFor', ['float *', 'int'])
    prog = compile_code(cl, context, sourcecode, kernelName, num_clmems=1)
    float_data_orig = np.copy(float_data)
    N = 4
    getattr(prog, kernelName)(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), np.int32(N), cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()

    # Dump the generated OpenCL for debugging.
    with open('/tmp/testprog-device.cl', 'r') as f:
        cl_code = f.read()
    print('cl_code', cl_code)

    # FIX: the accumulator was named `sum`, shadowing the builtin; renamed.
    expected = 0
    for i in range(N):
        expected += float_data_orig[i]
    assert abs(float_data[0] - expected) <= 1e-4
def test_test_do_while(context, q, float_data, float_data_gpu):
    """A do/while kernel: every thread accumulates data[i] until sum >= 15,
    then writes its sum into data[0].

    NOTE(review): no assertion here (matching the original) — data[0] is read
    inside the loop and written by every thread, so the final value depends on
    scheduling; the test only checks the kernel compiles and runs.
    """
    sourcecode = """
__global__ void testIfElse(float *data, int N) {
    int tid = threadIdx.x;
    int i = 0;
    float sum = 0;
    do {
        sum += data[i];
        i++;
    } while(sum < 15);
    data[0] = sum;
}
"""
    kernelName = test_common.mangle('testIfElse', ['float *', 'int'])
    prog = compile_code(cl, context, sourcecode, kernelName, num_clmems=1)
    float_data_orig = np.copy(float_data)
    N = 2
    getattr(prog, kernelName)(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), np.int32(N), cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()

    # Dump the generated OpenCL for debugging.
    with open('/tmp/testprog-device.cl', 'r') as f:
        cl_code = f.read()
    print('cl_code', cl_code)
    print(float_data[0])
def test_use_tid2(cuSourcecode, context, q, int_data, int_data_gpu):
    """use_tid2 should add each thread's id to its own slot of int_data."""
    int_data_orig = np.copy(int_data)
    kernelName = test_common.mangle('use_tid2', ['int *'])
    prog = compile_code(cl, context, cuSourcecode, kernelName)
    kernel = getattr(prog, kernelName)
    kernel(q, (32,), (32,), int_data_gpu, offset_type(0), cl.LocalMemory(4))
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    # Spot-check first, middle and last thread ids.
    assert int_data[0] == int_data_orig[0] + 0
    assert int_data[10] == int_data_orig[10] + 10
    assert int_data[31] == int_data_orig[31] + 31
def test_float4(cuSourcecode, context, ctx, q, float_data, float_data_gpu):
    """testFloat4 reads the buffer as float4; result slot 1 should be the
    product of components 2 and 3 of the second float4 (flat indices 6 and 7)."""
    float_data_orig = np.copy(float_data)
    kernelName = test_common.mangle('testFloat4', ['float4 *'])
    prog = compile_code(cl, context, cuSourcecode, kernelName)
    kernel = getattr(prog, kernelName)
    kernel(q, (32,), (32,), float_data_gpu, offset_type(0), cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data_orig[:8]', float_data_orig[:8])
    print('float_data[:8]', float_data[:8])
    assert float_data[1] == float_data_orig[4 + 2] * float_data_orig[4 + 3]
def test_float4_test2(cuSourcecode, context, ctx, q, float_data, float_data_gpu):
    """testFloat4_test2 should copy the second float4 (flat indices 4..7)
    into the first (flat indices 0..3)."""
    float_data_orig = np.copy(float_data)
    kernelName = test_common.mangle('testFloat4_test2', ['float4 *'])
    prog = compile_code(cl, context, cuSourcecode, kernelName, num_clmems=1)
    kernel = getattr(prog, kernelName)
    kernel(q, (32,), (32,), float_data_gpu, offset_type(0), cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data_orig[:8]', float_data_orig[:8])
    print('float_data[:8]', float_data[:8])
    for component in range(4):
        assert float_data[component] == float_data_orig[component + 4]
def test_simpleloop(context, q, float_data, float_data_gpu):
    """Compile and run a simple loop kernel.

    NOTE(review): no assertion (matching the original) — all 32 work-items
    execute the same non-atomic `data[i] += value` loop, so the final values
    are scheduling-dependent; this test only checks compile + launch.
    """
    sourcecode = """
__global__ void longKernel(float *data, int N, float value) {
    for(int i = 0; i < N; i++) {
        data[i] += value;
    }
}
"""
    kernelName = test_common.mangle('longKernel', ['float *', 'int', 'float'])
    prog = compile_code(cl, context, sourcecode, kernelName, num_clmems=1)
    float_data_orig = np.copy(float_data)
    N = 2
    getattr(prog, kernelName)(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), np.int32(N), np.float32(123),
        cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()

    # Dump the generated OpenCL for debugging.
    with open('/tmp/testprog-device.cl', 'r') as f:
        cl_code = f.read()
    print('cl_code', cl_code)
    print(float_data[0])
def test_test_inlines(context, q, float_data, float_data_gpu):
    """A __device__ helper called from a kernel must compile and run.

    NOTE(review): no assertion (matching the original) — the test only checks
    that the inlined call compiles; somefunc writes 3.4 into data[0] but the
    result is only printed via the generated CL dump, not verified.
    """
    sourcecode = """
__device__ void somefunc(float *data) {
    data[0] = 3.4;
}

__global__ void testInline(float *data, int N) {
    somefunc(data);
}
"""
    kernelName = test_common.mangle('testInline', ['float *', 'int'])
    prog = compile_code(cl, context, sourcecode, kernelName, num_clmems=1)
    float_data_orig = np.copy(float_data)
    N = 4
    getattr(prog, kernelName)(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), np.int32(N), cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()

    # Dump the generated OpenCL for debugging.
    with open('/tmp/testprog-device.cl', 'r') as f:
        cl_code = f.read()
    print('cl_code', cl_code)