Example #1
0
def test_atomic_inc_int(context, q, int_data, int_data_gpu):
    # atomicCAS api based on usage in Eigen unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
    cu_code = """
__global__ void mykernel(int *data, int limit) {
    atomicInc((unsigned int *)data, limit);
}
"""
    cl_code = test_common.cu_to_cl(cu_code, '_Z8mykernelPii', 1)
    print('cl_code', cl_code)

    int_data[0] = 0
    int_data[1] = 0
    kernel = test_common.build_kernel(context, cl_code, '_Z8mykernelPii')
    cl.enqueue_copy(q, int_data_gpu, int_data)
    num_blocks = 4
    threads_per_block = 4
    modulus = 11
    kernel(q, (num_blocks * threads_per_block,), (threads_per_block,), int_data_gpu, offset_type(0), offset_type(0), np.int32(256), cl.LocalMemory(32))
    kernel(q, (num_blocks * threads_per_block,), (threads_per_block,), int_data_gpu, offset_type(0), offset_type(4), np.int32(modulus - 1), cl.LocalMemory(32))
    from_gpu = np.copy(int_data)
    cl.enqueue_copy(q, from_gpu, int_data_gpu)
    q.finish()
    print('from_gpu', from_gpu[:2])

    assert from_gpu[0] == num_blocks * threads_per_block
    assert from_gpu[1] == num_blocks * threads_per_block % modulus
Example #2
0
def test_alloca(context, q, float_data, float_data_gpu):

    code = """
__global__ void mykernel(float *data) {
    float *foo = data;
    foo[0] = data[1] + data[2] + data[3];
}
"""
    kernelName = test_common.mangle('mykernel', ['float *'])
    try:
        dict = test_common.compile_code_v2(cl,
                                           context,
                                           code,
                                           kernelName,
                                           num_clmems=1)
        prog = dict['prog']
        cl_sourcecode = dict['cl_sourcecode']
        print('cl_sourcecode', cl_sourcecode)
    except Exception as e:
        with open('/tmp/testprog-device.cl', 'r') as f:
            print(f.read())
        raise e
    print('type(offset_type(0))', type(offset_type(0)))
    prog.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu,
                                 offset_type(0), offset_type(0),
                                 cl.LocalMemory(4))
Example #3
0
def test_sitofp(context, q, float_data, float_data_gpu, int_data, int_data_gpu):

    code = """
__global__ void myKernel(float *float_data, int *int_data) {
    float_data[0] = (float)int_data[0];
}
"""
    kernel = test_common.compile_code_v3(cl, context, code, test_common.mangle('myKernel', ['float *', 'int *']), num_clmems=2)['kernel']
    int_data[0] = 5
    int_data[1] = 2
    int_data[2] = 4
    cl.enqueue_copy(q, int_data_gpu, int_data)
    kernel(
        q, (32,), (32,),
        float_data_gpu,
        int_data_gpu,
        offset_type(0),
        offset_type(0),
        cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    print('float_data[0]', float_data[0])
    # expected = pow(float_data[1], float_data[2])
    assert float_data[0] == 5
def test_inlining(context, q, float_data, float_data_gpu):

    cu_source = """
__global__ void myKernel(float *data) {
    data[0] = (data[3] * (data[1] + data[2])) / data[4];
    data[7] = (data[3] / (data[1] - data[2])) * data[4];
}
"""
    kernelName = test_common.mangle('myKernel', ['float *'])
    cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=1)
    print('cl_sourcecode', cl_sourcecode)
    kernel = test_common.build_kernel(context, cl_sourcecode, kernelName)

    for i in range(10):
        float_data[i] = i + 3
    cl.enqueue_copy(q, float_data_gpu, float_data)
    q.finish()
    # prog = cl.Program(context, sourcecode).build()
    # prog.__getattr__(kernelName)(
    kernel(q, (32, ), (32, ), float_data_gpu, offset_type(0), offset_type(0),
           cl.LocalMemory(4))
    q.finish()
    float_data2 = np.zeros((1024, ), dtype=np.float32)
    cl.enqueue_copy(q, float_data2, float_data_gpu)
    q.finish()
    print('float_data2[0]', float_data2[0])
    d = float_data
    d2 = float_data2
    expect = (d[3] * (d[1] + d[2])) / d[4]
    assert abs(d2[0] - expect) < 1e-5
Example #5
0
def test_sincos(context, q, float_data, float_data_gpu):

    cu_code = """
__global__ void mykernel(float *data) {
    sincosf(0.1, &data[0], &data[1]);
    sincosf(data[2], &data[3], &data[4]);
}
"""
    kernel_name = test_common.mangle('mykernel', ['float*'])
    cl_code = test_common.cu_to_cl(cu_code, kernel_name, num_clmems=1)
    print('cl_code', cl_code)

    float_data[2] = -0.3
    float_data_orig = np.copy(float_data)
    cl.enqueue_copy(q, float_data_gpu, float_data)

    kernel = test_common.build_kernel(context, cl_code, kernel_name)
    kernel(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print(float_data[:5])
    assert abs(float_data[0] - math.sin(0.1)) < 1e-4
    assert abs(float_data[1] - math.cos(0.1)) < 1e-4
    assert abs(float_data[3] - math.sin(float_data_orig[2])) < 1e-4
    assert abs(float_data[4] - math.cos(float_data_orig[2])) < 1e-4
Example #6
0
def test_use_template1(context, q, int_data, int_data_gpu, float_data,
                       float_data_gpu):
    code = """
template< typename T >
__device__ T addNumbers(T one, T two) {
    return one + two;
}

__global__ void use_template1(float *data, int *intdata) {
    if(threadIdx.x == 0 && blockIdx.x == 0) {
        data[0] = addNumbers(data[1], data[2]);
        intdata[0] = addNumbers(intdata[1], intdata[2]);
    }
}
"""
    kernelName = test_common.mangle('use_template1', ['float *', 'int *'])
    prog = compile_code(cl, context, code, kernelName, num_clmems=2)
    float_data_orig = np.copy(float_data)
    int_data_orig = np.copy(int_data)

    prog.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu,
                                 int_data_gpu, offset_type(0), offset_type(0),
                                 cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    assert float_data[0] == float_data_orig[1] + float_data_orig[2]
    assert int_data[0] == int_data_orig[1] + int_data_orig[2]
Example #7
0
def test_sqrt(context, q, float_data, float_data_gpu):

    code = """
__global__ void myKernel(float *data) {
    data[threadIdx.x] = sqrt(data[threadIdx.x]);
}
"""
    kernel = test_common.compile_code_v3(cl, context, code, test_common.mangle('myKernel', ['float *']), num_clmems=1)['kernel']
    float_data[0] = 1.5
    float_data[1] = 4.6
    float_data[2] = -1.5
    float_data[3] = 0
    float_data_orig = np.copy(float_data)
    cl.enqueue_copy(q, float_data_gpu, float_data)
    kernel(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[:4]', float_data[:4])
    for i in range(4):
        if float_data_orig[i] >= 0:
            assert abs(float_data[i] - math.sqrt(float_data_orig[i])) <= 1e-4
        else:
            assert math.isnan(float_data[i])
def test_int_expressions(context, q, int_data, int_data_gpu):

    cu_source = """
__global__ void myKernel(int *data) {
    data[0] = (data[10] | data[11]) == data[12];
}
"""
    kernelName = test_common.mangle('myKernel', ['int *'])
    cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=1)
    print('cl_sourcecode', cl_sourcecode)
    kernel = test_common.build_kernel(context, cl_sourcecode, kernelName)

    for i in range(50):
        int_data[i] = 0
    int_data[10] = 2
    int_data[11] = 1
    int_data[12] = 1
    cl.enqueue_copy(q, int_data_gpu, int_data)
    q.finish()
    # prog = cl.Program(context, sourcecode).build()
    # prog.__getattr__(kernelName)(
    kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0),
           cl.LocalMemory(4))
    q.finish()
    gpu_data = np.zeros((1024, ), dtype=np.int32)
    cl.enqueue_copy(q, gpu_data, int_data_gpu)
    q.finish()
    data = int_data
    actual = gpu_data[0]
    expected = int((data[10] | data[11]) == data[12])
    print('actual', actual, 'expected', expected)
    assert actual == expected
Example #9
0
def test_double_ieeefloats(context, q, float_data, float_data_gpu):
    cu_code = """
__global__ void mykernel(double *data) {
    double d_neginfinity = -INFINITY;
    double d_posinfinity = INFINITY;
    float f_neginfinity = -INFINITY;
    float f_posinfinity = INFINITY;
    data[0] = INFINITY;
    data[1] = -INFINITY;
    data[2] = f_neginfinity;
    data[3] = f_posinfinity;
}
"""
    kernel_name = test_common.mangle('mykernel', ['double*'])
    cl_code = test_common.cu_to_cl(cu_code, kernel_name, num_clmems=1)
    kernel = test_common.build_kernel(context, cl_code, kernel_name)
    kernel(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print(float_data[:4])
    assert float_data[0] == np.inf
    assert float_data[1] == - np.inf
    assert float_data[2] == - np.inf
    assert float_data[3] == np.inf
Example #10
0
def test_test_if_else(context, q, float_data, float_data_gpu):
    sourcecode = """
__global__ void testIfElse(float *data, int N) {
    int tid = threadIdx.x;
    if(tid < N) {
        data[tid] *= 2;
    } else {
        data[tid] += 5;
    }
}
"""
    kernelName = test_common.mangle('testIfElse', ['float *', 'int'])
    prog = compile_code(cl, context, sourcecode, kernelName, num_clmems=1)
    float_data_orig = np.copy(float_data)

    N = 2
    prog.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu,
                                 offset_type(0), offset_type(0), np.int32(N),
                                 cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    with open('/tmp/testprog-device.cl', 'r') as f:
        cl_code = f.read()
    print('cl_code', cl_code)
    for i in range(10):
        if i < N:
            assert float_data[i] == float_data_orig[i] * 2
        else:
            assert abs(float_data[i] - float_data_orig[i] - 5) <= 1e-4
Example #11
0
def test_copy_float(extract_value, q, float_data, float_data_gpu):
    extract_value.__getattr__(kernelname)(q, (32, ), (32, ), float_data_gpu,
                                          offset_type(0), offset_type(0),
                                          cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    assert float_data[0] == float_data[1]
Example #12
0
def test_test_for(context, q, float_data, float_data_gpu):
    sourcecode = """
__global__ void testFor(float *data, int N) {
    if(threadIdx.x == 0) {
        float sum = 0.0f;
        for(int i = 0; i < N; i++) {
            sum += data[i];
        }
        data[0] = sum;
    }
}
"""
    kernelName = test_common.mangle('testFor', ['float *', 'int'])
    prog = compile_code(cl, context, sourcecode, kernelName, num_clmems=1)
    float_data_orig = np.copy(float_data)

    N = 4
    prog.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu,
                                 offset_type(0), offset_type(0), np.int32(N),
                                 cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    with open('/tmp/testprog-device.cl', 'r') as f:
        cl_code = f.read()
    print('cl_code', cl_code)

    sum = 0
    for i in range(N):
        sum += float_data_orig[i]
    assert abs(float_data[0] - sum) <= 1e-4
Example #13
0
def test_memcpy(context, q, int_data, int_data_gpu):
    ll_code = """
declare void @_Z6memcpyPvPKvm(i8*, i8*, i64)

define void @mykernel(i32* %data) {
  %1 = bitcast i32* %data to i8*

  %2 = getelementptr i32, i32* %data, i32 8
  %3 = bitcast i32* %2 to i8*

  call void @_Z6memcpyPvPKvm(i8 *%3, i8 *%1, i64 32)
  ret void
}
"""
    cl_code = test_common.ll_to_cl(ll_code, 'mykernel', num_clmems=1)
    print('cl_code', cl_code)
    for i in range(8):
        int_data[i] = 3 + i
    cl.enqueue_copy(q, int_data_gpu, int_data)
    kernel = test_common.build_kernel(context, cl_code, 'mykernel')
    kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0),
           cl.LocalMemory(32))
    from_gpu = np.copy(int_data)
    cl.enqueue_copy(q, from_gpu, int_data_gpu)
    q.finish()
    for i in range(8):
        print(i, from_gpu[8 + i])
        assert from_gpu[8 + i] == 3 + i
Example #14
0
def test_test_do_while(context, q, float_data, float_data_gpu):
    sourcecode = """
__global__ void testIfElse(float *data, int N) {
    int tid = threadIdx.x;
    int i = 0;
    float sum = 0;
    do {
        sum += data[i];
        i++;
    } while(sum < 15);
    data[0] = sum;
}
"""
    kernelName = test_common.mangle('testIfElse', ['float *', 'int'])
    prog = compile_code(cl, context, sourcecode, kernelName, num_clmems=1)
    float_data_orig = np.copy(float_data)

    N = 2
    prog.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu,
                                 offset_type(0), offset_type(0), np.int32(N),
                                 cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    with open('/tmp/testprog-device.cl', 'r') as f:
        cl_code = f.read()
    print('cl_code', cl_code)
    print(float_data[0])
Example #15
0
def test_sext(context, q, int_data, int_data_gpu):
    ll_code = """
define void @mykernel(i32* %data) {
  %1 = load i32, i32* %data
  %2 = sext i32 %1 to i64
  %3 = lshr i64 %2, 32
  %4 = trunc i64 %3 to i32
  store i32 %4, i32* %data
  ret void
}
"""
    cl_code = test_common.ll_to_cl(ll_code, 'mykernel', 1)
    print('cl_code', cl_code)
    for experiment in [{'in': 23, 'out': 0}, {'in': -1, 'out': -1}]:
        int_data[0] = experiment['in']
        cl.enqueue_copy(q, int_data_gpu, int_data)
        kernel = test_common.build_kernel(context, cl_code, 'mykernel')
        kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0),
               cl.LocalMemory(32))
        from_gpu = np.copy(int_data)
        cl.enqueue_copy(q, from_gpu, int_data_gpu)
        q.finish()
        # expected = (np.uint32(int_data[1]) * np.uint32(int_data[2])) >> 32
        expected = experiment['out']
        print('expected', expected)
        print('from_gpu[0]', from_gpu[0])
        assert expected == from_gpu[0].item()
        split_cl = cl_code.split('\n')
        found_long_cast = False
        for line in split_cl:
            if ' >> 32' in line and '(long)' in line:
                found_long_cast = True
        assert found_long_cast
Example #16
0
def test_umulhi(context, q, int_data, int_data_gpu):
    ll_code = """
declare i32 @_Z8__umulhiii(i32, i32)

define void @test_umulhi(i32* %data) {
  %1 = load i32, i32* %data

  %2 = getelementptr i32, i32* %data, i32 1
  %3 = load i32, i32* %2

  %4 = getelementptr i32, i32* %data, i32 2
  %5 = load i32, i32* %4

  %6 = call i32 @_Z8__umulhiii(i32 %3, i32 %5)
  store i32 %6, i32* %data
  ret void
}
"""
    cl_code = test_common.ll_to_cl(ll_code, 'test_umulhi', 1)
    print('cl_code', cl_code)
    int_data[0] = 0
    int_data[1] = -50
    int_data[2] = 2523123
    cl.enqueue_copy(q, int_data_gpu, int_data)
    kernel = test_common.build_kernel(context, cl_code, 'test_umulhi')
    kernel(q, (32,), (32,), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32))
    from_gpu = np.copy(int_data)
    cl.enqueue_copy(q, from_gpu, int_data_gpu)
    q.finish()
    expected = (np.uint64(np.uint32(2523123)) * np.uint64(np.uint32(-50))) // 2**32
    print('expected', expected)
    print('from_gpu[0]', from_gpu[0])
    assert expected == from_gpu[0].item()
Example #17
0
def test_pow(context, q, float_data, float_data_gpu):

    code = """
__global__ void myKernel(float *data) {
    data[0] = pow(data[1], data[2]);
    data[3] = pow(data[4], data[5]);
    data[5] = pow(data[7], data[8]);
}
"""
    kernel = test_common.compile_code_v3(cl, context, code, test_common.mangle('myKernel', ['float *']), num_clmems=1)['kernel']
    float_data[1] = 1.5
    float_data[2] = 4.6
    float_data[4] = -1.5
    float_data[5] = 4.6
    float_data[7] = 1.5
    float_data[8] = -4.6
    cl.enqueue_copy(q, float_data_gpu, float_data)
    kernel(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[0]', float_data[0])
    print('float_data[3]', float_data[3])
    print('float_data[6]', float_data[6])
    expected = pow(float_data[1], float_data[2])
    assert abs(float_data[0] - expected) <= 1e-4
Example #18
0
def test_ternary(context, q, float_data, float_data_gpu):
    kernelSource = """
__global__ void setValue(float *data, int idx, float value) {
    if(threadIdx.x == 0) {
        data[idx] = value;
    }
}
__global__ void testTernary(float *data) {
    data[0] = data[1] > 0 ? data[2] : data[3];
}
"""
    setValueKernelName = test_common.mangle('setValue',
                                            ['float *', 'int', 'float'])
    setValueProg = compile_code(cl,
                                context,
                                kernelSource,
                                setValueKernelName,
                                num_clmems=1)

    testTernaryName = test_common.mangle('testTernary', ['float *'])
    testTernaryProg = compile_code(cl,
                                   context,
                                   kernelSource,
                                   testTernaryName,
                                   num_clmems=1)

    float_data_orig = np.copy(float_data)

    def set_float_value(gpu_buffer, idx, value):
        setValueProg.__getattr__(setValueKernelName)(q, (32, ), (32, ),
                                                     float_data_gpu,
                                                     offset_type(0),
                                                     np.int32(idx),
                                                     np.float32(value),
                                                     cl.LocalMemory(4))

    cl.enqueue_copy(q, float_data_gpu, float_data)
    print('float_data[:8]', float_data[:8])
    set_float_value(float_data_gpu, 1, 10)
    testTernaryProg.__getattr__(testTernaryName)(q, (32, ),
                                                 (32, ), float_data_gpu,
                                                 offset_type(0),
                                                 cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[:8]', float_data[:8])
    assert float_data[0] == float_data_orig[2]

    set_float_value(float_data_gpu, 1, -2)
    testTernaryProg.__getattr__(testTernaryName)(q, (32, ),
                                                 (32, ), float_data_gpu,
                                                 offset_type(0),
                                                 cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[:8]', float_data[:8])
    assert float_data[0] == float_data_orig[3]
Example #19
0
 def set_float_value(gpu_buffer, idx, value):
     setValueProg.__getattr__(setValueKernelName)(q, (32, ), (32, ),
                                                  float_data_gpu,
                                                  offset_type(0),
                                                  offset_type(0),
                                                  np.int32(idx),
                                                  np.float32(value),
                                                  cl.LocalMemory(4))
Example #20
0
def test_singlebuffer_sqrt_cocl(context, queue):
    """
    Test doing stuff with one single large buffer for destination and source, just offset a bit
    """
    code = """
__global__ void myKernel(float *data0, float *data1, int N) {
    if(threadIdx.x < N) {
        data0[threadIdx.x] = sqrt(data1[threadIdx.x]);
    }
}
"""
    mangledName = '_Z8myKernelPfS_i'
    kernel = test_common.compile_code_v3(cl,
                                         context,
                                         code,
                                         mangledName,
                                         num_clmems=2)['kernel']

    N = 10

    src_host = np.random.uniform(0, 1, size=(N, )).astype(np.float32) + 1.0
    dst_host = np.zeros(N, dtype=np.float32)

    src_offset = 128
    dst_offset = 256

    huge_buf_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)

    test_common.enqueue_write_buffer_ext(cl,
                                         queue,
                                         huge_buf_gpu,
                                         src_host,
                                         device_offset=src_offset,
                                         size=N * 4)

    global_size = 256
    workgroup_size = 256
    # scratch = workgroup_size * 4

    kernel(queue, (global_size, ), (workgroup_size, ), huge_buf_gpu,
           offset_type(0), huge_buf_gpu, offset_type(0),
           offset_type(dst_offset), offset_type(src_offset), np.int32(N),
           cl.LocalMemory(4))
    queue.finish()
    test_common.enqueue_read_buffer_ext(cl,
                                        queue,
                                        huge_buf_gpu,
                                        dst_host,
                                        device_offset=dst_offset,
                                        size=N * 4)

    queue.finish()
    print('src_host', src_host)
    print('dst_host', dst_host)
    print('np.sqrt(src_host)', np.sqrt(src_host))

    assert np.abs(np.sqrt(src_host) - dst_host).max() <= 1e-4
Example #21
0
def test_foo(context, q, float_data, float_data_gpu, cuSourcecode):
    kernelName = test_common.mangle('foo', ['float *'])
    testcudakernel1 = compile_code(cl,
                                   context,
                                   cuSourcecode,
                                   kernelName,
                                   num_clmems=1)
    testcudakernel1.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu,
                                            offset_type(0), offset_type(0),
                                            cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    assert float_data[0] == 123
Example #22
0
def test_structs(context, q, float_data, float_data_gpu, int_data,
                 int_data_gpu):

    code = """
struct MyStruct {
    int myint;
    float myfloat;
};

__global__ void testStructs(MyStruct *structs, float *float_data, int *int_data) {
    int_data[0] = structs[0].myint;
    float_data[0] = structs[0].myfloat;
    float_data[1] = structs[1].myfloat;
}
"""
    kernel = test_common.compile_code_v3(
        cl,
        context,
        code,
        test_common.mangle('testStructs', ['MyStruct *', 'float *', 'int *']),
        num_clmems=3)['kernel']

    # my_struct = np.dtype([("myfloat", np.float32), ("myint", np.int32)])  # I dont know why, but seems these are back to front...
    my_struct = np.dtype([
        ("myint", np.int32), ("myfloat", np.float32)
    ])  # seems these are wrong way around on HD5500.  Works ok on 940M
    my_struct, my_struct_c_decl = pyopencl.tools.match_dtype_to_c_struct(
        context.devices[0], "MyStruct", my_struct)
    my_struct = cl.tools.get_or_register_dtype("MyStruct", my_struct)
    structs = np.empty(2, my_struct)
    structs[0]['myint'] = 123
    structs[1]['myint'] = 33
    structs[0]['myfloat'] = 567
    structs[1]['myfloat'] = 44
    structs_gpu = cl.array.to_device(q, structs)
    # p = structs_gpu.map_to_host(q)
    # print('p', p)
    # q.finish()
    kernel(q, (32, ), (32, ), structs_gpu.data, offset_type(0), float_data_gpu,
           offset_type(0), int_data_gpu, offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    print('int_data[0]', int_data[0])
    print('int_data[1]', int_data[1])
    print('float_data[0]', float_data[0])
    print('float_data[1]', float_data[1])
    assert int_data[0] == 123
    assert float_data[0] == 567
    assert float_data[1] == 44
Example #23
0
def test_copy_float(cuSourcecode, context, q, float_data, float_data_gpu):
    argTypes = ['float *']
    kernelName = test_common.mangle('copy_float', argTypes)
    testcudakernel1 = compile_code(cl,
                                   context,
                                   cuSourcecode,
                                   kernelName,
                                   num_clmems=1)
    testcudakernel1.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu,
                                            offset_type(0), offset_type(0),
                                            cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    assert float_data[0] == float_data[1]
Example #24
0
def test_use_tid2(cuSourcecode, context, q, int_data, int_data_gpu):
    int_data_orig = np.copy(int_data)
    kernelName = test_common.mangle('use_tid2', ['int *'])
    testcudakernel1 = compile_code(cl,
                                   context,
                                   cuSourcecode,
                                   kernelName,
                                   num_clmems=1)
    testcudakernel1.__getattr__(kernelName)(q, (32, ), (32, ), int_data_gpu,
                                            offset_type(0), offset_type(0),
                                            cl.LocalMemory(4))
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    assert int_data[0] == int_data_orig[0] + 0
    assert int_data[10] == int_data_orig[10] + 10
    assert int_data[31] == int_data_orig[31] + 31
Example #25
0
def test_float4(cuSourcecode, context, ctx, q, float_data, float_data_gpu):
    float_data_orig = np.copy(float_data)
    kernelName = test_common.mangle('testFloat4', ['float4 *'])
    testcudakernel1 = compile_code(cl,
                                   context,
                                   cuSourcecode,
                                   kernelName,
                                   num_clmems=1)
    testcudakernel1.__getattr__(kernelName)(q, (32, ), (32, ), float_data_gpu,
                                            offset_type(0), offset_type(0),
                                            cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()

    print('float_data_orig[:8]', float_data_orig[:8])
    print('float_data[:8]', float_data[:8])
    assert float_data[1] == float_data_orig[4 + 2] * float_data_orig[4 + 3]
Example #26
0
def test_nested_for(context, q, float_data, float_data_gpu):
    source = """
__device__ void myfunc(float *data, int a, int b) {
    data[1] = data[2];
    float sum = 0;
    for(int i = 0; i < a; i++) {
        for(int j = 0; j < b; j++) {
            sum += data[i * 17 + j * 7];
        }
    }
    data[0] = sum;
}

__global__ void mykernel(float *data, int a, int b) {
    myfunc(data, a, b);
}
"""
    kernelName = test_common.mangle('mykernel', ['float *', 'int', 'int'])
    kernel = test_common.compile_code_v3(cl,
                                         context,
                                         source,
                                         kernelName,
                                         num_clmems=1)['kernel']
    float_data_orig = np.copy(float_data)

    a = 2
    b = 3
    kernel(q, (32, ), (32, ), float_data_gpu, offset_type(0), offset_type(0),
           np.int32(a), np.int32(b), cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    with open('/tmp/testprog-device.cl', 'r') as f:
        cl_code = f.read()
    print('cl_code', cl_code)

    sum = 0
    for i in range(a):
        for j in range(b):
            sum += float_data_orig[i * 17 + j * 7]

    print('float_data_orig', float_data_orig[:3])
    print('float_data', float_data[:3])

    assert float_data[1] == float_data_orig[2]
    assert abs(float_data[0] - sum) <= 1e-4
Example #27
0
def test_clz(context, q, float_data, float_data_gpu, int_data, int_data_gpu):

    code = """
__global__ void myKernel(int *int_data) {
    int_data[0] = __clz(int_data[1]);
}
"""
    kernel = test_common.compile_code_v3(cl, context, code, test_common.mangle('myKernel', ['int *']), num_clmems=1)['kernel']
    int_data[1] = 15
    cl.enqueue_copy(q, int_data_gpu, int_data)
    kernel(
        q, (32,), (32,),
        int_data_gpu, offset_type(0), offset_type(0),
        cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, int_data, int_data_gpu)
    q.finish()
    print('int_data[:2]', int_data[:2])
Example #28
0
def test_floatconstants(context, q, float_data, float_data_gpu):

    code = """
__device__ float4 getvals() {
    return make_float4(0xFFF0000000000000, 0x7FF0000000000000, INFINITY, -INFINITY);
}

__global__ void myKernel(float *data) {
    data[0] = 18442240474082181120.0f; // 0xFFF0000000000000
    data[1] = 9218868437227405312.0f; // 0x7FF0000000000000
    float4 vals = getvals();
    data[2] = vals.x;
    data[3] = vals.y;
    data[4] = vals.w;
    data[5] = vals.z;
    data[6] = INFINITY;
    data[7] = -INFINITY;
    // data[8] = 0xFFEFFFFFFFFFFFFF;
}
"""
    kernel = test_common.compile_code_v3(cl, context, code, test_common.mangle('myKernel', ['float *']), num_clmems=1)['kernel']
    kernel(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('float_data[0]', float_data[0])
    print('float_data[1]', float_data[1])
    print('float_data[2]', float_data[2])
    print('float_data[3]', float_data[3])
    print('float_data[4]', float_data[4])
    print('float_data[5]', float_data[5])
    print('float_data[6]', float_data[6])
    print('float_data[7]', float_data[7])
    # print('float_data[8]', float_data[8])
    assert float_data[0] > 100000000
    assert float_data[1] > 100000000
    assert float_data[2] > 100000000
    assert float_data[3] > 100000000
    assert float_data[4] == - np.inf
    assert float_data[5] == np.inf
    assert float_data[6] == np.inf
    assert float_data[7] == - np.inf
Example #29
0
def test_fabs_double(context, q, float_data, float_data_gpu):
    cu_code = """
__global__ void mykernel(float *data) {
    data[0] = fabs(data[0]);
}
"""
    cl_code = test_common.cu_to_cl(cu_code, '_Z8mykernelPf', 1)
    print('cl_code', cl_code)
    float_data[0] = -0.123
    cl.enqueue_copy(q, float_data_gpu, float_data)
    kernel = test_common.build_kernel(context, cl_code, '_Z8mykernelPf')
    kernel(q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32))
    from_gpu = np.copy(float_data)
    cl.enqueue_copy(q, from_gpu, float_data_gpu)
    q.finish()
    expected = 0.123
    print('expected', expected)
    print('from_gpu[0]', from_gpu[0])
    assert abs(expected - from_gpu[0].item()) < 1e-4
Example #30
0
def test_float_constants_from_ll(context, q, float_data, float_data_gpu):
    ll_code = """
define void @kernel_float_constants(float* nocapture %data) #1 {
  store float 0x3E7AD7F2A0000000, float* %data
  ret void
}
"""
    cl_code = test_common.ll_to_cl(ll_code, 'kernel_float_constants', 1)
    print('cl_code', cl_code)
    # try compiling it, just to be sure...
    kernel = test_common.build_kernel(context, cl_code, 'kernel_float_constants')
    kernel(q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32))
    from_gpu = np.copy(float_data)
    cl.enqueue_copy(q, from_gpu, float_data_gpu)
    q.finish()
    print('from_gpu[0]', from_gpu[0])
    print(type(from_gpu[0]), type(1e-7))
    assert abs(float(from_gpu[0]) - 1e-7) <= 1e-10
    assert 'data[0] = 1e-07f' in cl_code