Example #1
0
def test_sext(context, q, int_data, int_data_gpu):
    ll_code = """
define void @mykernel(i32* %data) {
  %1 = load i32, i32* %data
  %2 = sext i32 %1 to i64
  %3 = lshr i64 %2, 32
  %4 = trunc i64 %3 to i32
  store i32 %4, i32* %data
  ret void
}
"""
    cl_code = test_common.ll_to_cl(ll_code, 'mykernel', 1)
    print('cl_code', cl_code)
    for experiment in [{'in': 23, 'out': 0}, {'in': -1, 'out': -1}]:
        int_data[0] = experiment['in']
        cl.enqueue_copy(q, int_data_gpu, int_data)
        kernel = test_common.build_kernel(context, cl_code, 'mykernel')
        kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0),
               cl.LocalMemory(32))
        from_gpu = np.copy(int_data)
        cl.enqueue_copy(q, from_gpu, int_data_gpu)
        q.finish()
        # expected = (np.uint32(int_data[1]) * np.uint32(int_data[2])) >> 32
        expected = experiment['out']
        print('expected', expected)
        print('from_gpu[0]', from_gpu[0])
        assert expected == from_gpu[0].item()
        split_cl = cl_code.split('\n')
        found_long_cast = False
        for line in split_cl:
            if ' >> 32' in line and '(long)' in line:
                found_long_cast = True
        assert found_long_cast
Example #2
0
def test_umulhi(context, q, int_data, int_data_gpu):
    ll_code = """
declare i32 @_Z8__umulhiii(i32, i32)

define void @test_umulhi(i32* %data) {
  %1 = load i32, i32* %data

  %2 = getelementptr i32, i32* %data, i32 1
  %3 = load i32, i32* %2

  %4 = getelementptr i32, i32* %data, i32 2
  %5 = load i32, i32* %4

  %6 = call i32 @_Z8__umulhiii(i32 %3, i32 %5)
  store i32 %6, i32* %data
  ret void
}
"""
    cl_code = test_common.ll_to_cl(ll_code, 'test_umulhi', 1)
    print('cl_code', cl_code)
    int_data[0] = 0
    int_data[1] = -50
    int_data[2] = 2523123
    cl.enqueue_copy(q, int_data_gpu, int_data)
    kernel = test_common.build_kernel(context, cl_code, 'test_umulhi')
    kernel(q, (32,), (32,), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32))
    from_gpu = np.copy(int_data)
    cl.enqueue_copy(q, from_gpu, int_data_gpu)
    q.finish()
    expected = (np.uint64(np.uint32(2523123)) * np.uint64(np.uint32(-50))) // 2**32
    print('expected', expected)
    print('from_gpu[0]', from_gpu[0])
    assert expected == from_gpu[0].item()
Example #3
0
def test_memcpy(context, q, int_data, int_data_gpu):
    ll_code = """
declare void @_Z6memcpyPvPKvm(i8*, i8*, i64)

define void @mykernel(i32* %data) {
  %1 = bitcast i32* %data to i8*

  %2 = getelementptr i32, i32* %data, i32 8
  %3 = bitcast i32* %2 to i8*

  call void @_Z6memcpyPvPKvm(i8 *%3, i8 *%1, i64 32)
  ret void
}
"""
    cl_code = test_common.ll_to_cl(ll_code, 'mykernel', num_clmems=1)
    print('cl_code', cl_code)
    for i in range(8):
        int_data[i] = 3 + i
    cl.enqueue_copy(q, int_data_gpu, int_data)
    kernel = test_common.build_kernel(context, cl_code, 'mykernel')
    kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0),
           cl.LocalMemory(32))
    from_gpu = np.copy(int_data)
    cl.enqueue_copy(q, from_gpu, int_data_gpu)
    q.finish()
    for i in range(8):
        print(i, from_gpu[8 + i])
        assert from_gpu[8 + i] == 3 + i
Example #4
0
def test_internal_struct(context, q, float_data, float_data_gpu):
    cu_code = """
struct MyStruct {
    float afloat;
    int anint;
    float *floatpointer;
    // float **floatstarstart;
};

__attribute__((device)) void processStruct(MyStruct *myStruct) {
    myStruct->afloat = myStruct->floatpointer[0];
}

__attribute__((global)) void mykernel(float *data) {
    float afloat = data[0];
    float float2 = data[1];
    struct MyStruct myStruct = { afloat, 3, &float2 };
    processStruct(&myStruct);
    data[2] = myStruct.afloat;
}
"""
    ll_code = test_common.cu_to_devicell_noopt(cu_code)
    print('ll_code',
          'define ' + ll_code.split('define ')[1].split('}')[0] + '}')
    cl_code = test_common.ll_to_cl(ll_code, '_Z8mykernelPf', num_clmems=1)
    print('cl_code', cl_code)
Example #5
0
def extract_value_cl():
    # lets check it's compileable ll first, using llvm
    ll_filepath = 'test/extract_value.ll'

    with open(ll_filepath, 'r') as f:
        ll_sourcecode = f.read()
    cl_sourcecode = test_common.ll_to_cl(ll_sourcecode=ll_sourcecode,
                                         kernelName=kernelname,
                                         num_clmems=1)
    return cl_sourcecode
Example #6
0
def test_load_globalfloatstar(context, q, float_data, float_data_gpu):
    ll_code = """define void @mykernel(float * %p1) {
    %1 = load float, float* %p1
    ret void
}
"""
    cl_sourcecode = test_common.ll_to_cl(ll_code, "mykernel", num_clmems=1)
    print('cl_sourcecode', cl_sourcecode)
    assert len([
        l for l in cl_sourcecode.split('\n') if l.strip() == 'float v2;'
    ]) == 1
Example #7
0
def test_kernelparam_ll(context, q, float_data, float_data_gpu):
    ll_code = """define void @mykernel(float * %p1) {
    ret void
}
"""
    cl_sourcecode = test_common.ll_to_cl(ll_code, "mykernel", num_clmems=1)
    print('cl_sourcecode', cl_sourcecode)
    assert len([
        l for l in cl_sourcecode.split('\n')
        if l.strip().startswith('global float* p1')
    ]) == 1
Example #8
0
def dotdotdot_cl():
    # lets check it's compileable ll first, using llvm
    ll_filepath = 'test/dotdotdot.ll'

    with open(ll_filepath, 'r') as f:
        ll_sourcecode = f.read()

    kernelName = test_common.mangle('test_si', ['float *'])
    cl_sourcecode = test_common.ll_to_cl(ll_sourcecode,
                                         kernelName,
                                         num_clmems=1)
    print('cl_sourcecode', cl_sourcecode)
    return cl_sourcecode
Example #9
0
def test_addr_of_float(context, q, float_data, float_data_gpu):
    cu_code = """
__attribute__((global)) void mykernel(float *data) {
    float v = data[0];
    float *v1 = &v;
    *v1 = 5.0f;
}
"""
    ll_code = test_common.cu_to_devicell_noopt(cu_code)
    print('ll_code',
          'define ' + ll_code.split('define ')[1].split('}')[0] + '}')
    cl_code = test_common.ll_to_cl(ll_code, '_Z8mykernelPf', num_clmems=1)
    print('cl_code', cl_code)
Example #10
0
def test_float_constants_from_ll(context, q, float_data, float_data_gpu):
    ll_code = """
define void @kernel_float_constants(float* nocapture %data) #1 {
  store float 0x3E7AD7F2A0000000, float* %data
  ret void
}
"""
    cl_code = test_common.ll_to_cl(ll_code, 'kernel_float_constants', 1)
    print('cl_code', cl_code)
    # try compiling it, just to be sure...
    kernel = test_common.build_kernel(context, cl_code, 'kernel_float_constants')
    kernel(q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32))
    from_gpu = np.copy(float_data)
    cl.enqueue_copy(q, from_gpu, float_data_gpu)
    q.finish()
    print('from_gpu[0]', from_gpu[0])
    print(type(from_gpu[0]), type(1e-7))
    assert abs(float(from_gpu[0]) - 1e-7) <= 1e-10
    assert 'data[0] = 1e-07f' in cl_code
Example #11
0
def test_struct_byval(context, q, float_data, float_data_gpu):
    cu_code = """
struct MyStruct {
    float afloat;
    int anint;
    float *floatpointer;
    float **floatstarstar;
};

__attribute__((global)) void mykernel(struct MyStruct myStruct) {
}
"""
    ll_code = test_common.cu_to_devicell_noopt(cu_code)
    print('ll_code',
          'define ' + ll_code.split('define ')[1].split('}')[0] + '}')
    cl_code = test_common.ll_to_cl(ll_code,
                                   '_Z8mykernel8MyStruct',
                                   num_clmems=1)
    print('cl_code', cl_code)
Example #12
0
def test_atomic_add_floats(context, q, float_data, float_data_gpu):
    ll_code = """
declare float @_Z9atomicAddIfET_PS0_S0_(float *, float)

define void @mykernel(float* nocapture %data) #1 {
  %1 = call float @_Z9atomicAddIfET_PS0_S0_(float * %data, float 3.25)
  ret void
}
"""
    cl_code = test_common.ll_to_cl(ll_code, 'mykernel', 1)
    print('cl_code', cl_code)
    # try compiling it, just to be sure...
    float_data[0] = 0
    kernel = test_common.build_kernel(context, cl_code, 'mykernel')
    cl.enqueue_copy(q, float_data_gpu, float_data)
    kernel(q, (128,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32))
    from_gpu = np.copy(float_data)
    cl.enqueue_copy(q, from_gpu, float_data_gpu)
    q.finish()
    print('from_gpu[0]', from_gpu[0])
    assert from_gpu[0] == 3.25 * 128
Example #13
0
def try_build(context, ll_filepath, kernelname):
    with open(ll_filepath, 'r') as f:
        llcode = f.read()
    clcode = test_common.ll_to_cl(llcode, kernelname)
    test_common.build_kernel(context, clcode, kernelname)
Example #14
0
def test_cwise_sqrt_singlebuffer(context, queue, float_data, float_data_gpu):
    with open('test/tf/samples/cwise_op_gpu_sqrt-device-noopt.ll', 'r') as f:
        ll_code = f.read()

    cl_sourcecode = test_common.ll_to_cl(
        ll_code,
        '_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_',
        num_clmems=3)

    prog = cl.Program(context, cl_sourcecode).build()

    N = 10

    # global struct Eigen__TensorEvaluator_nopointers* eval_nopointers, global float* eval_ptr0, long eval_ptr0_offset, global float* eval_ptr1, long eval_ptr1_offset, int size, local int *scratch

    # what we need:
    # struct Eigen__TensorEvaluator_nopointers   Note that none of the values we copy across are actually use, so we can just create a sufficiently large buffer...
    # global float *eval_ptr0  => this will receive the result.  just create a sufficiently large buffer
    # ptr0_offset => 0
    # eval_ptr1 => will contian the data we want to reduce
    # eval_ptr1_offset=> 0
    # size =>  eg 10, to reduce 10 values
    # scratch => set to workgroupsize * sizeof(float)

    # by compariosn to the earlier test, we create a sigle buffer, containing both ptr0 and ptr1, and just use
    # offset into this

    src_host = np.random.uniform(0, 1, size=(N,)).astype(np.float32) + 1.0
    dst_host = np.zeros(N, dtype=np.float32)

    src_offset_bytes = 128
    dst_offset_bytes = 256

    huge_buf_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)
    # huge_buf_gpu_spare = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)

    eval_nopointers_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)

    eval_ptr0_gpu = huge_buf_gpu
    eval_ptr0_offset = dst_offset_bytes

    eval_ptr1_gpu = huge_buf_gpu
    eval_ptr1_offset = src_offset_bytes

    size = N

    # copy our host memory across
    # cl.enqueue_copy(q, huge_buf_gpu_spare, src_host, device_offset=256, size=N * 4)
    test_common.enqueue_write_buffer_ext(cl, queue, huge_buf_gpu, src_host, device_offset=src_offset_bytes, size=N * 4)

    global_size = 256
    workgroup_size = 256
    scratch = workgroup_size * 4

    prog.__getattr__('_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_')(
        queue, (global_size,), (workgroup_size,),
        eval_nopointers_gpu, offset_type(0),
        eval_ptr0_gpu, offset_type(0),
        eval_ptr1_gpu, offset_type(0),
        offset_type(0),
        offset_type(eval_ptr0_offset),
        offset_type(eval_ptr1_offset),
        np.int32(size),
        cl.LocalMemory(scratch)
    )
    # check for errors
    queue.finish()

    test_common.enqueue_read_buffer_ext(cl, queue, huge_buf_gpu, dst_host, device_offset=dst_offset_bytes, size=N * 4)
    # cl.enqueue_copy(queue, dst_host, huge_buf_gpu, device_offset=128, size=N * 4)
    queue.finish()
    print('dst_host[:N]', dst_host[:N])

    expected = np.sqrt(src_host)
    print('expected[:10]', expected[:N])
    assert np.abs(expected[:N] - dst_host[:N]).max() < 1e-4
Example #15
0
def test_cwise_sqrt(context, q, float_data, float_data_gpu):
    with open('test/tf/samples/cwise_op_gpu_sqrt-device-noopt.ll', 'r') as f:
        ll_code = f.read()

    cl_sourcecode = test_common.ll_to_cl(
        ll_code,
        '_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_',
        num_clmems=3)

    print('creating program...')
    prog_unbuilt = cl.Program(context, cl_sourcecode)
    print('building kernel...')
    prog = prog_unbuilt.build()

    N = 10

    # global struct Eigen__TensorEvaluator_nopointers* eval_nopointers, global float* eval_ptr0, long eval_ptr0_offset, global float* eval_ptr1, long eval_ptr1_offset, int size, local int *scratch

    # what we need:
    # struct Eigen__TensorEvaluator_nopointers   Note that none of the values we copy across are actually use, so we can just create a sufficiently large buffer...
    # global float *eval_ptr0  => this will receive the result.  just create a sufficiently large buffer
    # ptr0_offset => 0
    # eval_ptr1 => will contian the data we want to reduce
    # eval_ptr1_offset=> 0
    # size =>  eg 10, to reduce 10 values
    # scratch => set to workgroupsize * sizeof(float)

    eval_nopointers_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)

    eval_ptr0 = np.zeros(1024, dtype=np.float32)
    eval_ptr0_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=eval_ptr0)
    eval_ptr0_offset = 0

    eval_ptr1 = np.random.uniform(0, 1, size=(1024,)).astype(np.float32) + 1.0
    eval_ptr1_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=eval_ptr1)
    eval_ptr1_offset = 0

    size = N

    global_size = 256
    workgroup_size = 256
    scratch = workgroup_size * 4

    print('running kernel...')
    prog.__getattr__('_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_')(
        q, (global_size,), (workgroup_size,),
        eval_nopointers_gpu, offset_type(0),
        eval_ptr0_gpu, offset_type(0),
        eval_ptr1_gpu, offset_type(0),
        offset_type(0),
        offset_type(eval_ptr0_offset),
        offset_type(eval_ptr1_offset),
        np.int32(size),
        cl.LocalMemory(scratch)
    )
    # check for errors
    q.finish()
    # copy eval_ptr0 back, and check the results...
    cl.enqueue_copy(q, eval_ptr0, eval_ptr0_gpu)
    q.finish()
    print('eval_ptr0[:N]', eval_ptr0[:N])

    expected = np.sqrt(eval_ptr1)
    print('expected[:10]', expected[:N])
    assert np.abs(expected[:N] - eval_ptr0[:N]).max() < 1e-4