Example #1
def test_singlebuffer_sqrt_cocl(context, queue):
    """
    Test using a single large buffer for both destination and source, with the two regions at different byte offsets.
    """
    code = """
__global__ void myKernel(float *data0, float *data1, int N) {
    if(threadIdx.x < N) {
        data0[threadIdx.x] = sqrt(data1[threadIdx.x]);
    }
}
"""
    mangledName = '_Z8myKernelPfS_i'
    kernel = test_common.compile_code_v3(cl,
                                         context,
                                         code,
                                         mangledName,
                                         num_clmems=2)['kernel']

    N = 10

    src_host = np.random.uniform(0, 1, size=(N, )).astype(np.float32) + 1.0
    dst_host = np.zeros(N, dtype=np.float32)

    src_offset = 128
    dst_offset = 256

    huge_buf_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)

    test_common.enqueue_write_buffer_ext(cl,
                                         queue,
                                         huge_buf_gpu,
                                         src_host,
                                         device_offset=src_offset,
                                         size=N * 4)

    global_size = 256
    workgroup_size = 256
    # scratch = workgroup_size * 4
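    # Note (assumption): the wrapper generated by compile_code_v3 appears to take a
    # (buffer, base offset) pair for each cl_mem, followed by one virtual-pointer
    # offset per pointer parameter in parameter order (data0/dst, then data1/src,
    # as byte offsets matching the device_offset values above), then the scalar
    # arguments and a local scratch buffer.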

    kernel(queue, (global_size, ), (workgroup_size, ), huge_buf_gpu,
           offset_type(0), huge_buf_gpu, offset_type(0),
           offset_type(dst_offset), offset_type(src_offset), np.int32(N),
           cl.LocalMemory(4))
    queue.finish()
    test_common.enqueue_read_buffer_ext(cl,
                                        queue,
                                        huge_buf_gpu,
                                        dst_host,
                                        device_offset=dst_offset,
                                        size=N * 4)

    queue.finish()
    print('src_host', src_host)
    print('dst_host', dst_host)
    print('np.sqrt(src_host)', np.sqrt(src_host))

    assert np.abs(np.sqrt(src_host) - dst_host).max() <= 1e-4
Example #2
def test_singlebuffer_sqrt_opencl_1(context, queue):
    """
    Test using a single large buffer for both destination and source, with the two regions at different byte offsets.
    """
    code = """
kernel void myKernel(global float *data0, long offset0, global float *data1, long offset1, int N) {
    data0 += (offset0 >> 2);
    data1 += (offset1 >> 2);
    if(get_global_id(0) < N) {
        data0[get_global_id(0)] = sqrt(data1[get_global_id(0)]);
    }
}
"""
    prog = cl.Program(context, code).build()

    N = 10

    src_host = np.random.uniform(0, 1, size=(N, )).astype(np.float32) + 1.0
    dst_host = np.zeros(N, dtype=np.float32)

    src_offset = 128
    dst_offset = 256

    huge_buf_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)

    global_size = 256
    workgroup_size = 256

    test_common.enqueue_write_buffer_ext(cl,
                                         queue,
                                         huge_buf_gpu,
                                         src_host,
                                         device_offset=src_offset,
                                         size=N * 4)
    queue.finish()
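    # The launch below passes byte offsets (dst then src); the kernel converts them
    # to float element indices with `offset >> 2` before indexing.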

    prog.myKernel(queue, (global_size, ), (workgroup_size, ), huge_buf_gpu,
                  np.int64(dst_offset), huge_buf_gpu, np.int64(src_offset),
                  np.int32(N))
    queue.finish()

    test_common.enqueue_read_buffer_ext(cl,
                                        queue,
                                        huge_buf_gpu,
                                        dst_host,
                                        device_offset=dst_offset,
                                        size=N * 4)
    queue.finish()

    print('src_host', src_host)
    print('dst_host', dst_host)
    print('np.sqrt(src_host)', np.sqrt(src_host))

    assert np.abs(np.sqrt(src_host) - dst_host).max() <= 1e-4
Example #3
def test_singlebuffer_sqrt_opencl_2(context, queue):
    """
    Test using a single large buffer for both destination and source, with the two regions at different byte offsets.
    """
    code = """
kernel void _Z8myKernelPfS_i(global float* data0, long data0_offset, global float* data1, long data1_offset, int N) {
    //data1 = (global float*)((global char *)data1 + data1_offset);
    //data0 = (global float*)((global char *)data0 + data0_offset);
    //data1 = data1 + (data1_offset >> 2);
    //data0 = data0 + (data0_offset >> 2);
    data1 = data1 + data1_offset;
    data0 = data0 + data0_offset;

    if(get_local_id(0) < N) {
        data0[get_local_id(0)] = (float)sqrt(data1[get_local_id(0)]);
    }
}
"""
    prog = cl.Program(context, code).build()

    N = 10
    bufsize = 64 * 1024 * 1024

    np.random.seed(444)
    src_host = np.random.uniform(0, 1, size=(N, )).astype(np.float32) + 1.0
    dst_host = np.zeros(N, dtype=np.float32)

    src_offset = 128
    dst_offset = 256

    huge_buf_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=bufsize)

    global_size = 256
    workgroup_size = 256

    test_common.enqueue_write_buffer_ext(cl,
                                         queue,
                                         huge_buf_gpu,
                                         src_host,
                                         device_offset=src_offset,
                                         size=N * 4)
    queue.finish()
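    # This kernel adds the offset directly to the float pointer, so the launch below
    # passes element offsets (byte offset // 4) rather than byte offsets.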

    prog._Z8myKernelPfS_i(queue, (global_size, ),
                          (workgroup_size, ), huge_buf_gpu,
                          np.int64(dst_offset // 4), huge_buf_gpu,
                          np.int64(src_offset // 4), np.int32(N))
    queue.finish()

    test_common.enqueue_read_buffer_ext(cl,
                                        queue,
                                        huge_buf_gpu,
                                        dst_host,
                                        device_offset=dst_offset,
                                        size=N * 4)
    queue.finish()

    print('src_host', src_host)
    print('dst_host', dst_host)
    print('np.sqrt(src_host)', np.sqrt(src_host))

    assert np.abs(np.sqrt(src_host) - dst_host).max() <= 1e-4
Example #4
def test_cwise_sqrt_singlebuffer(context, queue, float_data, float_data_gpu):
    options = test_common.cocl_options()
    i = 0
    opt_options = []
    iropencl_options = []
    while i < len(options):
        if options[i] == '--devicell-opt':
            opt_options.append('-' + options[i + 1])
            i += 2
            continue
        if options[i] in ['--run_branching_transforms', '--branches_as_switch']:
            iropencl_options.append(options[i])
            i += 1
            continue
        raise Exception('unknown option: ' + options[i])
    print('opt_options', opt_options)
    print('iropencl_options', iropencl_options)
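    # Unless NOREBUILD is set, regenerate the OpenCL source: run LLVM opt over the
    # device IR sample, then translate the optimized IR with ir-to-opencl, selecting
    # the mangled Eigen sqrt kernel by name.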
    if 'NOREBUILD' not in os.environ:
        res = subprocess.run([
            join(CLANG_HOME, 'bin/opt')
        ] + opt_options + [
            '-S',
            'test/tf/samples/cwise_op_gpu_sqrt-device-noopt.ll',
            '-o', '/tmp/test-opt.ll'
        ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        print(' '.join(res.args))
        print(res.stdout.decode('utf-8'))
        assert res.returncode == 0

        res = subprocess.run([
            'build/ir-to-opencl'
        ] + iropencl_options + [
            '--inputfile', '/tmp/test-opt.ll',
            '--outputfile', '/tmp/test-device.cl',
            '--cmem-indexes', '0,1,2',
            '--kernelname', '_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_'
        ], stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
        print(' '.join(res.args))
        print(res.stdout.decode('utf-8'))
        assert res.returncode == 0

    with open('/tmp/test-device.cl', 'r') as f:
        cl_sourcecode = f.read()

    prog = cl.Program(context, cl_sourcecode).build()

    N = 10

    # kernel signature (generated): global struct Eigen__TensorEvaluator_nopointers* eval_nopointers, global float* eval_ptr0, long eval_ptr0_offset, global float* eval_ptr1, long eval_ptr1_offset, int size, local int *scratch

    # what we need:
    # struct Eigen__TensorEvaluator_nopointers   none of the values we copy across are actually used, so we can just create a sufficiently large buffer...
    # global float *eval_ptr0  => this will receive the result; just create a sufficiently large buffer
    # eval_ptr0_offset => 0
    # eval_ptr1 => will contain the input data
    # eval_ptr1_offset => 0
    # size => e.g. 10, to process 10 values
    # scratch => set to workgroup_size * sizeof(float)

    # by comparison with the earlier test, we create a single buffer containing both ptr0 and ptr1,
    # and just use offsets into it

    src_host = np.random.uniform(0, 1, size=(N,)).astype(np.float32) + 1.0
    dst_host = np.zeros(N, dtype=np.float32)

    src_offset_bytes = 128
    dst_offset_bytes = 256

    huge_buf_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)
    # huge_buf_gpu_spare = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)

    eval_nopointers_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)

    eval_ptr0_gpu = huge_buf_gpu
    eval_ptr0_offset = dst_offset_bytes

    eval_ptr1_gpu = huge_buf_gpu
    eval_ptr1_offset = src_offset_bytes

    size = N

    # copy our host memory across
    # cl.enqueue_copy(q, huge_buf_gpu_spare, src_host, device_offset=256, size=N * 4)
    test_common.enqueue_write_buffer_ext(cl, queue, huge_buf_gpu, src_host, device_offset=src_offset_bytes, size=N * 4)

    global_size = 256
    workgroup_size = 256
    scratch = workgroup_size * 4
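    # Note (assumption): the generated kernel appears to take a (buffer, base offset)
    # pair for each cl_mem (eval_nopointers, eval_ptr0, eval_ptr1), then the
    # per-pointer virtual offsets in the same order (0, dst offset, src offset,
    # in bytes), the element count, and a local scratch buffer.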

    prog.__getattr__('_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_')(
        queue, (global_size,), (workgroup_size,),
        eval_nopointers_gpu, offset_type(0),
        eval_ptr0_gpu, offset_type(0),
        eval_ptr1_gpu, offset_type(0),
        offset_type(0),
        offset_type(eval_ptr0_offset),
        offset_type(eval_ptr1_offset),
        np.int32(size),
        cl.LocalMemory(scratch)
    )
    # check for errors
    queue.finish()

    test_common.enqueue_read_buffer_ext(cl, queue, huge_buf_gpu, dst_host, device_offset=dst_offset_bytes, size=N * 4)
    # cl.enqueue_copy(queue, dst_host, huge_buf_gpu, device_offset=128, size=N * 4)
    queue.finish()
    print('dst_host[:N]', dst_host[:N])

    expected = np.sqrt(src_host)
    print('expected[:N]', expected[:N])
    assert np.abs(expected[:N] - dst_host[:N]).max() < 1e-4
Example #5
def test_cwise_sqrt_singlebuffer(context, queue, float_data, float_data_gpu):
    with open('test/tf/samples/cwise_op_gpu_sqrt-device-noopt.ll', 'r') as f:
        ll_code = f.read()

    cl_sourcecode = test_common.ll_to_cl(
        ll_code,
        '_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_',
        num_clmems=3)

    prog = cl.Program(context, cl_sourcecode).build()

    N = 10

    # kernel signature (generated): global struct Eigen__TensorEvaluator_nopointers* eval_nopointers, global float* eval_ptr0, long eval_ptr0_offset, global float* eval_ptr1, long eval_ptr1_offset, int size, local int *scratch

    # what we need:
    # struct Eigen__TensorEvaluator_nopointers   none of the values we copy across are actually used, so we can just create a sufficiently large buffer...
    # global float *eval_ptr0  => this will receive the result; just create a sufficiently large buffer
    # eval_ptr0_offset => 0
    # eval_ptr1 => will contain the input data
    # eval_ptr1_offset => 0
    # size => e.g. 10, to process 10 values
    # scratch => set to workgroup_size * sizeof(float)

    # by comparison with the earlier test, we create a single buffer containing both ptr0 and ptr1,
    # and just use offsets into it

    src_host = np.random.uniform(0, 1, size=(N,)).astype(np.float32) + 1.0
    dst_host = np.zeros(N, dtype=np.float32)

    src_offset_bytes = 128
    dst_offset_bytes = 256

    huge_buf_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)
    # huge_buf_gpu_spare = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)

    eval_nopointers_gpu = cl.Buffer(context, cl.mem_flags.READ_WRITE, size=4096)

    eval_ptr0_gpu = huge_buf_gpu
    eval_ptr0_offset = dst_offset_bytes

    eval_ptr1_gpu = huge_buf_gpu
    eval_ptr1_offset = src_offset_bytes

    size = N

    # copy our host memory across
    # cl.enqueue_copy(q, huge_buf_gpu_spare, src_host, device_offset=256, size=N * 4)
    test_common.enqueue_write_buffer_ext(cl, queue, huge_buf_gpu, src_host, device_offset=src_offset_bytes, size=N * 4)

    global_size = 256
    workgroup_size = 256
    scratch = workgroup_size * 4
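    # The launch below uses the same argument layout as in the previous example:
    # (buffer, base offset) pairs, then per-pointer virtual offsets, the element
    # count, and local scratch.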

    prog.__getattr__('_ZN5Eigen8internal15EigenMetaKernelINS_15TensorEvaluatorIKNS_14TensorAssignOpINS_9TensorMapINS_6TensorIfLi1ELi1EiEELi16ENS_11MakePointerEEEKNS_18TensorCwiseUnaryOpINS0_14scalar_sqrt_opIfEEKNS4_INS5_IKfLi1ELi1EiEELi16ES7_EEEEEENS_9GpuDeviceEEEiEEvT_T0_')(
        queue, (global_size,), (workgroup_size,),
        eval_nopointers_gpu, offset_type(0),
        eval_ptr0_gpu, offset_type(0),
        eval_ptr1_gpu, offset_type(0),
        offset_type(0),
        offset_type(eval_ptr0_offset),
        offset_type(eval_ptr1_offset),
        np.int32(size),
        cl.LocalMemory(scratch)
    )
    # check for errors
    queue.finish()

    test_common.enqueue_read_buffer_ext(cl, queue, huge_buf_gpu, dst_host, device_offset=dst_offset_bytes, size=N * 4)
    # cl.enqueue_copy(queue, dst_host, huge_buf_gpu, device_offset=128, size=N * 4)
    queue.finish()
    print('dst_host[:N]', dst_host[:N])

    expected = np.sqrt(src_host)
    print('expected[:N]', expected[:N])
    assert np.abs(expected[:N] - dst_host[:N]).max() < 1e-4