Beispiel #1
0
def bilinear_interpolation_inverse_array(O, I, ss, fs, invalid=-1):
    """Run ``bilinear_interpolation_inverse_array_cl`` and return the result.

    Every array is converted to a float32 copy before it is handed to the
    kernel, so the caller's arrays are left untouched. The kernel output is
    cast back to the dtype of ``O`` before it is returned.

    Relies on the module-level ``queue`` and on the compiled kernel
    ``bilinear_interpolation_inverse_array_cl``.

    Parameters
    ----------
    O : ndarray
        Array the kernel writes into; ``O.shape[0]`` and ``O.shape[1]`` are
        passed to the kernel.
    I : ndarray
        Values handed to the kernel (after the ``invalid`` substitution).
    ss, fs : ndarray
        Coordinate arrays; ``ss.shape[0]`` and ``ss.shape[1]`` are passed to
        the kernel (presumably slow-scan / fast-scan positions -- confirm).
    invalid : scalar or ndarray, optional
        If an ndarray, it is used as a mask: entries of ``I`` where
        ``~invalid`` is true are replaced by -1. Otherwise entries of ``I``
        equal to ``invalid`` are replaced by 0.

    Returns
    -------
    ndarray
        The kernel output, cast to ``O.dtype``.
    """
    # remember the caller's dtype so the result can be cast back
    out_dtype = O.dtype

    # inputs (astype returns copies, so the substitutions below are local)
    Iin = I.astype(np.float32)
    ssin = ss.astype(np.float32)
    fsin = fs.astype(np.float32)

    # isinstance (rather than an exact type check) so that ndarray
    # subclasses are also treated as masks
    if isinstance(invalid, np.ndarray):
        # NOTE(review): the mask branch writes -1 while the scalar branch
        # writes 0 -- confirm which sentinel the kernel expects.
        Iin[~invalid] = -1
    else:
        Iin[Iin == invalid] = 0

    # outputs
    Oin = O.astype(np.float32)

    # NOTE(review): launched with a (1, 1) global size; a launch over
    # ss.shape was disabled in the original code -- presumably to avoid
    # concurrent writes into O, confirm before parallelising.
    bilinear_interpolation_inverse_array_cl(queue, (1, 1), (1, 1), cl.SVM(Iin),
                                            cl.SVM(Oin), cl.SVM(ssin),
                                            cl.SVM(fsin), ss.shape[0],
                                            ss.shape[1], O.shape[0],
                                            O.shape[1])
    queue.finish()

    return Oin.astype(out_dtype)
Beispiel #2
0
def test_fine_grain_svm(ctx_factory):
    """Double a fine-grain SVM array on the device and verify it on the host.

    The test is skipped when the queue's device reports no fine-grain
    buffer SVM support.
    """
    import sys

    context = ctx_factory()
    cmd_queue = cl.CommandQueue(context)

    from pyopencl.characterize import has_fine_grain_buffer_svm
    from pytest import skip
    if not has_fine_grain_buffer_svm(cmd_queue.device):
        skip("device does not support fine-grain SVM")

    num_elements = 3000
    svm_array = cl.fsvm_empty(context, num_elements, np.float32, alignment=64)

    # The base-object check is skipped on PyPy, see
    # https://bitbucket.org/pypy/numpy/issues/52
    if '__pypy__' not in sys.builtin_module_names:
        assert isinstance(svm_array.base, cl.SVMAllocation)

    svm_array.fill(17)
    reference = svm_array.copy()

    kernel_source = """
        __kernel void twice(__global float *a_g)
        {
          a_g[get_global_id(0)] *= 2;
        }
        """
    program = cl.Program(context, kernel_source).build()

    # the array is passed straight to the kernel, wrapped in cl.SVM
    program.twice(cmd_queue, svm_array.shape, None, cl.SVM(svm_array))
    cmd_queue.finish()

    print(svm_array)
    assert np.array_equal(reference * 2, svm_array)
Beispiel #3
0
def bilinear_interpolation_array(array, ss, fs):
    """Run ``bilinear_interpolation_array_cl`` over ``ss.shape`` work items.

    ``array``, ``ss`` and ``fs`` are converted to float32 copies, the kernel
    fills a zero-initialised float32 buffer shaped like ``ss``, and that
    buffer is returned cast to ``array.dtype``. Uses the module-level
    ``queue``.
    """
    f32 = np.float32

    # kernel inputs
    source = array.astype(f32)
    ss_f32 = ss.astype(f32)
    fs_f32 = fs.astype(f32)

    # kernel output, one value per (ss, fs) coordinate
    result = np.zeros(ss.shape, dtype=f32)

    buffers = (cl.SVM(source), cl.SVM(result), cl.SVM(ss_f32), cl.SVM(fs_f32))
    extents = (ss.shape[0], ss.shape[1], source.shape[0], source.shape[1])

    bilinear_interpolation_array_cl(queue, ss.shape, (1, 1),
                                    *buffers, *extents)
    queue.finish()

    return result.astype(array.dtype)
Beispiel #4
0
def test_coarse_grain_svm(ctx_factory):
    """Exercise coarse-grain SVM: memfill, host mapping, a kernel and a copy.

    Skipped when OpenCL 2.0 is not available (or, on pocl, when pocl is older
    than 1.0); expected to fail on AMD CPU devices.
    """
    import sys
    running_on_pypy = '__pypy__' in sys.builtin_module_names

    context = ctx_factory()
    cmd_queue = cl.CommandQueue(context)

    device = context.devices[0]
    pocl_platform_name = "Portable Computing Language"
    cl_2_0 = (2, 0)

    svm_available = (context._get_cl_version() >= cl_2_0
                     and device._get_cl_version() >= cl_2_0
                     and cl.get_cl_header_version() >= cl_2_0)

    # pocl is judged by its own version number instead of the CL version
    if device.platform.name == pocl_platform_name:
        svm_available = (get_pocl_version(device.platform) >= (1, 0)
                         and cl.get_cl_header_version() >= cl_2_0)

    if not svm_available:
        from pytest import skip
        skip("SVM only available in OpenCL 2.0 and higher")

    if "AMD" in device.platform.name and device.type & cl.device_type.CPU:
        pytest.xfail("AMD CPU doesn't do coarse-grain SVM")

    num_elements = 3000
    svm = cl.SVM(
        cl.csvm_empty(context, (num_elements, ), np.float32, alignment=64))

    # The base-object check is skipped on PyPy, see
    # https://bitbucket.org/pypy/numpy/issues/52
    if not running_on_pypy:
        assert isinstance(svm.mem.base, cl.SVMAllocation)

    zero = np.zeros((), svm.mem.dtype)
    cl.enqueue_svm_memfill(cmd_queue, svm, zero)

    # write from the host through a read/write mapping
    with svm.map_rw(cmd_queue) as mapped:
        mapped.fill(17)
        reference = mapped.copy()

    kernel_source = """
        __kernel void twice(__global float *a_g)
        {
          a_g[get_global_id(0)] *= 2;
        }
        """
    program = cl.Program(context, kernel_source).build()

    program.twice(cmd_queue, svm.mem.shape, None, svm)

    # read the kernel's result back through a read-only mapping
    with svm.map_ro(cmd_queue) as mapped:
        print(mapped)
        assert np.array_equal(reference * 2, mapped)

    host_copy = np.empty_like(reference)
    host_copy.fill(-1)

    if device.platform.name != pocl_platform_name:
        # "Blocking memcpy is unimplemented (clEnqueueSVMMemcpy.c:61)"
        # in pocl up to and including 1.0rc1.

        cl.enqueue_copy(cmd_queue, host_copy, svm)
        assert np.array_equal(reference * 2, host_copy)
Beispiel #5
0
def test_fine_grain_svm(ctx_factory):
    """Double a fine-grain SVM array on the device and verify it on the host.

    Skipped when OpenCL 2.0 is unavailable or when the first device of the
    context does not advertise fine-grain buffer SVM.
    """
    import sys

    context = ctx_factory()
    cmd_queue = cl.CommandQueue(context)

    from pytest import skip
    cl_2_0 = (2, 0)
    if not (context._get_cl_version() >= cl_2_0
            and cl.get_cl_header_version() >= cl_2_0):
        skip("SVM only available in OpenCL 2.0 and higher")

    fine_grain_bit = (context.devices[0].svm_capabilities
                      & cl.device_svm_capabilities.FINE_GRAIN_BUFFER)
    if not fine_grain_bit:
        skip("device does not support fine-grain SVM")

    num_elements = 3000
    svm_array = cl.fsvm_empty(context, num_elements, np.float32, alignment=64)

    # The base-object check is skipped on PyPy, see
    # https://bitbucket.org/pypy/numpy/issues/52
    if '__pypy__' not in sys.builtin_module_names:
        assert isinstance(svm_array.base, cl.SVMAllocation)

    svm_array.fill(17)
    reference = svm_array.copy()

    kernel_source = """
        __kernel void twice(__global float *a_g)
        {
          a_g[get_global_id(0)] *= 2;
        }
        """
    program = cl.Program(context, kernel_source).build()

    # the array is passed straight to the kernel, wrapped in cl.SVM
    program.twice(cmd_queue, svm_array.shape, None, cl.SVM(svm_array))
    cmd_queue.finish()

    print(svm_array)
    assert np.array_equal(reference * 2, svm_array)
Beispiel #6
0
def test_coarse_grain_svm(ctx_factory):
    """Exercise coarse-grain SVM: memfill, host mapping, a kernel and a copy.

    Skipped when the device has no coarse-grain buffer SVM; expected to fail
    on AMD CPU and AMD GPU devices. The last section is a regression test
    for https://github.com/inducer/pyopencl/issues/372 (a kernel reading
    from an SVM allocation created with READ_ONLY flags).
    """
    import sys
    is_pypy = "__pypy__" in sys.builtin_module_names

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    _xfail_if_pocl_gpu(queue.device, "SVM")

    dev = ctx.devices[0]

    from pyopencl.characterize import has_coarse_grain_buffer_svm
    from pytest import skip
    if not has_coarse_grain_buffer_svm(queue.device):
        skip("device does not support coarse-grain SVM")

    if ("AMD" in dev.platform.name
            and dev.type & cl.device_type.CPU):
        pytest.xfail("AMD CPU doesn't do coarse-grain SVM")
    if ("AMD" in dev.platform.name
            and dev.type & cl.device_type.GPU):
        pytest.xfail("AMD GPU crashes on SVM unmap")

    n = 3000
    svm_ary = cl.SVM(cl.csvm_empty(ctx, (n,), np.float32, alignment=64))
    if not is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/52
        assert isinstance(svm_ary.mem.base, cl.SVMAllocation)

    cl.enqueue_svm_memfill(queue, svm_ary, np.zeros((), svm_ary.mem.dtype))

    # write from the host through a read/write mapping
    with svm_ary.map_rw(queue) as ary:
        ary.fill(17)
        orig_ary = ary.copy()

    prg = cl.Program(ctx, """
        __kernel void twice(__global float *a_g)
        {
          a_g[get_global_id(0)] *= 2;
        }
        """).build()

    prg.twice(queue, svm_ary.mem.shape, None, svm_ary)

    # read the kernel's result back through a read-only mapping
    with svm_ary.map_ro(queue) as ary:
        print(ary)
        assert np.array_equal(orig_ary*2, ary)

    new_ary = np.empty_like(orig_ary)
    new_ary.fill(-1)

    if dev.platform.name != "Portable Computing Language":
        # "Blocking memcpy is unimplemented (clEnqueueSVMMemcpy.c:61)"
        # in pocl up to and including 1.0rc1.

        cl.enqueue_copy(queue, new_ary, svm_ary)
        assert np.array_equal(orig_ary*2, new_ary)

    # {{{ https://github.com/inducer/pyopencl/issues/372

    fill_value = 17

    buf_arr = cl.svm_empty(ctx, cl.svm_mem_flags.READ_ONLY, 10, np.int32)
    out_arr = cl.svm_empty(ctx, cl.svm_mem_flags.READ_WRITE, 10, np.int32)

    svm_buf_arr = cl.SVM(buf_arr)
    svm_out_arr = cl.SVM(out_arr)
    with svm_buf_arr.map_rw(queue) as ary:
        ary.fill(fill_value)

    prg_ro = cl.Program(ctx, r"""
        __kernel void twice_ro(__global int *out_g, __global int *in_g)
        {
          out_g[get_global_id(0)] = 2*in_g[get_global_id(0)];
        }
        """).build()

    prg_ro.twice_ro(queue, buf_arr.shape, None, svm_out_arr, svm_buf_arr)

    with svm_out_arr.map_ro(queue) as ary:
        print(ary)
        # previously the output was only printed, so a wrong result from the
        # read-only input buffer would have gone unnoticed
        assert np.all(ary == 2*fill_value)

    # }}}
def make_pixel_map_err(data,
                       mask,
                       W,
                       O,
                       pixel_map,
                       n0,
                       m0,
                       dij_n,
                       roi,
                       search_window=20,
                       grid=(20, 20)):
    """Evaluate the ``make_error_map_subpixel`` kernel on a grid of pixels.

    A ``grid[0]`` x ``grid[1]`` set of pixel indices is spread evenly over
    ``roi`` and the OpenCL kernel ``make_error_map_subpixel`` (from
    ``update_pixel_map.cl`` next to this file) is run once for all of them on
    the first CPU device found.

    Parameters
    ----------
    data : ndarray, float32
        Must already be float32 (asserted); indexed as a 3-D array.
    mask, W, O, pixel_map, dij_n : ndarray
        Converted to int32 (``mask``) or float32 (the others) copies and
        passed to the kernel.
    n0, m0 : float
        Passed to the kernel as float32 scalars.
    roi : sequence of 4 numbers
        Grid points run from ``roi[0]`` to ``roi[1] - 1`` along the first
        axis and from ``roi[2]`` to ``roi[3] - 1`` along the second.
    search_window : int or pair of int, optional
        Size of the search window; an integer is used for both axes.
    grid : pair of int, optional
        Number of grid points along each axis.

    Returns
    -------
    ijs : ndarray, int32, shape (grid[0] * grid[1], 2)
        The grid pixel indices.
    err_map : ndarray, float32
        One row per grid pixel, one column per position in the search window.
    res
        Whatever ``make_pixel_map_err_report`` returns.

    Raises
    ------
    RuntimeError
        If no OpenCL platform offers a CPU device.
    """
    # demand that the data is float32 to avoid excess mem. usage
    assert (data.dtype == np.float32)

    import time
    t0 = time.time()

    import os
    import pyopencl as cl

    # use the first platform that exposes a CPU device
    device = None
    for p in cl.get_platforms():
        devices = p.get_devices(cl.device_type.CPU)
        if len(devices) > 0:
            device = devices[0]
            break
    if device is None:
        raise RuntimeError('no OpenCL platform with a CPU device was found')

    context = cl.Context([device])
    queue = cl.CommandQueue(context)

    # load and compile the update_pixel_map opencl code
    here = os.path.split(os.path.abspath(__file__))[0]
    with open(os.path.join(here, 'update_pixel_map.cl')) as f:
        kernelsource = f.read()
    program = cl.Program(context, kernelsource).build()
    make_error_map_subpixel = program.make_error_map_subpixel

    # 9 buffer arguments, 2 float scalars, 10 int scalars
    make_error_map_subpixel.set_scalar_arg_dtypes(
        9 * [None] + 2 * [np.float32] + 10 * [np.int32])

    # local scratch space: one float32 per entry along data's first axis
    localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * data.shape[0])

    # inputs:
    Win = W.astype(np.float32)
    Oin = O.astype(np.float32)
    dij_nin = dij_n.astype(np.float32)
    maskin = mask.astype(np.int32)
    pixel_mapout = pixel_map.astype(np.float32)

    # Resolve the window size *before* allocating err_map, so that a
    # (ss, fs) pair works as well as a single integer.
    if isinstance(search_window, (int, np.integer)):
        s_ss = s_fs = int(search_window)
    else:
        s_ss, s_fs = search_window

    # half-open index ranges of width s_ss / s_fs around zero
    ss_min, ss_max = (-(s_ss - 1) // 2, (s_ss + 1) // 2)
    fs_min, fs_max = (-(s_fs - 1) // 2, (s_fs + 1) // 2)

    # outputs: one row per grid pixel, one column per window position
    err_map = np.zeros((grid[0] * grid[1], s_ss * s_fs), dtype=np.float32)

    # list the pixels for which to calculate the error grid
    ijs = np.array([[round(i), round(j)]
                    for i in np.linspace(roi[0], roi[1] - 1, grid[0])
                    for j in np.linspace(roi[2], roi[3] - 1, grid[1])
                    ]).astype(np.int32)

    # single pass; trange is only used for the progress display
    for _ in tqdm.trange(1, desc='calculating pixel map shift errors'):
        make_error_map_subpixel(queue, (1, ijs.shape[0]), (1, 1), cl.SVM(Win),
                                cl.SVM(data), localmem, cl.SVM(err_map),
                                cl.SVM(Oin), cl.SVM(pixel_mapout),
                                cl.SVM(dij_nin), cl.SVM(maskin), cl.SVM(ijs),
                                n0, m0, ijs.shape[0], data.shape[0],
                                data.shape[1], data.shape[2], O.shape[0],
                                O.shape[1], ss_min, ss_max, fs_min, fs_max)

        queue.finish()

    # elapsed wall-clock time, including OpenCL setup and compilation
    t = time.time() - t0

    res = make_pixel_map_err_report(ijs, err_map, mask, search_window, roi, t)

    return ijs, err_map, res
def calc_errs(data, mask, W, O, pixel_map, n0, m0, dij_n, ss, fs):
    """Run the ``translations_err`` kernel and return one value per frame.

    The kernel (from ``update_pixel_map.cl`` next to this file) is launched
    on the first CPU device found, with one work item per entry along the
    first axis of ``data``.

    Parameters
    ----------
    data : ndarray, float32
        Must already be float32 (asserted); indexed as a 3-D array.
    mask, W, O, pixel_map, dij_n, ss, fs : ndarray
        Converted to int32 (``mask``) or float32 (the others) copies and
        passed to the kernel.
    n0, m0 : float
        Passed to the kernel as float32 scalars.

    Returns
    -------
    ndarray, float32, shape (data.shape[0],)
        The buffer written by the kernel.

    Raises
    ------
    RuntimeError
        If no OpenCL platform offers a CPU device.
    """
    # demand that the data is float32 to avoid excess mem. usage
    assert (data.dtype == np.float32)

    import os
    import pyopencl as cl

    # use the first platform that exposes a CPU device
    device = None
    for p in cl.get_platforms():
        devices = p.get_devices(cl.device_type.CPU)
        if len(devices) > 0:
            device = devices[0]
            break
    if device is None:
        raise RuntimeError('no OpenCL platform with a CPU device was found')

    context = cl.Context([device])
    queue = cl.CommandQueue(context)

    # load and compile the update_pixel_map opencl code
    here = os.path.split(os.path.abspath(__file__))[0]
    with open(os.path.join(here, 'update_pixel_map.cl')) as f:
        kernelsource = f.read()
    program = cl.Program(context, kernelsource).build()
    translations_err_cl = program.translations_err

    # 10 buffer arguments, 2 float scalars, 4 int scalars
    translations_err_cl.set_scalar_arg_dtypes(10 * [None] + 2 * [np.float32] +
                                              4 * [np.int32])

    # inputs:
    Win = W.astype(np.float32)
    pixel_mapin = pixel_map.astype(np.float32)
    Oin = O.astype(np.float32)
    dij_nin = dij_n.astype(np.float32)
    maskin = mask.astype(np.int32)
    ssin = ss.astype(np.float32)
    fsin = fs.astype(np.float32)
    # frame indices, one per work item
    ns = np.arange(data.shape[0]).astype(np.int32)

    # output: one value per frame
    out = np.zeros(data.shape[0]).astype(np.float32)

    translations_err_cl(queue, (ns.shape[0], 1), (1, 1), cl.SVM(Win),
                        cl.SVM(data), cl.SVM(Oin), cl.SVM(pixel_mapin),
                        cl.SVM(dij_nin),
                        cl.SVM(maskin), cl.SVM(ns), cl.SVM(out), cl.SVM(ssin),
                        cl.SVM(fsin), n0, m0, data.shape[1], data.shape[2],
                        O.shape[0], O.shape[1])
    queue.finish()

    return out
Beispiel #9
0
def quadratic_refinement_1d_opencl(data, mask, W, O, pixel_map, n0, m0, dij_n):
    """Refine ``pixel_map`` along one axis with a three-point parabola fit.

    The ``pixel_map_err`` kernel (from ``update_pixel_map.cl`` next to this
    file) is evaluated for shifts of -1, 0 and +1 along the only axis of
    ``data`` that is longer than one. A parabola ``a x^2 + b x + c`` is
    fitted to the three error values of every pixel and the pixel map is
    moved to the parabola's stationary point ``-b / 2a``.

    Parameters
    ----------
    data : ndarray, float32
        Must already be float32 (asserted). Exactly one of ``data.shape[1]``
        and ``data.shape[2]`` must equal 1.
    mask, W, O, pixel_map, dij_n : ndarray
        Converted to int32 (``mask``) or float32 (the others) copies and
        passed to the kernel. ``pixel_map`` is indexed as ``pixel_map[0]``
        and ``pixel_map[1]``.
    n0, m0 : float
        Passed to the kernel as float32 scalars.

    Returns
    -------
    out : ndarray
        Copy of ``pixel_map`` with the accepted shifts added.
    info : dict
        ``'pixel_shift'`` (the fitted shifts), ``'error'`` (sum over pixels
        of the smallest sampled error) and ``'err_quad'`` (the three sampled
        error maps).

    Raises
    ------
    ValueError
        If ``data`` does not have exactly one singleton detector axis.
    RuntimeError
        If no OpenCL platform offers a CPU device.
    """
    # demand that the data is float32 to avoid excess mem. usage
    assert(data.dtype == np.float32)

    # Only three error maps are stored, so the fit is only defined when the
    # shifts run along a single axis. Anything else used to fail late with a
    # shape mismatch in np.dot; reject it before doing any work.
    along_fs = data.shape[1] == 1
    if along_fs == (data.shape[2] == 1):
        raise ValueError(
            'quadratic_refinement_1d_opencl needs exactly one of '
            'data.shape[1] and data.shape[2] to equal 1, got shape '
            '{}'.format(data.shape))

    import os
    import pyopencl as cl

    # use the first platform that exposes a CPU device
    device = None
    for p in cl.get_platforms():
        devices = p.get_devices(cl.device_type.CPU)
        if len(devices) > 0:
            device = devices[0]
            break
    if device is None:
        raise RuntimeError('no OpenCL platform with a CPU device was found')

    context = cl.Context([device])
    queue = cl.CommandQueue(context)

    # load and compile the update_pixel_map opencl code
    here = os.path.split(os.path.abspath(__file__))[0]
    with open(os.path.join(here, 'update_pixel_map.cl')) as f:
        kernelsource = f.read()
    program = cl.Program(context, kernelsource).build()
    update_pixel_map_cl = program.pixel_map_err

    # 8 buffer arguments, 2 float scalars, 7 int scalars
    update_pixel_map_cl.set_scalar_arg_dtypes(
                        8*[None] + 2*[np.float32] + 7*[np.int32])

    # local scratch space: one float32 per entry along data's first axis
    localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * data.shape[0])

    # inputs:
    Win = W.astype(np.float32)
    pixel_mapin = pixel_map.astype(np.float32)
    Oin = O.astype(np.float32)
    dij_nin = dij_n.astype(np.float32)
    maskin = mask.astype(np.int32)

    # outputs:
    err_map = np.empty(W.shape, dtype=np.float32)
    pixel_shift = np.zeros(pixel_map.shape, dtype=np.float32)
    err_quad = np.empty((3,) + W.shape, dtype=np.float32)
    out = pixel_map.copy()

    # quadratic fit refinement: sample the error at shifts -1, 0, +1
    A = []
    for shift in (-1, 0, 1):
        ss_shift, fs_shift = (0, shift) if along_fs else (shift, 0)

        # 9999 marks pixels the kernel did not write
        err_map.fill(9999)
        update_pixel_map_cl(queue, W.shape, (1, 1),
                            cl.SVM(Win),
                            cl.SVM(data),
                            localmem,
                            cl.SVM(err_map),
                            cl.SVM(Oin),
                            cl.SVM(pixel_mapin),
                            cl.SVM(dij_nin),
                            cl.SVM(maskin),
                            n0, m0,
                            data.shape[0], data.shape[1], data.shape[2],
                            O.shape[0], O.shape[1], ss_shift, fs_shift)
        queue.finish()

        err_quad[shift + 1, :, :] = err_map
        A.append([shift**2, shift, 1])

    # now we have 3 equations and 3 unknowns
    # a x^2 + b x + c = err_i
    B = np.linalg.pinv(A)
    C = np.dot(B, np.transpose(err_quad, (1, 0, 2)))

    # minima is defined by
    # 2 a x + b = 0
    # x = -b / 2a
    # where C = [a, b, c]
    #           [0, 1, 2]
    det = 2*C[0]

    # make sure all sampled shifts have a valid error
    m = np.all(err_quad != 9999, axis=0)
    # make sure the denominator is non zero
    m = m & (det != 0)

    # row 1 of pixel_shift receives shifts sampled via fs_shift, row 0 those
    # sampled via ss_shift
    axis = 1 if along_fs else 0
    pixel_shift[axis][m] = (-C[1])[m] / det[m]

    # now only update pixels for which x**2 < 3**2
    m = m & (np.sum(pixel_shift**2, axis=0) < 9)

    out[0][m] = out[0][m] + pixel_shift[0][m]
    out[1][m] = out[1][m] + pixel_shift[1][m]

    error = np.sum(np.min(err_quad, axis=0))
    return out, {'pixel_shift': pixel_shift, 'error': error, 'err_quad': err_quad}
Beispiel #10
0
def update_pixel_map_opencl(data, mask, W, O, pixel_map, n0, m0, dij_n, subpixel, subsample, search_window, ss, fs):
    """Update ``pixel_map`` at the pixels ``(ss, fs)`` with an OpenCL search.

    Runs ``update_pixel_map_subpixel`` (if ``subpixel`` is true) or
    ``update_pixel_map`` from ``update_pixel_map.cl`` next to this file on
    the first CPU device found. The kernel is first run once over all pixels
    with a window of ``(0, 1, 0, 1)``, then over the requested search window
    in batches of at most 100 pixels with a progress bar.

    Parameters
    ----------
    data : ndarray, float32
        Must already be float32 (asserted); indexed as a 3-D array.
    mask, W, O, pixel_map, dij_n : ndarray
        Converted to int32 (``mask``) or float32 (the others) copies and
        passed to the kernel.
    n0, m0, subsample : float
        Passed to the kernel as float32 scalars.
    subpixel : bool
        Selects which kernel is used.
    search_window : int or pair of int
        Size of the search window; an integer is used for both axes.
    ss, fs : ndarray
        Pixel indices; flattened and converted to int32.

    Returns
    -------
    out : ndarray, shape (2, ss.size)
        The updated pixel map values at ``(ss, fs)``, in ``pixel_map``'s
        dtype.
    info : dict
        ``'error_map'`` (the kernel's error buffer) and ``'error'`` (its
        sum).

    Raises
    ------
    RuntimeError
        If no OpenCL platform offers a CPU device.
    """
    # demand that the data is float32 to avoid excess mem. usage
    assert(data.dtype == np.float32)

    import os
    import pyopencl as cl

    # use the first platform that exposes a CPU device
    device = None
    for p in cl.get_platforms():
        devices = p.get_devices(cl.device_type.CPU)
        if len(devices) > 0:
            device = devices[0]
            break
    if device is None:
        raise RuntimeError('no OpenCL platform with a CPU device was found')

    context = cl.Context([device])
    queue = cl.CommandQueue(context)

    # load and compile the update_pixel_map opencl code
    here = os.path.split(os.path.abspath(__file__))[0]
    with open(os.path.join(here, 'update_pixel_map.cl')) as f:
        kernelsource = f.read()
    program = cl.Program(context, kernelsource).build()

    if subpixel:
        update_pixel_map_cl = program.update_pixel_map_subpixel
    else:
        update_pixel_map_cl = program.update_pixel_map

    # 10 buffer arguments, 3 float scalars, 9 int scalars
    update_pixel_map_cl.set_scalar_arg_dtypes(
            10*[None] + 3*[np.float32] + 9*[np.int32])

    # local scratch space: one float32 per entry along data's first axis
    localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * data.shape[0])

    # inputs:
    Win = W.astype(np.float32)
    Oin = O.astype(np.float32)
    dij_nin = dij_n.astype(np.float32)
    maskin = mask.astype(np.int32)
    ss = ss.ravel().astype(np.int32)
    fs = fs.ravel().astype(np.int32)

    # accept a single integer as a square window
    if isinstance(search_window, (int, np.integer)):
        search_window = (int(search_window), int(search_window))

    # half-open index ranges around zero, as wide as the search window
    ss_min, ss_max = (-(search_window[0]-1)//2, (search_window[0]+1)//2)
    fs_min, fs_max = (-(search_window[1]-1)//2, (search_window[1]+1)//2)

    print(ss_min, ss_max)
    print(fs_min, fs_max)

    # outputs:
    err_map = np.zeros(W.shape, dtype=np.float32)
    pixel_mapout = pixel_map.astype(np.float32)

    # first pass over every pixel with the (0, 1, 0, 1) window
    update_pixel_map_cl(queue, (1, fs.shape[0]), (1, 1), cl.SVM(Win),
                        cl.SVM(data), localmem, cl.SVM(err_map), cl.SVM(Oin),
                        cl.SVM(pixel_mapout), cl.SVM(dij_nin), cl.SVM(maskin),
                        cl.SVM(ss), cl.SVM(fs), n0, m0, subsample,
                        data.shape[0], data.shape[1], data.shape[2],
                        O.shape[0], O.shape[1], 0, 1, 0, 1)
    queue.finish()

    # start the real search from the unmodified pixel map again; err_map
    # keeps whatever the first pass wrote
    pixel_mapout = pixel_map.astype(np.float32)

    # process the pixels in batches so the progress bar can be updated
    step = min(100, ss.shape[0])
    it = tqdm.tqdm(range(0, ss.shape[0], step), desc='updating pixel map')
    for i in it:
        ssi = ss[i:i+step]
        fsi = fs[i:i+step]
        update_pixel_map_cl(queue, (1, fsi.shape[0]), (1, 1),
                            cl.SVM(Win),
                            cl.SVM(data),
                            localmem,
                            cl.SVM(err_map),
                            cl.SVM(Oin),
                            cl.SVM(pixel_mapout),
                            cl.SVM(dij_nin),
                            cl.SVM(maskin),
                            cl.SVM(ssi),
                            cl.SVM(fsi),
                            n0, m0, subsample,
                            data.shape[0], data.shape[1], data.shape[2],
                            O.shape[0], O.shape[1],
                            ss_min, ss_max, fs_min, fs_max)
        queue.finish()

        # show the mean of the positive error entries on the progress bar
        er = np.mean(err_map[err_map > 0])
        it.set_description("updating pixel map: {:.2e}".format(er))

    # only return filled values
    out = np.zeros((2,) + ss.shape, dtype=pixel_map.dtype)
    out[0] = pixel_mapout[0][ss, fs]
    out[1] = pixel_mapout[1][ss, fs]
    return out, {'error_map': err_map, 'error': np.sum(err_map)}
Beispiel #11
0
def quadratic_refinement_split_opencl(data, mask, W, O, pixel_map, n0, m0,
                                      dij_n):
    """Refine ``pixel_map`` with a 2-D quadratic fit over a 3 x 3 stencil.

    The ``pixel_map_err_split`` kernel (from ``update_pixel_map.cl`` next to
    this file) is evaluated on the first CPU device found for the nine
    combinations of ss/fs shifts in {-1, 0, 1}. A quadratic surface is fitted
    to the nine error values of every pixel and the pixel map is moved to
    the surface's stationary point.

    Parameters
    ----------
    data : ndarray, float32
        Must already be float32 (asserted); indexed as a 3-D array.
    mask, W, O, pixel_map, dij_n : ndarray
        Converted to int32 (``mask``) or float32 (the others) copies and
        passed to the kernel. ``pixel_map`` is indexed as ``pixel_map[0]``
        and ``pixel_map[1]``.
    n0, m0 : float
        Passed to the kernel as float32 scalars.

    Returns
    -------
    out : ndarray
        Copy of ``pixel_map`` with the accepted shifts added.
    info : dict
        ``'pixel_shift'`` (the fitted shifts), ``'error'`` (sum over pixels
        of the smallest sampled error) and ``'err_quad'`` (the nine sampled
        error maps).

    Raises
    ------
    RuntimeError
        If no OpenCL platform offers a CPU device.
    """
    # demand that the data is float32 to avoid excess mem. usage
    assert (data.dtype == np.float32)

    import os
    import pyopencl as cl

    # use the first platform that exposes a CPU device
    device = None
    for p in cl.get_platforms():
        devices = p.get_devices(cl.device_type.CPU)
        if len(devices) > 0:
            device = devices[0]
            break
    if device is None:
        raise RuntimeError('no OpenCL platform with a CPU device was found')

    context = cl.Context([device])
    queue = cl.CommandQueue(context)

    # load and compile the update_pixel_map opencl code
    here = os.path.split(os.path.abspath(__file__))[0]
    with open(os.path.join(here, 'update_pixel_map.cl')) as f:
        kernelsource = f.read()
    program = cl.Program(context, kernelsource).build()
    update_pixel_map_cl = program.pixel_map_err_split

    # 9 buffer arguments, 2 float scalars, 7 int scalars
    update_pixel_map_cl.set_scalar_arg_dtypes(9 * [None] + 2 * [np.float32] +
                                              7 * [np.int32])

    # report the device limits for this kernel
    max_comp = device.max_compute_units
    max_size = update_pixel_map_cl.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, device)
    print('maximum workgroup size:', max_size)
    print('maximum compute units :', max_comp)

    # local scratch space: one float32 / int32 per entry along the first
    # axis of data / mask
    localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * data.shape[0])
    localmem_mask = cl.LocalMemory(np.dtype(np.int32).itemsize * mask.shape[0])

    # inputs:
    Win = W.astype(np.float32)
    pixel_mapin = pixel_map.astype(np.float32)
    Oin = O.astype(np.float32)
    dij_nin = dij_n.astype(np.float32)
    maskin = mask.astype(np.int32)

    # outputs:
    err_map = np.empty(W.shape, dtype=np.float32)
    pixel_shift = np.zeros(pixel_map.shape, dtype=np.float32)
    err_quad = np.empty((9, ) + W.shape, dtype=np.float32)
    out = pixel_map.copy()

    import time
    d0 = time.time()

    # quadratic fit refinement: sample the error on the 3 x 3 shift stencil
    A = []
    print('\nquadratic refinement:')
    print('---------------------')
    for ss_shift in [-1, 0, 1]:
        for fs_shift in [-1, 0, 1]:
            # one row of the design matrix per sampled shift
            A.append([
                ss_shift**2, fs_shift**2, ss_shift, fs_shift,
                ss_shift * fs_shift, 1
            ])
            print(ss_shift, fs_shift)
            update_pixel_map_cl(queue, W.shape, (1, 1), cl.SVM(Win),
                                cl.SVM(data), localmem, cl.SVM(err_map),
                                cl.SVM(Oin), cl.SVM(pixel_mapin),
                                cl.SVM(dij_nin), cl.SVM(maskin), localmem_mask,
                                n0, m0, data.shape[0], data.shape[1],
                                data.shape[2], O.shape[0], O.shape[1],
                                ss_shift, fs_shift)
            queue.finish()

            # same ordering as the rows of A
            err_quad[3 * (ss_shift + 1) + fs_shift + 1, :, :] = err_map

    # now we have 9 equations and 6 unknowns
    # c_20 x^2 + c_02 y^2 + c_10 x + c_01 y + c_11 x y + c_00 = err_i
    B = np.linalg.pinv(A)
    C = np.dot(B, np.transpose(err_quad, (1, 0, 2)))

    # minima is defined by
    # 2 c_20 x +   c_11 y = -c_10
    #   c_11 x + 2 c_02 y = -c_01
    # where C = [c_20, c_02, c_10, c_01, c_11, c_00]
    #           [   0,    1,    2,    3,    4,    5]
    # [x y] = [[2c_02 -c_11], [-c_11, 2c_20]] . [-c_10 -c_01] / (2c_20 * 2c_02 - c_11**2)
    # x     = (-2c_02 c_10 + c_11   c_01) / det
    # y     = (  c_11 c_10 - 2 c_20 c_01) / det
    det = 2 * C[0] * 2 * C[1] - C[4]**2

    # make sure all sampled shifts have a valid error
    m = np.all(err_quad < np.finfo(np.float32).max, axis=0)
    # make sure the determinant is non zero
    m = m & (det != 0)
    pixel_shift[0][m] = (-2 * C[1] * C[2] + C[4] * C[3])[m] / det[m]
    pixel_shift[1][m] = (C[4] * C[2] - 2 * C[0] * C[3])[m] / det[m]

    # now only update pixels for which (x**2 + y**2) < 3**2
    m = m & (np.sum(pixel_shift**2, axis=0) < 9)

    out[0][m] = out[0][m] + pixel_shift[0][m]
    out[1][m] = out[1][m] + pixel_shift[1][m]

    print('calculation took:', time.time() - d0, 's')

    error = np.sum(np.min(err_quad, axis=0))
    return out, {
        'pixel_shift': pixel_shift,
        'error': error,
        'err_quad': err_quad
    }
Beispiel #12
0
    )

# Build a program with a single kernel, `twice`, that doubles every element
# of a float buffer in place (one work item per element).
prg = cl.Program(ctx, """
__kernel void twice(
    __global float *a_g)
{
  int gid = get_global_id(0);
  a_g[gid] = 2*a_g[gid];
}
""").build()


# Coarse-grain SVM demo: only runs when the device reports support for it.
if has_coarse_grain_buffer_svm(dev):
    print("Testing coarse-grained buffer SVM...", end="")

    # 10 float32 elements, wrapped in cl.SVM so they can be mapped and
    # passed to a kernel
    svm_ary = cl.SVM(cl.csvm_empty(ctx, 10, np.float32))
    assert isinstance(svm_ary.mem, np.ndarray)

    # host access goes through a mapping; keep a host-side copy of the
    # initial contents for the check below
    with svm_ary.map_rw(queue) as ary:
        ary.fill(17)  # use from host
        orig_ary = ary.copy()

    # double every element on the device and wait for completion
    prg.twice(queue, svm_ary.mem.shape, None, svm_ary)
    queue.finish()

    # map read-only to verify the device result from the host
    with svm_ary.map_ro(queue) as ary:
        assert(np.array_equal(orig_ary*2, ary))

    print(" done.")

if has_fine_grain_buffer_svm(dev):