def bilinear_interpolation_inverse_array(O, I, ss, fs, invalid=-1):
    """Run the inverse bilinear-interpolation OpenCL kernel and return O's update.

    Float32 copies of ``I``, ``O``, ``ss`` and ``fs`` are handed to
    ``bilinear_interpolation_inverse_array_cl`` through SVM; the caller's
    arrays are not modified.

    Parameters
    ----------
    O : ndarray
        Array the kernel writes into (via a float32 copy). Its first two
        dimensions are passed to the kernel.
    I : ndarray
        Values passed to the kernel as its first buffer.
    ss, fs : ndarray
        Coordinate arrays; ``ss.shape[0]`` and ``ss.shape[1]`` are passed
        to the kernel, so ``ss`` must be at least 2-D.
    invalid : scalar or ndarray, optional
        If an ``ndarray``, entries of ``I`` selected by ``~invalid`` are set
        to -1 before the kernel runs. Otherwise entries of ``I`` equal to
        ``invalid`` are set to 0.

    Returns
    -------
    ndarray
        The kernel output cast back to ``O.dtype``.
    """
    # remember the caller's dtype so the result can be cast back at the end
    result_dtype = O.dtype

    # float32 working copies for the kernel
    values = I.astype(np.float32)
    ss_f32 = ss.astype(np.float32)
    fs_f32 = fs.astype(np.float32)

    if type(invalid) is not np.ndarray:
        values[values == invalid] = 0
    else:
        values[~invalid] = -1

    accum = O.astype(np.float32)

    # NOTE: the kernel is launched with a (1, 1) global size; the original
    # source kept a launch over ss.shape commented out at this point.
    global_size = (1, 1)
    local_size = (1, 1)
    bilinear_interpolation_inverse_array_cl(
        queue, global_size, local_size,
        cl.SVM(values),
        cl.SVM(accum),
        cl.SVM(ss_f32),
        cl.SVM(fs_f32),
        ss.shape[0], ss.shape[1],
        O.shape[0], O.shape[1])
    queue.finish()

    return accum.astype(result_dtype)
def test_fine_grain_svm(ctx_factory):
    """Double a fine-grain SVM array with a kernel and verify the result.

    The test is skipped when ``has_fine_grain_buffer_svm`` reports that the
    queue's device lacks fine-grain buffer SVM.
    """
    import sys
    running_on_pypy = '__pypy__' in sys.builtin_module_names

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    from pyopencl.characterize import has_fine_grain_buffer_svm
    from pytest import skip

    fine_grain_available = has_fine_grain_buffer_svm(queue.device)
    if not fine_grain_available:
        skip("device does not support fine-grain SVM")

    n_elements = 3000
    ary = cl.fsvm_empty(ctx, n_elements, np.float32, alignment=64)

    if not running_on_pypy:
        # https://bitbucket.org/pypy/numpy/issues/52
        assert isinstance(ary.base, cl.SVMAllocation)

    # written directly from the host, then snapshotted for the comparison
    ary.fill(17)
    orig_ary = ary.copy()

    kernel_src = """ __kernel void twice(__global float *a_g) { a_g[get_global_id(0)] *= 2; } """
    prg = cl.Program(ctx, kernel_src).build()

    prg.twice(queue, ary.shape, None, cl.SVM(ary))
    queue.finish()

    print(ary)
    assert np.array_equal(orig_ary * 2, ary)
def bilinear_interpolation_array(array, ss, fs):
    """Run the bilinear-interpolation OpenCL kernel over the ``ss``/``fs`` grid.

    Float32 copies of the inputs are passed to
    ``bilinear_interpolation_array_cl`` through SVM, with one work item per
    element of ``ss``.

    Parameters
    ----------
    array : ndarray
        Source array; its first two dimensions are passed to the kernel.
    ss, fs : ndarray
        Coordinate arrays. ``ss.shape`` is the launch size and the shape of
        the output, and ``ss.shape[0]`` / ``ss.shape[1]`` are passed to the
        kernel.

    Returns
    -------
    ndarray
        Kernel output with the shape of ``ss``, cast to ``array.dtype``.
    """
    # float32 working copies for the kernel
    src_f32 = array.astype(np.float32)
    ss_f32 = ss.astype(np.float32)
    fs_f32 = fs.astype(np.float32)

    # zero-initialised buffer the kernel writes into
    result = np.zeros(ss.shape, dtype=np.float32)

    local_size = (1, 1)
    bilinear_interpolation_array_cl(
        queue, ss.shape, local_size,
        cl.SVM(src_f32),
        cl.SVM(result),
        cl.SVM(ss_f32),
        cl.SVM(fs_f32),
        ss.shape[0], ss.shape[1],
        src_f32.shape[0], src_f32.shape[1])
    queue.finish()

    return result.astype(array.dtype)
def test_coarse_grain_svm(ctx_factory):
    """Exercise coarse-grain SVM: memfill, map, kernel launch and copy-back.

    Skipped when the context, device or headers report an OpenCL version
    below 2.0 (for "Portable Computing Language" platforms the pocl version
    is checked instead). Marked xfail on AMD CPU devices.
    """
    import sys
    running_on_pypy = '__pypy__' in sys.builtin_module_names

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    pocl_name = "Portable Computing Language"

    has_svm = (
        ctx._get_cl_version() >= (2, 0)
        and dev._get_cl_version() >= (2, 0)
        and cl.get_cl_header_version() >= (2, 0))

    if dev.platform.name == pocl_name:
        has_svm = (
            get_pocl_version(dev.platform) >= (1, 0)
            and cl.get_cl_header_version() >= (2, 0))

    if not has_svm:
        from pytest import skip
        skip("SVM only available in OpenCL 2.0 and higher")

    is_amd = "AMD" in dev.platform.name
    if is_amd and dev.type & cl.device_type.CPU:
        pytest.xfail("AMD CPU doesn't do coarse-grain SVM")

    n_elements = 3000
    backing = cl.csvm_empty(ctx, (n_elements, ), np.float32, alignment=64)
    svm_ary = cl.SVM(backing)

    if not running_on_pypy:
        # https://bitbucket.org/pypy/numpy/issues/52
        assert isinstance(svm_ary.mem.base, cl.SVMAllocation)

    zero = np.zeros((), svm_ary.mem.dtype)
    cl.enqueue_svm_memfill(queue, svm_ary, zero)

    # coarse-grain SVM has to be mapped before the host touches it
    with svm_ary.map_rw(queue) as ary:
        ary.fill(17)
        orig_ary = ary.copy()

    kernel_src = """ __kernel void twice(__global float *a_g) { a_g[get_global_id(0)] *= 2; } """
    prg = cl.Program(ctx, kernel_src).build()

    prg.twice(queue, svm_ary.mem.shape, None, svm_ary)

    with svm_ary.map_ro(queue) as ary:
        print(ary)
        assert np.array_equal(orig_ary * 2, ary)

    new_ary = np.empty_like(orig_ary)
    new_ary.fill(-1)

    if dev.platform.name != pocl_name:
        # "Blocking memcpy is unimplemented (clEnqueueSVMMemcpy.c:61)"
        # in pocl up to and including 1.0rc1.
        cl.enqueue_copy(queue, new_ary, svm_ary)
        assert np.array_equal(orig_ary * 2, new_ary)
def test_fine_grain_svm(ctx_factory):
    """Double a fine-grain SVM array with a kernel and verify the result.

    Skipped when the context or the headers report an OpenCL version below
    2.0, or when the first device's ``svm_capabilities`` lack the
    ``FINE_GRAIN_BUFFER`` bit.
    """
    import sys
    running_on_pypy = '__pypy__' in sys.builtin_module_names

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    from pytest import skip

    version_ok = (
        ctx._get_cl_version() >= (2, 0)
        and cl.get_cl_header_version() >= (2, 0))
    if not version_ok:
        skip("SVM only available in OpenCL 2.0 and higher")

    fine_grain_bit = cl.device_svm_capabilities.FINE_GRAIN_BUFFER
    if not (ctx.devices[0].svm_capabilities & fine_grain_bit):
        skip("device does not support fine-grain SVM")

    n_elements = 3000
    ary = cl.fsvm_empty(ctx, n_elements, np.float32, alignment=64)

    if not running_on_pypy:
        # https://bitbucket.org/pypy/numpy/issues/52
        assert isinstance(ary.base, cl.SVMAllocation)

    # written directly from the host, then snapshotted for the comparison
    ary.fill(17)
    orig_ary = ary.copy()

    kernel_src = """ __kernel void twice(__global float *a_g) { a_g[get_global_id(0)] *= 2; } """
    prg = cl.Program(ctx, kernel_src).build()

    prg.twice(queue, ary.shape, None, cl.SVM(ary))
    queue.finish()

    print(ary)
    assert np.array_equal(orig_ary * 2, ary)
def test_coarse_grain_svm(ctx_factory):
    """Exercise coarse-grain SVM: memfill, map, kernel launch and copy-back.

    Skipped when ``has_coarse_grain_buffer_svm`` reports no support on the
    queue's device; marked xfail on AMD CPU and AMD GPU devices. The final
    section runs a kernel that reads from an SVM allocation created with
    ``READ_ONLY`` flags (pyopencl issue 372).
    """
    import sys
    running_on_pypy = "__pypy__" in sys.builtin_module_names

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    _xfail_if_pocl_gpu(queue.device, "SVM")

    dev = ctx.devices[0]

    from pyopencl.characterize import has_coarse_grain_buffer_svm
    from pytest import skip

    coarse_grain_available = has_coarse_grain_buffer_svm(queue.device)
    if not coarse_grain_available:
        skip("device does not support coarse-grain SVM")

    if "AMD" in dev.platform.name:
        if dev.type & cl.device_type.CPU:
            pytest.xfail("AMD CPU doesn't do coarse-grain SVM")
        if dev.type & cl.device_type.GPU:
            pytest.xfail("AMD GPU crashes on SVM unmap")

    n_elements = 3000
    backing = cl.csvm_empty(ctx, (n_elements,), np.float32, alignment=64)
    svm_ary = cl.SVM(backing)

    if not running_on_pypy:
        # https://bitbucket.org/pypy/numpy/issues/52
        assert isinstance(svm_ary.mem.base, cl.SVMAllocation)

    zero = np.zeros((), svm_ary.mem.dtype)
    cl.enqueue_svm_memfill(queue, svm_ary, zero)

    # coarse-grain SVM has to be mapped before the host touches it
    with svm_ary.map_rw(queue) as ary:
        ary.fill(17)
        orig_ary = ary.copy()

    kernel_src = """ __kernel void twice(__global float *a_g) { a_g[get_global_id(0)] *= 2; } """
    prg = cl.Program(ctx, kernel_src).build()

    prg.twice(queue, svm_ary.mem.shape, None, svm_ary)

    with svm_ary.map_ro(queue) as ary:
        print(ary)
        assert np.array_equal(orig_ary*2, ary)

    new_ary = np.empty_like(orig_ary)
    new_ary.fill(-1)

    if dev.platform.name != "Portable Computing Language":
        # "Blocking memcpy is unimplemented (clEnqueueSVMMemcpy.c:61)"
        # in pocl up to and including 1.0rc1.
        cl.enqueue_copy(queue, new_ary, svm_ary)
        assert np.array_equal(orig_ary*2, new_ary)

    # {{{ https://github.com/inducer/pyopencl/issues/372

    buf_arr = cl.svm_empty(ctx, cl.svm_mem_flags.READ_ONLY, 10, np.int32)
    out_arr = cl.svm_empty(ctx, cl.svm_mem_flags.READ_WRITE, 10, np.int32)
    svm_buf_arr = cl.SVM(buf_arr)
    svm_out_arr = cl.SVM(out_arr)

    with svm_buf_arr.map_rw(queue) as ary:
        ary.fill(17)

    ro_kernel_src = r""" __kernel void twice_ro(__global int *out_g, __global int *in_g) { out_g[get_global_id(0)] = 2*in_g[get_global_id(0)]; } """
    prg_ro = cl.Program(ctx, ro_kernel_src).build()

    prg_ro.twice_ro(queue, buf_arr.shape, None, svm_out_arr, svm_buf_arr)

    with svm_out_arr.map_ro(queue) as ary:
        print(ary)
def make_pixel_map_err(data, mask, W, O, pixel_map, n0, m0, dij_n, roi,
                       search_window=20, grid=(20, 20)):
    """Evaluate the ``make_error_map_subpixel`` kernel on a grid of pixels.

    The kernel source is read from ``update_pixel_map.cl`` next to this
    file, built for the first OpenCL CPU device found, and launched once
    with one work item per grid pixel.

    Parameters
    ----------
    data : ndarray of float32
        Must already be float32 (asserted) so that no extra copy is made.
        ``data.shape[0:3]`` are passed to the kernel.
    mask, W, O, pixel_map, dij_n : ndarray
        Converted to int32 (``mask``) or float32 (the rest) and passed to
        the kernel as buffers. Their meaning is defined by the kernel
        source, which is not visible from here.
    n0, m0 : float
        Passed to the kernel as float32 scalars.
    roi : sequence of 4 numbers
        Grid pixels are spread over ``roi[0] .. roi[1] - 1`` along the first
        axis and ``roi[2] .. roi[3] - 1`` along the second.
    search_window : int or pair of int, optional
        Search window size along (ss, fs). A single integer is used for
        both axes.
    grid : pair of int, optional
        Number of grid points along each axis.

    Returns
    -------
    ijs : ndarray of int32, shape (grid[0] * grid[1], 2)
        The pixel coordinates the error was evaluated at.
    err_map : ndarray of float32, shape (grid[0] * grid[1], s_ss * s_fs)
        Buffer written by the kernel, one row per grid pixel.
    res
        Whatever ``make_pixel_map_err_report`` returns.

    Raises
    ------
    RuntimeError
        If no OpenCL platform offers a CPU device.
    """
    # demand that the data is float32 to avoid excess mem. usage
    assert (data.dtype == np.float32)

    import os
    import time

    import pyopencl as cl

    t0 = time.time()

    # Resolve the window first: the error-map width depends on both axes,
    # and sizing it with ``search_window**2`` would fail for a pair.
    if isinstance(search_window, (int, np.integer)):
        s_ss = s_fs = int(search_window)
    else:
        s_ss, s_fs = search_window

    ss_min, ss_max = (-(s_ss - 1) // 2, (s_ss + 1) // 2)
    fs_min, fs_max = (-(s_fs - 1) // 2, (s_fs + 1) // 2)

    # pick the first CPU device on any platform
    device = None
    for p in cl.get_platforms():
        devices = p.get_devices(cl.device_type.CPU)
        if len(devices) > 0:
            device = devices[0]
            break
    if device is None:
        raise RuntimeError('no OpenCL CPU device found')

    context = cl.Context([device])
    queue = cl.CommandQueue(context)

    # load and compile the update_pixel_map opencl code
    here = os.path.split(os.path.abspath(__file__))[0]
    with open(os.path.join(here, 'update_pixel_map.cl')) as f:
        kernelsource = f.read()
    program = cl.Program(context, kernelsource).build()

    make_error_map_subpixel = program.make_error_map_subpixel
    # 9 buffer/local arguments, 2 float scalars, 10 int scalars
    make_error_map_subpixel.set_scalar_arg_dtypes(
        9 * [None] + 2 * [np.float32] + 10 * [np.int32])

    # local scratch: one float32 per entry along data's first axis
    localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * data.shape[0])

    # inputs:
    Win = W.astype(np.float32)
    Oin = O.astype(np.float32)
    dij_nin = dij_n.astype(np.float32)
    maskin = mask.astype(np.int32)

    # outputs:
    err_map = np.zeros((grid[0] * grid[1], s_ss * s_fs), dtype=np.float32)
    pixel_mapout = pixel_map.astype(np.float32)

    # list the pixels for which to calculate the error grid
    ijs = []
    for i in np.linspace(roi[0], roi[1] - 1, grid[0]):
        for j in np.linspace(roi[2], roi[3] - 1, grid[1]):
            ijs.append([round(i), round(j)])
    ijs = np.array(ijs).astype(np.int32)

    # single-pass loop kept only for the progress display
    for _ in tqdm.trange(1, desc='calculating pixel map shift errors'):
        make_error_map_subpixel(
            queue, (1, ijs.shape[0]), (1, 1),
            cl.SVM(Win),
            cl.SVM(data),
            localmem,
            cl.SVM(err_map),
            cl.SVM(Oin),
            cl.SVM(pixel_mapout),
            cl.SVM(dij_nin),
            cl.SVM(maskin),
            cl.SVM(ijs),
            n0, m0,
            ijs.shape[0],
            data.shape[0], data.shape[1], data.shape[2],
            O.shape[0], O.shape[1],
            ss_min, ss_max, fs_min, fs_max)
        queue.finish()

    t = time.time() - t0
    res = make_pixel_map_err_report(ijs, err_map, mask, search_window, roi, t)
    return ijs, err_map, res
def calc_errs(data, mask, W, O, pixel_map, n0, m0, dij_n, ss, fs):
    """Run the ``translations_err`` kernel with one work item per frame.

    The kernel source is read from ``update_pixel_map.cl`` next to this
    file and built for the first OpenCL CPU device found.

    Parameters
    ----------
    data : ndarray of float32
        Must already be float32 (asserted) so that no extra copy is made.
        ``data.shape[0]`` sets the number of work items and the length of
        the result; ``data.shape[1]`` and ``data.shape[2]`` are passed to
        the kernel.
    mask, W, O, pixel_map, dij_n, ss, fs : ndarray
        Converted to int32 (``mask``) or float32 (the rest) and passed to
        the kernel as buffers. Their meaning is defined by the kernel
        source, which is not visible from here.
    n0, m0 : float
        Passed to the kernel as float32 scalars.

    Returns
    -------
    ndarray of float32, shape (data.shape[0],)
        Output buffer written by the kernel (zero-initialised beforehand).

    Raises
    ------
    RuntimeError
        If no OpenCL platform offers a CPU device.
    """
    # demand that the data is float32 to avoid excess mem. usage
    assert (data.dtype == np.float32)

    import os

    import pyopencl as cl

    # pick the first CPU device on any platform
    device = None
    for p in cl.get_platforms():
        devices = p.get_devices(cl.device_type.CPU)
        if len(devices) > 0:
            device = devices[0]
            break
    if device is None:
        raise RuntimeError('no OpenCL CPU device found')

    context = cl.Context([device])
    queue = cl.CommandQueue(context)

    # load and compile the update_pixel_map opencl code
    here = os.path.split(os.path.abspath(__file__))[0]
    with open(os.path.join(here, 'update_pixel_map.cl')) as f:
        kernelsource = f.read()
    program = cl.Program(context, kernelsource).build()

    translations_err_cl = program.translations_err
    translations_err_cl.set_scalar_arg_dtypes(
        10 * [None] + 2 * [np.float32] + 4 * [np.int32])

    # inputs:
    Win = W.astype(np.float32)
    pixel_mapin = pixel_map.astype(np.float32)
    Oin = O.astype(np.float32)
    dij_nin = dij_n.astype(np.float32)
    maskin = mask.astype(np.int32)
    ssin = ss.astype(np.float32)
    fsin = fs.astype(np.float32)
    ns = np.arange(data.shape[0]).astype(np.int32)

    # outputs:
    out = np.zeros(data.shape[0]).astype(np.float32)

    translations_err_cl(
        queue, (ns.shape[0], 1), (1, 1),
        cl.SVM(Win),
        cl.SVM(data),
        cl.SVM(Oin),
        cl.SVM(pixel_mapin),
        cl.SVM(dij_nin),
        cl.SVM(maskin),
        cl.SVM(ns),
        cl.SVM(out),
        cl.SVM(ssin),
        cl.SVM(fsin),
        n0, m0,
        data.shape[1], data.shape[2],
        O.shape[0], O.shape[1])
    queue.finish()

    return out
def quadratic_refinement_1d_opencl(data, mask, W, O, pixel_map, n0, m0, dij_n):
    """Refine ``pixel_map`` along one axis with a three-point quadratic fit.

    The ``pixel_map_err`` kernel is evaluated at shifts -1, 0 and +1 along
    the axis of ``data`` that is not of length 1. A parabola
    ``a x^2 + b x + c`` is fitted per pixel to the three errors and the
    pixel map is moved to its stationary point ``x = -b / 2a``.

    Parameters
    ----------
    data : ndarray of float32
        Must already be float32 (asserted). Exactly one of
        ``data.shape[1]`` and ``data.shape[2]`` must be 1.
    mask, W, O, pixel_map, dij_n : ndarray
        Converted to int32 (``mask``) or float32 (the rest) and passed to
        the kernel as buffers. ``W.shape`` is the kernel launch size.
        Their meaning is defined by the kernel source, which is not visible
        from here.
    n0, m0 : float
        Passed to the kernel as float32 scalars.

    Returns
    -------
    out : ndarray
        Copy of ``pixel_map`` with the accepted shifts added.
    info : dict
        ``'pixel_shift'`` (fitted shifts), ``'error'`` (sum over pixels of
        the smallest sampled error) and ``'err_quad'`` (the three sampled
        error maps).

    Raises
    ------
    ValueError
        If ``data`` does not have exactly one singleton axis among axes 1
        and 2.
    RuntimeError
        If no OpenCL platform offers a CPU device.
    """
    # demand that the data is float32 to avoid excess mem. usage
    assert(data.dtype == np.float32)

    ss_is_flat = data.shape[1] == 1
    fs_is_flat = data.shape[2] == 1
    # Any other combination samples 1 or 9 shifts into the 3-slot err_quad
    # and the fit below fails with a shape mismatch, so reject it up front
    # instead of after running every kernel.
    if ss_is_flat == fs_is_flat:
        raise ValueError(
            'quadratic_refinement_1d_opencl needs exactly one of '
            'data.shape[1] and data.shape[2] to be 1, got shape '
            '{}'.format(data.shape))

    import os

    import pyopencl as cl

    # pick the first CPU device on any platform
    device = None
    for p in cl.get_platforms():
        devices = p.get_devices(cl.device_type.CPU)
        if len(devices) > 0:
            device = devices[0]
            break
    if device is None:
        raise RuntimeError('no OpenCL CPU device found')

    context = cl.Context([device])
    queue = cl.CommandQueue(context)

    # load and compile the update_pixel_map opencl code
    here = os.path.split(os.path.abspath(__file__))[0]
    with open(os.path.join(here, 'update_pixel_map.cl')) as f:
        kernelsource = f.read()
    program = cl.Program(context, kernelsource).build()

    update_pixel_map_cl = program.pixel_map_err
    update_pixel_map_cl.set_scalar_arg_dtypes(
        8*[None] + 2*[np.float32] + 7*[np.int32])

    # local scratch: one float32 per entry along data's first axis
    localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * data.shape[0])

    # inputs:
    Win = W.astype(np.float32)
    pixel_mapin = pixel_map.astype(np.float32)
    Oin = O.astype(np.float32)
    dij_nin = dij_n.astype(np.float32)
    maskin = mask.astype(np.int32)

    # outputs:
    err_map = np.empty(W.shape, dtype=np.float32)
    pixel_shift = np.zeros(pixel_map.shape, dtype=np.float32)
    err_quad = np.empty((3,) + W.shape, dtype=np.float32)
    out = pixel_map.copy()

    # quadratic fit refinement: only shift along the non-singleton axis
    ss_shifts = [0] if ss_is_flat else [-1, 0, 1]
    fs_shifts = [0] if fs_is_flat else [-1, 0, 1]

    A = []
    for ss_shift in ss_shifts:
        for fs_shift in fs_shifts:
            # 9999 marks pixels the kernel did not write
            err_map.fill(9999)
            update_pixel_map_cl(
                queue, W.shape, (1, 1),
                cl.SVM(Win),
                cl.SVM(data),
                localmem,
                cl.SVM(err_map),
                cl.SVM(Oin),
                cl.SVM(pixel_mapin),
                cl.SVM(dij_nin),
                cl.SVM(maskin),
                n0, m0,
                data.shape[0], data.shape[1], data.shape[2],
                O.shape[0], O.shape[1],
                ss_shift, fs_shift)
            queue.finish()

            shift = fs_shift if ss_is_flat else ss_shift
            err_quad[shift+1, :, :] = err_map
            A.append([shift**2, shift, 1])

    # now we have 3 equations and 3 unknowns
    # a x^2 + b x + c = err_i
    B = np.linalg.pinv(A)
    C = np.dot(B, np.transpose(err_quad, (1, 0, 2)))

    # minima is defined by
    # 2 a x + b = 0
    # x = -b / 2a
    # where C = [a, b, c]
    #           [0, 1, 2]
    det = 2*C[0]

    # make sure all sampled shifts have a valid error
    m = np.all(err_quad != 9999, axis=0)

    # make sure the determinant is non zero
    m = m & (det != 0)

    if ss_is_flat:
        pixel_shift[1][m] = (-C[1])[m] / det[m]
    else:
        pixel_shift[0][m] = (-C[1])[m] / det[m]

    # now only update pixels for which x**2 < 3**2
    m = m & (np.sum(pixel_shift**2, axis=0) < 9)

    out[0][m] = out[0][m] + pixel_shift[0][m]
    out[1][m] = out[1][m] + pixel_shift[1][m]

    error = np.sum(np.min(err_quad, axis=0))
    return out, {'pixel_shift': pixel_shift, 'error': error, 'err_quad': err_quad}
def update_pixel_map_opencl(data, mask, W, O, pixel_map, n0, m0, dij_n,
                            subpixel, subsample, search_window, ss, fs):
    """Update the pixel map at the given pixels with an OpenCL search kernel.

    The kernel source is read from ``update_pixel_map.cl`` next to this
    file and built for the first OpenCL CPU device found. The kernel is
    first run over all pixels with the shift range ``[0, 1)`` on both axes,
    then in chunks of at most 100 pixels over the full search window.

    Parameters
    ----------
    data : ndarray of float32
        Must already be float32 (asserted) so that no extra copy is made.
        ``data.shape[0:3]`` are passed to the kernel.
    mask, W, O, pixel_map, dij_n : ndarray
        Converted to int32 (``mask``) or float32 (the rest) and passed to
        the kernel as buffers. Their meaning is defined by the kernel
        source, which is not visible from here.
    n0, m0 : float
        Passed to the kernel as float32 scalars.
    subpixel : bool
        Selects the ``update_pixel_map_subpixel`` kernel when true,
        ``update_pixel_map`` otherwise.
    subsample : float
        Passed to the kernel as a float32 scalar.
    search_window : pair of int
        Search window size along (ss, fs).
    ss, fs : ndarray
        Pixel indices to update; flattened and converted to int32.

    Returns
    -------
    out : ndarray, shape (2, len(ss)), dtype of ``pixel_map``
        Updated pixel-map values at the requested pixels only.
    info : dict
        ``'error_map'`` (the kernel's error buffer, shape ``W.shape``) and
        ``'error'`` (its sum).

    Raises
    ------
    RuntimeError
        If no OpenCL platform offers a CPU device.
    """
    # demand that the data is float32 to avoid excess mem. usage
    assert(data.dtype == np.float32)

    import os

    import pyopencl as cl

    # pick the first CPU device on any platform
    device = None
    for p in cl.get_platforms():
        devices = p.get_devices(cl.device_type.CPU)
        if len(devices) > 0:
            device = devices[0]
            break
    if device is None:
        raise RuntimeError('no OpenCL CPU device found')

    context = cl.Context([device])
    queue = cl.CommandQueue(context)

    # load and compile the update_pixel_map opencl code
    here = os.path.split(os.path.abspath(__file__))[0]
    with open(os.path.join(here, 'update_pixel_map.cl')) as f:
        kernelsource = f.read()
    program = cl.Program(context, kernelsource).build()

    if subpixel:
        update_pixel_map_cl = program.update_pixel_map_subpixel
    else:
        update_pixel_map_cl = program.update_pixel_map

    # 10 buffer/local arguments, 3 float scalars, 9 int scalars
    update_pixel_map_cl.set_scalar_arg_dtypes(
        10 * [None] + 3 * [np.float32] + 9 * [np.int32])

    # local scratch: one float32 per entry along data's first axis
    localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * data.shape[0])

    # inputs:
    Win = W.astype(np.float32)
    Oin = O.astype(np.float32)
    dij_nin = dij_n.astype(np.float32)
    maskin = mask.astype(np.int32)
    ss = ss.ravel().astype(np.int32)
    fs = fs.ravel().astype(np.int32)

    ss_min, ss_max = (-(search_window[0]-1)//2, (search_window[0]+1)//2)
    fs_min, fs_max = (-(search_window[1]-1)//2, (search_window[1]+1)//2)
    print(ss_min, ss_max)
    print(fs_min, fs_max)

    # outputs:
    err_map = np.zeros(W.shape, dtype=np.float32)
    pixel_mapout = pixel_map.astype(np.float32)

    # First pass over every pixel with a shift range of [0, 1) on both
    # axes. err_map is kept afterwards; the pixel map is reset below.
    update_pixel_map_cl(
        queue, (1, fs.shape[0]), (1, 1),
        cl.SVM(Win),
        cl.SVM(data),
        localmem,
        cl.SVM(err_map),
        cl.SVM(Oin),
        cl.SVM(pixel_mapout),
        cl.SVM(dij_nin),
        cl.SVM(maskin),
        cl.SVM(ss),
        cl.SVM(fs),
        n0, m0, subsample,
        data.shape[0], data.shape[1], data.shape[2],
        O.shape[0], O.shape[1],
        0, 1, 0, 1)
    queue.finish()

    # discard whatever the first pass wrote into the pixel map
    pixel_mapout = pixel_map.astype(np.float32)

    step = min(100, ss.shape[0])
    it = tqdm.tqdm(np.arange(ss.shape[0])[::step], desc='updating pixel map')
    for i in it:
        ssi = ss[i:i+step:]
        fsi = fs[i:i+step:]
        update_pixel_map_cl(
            queue, (1, fsi.shape[0]), (1, 1),
            cl.SVM(Win),
            cl.SVM(data),
            localmem,
            cl.SVM(err_map),
            cl.SVM(Oin),
            cl.SVM(pixel_mapout),
            cl.SVM(dij_nin),
            cl.SVM(maskin),
            cl.SVM(ssi),
            cl.SVM(fsi),
            n0, m0, subsample,
            data.shape[0], data.shape[1], data.shape[2],
            O.shape[0], O.shape[1],
            ss_min, ss_max, fs_min, fs_max)
        queue.finish()

        # running mean over the strictly positive error entries
        er = np.mean(err_map[err_map > 0])
        it.set_description("updating pixel map: {:.2e}".format(er))

    # only return filled values
    out = np.zeros((2,) + ss.shape, dtype=pixel_map.dtype)
    out[0] = pixel_mapout[0][ss, fs]
    out[1] = pixel_mapout[1][ss, fs]
    return out, {'error_map': err_map, 'error': np.sum(err_map)}
def quadratic_refinement_split_opencl(data, mask, W, O, pixel_map, n0, m0, dij_n):
    """Refine ``pixel_map`` with a 2-D quadratic fit over a 3x3 shift grid.

    The ``pixel_map_err_split`` kernel is evaluated at the nine shifts
    ``(ss, fs)`` in ``{-1, 0, 1}^2``. A quadratic surface is least-squares
    fitted per pixel to the nine errors and the pixel map is moved to the
    surface's stationary point when that point lies within 3 pixels.

    Parameters
    ----------
    data : ndarray of float32
        Must already be float32 (asserted) so that no extra copy is made.
        ``data.shape[0:3]`` are passed to the kernel.
    mask, W, O, pixel_map, dij_n : ndarray
        Converted to int32 (``mask``) or float32 (the rest) and passed to
        the kernel as buffers. ``W.shape`` is the kernel launch size.
        Their meaning is defined by the kernel source, which is not visible
        from here.
    n0, m0 : float
        Passed to the kernel as float32 scalars.

    Returns
    -------
    out : ndarray
        Copy of ``pixel_map`` with the accepted shifts added.
    info : dict
        ``'pixel_shift'`` (fitted shifts), ``'error'`` (sum over pixels of
        the smallest sampled error) and ``'err_quad'`` (the nine sampled
        error maps).

    Raises
    ------
    RuntimeError
        If no OpenCL platform offers a CPU device.
    """
    # demand that the data is float32 to avoid excess mem. usage
    assert (data.dtype == np.float32)

    import os
    import time

    import pyopencl as cl

    # pick the first CPU device on any platform
    device = None
    for p in cl.get_platforms():
        devices = p.get_devices(cl.device_type.CPU)
        if len(devices) > 0:
            device = devices[0]
            break
    if device is None:
        raise RuntimeError('no OpenCL CPU device found')

    context = cl.Context([device])
    queue = cl.CommandQueue(context)

    # load and compile the update_pixel_map opencl code
    here = os.path.split(os.path.abspath(__file__))[0]
    with open(os.path.join(here, 'update_pixel_map.cl')) as f:
        kernelsource = f.read()
    program = cl.Program(context, kernelsource).build()

    update_pixel_map_cl = program.pixel_map_err_split
    update_pixel_map_cl.set_scalar_arg_dtypes(
        9 * [None] + 2 * [np.float32] + 7 * [np.int32])

    # Get the max work group size for the kernel test on our device
    max_comp = device.max_compute_units
    max_size = update_pixel_map_cl.get_work_group_info(
        cl.kernel_work_group_info.WORK_GROUP_SIZE, device)
    print('maximum workgroup size:', max_size)
    print('maximum compute units :', max_comp)

    # local scratch: one float32 per data frame and one int32 per mask row
    localmem = cl.LocalMemory(np.dtype(np.float32).itemsize * data.shape[0])
    localmem_mask = cl.LocalMemory(np.dtype(np.int32).itemsize * mask.shape[0])

    # inputs:
    Win = W.astype(np.float32)
    pixel_mapin = pixel_map.astype(np.float32)
    Oin = O.astype(np.float32)
    dij_nin = dij_n.astype(np.float32)
    maskin = mask.astype(np.int32)

    # outputs:
    err_map = np.empty(W.shape, dtype=np.float32)
    pixel_shift = np.zeros(pixel_map.shape, dtype=np.float32)
    err_quad = np.empty((9, ) + W.shape, dtype=np.float32)
    out = pixel_map.copy()

    d0 = time.time()

    # quadratic fit refinement
    A = []
    print('\nquadratic refinement:')
    print('---------------------')
    for ss_shift in [-1, 0, 1]:
        for fs_shift in [-1, 0, 1]:
            A.append([
                ss_shift**2, fs_shift**2, ss_shift, fs_shift,
                ss_shift * fs_shift, 1
            ])
            print(ss_shift, fs_shift)
            update_pixel_map_cl(
                queue, W.shape, (1, 1),
                cl.SVM(Win),
                cl.SVM(data),
                localmem,
                cl.SVM(err_map),
                cl.SVM(Oin),
                cl.SVM(pixel_mapin),
                cl.SVM(dij_nin),
                cl.SVM(maskin),
                localmem_mask,
                n0, m0,
                data.shape[0], data.shape[1], data.shape[2],
                O.shape[0], O.shape[1],
                ss_shift, fs_shift)
            queue.finish()

            # row-major position of (ss_shift, fs_shift) in the 3x3 grid
            err_quad[3 * (ss_shift + 1) + fs_shift + 1, :, :] = err_map

    # now we have 9 equations and 6 unknowns
    # c_20 x^2 + c_02 y^2 + c_10 x + c_01 y + c_11 x y + c_00 = err_i
    B = np.linalg.pinv(A)
    C = np.dot(B, np.transpose(err_quad, (1, 0, 2)))

    # minima is defined by
    # 2 c_20 x +   c_11 y = -c_10
    #   c_11 x + 2 c_02 y = -c_01
    # where C = [c_20, c_02, c_10, c_01, c_11, c_00]
    #           [   0,    1,    2,    3,    4,    5]
    # [x y] = [[2c_02 -c_11], [-c_11, 2c_20]] . [-c_10 -c_01]
    #         / (2c_20 * 2c_02 - c_11**2)
    # x = (-2c_02 c_10 + c_11   c_01) / det
    # y = (  c_11 c_10 - 2 c_20 c_01) / det
    det = 2 * C[0] * 2 * C[1] - C[4]**2

    # make sure all sampled shifts have a valid error
    m = np.all(err_quad < np.finfo(np.float32).max, axis=0)

    # make sure the determinant is non zero
    m = m * (det != 0)
    pixel_shift[0][m] = (-2 * C[1] * C[2] + C[4] * C[3])[m] / det[m]
    pixel_shift[1][m] = (C[4] * C[2] - 2 * C[0] * C[3])[m] / det[m]

    # now only update pixels for which (x**2 + y**2) < 3**2
    m = m * (np.sum(pixel_shift**2, axis=0) < 9)

    out[0][m] = out[0][m] + pixel_shift[0][m]
    out[1][m] = out[1][m] + pixel_shift[1][m]

    print('calculation took:', time.time() - d0, 's')

    error = np.sum(np.min(err_quad, axis=0))
    return out, {
        'pixel_shift': pixel_shift,
        'error': error,
        'err_quad': err_quad
    }
) prg = cl.Program(ctx, """ __kernel void twice( __global float *a_g) { int gid = get_global_id(0); a_g[gid] = 2*a_g[gid]; } """).build() if has_coarse_grain_buffer_svm(dev): print("Testing coarse-grained buffer SVM...", end="") svm_ary = cl.SVM(cl.csvm_empty(ctx, 10, np.float32)) assert isinstance(svm_ary.mem, np.ndarray) with svm_ary.map_rw(queue) as ary: ary.fill(17) # use from host orig_ary = ary.copy() prg.twice(queue, svm_ary.mem.shape, None, svm_ary) queue.finish() with svm_ary.map_ro(queue) as ary: assert(np.array_equal(orig_ary*2, ary)) print(" done.") if has_fine_grain_buffer_svm(dev):