def test_memoize_kernel(): # thread = Thread(profile=True) ary_a = np.ones(int(1e3)) ary_b = np.zeros(ary_a.shape) ary_a_buffer = to_device(ary_a) ary_b_buffer = to_device(ary_b) n_recompilations = 100 for i in range(n_recompilations + 1): kernels = [] for j in range(10): some_knl = Kernel( f'some_knl_{j}', { 'ary_a': Global(ary_a_buffer), 'ary_b': Global(ary_b_buffer) }, """ ary_b[get_global_id(0)] = ary_a[get_global_id(0)]; """) kernels.append(some_knl) Program(kernels=kernels).compile() some_knl(global_size=ary_a.shape) if i == 1: t = time.time() time_per_recompile = (time.time() - t) / n_recompilations # thread.queue.get_profiler().show_histogram_cumulative_kernel_times() print(time_per_recompile) assert time_per_recompile < 0.001 # less than 1 ms overhead per recompilation achieved through caching
def test_copy_array_region_on_device_between_buffers(): ary_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.complex64) in_cl = to_device(ary_np) out_np = np.zeros(shape=(4, 4), dtype=in_cl) out_cl = to_device(out_np) copy_region = CopyArrayRegion(in_buffer=in_cl, region_in=Slice[:1, 1:3], out_buffer=out_cl, region_out=Slice[1:2, 2:4]) out = copy_region() out_np[1:2, 2:4] = in_cl.get()[0:1, 1:3] assert np.all(out.get() == out_np)
def test_two_input_integer_functions(name, dtype): a_cl = to_device(np.ones((10, ), dtype)) a_emulation = to_device(np.ones((10, ), dtype)) knl = Kernel(f'knl_{name}', { 'a': Global(a_cl), 'num': Scalar(dtype(0)) }, f'a[get_global_id(0)]={name}(a[get_global_id(0)], num);', global_size=a_cl.shape) knl.compile()(a=a_cl) knl.compile(emulate=True)(a=a_emulation) assert np.all(a_cl.get() == a_emulation.get())
def test_get_refreshed_argument_of_memoized_kernel(): for i in range(10): ary_a = np.ones(100) + i ary_b = np.zeros(100) some_knl = Kernel( 'some_knl', { 'ary_a': Global(to_device(ary_a)), 'ary_b': Global(to_device(ary_b)) }, """ ary_b[get_global_id(0)] = ary_a[get_global_id(0)]; """).compile() some_knl(global_size=ary_a.shape) assert np.all(some_knl.ary_b.get() == ary_a)
def test_debug_kernel_with_barriers(): buff = np.zeros(shape=(2, 4)).astype(Types.int) mem_buffer = to_device(buff) knl = Kernel('knl', {'mem_glob': Global(mem_buffer)}, """ __local int mem[2]; mem[0]=0; mem[1]=0; mem[get_local_id(1)] = get_local_id(1); barrier(CLK_LOCAL_MEM_FENCE); mem[get_local_id(1)] = mem[1]; //barrier(CLK_GLOBAL_MEM_FENCE); mem_glob[get_global_id(0)*get_global_size(1)+get_global_id(1)] = mem[get_local_id(1)]; """, global_size=(2, 4), local_size=(1, 2)) compiled_cl = knl.compile( emulate=False, file=Path(__file__).parent.joinpath('py_cl_kernels/knl')) compiled_cl() mem_buffer_py = zeros_like(mem_buffer) compiled_py = knl.compile( emulate=True, file=Path(__file__).parent.joinpath('py_cl_kernels/knl')) # out[0] = complex64(inp[0].real+out[0].imag*1j) instead of out[0].real=inp[0].real compiled_py(mem_glob=mem_buffer_py) assert np.all(mem_buffer.get() == mem_buffer_py.get())
def test_transpose_complex(): ary_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=Types.cfloat) ary_np = np.hstack([ary_np, ary_np]) # array([[1, 2, 3], [4, 5, 6]]) in_cl = to_device(ary_np) transpose_in_buffer = Transpose(in_cl, (1, 0)) out_cl = transpose_in_buffer() assert np.all(out_cl.get() == ary_np.T)
def test_copy_array_region_on_device_given_axis_index(): ary_np = np.ones(shape=(5, 10), dtype=np.int32) in_cl = to_device(ary_np) copy_region = CopyArrayRegion(in_buffer=in_cl, region_in=Slice[2:-1, :]) out = copy_region() out_np = in_cl.get()[2:-1, :] assert np.all(out.get() == out_np)
def test_copy_array_region_on_device_negative_indexing(): ary_np = np.ones(shape=(5, 10), dtype=np.int32) in_cl = to_device(ary_np) copy_region = CopyArrayRegion(in_buffer=in_cl, region_in=Slice[:, 2:-1]) out = copy_region() out_np = in_cl.get()[:, 2:-1] assert np.all(out.get() == out_np)
def test_multiple_command_queues(): queue1 = create_queue(device_id=0) queue2 = create_queue(context=queue1.context) ary_a = to_device(np.ones(100000) + 1, queue1) ary_b = to_device(np.zeros(100000), queue1) some_knl = Kernel('some_knl', { 'ary_a': Global(ary_a), 'ary_b': Global(ary_b) }, """ ary_b[get_global_id(0)] += ary_a[get_global_id(0)]; """, global_size=ary_a.shape).compile(queue2.context) some_knl(queue=queue2) # thread2.queue.finish() some_knl(queue=queue1) test = 0
def test_sum_along_axis_1d(): ary = np.array([1, 2, 3]) ary_buffer = to_device(ary) sum_along_axis = SumAlongAxis(ary_buffer, axis=0) # res_py = sum_along_axis(emulate=True).get() res_cl = sum_along_axis().get() ref = ary.sum(axis=0) assert np.all(res_cl == ref)
def test_copy_array_region_on_device(): ary_np = np.array([[1, 2, 3], [4, 5, 6]], dtype=np.complex64) in_cl = to_device(ary_np) copy_region = CopyArrayRegion(in_cl, region_in=Slice[0:1, 1:3]) out = copy_region() out_np = in_cl.get()[0:1, 1:3] assert np.all(out.get() == out_np)
def test_sum_along_axis(): ary = np.array([[1, 2, 3], [1, 2, 3]]) ary_buffer = to_device(ary) """" sum_along_axis = SumAlongAxis(ary_buffer, axis=0) res = sum_along_axis().get() ref = ary.sum(axis=0) assert np.all(res == ref) """ sum_along_axis = SumAlongAxis(ary_buffer, axis=1) res = sum_along_axis().get() ref = ary.sum(axis=1) assert np.all(res == ref)
def test_ifft(in_data_np): in_ = to_device(in_data_np) fft_cl = Fft(in_buffer=in_) fft_in_ = fft_cl() ref_ifft_fft_in_ = np.fft.ifft(fft_in_.get(), axis=-1) ifft_cl = IFft(fft_in_) ifft_fft_in_ = ifft_cl().get() if in_data_np.size < 1024: ifft_cl_py = IFft(fft_in_, emulate=True) py_np_ifft_fft_in_ = ifft_cl_py().get() a = py_np_ifft_fft_in_ b = ifft_fft_in_ assert np.allclose(a, b) assert np.allclose(ifft_fft_in_, ref_ifft_fft_in_)
def eval_code(emulate=False): data = to_device(np.array([0]).astype(Types.char)) knl = Kernel('knl_test_for_loop', { 'data': Global(data) }, """ ${header}{ data[0]+=i; } """, replacements={ 'header': header }, global_size=data.shape).compile(emulate=emulate) knl() get_current_queue().finish() res = knl.data.get() return res
def test_access_complex_variable(): buff = np.array([0.5]).astype(Types.cfloat) buff_in = to_device(buff) buff_out = zeros_like(buff_in) knl = Kernel('knl', { 'inp': Global(buff_in), 'out': Global(buff_out) }, """ out[get_global_id(0)].real = inp[get_global_id(0)].real; """, global_size=(1, )) compiled_cl = knl.compile( emulate=False, file=Path(__file__).parent.joinpath('py_cl_kernels/knl')) compiled_cl() buff_out_py = zeros_like(buff_in) compiled_py = knl.compile( emulate=True, file=Path(__file__).parent.joinpath('py_cl_kernels/knl')) # out[0] = complex64(inp[0].real+out[0].imag*1j) instead of out[0].real=inp[0].real compiled_py(out=buff_out_py) assert np.all(buff_out.get() == buff_out_py.get())
def test_add_functions_inside_function_or_kernel_definition(): ary_a = to_device(np.ones(100)) fnc_add3 = Function('add_three', {'a': Scalar(Types.int)}, 'return a + 3;', returns=Types.int) fnc_add5 = Function('add_five', {'a': Scalar(Types.int)}, """ return add_three(a)+2; """, functions=[fnc_add3], returns=Types.int) some_knl = Kernel('some_knl', {'ary_a': Global(ary_a)}, """ ary_a[get_global_id(0)] = add_five(ary_a[get_global_id(0)]); """, global_size=ary_a.shape, functions=[fnc_add5]) functions = [ ] # funcitons defined here have higher proiority in case of name conflicts Program(functions=functions, kernels=[some_knl]).compile() some_knl() assert ary_a.get()[0] == 6
def run(emulate=False): ary = to_device(np.ones(10).astype(data_t)) local_mem = LocalArray( dtype=data_t, shape=5) # 5 is to to test that local array argument is changed knl = Kernel('knl_local_arg', { 'ary': Global(ary), 'local_mem': local_mem }, """ int offset = get_group_id(0)*get_local_size(0); for(int i=0; i<5; i++) local_mem[i] = ary[offset + i]; barrier(CLK_LOCAL_MEM_FENCE); data_t sum = (data_t)(0); for(int i=0; i<5; i++) sum+=local_mem[i]; ary[get_global_id(0)] = sum; """, type_defs={'data_t': data_t}, global_size=ary.shape, local_size=(5, )) local_mem = LocalArray(dtype=data_t, shape=5) knl.compile(emulate=emulate)(local_mem=local_mem) return ary.get()
def test_fft(in_data_np): atol = 1e-4 if in_data_np.dtype == Types.cfloat else 1e-8 import numpy as np in_data_cl = to_device(in_data_np) fft_cl = Fft(in_data_cl, emulate=False) # zero padding data for numpy axis = 1 N = in_data_np.shape[axis] if not np.log2(N).is_integer(): # if not power of 2, pad accordingly N = 2**int(np.log2(N) + 1) in_data_np_power_of_two = np.zeros((in_data_np.shape[0], N), in_data_np.dtype) in_data_np_power_of_two[:, :in_data_np.shape[axis]] = in_data_np def measure(call): attempts = 3 ts = [] for i in range(attempts): t1 = time.time() call() t2 = time.time() ts.append(t2 - t1) return min(ts) # import pyfftw # t_fftw = measure(lambda: pyfftw.interfaces.numpy_fft.fft(in_data_np_power_of_two, axis=-1)) t_np = measure(lambda: np.fft.fft(in_data_np_power_of_two, axis=-1)) fft_in_data_np = np.fft.fft(in_data_np_power_of_two, axis=-1) def fft_call(): fft_in_data_cl = fft_cl() fft_in_data_cl.queue.finish() t_cl = measure(fft_call) fft_in_data_cl = fft_cl() if in_data_np.size < 1024: # Test against emulation (commented since it is slower) use_existing_file_for_emulation(False) fft_cl_py = Fft(in_data_cl, emulate=True) fft_in_data_cl_py = fft_cl_py() a = fft_in_data_cl_py.get().view(Types.cdouble) b = fft_in_data_cl.get().view(Types.cdouble) c = fft_in_data_np.view(Types.cdouble) assert np.allclose(a, b) assert np.allclose(c, b) assert np.allclose(c, a) # import matplotlib.pyplot as plt # plt.plot(fft_in_data_np.flatten()) # plt.plot(fft_in_data_cl_emulation.get().flatten()) # plt.show() assert np.allclose(fft_in_data_np, fft_in_data_cl.get(), atol=atol) # benchmark using reikna if False: # change to true to run against reikna's fft. Note: Reikna takes quite some optimization time before run from reikna.cluda import any_api from reikna.fft import FFT import numpy api = any_api() thr = api.Thread.create() data = in_data_np dtype = data.dtype axes = (1, ) fft = FFT(data, axes=axes) fftc = fft.compile(thr) data_dev = thr.to_device(data) res_dev = thr.empty_like(data_dev) ts = [] for i in range(attempts): t1 = time.time() fftc(res_dev, data_dev) thr.synchronize() t2 = time.time() ts.append(t2 - t1) fwd_ref = numpy.fft.fftn(data, axes=axes).astype(dtype) tnp = time.time() fwd_ref = numpy.fft.fftn(data, axes=axes).astype(dtype) tnp = time.time() - tnp # numpy.fft.fftn(data[:, :, 0], axes=(1,)) treikna_min = min(ts) assert np.allclose(fft_in_data_np, res_dev.get())
def test_transpose_single_dimension(): ary_np = np.array([[1, 2, 3]], dtype=Types.int).T in_cl = to_device(ary_np) transpose_on_input_buffer = Transpose(in_cl, (1, 0)) out_cl = transpose_on_input_buffer() assert np.all(out_cl.get() == ary_np.T)
def cl_set(array: Array, region: TypeSliceFormatCopyArrayRegion, value): """ example usage: set slice of array with scalar value val = 1 cl_set(ary, Slice[:,2:3], val) set slice of array with equally shaped numpy array like the slice some_np_array = np.array([[3,4]) cl_set(ary, Slice[1:2,2:3], some_np_array) :param array: :param region: :param value: :return: """ # todo test if array c contiguous region_arg = region # if slice is contiguous block of memory set it as # _buffer_np = np.zeros_like(add_symbols_memory_initialization.out_buffer) # _buffer_np[:, memory: -memory] = mapper.alphabet[0] # add_symbols_memory_initialization.out_buffer.set(_buffer_np) region = CopyArrayRegion._deal_with_incomplete_regions(region_arg, array) region = CopyArrayRegion._deal_with_none_in_stop(region, array) region = CopyArrayRegion._deal_with_negative_region(region, array) # test if requested region is for axis, _slice in enumerate(region): step_width = _slice[2] if abs(_slice[0] * step_width) > array.shape[axis] or abs(_slice[1] * step_width) > array.shape[axis]: raise ValueError('Slicing out of array bounds') if any([(part[0] - part[1]) == 0 for part in region]): # check that there is no empty slice return global_size = np.product([part[1] - part[0] for part in region]) target_shape = to_device(np.array(array.shape).astype(Types.int)) offset_target = to_device(np.array([part[0] for part in region]).astype(Types.int)) source_shape = to_device(np.array([part[1] - part[0] for part in region]).astype(Types.int)) source_n_dims = len(source_shape) if isinstance(value, np.ndarray): source = to_device(value.astype(array.dtype)) arg_source = Global(source) code_source = 'source[get_global_id(0)]' else: arg_source = Scalar(array.dtype) source = value code_source = 'source' knl = Kernel('set_cl_array', {'target': Global(array), 'target_shape': Global(target_shape), 'offset_target': Global(offset_target), 'source': arg_source, 'source_shape': Global(source_shape), 'source_n_dims': Scalar(Types.int)}, """ // id_source = get_global_id(0) // id_source points to element of array source which replaces element with id_target in array target. // we need to compute id_target from id_source: // we assume c-contiguous addressing like: // id_source = id0*s1*s2*s3+id1*s2*s3+id2*s3+id3 (here s refers shape of source array) // At first we need to compute individual ids of source array from id_source: // id3 = int(gid % s3), temp = (gid-id3)/s3 // id2 = int(temp % s2), temp = (temp-id2)/s2 // id1 = int(temp % s1), temp = (temp-id1)/s1 // id0 = int(temp % s0), temp = (temp-id0)/s1 // Finally, we can determine the id of the target array and copy element to corresponding position: // id_target = (id0*offset0t)*s1t*s2t ... (sxt: shape of target array along dim x) int id_target = 0; // to be iteratively computed from global id, slice dimensions and ary dimensions int temp = get_global_id(0); int prod_source_id_multiplier = 1; int prod_target_id_multiplier = 1; for(int i=source_n_dims-1; i>=0; i--){ // i=i_axis_source int id_source = temp % source_shape[i]; temp = (int)((temp-id_source)/source_shape[i]); prod_source_id_multiplier *= source_shape[i]; id_target += (offset_target[i]+id_source)*prod_target_id_multiplier; prod_target_id_multiplier *= target_shape[i]; } target[id_target] = ${source}; """, replacements={'addr': Helpers.command_compute_address(array.ndim), 'source': code_source}, global_size=(global_size,) ).compile(array.context, emulate=False) knl(source=source, source_n_dims=source_n_dims)
def deal_with_np_arrays(v): if isinstance(v, Global) and isinstance(v.default, np.ndarray): v.default = to_device(ary=v.default, queue=queue) return v else: return v