def test_pointer_increment(dtype):
    """Check that pointer arithmetic on private arrays behaves identically in the
    compiled OpenCL kernel and in the Python emulation.

    A private array cannot be re-assigned in C (``b = a`` does not compile), so the
    kernel swaps addresses through pointer variables instead, then passes an
    incremented pointer into a helper function.
    """
    # TODO: use https://numpy.org/doc/stable/reference/generated/numpy.ndarray.ctypes.html
    data = np.array([0]).astype(dtype)
    func = Function('func',
                    {'data': Private(data.dtype)},
                    """
                    return data[0];
                    """,
                    returns=data.dtype)
    # Assigning to array pointers does not work in C (e.g. b=a does not compile):
    # https://stackoverflow.com/questions/744536/c-array-declaration-and-assignment
    # Below this is solved by creating pointers p1 and p2 whose addresses can be
    # exchanged by assignment.
    knl = Kernel('knl_pointer_arithmetics',
                 {'data': data},
                 """
                 private dtype a[5] = {0};
                 private dtype b[5] = {0};
                 dtype *p1 = a;
                 dtype *p2 = b;
                 a[3] = (dtype)(5);
                 p2 = a;
                 data[0] = func(p2+3);
                 """,
                 global_size=data.shape,
                 type_defs={'dtype': dtype})
    prog = Program(functions=[func], kernels=[knl])
    knl_cl = prog.compile().knl_pointer_arithmetics
    knl_cl()
    res_cl = knl_cl.data.get()
    knl_py = prog.compile(emulate=True).knl_pointer_arithmetics
    knl_py()
    # Bug fix: previously this read knl_cl.data again, which only compared the
    # emulated result by accident of buffer aliasing (or trivially compared the
    # CL result with itself). Read the emulated kernel's own output buffer.
    res_py = knl_py.data.get()
    assert np.all(res_cl[0] == res_py[0])
def _get_kernel(stage_builder: FftStageBuilder, iteration, radix, data_in, data_out, emulate=False):
    """Build and compile the GPU kernel for one FFT stage.

    In each iteration, the algorithm combines the radix-R FFTs on subsequences of
    length Ns into the FFT of a new sequence of length R*Ns by performing an FFT
    of length R on the corresponding elements of the subsequences.

    :param stage_builder: provides the per-stage device functions, defines and
        shared-memory declaration for the given radix.
    :param iteration: stage index; iteration 0 reads from the (possibly
        non-power-of-two) input buffer and therefore uses a different offset.
    :param radix: radix R of this stage.
    :param data_in: input array (first axis = number of simultaneous FFTs).
    :param data_out: output array of the same batch layout.
    :param emulate: compile for Python emulation instead of OpenCL.
    :return: the compiled ``gpu_fft`` kernel.
    """
    if iteration == 0:
        # data0 is in_buffer, whose length might not be power of two
        offset_data_in = 'get_global_id(0)*N_INPUT'
    else:
        offset_data_in = 'get_global_id(0)*N'
    stage_builder.radix = radix
    funcs, type_defs, d, shared_mem_fft_stage = stage_builder.get_funcs_for_all_stages([radix])
    funcs.append(stage_builder.get_fft_stage_func())
    # Fix: drop the walrus-bound local ``M`` — it was never used; global_size
    # reads d['M'] instead.
    d['M'] = data_in.shape[0]  # number of FFTs to perform simultaneously
    knl_gpu_fft = Kernel(
        'gpu_fft',
        {
            'Ns': Scalar(Types.int),
            # In each iteration, the algorithm can be thought of combining the radix R FFTs on
            # subsequences of length Ns into the FFT of a new sequence of length RNs by
            # performing an FFT of length R on the corresponding elements of the subsequences.
            'data_in': Global(data_in),
            'data_out': Global(data_out),
            'direction': Scalar(Types.int),
            'iteration': Scalar(Types.int)
        },
        """
        ${shared_mem_fft_stage}
        int t = get_local_id(2); // thread index
        int b = get_global_id(1); // thread block index
        fft_stage(t, b, Ns,
                  data_in +${offset_data_in},
                  data_out +get_global_id(0)*N,
                  shared,direction, iteration);""",
        replacements={
            'offset_data_in': offset_data_in,
            'shared_mem_fft_stage': shared_mem_fft_stage
        },
        global_size=(d['M'], max(1, int(d['N'] / (d['R'] * d['T']))), d['T']),
        local_size=(1, 1, d['T']))
    program = Program(
        funcs, [knl_gpu_fft], defines=d, type_defs=type_defs).compile(
            context=data_in.context,
            emulate=emulate,
            file=Program.get_default_dir_pycl_kernels().joinpath(
                f'fft_{iteration}_{stage_builder.radix}'))
    return program.gpu_fft
def __init__(self, in_buffer: Array, out_buffer_dtype: np.dtype):
    """Prepare a kernel that element-wise converts ``in_buffer`` into a freshly
    allocated buffer of ``out_buffer_dtype``.

    The conversion uses OpenCL's ``convert_<type>`` builtin; the compiled program
    is stored on ``self.program`` for later execution.

    :param in_buffer: source array; its shape and queue are reused for the output.
    :param out_buffer_dtype: numpy dtype of the destination buffer.
    """
    self.in_buffer = in_buffer
    # Output has the same shape/queue as the input, only the dtype differs.
    self.out_buffer = empty(in_buffer.shape, out_buffer_dtype, in_buffer.queue)
    knl = Kernel(name='type',
                 args={'in_buffer': Global(self.in_buffer, read_only=True),
                       'out_buffer': Global(self.out_buffer)},
                 body=["""
                 int addr_in = ${command_addr_in};
                 int addr_out = ${command_addr_out};
                 out_buffer[addr_out]=convert_${buff_out_t}(in_buffer[addr_in]);
                 """],
                 # Address computation is templated per ndim so in/out may have
                 # different dimensionality while addressing the same element.
                 replacements={'command_addr_in': Helpers.command_compute_address(self.in_buffer.ndim),
                               'command_addr_out': Helpers.command_compute_address(self.out_buffer.ndim),
                               'buff_out_t': c_name_from_dtype(self.out_buffer.dtype)},
                 global_size=self.in_buffer.shape)
    self.program = Program(kernels=[knl]).compile(context=in_buffer.context, emulate=False)
def _get_cl_program(self) -> Program:
    """Build the (uncompiled) Program that sums ``self.in_buffer`` along
    ``self.axis`` into ``self.out_buffer``.

    Each work item loops over the reduced axis and accumulates into a scalar of
    the input's dtype (``buff_t``), then writes the sum to its output address.

    :return: a Program containing the single ``sum_along_axis`` kernel.
    """
    knl = Kernel(name='sum_along_axis',
                 args={
                     'in_buffer': Global(self.in_buffer),
                     'axis': Scalar(Types.int(self.axis)),
                     'out_buffer': Global(self.out_buffer)
                 },
                 body=[
                     """
                     buff_t sum = (buff_t) 0;
                     for(int i=0; i<${size_input_axis}; i++){// i == glob_id_axis
                        sum+=in_buffer[${addr_in}];
                     }
                     out_buffer[${addr}] = sum;
                     """
                 ],
                 replacements={
                     # Loop bound: extent of the axis being reduced.
                     'size_input_axis': self.in_buffer.shape[self.axis],
                     # Output address from the (reduced-rank) output shape.
                     'addr': Helpers.command_compute_address(self.out_buffer.ndim),
                     # Input address re-inserts the loop index at self.axis.
                     'addr_in': self._command_compute_address_in()
                 },
                 global_size=self.out_buffer.shape)
    # buff_t aliases the input dtype so accumulation matches the input precision.
    type_defs = {'buff_t': self.in_buffer.dtype}
    return Program(type_defs=type_defs, kernels=[knl])
def test_memoize_kernel():
    """Check that program compilation is cached: after the first (cold)
    compilation, re-compiling identical kernels must cost well under 1 ms each.
    """
    # thread = Thread(profile=True)
    ary_a = np.ones(int(1e3))
    ary_b = np.zeros(ary_a.shape)
    ary_a_buffer = to_device(ary_a)
    ary_b_buffer = to_device(ary_b)
    n_recompilations = 100
    for i in range(n_recompilations + 1):
        kernels = []
        for j in range(10):
            some_knl = Kernel(
                f'some_knl_{j}', {
                    'ary_a': Global(ary_a_buffer),
                    'ary_b': Global(ary_b_buffer)
                }, """
                ary_b[get_global_id(0)] = ary_a[get_global_id(0)];
                """)
            kernels.append(some_knl)
        Program(kernels=kernels).compile()
        some_knl(global_size=ary_a.shape)
        if i == 0:
            # Off-by-one fix: start the clock after the first (cold-compile)
            # iteration so exactly n_recompilations warm iterations are timed;
            # previously the timer started at i == 1, measuring only
            # n_recompilations - 1 iterations while dividing by n_recompilations.
            t = time.time()
    time_per_recompile = (time.time() - t) / n_recompilations
    # thread.queue.get_profiler().show_histogram_cumulative_kernel_times()
    print(time_per_recompile)
    assert time_per_recompile < 0.001  # less than 1 ms overhead per recompilation achieved through caching
def __init__(self, mode='a', b_create_kernel_file: bool = True):
    """Build a small demo program: a kernel calling a device function, using
    defines, type definitions and template replacements.

    :param mode: stored on the instance; not otherwise used in this initializer.
    :param b_create_kernel_file: accepted for API compatibility; not read here —
        presumably consumed by Program/compile elsewhere — TODO confirm.
    """
    self.mode = mode
    self.buff = zeros((10, ), Types.short)
    self.data_t = self.buff.dtype
    # Device function: adds a templated integer (5) to the indexed element.
    # NOTE(review): returns Types.float although the buffer is short — the
    # result is converted back on assignment in the kernel below.
    func = Function('plus_one',
                    {'buffer': Global(self.buff.dtype),
                     'idx': Scalar(Types.int)},
                    ["""
                    return buffer[idx]+${some_integer};
                    """],
                    # Positional replacements dict for the ${some_integer} template.
                    {'some_integer': 5},
                    returns=Types.float)
    knl = Kernel('some_operation',
                 {'buff': Global(self.buff),
                  # NOTE(review): 'number' is declared but unused in the kernel
                  # body; Types.short(3.0) coerces the float literal to short.
                  'number': Scalar(Types.short(3.0))},
                 ["""
                 data_t factor = convert_${data_t}(1.3);
                 buff[get_global_id(0)] = plus_one(buff, get_global_id(0)) + SOME_CONSTANT*factor;
                 """],
                 replacements={'data_t': c_name_from_dtype(self.data_t)},
                 global_size=self.buff.shape)
    # SOME_CONSTANT becomes a C preprocessor define; data_t a typedef.
    defines = {'SOME_CONSTANT': 6}
    type_defs = {'data_t': self.data_t}
    self.program = Program(defines=defines,
                           type_defs=type_defs,
                           functions=[func],
                           kernels=[knl]).compile()
    self.knl = knl
def test_bit_shift():
    """Check that bit shifting/packing (pack two 4-bit values into one uchar and
    unpack them) agrees between the compiled OpenCL kernel and the emulation.
    """
    # TODO: use https://numpy.org/doc/stable/reference/generated/numpy.ndarray.ctypes.html
    data = np.array([0, 0, 0, 0]).astype(Types.char)
    knl = Kernel('knl_bit_packing',
                 {'data': data},
                 """
                 uchar a = 5;
                 uchar b = 3;
                 uchar c = (a << 4) | b;
                 data[0] = (c & 0xF0) >> 4;
                 data[1] = c & (0x0F);
                 """,
                 global_size=data.shape)
    prog = Program(kernels=[knl])
    knl_cl = prog.compile().knl_bit_packing
    knl_py = prog.compile(emulate=True).knl_bit_packing
    knl_cl()
    get_current_queue().finish()
    res_cl = knl_cl.data.get()
    knl_py()
    # Bug fix: previously this read knl_cl.data again, which only tested the
    # emulation via accidental buffer aliasing. Read the emulated kernel's own
    # output buffer instead.
    res_py = knl_py.data.get()
    assert np.all(res_cl == res_py)
def test_macro_with_arguments():
    """A preprocessor macro with arguments must expand inside the kernel body,
    including a function-name argument that is called within the expansion."""
    # FUNC(a, b, c) expands to a compound statement calling c(a-b).
    macro_defs = {'FUNC(a,b,c)': '{ int tmp = c(a-b); a += b + tmp; }'}
    out = zeros((2, ), Types.int)
    add_two = Function('add_two',
                       {'a': Scalar(Types.int)},
                       'return a + 2;',
                       returns=Types.int)
    knl = Kernel('knl_macro_func',
                 {'ary': Global(out)},
                 """
                 int a = 1;
                 int b = 2;
                 FUNC(a, b, add_two)
                 ary[get_global_id(0)] = a;
                 """,
                 defines=macro_defs,
                 global_size=out.shape)
    Program([add_two], [knl]).compile().knl_macro_func()
    # a=1, b=2: tmp = add_two(1-2) = 1, then a += 2 + 1 -> 4 for every work item.
    expected = np.array([4, 4]).astype(out.dtype)
    assert np.allclose(out.get(), expected)
def test_add_functions_inside_function_or_kernel_definition():
    """Functions may declare their own nested helper functions; a kernel that
    lists only the outer function must still resolve the nested one."""
    buf = to_device(np.ones(100))
    add3 = Function('add_three',
                    {'a': Scalar(Types.int)},
                    'return a + 3;',
                    returns=Types.int)
    # add_five depends on add_three via its own `functions` list, not the
    # kernel's or the program's.
    add5 = Function('add_five',
                    {'a': Scalar(Types.int)},
                    """
                    return add_three(a)+2;
                    """,
                    functions=[add3],
                    returns=Types.int)
    some_knl = Kernel('some_knl',
                      {'ary_a': Global(buf)},
                      """
                      ary_a[get_global_id(0)] = add_five(ary_a[get_global_id(0)]);
                      """,
                      global_size=buf.shape,
                      functions=[add5])
    # Functions defined here have higher priority in case of name conflicts.
    top_level_functions = []
    Program(functions=top_level_functions, kernels=[some_knl]).compile()
    some_knl()
    # 1 + 3 + 2 == 6 for the first (and every) element.
    assert buf.get()[0] == 6
""" return nested_func(ary, shared); """, returns=Types.int) ary = to_device((ary_np := np.array([1, 2]).astype(Types.int))) use_existing_file_for_emulation(False) knl = Kernel('some_knl', { 'ary': Global(ary), }, """ __local int shared[2]; ary[get_global_id(0)] = parent(ary, shared); """, global_size=ary.shape) prog = Program([func_nested, func_parent], [knl]) prog_py = prog.compile(emulate=True) prog_cl = prog.compile(emulate=False) prog_py.some_knl() ary_py = ary.get() ary.set(ary_np) prog_cl.some_knl() ary_cl = ary.get() get_current_queue().finish() assert np.allclose(ary_py, np.array([2, 1])) assert np.allclose(ary_py, ary_cl) def test_macro_with_arguments(): defines = {