def __init__(self, in_buffer: Array, axes_order: Tuple[int, ...]):
    self.axes_order = axes_order
    shape_out = tuple(in_buffer.shape[i] for i in axes_order)
    self.out_buffer = empty(shape_out, in_buffer.dtype)
    self.in_buffer = in_buffer
    self.knl = Kernel(name='transpose',
                      args={'in_buffer': Global(self.in_buffer, read_only=True),
                            'out_buffer': Global(self.out_buffer)},
                      body=["""
                      int i_in = ${i_in};
                      int i_out = ${i_out};
                      out_buffer[i_out] = in_buffer[i_in];
                      """],
                      replacements={'i_in': self._command_for_input_address_computation(),
                                    'i_out': self._command_for_output_address_computation()},
                      global_size=self.in_buffer.shape).compile()
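# --- Usage sketch (illustrative, not part of the original source) ---
# Assuming the __init__ above belongs to a class named `Transpose` (the class
# name is an assumption) and that the package exposes the helpers used
# throughout this file, the component should mirror numpy's transpose:
import numpy as np
from pyopencl_extension import to_device  # package name assumed from the API style used here

ary = to_device(np.arange(6).reshape(2, 3).astype(np.float32))
transposer = Transpose(ary, axes_order=(1, 0))  # hypothetical class name
transposer.knl()  # one work item per input element
assert np.all(transposer.out_buffer.get() == ary.get().transpose(1, 0))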
def _get_cl_program(self) -> Program:
    knl = Kernel(name='sum_along_axis',
                 args={'in_buffer': Global(self.in_buffer),
                       'axis': Scalar(Types.int(self.axis)),
                       'out_buffer': Global(self.out_buffer)},
                 body=["""
                 buff_t sum = (buff_t) 0;
                 for(int i=0; i<${size_input_axis}; i++){ // i == glob_id_axis
                     sum += in_buffer[${addr_in}];
                 }
                 out_buffer[${addr}] = sum;
                 """],
                 replacements={'size_input_axis': self.in_buffer.shape[self.axis],
                               'addr': Helpers.command_compute_address(self.out_buffer.ndim),
                               'addr_in': self._command_compute_address_in()},
                 global_size=self.out_buffer.shape)
    type_defs = {'buff_t': self.in_buffer.dtype}
    return Program(type_defs=type_defs, kernels=[knl])
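# --- Usage sketch (illustrative, not part of the original source) ---
# Assuming the surrounding class is constructed with in_buffer, out_buffer and
# axis (names as used above; the class name below is hypothetical), the
# compiled program should reproduce numpy's sum along an axis:
import numpy as np

a = np.arange(12).reshape(3, 4).astype(np.int32)
summer = SumAlongAxis(to_device(a), axis=1)  # hypothetical class name
summer._get_cl_program().compile().sum_along_axis()  # a compiled Program exposes kernels by name
assert np.all(summer.out_buffer.get() == a.sum(axis=1))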
def test_memoize_kernel():
    # thread = Thread(profile=True)
    ary_a = np.ones(int(1e3))
    ary_b = np.zeros(ary_a.shape)
    ary_a_buffer = to_device(ary_a)
    ary_b_buffer = to_device(ary_b)
    n_recompilations = 100
    for i in range(n_recompilations + 1):
        kernels = []
        for j in range(10):
            some_knl = Kernel(f'some_knl_{j}',
                              {'ary_a': Global(ary_a_buffer),
                               'ary_b': Global(ary_b_buffer)},
                              """
                              ary_b[get_global_id(0)] = ary_a[get_global_id(0)];
                              """)
            kernels.append(some_knl)
        Program(kernels=kernels).compile()
        some_knl(global_size=ary_a.shape)
        if i == 1:  # start timing once the cache is warm
            t = time.time()
    time_per_recompile = (time.time() - t) / n_recompilations
    # thread.queue.get_profiler().show_histogram_cumulative_kernel_times()
    print(time_per_recompile)
    assert time_per_recompile < 0.001  # caching keeps recompilation overhead below 1 ms
def get_fft_stage_func(self):
    replacements = {'fft_radix_R': f'fft_radix_{self.radix}'}
    func_fft_iteration = Function(
        'fft_stage',
        {
            # thread index: T radix-R FFT ops are done in parallel
            't': Scalar(Types.int),
            # thread block index: identifies one block out of N/(R*T) blocks
            'b': Scalar(Types.int),
            'Ns': Scalar(Types.int),
            'data0': Local(self.data_t) if self.b_local_memory else Global(self.data_t, read_only=True),
            'data1': Local(self.data_t) if self.b_local_memory else Global(self.data_t),
            'shared': Local(self.real_t),
            'direction': Scalar(Types.int),
            'iteration': Scalar(Types.int)
        },
        """
        int j = b*T + t; // as proposed in [1, p.3, text section A]
        private data_t v[R];
        real_t angle = -2*PI*(j % Ns)/(Ns*R);
        for(int r=0; r<R; r++){
            int idxS = j + r*N/R; // idxS = idxSource
            v_from_data0(v, data0, idxS, r, direction, iteration);
            real_t cos_angle = cos(r*angle);
            real_t sin_angle = sin(r*angle);
            v[r] = (data_t)(v[r].s0*cos_angle - v[r].s1*sin_angle,
                            v[r].s0*sin_angle + v[r].s1*cos_angle);
        }
        ${fft_radix_R}(v);
        if(Ns>=CW){ // todo: remove condition and store as separate kernel
            // changed line 27 of [1, Figure 2] to work with global dimensions of this class:
            int offset = expand(j, Ns, R);
            for(int r=0; r<R; r++){
                int idxD = offset + r*Ns; // idxD = idxDestination
                data_1_from_v(v, data1, idxD, r, direction, iteration);
            }
        }else{
            // According to [1, p.4], such that global memory writes are coalesced.
            // !! mistake in [1] where *Ns is missing: idxD = (int)(t/Ns)*R + (t%Ns); !!
            int idxD = (int)(t/Ns)*Ns*R + (t%Ns);
            exchange(v, idxD, Ns, t, T, shared);
            int offset = b*R*T + t;
            for(int r=0; r<R; r++){
                idxD = offset + r*T;
                data_1_from_v(v, data1, idxD, r, direction, iteration);
            }
        }
        """,
        replacements=replacements)
    return func_fft_iteration
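# --- Illustrative sketch (not part of the original source) ---
# The body above calls expand(j, Ns, R). In the referenced paper [1] this index
# helper is defined as expand(idxL, N1, N2) = (idxL/N1)*N1*N2 + (idxL%N1),
# i.e. it inserts a dimension of length N2 after a dimension of length N1 in a
# linearized index. A pyopencl_extension-style Function for it might look like:
func_expand = Function('expand',
                       {'idxL': Scalar(Types.int),
                        'N1': Scalar(Types.int),
                        'N2': Scalar(Types.int)},
                       'return (idxL/N1)*N1*N2 + (idxL%N1);',
                       returns=Types.int)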
def __init__(self, mode='a', b_create_kernel_file: bool = True):
    self.mode = mode
    self.buff = zeros((10,), Types.short)
    self.data_t = self.buff.dtype
    func = Function('plus_one',
                    {'buffer': Global(self.buff.dtype),
                     'idx': Scalar(Types.int)},
                    ["""
                     return buffer[idx]+${some_integer};
                     """],
                    {'some_integer': 5},
                    returns=Types.float)
    knl = Kernel('some_operation',
                 {'buff': Global(self.buff),
                  'number': Scalar(Types.short(3))},
                 ["""
                  data_t factor = convert_${data_t}(1.3);
                  buff[get_global_id(0)] = plus_one(buff, get_global_id(0)) + SOME_CONSTANT*factor;
                  """],
                 replacements={'data_t': c_name_from_dtype(self.data_t)},
                 global_size=self.buff.shape)
    defines = {'SOME_CONSTANT': 6}
    type_defs = {'data_t': self.data_t}
    self.program = Program(defines=defines,
                           type_defs=type_defs,
                           functions=[func],
                           kernels=[knl]).compile()
    self.knl = knl
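# --- Usage sketch (illustrative, not part of the original source) ---
# After Program(...).compile(), the kernel can be launched with the defaults
# bound at definition time, or with arguments overridden per call (the
# override pattern is grounded in the tests below; the class name is an
# assumption):
component = MyComponent()
component.knl()                       # buff and number=3 bound at definition
component.knl(number=Types.short(7))  # assumption: scalar defaults may be overridden per call
print(component.buff.get())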
def test_get_refreshed_argument_of_memoized_kernel():
    for i in range(10):
        ary_a = np.ones(100) + i
        ary_b = np.zeros(100)
        some_knl = Kernel('some_knl',
                          {'ary_a': Global(to_device(ary_a)),
                           'ary_b': Global(to_device(ary_b))},
                          """
                          ary_b[get_global_id(0)] = ary_a[get_global_id(0)];
                          """).compile()
        some_knl(global_size=ary_a.shape)
        assert np.all(some_knl.ary_b.get() == ary_a)
def _get_kernel(stage_builder: FftStageBuilder, iteration, radix, data_in, data_out,
                emulate=False):
    if iteration == 0:  # data0 is in_buffer, whose length might not be a power of two
        offset_data_in = 'get_global_id(0)*N_INPUT'
    else:
        offset_data_in = 'get_global_id(0)*N'
    stage_builder.radix = radix
    funcs, type_defs, d, shared_mem_fft_stage = stage_builder.get_funcs_for_all_stages([radix])
    funcs.append(stage_builder.get_fft_stage_func())
    d['M'] = data_in.shape[0]  # number of FFTs to perform simultaneously
    knl_gpu_fft = Kernel(
        'gpu_fft',
        {
            # In each iteration, the algorithm can be thought of as combining the radix-R FFTs on
            # subsequences of length Ns into the FFT of a new sequence of length R*Ns by
            # performing an FFT of length R on the corresponding elements of the subsequences.
            'Ns': Scalar(Types.int),
            'data_in': Global(data_in),
            'data_out': Global(data_out),
            'direction': Scalar(Types.int),
            'iteration': Scalar(Types.int)
        },
        """
        ${shared_mem_fft_stage}
        int t = get_local_id(2);  // thread index
        int b = get_global_id(1); // thread block index
        fft_stage(t, b, Ns,
                  data_in + ${offset_data_in},
                  data_out + get_global_id(0)*N,
                  shared, direction, iteration);""",
        replacements={'offset_data_in': offset_data_in,
                      'shared_mem_fft_stage': shared_mem_fft_stage},
        global_size=(d['M'], max(1, int(d['N'] / (d['R'] * d['T']))), d['T']),
        local_size=(1, 1, d['T']))
    program = Program(funcs, [knl_gpu_fft], defines=d, type_defs=type_defs).compile(
        context=data_in.context, emulate=emulate,
        file=Program.get_default_dir_pycl_kernels().joinpath(
            f'fft_{iteration}_{stage_builder.radix}'))
    return program.gpu_fft
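# --- Illustrative driver sketch (not part of the original source) ---
# Per the radix-R scheme described in the kernel comment above, a full FFT
# chains the stage kernel while Ns grows by a factor of R per iteration,
# ping-ponging between the two buffers. A minimal sketch, assuming N is the
# padded FFT length and direction=-1 denotes a forward transform (both
# assumptions; the library's actual driver loop may differ):
Ns, iteration = 1, 0
while Ns < N:
    gpu_fft = _get_kernel(stage_builder, iteration, radix, data_in, data_out)
    gpu_fft(Ns=Ns, direction=-1, iteration=iteration)
    data_in, data_out = data_out, data_in  # ping-pong buffers
    Ns *= radix
    iteration += 1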
def test_debug_c_code_with_unary_increment_operation_inside_of_array():
    buff_cl = zeros((6, 1), Types.short)
    knl = Kernel('knl',
                 {'buff': Global(buff_cl)},
                 """
                 int number = -1; number++;
                 buff[number++] = 1;
                 buff[number] = 2;
                 number = 0;
                 buff[2 + number--] = 3;
                 buff[3 + ++number] = 4;
                 buff[5 + --number] = 5;
                 int count = 0;
                 for(int i=1; i<3; i++){
                     count = count + i;
                 }
                 buff[5] = count;
                 """,
                 global_size=(1,))
    compiled_cl = knl.compile(emulate=False)
    compiled_cl(buff=buff_cl)
    buff_py = zeros((6, 1), Types.short)
    compiled_py = knl.compile(emulate=True,
                              file=Path(__file__).parent.joinpath('py_cl_kernels/knl'))
    compiled_py(buff=buff_py)
    assert np.all(buff_py.get() == buff_cl.get())
def test_debug_kernel_with_barriers():
    buff = np.zeros(shape=(2, 4)).astype(Types.int)
    mem_buffer = to_device(buff)
    knl = Kernel('knl',
                 {'mem_glob': Global(mem_buffer)},
                 """
                 __local int mem[2];
                 mem[0] = 0; mem[1] = 0;
                 mem[get_local_id(1)] = get_local_id(1);
                 barrier(CLK_LOCAL_MEM_FENCE);
                 mem[get_local_id(1)] = mem[1];
                 //barrier(CLK_GLOBAL_MEM_FENCE);
                 mem_glob[get_global_id(0)*get_global_size(1)+get_global_id(1)] = mem[get_local_id(1)];
                 """,
                 global_size=(2, 4),
                 local_size=(1, 2))
    compiled_cl = knl.compile(emulate=False,
                              file=Path(__file__).parent.joinpath('py_cl_kernels/knl'))
    compiled_cl()
    mem_buffer_py = zeros_like(mem_buffer)
    compiled_py = knl.compile(emulate=True,
                              file=Path(__file__).parent.joinpath('py_cl_kernels/knl'))
    compiled_py(mem_glob=mem_buffer_py)
    assert np.all(mem_buffer.get() == mem_buffer_py.get())
def __init__(self, in_buffer: Array, out_buffer_dtype: np.dtype):
    self.in_buffer = in_buffer
    self.out_buffer = empty(in_buffer.shape, out_buffer_dtype, in_buffer.queue)
    knl = Kernel(name='type',
                 args={'in_buffer': Global(self.in_buffer, read_only=True),
                       'out_buffer': Global(self.out_buffer)},
                 body=["""
                 int addr_in = ${command_addr_in};
                 int addr_out = ${command_addr_out};
                 out_buffer[addr_out] = convert_${buff_out_t}(in_buffer[addr_in]);
                 """],
                 replacements={'command_addr_in': Helpers.command_compute_address(self.in_buffer.ndim),
                               'command_addr_out': Helpers.command_compute_address(self.out_buffer.ndim),
                               'buff_out_t': c_name_from_dtype(self.out_buffer.dtype)},
                 global_size=self.in_buffer.shape)
    self.program = Program(kernels=[knl]).compile(context=in_buffer.context, emulate=False)
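# --- Usage sketch (illustrative, not part of the original source) ---
# Assuming the __init__ above belongs to a dtype-conversion helper (the class
# name below is hypothetical), it should behave like numpy's astype:
import numpy as np

a = to_device(np.linspace(0, 100, 10).astype(Types.float))
caster = ConvertType(a, Types.short)  # hypothetical class name
caster.program.type()                 # the kernel is named 'type' above
assert np.all(caster.out_buffer.get() == a.get().astype(Types.short))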
def test_multiple_command_queues():
    queue1 = create_queue(device_id=0)
    queue2 = create_queue(context=queue1.context)
    ary_a = to_device(np.ones(100000) + 1, queue1)
    ary_b = to_device(np.zeros(100000), queue1)
    some_knl = Kernel('some_knl',
                      {'ary_a': Global(ary_a),
                       'ary_b': Global(ary_b)},
                      """
                      ary_b[get_global_id(0)] += ary_a[get_global_id(0)];
                      """,
                      global_size=ary_a.shape).compile(queue2.context)
    some_knl(queue=queue2)  # the same compiled kernel can be enqueued on either queue
    some_knl(queue=queue1)
def __init__(self, b_create_kernel_file: bool = True):
    self.buff = zeros((10,), Types.short)
    self.knl = Kernel('some_operation',
                      {'buff': Global(self.buff),
                       'number': Scalar(Types.short(1))},
                      ["""
                       buff[get_global_id(0)] = number;
                       """],
                      global_size=self.buff.shape).compile()
def test_two_input_integer_functions(name, dtype):
    a_cl = to_device(np.ones((10,), dtype))
    a_emulation = to_device(np.ones((10,), dtype))
    knl = Kernel(f'knl_{name}',
                 {'a': Global(a_cl),
                  'num': Scalar(dtype(0))},
                 f'a[get_global_id(0)] = {name}(a[get_global_id(0)], num);',
                 global_size=a_cl.shape)
    knl.compile()(a=a_cl)
    knl.compile(emulate=True)(a=a_emulation)
    assert np.all(a_cl.get() == a_emulation.get())
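# --- Illustrative parametrization sketch (not part of the original source) ---
# The test above expects (name, dtype) pairs; a pytest parametrization along
# these lines would exercise OpenCL's two-argument integer builtins (the exact
# function and dtype lists in the original may differ):
import pytest

@pytest.mark.parametrize('name', ['min', 'max'])
@pytest.mark.parametrize('dtype', [Types.int, Types.long])
def test_two_input_integer_functions_sketch(name, dtype):
    test_two_input_integer_functions(name, dtype)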
def test_access_complex_variable():
    buff = np.array([0.5]).astype(Types.cfloat)
    buff_in = to_device(buff)
    buff_out = zeros_like(buff_in)
    knl = Kernel('knl',
                 {'inp': Global(buff_in),
                  'out': Global(buff_out)},
                 """
                 out[get_global_id(0)].real = inp[get_global_id(0)].real;
                 """,
                 global_size=(1,))
    compiled_cl = knl.compile(emulate=False,
                              file=Path(__file__).parent.joinpath('py_cl_kernels/knl'))
    compiled_cl()
    buff_out_py = zeros_like(buff_in)
    compiled_py = knl.compile(emulate=True,
                              file=Path(__file__).parent.joinpath('py_cl_kernels/knl'))
    # in emulation this becomes out[0] = complex64(inp[0].real + out[0].imag*1j)
    # instead of out[0].real = inp[0].real
    compiled_py(out=buff_out_py)
    assert np.all(buff_out.get() == buff_out_py.get())
def test_different_c_operations_at_once():
    ary = zeros((2,), Types.int)
    knl = Kernel('knl_multiple_c_operations',
                 {'ary': Global(ary)},
                 """
                 int a = 1;
                 int b = 2;
                 dtype val;          // test variable definition without assignment
                 dtype *ptr1;        // test pointer definition
                 global dtype *ptr2; // test global pointer definition
                 ary[get_global_id(0)] = a > get_global_id(0) ? a : b;
                 """,
                 global_size=ary.shape,
                 type_defs={'dtype': ary.dtype}).compile(emulate=True)
    knl()
    assert np.allclose(ary.get(), np.array([1, 2]).astype(ary.dtype))
def eval_code(emulate=False):
    # `header` is a free variable supplied by the enclosing test scope
    data = to_device(np.array([0]).astype(Types.char))
    knl = Kernel('knl_test_for_loop',
                 {'data': Global(data)},
                 """
                 ${header}{
                     data[0] += i;
                 }
                 """,
                 replacements={'header': header},
                 global_size=data.shape).compile(emulate=emulate)
    knl()
    get_current_queue().finish()
    res = knl.data.get()
    return res
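# --- Illustrative check (not part of the original source) ---
# Plausible `header` values (assumptions) that make ${header}{ data[0]+=i; }
# a valid loop; assuming `header` here is the variable eval_code closes over,
# each variant should sum the same range in CL and in emulation:
for header in ['for(int i=0; i<4; i++)',
               'for(int i=3; i>=0; i--)']:
    assert np.all(eval_code(emulate=True) == eval_code(emulate=False))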
def test_macro_with_arguments():
    # a macro with arguments
    defines = {'FUNC(a,b,c)': '{ int tmp = c(a-b); a += b + tmp; }'}
    ary = zeros((2,), Types.int)
    func_add_two = Function('add_two',
                            {'a': Scalar(Types.int)},
                            'return a + 2;',
                            returns=Types.int)
    knl = Kernel('knl_macro_func',
                 {'ary': Global(ary)},
                 """
                 int a = 1;
                 int b = 2;
                 FUNC(a, b, add_two)
                 ary[get_global_id(0)] = a;
                 """,
                 defines=defines,
                 global_size=ary.shape)
    Program([func_add_two], [knl]).compile().knl_macro_func()
    # FUNC(a, b, add_two) expands to { int tmp = add_two(1-2); a += 2 + tmp; },
    # i.e. tmp = 1 and a = 1 + 2 + 1 = 4:
    assert np.allclose(ary.get(), np.array([4, 4]).astype(ary.dtype))
def test_nested_local_barrier_inside_function():
    func_nested = Function('nested_func',
                           {'ary': Global(Types.int),
                            'shared': Local(Types.int)},
                           """
                           shared[get_global_id(0)] = ary[get_global_id(0)];
                           barrier(CLK_LOCAL_MEM_FENCE);
                           return shared[(get_global_id(0)+1)%2];
                           """,
                           returns=Types.int)
    func_parent = Function('parent', func_nested.args,
                           """
                           return nested_func(ary, shared);
                           """,
                           returns=Types.int)
    ary = to_device((ary_np := np.array([1, 2]).astype(Types.int)))
    use_existing_file_for_emulation(False)
    knl = Kernel('some_knl',
                 {'ary': Global(ary)},
                 """
                 __local int shared[2];
                 ary[get_global_id(0)] = parent(ary, shared);
                 """,
                 global_size=ary.shape)
    prog = Program([func_nested, func_parent], [knl])
    prog_py = prog.compile(emulate=True)
    prog_cl = prog.compile(emulate=False)
    prog_py.some_knl()
    ary_py = ary.get()
    ary.set(ary_np)
    prog_cl.some_knl()
    ary_cl = ary.get()
    assert np.all(ary_py == ary_cl)
def test_add_functions_inside_function_or_kernel_definition():
    ary_a = to_device(np.ones(100))
    fnc_add3 = Function('add_three',
                        {'a': Scalar(Types.int)},
                        'return a + 3;',
                        returns=Types.int)
    fnc_add5 = Function('add_five',
                        {'a': Scalar(Types.int)},
                        """
                        return add_three(a)+2;
                        """,
                        functions=[fnc_add3],
                        returns=Types.int)
    some_knl = Kernel('some_knl',
                      {'ary_a': Global(ary_a)},
                      """
                      ary_a[get_global_id(0)] = add_five(ary_a[get_global_id(0)]);
                      """,
                      global_size=ary_a.shape,
                      functions=[fnc_add5])
    # functions defined here have higher priority in case of name conflicts
    functions = []
    Program(functions=functions, kernels=[some_knl]).compile()
    some_knl()
    assert ary_a.get()[0] == 6
def run(emulate=False):
    ary = to_device(np.ones(10).astype(data_t))
    # shape=5 tests that the local array argument can be replaced later
    local_mem = LocalArray(dtype=data_t, shape=5)
    knl = Kernel('knl_local_arg',
                 {'ary': Global(ary),
                  'local_mem': local_mem},
                 """
                 int offset = get_group_id(0)*get_local_size(0);
                 for(int i=0; i<5; i++) local_mem[i] = ary[offset + i];
                 barrier(CLK_LOCAL_MEM_FENCE);
                 data_t sum = (data_t)(0);
                 for(int i=0; i<5; i++) sum += local_mem[i];
                 ary[get_global_id(0)] = sum;
                 """,
                 type_defs={'data_t': data_t},
                 global_size=ary.shape,
                 local_size=(5,))
    local_mem = LocalArray(dtype=data_t, shape=5)
    knl.compile(emulate=emulate)(local_mem=local_mem)
    return ary.get()
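# --- Illustrative check (not part of the original source) ---
# Each work group of size 5 loads its five ones into local memory and writes
# the group sum back, so every element becomes 5 (data_t comes from the
# enclosing test scope, an assumption here):
assert np.all(run(emulate=True) == 5)
assert np.all(run(emulate=False) == 5)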
def __init__(self, in_buffer: Array, region_in: TypeSliceFormatCopyArrayRegion = None,
             out_buffer: Array = None, region_out: TypeSliceFormatCopyArrayRegion = None):
    """
    :param in_buffer: source array
    :param region_in: e.g. region_in=((0,0,1,2),(1,1,3,2)) selects the region of array a
        like numpy would do with a[0:1:2, 1:3:2], where 2 is the step width. The first
        element of each tuple selects the axis.
    :param out_buffer: target buffer where data from in_buffer is copied (optional)
    :param region_out: region of out_buffer memory where in_buffer data is copied (optional)
    """
    _region_in_original = copy(region_in)    # for debug purposes
    _region_out_original = copy(region_out)  # for debug purposes
    if region_in is not None:
        region_in = self._deal_with_incomplete_regions(region_in, in_buffer)
    if out_buffer is not None and region_out is not None:
        region_out = self._deal_with_incomplete_regions(region_out, out_buffer)
    if region_in is not None:
        region_in = self._deal_with_none_in_stop(region_in, in_buffer)
    if region_out is not None and out_buffer is not None:
        region_out = self._deal_with_none_in_stop(region_out, out_buffer)
    if region_in is not None:
        region_in = self._deal_with_negative_region(region_in, in_buffer)
    if region_out is not None and out_buffer is not None:
        region_out = self._deal_with_negative_region(region_out, out_buffer)
    self.in_buffer = in_buffer
    if region_in is None:
        self.region_in = [(0, self.in_buffer.shape[i_axis], 1)
                          for i_axis in range(self.in_buffer.ndim)]
    else:
        self.region_in = region_in
    if out_buffer is None and region_out is None:
        shape = [ax[1] - ax[0] for ax in self.region_in]
        self.out_buffer = empty(tuple(shape), dtype=self.in_buffer.dtype)
        self.region_out = [(0, i, 1) for i in shape]
    elif out_buffer is not None and region_out is not None:
        self.out_buffer = out_buffer
        self.region_out = region_out
    else:
        raise ValueError('Case of input argument combination not supported')
    if self.in_buffer.dtype != self.out_buffer.dtype:
        raise ValueError('in and out buffer must be of same type')
    self.shape_region_out = tuple(ax[1] - ax[0] for ax in self.region_out)
    self.copy_array_region = Kernel(
        name='copy_array_region',
        args={'in_buffer': Global(self.in_buffer, read_only=True),
              'out_buffer': Global(self.out_buffer)},
        body=["""
              int addr_in = ${command_addr_in};
              int addr_out = ${command_addr_out};
              out_buffer[addr_out] = in_buffer[addr_in];
              """],
        replacements={'command_addr_in': self._command_for_addr_in_computation(),
                      'command_addr_out': self._command_for_addr_out_computation()},
        global_size=self.shape_region_out).compile()
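# --- Usage sketch (illustrative, not part of the original source) ---
# Copying a sub-region with the (axis, start, stop, step) tuples documented
# above should match numpy basic slicing:
import numpy as np

a = to_device(np.arange(16).reshape(4, 4).astype(Types.int))
copier = CopyArrayRegion(a, region_in=((0, 1, 3, 1), (1, 0, 2, 1)))  # ~ a[1:3, 0:2]
copier.copy_array_region()
assert np.all(copier.out_buffer.get() == a.get()[1:3, 0:2])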
def cl_set(array: Array, region: TypeSliceFormatCopyArrayRegion, value):
    """
    Example usage:

    Set a slice of an array to a scalar value:
        cl_set(ary, Slice[:, 2:3], 1)

    Set a slice of an array with a numpy array shaped like the slice:
        cl_set(ary, Slice[1:2, 2:4], np.array([[3, 4]]))

    :param array: target cl array
    :param region: slice of the target array to be set
    :param value: scalar or numpy array whose shape matches the region
    """
    # todo: test if array is c contiguous
    region_arg = region
    region = CopyArrayRegion._deal_with_incomplete_regions(region_arg, array)
    region = CopyArrayRegion._deal_with_none_in_stop(region, array)
    region = CopyArrayRegion._deal_with_negative_region(region, array)
    # test if requested region is within the array bounds
    for axis, _slice in enumerate(region):
        step_width = _slice[2]
        if abs(_slice[0] * step_width) > array.shape[axis] or abs(_slice[1] * step_width) > array.shape[axis]:
            raise ValueError('Slicing out of array bounds')
    if any([(part[0] - part[1]) == 0 for part in region]):  # check that there is no empty slice
        return
    global_size = np.prod([part[1] - part[0] for part in region])
    target_shape = to_device(np.array(array.shape).astype(Types.int))
    offset_target = to_device(np.array([part[0] for part in region]).astype(Types.int))
    source_shape = to_device(np.array([part[1] - part[0] for part in region]).astype(Types.int))
    source_n_dims = len(source_shape)
    if isinstance(value, np.ndarray):
        source = to_device(value.astype(array.dtype))
        arg_source = Global(source)
        code_source = 'source[get_global_id(0)]'
    else:
        arg_source = Scalar(array.dtype)
        source = value
        code_source = 'source'
    knl = Kernel('set_cl_array',
                 {'target': Global(array),
                  'target_shape': Global(target_shape),
                  'offset_target': Global(offset_target),
                  'source': arg_source,
                  'source_shape': Global(source_shape),
                  'source_n_dims': Scalar(Types.int)},
                 """
                 // id_source = get_global_id(0) points to the element of array source which
                 // replaces the element with id_target in array target.
                 // We compute id_target from id_source, assuming c-contiguous addressing:
                 // id_source = id0*s1*s2*s3 + id1*s2*s3 + id2*s3 + id3   (s: shape of source array)
                 // First, compute the individual ids of the source array from id_source:
                 // id3 = int(gid % s3),  temp = (gid-id3)/s3
                 // id2 = int(temp % s2), temp = (temp-id2)/s2
                 // id1 = int(temp % s1), temp = (temp-id1)/s1
                 // id0 = int(temp % s0), temp = (temp-id0)/s0
                 // Finally, determine the id of the target array and copy the element there:
                 // id_target = (id0+offset0t)*s1t*s2t... (sxt: shape of target array along axis x)
                 int id_target = 0; // iteratively computed from global id, slice dimensions and ary dimensions
                 int temp = get_global_id(0);
                 int prod_source_id_multiplier = 1;
                 int prod_target_id_multiplier = 1;
                 for(int i=source_n_dims-1; i>=0; i--){ // i = i_axis_source
                     int id_source = temp % source_shape[i];
                     temp = (int)((temp-id_source)/source_shape[i]);
                     prod_source_id_multiplier *= source_shape[i];
                     id_target += (offset_target[i]+id_source)*prod_target_id_multiplier;
                     prod_target_id_multiplier *= target_shape[i];
                 }
                 target[id_target] = ${source};
                 """,
                 replacements={'addr': Helpers.command_compute_address(array.ndim),
                               'source': code_source},
                 global_size=(global_size,)).compile(array.context, emulate=False)
    knl(source=source, source_n_dims=source_n_dims)
        sr[i] = v[r].s0;
        si[i] = v[r].s1;
    }
    barrier(CLK_LOCAL_MEM_FENCE);
    for(int r=0; r<R; r++){
        int i = (idxS + r*incS)*STRIDE;
        v[r] = (data_t)(sr[i], si[i]);
    }
    """, defines={'STRIDE': 1})
func_v_from_data0 = Function(
    'v_from_data0',
    {'v': Private(self.data_t),
     'data0': Local(self.data_t) if self.b_local_memory else Global(self.data_t, read_only=True),
     'idxS': Scalar(Types.int),
     'r': Scalar(Types.int),
     'direction': Scalar(Types.int),
     'iteration': Scalar(Types.int)},
    """
    if(iteration == 0){
        if(idxS < N_INPUT){
            v[r] = data0[(int)(idxS)];
        }else{
            v[r] = (data_t)(0.0, 0.0);
        }