コード例 #1
0
def test_pointer_increment(dtype):
    # todo use https://numpy.org/doc/stable/reference/generated/numpy.ndarray.ctypes.html
    data = np.array([0]).astype(dtype)
    func = Function('func', {'data': Private(data.dtype)},
                    """
        return data[0];
    """,
                    returns=data.dtype)
    # Assigning to array pointers does not work in c (e.g. b=a does not compile):
    # https://stackoverflow.com/questions/744536/c-array-declaration-and-assignment
    # Below this can be solved by creating pointers p1 and p2 where their address can be exchange by assignment
    knl = Kernel('knl_pointer_arithmetics', {'data': data},
                 """ private dtype a[5] = {0}; private dtype b[5] = {0};
                     dtype *p1 = a; dtype *p2 = b;
                     a[3] = (dtype)(5);                     
                     p2 = a;
                     data[0] = func(p2+3); """,
                 global_size=data.shape,
                 type_defs={'dtype': dtype})
    prog = Program(functions=[func], kernels=[knl])
    knl_cl = prog.compile().knl_pointer_arithmetics
    knl_cl()
    res_cl = knl_cl.data.get()
    knl_py = prog.compile(emulate=True).knl_pointer_arithmetics
    knl_py()
    res_py = knl_cl.data.get()
    assert np.all(res_cl[0] == res_py[0])
コード例 #2
0
 def _get_kernel(stage_builder: FftStageBuilder,
                 iteration,
                 radix,
                 data_in,
                 data_out,
                 emulate=False):
     if iteration == 0:  # data0 is in_buffer, whose length might not be power of two
         offset_data_in = 'get_global_id(0)*N_INPUT'
     else:
         offset_data_in = 'get_global_id(0)*N'
     stage_builder.radix = radix
     funcs, type_defs, d, shared_mem_fft_stage = stage_builder.get_funcs_for_all_stages(
         [radix])
     funcs.append(stage_builder.get_fft_stage_func())
     d['M'] = (M :=
               data_in.shape[0])  # number of FFTs to perform simultaneously
     knl_gpu_fft = Kernel(
         'gpu_fft',
         {
             'Ns': Scalar(Types.int),
             # In each iteration, the algorithm can be thought of combining the radix R FFTs on
             # subsequences of length Ns into the FFT of a new sequence of length RNs by
             # performing an FFT of length R on the corresponding elements of the subsequences.
             'data_in': Global(data_in),
             'data_out': Global(data_out),
             'direction': Scalar(Types.int),
             'iteration': Scalar(Types.int)
         },
         """
              ${shared_mem_fft_stage}
              int t = get_local_id(2); // thread index 
              int b = get_global_id(1); // thread block index
              fft_stage(t, b, Ns, 
                        data_in  +${offset_data_in},
                        data_out +get_global_id(0)*N,
                        shared,direction, iteration);""",
         replacements={
             'offset_data_in': offset_data_in,
             'shared_mem_fft_stage': shared_mem_fft_stage
         },
         global_size=(d['M'], max(1,
                                  int(d['N'] / (d['R'] * d['T']))), d['T']),
         local_size=(1, 1, d['T']))
     program = Program(
         funcs, [knl_gpu_fft], defines=d, type_defs=type_defs).compile(
             context=data_in.context,
             emulate=emulate,
             file=Program.get_default_dir_pycl_kernels().joinpath(
                 f'fft_{iteration}_{stage_builder.radix}'))
     return program.gpu_fft
コード例 #3
0
 def __init__(self, in_buffer: Array, out_buffer_dtype: np.dtype):
     self.in_buffer = in_buffer
     self.out_buffer = empty(in_buffer.shape, out_buffer_dtype, in_buffer.queue)
     knl = Kernel(name='type',
                  args={'in_buffer': Global(self.in_buffer, read_only=True),
                        'out_buffer': Global(self.out_buffer)},
                  body=["""
                                int addr_in = ${command_addr_in};
                                int addr_out = ${command_addr_out};
                                out_buffer[addr_out]=convert_${buff_out_t}(in_buffer[addr_in]);
                                """],
                  replacements={'command_addr_in': Helpers.command_compute_address(self.in_buffer.ndim),
                                'command_addr_out': Helpers.command_compute_address(self.out_buffer.ndim),
                                'buff_out_t': c_name_from_dtype(self.out_buffer.dtype)},
                  global_size=self.in_buffer.shape)
     self.program = Program(kernels=[knl]).compile(context=in_buffer.context, emulate=False)
コード例 #4
0
 def _get_cl_program(self) -> Program:
     knl = Kernel(name='sum_along_axis',
                  args={
                      'in_buffer': Global(self.in_buffer),
                      'axis': Scalar(Types.int(self.axis)),
                      'out_buffer': Global(self.out_buffer)
                  },
                  body=[
                      """
              buff_t sum = (buff_t) 0;
              for(int i=0; i<${size_input_axis}; i++){// i == glob_id_axis
                 sum+=in_buffer[${addr_in}];
              }                 
              out_buffer[${addr}] = sum;
              """
                  ],
                  replacements={
                      'size_input_axis':
                      self.in_buffer.shape[self.axis],
                      'addr':
                      Helpers.command_compute_address(self.out_buffer.ndim),
                      'addr_in':
                      self._command_compute_address_in()
                  },
                  global_size=self.out_buffer.shape)
     type_defs = {'buff_t': self.in_buffer.dtype}
     return Program(type_defs=type_defs, kernels=[knl])
コード例 #5
0
def test_memoize_kernel():
    # thread = Thread(profile=True)
    ary_a = np.ones(int(1e3))
    ary_b = np.zeros(ary_a.shape)
    ary_a_buffer = to_device(ary_a)
    ary_b_buffer = to_device(ary_b)
    n_recompilations = 100
    for i in range(n_recompilations + 1):
        kernels = []
        for j in range(10):
            some_knl = Kernel(
                f'some_knl_{j}', {
                    'ary_a': Global(ary_a_buffer),
                    'ary_b': Global(ary_b_buffer)
                }, """
                     ary_b[get_global_id(0)] = ary_a[get_global_id(0)];
                     """)
            kernels.append(some_knl)
        Program(kernels=kernels).compile()
        some_knl(global_size=ary_a.shape)
        if i == 1:
            t = time.time()
    time_per_recompile = (time.time() - t) / n_recompilations
    # thread.queue.get_profiler().show_histogram_cumulative_kernel_times()
    print(time_per_recompile)
    assert time_per_recompile < 0.001  # less than 1 ms overhead per recompilation achieved through caching
コード例 #6
0
 def __init__(self, mode='a', b_create_kernel_file: bool = True):
     self.mode = mode
     self.buff = zeros((10, ), Types.short)
     self.data_t = self.buff.dtype
     func = Function('plus_one', {
         'buffer': Global(self.buff.dtype),
         'idx': Scalar(Types.int)
     }, [
         """
                               return buffer[idx]+${some_integer};
                               """
     ], {'some_integer': 5},
                     returns=Types.float)
     knl = Kernel('some_operation', {
         'buff': Global(self.buff),
         'number': Scalar(Types.short(3.0))
     }, [
         """
                             data_t factor = convert_${data_t}(1.3);
                             buff[get_global_id(0)] = plus_one(buff, get_global_id(0)) + SOME_CONSTANT*factor;
                             """
     ],
                  replacements={'data_t': c_name_from_dtype(self.data_t)},
                  global_size=self.buff.shape)
     defines = {'SOME_CONSTANT': 6}
     type_defs = {'data_t': self.data_t}
     self.program = Program(defines=defines,
                            type_defs=type_defs,
                            functions=[func],
                            kernels=[knl]).compile()
     self.knl = knl
コード例 #7
0
def test_bit_shift(
):  # todo use https://numpy.org/doc/stable/reference/generated/numpy.ndarray.ctypes.html
    data = np.array([0, 0, 0, 0]).astype(Types.char)
    knl = Kernel('knl_bit_packing', {'data': data},
                 """
        uchar a = 5;
        uchar b = 3;
        uchar c = (a << 4) | b;
        data[0] = (c & 0xF0) >> 4;
        data[1] = c & (0x0F);
    """,
                 global_size=data.shape)
    prog = Program(kernels=[knl])
    knl_cl = prog.compile().knl_bit_packing
    knl_py = prog.compile(emulate=True).knl_bit_packing
    knl_cl()
    get_current_queue().finish()
    res_cl = knl_cl.data.get()
    knl_py()
    res_py = knl_cl.data.get()
    assert np.all(res_cl == res_py)
コード例 #8
0
def test_macro_with_arguments():
    defines = {
        'FUNC(a,b,c)': '{ int tmp = c(a-b); a += b + tmp; }'
    }  # this is a macro with arguments
    ary = zeros((2, ), Types.int)
    func_add_two = Function('add_two', {'a': Scalar(Types.int)},
                            'return a + 2;',
                            returns=Types.int)
    knl = Kernel('knl_macro_func', {'ary': Global(ary)},
                 """
               int a = 1;
               int b = 2;
               FUNC(a, b, add_two)
               ary[get_global_id(0)] = a;
               """,
                 defines=defines,
                 global_size=ary.shape)
    Program([func_add_two], [knl]).compile().knl_macro_func()
    assert np.allclose(ary.get(), np.array([4, 4]).astype(ary.dtype))
コード例 #9
0
def test_add_functions_inside_function_or_kernel_definition():
    ary_a = to_device(np.ones(100))
    fnc_add3 = Function('add_three', {'a': Scalar(Types.int)},
                        'return a + 3;',
                        returns=Types.int)
    fnc_add5 = Function('add_five', {'a': Scalar(Types.int)},
                        """
             return add_three(a)+2;
             """,
                        functions=[fnc_add3],
                        returns=Types.int)
    some_knl = Kernel('some_knl', {'ary_a': Global(ary_a)},
                      """
             ary_a[get_global_id(0)] = add_five(ary_a[get_global_id(0)]);
             """,
                      global_size=ary_a.shape,
                      functions=[fnc_add5])
    functions = [
    ]  # funcitons defined here have higher proiority in case of name conflicts
    Program(functions=functions, kernels=[some_knl]).compile()
    some_knl()
    assert ary_a.get()[0] == 6
コード例 #10
0
                           """
               return nested_func(ary, shared);
               """,
                           returns=Types.int)

    ary = to_device((ary_np := np.array([1, 2]).astype(Types.int)))
    use_existing_file_for_emulation(False)
    knl = Kernel('some_knl', {
        'ary': Global(ary),
    },
                 """
                __local int shared[2];
                ary[get_global_id(0)] = parent(ary, shared);
                   """,
                 global_size=ary.shape)
    prog = Program([func_nested, func_parent], [knl])
    prog_py = prog.compile(emulate=True)
    prog_cl = prog.compile(emulate=False)
    prog_py.some_knl()

    ary_py = ary.get()
    ary.set(ary_np)
    prog_cl.some_knl()
    ary_cl = ary.get()
    get_current_queue().finish()
    assert np.allclose(ary_py, np.array([2, 1]))
    assert np.allclose(ary_py, ary_cl)


def test_macro_with_arguments():
    defines = {