Beispiel #1
0
 def __init__(self, mode='a', b_create_kernel_file: bool = True):
     self.mode = mode
     self.buff = zeros((10, ), Types.short)
     self.data_t = self.buff.dtype
     func = Function('plus_one', {
         'buffer': Global(self.buff.dtype),
         'idx': Scalar(Types.int)
     }, [
         """
                               return buffer[idx]+${some_integer};
                               """
     ], {'some_integer': 5},
                     returns=Types.float)
     knl = Kernel('some_operation', {
         'buff': Global(self.buff),
         'number': Scalar(Types.short(3.0))
     }, [
         """
                             data_t factor = convert_${data_t}(1.3);
                             buff[get_global_id(0)] = plus_one(buff, get_global_id(0)) + SOME_CONSTANT*factor;
                             """
     ],
                  replacements={'data_t': c_name_from_dtype(self.data_t)},
                  global_size=self.buff.shape)
     defines = {'SOME_CONSTANT': 6}
     type_defs = {'data_t': self.data_t}
     self.program = Program(defines=defines,
                            type_defs=type_defs,
                            functions=[func],
                            kernels=[knl]).compile()
     self.knl = knl
Beispiel #2
0
    def get_fft_stage_func(self):
        replacements = {'fft_radix_R': f'fft_radix_{self.radix}'}
        func_fft_iteration = Function(
            'fft_stage',
            {
                # thread index: T radix R fft ops are done in parallel
                't':
                Scalar(Types.int),
                # thread block index: identifies block of N/(R*T) blocks.
                'b':
                Scalar(Types.int),
                'Ns':
                Scalar(Types.int),
                'data0':
                Local(self.data_t) if self.b_local_memory else Global(
                    self.data_t, read_only=True),
                'data1':
                Local(self.data_t)
                if self.b_local_memory else Global(self.data_t),
                'shared':
                Local(self.real_t),
                'direction':
                Scalar(Types.int),
                'iteration':
                Scalar(Types.int)
            },
            """
        int j = b*T + t; // as proposed in [1, p.3, text section A]
        private data_t v[R];        
        real_t angle = -2*PI*(j % Ns)/(Ns*R);
        for(int r=0; r<R; r++){
            int idxS = j + r* N/R; // idxS=idxSource
            v_from_data0(v, data0, idxS, r, direction,iteration);
            real_t cos_angle = cos(r*angle);
            real_t sin_angle = sin(r*angle);
            v[r] = (data_t)(v[r].s0*cos_angle-v[r].s1 *sin_angle,
                            v[r].s0*sin_angle+v[r].s1 *cos_angle);
        }
        ${fft_radix_R}(v);

        if(Ns>=CW){ // todo: remove condition and store as separate kernel
            // changed line 27 of [1, Figure 2] to work with global dimensions of this class:
            int offset = expand(j, Ns, R); 
            for(int r=0; r<R; r++){
                int idxD = offset + r*Ns; // idxD=idxDestination
                data_1_from_v(v, data1, idxD, r, direction, iteration);
            }
        }else{ // According to [1, p.4], such that global memory writes are coalesced
            // !! mistake in [1] where *Ns is missing: idxD = (int)(t/Ns)*R + (t%Ns); !!
            int idxD = (int)(t/Ns)*Ns*R + (t%Ns);
            exchange( v, idxD, Ns, t, T, shared);
            int offset = b*R*T+ t; 
            for( int r=0; r<R; r++ ){
                idxD = offset + r*T;
                data_1_from_v(v, data1, idxD, r, direction, iteration);
            }  
        }
                   """,
            replacements=replacements)
        return func_fft_iteration
Beispiel #3
0
 def _get_kernel(stage_builder: FftStageBuilder,
                 iteration,
                 radix,
                 data_in,
                 data_out,
                 emulate=False):
     if iteration == 0:  # data0 is in_buffer, whose length might not be power of two
         offset_data_in = 'get_global_id(0)*N_INPUT'
     else:
         offset_data_in = 'get_global_id(0)*N'
     stage_builder.radix = radix
     funcs, type_defs, d, shared_mem_fft_stage = stage_builder.get_funcs_for_all_stages(
         [radix])
     funcs.append(stage_builder.get_fft_stage_func())
     d['M'] = (M :=
               data_in.shape[0])  # number of FFTs to perform simultaneously
     knl_gpu_fft = Kernel(
         'gpu_fft',
         {
             'Ns': Scalar(Types.int),
             # In each iteration, the algorithm can be thought of combining the radix R FFTs on
             # subsequences of length Ns into the FFT of a new sequence of length RNs by
             # performing an FFT of length R on the corresponding elements of the subsequences.
             'data_in': Global(data_in),
             'data_out': Global(data_out),
             'direction': Scalar(Types.int),
             'iteration': Scalar(Types.int)
         },
         """
              ${shared_mem_fft_stage}
              int t = get_local_id(2); // thread index 
              int b = get_global_id(1); // thread block index
              fft_stage(t, b, Ns, 
                        data_in  +${offset_data_in},
                        data_out +get_global_id(0)*N,
                        shared,direction, iteration);""",
         replacements={
             'offset_data_in': offset_data_in,
             'shared_mem_fft_stage': shared_mem_fft_stage
         },
         global_size=(d['M'], max(1,
                                  int(d['N'] / (d['R'] * d['T']))), d['T']),
         local_size=(1, 1, d['T']))
     program = Program(
         funcs, [knl_gpu_fft], defines=d, type_defs=type_defs).compile(
             context=data_in.context,
             emulate=emulate,
             file=Program.get_default_dir_pycl_kernels().joinpath(
                 f'fft_{iteration}_{stage_builder.radix}'))
     return program.gpu_fft
Beispiel #4
0
 def _get_cl_program(self) -> Program:
     knl = Kernel(name='sum_along_axis',
                  args={
                      'in_buffer': Global(self.in_buffer),
                      'axis': Scalar(Types.int(self.axis)),
                      'out_buffer': Global(self.out_buffer)
                  },
                  body=[
                      """
              buff_t sum = (buff_t) 0;
              for(int i=0; i<${size_input_axis}; i++){// i == glob_id_axis
                 sum+=in_buffer[${addr_in}];
              }                 
              out_buffer[${addr}] = sum;
              """
                  ],
                  replacements={
                      'size_input_axis':
                      self.in_buffer.shape[self.axis],
                      'addr':
                      Helpers.command_compute_address(self.out_buffer.ndim),
                      'addr_in':
                      self._command_compute_address_in()
                  },
                  global_size=self.out_buffer.shape)
     type_defs = {'buff_t': self.in_buffer.dtype}
     return Program(type_defs=type_defs, kernels=[knl])
Beispiel #5
0
 def __init__(self, b_create_kernel_file: bool = True):
     self.buff = zeros((10, ), Types.short)
     self.knl = Kernel('some_operation', {
         'buff': Global(self.buff),
         'number': Scalar(Types.short(1))
     }, [
         """
                             buff[get_global_id(0)] = number;
                             """
     ],
                       global_size=self.buff.shape).compile()
def test_two_input_integer_functions(name, dtype):
    a_cl = to_device(np.ones((10, ), dtype))
    a_emulation = to_device(np.ones((10, ), dtype))
    knl = Kernel(f'knl_{name}', {
        'a': Global(a_cl),
        'num': Scalar(dtype(0))
    },
                 f'a[get_global_id(0)]={name}(a[get_global_id(0)], num);',
                 global_size=a_cl.shape)
    knl.compile()(a=a_cl)
    knl.compile(emulate=True)(a=a_emulation)
    assert np.all(a_cl.get() == a_emulation.get())
Beispiel #7
0
def test_add_functions_inside_function_or_kernel_definition():
    ary_a = to_device(np.ones(100))
    fnc_add3 = Function('add_three', {'a': Scalar(Types.int)},
                        'return a + 3;',
                        returns=Types.int)
    fnc_add5 = Function('add_five', {'a': Scalar(Types.int)},
                        """
             return add_three(a)+2;
             """,
                        functions=[fnc_add3],
                        returns=Types.int)
    some_knl = Kernel('some_knl', {'ary_a': Global(ary_a)},
                      """
             ary_a[get_global_id(0)] = add_five(ary_a[get_global_id(0)]);
             """,
                      global_size=ary_a.shape,
                      functions=[fnc_add5])
    functions = [
    ]  # funcitons defined here have higher proiority in case of name conflicts
    Program(functions=functions, kernels=[some_knl]).compile()
    some_knl()
    assert ary_a.get()[0] == 6
Beispiel #8
0
def get_funcs_radix8(data_t):
    funcs_mulpxpy_8 = [
        Function(k, {'a': Scalar(data_t)}, v, returns=data_t)
        for k, v in mul_pxpy_dict8.items()
    ]
    func_fft_radix_8 = Function(
        'fft_radix_8', {'v': Private(data_t)}, """
            // 4x in-place DFT2
            data_t u0 = v[0];
            data_t u1 = v[1];
            data_t u2 = v[2];
            data_t u3 = v[3];
            data_t u4 = v[4];
            data_t u5 = v[5];
            data_t u6 = v[6];
            data_t u7 = v[7];

            data_t v0 = u0 + u4;
            data_t v4 = mul_p0q4(u0 - u4);
            data_t v1 = u1 + u5;
            data_t v5 = mul_p1q4(u1 - u5);
            data_t v2 = u2 + u6;
            data_t v6 = mul_p2q4(u2 - u6);
            data_t v3 = u3 + u7;
            data_t v7 = mul_p3q4(u3 - u7);

            // 4x in-place DFT2 and twiddle
            u0 = v0 + v2;
            u2 = mul_p0q2(v0 - v2);
            u1 = v1 + v3;
            u3 = mul_p1q2(v1 - v3);
            u4 = v4 + v6;
            u6 = mul_p0q2(v4 - v6);
            u5 = v5 + v7;
            u7 = mul_p1q2(v5 - v7);

            // 4x DFT2 and store (reverse binary permutation)
            v[0]   = u0 + u1;
            v[1]   = u4 + u5;
            v[2] = u2 + u3;
            v[3] = u6 + u7;
            v[4] = u0 - u1;
            v[5] = u4 - u5;
            v[6] = u2 - u3;
            v[7] = u6 - u7;
                                """)
    return funcs_mulpxpy_8, func_fft_radix_8
def test_macro_with_arguments():
    defines = {
        'FUNC(a,b,c)': '{ int tmp = c(a-b); a += b + tmp; }'
    }  # this is a macro with arguments
    ary = zeros((2, ), Types.int)
    func_add_two = Function('add_two', {'a': Scalar(Types.int)},
                            'return a + 2;',
                            returns=Types.int)
    knl = Kernel('knl_macro_func', {'ary': Global(ary)},
                 """
               int a = 1;
               int b = 2;
               FUNC(a, b, add_two)
               ary[get_global_id(0)] = a;
               """,
                 defines=defines,
                 global_size=ary.shape)
    Program([func_add_two], [knl]).compile().knl_macro_func()
    assert np.allclose(ary.get(), np.array([4, 4]).astype(ary.dtype))
Beispiel #10
0
def get_funcs_radix4(data_t):
    # mul_pxqy(a) returns a*exp(-j * PI * p / q) where p=x and q=y
    funcs_mulpxpy_4 = [
        Function(k, {'a': Scalar(data_t)}, v, returns=data_t)
        for k, v in mul_pxpy_dict4.items()
    ]
    func_fft_radix_4 = Function(
        'fft_radix_4', {'v': Private(data_t)}, """
            // 2x DFT2 and twiddle
            data_t v0 = v[0] + v[2];
            data_t v1 = v[0] - v[2];
            data_t v2 = v[1] + v[3];
            data_t v3 = mul_p1q2(v[1] - v[3]); // twiddle

            // 2x DFT2 and store
            v[0] = v0 + v2;
            v[1] = v1 + v3;
            v[2] = v0 - v2;
            v[3] = v1 - v3;
                                """)
    return funcs_mulpxpy_4, func_fft_radix_4
Beispiel #11
0
def test_conversion_knl_fnc_args_with_no_pointer_format():
    a_np = np.array([0.1, 0.2], dtype=Types.float)
    b_cl = zeros(shape=(2, ), dtype=Types.float)
    fnc = Function(
        'copy_fnc', {
            'a': a_np,
            'b': b_cl,
            'idx': Scalar(Types.int)
        }, """
                   b[idx] = a[idx];
                   """)
    knl = Kernel('some_knl', {
        'a': a_np,
        'b': b_cl
    },
                 """
                 copy_fnc(a, b, get_global_id(0));
                 """,
                 functions=[fnc],
                 global_size=b_cl.shape)
    knl.compile()
    knl()
    assert np.all(a_np == b_cl.get())
def cl_set(array: Array, region: TypeSliceFormatCopyArrayRegion, value):
    """
    example usage:
    set slice of array with scalar value
    val = 1
    cl_set(ary, Slice[:,2:3], val)

    set slice of array with equally shaped numpy array like the slice
    some_np_array = np.array([[3,4])
    cl_set(ary, Slice[1:2,2:3], some_np_array)


    :param array:
    :param region:
    :param value:
    :return:
    """
    # todo test if array c contiguous
    region_arg = region
    # if slice is contiguous block of memory set it as
    # _buffer_np = np.zeros_like(add_symbols_memory_initialization.out_buffer)
    # _buffer_np[:, memory: -memory] = mapper.alphabet[0]
    # add_symbols_memory_initialization.out_buffer.set(_buffer_np)
    region = CopyArrayRegion._deal_with_incomplete_regions(region_arg, array)
    region = CopyArrayRegion._deal_with_none_in_stop(region, array)
    region = CopyArrayRegion._deal_with_negative_region(region, array)

    # test if requested region is
    for axis, _slice in enumerate(region):
        step_width = _slice[2]
        if abs(_slice[0] * step_width) > array.shape[axis] or abs(_slice[1] * step_width) > array.shape[axis]:
            raise ValueError('Slicing out of array bounds')
    if any([(part[0] - part[1]) == 0 for part in region]):  # check that there is no empty slice
        return
    global_size = np.product([part[1] - part[0] for part in region])
    target_shape = to_device(np.array(array.shape).astype(Types.int))
    offset_target = to_device(np.array([part[0] for part in region]).astype(Types.int))
    source_shape = to_device(np.array([part[1] - part[0] for part in region]).astype(Types.int))
    source_n_dims = len(source_shape)

    if isinstance(value, np.ndarray):
        source = to_device(value.astype(array.dtype))
        arg_source = Global(source)
        code_source = 'source[get_global_id(0)]'
    else:
        arg_source = Scalar(array.dtype)
        source = value
        code_source = 'source'

    knl = Kernel('set_cl_array',
                 {'target': Global(array),
                  'target_shape': Global(target_shape),
                  'offset_target': Global(offset_target),
                  'source': arg_source,
                  'source_shape': Global(source_shape),
                  'source_n_dims': Scalar(Types.int)},
                 """
   // id_source = get_global_id(0)
   // id_source points to element of array source which replaces element with id_target in array target.
   // we need to compute id_target from id_source:
   // we assume c-contiguous addressing like:
   // id_source = id0*s1*s2*s3+id1*s2*s3+id2*s3+id3 (here s refers shape of source array)
   // At first we need to compute individual ids of source array from id_source:
   // id3 = int(gid % s3), temp = (gid-id3)/s3
   // id2 = int(temp % s2), temp = (temp-id2)/s2
   // id1 = int(temp % s1), temp = (temp-id1)/s1
   // id0 = int(temp % s0), temp = (temp-id0)/s1
   // Finally, we can determine the id of the target array and copy element to corresponding position:
   // id_target = (id0*offset0t)*s1t*s2t ... (sxt: shape of target array along dim x)
   int id_target = 0; // to be iteratively computed from global id, slice dimensions and ary dimensions

   int temp = get_global_id(0);
   int prod_source_id_multiplier = 1;
   int prod_target_id_multiplier = 1;

   for(int i=source_n_dims-1; i>=0; i--){ // i=i_axis_source
    int id_source = temp % source_shape[i];
    temp = (int)((temp-id_source)/source_shape[i]);
    prod_source_id_multiplier *= source_shape[i];
    id_target += (offset_target[i]+id_source)*prod_target_id_multiplier;
    prod_target_id_multiplier *= target_shape[i];
   }
   target[id_target] = ${source};
                   """,
                 replacements={'addr': Helpers.command_compute_address(array.ndim),
                               'source': code_source},
                 global_size=(global_size,)
                 ).compile(array.context, emulate=False)
    knl(source=source, source_n_dims=source_n_dims)
Beispiel #13
0
def get_funcs_radix16(data_t):
    func_mul1 = Function(
        'mul_1', {
            'a': Scalar(data_t),
            'b': Scalar(data_t)
        },
        'data_t x; x.even = MUL_RE(a,b); x.odd = MUL_IM(a,b); return x;',
        returns=data_t,
        defines={
            'MUL_RE(a,b)': '(a.even*b.even - a.odd*b.odd)',
            'MUL_IM(a,b)': '(a.even*b.odd + a.odd*b.even)'
        })

    funcs_mulpxpy_16 = [
        Function(k, {'a': Scalar(data_t)}, v, returns=data_t)
        for k, v in mul_pxpy_dict16.items()
    ]
    funcs_mulpxpy_16[0].defines = {
        'COS_8': np.cos(np.pi / 8),
        'SIN_8': np.sin(np.pi / 8)
    }
    func_fft_radix_16 = Function(
        'fft_radix_16', {'v': Private(data_t)},
        """
            data_t u[16];
            for (int m=0;m<16;m++) u[m] = v[m];
            // 8x in-place DFT2 and twiddle (1)
            DFT2_TWIDDLE(u[0],u[8],mul_p0q8);
            DFT2_TWIDDLE(u[1],u[9],mul_p1q8);
            DFT2_TWIDDLE(u[2],u[10],mul_p2q8);
            DFT2_TWIDDLE(u[3],u[11],mul_p3q8);
            DFT2_TWIDDLE(u[4],u[12],mul_p4q8);
            DFT2_TWIDDLE(u[5],u[13],mul_p5q8);
            DFT2_TWIDDLE(u[6],u[14],mul_p6q8);
            DFT2_TWIDDLE(u[7],u[15],mul_p7q8);

            // 8x in-place DFT2 and twiddle (2)
            DFT2_TWIDDLE(u[0],u[4],mul_p0q4);
            DFT2_TWIDDLE(u[1],u[5],mul_p1q4);
            DFT2_TWIDDLE(u[2],u[6],mul_p2q4);
            DFT2_TWIDDLE(u[3],u[7],mul_p3q4);
            DFT2_TWIDDLE(u[8],u[12],mul_p0q4);
            DFT2_TWIDDLE(u[9],u[13],mul_p1q4);
            DFT2_TWIDDLE(u[10],u[14],mul_p2q4);
            DFT2_TWIDDLE(u[11],u[15],mul_p3q4);

            // 8x in-place DFT2 and twiddle (3)
            DFT2_TWIDDLE(u[0],u[2],mul_p0q2);
            DFT2_TWIDDLE(u[1],u[3],mul_p1q2);
            DFT2_TWIDDLE(u[4],u[6],mul_p0q2);
            DFT2_TWIDDLE(u[5],u[7],mul_p1q2);
            DFT2_TWIDDLE(u[8],u[10],mul_p0q2);
            DFT2_TWIDDLE(u[9],u[11],mul_p1q2);
            DFT2_TWIDDLE(u[12],u[14],mul_p0q2);
            DFT2_TWIDDLE(u[13],u[15],mul_p1q2);

            // 8x DFT2 and store (reverse binary permutation)
            v[0]  = u[0]  + u[1];
            v[1]  = u[8]  + u[9];
            v[2]  = u[4]  + u[5];
            v[3]  = u[12] + u[13];
            v[4]  = u[2]  + u[3];
            v[5]  = u[10] + u[11];
            v[6]  = u[6]  + u[7];
            v[7]  = u[14] + u[15];
            v[8]  = u[0]  - u[1];
            v[9]  = u[8]  - u[9];
            v[10] = u[4]  - u[5];
            v[11] = u[12] - u[13];
            v[12] = u[2]  - u[3];
            v[13] = u[10] - u[11];
            v[14] = u[6]  - u[7];
            v[15] = u[14] - u[15];
                                """,
        defines={
            'DFT2_TWIDDLE(a,b,t)': '{ data_t tmp = t(a-b); a += b; b = tmp; }'
        })
    funcs_helpers = [func_mul1] + funcs_mulpxpy_16
    return funcs_helpers, func_fft_radix_16
Beispiel #14
0
            # work group size:
            'T': (T := min(int(N / R), conf.global_mem_cacheline_size)),
            'CW': (CW := conf.global_mem_cacheline_size),  # Coalescing width,
            'N_INPUT': conf.
            size_data_in_first_iteration,  # in_buffer length which might not be power of two
            'ITERATION_MAX': self.
            iteration_max,  # in_buffer length which might not be power of two
        }

        funcs_radix, defines_radices = get_funcs_radixes(radixes, data_t)
        defines = {**defines, **defines_radices}

        # [1]: The expand() function can be thought of as inserting a dimension of length N2 after the first
        # dimension of length N1 in a linearized index.
        func_expand = Function('expand', {
            'idxL': Scalar(Types.int),
            'N1': Scalar(Types.int),
            'N2': Scalar(Types.int)
        },
                               """
                                 return (int)(idxL/N1)*N1*N2 + (idxL%N1);
                                 """,
                               returns=Types.int)
        # float2* v, int R, int idxD, int incD, int idxS, int incS
        func_exchange = Function('exchange', {
            'v': Private(data_t),
            'idxD': Scalar(Types.int),
            'incD': Scalar(Types.int),
            'idxS': Scalar(Types.int),
            'incS': Scalar(Types.int),
            'shared': Local(real_t),