Beispiel #1
0
    def _generate_kernel_call(self):
        """Assemble the kernel call site and store it as ``LIB_KERNEL_CALL``.

        The argument list consists of the names of the static kernel
        arguments (if any) followed by the ``kernel_arg`` of the per-dat
        generator found in ``self._components['PARTICLE_DAT_C']`` for every
        entry of ``self._dat_dict``.

        Side effects: each generator's ``kernel_create_j_arg`` is appended to
        the call module, while its ``kernel_create_i_arg`` and
        ``kernel_create_i_scatter`` are accumulated into the
        ``KERNEL_GATHER`` and ``KERNEL_SCATTER`` components respectively.
        """
        kernel_call = cgen.Module(
            [cgen.Comment('#### Kernel call arguments ####')])

        # Static arguments are passed first, by name.
        call_args = []
        if self._kernel.static_args is not None:
            call_args.extend(
                name for name, _ in self._kernel.static_args.items())

        # Only the symbol is needed here; the access descriptor is unused.
        for symbol, _ in self._dat_dict.items():
            gen = self._components['PARTICLE_DAT_C'][symbol]
            call_args.append(gen.kernel_arg)
            kernel_call.append(gen.kernel_create_j_arg)
            self._components['KERNEL_GATHER'] += gen.kernel_create_i_arg
            self._components['KERNEL_SCATTER'] += gen.kernel_create_i_scatter

        kernel_call.append(cgen.Comment('#### Kernel call ####'))
        kernel_call.append(
            cgen.Line('k_' + self._kernel.name + '(' + ','.join(call_args) +
                      ');'))

        self._components['LIB_KERNEL_CALL'] = kernel_call
Beispiel #2
0
        def _(iet):
            """Prepend the device initialization and selection code to ``iet``.

            The generated preamble optionally initializes the target language,
            then selects the device: if the device id differs from -1 that
            device is set; otherwise, when an ``MPICommObject`` is found among
            ``iet.parameters``, the device ``rank % ngpus`` is set.

            Returns the rebuilt IET and ``{'args': deviceid}``.
            """
            # TODO: we need to pick the rank from `comm_shm`, not `comm`,
            # so that we have nranks == ngpus (as long as the user has launched
            # the right number of MPI processes per node given the available
            # number of GPUs per node)
            objcomm = next((p for p in iet.parameters
                            if isinstance(p, MPICommObject)), None)

            devicetype = as_list(self.lang[self.platform])

            try:
                lang_init = [self.lang['init'](devicetype)]
            except TypeError:
                # Not all target languages need to be explicitly initialized
                lang_init = []

            deviceid = DeviceID()

            if objcomm is None:
                # No MPI: only honour an explicitly requested device
                select = Conditional(
                    CondNe(deviceid, -1),
                    self.lang['set-device']([deviceid] + devicetype))

                header = c.Comment('Begin of %s setup' % self.lang['name'])
                footer = c.Comment('End of %s setup' % self.lang['name'])
            else:
                rank = Symbol(name='rank')
                rank_decl = LocalExpression(DummyEq(rank, 0))
                rank_init = Call('MPI_Comm_rank', [objcomm, Byref(rank)])

                ngpus = Symbol(name='ngpus')
                ngpus_init = LocalExpression(
                    DummyEq(ngpus, self.lang['num-devices'](devicetype)))

                requested = self.lang['set-device']([deviceid] + devicetype)
                by_rank = self.lang['set-device']([rank % ngpus] + devicetype)

                # Fall back to a rank-derived device when none was requested
                select = Conditional(
                    CondNe(deviceid, -1),
                    requested,
                    List(body=[rank_decl, rank_init, ngpus_init, by_rank]),
                )

                header = c.Comment('Begin of %s+MPI setup' % self.lang['name'])
                footer = c.Comment('End of %s+MPI setup' % self.lang['name'])

            init = List(header=header,
                        body=lang_init + [select],
                        footer=(footer, c.Line()))

            return iet._rebuild(body=(init, ) + iet.body), {'args': deviceid}
Beispiel #3
0
    def _initialize(iet):
        comm = None

        for i in iet.parameters:
            if isinstance(i, MPICommObject):
                comm = i
                break

        if comm is not None:
            rank = Symbol(name='rank')
            rank_decl = LocalExpression(DummyEq(rank, 0))
            rank_init = Call('MPI_Comm_rank', [comm, Byref(rank)])

            ngpus = Symbol(name='ngpus')
            call = Function('omp_get_num_devices')()
            ngpus_init = LocalExpression(DummyEq(ngpus, call))

            set_device_num = Call('omp_set_default_device', [rank % ngpus])

            body = [rank_decl, rank_init, ngpus_init, set_device_num]

            init = List(header=c.Comment('Begin of OpenMP+MPI setup'),
                        body=body,
                        footer=(c.Comment('End of OpenMP+MPI setup'), c.Line()))

            iet = iet._rebuild(body=(init,) + iet.body)

        return iet
Beispiel #4
0
 def _generate_lib_src(self):
     """Concatenate the generated pieces into the ``LIB_SRC`` component.

     The order is: struct typedefs, the kernel function, then the library
     function, with a marker comment before each of the two functions.
     """
     parts = self._components
     sections = [
         parts['KERNEL_STRUCT_TYPEDEFS'],
         cgen.Comment('#### Kernel function ####'),
         parts['KERNEL_FUNC'],
         cgen.Comment('#### Library function ####'),
         parts['LIB_FUNC'],
     ]
     parts['LIB_SRC'] = cgen.Module(sections)
Beispiel #5
0
 def visit_Section(self, o):
     """Emit the children of a Section, delimited by Begin/End comments.

     Subsections (``o.is_subsection``) are emitted without the delimiting
     comments.
     """
     body = flatten(self._visit(child) for child in o.children)
     delimited = not o.is_subsection
     begin = [c.Comment("Begin %s" % o.name)] if delimited else []
     end = [c.Comment("End %s" % o.name)] if delimited else []
     return c.Module(begin + body + end)
Beispiel #6
0
    def _generate_kernel_call(self):
        """Assemble the OpenMP kernel call site (``LIB_KERNEL_CALL``).

        The generated module first declares the thread index symbol as
        ``const int <sym> = omp_get_thread_num()``, then one struct
        initializer per ``host.Matrix`` dat, and finally the call
        ``k_<kernel name>(<args>);``.

        Arguments are the static argument names followed by one entry per
        dat: the symbol itself for ``host._Array`` types (indexed by the
        thread symbol for ``data.GlobalArrayClassic``), or ``<symbol>_c`` for
        ``host.Matrix`` types. Every dat symbol is also appended to the
        ``OMP_SHARED_SYMS`` component.

        Raises:
            RuntimeError: if a dat is neither a ``host._Array`` nor a
                ``host.Matrix``; continuing would emit a kernel call that is
                missing an argument.
        """
        thread_sym = self._components['OMP_THREAD_INDEX_SYM']

        kernel_call = cgen.Module([
            cgen.Comment('#### Kernel call arguments ####'),
            cgen.Initializer(
                cgen.Const(cgen.Value('int', thread_sym)),
                'omp_get_thread_num()')
        ])
        shared_syms = self._components['OMP_SHARED_SYMS']

        # Static arguments are passed first, by name.
        call_args = []
        if self._kernel.static_args is not None:
            call_args.extend(
                name for name, _ in self._kernel.static_args.items())

        for symbol, access in self._dat_dict.items():
            obj = access[0]

            if issubclass(type(obj), host._Array):
                if issubclass(type(obj), data.GlobalArrayClassic):
                    # Pass the entry indexed by the thread symbol
                    call_args.append(symbol + '[' + thread_sym + ']')
                else:
                    call_args.append(symbol)
                shared_syms.append(symbol)

            elif issubclass(type(obj), host.Matrix):
                call_symbol = symbol + '_c'
                call_args.append(call_symbol)

                # Pointer to the row of the particle indexed by PAIR_INDEX_0
                isym = (symbol + '+' + self._components['LIB_PAIR_INDEX_0'] +
                        '*' + str(obj.ncomp))
                decl = cgen.Value('_' + symbol + '_t', call_symbol)
                kernel_call.append(cgen.Initializer(decl, '{ ' + isym + '}'))
                shared_syms.append(symbol)

            else:
                raise RuntimeError(
                    "ERROR: Type not known for '%s': %s" % (symbol, type(obj)))

        kernel_call.append(cgen.Comment('#### Kernel call ####'))
        kernel_call.append(
            cgen.Line('k_' + self._kernel.name + '(' + ','.join(call_args) +
                      ');'))

        self._components['LIB_KERNEL_CALL'] = kernel_call
Beispiel #7
0
    def _generate_kernel_call(self):
        """Assemble the pair-kernel call site (``LIB_KERNEL_CALL``).

        Arguments are the static argument names followed by one entry per
        dat in ``self._dat_dict``:

        * ``data.GlobalArrayClassic``: ``<symbol>_c``
        * other ``host._Array``: ``<symbol>``
        * ``host.Matrix``: ``<symbol>_c``, a struct initialized with the
          i and j pointers. When the dat is written and its ``ncomp`` does
          not exceed ``self._gather_size_limit``, the i pointer refers to
          ``<symbol>i`` instead of an offset into the dat.

        Raises:
            RuntimeError: if a dat matches none of the types above;
                continuing would emit a kernel call that is missing an
                argument.
        """
        kernel_call = cgen.Module(
            [cgen.Comment('#### Kernel call arguments ####')])

        # Static arguments are passed first, by name.
        call_args = []
        if self._kernel.static_args is not None:
            call_args.extend(
                name for name, _ in self._kernel.static_args.items())

        for symbol, access in self._dat_dict.items():
            obj, mode = access[0], access[1]

            # GlobalArrayClassic must be tested before the generic _Array
            if issubclass(type(obj), data.GlobalArrayClassic):
                call_args.append(symbol + '_c')
            elif issubclass(type(obj), host._Array):
                call_args.append(symbol)
            elif issubclass(type(obj), host.Matrix):
                call_symbol = symbol + '_c'
                call_args.append(call_symbol)

                nc = str(obj.ncomp)
                if mode.write and obj.ncomp <= self._gather_size_limit:
                    isym = '&' + symbol + 'i[0]'
                else:
                    isym = (symbol + '+' +
                            self._components['LIB_PAIR_INDEX_0'] + '*' + nc)
                jsym = (symbol + '+' +
                        self._components['LIB_PAIR_INDEX_1'] + '*' + nc)

                decl = cgen.Value('_' + symbol + '_t', call_symbol)
                kernel_call.append(
                    cgen.Initializer(decl, '{ ' + isym + ', ' + jsym + '}'))
            else:
                raise RuntimeError(
                    "ERROR: Type not known for '%s': %s" % (symbol, type(obj)))

        kernel_call.append(cgen.Comment('#### Kernel call ####'))
        kernel_call.append(
            cgen.Line('k_' + self._kernel.name + '(' + ','.join(call_args) +
                      ');'))

        self._components['LIB_KERNEL_CALL'] = kernel_call
Beispiel #8
0
def _make_thread_init(threads, tfunc, isdata, sdata, sregistry):
    """Build the IET that initializes ``sdata`` and creates the pthreads.

    For each thread, a call to ``isdata`` is emitted followed by a
    ``pthread_create`` call running ``tfunc``. With more than one thread the
    two calls are wrapped in an Iteration over ``threads.index``.
    """
    d = threads.index

    # A unique identifier for each created pthread
    pthreadid = d + threads.base_id

    # Initialize `sdata`; the last three parameters of `isdata` are replaced
    # by the per-thread values
    init_args = list(isdata.parameters)
    init_args[-3] = sdata.symbolic_base + d
    init_args[-2] = pthreadid
    init_args[-1] = sregistry.deviceid
    init_call = Call(isdata.name, init_args)

    # Create pthreads
    create_call = Call('pthread_create', (threads.symbolic_base + d,
                                          Macro('NULL'),
                                          Call(tfunc.name, [], is_indirect=True),
                                          sdata.symbolic_base + d))

    calls = [init_call, create_call]
    if threads.size == 1:
        body = calls
    else:
        body = Iteration(calls, d, threads.size - 1)

    return List(
        header=c.Comment("Fire up and initialize `%s`" % threads.name),
        body=body
    )
Beispiel #9
0
    def _generate_kernel_scatter(self):
        """Generate the post-kernel scatter and store it as
        ``LIB_KERNEL_SCATTER``.

        For every ``data.ParticleDat`` accessed in a write mode, a
        ``DSLStrideScatter`` is placed inside a cell-list iteration over
        ``LIB_CELL_INDEX_0``. The destination counter ``_shpx`` starts at 0
        and is incremented once per iteration.
        """
        src_sym = '_sgpx'
        dst_sym = '_shpx'
        ci = self._components['LIB_CELL_INDEX_0']

        # One strided scatter per written ParticleDat
        inner_l = [
            DSLStrideScatter(
                self._components['PARTICLE_DAT_PARTITION'].idict[symbol],
                symbol, access[0].ncomp, dst_sym, src_sym,
                self._components['CCC_MAX'])
            for symbol, access in self._dat_dict.items()
            if issubclass(type(access[0]), data.ParticleDat)
            and access[1].write
        ]
        inner_l.append(cgen.Line(dst_sym + '++;'))

        cell_loop = self._components['CELL_LIST_ITER'](
            src_sym, ci, cgen.Module(inner_l))

        self._components['LIB_KERNEL_SCATTER'] = cgen.Module([
            cgen.Comment('#### Post kernel scatter ####'),
            cgen.Initializer(cgen.Value('INT64', dst_sym), '0'),
            cell_loop,
        ])
Beispiel #10
0
    def _generate_lib_inner_loop_block(self):
        """Generate the pre-kernel j gather and store it as ``J_GATHER``.

        For every ``data.ParticleDat`` a ``DSLStrideGather`` is placed inside
        a cell-list iteration over ``LIB_CELL_INDEX_1``. The destination
        counter (the ``CCC_1`` symbol) starts at 0 and is incremented once
        per iteration.
        """
        cj = self._components['LIB_CELL_INDEX_1']
        src_sym = '_tmp_jgpx'
        dst_sym = self._components['CCC_1']

        # One strided gather per ParticleDat, whatever its access mode
        inner_l = [
            DSLStrideGather(
                symbol,
                self._components['PARTICLE_DAT_PARTITION'].jdict[symbol],
                access[0].ncomp, src_sym, dst_sym,
                self._components['CCC_MAX'])
            for symbol, access in self._dat_dict.items()
            if issubclass(type(access[0]), data.ParticleDat)
        ]
        inner_l.append(cgen.Line(dst_sym + '++;'))

        cell_loop = self._components['CELL_LIST_ITER'](
            src_sym, cj, cgen.Module(inner_l))

        self._components['J_GATHER'] = cgen.Module([
            cgen.Comment('#### Pre kernel j gather ####'),
            cgen.Initializer(cgen.Value('INT64', dst_sym), '0'),
            cell_loop,
        ])
Beispiel #11
0
def _make_thread_activate(threads, sdata, sync_ops, sregistry):
    """Build the IET that waits for a thread and then activates it.

    The generated code spins while any sync lock handle differs from 2 or
    the flag field of ``sdata[d]`` differs from 1 (with several threads,
    ``d`` cycles through them on every spin). It then copies the dynamic
    fields into ``sdata[d]``, resets the lock handles to 0 and sets the flag
    field to 2.

    NOTE(review): the meaning of the flag/handle values 0, 1 and 2 is not
    visible here -- confirm against the thread function.
    """
    single = threads.size == 1
    if single:
        d = threads.index
    else:
        d = Symbol(name=sregistry.make_name(prefix=threads.index.name))

    sync_locks = [s for s in sync_ops if s.is_SyncLock]

    lock_conds = [CondNe(s.handle, 2) for s in sync_locks]
    flag_cond = CondNe(FieldFromComposite(sdata._field_flag, sdata[d]), 1)
    condition = Or(*(lock_conds + [flag_cond]))

    if single:
        spin = [While(condition)]
    else:
        # Round-robin over the threads while waiting
        spin = [DummyExpr(d, 0),
                While(condition, DummyExpr(d, (d + 1) % threads.size))]

    handoff = [DummyExpr(FieldFromComposite(i.name, sdata[d]), i)
               for i in sdata.dynamic_fields]
    release = [DummyExpr(s.handle, 0) for s in sync_locks]
    mark = [DummyExpr(FieldFromComposite(sdata._field_flag, sdata[d]), 2)]

    return List(
        header=[c.Line(), c.Comment("Activate `%s`" % threads.name)],
        body=spin + handoff + release + mark,
        footer=c.Line()
    )
Beispiel #12
0
    def _generate_kernel_scatter(self):
        """Generate the post-kernel scatter and store it as
        ``LIB_KERNEL_SCATTER``.

        For every ``host.Matrix`` dat (that is not also a ``host._Array``)
        which is written and whose ``ncomp`` does not exceed
        ``self._gather_size_limit``, a loop is emitted that copies the
        ``ncomp`` entries of ``<symbol>i`` back into the row of ``<symbol>``
        indexed by ``LIB_PAIR_INDEX_0``. All other dats produce no code.
        """
        kernel_scatter = cgen.Module(
            [cgen.Comment('#### Post kernel scatter ####')])

        for symbol, access in self._dat_dict.items():
            obj, mode = access[0], access[1]

            # Arrays are never scattered
            if issubclass(type(obj), host._Array):
                continue
            if not issubclass(type(obj), host.Matrix):
                continue
            if not (mode.write and obj.ncomp <= self._gather_size_limit):
                continue

            # `<symbol>i` is presumably the local copy declared by the
            # matching gather step -- TODO confirm
            nc = str(obj.ncomp)
            ix = self._components['LIB_PAIR_INDEX_0']

            copy_back = cgen.Assign(symbol + '[' + nc + '*' + ix + '+_tx]',
                                    symbol + 'i' + '[_tx]')
            kernel_scatter.append(
                cgen.For('int _tx=0', '_tx<' + nc, '_tx++',
                         cgen.Block([copy_back])))

        self._components['LIB_KERNEL_SCATTER'] = kernel_scatter
Beispiel #13
0
    def _generate_lib_outer_loop(self):
        """Generate the outer loop over local particles (``LIB_OUTER_LOOP``).

        The loop body is the kernel gather, the inner loop and the kernel
        scatter. It is preceded by a call to ``omp_set_num_threads`` and an
        OpenMP ``parallel for`` pragma; when ``runtime.OMP_NUM_THREADS`` is
        None the pragma is wrapped in a comment instead.
        """
        body = cgen.Block([self._components['LIB_KERNEL_GATHER'],
                           self._components['LIB_INNER_LOOP'],
                           self._components['LIB_KERNEL_SCATTER']])

        idx = self._components['LIB_PAIR_INDEX_0']

        shared = ','.join(self._components['OMP_SHARED_SYMS'])
        # The sharing clauses sit behind `//` in the emitted pragma text
        pragma = cgen.Pragma(
            'omp parallel for schedule(static) '
            '// default(shared) shared(' + shared + ')')
        if runtime.OMP_NUM_THREADS is None:
            pragma = cgen.Comment(pragma)

        self._components['LIB_OUTER_LOOP'] = cgen.Module([
            cgen.Line('omp_set_num_threads(_NUM_THREADS);'),
            pragma,
            cgen.For('int ' + idx + '=0',
                     idx + '<_N_LOCAL',
                     idx + '++',
                     body)
        ])
Beispiel #14
0
    def _make_fetchupdate(self, iet, sync_ops, pieces, *args):
        """Move the initial fetch into a static Callable and call it.

        The body of ``iet`` plus one device-update transfer per sync op is
        wrapped in a new ``init_device*`` Callable, which is appended to
        ``pieces.funcs``. The returned IET is a List containing the call to
        that Callable.
        """
        # Construct fetches
        transfers = []
        for s in sync_ops:
            # The condition is already encoded in `iet` with a Conditional,
            # which stems from the originating Cluster's guards
            assert s.fcond is None

            imask = [(s.tstore, s.size) if d.root is s.dim.root else FULL
                     for d in s.dimensions]
            transfers.append(PragmaTransfer(self.lang._map_update_device,
                                            s.target,
                                            imask=imask))

        # Turn init IET into a Callable
        functions = filter_ordered(
            flatten([(s.target, s.function) for s in sync_ops]))
        name = self.sregistry.make_name(prefix='init_device')
        body = List(body=iet.body + tuple(transfers))
        parameters = filter_sorted(functions + derive_parameters(body))
        pieces.funcs.append(
            Callable(name, body, 'void', parameters, 'static'))

        # Perform initial fetch by the main thread
        return List(header=c.Comment("Initialize data stream"),
                    body=Call(name, parameters))
Beispiel #15
0
def avoid_denormals(iet, platform=None):
    """
    Introduce nodes in the Iteration/Expression tree that will expand to C
    macros telling the CPU to flush denormal numbers in hardware. Denormals
    are normally flushed when using SSE-based instruction sets, except when
    compiling shared objects.

    The tree is returned untouched (with empty metadata) when ``platform``
    does not expose ``known_isas``, when ``'sse'`` is not among them, or when
    ``iet`` is an elemental function.
    """
    # There is unfortunately no known portable way of flushing denormal to zero.
    # See for example: https://stackoverflow.com/questions/59546406/\
    #                       a-robust-portable-way-to-set-flush-denormals-to-zero
    try:
        has_sse = 'sse' in platform.known_isas
    except AttributeError:
        has_sse = False

    if not has_sse or iet.is_ElementalFunction:
        return iet, {}

    flush = List(header=(
        cgen.Comment('Flush denormal numbers to zero in hardware'),
        cgen.Statement('_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)'),
        cgen.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)'),
        cgen.Line(),
    ))

    # The flush statements go first in the existing body
    new_body = iet.body._rebuild(body=(flush,) + iet.body.body)

    return iet._rebuild(body=new_body), {'includes': ('xmmintrin.h',
                                                      'pmmintrin.h')}
Beispiel #16
0
    def _(iet):
        """Prepend the OpenACC initialization and device selection to ``iet``.

        The generated code calls ``acc_init`` and then, if the device id
        differs from -1, sets that device. When an ``MPICommObject`` is found
        among ``iet.parameters`` the alternative branch sets the device
        ``rank % ngpus``.

        Returns the rebuilt IET and ``{'args': deviceid}``.
        """
        # TODO: we need to pick the rank from `comm_shm`, not `comm`,
        # so that we have nranks == ngpus (as long as the user has launched
        # the right number of MPI processes per node given the available
        # number of GPUs per node)
        objcomm = next((p for p in iet.parameters
                        if isinstance(p, MPICommObject)), None)

        deviceid = DeviceID()
        device_nvidia = Macro('acc_device_nvidia')

        if objcomm is None:
            select = Conditional(
                CondNe(deviceid, -1),
                Call('acc_set_device_num', [deviceid, device_nvidia]))
        else:
            rank = Symbol(name='rank')
            rank_decl = LocalExpression(DummyEq(rank, 0))
            rank_init = Call('MPI_Comm_rank', [objcomm, Byref(rank)])

            ngpus = Symbol(name='ngpus')
            ngpus_init = LocalExpression(
                DummyEq(ngpus,
                        DefFunction('acc_get_num_devices', device_nvidia)))

            requested = Call('acc_set_device_num', [deviceid, device_nvidia])
            by_rank = Call('acc_set_device_num',
                           [rank % ngpus, device_nvidia])

            # Fall back to a rank-derived device when none was requested
            select = Conditional(
                CondNe(deviceid, -1), requested,
                List(body=[rank_decl, rank_init, ngpus_init, by_rank]))

        # The same "+MPI" label is emitted whether or not MPI is in use
        init = List(header=c.Comment('Begin of OpenACC+MPI setup'),
                    body=[Call('acc_init', [device_nvidia]), select],
                    footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()))

        return iet._rebuild(body=(init, ) + iet.body), {'args': deviceid}
Beispiel #17
0
    def _generate_kernel_arg_decls(self):
        """Generate the kernel and library argument declarations.

        Populates three components:

        * ``KERNEL_ARG_DECLS``: const values for the static arguments, a
          restrict pointer per ``host._Array`` dat, and a struct value per
          ``host.Matrix`` dat.
        * ``KERNEL_LIB_ARG_DECLS``: a restrict pointer per dat.
        * ``KERNEL_STRUCT_TYPEDEFS``: one ``_<symbol>_t`` struct typedef with
          ``i`` and ``j`` pointer members per ``host.Matrix`` dat.

        Declarations of dats without write access are made const.
        """

        def restricted(name):
            # Name decorated with the compiler's restrict keyword
            return Restrict(self._cc.restrict_keyword, name)

        _kernel_arg_decls = []
        _kernel_lib_arg_decls = []
        _kernel_structs = cgen.Module(
            [cgen.Comment('#### Structs generated per ParticleDat ####')])

        if self._kernel.static_args is not None:
            for name, ctype in self._kernel.static_args.items():
                _kernel_arg_decls.append(
                    cgen.Const(cgen.Value(host.ctypes_map[ctype], name)))

        for symbol, access in self._dat_dict.items():

            assert type(access) is tuple, "Access descriptors not found"
            obj = access[0]
            read_only = not access[1].write

            kernel_lib_arg = cgen.Pointer(
                cgen.Value(host.ctypes_map[obj.dtype], restricted(symbol)))

            if issubclass(type(obj), host._Array):
                kernel_arg = cgen.Pointer(
                    cgen.Value(host.ctypes_map[obj.dtype],
                               restricted(symbol)))
                if read_only:
                    kernel_arg = cgen.Const(kernel_arg)

                _kernel_arg_decls.append(kernel_arg)

            elif issubclass(type(obj), host.Matrix):
                # MAKE STRUCT TYPE
                # NOTE(review): `ctypes_map` is called here, but subscripted
                # as `host.ctypes_map[...]` above -- confirm both are
                # intended.
                dtype = obj.dtype
                ti = cgen.Pointer(
                    cgen.Value(ctypes_map(dtype), restricted('i')))
                tj = cgen.Pointer(
                    cgen.Value(ctypes_map(dtype), restricted('j')))
                if read_only:
                    ti = cgen.Const(ti)
                    tj = cgen.Const(tj)
                typename = '_' + symbol + '_t'
                _kernel_structs.append(
                    cgen.Typedef(cgen.Struct('', [ti, tj], typename)))

                # MAKE STRUCT ARG
                _kernel_arg_decls.append(cgen.Value(typename, symbol))

            if read_only:
                kernel_lib_arg = cgen.Const(kernel_lib_arg)

            _kernel_lib_arg_decls.append(kernel_lib_arg)

        self._components['KERNEL_ARG_DECLS'] = _kernel_arg_decls
        self._components['KERNEL_LIB_ARG_DECLS'] = _kernel_lib_arg_decls
        self._components['KERNEL_STRUCT_TYPEDEFS'] = _kernel_structs
Beispiel #18
0
    def _map_function_on_high_bw_mem(self, obj, storage):
        if obj in storage._high_bw_mem:
            return

        decl = c.Comment("no-op")
        alloc = OffloadingOmpizer._map_to(obj)
        free = OffloadingOmpizer._map_from(obj)

        storage._high_bw_mem[obj] = (decl, alloc, free)
Beispiel #19
0
    def _alloc_array_on_high_bw_mem(self, obj, storage):
        if obj in storage._high_bw_mem:
            return

        decl = c.Comment("no-op")
        alloc = OffloadingOmpizer._map_alloc(obj)
        free = OffloadingOmpizer._map_delete(obj)

        storage._high_bw_mem[obj] = (decl, alloc, free)
Beispiel #20
0
 def __init__(self, header=None, body=None, footer=None):
     """Create a node whose body flushes denormal numbers to zero.

     The ``body`` argument is ignored: the body is always the fixed
     comment plus the two ``_MM_SET_*`` statements.
     """
     statements = (
         c.Comment('Flush denormal numbers to zero in hardware'),
         c.Statement('_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)'),
         c.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)'),
     )
     flush = [Element(stmt) for stmt in statements]
     super(Denormals, self).__init__(header, flush, footer)
Beispiel #21
0
 def _avoid_denormals(self, iet):
     """Wrap ``iet`` in a List whose header flushes denormals to zero.

     Returns the new List and the headers the emitted macros require.
     """
     macros = ('_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)',
               '_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)')
     flush = [cgen.Comment('Flush denormal numbers to zero in hardware')]
     flush += [cgen.Statement(macro) for macro in macros]

     wrapped = List(header=flush, body=iet)
     return wrapped, {'includes': ('xmmintrin.h', 'pmmintrin.h')}
Beispiel #22
0
def gen_op_kernel_compute_fn():
    """Return the generable pieces of an op-kernel ``Compute`` override.

    The result is a two-element list: the function signature line and a
    block which currently only holds a placeholder comment.
    """
    signature = c.Line("void Compute(OpKernelContext* context) override")
    fn_body = [c.Comment("Booyah")]
    return [signature, c.Block(fn_body)]
Beispiel #23
0
    def _generate_map_macros(self):
        """Generate the per-dat accessor macros (``KERNEL_MAP_MACROS``).

        Arrays (``cuda_data.GlobalArray`` exactly, or any subclass of
        ``cuda_base.Array``) get ``<symbol>(x)`` mapping to
        ``(<symbol>[(x)])``; ``cuda_base.Matrix`` subclasses get
        ``<symbol>(y)`` mapping to ``<symbol>.i[(y)]``.
        """
        macros = cgen.Module([cgen.Comment('#### KERNEL_MAP_MACROS ####')])

        for symbol, access in self._dat_dict.items():
            kind = type(access[0])
            if kind is cuda_data.GlobalArray or \
                    issubclass(kind, cuda_base.Array):
                macros.append(
                    cgen.Define(symbol + '(x)', '(' + symbol + '[(x)])'))
            # Deliberately a separate `if`: both tests are always applied
            if issubclass(kind, cuda_base.Matrix):
                macros.append(
                    cgen.Define(symbol + '(y)', symbol + '.i[(y)]'))

        self._components['KERNEL_MAP_MACROS'] = macros
Beispiel #24
0
    def _make_waitlock(self, iet, sync_ops, *args):
        """Prepend a busy-wait on the sync-op handles to the body of ``iet``.

        The wait continues as long as any handle in ``sync_ops`` equals 0.
        """
        names = ",".join(s.target.name for s in sync_ops)
        pending = Or(*[CondEq(s.handle, 0) for s in sync_ops])

        waitloop = List(
            header=c.Comment("Wait for `%s` to be copied to the host" % names),
            body=BusyWait(pending),
            footer=c.Line())

        return List(body=(waitloop, ) + iet.body)
Beispiel #25
0
    def _generate_kernel_call(self):
        """Assemble the pair-kernel call site (``LIB_KERNEL_CALL``).

        ``host._Array`` dats are passed by symbol. ``host.Matrix`` dats are
        passed as ``<symbol>_c``, a struct initialized with the i and j
        pointers; when the dat is written and its ``ncomp`` does not exceed
        ``self._gather_size_limit``, the i pointer refers to ``<symbol>i``
        instead of an offset into the dat.

        Raises:
            RuntimeError: if a dat is neither of the two types.
        """
        kernel_call = cgen.Module(
            [cgen.Comment('#### Kernel call arguments ####')])
        call_args = []

        for symbol, access in self._dat_dict.items():
            obj = access[0]

            if issubclass(type(obj), host._Array):
                call_args.append(symbol)
                continue

            if not issubclass(type(obj), host.Matrix):
                raise RuntimeError("ERROR: Type not known")

            call_symbol = symbol + '_c'
            call_args.append(call_symbol)

            nc = str(obj.ncomp)
            ishift = '+' + self._components['LIB_PAIR_INDEX_0'] + '*' + nc
            jshift = '+' + self._components['LIB_PAIR_INDEX_1'] + '*' + nc

            if access[1].write and obj.ncomp <= self._gather_size_limit:
                isym = '&' + symbol + 'i[0]'
            else:
                isym = symbol + ishift
            jsym = symbol + jshift

            decl = cgen.Value('_' + symbol + '_t', call_symbol)
            kernel_call.append(
                cgen.Initializer(decl, '{ ' + isym + ', ' + jsym + '}'))

        kernel_call.append(cgen.Comment('#### Kernel call ####'))
        kernel_call.append(
            cgen.Line('k_' + self._kernel.name + '(' + ','.join(call_args) +
                      ');'))

        self._components['LIB_KERNEL_CALL'] = kernel_call
Beispiel #26
0
    def _initialize(iet):
        """Prepend the OpenACC setup code to ``iet``.

        Without an ``MPICommObject`` among ``iet.parameters`` only
        ``acc_init`` is emitted. With one, the generated code first selects
        the device ``rank % ngpus`` via ``acc_set_device_num`` and then calls
        ``acc_init``.
        """
        # TODO: we need to pick the rank from `comm_shm`, not `comm`,
        # so that we have nranks == ngpus (as long as the user has launched
        # the right number of MPI processes per node given the available
        # number of GPUs per node)
        comm = next((p for p in iet.parameters
                     if isinstance(p, MPICommObject)), None)

        device_nvidia = Macro('acc_device_nvidia')
        acc_init = Call('acc_init', [device_nvidia])

        if comm is None:
            body = acc_init
        else:
            rank = Symbol(name='rank')
            ngpus = Symbol(name='ngpus')
            devicenum = Symbol(name='devicenum')

            rank_decl = LocalExpression(DummyEq(rank, 0))
            rank_init = Call('MPI_Comm_rank', [comm, Byref(rank)])
            ngpus_init = LocalExpression(
                DummyEq(ngpus,
                        DefFunction('acc_get_num_devices', device_nvidia)))
            devicenum_init = LocalExpression(DummyEq(devicenum, rank % ngpus))
            set_device_num = Call('acc_set_device_num',
                                  [devicenum, device_nvidia])

            # `acc_init` comes last, after the device has been selected
            body = [
                rank_decl, rank_init, ngpus_init, devicenum_init,
                set_device_num, acc_init
            ]

        init = List(header=c.Comment('Begin of OpenACC+MPI setup'),
                    body=body,
                    footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()))

        return iet._rebuild(body=(init, ) + iet.body)
Beispiel #27
0
    def _generate_kernel_gather(self):
        """Generate the pre-kernel gather and store it as
        ``LIB_KERNEL_GATHER``.

        The module starts by declaring the thread index symbol as
        ``const int <sym> = omp_get_thread_num()``. Then, per dat:

        * ``data.GlobalArrayClassic``: a pointer ``<symbol>_c`` initialized
          with ``<symbol>[<thread index>]`` (const unless written).
        * ``host.Matrix`` that is written and whose ``ncomp`` does not exceed
          ``self._gather_size_limit``: a local array ``<symbol>i[ncomp]``
          initialized from the row indexed by ``LIB_PAIR_INDEX_0``.

        Every dat symbol is appended to the ``OMP_SHARED_SYMS`` component.
        """
        kernel_gather = cgen.Module([
            cgen.Comment('#### Pre kernel gather ####'),
            cgen.Initializer(
                cgen.Const(
                    cgen.Value('int',
                               self._components['OMP_THREAD_INDEX_SYM'])),
                'omp_get_thread_num()')
        ])
        shared_syms = self._components['OMP_SHARED_SYMS']

        for symbol, access in self._dat_dict.items():
            obj, mode = access[0], access[1]
            shared_syms.append(symbol)

            if issubclass(type(obj), data.GlobalArrayClassic):
                source = (symbol + '[' +
                          self._components['OMP_THREAD_INDEX_SYM'] + ']')
                decl = cgen.Pointer(
                    cgen.Value(host.ctypes_map[obj.dtype], symbol + '_c'))
                if not mode.write:
                    decl = cgen.Const(decl)
                kernel_gather.append(cgen.Initializer(decl, source))
                continue

            staged = (issubclass(type(obj), host.Matrix)
                      and mode.write
                      and obj.ncomp <= self._gather_size_limit)
            if not staged:
                continue

            nc = obj.ncomp
            dtype = host.ctypes_map[obj.dtype]

            # Brace initializer listing every component of row i; each entry
            # carries a trailing comma and the final one is sliced off
            entries = (
                '*(' + symbol + '+' + self._components['LIB_PAIR_INDEX_0'] +
                '*' + str(nc) + '+' + str(tx) + '),'
                for tx in range(nc))
            init = ('{' + ''.join(entries))[:-1] + '}'

            decl = cgen.Value(dtype, symbol + 'i' + '[' + str(nc) + ']')
            kernel_gather.append(cgen.Initializer(decl, init))

        self._components['LIB_KERNEL_GATHER'] = kernel_gather
Beispiel #28
0
    def _generate_kernel_arg_decls(self):
        """Generate the kernel and library argument declarations.

        Populates three components:

        * ``KERNEL_ARG_DECLS``: const values for the static arguments plus
          the declaration(s) supplied by each dat's generator in
          ``self._components['PARTICLE_DAT_C']`` (two for ``host.Matrix``).
        * ``KERNEL_LIB_ARG_DECLS``: the static arguments plus one restrict
          pointer per dat (a pointer to pointer for
          ``data.GlobalArrayClassic``), const when the dat is not written.
        * ``KERNEL_STRUCT_TYPEDEFS``: the generator header of every
          ``host.Matrix`` dat.

        Raises:
            AssertionError: if an access descriptor is not a tuple, or if a
                ``host._Array`` that is not a ``data.GlobalArrayClassic`` is
                requested with write access.
        """
        _kernel_arg_decls = []
        _kernel_lib_arg_decls = []
        _kernel_structs = cgen.Module(
            [cgen.Comment('#### Structs generated per ParticleDat ####')])

        if self._kernel.static_args is not None:
            for name, ctype in self._kernel.static_args.items():
                arg = cgen.Const(cgen.Value(host.ctypes_map[ctype], name))
                _kernel_arg_decls.append(arg)
                _kernel_lib_arg_decls.append(arg)

        for symbol, access in self._dat_dict.items():

            assert type(access) is tuple, "Access descriptors not found"
            obj, mode = access[0], access[1]

            kernel_lib_arg = cgen.Pointer(
                cgen.Value(host.ctypes_map[obj.dtype],
                           Restrict(self._cc.restrict_keyword, symbol)))

            # GlobalArrayClassic must be tested before the generic _Array
            if issubclass(type(obj), data.GlobalArrayClassic):
                kernel_lib_arg = cgen.Pointer(kernel_lib_arg)
                gen = self._components['PARTICLE_DAT_C'][symbol]
                _kernel_arg_decls.append(gen.kernel_arg_decl)

            elif issubclass(type(obj), host._Array):
                # This branch only sees arrays that are NOT
                # GlobalArrayClassic, so any write request is an error.
                # Raised explicitly so the check survives `python -O`.
                if mode.write is True:
                    raise AssertionError(
                        "global array must be a thread safe type for "
                        "write access. Type is: " + str(type(obj)))

                gen = self._components['PARTICLE_DAT_C'][symbol]
                _kernel_arg_decls.append(gen.kernel_arg_decl)

            elif issubclass(type(obj), host.Matrix):
                gen = self._components['PARTICLE_DAT_C'][symbol]
                _kernel_structs.append(gen.header)
                _kernel_arg_decls.append(gen.kernel_arg_decl[0])
                _kernel_arg_decls.append(gen.kernel_arg_decl[1])

            if not mode.write:
                kernel_lib_arg = cgen.Const(kernel_lib_arg)

            _kernel_lib_arg_decls.append(kernel_lib_arg)

        self._components['KERNEL_ARG_DECLS'] = _kernel_arg_decls
        self._components['KERNEL_LIB_ARG_DECLS'] = _kernel_lib_arg_decls
        self._components['KERNEL_STRUCT_TYPEDEFS'] = _kernel_structs
Beispiel #29
0
    def __init__(self, prodder):
        """Wrap ``prodder`` so that only thread 0 performs the prodding.

        The node is a Conditional on the thread number being 0 whose body
        keeps calling the prodder function while it returns a false value.
        """
        # Prod within a while loop until all communications have completed
        # In other words, the thread delegated to prodding is entrapped for as long
        # as it's required
        arg_names = [i.name for i in prodder.arguments]
        keep_prodding = Not(DefFunction(prodder.name, arg_names))
        entrap = List(header=c.Comment('Entrap thread until comms have completed'),
                      body=While(keep_prodding))

        # Atomic-ize any single-thread Prodders in the parallel tree
        on_thread_zero = CondEq(Ompizer.lang['thread-num'], 0)

        Conditional.__init__(self, on_thread_zero, entrap)
        Prodder.__init__(self, prodder.name, prodder.arguments, periodic=prodder.periodic)
Beispiel #30
0
    def _generate_map_macros(self):
        """Generate the per-dat accessor macros (``KERNEL_MAP_MACROS``).

        ``host._Array`` dats get ``<symbol>(x)`` mapping to
        ``(<symbol>[(x)])``. ``host.Matrix`` dats get ``<symbol>(x,y)``,
        which token-pastes to ``<symbol>_0(y)`` / ``<symbol>_1(y)``, mapping
        in turn to the ``.i`` and ``.j`` members.
        """
        macros = cgen.Module([cgen.Comment('#### KERNEL_MAP_MACROS ####')])

        for symbol, access in self._dat_dict.items():
            kind = type(access[0])

            if issubclass(kind, host._Array):
                macros.append(
                    cgen.Define(symbol + '(x)', '(' + symbol + '[(x)])'))

            # Deliberately a separate `if`: both tests are always applied
            if issubclass(kind, host.Matrix):
                for name, expansion in (
                        (symbol + '(x,y)', symbol + '_##x(y)'),
                        (symbol + '_0(y)', symbol + '.i[(y)]'),
                        (symbol + '_1(y)', symbol + '.j[(y)]')):
                    macros.append(cgen.Define(name, expansion))

        self._components['KERNEL_MAP_MACROS'] = macros