def _generate_kernel_call(self):

    kernel_call = cgen.Module(
        [cgen.Comment('#### Kernel call arguments ####')])

    kernel_call_symbols = []
    if self._kernel.static_args is not None:
        for i, dat in enumerate(self._kernel.static_args.items()):
            kernel_call_symbols.append(dat[0])

    for i, dat in enumerate(self._dat_dict.items()):
        obj = dat[1][0]
        mode = dat[1][1]
        symbol = dat[0]
        g = self._components['PARTICLE_DAT_C'][symbol]

        kernel_call_symbols.append(g.kernel_arg)
        kernel_call.append(g.kernel_create_j_arg)
        self._components['KERNEL_GATHER'] += g.kernel_create_i_arg
        self._components['KERNEL_SCATTER'] += g.kernel_create_i_scatter

    kernel_call.append(cgen.Comment('#### Kernel call ####'))

    kernel_call_symbols_s = ''
    for sx in kernel_call_symbols:
        kernel_call_symbols_s += sx + ','
    kernel_call_symbols_s = kernel_call_symbols_s[:-1]

    kernel_call.append(cgen.Line(
        'k_' + self._kernel.name + '(' + kernel_call_symbols_s + ');'))

    self._components['LIB_KERNEL_CALL'] = kernel_call
def _(iet):
    # TODO: we need to pick the rank from `comm_shm`, not `comm`,
    # so that we have nranks == ngpus (as long as the user has launched
    # the right number of MPI processes per node given the available
    # number of GPUs per node)
    objcomm = None
    for i in iet.parameters:
        if isinstance(i, MPICommObject):
            objcomm = i
            break

    devicetype = as_list(self.lang[self.platform])

    try:
        lang_init = [self.lang['init'](devicetype)]
    except TypeError:
        # Not all target languages need to be explicitly initialized
        lang_init = []

    deviceid = DeviceID()
    if objcomm is not None:
        rank = Symbol(name='rank')
        rank_decl = LocalExpression(DummyEq(rank, 0))
        rank_init = Call('MPI_Comm_rank', [objcomm, Byref(rank)])

        ngpus = Symbol(name='ngpus')
        call = self.lang['num-devices'](devicetype)
        ngpus_init = LocalExpression(DummyEq(ngpus, call))

        osdd_then = self.lang['set-device']([deviceid] + devicetype)
        osdd_else = self.lang['set-device']([rank % ngpus] + devicetype)

        body = lang_init + [Conditional(
            CondNe(deviceid, -1),
            osdd_then,
            List(body=[rank_decl, rank_init, ngpus_init, osdd_else]),
        )]

        header = c.Comment('Begin of %s+MPI setup' % self.lang['name'])
        footer = c.Comment('End of %s+MPI setup' % self.lang['name'])
    else:
        body = lang_init + [Conditional(
            CondNe(deviceid, -1),
            self.lang['set-device']([deviceid] + devicetype)
        )]

        header = c.Comment('Begin of %s setup' % self.lang['name'])
        footer = c.Comment('End of %s setup' % self.lang['name'])

    init = List(header=header, body=body, footer=(footer, c.Line()))
    iet = iet._rebuild(body=(init,) + iet.body)

    return iet, {'args': deviceid}
def _initialize(iet):
    comm = None
    for i in iet.parameters:
        if isinstance(i, MPICommObject):
            comm = i
            break

    if comm is not None:
        rank = Symbol(name='rank')
        rank_decl = LocalExpression(DummyEq(rank, 0))
        rank_init = Call('MPI_Comm_rank', [comm, Byref(rank)])

        ngpus = Symbol(name='ngpus')
        call = Function('omp_get_num_devices')()
        ngpus_init = LocalExpression(DummyEq(ngpus, call))

        set_device_num = Call('omp_set_default_device', [rank % ngpus])

        body = [rank_decl, rank_init, ngpus_init, set_device_num]

        init = List(header=c.Comment('Begin of OpenMP+MPI setup'),
                    body=body,
                    footer=(c.Comment('End of OpenMP+MPI setup'), c.Line()))

        iet = iet._rebuild(body=(init,) + iet.body)

    return iet
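The setup passes above and below all generate the same rank-to-device mapping: each MPI rank binds to device `rank % ngpus`. A minimal standalone sketch of that logic in Python, assuming mpi4py and a hypothetical device count of 4, purely for illustration (the real passes emit the equivalent C calls such as `omp_set_default_device` or `acc_set_device_num`):

def _device_mapping_sketch():
    # Illustration only: not part of the passes above.
    from mpi4py import MPI

    rank = MPI.COMM_WORLD.Get_rank()
    ngpus = 4                 # assumed number of GPUs visible on the node
    device = rank % ngpus     # same mapping the generated C code applies
    print('rank %d -> device %d' % (rank, device))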
def _generate_lib_src(self):

    self._components['LIB_SRC'] = cgen.Module([
        self._components['KERNEL_STRUCT_TYPEDEFS'],
        cgen.Comment('#### Kernel function ####'),
        self._components['KERNEL_FUNC'],
        cgen.Comment('#### Library function ####'),
        self._components['LIB_FUNC']
    ])
def visit_Section(self, o):
    body = flatten(self._visit(i) for i in o.children)
    if o.is_subsection:
        header = []
        footer = []
    else:
        header = [c.Comment("Begin %s" % o.name)]
        footer = [c.Comment("End %s" % o.name)]
    return c.Module(header + body + footer)
def _generate_kernel_call(self):

    kernel_call = cgen.Module([
        cgen.Comment('#### Kernel call arguments ####'),
        cgen.Initializer(
            cgen.Const(
                cgen.Value('int', self._components['OMP_THREAD_INDEX_SYM'])),
            'omp_get_thread_num()')
    ])

    kernel_call_symbols = []
    shared_syms = self._components['OMP_SHARED_SYMS']

    if self._kernel.static_args is not None:
        for i, dat in enumerate(self._kernel.static_args.items()):
            kernel_call_symbols.append(dat[0])

    for i, dat in enumerate(self._dat_dict.items()):
        if issubclass(type(dat[1][0]), host._Array):
            sym = dat[0]
            if issubclass(type(dat[1][0]), data.GlobalArrayClassic):
                sym += '[' + self._components['OMP_THREAD_INDEX_SYM'] + ']'
            kernel_call_symbols.append(sym)
            shared_syms.append(dat[0])

        elif issubclass(type(dat[1][0]), host.Matrix):
            call_symbol = dat[0] + '_c'
            kernel_call_symbols.append(call_symbol)

            nc = str(dat[1][0].ncomp)
            _ishift = '+' + self._components['LIB_PAIR_INDEX_0'] + '*' + nc
            isym = dat[0] + _ishift

            g = cgen.Value('_' + dat[0] + '_t', call_symbol)
            g = cgen.Initializer(g, '{ ' + isym + '}')

            kernel_call.append(g)
            shared_syms.append(dat[0])

        else:
            print("ERROR: Type not known")

    kernel_call.append(cgen.Comment('#### Kernel call ####'))

    kernel_call_symbols_s = ''
    for sx in kernel_call_symbols:
        kernel_call_symbols_s += sx + ','
    kernel_call_symbols_s = kernel_call_symbols_s[:-1]

    kernel_call.append(cgen.Line(
        'k_' + self._kernel.name + '(' + kernel_call_symbols_s + ');'))

    self._components['LIB_KERNEL_CALL'] = kernel_call
def _generate_kernel_call(self):

    kernel_call = cgen.Module(
        [cgen.Comment('#### Kernel call arguments ####')])

    kernel_call_symbols = []
    if self._kernel.static_args is not None:
        for i, dat in enumerate(self._kernel.static_args.items()):
            kernel_call_symbols.append(dat[0])

    for i, dat in enumerate(self._dat_dict.items()):
        obj = dat[1][0]
        mode = dat[1][1]
        symbol = dat[0]

        if issubclass(type(obj), data.GlobalArrayClassic):
            kernel_call_symbols.append(symbol + '_c')

        elif issubclass(type(obj), host._Array):
            kernel_call_symbols.append(symbol)

        elif issubclass(type(obj), host.Matrix):
            call_symbol = symbol + '_c'
            kernel_call_symbols.append(call_symbol)

            nc = str(obj.ncomp)
            _ishift = '+' + self._components['LIB_PAIR_INDEX_0'] + '*' + nc
            _jshift = '+' + self._components['LIB_PAIR_INDEX_1'] + '*' + nc

            if mode.write and obj.ncomp <= self._gather_size_limit:
                isym = '&' + symbol + 'i[0]'
            else:
                isym = symbol + _ishift
            jsym = symbol + _jshift

            g = cgen.Value('_' + symbol + '_t', call_symbol)
            g = cgen.Initializer(g, '{ ' + isym + ', ' + jsym + '}')

            kernel_call.append(g)

        else:
            print("ERROR: Type not known")

    kernel_call.append(cgen.Comment('#### Kernel call ####'))

    kernel_call_symbols_s = ''
    for sx in kernel_call_symbols:
        kernel_call_symbols_s += sx + ','
    kernel_call_symbols_s = kernel_call_symbols_s[:-1]

    kernel_call.append(cgen.Line(
        'k_' + self._kernel.name + '(' + kernel_call_symbols_s + ');'))

    self._components['LIB_KERNEL_CALL'] = kernel_call
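The kernel-call generators above assemble C source entirely from cgen nodes. The following standalone sketch shows how those pieces render; the symbol names (`_P_t`, `P_c`, `k_example`, the index strings) are made up for illustration and are not produced by the classes above:

def _cgen_rendering_sketch():
    # Illustration only: how cgen.Comment/Value/Initializer/Line render to C.
    import cgen

    kernel_call = cgen.Module([
        cgen.Comment('#### Kernel call arguments ####'),
        cgen.Initializer(cgen.Value('_P_t', 'P_c'), '{ P+_i0*3, P+_i1*3}'),
        cgen.Comment('#### Kernel call ####'),
        cgen.Line('k_example(P_c);'),
    ])
    # Printing the Module emits roughly:
    #   /* #### Kernel call arguments #### */
    #   _P_t P_c = { P+_i0*3, P+_i1*3};
    #   /* #### Kernel call #### */
    #   k_example(P_c);
    print(kernel_call)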
def _make_thread_init(threads, tfunc, isdata, sdata, sregistry):
    d = threads.index
    if threads.size == 1:
        callback = lambda body: body
    else:
        callback = lambda body: Iteration(body, d, threads.size - 1)

    # A unique identifier for each created pthread
    pthreadid = d + threads.base_id

    # Initialize `sdata`
    arguments = list(isdata.parameters)
    arguments[-3] = sdata.symbolic_base + d
    arguments[-2] = pthreadid
    arguments[-1] = sregistry.deviceid
    call0 = Call(isdata.name, arguments)

    # Create pthreads
    call1 = Call('pthread_create', (threads.symbolic_base + d,
                                    Macro('NULL'),
                                    Call(tfunc.name, [], is_indirect=True),
                                    sdata.symbolic_base + d))

    threadsinit = List(
        header=c.Comment("Fire up and initialize `%s`" % threads.name),
        body=callback([call0, call1])
    )

    return threadsinit
def _generate_kernel_scatter(self):
    kernel_scatter = cgen.Module(
        [cgen.Comment('#### Post kernel scatter ####')])

    ci = self._components['LIB_CELL_INDEX_0']

    inner_l = []
    src_sym = '_sgpx'
    dst_sym = '_shpx'

    # add dats to omp shared and init global array reduction
    for i, dat in enumerate(self._dat_dict.items()):
        obj = dat[1][0]
        mode = dat[1][1]
        symbol = dat[0]

        if issubclass(type(obj), data.ParticleDat) and mode.write:
            tsym = self._components['PARTICLE_DAT_PARTITION'].idict[symbol]
            inner_l.append(
                DSLStrideScatter(tsym, symbol, obj.ncomp, dst_sym, src_sym,
                                 self._components['CCC_MAX']))

    inner_l.append(cgen.Line(dst_sym + '++;'))
    inner = cgen.Module(inner_l)

    g = self._components['CELL_LIST_ITER'](src_sym, ci, inner)

    kernel_scatter.append(
        cgen.Initializer(cgen.Value('INT64', dst_sym), '0'))
    kernel_scatter.append(g)

    self._components['LIB_KERNEL_SCATTER'] = kernel_scatter
def _generate_lib_inner_loop_block(self):
    # generate the j gather ('J_GATHER')
    cj = self._components['LIB_CELL_INDEX_1']

    j_gather = cgen.Module([
        cgen.Comment('#### Pre kernel j gather ####'),
    ])

    inner_l = []
    src_sym = '_tmp_jgpx'
    dst_sym = self._components['CCC_1']

    # add dats to omp shared and init global array reduction
    for i, dat in enumerate(self._dat_dict.items()):
        obj = dat[1][0]
        mode = dat[1][1]
        symbol = dat[0]

        if issubclass(type(obj), data.ParticleDat):
            tsym = self._components['PARTICLE_DAT_PARTITION'].jdict[symbol]
            inner_l.append(
                DSLStrideGather(symbol, tsym, obj.ncomp, src_sym, dst_sym,
                                self._components['CCC_MAX']))

    inner_l.append(cgen.Line(dst_sym + '++;'))
    inner = cgen.Module(inner_l)

    g = self._components['CELL_LIST_ITER'](src_sym, cj, inner)

    j_gather.append(cgen.Initializer(cgen.Value('INT64', dst_sym), '0'))
    j_gather.append(g)

    self._components['J_GATHER'] = j_gather
def _make_thread_activate(threads, sdata, sync_ops, sregistry):
    if threads.size == 1:
        d = threads.index
    else:
        d = Symbol(name=sregistry.make_name(prefix=threads.index.name))

    sync_locks = [s for s in sync_ops if s.is_SyncLock]
    condition = Or(*([CondNe(s.handle, 2) for s in sync_locks] +
                     [CondNe(FieldFromComposite(sdata._field_flag, sdata[d]), 1)]))

    if threads.size == 1:
        activation = [While(condition)]
    else:
        activation = [DummyExpr(d, 0),
                      While(condition, DummyExpr(d, (d + 1) % threads.size))]

    activation.extend([DummyExpr(FieldFromComposite(i.name, sdata[d]), i)
                       for i in sdata.dynamic_fields])
    activation.extend([DummyExpr(s.handle, 0) for s in sync_locks])
    activation.append(DummyExpr(FieldFromComposite(sdata._field_flag, sdata[d]), 2))

    activation = List(
        header=[c.Line(), c.Comment("Activate `%s`" % threads.name)],
        body=activation,
        footer=c.Line()
    )

    return activation
def _generate_kernel_scatter(self):
    kernel_scatter = cgen.Module(
        [cgen.Comment('#### Post kernel scatter ####')])

    if self._kernel.static_args is not None:
        for i, dat in enumerate(self._kernel.static_args.items()):
            pass

    for i, dat in enumerate(self._dat_dict.items()):
        if issubclass(type(dat[1][0]), host._Array):
            pass
        elif issubclass(type(dat[1][0]), host.Matrix) \
                and dat[1][1].write \
                and dat[1][0].ncomp <= self._gather_size_limit:

            isym = dat[0] + 'i'
            nc = dat[1][0].ncomp
            ncb = '[' + str(nc) + ']'
            dtype = host.ctypes_map[dat[1][0].dtype]
            ix = self._components['LIB_PAIR_INDEX_0']

            b = cgen.Assign(dat[0] + '[' + str(nc) + '*' + ix + '+_tx]',
                            isym + '[_tx]')
            g = cgen.For('int _tx=0', '_tx<' + str(nc), '_tx++',
                         cgen.Block([b]))

            kernel_scatter.append(g)

    self._components['LIB_KERNEL_SCATTER'] = kernel_scatter
def _generate_lib_outer_loop(self):

    block = cgen.Block([self._components['LIB_KERNEL_GATHER'],
                        self._components['LIB_INNER_LOOP'],
                        self._components['LIB_KERNEL_SCATTER']])

    i = self._components['LIB_PAIR_INDEX_0']

    shared = ''
    for sx in self._components['OMP_SHARED_SYMS']:
        shared += sx + ','
    shared = shared[:-1]

    pragma = cgen.Pragma('omp parallel for schedule(static) // default(shared) shared(' + shared + ')')
    if runtime.OMP_NUM_THREADS is None:
        pragma = cgen.Comment(pragma)

    loop = cgen.Module([
        cgen.Line('omp_set_num_threads(_NUM_THREADS);'),
        pragma,
        cgen.For('int ' + i + '=0', i + '<_N_LOCAL', i + '++', block)
    ])

    self._components['LIB_OUTER_LOOP'] = loop
def _make_fetchupdate(self, iet, sync_ops, pieces, *args):
    # Construct fetches
    postactions = []
    for s in sync_ops:
        # The condition is already encoded in `iet` with a Conditional,
        # which stems from the originating Cluster's guards
        assert s.fcond is None

        imask = [(s.tstore, s.size) if d.root is s.dim.root else FULL
                 for d in s.dimensions]
        postactions.append(
            PragmaTransfer(self.lang._map_update_device, s.target, imask=imask))

    # Turn init IET into a Callable
    functions = filter_ordered(
        flatten([(s.target, s.function) for s in sync_ops]))
    name = self.sregistry.make_name(prefix='init_device')
    body = List(body=iet.body + tuple(postactions))
    parameters = filter_sorted(functions + derive_parameters(body))
    func = Callable(name, body, 'void', parameters, 'static')
    pieces.funcs.append(func)

    # Perform initial fetch by the main thread
    iet = List(header=c.Comment("Initialize data stream"),
               body=Call(name, parameters))

    return iet
def avoid_denormals(iet, platform=None):
    """
    Introduce nodes in the Iteration/Expression tree that will expand to C
    macros telling the CPU to flush denormal numbers in hardware. Denormals
    are normally flushed when using SSE-based instruction sets, except when
    compiling shared objects.
    """
    # There is unfortunately no known portable way of flushing denormals to zero.
    # See for example: https://stackoverflow.com/questions/59546406/\
    # a-robust-portable-way-to-set-flush-denormals-to-zero
    try:
        if 'sse' not in platform.known_isas:
            return iet, {}
    except AttributeError:
        return iet, {}

    if iet.is_ElementalFunction:
        return iet, {}

    header = (cgen.Comment('Flush denormal numbers to zero in hardware'),
              cgen.Statement('_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)'),
              cgen.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)'),
              cgen.Line())

    body = iet.body._rebuild(body=(List(header=header),) + iet.body.body)
    iet = iet._rebuild(body=body)

    return iet, {'includes': ('xmmintrin.h', 'pmmintrin.h')}
def _(iet):
    # TODO: we need to pick the rank from `comm_shm`, not `comm`,
    # so that we have nranks == ngpus (as long as the user has launched
    # the right number of MPI processes per node given the available
    # number of GPUs per node)
    objcomm = None
    for i in iet.parameters:
        if isinstance(i, MPICommObject):
            objcomm = i
            break

    deviceid = DeviceID()
    device_nvidia = Macro('acc_device_nvidia')
    if objcomm is not None:
        rank = Symbol(name='rank')
        rank_decl = LocalExpression(DummyEq(rank, 0))
        rank_init = Call('MPI_Comm_rank', [objcomm, Byref(rank)])

        ngpus = Symbol(name='ngpus')
        call = DefFunction('acc_get_num_devices', device_nvidia)
        ngpus_init = LocalExpression(DummyEq(ngpus, call))

        asdn_then = Call('acc_set_device_num', [deviceid, device_nvidia])
        asdn_else = Call('acc_set_device_num', [rank % ngpus, device_nvidia])

        body = [
            Call('acc_init', [device_nvidia]),
            Conditional(
                CondNe(deviceid, -1),
                asdn_then,
                List(body=[rank_decl, rank_init, ngpus_init, asdn_else])
            )
        ]
    else:
        body = [
            Call('acc_init', [device_nvidia]),
            Conditional(
                CondNe(deviceid, -1),
                Call('acc_set_device_num', [deviceid, device_nvidia])
            )
        ]

    init = List(header=c.Comment('Begin of OpenACC+MPI setup'),
                body=body,
                footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()))

    iet = iet._rebuild(body=(init,) + iet.body)

    return iet, {'args': deviceid}
def _generate_kernel_arg_decls(self):

    _kernel_arg_decls = []
    _kernel_lib_arg_decls = []
    _kernel_structs = cgen.Module(
        [cgen.Comment('#### Structs generated per ParticleDat ####')])

    if self._kernel.static_args is not None:
        for i, dat in enumerate(self._kernel.static_args.items()):
            _kernel_arg_decls.append(
                cgen.Const(cgen.Value(host.ctypes_map[dat[1]], dat[0])))

    for i, dat in enumerate(self._dat_dict.items()):

        assert type(dat[1]) is tuple, "Access descriptors not found"

        kernel_lib_arg = cgen.Pointer(
            cgen.Value(host.ctypes_map[dat[1][0].dtype],
                       Restrict(self._cc.restrict_keyword, dat[0])))

        # print host.ctypes_map[dat[1][0].dtype], dat[1][0].dtype

        if issubclass(type(dat[1][0]), host._Array):
            kernel_arg = cgen.Pointer(
                cgen.Value(host.ctypes_map[dat[1][0].dtype],
                           Restrict(self._cc.restrict_keyword, dat[0])))
            if not dat[1][1].write:
                kernel_arg = cgen.Const(kernel_arg)

            _kernel_arg_decls.append(kernel_arg)

        elif issubclass(type(dat[1][0]), host.Matrix):
            # MAKE STRUCT TYPE
            dtype = dat[1][0].dtype
            ti = cgen.Pointer(
                cgen.Value(ctypes_map(dtype),
                           Restrict(self._cc.restrict_keyword, 'i')))
            tj = cgen.Pointer(
                cgen.Value(ctypes_map(dtype),
                           Restrict(self._cc.restrict_keyword, 'j')))
            if not dat[1][1].write:
                ti = cgen.Const(ti)
                tj = cgen.Const(tj)
            typename = '_' + dat[0] + '_t'
            _kernel_structs.append(
                cgen.Typedef(cgen.Struct('', [ti, tj], typename)))

            # MAKE STRUCT ARG
            _kernel_arg_decls.append(cgen.Value(typename, dat[0]))

        if not dat[1][1].write:
            kernel_lib_arg = cgen.Const(kernel_lib_arg)

        _kernel_lib_arg_decls.append(kernel_lib_arg)

    self._components['KERNEL_ARG_DECLS'] = _kernel_arg_decls
    self._components['KERNEL_LIB_ARG_DECLS'] = _kernel_lib_arg_decls
    self._components['KERNEL_STRUCT_TYPEDEFS'] = _kernel_structs
def _map_function_on_high_bw_mem(self, obj, storage):
    if obj in storage._high_bw_mem:
        return

    decl = c.Comment("no-op")

    alloc = OffloadingOmpizer._map_to(obj)
    free = OffloadingOmpizer._map_from(obj)

    storage._high_bw_mem[obj] = (decl, alloc, free)
def _alloc_array_on_high_bw_mem(self, obj, storage):
    if obj in storage._high_bw_mem:
        return

    decl = c.Comment("no-op")

    alloc = OffloadingOmpizer._map_alloc(obj)
    free = OffloadingOmpizer._map_delete(obj)

    storage._high_bw_mem[obj] = (decl, alloc, free)
def __init__(self, header=None, body=None, footer=None):
    b = [
        Element(c.Comment('Flush denormal numbers to zero in hardware')),
        Element(c.Statement('_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)')),
        Element(c.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)'))
    ]
    super(Denormals, self).__init__(header, b, footer)
def _avoid_denormals(self, iet):
    header = [cgen.Comment('Flush denormal numbers to zero in hardware'),
              cgen.Statement('_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)'),
              cgen.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)')]
    iet = List(header=header, body=iet)
    return iet, {'includes': ('xmmintrin.h', 'pmmintrin.h')}
def gen_op_kernel_compute_fn():
    compute_fn = []
    compute_fn.append(c.Line("void Compute(OpKernelContext* context) override"))

    fn_body = []
    inputs = [c.Comment("Booyah")]
    fn_body.extend(inputs)

    compute_fn.append(c.Block(fn_body))
    return compute_fn
def _generate_map_macros(self):

    g = cgen.Module([cgen.Comment('#### KERNEL_MAP_MACROS ####')])

    for i, dat in enumerate(self._dat_dict.items()):
        if type(dat[1][0]) is cuda_data.GlobalArray or \
                issubclass(type(dat[1][0]), cuda_base.Array):
            g.append(cgen.Define(dat[0] + '(x)', '(' + dat[0] + '[(x)])'))
        if issubclass(type(dat[1][0]), cuda_base.Matrix):
            g.append(cgen.Define(dat[0] + '(y)', dat[0] + '.i[(y)]'))

    self._components['KERNEL_MAP_MACROS'] = g
def _make_waitlock(self, iet, sync_ops, *args):
    waitloop = List(
        header=c.Comment("Wait for `%s` to be copied to the host" %
                         ",".join(s.target.name for s in sync_ops)),
        body=BusyWait(Or(*[CondEq(s.handle, 0) for s in sync_ops])),
        footer=c.Line()
    )

    iet = List(body=(waitloop,) + iet.body)

    return iet
def _generate_kernel_call(self):

    kernel_call = cgen.Module(
        [cgen.Comment('#### Kernel call arguments ####')])

    kernel_call_symbols = []
    for i, dat in enumerate(self._dat_dict.items()):
        if issubclass(type(dat[1][0]), host._Array):
            kernel_call_symbols.append(dat[0])

        elif issubclass(type(dat[1][0]), host.Matrix):
            call_symbol = dat[0] + '_c'
            kernel_call_symbols.append(call_symbol)

            nc = str(dat[1][0].ncomp)
            _ishift = '+' + self._components['LIB_PAIR_INDEX_0'] + '*' + nc
            _jshift = '+' + self._components['LIB_PAIR_INDEX_1'] + '*' + nc

            if dat[1][1].write and dat[1][0].ncomp <= self._gather_size_limit:
                isym = '&' + dat[0] + 'i[0]'
            else:
                isym = dat[0] + _ishift
            jsym = dat[0] + _jshift

            g = cgen.Value('_' + dat[0] + '_t', call_symbol)
            g = cgen.Initializer(g, '{ ' + isym + ', ' + jsym + '}')

            kernel_call.append(g)

        else:
            raise RuntimeError("ERROR: Type not known")

    kernel_call.append(cgen.Comment('#### Kernel call ####'))

    kernel_call_symbols_s = ''
    for sx in kernel_call_symbols:
        kernel_call_symbols_s += sx + ','
    kernel_call_symbols_s = kernel_call_symbols_s[:-1]

    kernel_call.append(cgen.Line(
        'k_' + self._kernel.name + '(' + kernel_call_symbols_s + ');'))

    self._components['LIB_KERNEL_CALL'] = kernel_call
def _initialize(iet):
    # TODO: we need to pick the rank from `comm_shm`, not `comm`,
    # so that we have nranks == ngpus (as long as the user has launched
    # the right number of MPI processes per node given the available
    # number of GPUs per node)
    comm = None
    for i in iet.parameters:
        if isinstance(i, MPICommObject):
            comm = i
            break

    device_nvidia = Macro('acc_device_nvidia')
    body = Call('acc_init', [device_nvidia])

    if comm is not None:
        rank = Symbol(name='rank')
        rank_decl = LocalExpression(DummyEq(rank, 0))
        rank_init = Call('MPI_Comm_rank', [comm, Byref(rank)])

        ngpus = Symbol(name='ngpus')
        call = DefFunction('acc_get_num_devices', device_nvidia)
        ngpus_init = LocalExpression(DummyEq(ngpus, call))

        devicenum = Symbol(name='devicenum')
        devicenum_init = LocalExpression(DummyEq(devicenum, rank % ngpus))

        set_device_num = Call('acc_set_device_num', [devicenum, device_nvidia])

        body = [rank_decl, rank_init, ngpus_init,
                devicenum_init, set_device_num, body]

    init = List(header=c.Comment('Begin of OpenACC+MPI setup'),
                body=body,
                footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()))

    iet = iet._rebuild(body=(init,) + iet.body)

    return iet
def _generate_kernel_gather(self):

    kernel_gather = cgen.Module([
        cgen.Comment('#### Pre kernel gather ####'),
        cgen.Initializer(
            cgen.Const(
                cgen.Value('int', self._components['OMP_THREAD_INDEX_SYM'])),
            'omp_get_thread_num()')
    ])

    shared_syms = self._components['OMP_SHARED_SYMS']

    for i, dat in enumerate(self._dat_dict.items()):
        obj = dat[1][0]
        mode = dat[1][1]
        symbol = dat[0]
        shared_syms.append(symbol)

        if issubclass(type(obj), data.GlobalArrayClassic):
            isym = symbol + '_c'
            val = symbol + '[' + self._components['OMP_THREAD_INDEX_SYM'] + ']'
            g = cgen.Pointer(cgen.Value(host.ctypes_map[obj.dtype], isym))
            if not mode.write:
                g = cgen.Const(g)
            g = cgen.Initializer(g, val)

            kernel_gather.append(g)

        elif issubclass(type(obj), host.Matrix) \
                and mode.write \
                and obj.ncomp <= self._gather_size_limit:

            isym = symbol + 'i'
            nc = obj.ncomp
            ncb = '[' + str(nc) + ']'
            dtype = host.ctypes_map[obj.dtype]

            t = '{'
            for tx in range(nc):
                t += '*(' + symbol + '+' + self._components['LIB_PAIR_INDEX_0']
                t += '*' + str(nc) + '+' + str(tx) + '),'
            t = t[:-1] + '}'

            g = cgen.Value(dtype, isym + ncb)
            g = cgen.Initializer(g, t)

            kernel_gather.append(g)

    self._components['LIB_KERNEL_GATHER'] = kernel_gather
def _generate_kernel_arg_decls(self):

    _kernel_arg_decls = []
    _kernel_lib_arg_decls = []
    _kernel_structs = cgen.Module(
        [cgen.Comment('#### Structs generated per ParticleDat ####')])

    if self._kernel.static_args is not None:
        for i, dat in enumerate(self._kernel.static_args.items()):
            arg = cgen.Const(cgen.Value(host.ctypes_map[dat[1]], dat[0]))
            _kernel_arg_decls.append(arg)
            _kernel_lib_arg_decls.append(arg)

    for i, dat in enumerate(self._dat_dict.items()):

        assert type(dat[1]) is tuple, "Access descriptors not found"

        obj = dat[1][0]
        mode = dat[1][1]
        symbol = dat[0]

        kernel_lib_arg = cgen.Pointer(
            cgen.Value(host.ctypes_map[obj.dtype],
                       Restrict(self._cc.restrict_keyword, symbol)))

        if issubclass(type(obj), data.GlobalArrayClassic):
            kernel_lib_arg = cgen.Pointer(kernel_lib_arg)

            gen = self._components['PARTICLE_DAT_C'][symbol]
            _kernel_arg_decls.append(gen.kernel_arg_decl)

        elif issubclass(type(obj), host._Array):
            gen = self._components['PARTICLE_DAT_C'][symbol]
            _kernel_arg_decls.append(gen.kernel_arg_decl)

            if mode.write is True:
                assert issubclass(type(obj), data.GlobalArrayClassic), \
                    "global array must be a thread safe type for " \
                    "write access. Type is: " + str(type(obj))

        elif issubclass(type(dat[1][0]), host.Matrix):
            gen = self._components['PARTICLE_DAT_C'][symbol]
            _kernel_structs.append(gen.header)
            _kernel_arg_decls.append(gen.kernel_arg_decl[0])
            _kernel_arg_decls.append(gen.kernel_arg_decl[1])

        if not dat[1][1].write:
            kernel_lib_arg = cgen.Const(kernel_lib_arg)

        _kernel_lib_arg_decls.append(kernel_lib_arg)

    self._components['KERNEL_ARG_DECLS'] = _kernel_arg_decls
    self._components['KERNEL_LIB_ARG_DECLS'] = _kernel_lib_arg_decls
    self._components['KERNEL_STRUCT_TYPEDEFS'] = _kernel_structs
def __init__(self, prodder):
    # Atomic-ize any single-thread Prodders in the parallel tree
    condition = CondEq(Ompizer.lang['thread-num'], 0)

    # Prod within a while loop until all communications have completed
    # In other words, the thread delegated to prodding is entrapped for as long
    # as it's required
    prod_until = Not(DefFunction(prodder.name, [i.name for i in prodder.arguments]))
    then_body = List(header=c.Comment('Entrap thread until comms have completed'),
                     body=While(prod_until))

    Conditional.__init__(self, condition, then_body)
    Prodder.__init__(self, prodder.name, prodder.arguments, periodic=prodder.periodic)
def _generate_map_macros(self):
    g = cgen.Module([cgen.Comment('#### KERNEL_MAP_MACROS ####')])

    for i, dat in enumerate(self._dat_dict.items()):
        if issubclass(type(dat[1][0]), host._Array):
            g.append(cgen.Define(dat[0] + '(x)', '(' + dat[0] + '[(x)])'))
        if issubclass(type(dat[1][0]), host.Matrix):
            g.append(cgen.Define(dat[0] + '(x,y)', dat[0] + '_##x(y)'))
            g.append(cgen.Define(dat[0] + '_0(y)', dat[0] + '.i[(y)]'))
            g.append(cgen.Define(dat[0] + '_1(y)', dat[0] + '.j[(y)]'))

    self._components['KERNEL_MAP_MACROS'] = g
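For reference, a standalone sketch of the two-level macro scheme these generators emit for a Matrix dat, using a made-up dat name `P` (not taken from any class above):

def _map_macro_sketch():
    # Illustration only: how cgen.Define renders the accessor macros.
    import cgen

    macros = cgen.Module([
        cgen.Define('P(x,y)', 'P_##x(y)'),
        cgen.Define('P_0(y)', 'P.i[(y)]'),
        cgen.Define('P_1(y)', 'P.j[(y)]'),
    ])
    # Printing emits roughly:
    #   #define P(x,y) P_##x(y)
    #   #define P_0(y) P.i[(y)]
    #   #define P_1(y) P.j[(y)]
    print(macros)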