def _make_wait(self, f, hse, key, msg=None):
    bufs = FieldFromPointer(msg._C_field_bufs, msg)

    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')

    sizes = [FieldFromPointer('%s[%d]' % (msg._C_field_sizes, i), msg)
             for i in range(len(f._dist_dimensions))]
    scatter = Call('scatter_%s' % key, [bufs] + sizes + [f] + ofss)

    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    rrecv = Byref(FieldFromPointer(msg._C_field_rrecv, msg))
    waitrecv = Call('MPI_Wait', [rrecv, Macro('MPI_STATUS_IGNORE')])
    rsend = Byref(FieldFromPointer(msg._C_field_rsend, msg))
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    iet = List(body=[waitsend, waitrecv, scatter])

    parameters = ([f] + ofss + [fromrank, msg])

    return Callable('wait_%s' % key, iet, 'void', parameters, ('static',))
def _make_sendrecv(self, f, hse, key, msg=None):
    comm = f.grid.distributor._obj_comm

    bufg = FieldFromPointer(msg._C_field_bufg, msg)
    bufs = FieldFromPointer(msg._C_field_bufs, msg)

    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    sizes = [FieldFromPointer('%s[%d]' % (msg._C_field_sizes, i), msg)
             for i in range(len(f._dist_dimensions))]

    gather = Call('gather%s' % key, [bufg] + sizes + [f] + ofsg)

    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather)

    count = reduce(mul, sizes, 1)
    rrecv = Byref(FieldFromPointer(msg._C_field_rrecv, msg))
    rsend = Byref(FieldFromPointer(msg._C_field_rsend, msg))
    recv = IrecvCall([bufs, count, Macro(dtype_to_mpitype(f.dtype)),
                      fromrank, Integer(13), comm, rrecv])
    send = IsendCall([bufg, count, Macro(dtype_to_mpitype(f.dtype)),
                      torank, Integer(13), comm, rsend])

    iet = List(body=[recv, gather, send])

    parameters = ([f] + ofsg + [fromrank, torank, comm, msg])

    return SendRecv(key, iet, parameters, bufg, bufs)
def _make_halowait(self, f, hse, key, msg=None):
    cast = cast_mapper[(f.dtype, '*')]

    fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices}

    dim = Dimension(name='i')

    msgi = IndexedPointer(msg, dim)

    bufs = FieldFromComposite(msg._C_field_bufs, msgi)

    fromrank = FieldFromComposite(msg._C_field_from, msgi)

    sizes = [FieldFromComposite('%s[%d]' % (msg._C_field_sizes, i), msgi)
             for i in range(len(f._dist_dimensions))]
    ofss = [FieldFromComposite('%s[%d]' % (msg._C_field_ofss, i), msgi)
            for i in range(len(f._dist_dimensions))]
    ofss = [fixed.get(d) or ofss.pop(0) for d in f.dimensions]

    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Call('scatter%s' % key, [cast(bufs)] + sizes + [f] + ofss)
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    rrecv = Byref(FieldFromComposite(msg._C_field_rrecv, msgi))
    waitrecv = Call('MPI_Wait', [rrecv, Macro('MPI_STATUS_IGNORE')])
    rsend = Byref(FieldFromComposite(msg._C_field_rsend, msgi))
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    # The -1 below is because an Iteration, by default, generates <=
    ncomms = Symbol(name='ncomms')
    iet = Iteration([waitsend, waitrecv, scatter], dim, ncomms - 1)

    parameters = ([f] + list(fixed.values()) + [msg, ncomms])

    return Callable('halowait%d' % key, iet, 'void', parameters, ('static',))
def _make_poke(self, hs, key, msgs):
    lflag = Symbol(name='lflag')
    gflag = Symbol(name='gflag')

    # Init flags
    body = [Expression(DummyEq(lflag, 0)),
            Expression(DummyEq(gflag, 1))]

    # For each msg, build an Iteration calling MPI_Test on all peers
    for msg in msgs:
        dim = Dimension(name='i')
        msgi = IndexedPointer(msg, dim)

        rrecv = Byref(FieldFromComposite(msg._C_field_rrecv, msgi))
        testrecv = Call('MPI_Test', [rrecv, Byref(lflag), Macro('MPI_STATUS_IGNORE')])

        rsend = Byref(FieldFromComposite(msg._C_field_rsend, msgi))
        testsend = Call('MPI_Test', [rsend, Byref(lflag), Macro('MPI_STATUS_IGNORE')])

        update = AugmentedExpression(DummyEq(gflag, lflag), '&')

        body.append(Iteration([testsend, update, testrecv, update], dim,
                              msg.npeers - 1))

    body.append(Return(gflag))

    return make_efunc('pokempi%d' % key, List(body=body), retval='int')
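# A hedged, plain-Python sketch of the polling semantics assembled above:
# `gflag` starts at 1 and is AND-ed with the outcome of each MPI_Test, so the
# generated routine returns 1 only once every outstanding request has
# completed. The `completed` predicate is a made-up stand-in for MPI_Test.
def poke_sketch(requests, completed):
    gflag = 1
    for r in requests:
        lflag = 1 if completed(r) else 0  # what MPI_Test would write into `lflag`
        gflag &= lflag
    return gflag

assert poke_sketch(['r0', 'r1'], completed=lambda r: True) == 1
assert poke_sketch(['r0', 'r1'], completed=lambda r: r == 'r0') == 0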
def _make_haloupdate(self, f, hse, key, msg=None):
    comm = f.grid.distributor._obj_comm

    fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices}

    dim = Dimension(name='i')

    msgi = IndexedPointer(msg, dim)

    bufg = FieldFromComposite(msg._C_field_bufg, msgi)
    bufs = FieldFromComposite(msg._C_field_bufs, msgi)

    fromrank = FieldFromComposite(msg._C_field_from, msgi)
    torank = FieldFromComposite(msg._C_field_to, msgi)

    sizes = [FieldFromComposite('%s[%d]' % (msg._C_field_sizes, i), msgi)
             for i in range(len(f._dist_dimensions))]
    ofsg = [FieldFromComposite('%s[%d]' % (msg._C_field_ofsg, i), msgi)
            for i in range(len(f._dist_dimensions))]
    ofsg = [fixed.get(d) or ofsg.pop(0) for d in f.dimensions]

    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Call('gather_%s' % key, [bufg] + sizes + [f] + ofsg)
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather)

    # Make Irecv/Isend
    count = reduce(mul, sizes, 1)
    rrecv = Byref(FieldFromComposite(msg._C_field_rrecv, msgi))
    rsend = Byref(FieldFromComposite(msg._C_field_rsend, msgi))
    recv = Call('MPI_Irecv', [bufs, count, Macro(dtype_to_mpitype(f.dtype)),
                              fromrank, Integer(13), comm, rrecv])
    send = Call('MPI_Isend', [bufg, count, Macro(dtype_to_mpitype(f.dtype)),
                              torank, Integer(13), comm, rsend])

    # The -1 below is because an Iteration, by default, generates <=
    ncomms = Symbol(name='ncomms')
    iet = Iteration([recv, gather, send], dim, ncomms - 1)

    parameters = ([f, comm, msg, ncomms]) + list(fixed.values())

    return Callable('haloupdate%d' % key, iet, 'void', parameters, ('static',))
def sendrecv(f, fixed):
    """Construct an IET performing a halo exchange along arbitrary
    dimension and side."""
    assert f.is_Function
    assert f.grid is not None

    comm = f.grid.distributor._C_comm

    buf_dims = [Dimension(name='buf_%s' % d.root) for d in f.dimensions
                if d not in fixed]
    bufg = Array(name='bufg', dimensions=buf_dims, dtype=f.dtype, scope='heap')
    bufs = Array(name='bufs', dimensions=buf_dims, dtype=f.dtype, scope='heap')

    dat_dims = [Dimension(name='dat_%s' % d.root) for d in f.dimensions]
    dat = Array(name='dat', dimensions=dat_dims, dtype=f.dtype, scope='external')

    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]
    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    parameters = [bufg] + list(bufg.shape) + [dat] + list(dat.shape) + ofsg
    gather = Call('gather_%s' % f.name, parameters)
    parameters = [bufs] + list(bufs.shape) + [dat] + list(dat.shape) + ofss
    scatter = Call('scatter_%s' % f.name, parameters)

    # The scatter must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    srecv = MPIStatusObject(name='srecv')
    rrecv = MPIRequestObject(name='rrecv')
    rsend = MPIRequestObject(name='rsend')

    count = reduce(mul, bufs.shape, 1)
    recv = Call('MPI_Irecv', [bufs, count, Macro(numpy_to_mpitypes(f.dtype)),
                              fromrank, '13', comm, rrecv])
    send = Call('MPI_Isend', [bufg, count, Macro(numpy_to_mpitypes(f.dtype)),
                              torank, '13', comm, rsend])

    waitrecv = Call('MPI_Wait', [rrecv, srecv])
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    iet = List(body=[recv, gather, send, waitsend, waitrecv, scatter])
    iet = List(body=[ArrayCast(dat), iet_insert_C_decls(iet)])

    parameters = ([dat] + list(dat.shape) + list(bufs.shape) +
                  ofsg + ofss + [fromrank, torank, comm])

    return Callable('sendrecv_%s' % f.name, iet, 'void', parameters, ('static',))
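# A hedged mpi4py sketch of the exchange pattern built above -- Irecv, guarded
# gather, Isend, the two waits, then a guarded scatter into the halo. The 1D
# decomposition, buffer names and neighbour computation are assumptions made
# for illustration; only the MPI calls and the MPI.PROC_NULL guards mirror the
# generated code.
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank, size = comm.Get_rank(), comm.Get_size()
torank = rank + 1 if rank + 1 < size else MPI.PROC_NULL
fromrank = rank - 1 if rank - 1 >= 0 else MPI.PROC_NULL

f = np.full(10, float(rank), dtype=np.float32)  # owned data; f[0] is the halo
bufg = np.empty(1, dtype=f.dtype)               # gather (send) buffer
bufs = np.empty(1, dtype=f.dtype)               # scatter (recv) buffer

rrecv = comm.Irecv(bufs, source=fromrank, tag=13)
if torank != MPI.PROC_NULL:                     # gather only if there is a receiver
    bufg[0] = f[-2]
rsend = comm.Isend(bufg, dest=torank, tag=13)

rsend.Wait()
rrecv.Wait()
if fromrank != MPI.PROC_NULL:                   # never overwrite domain-boundary halos
    f[0] = bufs[0]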
def _make_thread_init(threads, tfunc, isdata, sdata, sregistry):
    d = threads.index
    if threads.size == 1:
        callback = lambda body: body
    else:
        callback = lambda body: Iteration(body, d, threads.size - 1)

    # A unique identifier for each created pthread
    pthreadid = d + threads.base_id

    # Initialize `sdata`
    arguments = list(isdata.parameters)
    arguments[-3] = sdata.symbolic_base + d
    arguments[-2] = pthreadid
    arguments[-1] = sregistry.deviceid
    call0 = Call(isdata.name, arguments)

    # Create pthreads
    call1 = Call('pthread_create', (threads.symbolic_base + d,
                                    Macro('NULL'),
                                    Call(tfunc.name, [], is_indirect=True),
                                    sdata.symbolic_base + d))

    threadsinit = List(
        header=c.Comment("Fire up and initialize `%s`" % threads.name),
        body=callback([call0, call1])
    )

    return threadsinit
def _make_thread_func(name, iet, root, threads, sregistry):
    # Create the SharedData, that is the data structure that will be used by the
    # main thread to pass information down to the child thread(s)
    required, parameters, dynamic_parameters = diff_parameters(iet, root)
    parameters = sorted(parameters, key=lambda i: i.is_Function)  # Allow casting
    sdata = SharedData(name=sregistry.make_name(prefix='sdata'),
                       npthreads=threads.size, fields=required,
                       dynamic_fields=dynamic_parameters)
    sbase = sdata.symbolic_base
    sid = sdata.symbolic_id

    # Create a Callable to initialize `sdata` with the known const values
    iname = 'init_%s' % sdata.dtype._type_.__name__
    ibody = [DummyExpr(FieldFromPointer(i._C_name, sbase), i._C_symbol)
             for i in parameters]
    ibody.extend([
        BlankLine,
        DummyExpr(FieldFromPointer(sdata._field_id, sbase), sid),
        DummyExpr(FieldFromPointer(sdata._field_flag, sbase), 1)
    ])
    iparameters = parameters + [sdata, sid]
    isdata = Callable(iname, ibody, 'void', iparameters, 'static')

    # Prepend the SharedData fields available upon thread activation
    preactions = [DummyExpr(i, FieldFromPointer(i.name, sbase))
                  for i in dynamic_parameters]

    # Append the flag reset
    postactions = [List(body=[
        BlankLine,
        DummyExpr(FieldFromPointer(sdata._field_flag, sbase), 1)
    ])]

    iet = List(body=preactions + [iet] + postactions)

    # The thread has work to do when it receives the signal that all locks have
    # been set to 0 by the main thread
    iet = Conditional(CondEq(FieldFromPointer(sdata._field_flag, sbase), 2), iet)

    # The thread keeps spinning until the alive flag is set to 0 by the main thread
    iet = While(CondNe(FieldFromPointer(sdata._field_flag, sbase), 0), iet)

    # pthread functions expect exactly one argument, a void*, and must return void*
    tretval = 'void*'
    tparameter = VoidPointer('_%s' % sdata.name)

    # Unpack `sdata`
    unpack = [PointerCast(sdata, tparameter), BlankLine]
    for i in parameters:
        if i.is_AbstractFunction:
            unpack.extend([Dereference(i, sdata), PointerCast(i)])
        else:
            unpack.append(DummyExpr(i, FieldFromPointer(i.name, sbase)))
    unpack.append(DummyExpr(sid, FieldFromPointer(sdata._field_id, sbase)))
    unpack.append(BlankLine)

    iet = List(body=unpack + [iet, BlankLine, Return(Macro('NULL'))])

    tfunc = ThreadFunction(name, iet, tretval, tparameter, 'static')

    return tfunc, isdata, sdata
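# A hedged, plain-Python model of the flag protocol implied by the conditions
# above and by `_make_thread_init`/`_make_thread_finalize` (1 = idle, 2 = work
# pending, 0 = terminate). The semantics are read off the generated conditions;
# this is not Devito's actual runtime.
import threading
import time

def worker(state):
    while state['flag'] != 0:      # keep spinning until asked to terminate
        if state['flag'] == 2:     # work has been handed over
            state['done'] += 1     # ... do the work ...
            state['flag'] = 1      # flag reset: back to idle

state = {'flag': 1, 'done': 0}
t = threading.Thread(target=worker, args=(state,))
t.start()

state['flag'] = 2                  # main thread hands over one unit of work
while state['flag'] == 2:          # wait for completion of the work
    time.sleep(0.001)

state['flag'] = 0                  # ask the thread to terminate
t.join()
assert state['done'] == 1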
def test_null_init():
    grid = Grid(shape=(10, 10))

    u = Function(name='u', grid=grid)

    expr = DummyExpr(u.indexed, Macro('NULL'), init=True)

    assert str(expr) == "float * u = NULL;"
    assert expr.defines == (u.indexed,)
def _make_sendrecv(self, f, hse, key, **kwargs):
    comm = f.grid.distributor._obj_comm

    buf_dims = [Dimension(name='buf_%s' % d.root) for d in f.dimensions
                if d not in hse.loc_indices]
    bufg = Array(name='bufg', dimensions=buf_dims, dtype=f.dtype,
                 padding=0, scope='heap')
    bufs = Array(name='bufs', dimensions=buf_dims, dtype=f.dtype,
                 padding=0, scope='heap')

    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]
    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    gather = Call('gather_%s' % key, [bufg] + list(bufg.shape) + [f] + ofsg)
    scatter = Call('scatter_%s' % key, [bufs] + list(bufs.shape) + [f] + ofss)

    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather)

    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    count = reduce(mul, bufs.shape, 1)
    rrecv = MPIRequestObject(name='rrecv')
    rsend = MPIRequestObject(name='rsend')
    recv = Call('MPI_Irecv', [bufs, count, Macro(dtype_to_mpitype(f.dtype)),
                              fromrank, Integer(13), comm, rrecv])
    send = Call('MPI_Isend', [bufg, count, Macro(dtype_to_mpitype(f.dtype)),
                              torank, Integer(13), comm, rsend])

    waitrecv = Call('MPI_Wait', [rrecv, Macro('MPI_STATUS_IGNORE')])
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    iet = List(body=[recv, gather, send, waitsend, waitrecv, scatter])

    parameters = ([f] + list(bufs.shape) + ofsg + ofss + [fromrank, torank, comm])

    return Callable('sendrecv_%s' % key, iet, 'void', parameters, ('static',))
def _make_poke(self, hs, key, msgs):
    flag = Symbol(name='flag')
    initflag = LocalExpression(DummyEq(flag, 0))

    body = [initflag]
    for msg in msgs:
        dim = Dimension(name='i')
        msgi = IndexedPointer(msg, dim)

        rrecv = Byref(FieldFromComposite(msg._C_field_rrecv, msgi))
        rsend = Byref(FieldFromComposite(msg._C_field_rsend, msgi))
        testrecv = Call('MPI_Test', [rrecv, Byref(flag), Macro('MPI_STATUS_IGNORE')])
        testsend = Call('MPI_Test', [rsend, Byref(flag), Macro('MPI_STATUS_IGNORE')])

        body.append(Iteration([testsend, testrecv], dim, msg.npeers - 1))

    return make_efunc('pokempi%d' % key, body)
def _make_thread_finalize(threads, sdata):
    d = threads.index
    if threads.size == 1:
        callback = lambda body: body
    else:
        callback = lambda body: Iteration(body, d, threads.size - 1)

    threadswait = List(
        header=c.Comment("Wait for completion of `%s`" % threads.name),
        body=callback([
            While(CondEq(FieldFromComposite(sdata._field_flag, sdata[d]), 2)),
            DummyExpr(FieldFromComposite(sdata._field_flag, sdata[d]), 0),
            Call('pthread_join', (threads[d], Macro('NULL')))
        ])
    )

    return threadswait
def _(iet):
    # TODO: we need to pick the rank from `comm_shm`, not `comm`,
    # so that we have nranks == ngpus (as long as the user has launched
    # the right number of MPI processes per node given the available
    # number of GPUs per node)
    objcomm = None
    for i in iet.parameters:
        if isinstance(i, MPICommObject):
            objcomm = i
            break

    deviceid = DeviceID()
    device_nvidia = Macro('acc_device_nvidia')
    if objcomm is not None:
        rank = Symbol(name='rank')
        rank_decl = LocalExpression(DummyEq(rank, 0))
        rank_init = Call('MPI_Comm_rank', [objcomm, Byref(rank)])

        ngpus = Symbol(name='ngpus')
        call = DefFunction('acc_get_num_devices', device_nvidia)
        ngpus_init = LocalExpression(DummyEq(ngpus, call))

        asdn_then = Call('acc_set_device_num', [deviceid, device_nvidia])
        asdn_else = Call('acc_set_device_num', [rank % ngpus, device_nvidia])

        body = [Call('acc_init', [device_nvidia]),
                Conditional(
                    CondNe(deviceid, -1),
                    asdn_then,
                    List(body=[rank_decl, rank_init, ngpus_init, asdn_else])
                )]
    else:
        body = [Call('acc_init', [device_nvidia]),
                Conditional(
                    CondNe(deviceid, -1),
                    Call('acc_set_device_num', [deviceid, device_nvidia])
                )]

    init = List(header=c.Comment('Begin of OpenACC+MPI setup'),
                body=body,
                footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()))

    iet = iet._rebuild(body=(init,) + iet.body)

    return iet, {'args': deviceid}
def new_ops_arg(self, indexed):
    """
    Create an :class:`Indexed` node using OPS representation.

    Parameters
    ----------
    indexed : :class:`Indexed`
        Indexed object using devito representation.

    Returns
    -------
    :class:`Indexed`
        Indexed node using OPS representation.
    """
    # Build the OPS arg identifier
    time_index = split_affine(indexed.indices[TimeFunction._time_position])
    ops_arg_id = '%s%s%s' % (indexed.name, time_index.var, time_index.shift)

    if ops_arg_id not in self.ops_args:
        # Create the indexed object
        ops_arg = Array(name=ops_arg_id,
                        dimensions=[Dimension(name=namespace['ops_acc'])],
                        dtype=indexed.dtype)
        self.ops_args[ops_arg_id] = ops_arg
    else:
        ops_arg = self.ops_args[ops_arg_id]

    # Get the space indices
    space_indices = [e for i, e in enumerate(indexed.indices)
                     if i != TimeFunction._time_position]

    # Define the Macro used in OPS arg index access
    access_macro = Macro('OPS_ACC%d(%s)' % (list(self.ops_args).index(ops_arg_id),
                                            ','.join(str(split_affine(i).shift)
                                                     for i in space_indices)))

    # Create Indexed object representing the OPS arg access
    new_indexed = Indexed(ops_arg.indexed, access_macro)

    return new_indexed
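# A hedged, pure-string illustration of the identifiers built above for an
# access like u[t + 1, x - 1, y]. The affine split into (var, shift) pairs is
# written out by hand here rather than obtained via `split_affine`.
name = 'u'
time_var, time_shift = 't', 1                   # from the time index t + 1
space_shifts = [-1, 0]                          # from the space indices x - 1, y
ops_arg_id = '%s%s%s' % (name, time_var, time_shift)
access_macro = 'OPS_ACC%d(%s)' % (0, ','.join(str(s) for s in space_shifts))
assert ops_arg_id == 'ut1'
assert access_macro == 'OPS_ACC0(-1,0)'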
def _initialize(iet):
    # TODO: we need to pick the rank from `comm_shm`, not `comm`,
    # so that we have nranks == ngpus (as long as the user has launched
    # the right number of MPI processes per node given the available
    # number of GPUs per node)
    comm = None
    for i in iet.parameters:
        if isinstance(i, MPICommObject):
            comm = i
            break

    device_nvidia = Macro('acc_device_nvidia')
    body = Call('acc_init', [device_nvidia])

    if comm is not None:
        rank = Symbol(name='rank')
        rank_decl = LocalExpression(DummyEq(rank, 0))
        rank_init = Call('MPI_Comm_rank', [comm, Byref(rank)])

        ngpus = Symbol(name='ngpus')
        call = DefFunction('acc_get_num_devices', device_nvidia)
        ngpus_init = LocalExpression(DummyEq(ngpus, call))

        devicenum = Symbol(name='devicenum')
        devicenum_init = LocalExpression(DummyEq(devicenum, rank % ngpus))

        set_device_num = Call('acc_set_device_num', [devicenum, device_nvidia])

        body = [rank_decl, rank_init, ngpus_init, devicenum_init,
                set_device_num, body]

    init = List(header=c.Comment('Begin of OpenACC+MPI setup'),
                body=body,
                footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()))

    iet = iet._rebuild(body=(init,) + iet.body)

    return iet
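# A hedged sketch of the rank -> GPU assignment encoded above
# (devicenum = rank % ngpus), in plain Python rather than the emitted C.
def assign_device(rank, ngpus):
    return rank % ngpus

# e.g. six MPI ranks sharing a node with four GPUs
assert [assign_device(r, 4) for r in range(6)] == [0, 1, 2, 3, 0, 1]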
# OPS API
namespace['ops_init'] = 'ops_init'
namespace['ops_partition'] = 'ops_partition'
namespace['ops_timing_output'] = 'ops_timing_output'
namespace['ops_exit'] = 'ops_exit'

namespace['ops_par_loop'] = 'ops_par_loop'

namespace['ops_dat_fetch_data'] = lambda ops_dat, data: Call(
    name='ops_dat_fetch_data', arguments=[ops_dat, 0, data])

namespace['ops_decl_stencil'] = Function(name='ops_decl_stencil')
namespace['ops_decl_block'] = Function(name='ops_decl_block')
namespace['ops_decl_dat'] = Function(name='ops_decl_dat')
namespace['ops_arg_dat'] = Function(name='ops_arg_dat')
namespace['ops_arg_gbl'] = Function(name='ops_arg_gbl')

namespace['ops_read'] = Macro('OPS_READ')
namespace['ops_write'] = Macro('OPS_WRITE')

namespace['ops_stencil_type'] = 'ops_stencil'
namespace['ops_block_type'] = 'ops_block'
namespace['ops_dat_type'] = 'ops_dat'

# Naming conventions
namespace['ops_define_dimension'] = lambda i: '#define OPS_%sD' % i
namespace['ops_kernel'] = lambda i: 'OPS_Kernel_%s' % i
namespace['ops_stencil_name'] = lambda dims, name, pts: 's%dd_%s_%dpt' % (
    dims, name, pts)
namespace['ops_dat_dim'] = lambda i: '%s_dim' % i
namespace['ops_dat_base'] = lambda i: '%s_base' % i
namespace['ops_dat_d_p'] = lambda i: '%s_d_p' % i
namespace['ops_dat_d_m'] = lambda i: '%s_d_m' % i
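# Quick sanity checks of the naming-convention helpers defined above; the
# expected strings follow directly from the lambdas (the argument values are
# just examples).
assert namespace['ops_define_dimension'](3) == '#define OPS_3D'
assert namespace['ops_kernel'](0) == 'OPS_Kernel_0'
assert namespace['ops_stencil_name'](2, 'u', 5) == 's2d_u_5pt'
assert namespace['ops_dat_dim']('u') == 'u_dim'
assert namespace['ops_dat_base']('u') == 'u_base'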
class AccBB(PragmaLangBB):

    mapper = {
        # Misc
        'name': 'OpenACC',
        'header': 'openacc.h',
        # Platform mapping
        AMDGPUX: Macro('acc_device_radeon'),
        NVIDIAX: Macro('acc_device_nvidia'),
        # Runtime library
        'init': lambda args: Call('acc_init', args),
        'num-devices': lambda args: DefFunction('acc_get_num_devices', args),
        'set-device': lambda args: Call('acc_set_device_num', args),
        # Pragmas
        'atomic': c.Pragma('acc atomic update'),
        'map-enter-to': lambda i, j:
            c.Pragma('acc enter data copyin(%s%s)' % (i, j)),
        'map-enter-to-wait': lambda i, j, k:
            (c.Pragma('acc enter data copyin(%s%s) async(%s)' % (i, j, k)),
             c.Pragma('acc wait(%s)' % k)),
        'map-enter-alloc': lambda i, j:
            c.Pragma('acc enter data create(%s%s)' % (i, j)),
        'map-present': lambda i, j:
            c.Pragma('acc data present(%s%s)' % (i, j)),
        'map-wait': lambda i:
            c.Pragma('acc wait(%s)' % i),
        'map-update': lambda i, j:
            c.Pragma('acc exit data copyout(%s%s)' % (i, j)),
        'map-update-host': lambda i, j:
            c.Pragma('acc update self(%s%s)' % (i, j)),
        'map-update-host-async': lambda i, j, k:
            c.Pragma('acc update self(%s%s) async(%s)' % (i, j, k)),
        'map-update-device': lambda i, j:
            c.Pragma('acc update device(%s%s)' % (i, j)),
        'map-update-device-async': lambda i, j, k:
            c.Pragma('acc update device(%s%s) async(%s)' % (i, j, k)),
        'map-release': lambda i, j, k:
            c.Pragma('acc exit data delete(%s%s)%s' % (i, j, k)),
        'map-exit-delete': lambda i, j, k:
            c.Pragma('acc exit data delete(%s%s)%s' % (i, j, k)),
        'memcpy-to-device': lambda i, j, k:
            Call('acc_memcpy_to_device', [i, j, k]),
        'memcpy-to-device-wait': lambda i, j, k, l:
            List(body=[Call('acc_memcpy_to_device_async', [i, j, k, l]),
                       Call('acc_wait', [l])]),
        'device-get': Call('acc_get_device_num'),
        'device-alloc': lambda i, *a, retobj=None:
            Call('acc_malloc', (i,), retobj=retobj, cast=True),
        'device-free': lambda i, *a:
            Call('acc_free', (i,))
    }
    mapper.update(CBB.mapper)

    Region = OmpRegion
    HostIteration = OmpIteration  # Host parallelism still goes via OpenMP
    DeviceIteration = DeviceAccIteration

    @classmethod
    def _map_to_wait(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.mapper['map-enter-to-wait'](f.name, sections, queueid)

    @classmethod
    def _map_present(cls, f, imask=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.mapper['map-present'](f.name, sections)

    @classmethod
    def _map_delete(cls, f, imask=None, devicerm=None):
        sections = cls._make_sections_from_imask(f, imask)
        if devicerm is not None:
            cond = ' if(%s)' % devicerm.name
        else:
            cond = ''
        return cls.mapper['map-exit-delete'](f.name, sections, cond)

    @classmethod
    def _map_update_host_async(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.mapper['map-update-host-async'](f.name, sections, queueid)

    @classmethod
    def _map_update_device_async(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.mapper['map-update-device-async'](f.name, sections, queueid)
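# A hedged usage sketch of the pragma-building lambdas above. The section
# string is written by hand; inside the class it would come from
# `_make_sections_from_imask`.
pragma = AccBB.mapper['map-update-host-async']('u', '[t0][0:x_size]', 'q0')
# `pragma` is a cgen Pragma rendering as:
#   #pragma acc update self(u[t0][0:x_size]) async(q0)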
def _make_sendrecv(self, f, fixed, extra=None):
    extra = extra or []
    comm = f.grid.distributor._obj_comm

    buf_dims = [Dimension(name='buf_%s' % d.root) for d in f.dimensions
                if d not in fixed]
    bufg = Array(name='bufg', dimensions=buf_dims, dtype=f.dtype, scope='heap')
    bufs = Array(name='bufs', dimensions=buf_dims, dtype=f.dtype, scope='heap')

    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]
    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    args = [bufg] + list(bufg.shape) + [f] + ofsg + extra
    gather = Call('gather%dd' % f.ndim, args)
    args = [bufs] + list(bufs.shape) + [f] + ofss + extra
    scatter = Call('scatter%dd' % f.ndim, args)

    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather)

    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    srecv = MPIStatusObject(name='srecv')
    ssend = MPIStatusObject(name='ssend')
    rrecv = MPIRequestObject(name='rrecv')
    rsend = MPIRequestObject(name='rsend')

    count = reduce(mul, bufs.shape, 1)
    recv = Call('MPI_Irecv', [bufs, count, Macro(dtype_to_mpitype(f.dtype)),
                              fromrank, Integer(13), comm, rrecv])
    send = Call('MPI_Isend', [bufg, count, Macro(dtype_to_mpitype(f.dtype)),
                              torank, Integer(13), comm, rsend])

    waitrecv = Call('MPI_Wait', [rrecv, srecv])
    waitsend = Call('MPI_Wait', [rsend, ssend])

    iet = List(body=[recv, gather, send, waitsend, waitrecv, scatter])
    iet = List(body=iet_insert_C_decls(iet))

    parameters = ([f] + list(bufs.shape) + ofsg + ofss +
                  [fromrank, torank, comm] + extra)

    return Callable('sendrecv%dd' % f.ndim, iet, 'void', parameters, ('static',))