def _initialize(iet):
    comm = None
    for i in iet.parameters:
        if isinstance(i, MPICommObject):
            comm = i
            break

    if comm is not None:
        rank = Symbol(name='rank')
        rank_decl = LocalExpression(DummyEq(rank, 0))
        rank_init = Call('MPI_Comm_rank', [comm, Byref(rank)])

        ngpus = Symbol(name='ngpus')
        call = Function('omp_get_num_devices')()
        ngpus_init = LocalExpression(DummyEq(ngpus, call))

        set_device_num = Call('omp_set_default_device', [rank % ngpus])

        body = [rank_decl, rank_init, ngpus_init, set_device_num]

        init = List(header=c.Comment('Begin of OpenMP+MPI setup'), body=body,
                    footer=(c.Comment('End of OpenMP+MPI setup'), c.Line()))

        iet = iet._rebuild(body=(init,) + iet.body)

    return iet
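# --- Illustrative sketch (not Devito code): the rank-to-device binding policy the
# setup block above generates in C, rewritten with mpi4py for clarity. `NGPUS_PER_NODE`
# is a hypothetical placeholder for what omp_get_num_devices() would return at runtime.
from mpi4py import MPI

NGPUS_PER_NODE = 4  # assumption; the generated code queries omp_get_num_devices()

def pick_default_device(comm=MPI.COMM_WORLD, ngpus=NGPUS_PER_NODE):
    rank = comm.Get_rank()
    # Round-robin binding of MPI ranks to GPUs, mirroring
    # omp_set_default_device(rank % ngpus) in the generated setup block
    return rank % ngpus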
def _make_poke(self, hs, key, msgs):
    lflag = Symbol(name='lflag')
    gflag = Symbol(name='gflag')

    # Init flags
    body = [Expression(DummyEq(lflag, 0)),
            Expression(DummyEq(gflag, 1))]

    # For each msg, build an Iteration calling MPI_Test on all peers
    for msg in msgs:
        dim = Dimension(name='i')
        msgi = IndexedPointer(msg, dim)

        rrecv = Byref(FieldFromComposite(msg._C_field_rrecv, msgi))
        testrecv = Call('MPI_Test', [rrecv, Byref(lflag), Macro('MPI_STATUS_IGNORE')])

        rsend = Byref(FieldFromComposite(msg._C_field_rsend, msgi))
        testsend = Call('MPI_Test', [rsend, Byref(lflag), Macro('MPI_STATUS_IGNORE')])

        update = AugmentedExpression(DummyEq(gflag, lflag), '&')

        body.append(Iteration([testsend, update, testrecv, update], dim,
                              msg.npeers - 1))

    body.append(Return(gflag))

    return make_efunc('pokempi%d' % key, List(body=body), retval='int')
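# --- Illustrative sketch (not Devito code): the polling semantics `pokempi*`
# implements, expressed with mpi4py. Each pending request is tested without
# blocking (like MPI_Test) and the return value is the AND of all local flags,
# mirroring `gflag &= lflag` in the generated loop.
from mpi4py import MPI

def poke(requests):
    gflag = True
    for req in requests:
        lflag = req.Test()   # non-blocking completion check
        gflag = gflag and lflag
    return gflag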
def _make_wait(self, f, hse, key, msg=None):
    bufs = FieldFromPointer(msg._C_field_bufs, msg)

    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')

    sizes = [FieldFromPointer('%s[%d]' % (msg._C_field_sizes, i), msg)
             for i in range(len(f._dist_dimensions))]
    scatter = Call('scatter_%s' % key, [bufs] + sizes + [f] + ofss)

    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    rrecv = Byref(FieldFromPointer(msg._C_field_rrecv, msg))
    waitrecv = Call('MPI_Wait', [rrecv, Macro('MPI_STATUS_IGNORE')])
    rsend = Byref(FieldFromPointer(msg._C_field_rsend, msg))
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    iet = List(body=[waitsend, waitrecv, scatter])

    parameters = [f] + ofss + [fromrank, msg]

    return Callable('wait_%s' % key, iet, 'void', parameters, ('static',))
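# --- Illustrative sketch (not Devito code): why the `scatter` above is guarded.
# A receive posted with source == MPI_PROC_NULL completes immediately and leaves
# the buffer untouched, so unpacking it would clobber valid boundary halo values.
# mpi4py sketch; `halo` and `buf` are hypothetical numpy arrays.
from mpi4py import MPI

def wait_and_scatter(req, fromrank, halo, buf):
    req.Wait()                        # MPI_Wait on the receive request
    if fromrank != MPI.PROC_NULL:     # CondNe(fromrank, MPI_PROC_NULL)
        halo[...] = buf               # scatter only if a real neighbour sent data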
def _make_sendrecv(self, f, hse, key, msg=None):
    comm = f.grid.distributor._obj_comm

    bufg = FieldFromPointer(msg._C_field_bufg, msg)
    bufs = FieldFromPointer(msg._C_field_bufs, msg)

    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    sizes = [FieldFromPointer('%s[%d]' % (msg._C_field_sizes, i), msg)
             for i in range(len(f._dist_dimensions))]

    gather = Call('gather_%s' % key, [bufg] + sizes + [f] + ofsg)
    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather)

    count = reduce(mul, sizes, 1)
    rrecv = Byref(FieldFromPointer(msg._C_field_rrecv, msg))
    rsend = Byref(FieldFromPointer(msg._C_field_rsend, msg))
    recv = Call('MPI_Irecv', [bufs, count, Macro(dtype_to_mpitype(f.dtype)),
                              fromrank, Integer(13), comm, rrecv])
    send = Call('MPI_Isend', [bufg, count, Macro(dtype_to_mpitype(f.dtype)),
                              torank, Integer(13), comm, rsend])

    iet = List(body=[recv, gather, send])

    parameters = [f] + ofsg + [fromrank, torank, comm, msg]

    return SendRecv(key, iet, parameters, bufg, bufs)
def _make_halowait(self, f, hse, key, msg=None):
    cast = cast_mapper[(f.dtype, '*')]

    fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices}

    dim = Dimension(name='i')

    msgi = IndexedPointer(msg, dim)

    bufs = FieldFromComposite(msg._C_field_bufs, msgi)

    fromrank = FieldFromComposite(msg._C_field_from, msgi)

    sizes = [FieldFromComposite('%s[%d]' % (msg._C_field_sizes, i), msgi)
             for i in range(len(f._dist_dimensions))]
    ofss = [FieldFromComposite('%s[%d]' % (msg._C_field_ofss, i), msgi)
            for i in range(len(f._dist_dimensions))]
    ofss = [fixed.get(d) or ofss.pop(0) for d in f.dimensions]

    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Call('scatter%s' % key, [cast(bufs)] + sizes + [f] + ofss)
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    rrecv = Byref(FieldFromComposite(msg._C_field_rrecv, msgi))
    waitrecv = Call('MPI_Wait', [rrecv, Macro('MPI_STATUS_IGNORE')])
    rsend = Byref(FieldFromComposite(msg._C_field_rsend, msgi))
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    # The -1 below is because an Iteration, by default, generates <=
    ncomms = Symbol(name='ncomms')
    iet = Iteration([waitsend, waitrecv, scatter], dim, ncomms - 1)

    parameters = [f] + list(fixed.values()) + [msg, ncomms]

    return Callable('halowait%d' % key, iet, 'void', parameters, ('static',))
def test_nested_calls_cgen(self):
    call = Call('foo', [
        Call('bar', [])
    ])

    code = CGen().visit(call)

    assert str(code) == 'foo(bar());'
def instrument(self, iet):
    sections = FindNodes(Section).visit(iet)

    # Transform the Iteration/Expression tree introducing Advisor calls that
    # resume and stop data collection
    mapper = {i: List(body=[Call(self._api_resume), i, Call(self._api_pause)])
              for i in sections}
    iet = Transformer(mapper).visit(iet)

    return iet
def test_find_symbols_nested(mode, expected):
    grid = Grid(shape=(4, 4, 4))
    call = Call('foo', [
        Call('bar', [Symbol(name='x'),
                     Call('baz', [Function(name='f', grid=grid)])])
    ])

    found = FindSymbols(mode).visit(call)

    assert [f.name for f in found] == eval(expected)
def _specialize_iet(self, iet, **kwargs):
    warning("The OPS backend is still work-in-progress")

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])

    ops_block = OpsBlock('block')

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for section, trees in find_affine_trees(iet).items():
        dims.append(len(trees[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(trees[0].root))
        symbols -= set(FindSymbols('defines').visit(trees[0].root))
        to_dat |= symbols

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            continue
        pre_time_loop.extend(create_ops_dat(f, name_to_ops_dat, ops_block))

    for n, (section, trees) in enumerate(find_affine_trees(iet).items()):
        pre_loop, ops_kernel = opsit(trees, n)
        pre_time_loop.extend(pre_loop)
        self._ops_kernels.append(ops_kernel)

    assert all(d == dims[0] for d in dims), \
        "The OPS backend currently assumes that all kernels " \
        "have the same number of dimensions"

    ops_block_init = Expression(ClusterizedEq(Eq(
        ops_block,
        namespace['ops_decl_block'](dims[0], Literal('"block"'))
    )))

    self._headers.append(namespace['ops_define_dimension'](dims[0]))
    self._includes.append('stdio.h')

    body = [ops_init, ops_block_init, *pre_time_loop, ops_partition, iet, ops_exit]

    return List(body=body)
def _make_sendrecv(self, f, hse, key, **kwargs):
    comm = f.grid.distributor._obj_comm

    buf_dims = [Dimension(name='buf_%s' % d.root) for d in f.dimensions
                if d not in hse.loc_indices]
    bufg = Array(name='bufg', dimensions=buf_dims, dtype=f.dtype, padding=0)
    bufs = Array(name='bufs', dimensions=buf_dims, dtype=f.dtype, padding=0)

    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]
    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    gather = Call('gather%s' % key, [bufg] + list(bufg.shape) + [f] + ofsg)
    scatter = Call('scatter%s' % key, [bufs] + list(bufs.shape) + [f] + ofss)

    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather)
    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    count = reduce(mul, bufs.shape, 1)
    rrecv = MPIRequestObject(name='rrecv')
    rsend = MPIRequestObject(name='rsend')
    recv = IrecvCall([bufs, count, Macro(dtype_to_mpitype(f.dtype)),
                      fromrank, Integer(13), comm, rrecv])
    send = IsendCall([bufg, count, Macro(dtype_to_mpitype(f.dtype)),
                      torank, Integer(13), comm, rsend])

    waitrecv = Call('MPI_Wait', [rrecv, Macro('MPI_STATUS_IGNORE')])
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    iet = List(body=[recv, gather, send, waitsend, waitrecv, scatter])

    parameters = ([f] + list(bufs.shape) + ofsg + ofss +
                  [fromrank, torank, comm])

    return SendRecv(key, iet, parameters, bufg, bufs)
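# --- Illustrative sketch (not Devito code): the communication pattern the SendRecv
# body above encodes, written with mpi4py/numpy. Post the receive first, pack (gather)
# and post the send, wait on both requests, then unpack (scatter). `owned_slice` and
# `halo_slice` are hypothetical stand-ins for the generated offsets/sizes.
import numpy as np
from mpi4py import MPI

def sendrecv(comm, f, owned_slice, halo_slice, torank, fromrank, tag=13):
    bufs = np.empty(f[halo_slice].shape, dtype=f.dtype)   # receive buffer
    rrecv = comm.Irecv(bufs, source=fromrank, tag=tag)    # MPI_Irecv
    bufg = np.ascontiguousarray(f[owned_slice])           # gather (pack)
    rsend = comm.Isend(bufg, dest=torank, tag=tag)        # MPI_Isend
    MPI.Request.Waitall([rsend, rrecv])                    # MPI_Wait x2
    if fromrank != MPI.PROC_NULL:                          # guard the scatter
        f[halo_slice] = bufs                               # scatter (unpack)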
def update_halo(f, fixed):
    """
    Construct an IET performing a halo exchange for a :class:`TensorFunction`.
    """
    # Requirements
    assert f.is_Function
    assert f.grid is not None

    distributor = f.grid.distributor
    nb = distributor._C_neighbours.obj
    comm = distributor._C_comm

    fixed = {d: Symbol(name="o%s" % d.root) for d in fixed}

    mapper = get_views(f, fixed)

    body = []
    masks = []
    for d in f.dimensions:
        if d in fixed:
            continue

        rpeer = FieldFromPointer("%sright" % d, nb)
        lpeer = FieldFromPointer("%sleft" % d, nb)

        # Sending to left, receiving from right
        lsizes, loffsets = mapper[(d, LEFT, OWNED)]
        rsizes, roffsets = mapper[(d, RIGHT, HALO)]
        assert lsizes == rsizes
        sizes = lsizes
        parameters = ([f] + list(f.symbolic_shape) + sizes + loffsets +
                      roffsets + [rpeer, lpeer, comm])
        call = Call('sendrecv_%s' % f.name, parameters)
        mask = Symbol(name='m%sl' % d)
        body.append(Conditional(mask, call))
        masks.append(mask)

        # Sending to right, receiving from left
        rsizes, roffsets = mapper[(d, RIGHT, OWNED)]
        lsizes, loffsets = mapper[(d, LEFT, HALO)]
        assert rsizes == lsizes
        sizes = rsizes
        parameters = ([f] + list(f.symbolic_shape) + sizes + roffsets +
                      loffsets + [lpeer, rpeer, comm])
        call = Call('sendrecv_%s' % f.name, parameters)
        mask = Symbol(name='m%sr' % d)
        body.append(Conditional(mask, call))
        masks.append(mask)

    iet = List(body=body)

    parameters = ([f] + masks + [comm, nb] + list(fixed.values()) +
                  [d.symbolic_size for d in f.dimensions])

    return Callable('halo_exchange_%s' % f.name, iet, 'void', parameters, ('static',))
def _make_haloupdate(self, f, hse, key, msg=None):
    comm = f.grid.distributor._obj_comm

    fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices}

    dim = Dimension(name='i')

    msgi = IndexedPointer(msg, dim)

    bufg = FieldFromComposite(msg._C_field_bufg, msgi)
    bufs = FieldFromComposite(msg._C_field_bufs, msgi)

    fromrank = FieldFromComposite(msg._C_field_from, msgi)
    torank = FieldFromComposite(msg._C_field_to, msgi)

    sizes = [FieldFromComposite('%s[%d]' % (msg._C_field_sizes, i), msgi)
             for i in range(len(f._dist_dimensions))]
    ofsg = [FieldFromComposite('%s[%d]' % (msg._C_field_ofsg, i), msgi)
            for i in range(len(f._dist_dimensions))]
    ofsg = [fixed.get(d) or ofsg.pop(0) for d in f.dimensions]

    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Call('gather_%s' % key, [bufg] + sizes + [f] + ofsg)
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather)

    # Make Irecv/Isend
    count = reduce(mul, sizes, 1)
    rrecv = Byref(FieldFromComposite(msg._C_field_rrecv, msgi))
    rsend = Byref(FieldFromComposite(msg._C_field_rsend, msgi))
    recv = Call('MPI_Irecv', [bufs, count, Macro(dtype_to_mpitype(f.dtype)),
                              fromrank, Integer(13), comm, rrecv])
    send = Call('MPI_Isend', [bufg, count, Macro(dtype_to_mpitype(f.dtype)),
                              torank, Integer(13), comm, rsend])

    # The -1 below is because an Iteration, by default, generates <=
    ncomms = Symbol(name='ncomms')
    iet = Iteration([recv, gather, send], dim, ncomms - 1)

    parameters = [f, comm, msg, ncomms] + list(fixed.values())

    return Callable('haloupdate%d' % key, iet, 'void', parameters, ('static',))
def _call_sendrecv(self, name, *args, msg=None, haloid=None):
    # Drop `sizes` as this HaloExchangeBuilder conveys them through `msg`
    # Drop `ofss` as this HaloExchangeBuilder only needs them in `wait()`,
    # to collect and scatter the result of an MPI_Irecv
    f, _, ofsg, _, fromrank, torank, comm = args
    msg = Byref(IndexedPointer(msg, haloid))
    return Call(name, [f] + ofsg + [fromrank, torank, comm, msg])
def _make_fetchupdate(self, iet, sync_ops, pieces, *args):
    # Construct fetches
    postactions = []
    for s in sync_ops:
        # The condition is already encoded in `iet` with a Conditional,
        # which stems from the originating Cluster's guards
        assert s.fcond is None

        imask = [(s.tstore, s.size) if d.root is s.dim.root else FULL
                 for d in s.dimensions]
        postactions.append(PragmaTransfer(self.lang._map_update_device,
                                          s.target, imask=imask))

    # Turn init IET into a Callable
    functions = filter_ordered(flatten([(s.target, s.function) for s in sync_ops]))
    name = self.sregistry.make_name(prefix='init_device')
    body = List(body=iet.body + tuple(postactions))
    parameters = filter_sorted(functions + derive_parameters(body))
    func = Callable(name, body, 'void', parameters, 'static')
    pieces.funcs.append(func)

    # Perform initial fetch by the main thread
    iet = List(header=c.Comment("Initialize data stream"),
               body=Call(name, parameters))

    return iet
def _make_halowait(self, f, hse, key, msg=None):
    nb = f.grid.distributor._obj_neighborhood

    wait = self._cache_dims[f.dimensions][2]

    fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices}

    # Only retain the halos required by the Diag scheme
    # Note: `sorted` is only for deterministic code generation
    halos = sorted(i for i in hse.halos if isinstance(i.dim, tuple))

    body = []
    for dims, tosides in halos:
        mapper = OrderedDict(zip(dims, [i.flip() for i in tosides]))

        fromrank = FieldFromPointer(''.join(i.name[0] for i in mapper.values()), nb)

        ofss = [fixed.get(d, f._C_get_field(HALO, d, mapper.get(d)).offset)
                for d in f.dimensions]

        msgi = Byref(IndexedPointer(msg, len(body)))

        body.append(Call(wait.name, [f] + ofss + [fromrank, msgi]))

    iet = List(body=body)

    parameters = [f] + list(fixed.values()) + [nb, msg]

    return Callable('halowait%d' % key, iet, 'void', parameters, ('static',))
def _(iet):
    # TODO: we need to pick the rank from `comm_shm`, not `comm`,
    # so that we have nranks == ngpus (as long as the user has launched
    # the right number of MPI processes per node given the available
    # number of GPUs per node)
    objcomm = None
    for i in iet.parameters:
        if isinstance(i, MPICommObject):
            objcomm = i
            break

    deviceid = DeviceID()
    if objcomm is not None:
        rank = Symbol(name='rank')
        rank_decl = LocalExpression(DummyEq(rank, 0))
        rank_init = Call('MPI_Comm_rank', [objcomm, Byref(rank)])

        ngpus = Symbol(name='ngpus')
        call = Function('omp_get_num_devices')()
        ngpus_init = LocalExpression(DummyEq(ngpus, call))

        osdd_then = Call('omp_set_default_device', [deviceid])
        osdd_else = Call('omp_set_default_device', [rank % ngpus])

        body = [Conditional(
            CondNe(deviceid, -1),
            osdd_then,
            List(body=[rank_decl, rank_init, ngpus_init, osdd_else]),
        )]
    else:
        body = [Conditional(CondNe(deviceid, -1),
                            Call('omp_set_default_device', [deviceid]))]

    init = List(header=c.Comment('Begin of OpenMP+MPI setup'), body=body,
                footer=(c.Comment('End of OpenMP+MPI setup'), c.Line()))
    iet = iet._rebuild(body=(init,) + iet.body)

    return iet, {'args': deviceid}
def make(self, halo_spots):
    """
    Construct Callables and Calls implementing a halo exchange for the
    provided HaloSpots.

    For each (unique) HaloSpot, three Callables are built:

        * ``update_halo``, to be called when a halo exchange is necessary;
        * ``sendrecv``, called multiple times by ``update_halo``;
        * ``copy``, called twice by ``sendrecv``, to implement, for example,
          data gathering prior to an MPI_Send, and data scattering following
          an MPI recv.
    """
    calls = OrderedDict()
    generated = OrderedDict()
    for hs in halo_spots:
        for f, v in hs.fmapper.items():
            # Sanity check
            assert f.is_Function
            assert f.grid is not None

            # Callables construction
            # ----------------------
            # Note: to construct the halo exchange Callables, use the generic `df`,
            # instead of `f`, so that we don't need to regenerate code for Functions
            # that are symbolically identical to `f` except for the name
            df = f.__class__.__base__(name='a', grid=f.grid, shape=f.shape_global,
                                      dimensions=f.dimensions)

            # `gather`, `scatter`, `sendrecv` are generic by construction -- they
            # only need to be generated once for each `ndim`
            if f.ndim not in generated:
                gather, extra = self._make_copy(df, v.loc_indices)
                scatter, _ = self._make_copy(df, v.loc_indices, swap=True)
                sendrecv = self._make_sendrecv(df, v.loc_indices, extra)
                generated[f.ndim] = [gather, scatter, sendrecv]

            # `haloupdate` is generic by construction -- it only needs to be
            # generated once for each (`ndim`, `mask`)
            if (f.ndim, v) not in generated:
                uniquekey = len([i for i in generated if isinstance(i, tuple)])
                generated[(f.ndim, v)] = [self._make_haloupdate(df, v.loc_indices,
                                                                hs.mask[f], extra,
                                                                uniquekey)]

            # `haloupdate` Call construction
            comm = f.grid.distributor._obj_comm
            nb = f.grid.distributor._obj_neighborhood
            loc_indices = list(v.loc_indices.values())
            args = [f, comm, nb] + loc_indices + extra
            call = Call(generated[(f.ndim, v)][0].name, args)
            calls.setdefault(hs, []).append(call)

    return flatten(generated.values()), calls
def __make_init_threads(self, threads, sdata, tfunc, pieces):
    d = threads.index
    if threads.size == 1:
        callback = lambda body: body
    else:
        callback = lambda body: Iteration(body, d, threads.size - 1)

    idinit = DummyExpr(FieldFromComposite(sdata._field_id, sdata[d]),
                       1 + sum(i.size for i in pieces.threads) + d)
    arguments = list(tfunc.parameters)
    arguments[-1] = sdata.symbolic_base + d
    call = Call('std::thread', Call(tfunc.name, arguments, is_indirect=True),
                retobj=threads[d])
    threadsinit = List(
        header=c.Comment("Fire up and initialize `%s`" % threads.name),
        body=callback([idinit, call])
    )

    return threadsinit
def get_ops_args(args, stencils, name_to_dat):
    ops_args = []

    for arg in args:
        if arg.is_Constant:
            ops_args.append(Call("ops_arg_gbl",
                                 [Byref(Constant(name=arg.name[1:])),
                                  1,
                                  String(dtype_to_cstr(arg.dtype)),
                                  OPS_READ],
                                 False))
        else:
            ops_args.append(Call("ops_arg_dat",
                                 [name_to_dat[arg.name],
                                  1,
                                  stencils[arg.name],
                                  String(dtype_to_cstr(arg.dtype)),
                                  OPS_WRITE if arg.is_Write else OPS_READ],
                                 False))

    return ops_args
def test_call_indexed():
    grid = Grid(shape=(10, 10))
    u = Function(name='u', grid=grid)

    foo = Callable('foo', DummyExpr(u, 1), 'void', parameters=[u, u.indexed])
    call = Call(foo.name, [u, u.indexed])

    assert str(call) == "foo(u_vec,u);"
    assert str(foo) == """\
def _initialize(iet):
    # TODO: we need to pick the rank from `comm_shm`, not `comm`,
    # so that we have nranks == ngpus (as long as the user has launched
    # the right number of MPI processes per node given the available
    # number of GPUs per node)
    comm = None
    for i in iet.parameters:
        if isinstance(i, MPICommObject):
            comm = i
            break

    device_nvidia = Macro('acc_device_nvidia')
    body = Call('acc_init', [device_nvidia])

    if comm is not None:
        rank = Symbol(name='rank')
        rank_decl = LocalExpression(DummyEq(rank, 0))
        rank_init = Call('MPI_Comm_rank', [comm, Byref(rank)])

        ngpus = Symbol(name='ngpus')
        call = DefFunction('acc_get_num_devices', device_nvidia)
        ngpus_init = LocalExpression(DummyEq(ngpus, call))

        devicenum = Symbol(name='devicenum')
        devicenum_init = LocalExpression(DummyEq(devicenum, rank % ngpus))

        set_device_num = Call('acc_set_device_num', [devicenum, device_nvidia])

        body = [rank_decl, rank_init, ngpus_init, devicenum_init,
                set_device_num, body]

    init = List(header=c.Comment('Begin of OpenACC+MPI setup'), body=body,
                footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()))
    iet = iet._rebuild(body=(init,) + iet.body)

    return iet
def _generate_mpi(self, iet, **kwargs):
    if configuration['mpi'] is False:
        return iet

    halo_spots = FindNodes(HaloSpot).visit(iet)

    # For each MPI-distributed TensorFunction, generate all necessary
    # C-level routines to perform a halo update
    callables = OrderedDict()
    for hs in halo_spots:
        for f, v in hs.fmapper.items():
            callables[f] = [update_halo(f, v.loc_indices)]
            callables[f].append(sendrecv(f, v.loc_indices))
            callables[f].append(copy(f, v.loc_indices))
            callables[f].append(copy(f, v.loc_indices, True))
    callables = flatten(callables.values())

    # Replace HaloSpots with suitable calls performing the halo update
    mapper = {}
    for hs in halo_spots:
        for f, v in hs.fmapper.items():
            stencil = [int(i) for i in hs.mask[f].values()]
            comm = f.grid.distributor._C_comm
            nb = f.grid.distributor._C_neighbours.obj
            loc_indices = list(v.loc_indices.values())
            dsizes = [d.symbolic_size for d in f.dimensions]
            parameters = [f] + stencil + [comm, nb] + loc_indices + dsizes
            call = Call('halo_exchange_%s' % f.name, parameters)
            mapper.setdefault(hs, []).append(call)

    # Sorting is for deterministic code generation. However, in practice,
    # we don't expect `cstructs` to contain more than one element because
    # there should always be one grid per Operator (though we're not really
    # enforcing it)
    cstructs = {f.grid.distributor._C_neighbours.cdef
                for f in flatten(i.fmapper for i in halo_spots)}
    self._globals.extend(sorted(cstructs, key=lambda i: i.tpname))

    self._includes.append('mpi.h')

    self._func_table.update(OrderedDict([(i.name, MetaCall(i, True))
                                         for i in callables]))

    # Add in the halo update calls
    mapper = {k: List(body=v + list(k.body)) for k, v in mapper.items()}
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet
def _make_poke(self, hs, key, msgs):
    flag = Symbol(name='flag')
    initflag = LocalExpression(DummyEq(flag, 0))

    body = [initflag]
    for msg in msgs:
        dim = Dimension(name='i')
        msgi = IndexedPointer(msg, dim)

        rrecv = Byref(FieldFromComposite(msg._C_field_rrecv, msgi))
        rsend = Byref(FieldFromComposite(msg._C_field_rsend, msgi))
        testrecv = Call('MPI_Test', [rrecv, Byref(flag), Macro('MPI_STATUS_IGNORE')])
        testsend = Call('MPI_Test', [rsend, Byref(flag), Macro('MPI_STATUS_IGNORE')])

        body.append(Iteration([testsend, testrecv], dim, msg.npeers - 1))

    return make_efunc('pokempi%d' % key, body)
def __make_finalize_threads(self, threads, sdata):
    d = threads.index
    if threads.size == 1:
        callback = lambda body: body
    else:
        callback = lambda body: Iteration(body, d, threads.size - 1)

    threadswait = List(
        header=c.Comment("Wait for completion of `%s`" % threads.name),
        body=callback([
            While(CondEq(FieldFromComposite(sdata._field_flag, sdata[d]), 2)),
            DummyExpr(FieldFromComposite(sdata._field_flag, sdata[d]), 0),
            Call(FieldFromComposite('join', threads[d]))
        ])
    )

    return threadswait
def _generate_mpi(self, iet, **kwargs):
    if configuration['mpi'] is False:
        return iet

    # For each function, generate all necessary C-level routines to perform
    # a halo exchange
    mapper = {}
    callables = []
    cstructs = set()
    for hs in FindNodes(HaloSpot).visit(iet):
        for f, v in hs.fmapper.items():
            callables.append(update_halo(f, hs.fixed[f]))
            callables.append(sendrecv(f, hs.fixed[f]))
            callables.extend([copy(f, hs.fixed[f]), copy(f, hs.fixed[f], True)])

            stencil = [int(i) for i in hs.mask[f].values()]
            comm = f.grid.distributor._C_comm
            nb = f.grid.distributor._C_neighbours.obj
            fixed = list(hs.fixed[f].values())
            dsizes = [d.symbolic_size for d in f.dimensions]
            parameters = [f] + stencil + [comm, nb] + fixed + dsizes
            call = Call('halo_exchange_%s' % f.name, parameters)
            mapper.setdefault(hs, []).append(call)

            cstructs.add(f.grid.distributor._C_neighbours.cdef)

    self._func_table.update(OrderedDict([(i.name, MetaCall(i, True))
                                         for i in callables]))

    # Sorting is for deterministic code generation. However, in practice,
    # we don't expect `cstructs` to contain more than one element because
    # there should always be one grid per Operator (though we're not really
    # enforcing this)
    self._globals.extend(sorted(cstructs, key=lambda i: i.tpname))

    self._includes.append('mpi.h')

    # Add in the halo update calls
    mapper = {k: List(body=v + list(k.body)) for k, v in mapper.items()}
    iet = Transformer(mapper).visit(iet)

    return iet
def sendrecv(f, fixed):
    """
    Construct an IET performing a halo exchange along arbitrary
    dimension and side.
    """
    assert f.is_Function
    assert f.grid is not None

    comm = f.grid.distributor._C_comm

    buf_dims = [Dimension(name='buf_%s' % d.root) for d in f.dimensions
                if d not in fixed]
    bufg = Array(name='bufg', dimensions=buf_dims, dtype=f.dtype, scope='heap')
    bufs = Array(name='bufs', dimensions=buf_dims, dtype=f.dtype, scope='heap')

    dat_dims = [Dimension(name='dat_%s' % d.root) for d in f.dimensions]
    dat = Array(name='dat', dimensions=dat_dims, dtype=f.dtype, scope='external')

    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]
    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    parameters = [bufg] + list(bufg.shape) + [dat] + list(dat.shape) + ofsg
    gather = Call('gather_%s' % f.name, parameters)
    parameters = [bufs] + list(bufs.shape) + [dat] + list(dat.shape) + ofss
    scatter = Call('scatter_%s' % f.name, parameters)

    # The scatter must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    srecv = MPIStatusObject(name='srecv')
    rrecv = MPIRequestObject(name='rrecv')
    rsend = MPIRequestObject(name='rsend')

    count = reduce(mul, bufs.shape, 1)
    recv = Call('MPI_Irecv', [bufs, count, Macro(numpy_to_mpitypes(f.dtype)),
                              fromrank, '13', comm, rrecv])
    send = Call('MPI_Isend', [bufg, count, Macro(numpy_to_mpitypes(f.dtype)),
                              torank, '13', comm, rsend])

    waitrecv = Call('MPI_Wait', [rrecv, srecv])
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    iet = List(body=[recv, gather, send, waitsend, waitrecv, scatter])
    iet = List(body=[ArrayCast(dat), iet_insert_C_decls(iet)])

    parameters = ([dat] + list(dat.shape) + list(bufs.shape) + ofsg + ofss +
                  [fromrank, torank, comm])

    return Callable('sendrecv_%s' % f.name, iet, 'void', parameters, ('static',))
def _call_halowait(self, name, f, hse, msg):
    return Call(name, [f] + list(hse.loc_indices.values()) + [msg, msg.npeers])
def _call_haloupdate(self, name, f, hse, msg):
    comm = f.grid.distributor._obj_comm
    return Call(name, [f, comm, msg, msg.npeers] + list(hse.loc_indices.values()))
def _call_halowait(self, name, f, hse, msg):
    nb = f.grid.distributor._obj_neighborhood
    return Call(name, [f] + list(hse.loc_indices.values()) + [nb, msg])
def _call_haloupdate(self, name, f, hse, *args):
    comm = f.grid.distributor._obj_comm
    nb = f.grid.distributor._obj_neighborhood
    args = [f, comm, nb] + list(hse.loc_indices.values())
    return Call(name, flatten(args))