def _make_halowait(self, f, hse, key, msg=None):
    """
    Build the IET of a Callable `halowait<key>` which, for each of the
    `ncomms` pending communications described by `msg`, waits for the
    send/recv requests to complete and then scatters the received buffer
    into `f`'s halo region.
    """
    cast = cast_mapper[(f.dtype, '*')]
    # Fixed (non-distributed) loc_indices arrive as scalar parameters
    fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices}
    dim = Dimension(name='i')
    msgi = IndexedPointer(msg, dim)
    bufs = FieldFromComposite(msg._C_field_bufs, msgi)
    fromrank = FieldFromComposite(msg._C_field_from, msgi)
    sizes = [FieldFromComposite('%s[%d]' % (msg._C_field_sizes, i), msgi)
             for i in range(len(f._dist_dimensions))]
    ofss = [FieldFromComposite('%s[%d]' % (msg._C_field_ofss, i), msgi)
            for i in range(len(f._dist_dimensions))]
    # Interleave the fixed offsets with the per-message distributed offsets,
    # following the order of `f`'s dimensions
    ofss = [fixed.get(d) or ofss.pop(0) for d in f.dimensions]
    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Call('scatter%s' % key, [cast(bufs)] + sizes + [f] + ofss)
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)
    rrecv = Byref(FieldFromComposite(msg._C_field_rrecv, msgi))
    waitrecv = Call('MPI_Wait', [rrecv, Macro('MPI_STATUS_IGNORE')])
    rsend = Byref(FieldFromComposite(msg._C_field_rsend, msgi))
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])
    # The -1 below is because an Iteration, by default, generates <=
    ncomms = Symbol(name='ncomms')
    iet = Iteration([waitsend, waitrecv, scatter], dim, ncomms - 1)
    parameters = ([f] + list(fixed.values()) + [msg, ncomms])
    return Callable('halowait%d' % key, iet, 'void', parameters, ('static',))
def _make_wait(self, f, hse, key, msg=None):
    """
    Build a Callable `wait_<key>` completing the MPI send/recv requests in
    `msg` and scattering the received buffer into `f`'s halo region.
    """
    fromrank = Symbol(name='fromrank')
    scatter_ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    # Wait for both outstanding requests to complete
    status = Macro('MPI_STATUS_IGNORE')
    waitsend = Call('MPI_Wait',
                    [Byref(FieldFromPointer(msg._C_field_rsend, msg)), status])
    waitrecv = Call('MPI_Wait',
                    [Byref(FieldFromPointer(msg._C_field_rrecv, msg)), status])

    # Scatter the received data into `f`
    buffer_sizes = [FieldFromPointer('%s[%d]' % (msg._C_field_sizes, i), msg)
                    for i in range(len(f._dist_dimensions))]
    recv_buf = FieldFromPointer(msg._C_field_bufs, msg)
    scatter = Call('scatter_%s' % key,
                   [recv_buf] + buffer_sizes + [f] + scatter_ofss)
    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    body = List(body=[waitsend, waitrecv, scatter])
    return Callable('wait_%s' % key, body, 'void',
                    [f] + scatter_ofss + [fromrank, msg], ('static', ))
def _make_sendrecv(self, f, hse, key, msg=None):
    """
    Build the IET of a SendRecv efunc which gathers `f`'s boundary data into
    a buffer and posts the matching non-blocking MPI_Isend/MPI_Irecv pair.
    The MPI_Wait's are issued elsewhere (by the companion `wait` efunc).
    """
    comm = f.grid.distributor._obj_comm
    bufg = FieldFromPointer(msg._C_field_bufg, msg)
    bufs = FieldFromPointer(msg._C_field_bufs, msg)
    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]
    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')
    sizes = [FieldFromPointer('%s[%d]' % (msg._C_field_sizes, i), msg)
             for i in range(len(f._dist_dimensions))]
    gather = Call('gather%s' % key, [bufg] + sizes + [f] + ofsg)
    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather)
    # Number of exchanged points == product of the buffer sizes
    count = reduce(mul, sizes, 1)
    rrecv = Byref(FieldFromPointer(msg._C_field_rrecv, msg))
    rsend = Byref(FieldFromPointer(msg._C_field_rsend, msg))
    recv = IrecvCall([bufs, count, Macro(dtype_to_mpitype(f.dtype)),
                      fromrank, Integer(13), comm, rrecv])
    send = IsendCall([bufg, count, Macro(dtype_to_mpitype(f.dtype)),
                      torank, Integer(13), comm, rsend])
    iet = List(body=[recv, gather, send])
    parameters = ([f] + ofsg + [fromrank, torank, comm, msg])
    return SendRecv(key, iet, parameters, bufg, bufs)
def _make_poke(self, hs, key, msgs):
    """
    Build an efunc `pokempi<key>` calling MPI_Test on all pending send and
    receive requests of every msg in `msgs`; returns 1 iff all requests
    have completed (the per-test `lflag`s are AND-ed into `gflag`).
    """
    lflag = Symbol(name='lflag')
    gflag = Symbol(name='gflag')
    # Init flags: `gflag` starts at 1 and is AND-ed with each test outcome
    body = [Expression(DummyEq(lflag, 0)),
            Expression(DummyEq(gflag, 1))]
    # For each msg, build an Iteration calling MPI_Test on all peers
    for msg in msgs:
        dim = Dimension(name='i')
        msgi = IndexedPointer(msg, dim)
        rrecv = Byref(FieldFromComposite(msg._C_field_rrecv, msgi))
        testrecv = Call('MPI_Test',
                        [rrecv, Byref(lflag), Macro('MPI_STATUS_IGNORE')])
        rsend = Byref(FieldFromComposite(msg._C_field_rsend, msgi))
        testsend = Call('MPI_Test',
                        [rsend, Byref(lflag), Macro('MPI_STATUS_IGNORE')])
        # gflag &= lflag, emitted once after each MPI_Test
        update = AugmentedExpression(DummyEq(gflag, lflag), '&')
        body.append(Iteration([testsend, update, testrecv, update],
                              dim, msg.npeers - 1))
    body.append(Return(gflag))
    return make_efunc('pokempi%d' % key, List(body=body), retval='int')
def test_create_ops_dat_time_function(self):
    """
    create_ops_dat on a TimeFunction must yield one ops_dat per time slot
    (ut0, ut1), plus the dim/base/d_p/d_m metadata initializers.
    """
    grid = Grid(shape=(4))
    u = TimeFunction(name='u', grid=grid, space_order=2)
    block = OpsBlock('block')
    name_to_ops_dat = {}
    result = create_ops_dat(u, name_to_ops_dat, block)
    # One mapping entry per time slot, each indexing the same ops_dat array
    assert name_to_ops_dat['ut0'].base.name == namespace['ops_dat_name'](u.name)
    assert name_to_ops_dat['ut0'].indices == (Symbol('t0'),)
    assert name_to_ops_dat['ut1'].base.name == namespace['ops_dat_name'](u.name)
    assert name_to_ops_dat['ut1'].indices == (Symbol('t1'),)
    # Metadata initializers (shape, base offsets, positive/negative pads)
    assert result[0].expr.lhs.name == namespace['ops_dat_dim'](u.name)
    assert result[0].expr.rhs.params == (Integer(4),)
    assert result[1].expr.lhs.name == namespace['ops_dat_base'](u.name)
    assert result[1].expr.rhs.params == (Zero(),)
    assert result[2].expr.lhs.name == namespace['ops_dat_d_p'](u.name)
    assert result[2].expr.rhs.params == (Integer(2),)
    assert result[3].expr.lhs.name == namespace['ops_dat_d_m'](u.name)
    assert result[3].expr.rhs.params == (Integer(-2),)
    # The ops_decl_dat declarations themselves, one per time slot
    assert result[4].expr.lhs.name == namespace['ops_dat_name'](u.name)
    assert len(result[4].expr.rhs.params) == 2
    assert result[4].expr.rhs.params[0].name == namespace['ops_decl_dat'].name
    assert result[4].expr.rhs.params[0].args == (
        block, 1,
        Symbol(namespace['ops_dat_dim'](u.name)),
        Symbol(namespace['ops_dat_base'](u.name)),
        Symbol(namespace['ops_dat_d_m'](u.name)),
        Symbol(namespace['ops_dat_d_p'](u.name)),
        Byref(u.indexify((0,))),
        Literal('"%s"' % u._C_typedata),
        Literal('"ut0"')
    )
    assert result[4].expr.rhs.params[1].name == namespace['ops_decl_dat'].name
    assert result[4].expr.rhs.params[1].args == (
        block, 1,
        Symbol(namespace['ops_dat_dim'](u.name)),
        Symbol(namespace['ops_dat_base'](u.name)),
        Symbol(namespace['ops_dat_d_m'](u.name)),
        Symbol(namespace['ops_dat_d_p'](u.name)),
        Byref(u.indexify((1,))),
        Literal('"%s"' % u._C_typedata),
        Literal('"ut1"')
    )
def _make_haloupdate(self, f, hse, key, msg=None):
    """
    Build the IET of a Callable `haloupdate<key>` which, for each of the
    `ncomms` communications described by `msg`, gathers `f`'s boundary data
    into a buffer and posts the matching MPI_Irecv/MPI_Isend pair.
    """
    comm = f.grid.distributor._obj_comm
    # Fixed (non-distributed) loc_indices arrive as scalar parameters
    fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices}
    dim = Dimension(name='i')
    msgi = IndexedPointer(msg, dim)
    bufg = FieldFromComposite(msg._C_field_bufg, msgi)
    bufs = FieldFromComposite(msg._C_field_bufs, msgi)
    fromrank = FieldFromComposite(msg._C_field_from, msgi)
    torank = FieldFromComposite(msg._C_field_to, msgi)
    sizes = [FieldFromComposite('%s[%d]' % (msg._C_field_sizes, i), msgi)
             for i in range(len(f._dist_dimensions))]
    ofsg = [FieldFromComposite('%s[%d]' % (msg._C_field_ofsg, i), msgi)
            for i in range(len(f._dist_dimensions))]
    # Interleave the fixed offsets with the per-message distributed offsets,
    # following the order of `f`'s dimensions
    ofsg = [fixed.get(d) or ofsg.pop(0) for d in f.dimensions]
    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Call('gather_%s' % key, [bufg] + sizes + [f] + ofsg)
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather)
    # Make Irecv/Isend
    count = reduce(mul, sizes, 1)
    rrecv = Byref(FieldFromComposite(msg._C_field_rrecv, msgi))
    rsend = Byref(FieldFromComposite(msg._C_field_rsend, msgi))
    recv = Call('MPI_Irecv', [bufs, count, Macro(dtype_to_mpitype(f.dtype)),
                              fromrank, Integer(13), comm, rrecv])
    send = Call('MPI_Isend', [bufg, count, Macro(dtype_to_mpitype(f.dtype)),
                              torank, Integer(13), comm, rsend])
    # The -1 below is because an Iteration, by default, generates <=
    ncomms = Symbol(name='ncomms')
    iet = Iteration([recv, gather, send], dim, ncomms - 1)
    parameters = ([f, comm, msg, ncomms]) + list(fixed.values())
    return Callable('haloupdate%d' % key, iet, 'void', parameters, ('static', ))
def _make_halowait(self, f, hse, key, msg=None):
    """
    Build the IET of a Callable `halowait<key>` which, for each diagonal
    halo exchange, calls the cached `wait` efunc to complete the pending
    communication and scatter the received data into `f`.
    """
    nb = f.grid.distributor._obj_neighborhood
    wait = self._cache_dims[f.dimensions][2]
    # Fixed (non-distributed) loc_indices arrive as scalar parameters
    fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices}
    # Only retain the halos required by the Diag scheme
    # Note: `sorted` is only for deterministic code generation
    halos = sorted(i for i in hse.halos if isinstance(i.dim, tuple))
    body = []
    for dims, tosides in halos:
        # The rank we receive from sits on the opposite side
        mapper = OrderedDict(zip(dims, [i.flip() for i in tosides]))
        fromrank = FieldFromPointer(
            ''.join(i.name[0] for i in mapper.values()), nb)
        ofss = [fixed.get(d, f._C_get_field(HALO, d, mapper.get(d)).offset)
                for d in f.dimensions]
        # `len(body)` doubles as the progressive halo-exchange id
        msgi = Byref(IndexedPointer(msg, len(body)))
        body.append(Call(wait.name, [f] + ofss + [fromrank, msgi]))
    iet = List(body=body)
    parameters = [f] + list(fixed.values()) + [nb, msg]
    return Callable('halowait%d' % key, iet, 'void', parameters, ('static', ))
def _initialize(iet):
    """
    Prepend to `iet` a setup section binding each MPI rank to an OpenMP
    device (`rank % ngpus`). A no-op if no MPI communicator appears among
    `iet`'s parameters.
    """
    comm = next((p for p in iet.parameters if isinstance(p, MPICommObject)),
                None)
    if comm is None:
        return iet

    # int rank = 0; MPI_Comm_rank(comm, &rank);
    rank = Symbol(name='rank')
    setup = [LocalExpression(DummyEq(rank, 0)),
             Call('MPI_Comm_rank', [comm, Byref(rank)])]

    # int ngpus = omp_get_num_devices();
    ngpus = Symbol(name='ngpus')
    setup.append(
        LocalExpression(DummyEq(ngpus, Function('omp_get_num_devices')())))

    # Round-robin device assignment across ranks
    setup.append(Call('omp_set_default_device', [rank % ngpus]))

    init = List(header=c.Comment('Begin of OpenMP+MPI setup'), body=setup,
                footer=(c.Comment('End of OpenMP+MPI setup'), c.Line()))
    return iet._rebuild(body=(init,) + iet.body)
def create_ops_arg(p, accessible_origin, name_to_ops_dat, par_to_ops_stencil):
    """
    Build an OpsArgDecl describing how parameter `p` is passed to an OPS
    kernel: as a read-only global (ops_arg_gbl) for Constants, or as a dat
    (ops_arg_dat) otherwise, with the appropriate read/write flag.
    """
    ctype = Literal('"%s"' % dtype_to_cstr(p.dtype))

    if p.is_Constant:
        # Constants are passed by reference; the leading character of the
        # name (the '*') is stripped
        return OpsArgDecl(ops_type=namespace['ops_arg_gbl'],
                          ops_name=Byref(Constant(name=p.name[1:])),
                          elements_per_point=1,
                          dtype=ctype,
                          rw_flag=namespace['ops_read'])

    info = accessible_origin[p.name]
    if info.time is None:
        dat = name_to_ops_dat[p.name]
    else:
        # Time-stepping function: index the ops_dat array at (time + shift)
        dat = name_to_ops_dat[info.origin_name].indexify(
            [Add(info.time, info.shift)])
    flag = namespace['ops_read'] if p.read_only else namespace['ops_write']
    return OpsArgDecl(ops_type=namespace['ops_arg_dat'],
                      ops_name=dat,
                      elements_per_point=1,
                      dtype=ctype,
                      rw_flag=flag)
def _(iet):
    """
    Prepend to `iet` a device-setup section. If an MPI communicator is among
    `iet`'s parameters, each rank falls back to device `rank % ngpus` when
    `deviceid` is -1 (i.e. unset); otherwise only the explicit-deviceid
    branch is emitted. Returns the rebuilt IET plus the new `deviceid` arg.
    """
    # TODO: we need to pick the rank from `comm_shm`, not `comm`,
    # so that we have nranks == ngpus (as long as the user has launched
    # the right number of MPI processes per node given the available
    # number of GPUs per node)
    objcomm = None
    for i in iet.parameters:
        if isinstance(i, MPICommObject):
            objcomm = i
            break
    devicetype = as_list(self.lang[self.platform])
    try:
        lang_init = [self.lang['init'](devicetype)]
    except TypeError:
        # Not all target languages need to be explicitly initialized
        lang_init = []
    deviceid = DeviceID()
    if objcomm is not None:
        rank = Symbol(name='rank')
        rank_decl = LocalExpression(DummyEq(rank, 0))
        rank_init = Call('MPI_Comm_rank', [objcomm, Byref(rank)])
        ngpus = Symbol(name='ngpus')
        call = self.lang['num-devices'](devicetype)
        ngpus_init = LocalExpression(DummyEq(ngpus, call))
        osdd_then = self.lang['set-device']([deviceid] + devicetype)
        osdd_else = self.lang['set-device']([rank % ngpus] + devicetype)
        body = lang_init + [Conditional(
            CondNe(deviceid, -1),
            osdd_then,
            List(body=[rank_decl, rank_init, ngpus_init, osdd_else]),
        )]
        header = c.Comment('Begin of %s+MPI setup' % self.lang['name'])
        footer = c.Comment('End of %s+MPI setup' % self.lang['name'])
    else:
        body = lang_init + [Conditional(
            CondNe(deviceid, -1),
            self.lang['set-device']([deviceid] + devicetype)
        )]
        header = c.Comment('Begin of %s setup' % self.lang['name'])
        footer = c.Comment('End of %s setup' % self.lang['name'])
    init = List(header=header, body=body, footer=(footer, c.Line()))
    iet = iet._rebuild(body=(init, ) + iet.body)
    return iet, {'args': deviceid}
def _call_sendrecv(self, name, *args, msg=None, haloid=None):
    """
    Build a Call to the `sendrecv` efunc `name`. The `sizes` and `ofss`
    entries of `args` are dropped: this HaloExchangeBuilder conveys the
    former through `msg` and only needs the latter in `wait()`, to collect
    and scatter the result of an MPI_Irecv.
    """
    f, _, ofsg, _, fromrank, torank, comm = args
    arguments = [f] + ofsg + [fromrank, torank, comm,
                              Byref(IndexedPointer(msg, haloid))]
    return Call(name, arguments)
def test_create_ops_arg_constant(self):
    """
    A Constant becomes a read-only ops_arg_gbl passed by reference, with
    the leading '*' stripped from its name.
    """
    a = Constant(name='*a')
    ops_arg = create_ops_arg(a, {}, {}, {})
    assert ops_arg.ops_type == namespace['ops_arg_gbl']
    assert str(ops_arg.ops_name) == str(Byref(Constant(name='a')))
    assert ops_arg.elements_per_point == 1
    assert ops_arg.dtype == Literal('"%s"' % dtype_to_cstr(a.dtype))
    assert ops_arg.rw_flag == namespace['ops_read']
def test_create_ops_arg_constant(self):
    """
    A Constant becomes a read-only ops_arg_gbl passed by reference, with
    the leading '*' stripped from its name.
    """
    a = Constant(name='*a')
    res = create_ops_arg(a, {}, {}, {})
    assert type(res) == namespace['ops_arg_gbl']
    assert str(res.args[0]) == str(Byref(Constant(name='a')))
    assert res.args[1] == 1
    assert res.args[2] == Literal('"%s"' % dtype_to_cstr(a.dtype))
    assert res.args[3] == namespace['ops_read']
def test_create_ops_arg_constant(self):
    """
    A Constant becomes a read-only ops_arg_gbl passed by reference, with
    the leading '*' stripped from its name.
    """
    a = Constant(name='*a')
    res = create_ops_arg(a, {}, {})
    assert res.name == namespace['ops_arg_gbl'].name
    assert res.args == [
        Byref(Constant(name='a')),
        1,
        Literal('"%s"' % dtype_to_cstr(a.dtype)),
        namespace['ops_read']
    ]
def create_ops_arg(p, name_to_ops_dat, par_to_ops_stencil):
    """
    Translate parameter `p` into an OPS kernel argument: a read-only global
    for Constants, otherwise a dat argument with its stencil and r/w flag.
    """
    ctype = Literal('"%s"' % dtype_to_cstr(p.dtype))
    if p.is_Constant:
        # Strip the leading '*' from the Constant's name
        return namespace['ops_arg_gbl'](Byref(Constant(name=p.name[1:])),
                                        1, ctype, namespace['ops_read'])
    rw = namespace['ops_read'] if p.read_only else namespace['ops_write']
    return namespace['ops_arg_dat'](name_to_ops_dat[p.name], 1,
                                    par_to_ops_stencil[p], ctype, rw)
def _alloc_pointed_array_on_high_bw_mem(self, site, obj, storage): """ Allocate the following objects in the high bandwidth memory: * The pointer array `obj`; * The pointee Array `obj.array` If the pointer array is defined over `sregistry.threadid`, that is a thread Dimension, then each `obj.array` slice is allocated and freed individually by the owner thread. """ # The pointer array decl = Definition(obj) memptr = VOID(Byref(obj._C_symbol), '**') alignment = obj._data_alignment size = SizeOf(Keyword('%s*' % obj._C_typedata)) * obj.dim.symbolic_size alloc0 = self.lang['host-alloc'](memptr, alignment, size) free0 = self.lang['host-free'](obj._C_symbol) # The pointee Array pobj = IndexedPointer(obj._C_symbol, obj.dim) memptr = VOID(Byref(pobj), '**') size = SizeOf(obj._C_typedata) * prod(obj.array.symbolic_shape) alloc1 = self.lang['host-alloc'](memptr, alignment, size) free1 = self.lang['host-free'](pobj) # Dump if obj.dim is self.sregistry.threadid: storage.update(obj, site, allocs=(decl, alloc0), frees=free0, pallocs=(obj.dim, alloc1), pfrees=(obj.dim, free1)) else: storage.update(obj, site, allocs=(decl, alloc0, alloc1), frees=(free0, free1))
def create_ops_fetch(f, name_to_ops_dat, time_upper_bound):
    """
    Generate the ops_dat_fetch_data call(s) copying `f`'s data back from the
    OPS dat(s). TimeFunctions get one call per time slot, indexed modulo the
    time buffer size relative to `time_upper_bound`.
    """
    fetch = namespace['ops_dat_fetch_data']
    if not f.is_TimeFunction:
        # The second parameter is the beginning of the array. But I didn't manage
        # to generate a C code like: `v`. Instead, I am generating `&(v[0])`.
        return [fetch(name_to_ops_dat[f.name], Byref(f.indexify([0])))]

    result = []
    for i in range(f._time_size):
        slot = Mod(Add(time_upper_bound, -i), f._time_size)
        result.append(fetch(name_to_ops_dat[f.name].indexify([slot]),
                            Byref(f.indexify([slot]))))
    return result
def _make_sendrecv(self, f, hse, key, **kwargs):
    """
    Build the IET of a blocking SendRecv efunc: gather `f`'s boundary data
    into a buffer, exchange it with the neighbouring ranks through
    MPI_Isend/MPI_Irecv plus the matching MPI_Wait's, then scatter the
    received data into `f`'s halo region.
    """
    comm = f.grid.distributor._obj_comm
    # The buffers span the distributed dimensions only
    buf_dims = [Dimension(name='buf_%s' % d.root) for d in f.dimensions
                if d not in hse.loc_indices]
    bufg = Array(name=self.sregistry.make_name(prefix='bufg'),
                 dimensions=buf_dims, dtype=f.dtype, padding=0)
    bufs = Array(name=self.sregistry.make_name(prefix='bufs'),
                 dimensions=buf_dims, dtype=f.dtype, padding=0)
    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]
    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]
    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')
    gather = Call('gather%s' % key, [bufg] + list(bufg.shape) + [f] + ofsg)
    scatter = Call('scatter%s' % key, [bufs] + list(bufs.shape) + [f] + ofss)
    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather)
    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)
    # Number of exchanged points == product of the buffer shape
    count = reduce(mul, bufs.shape, 1)
    rrecv = MPIRequestObject(name='rrecv')
    rsend = MPIRequestObject(name='rsend')
    recv = IrecvCall([bufs, count, Macro(dtype_to_mpitype(f.dtype)),
                      fromrank, Integer(13), comm, Byref(rrecv)])
    send = IsendCall([bufg, count, Macro(dtype_to_mpitype(f.dtype)),
                      torank, Integer(13), comm, Byref(rsend)])
    waitrecv = Call('MPI_Wait', [Byref(rrecv), Macro('MPI_STATUS_IGNORE')])
    waitsend = Call('MPI_Wait', [Byref(rsend), Macro('MPI_STATUS_IGNORE')])
    iet = List(body=[recv, gather, send, waitsend, waitrecv, scatter])
    parameters = ([f] + list(bufs.shape) + ofsg + ofss +
                  [fromrank, torank, comm])
    return SendRecv(key, iet, parameters, bufg, bufs)
def _alloc_array_on_high_bw_mem(self, site, obj, storage, *args):
    """
    Allocate an Array in the high bandwidth memory.
    """
    # host-alloc takes (pointer, alignment, nbytes)
    nbytes = SizeOf(obj._C_typedata) * prod(obj.symbolic_shape)
    allocation = self.lang['host-alloc'](VOID(Byref(obj._C_symbol), '**'),
                                         obj._data_alignment, nbytes)
    deallocation = self.lang['host-free'](obj._C_symbol)
    storage.update(obj, site,
                   allocs=(Definition(obj), allocation),
                   frees=deallocation)
def _make_poke(self, hs, key, msgs):
    """
    Build an efunc `pokempi<key>` calling MPI_Test on all pending send and
    receive requests of every msg in `msgs`. The `flag` output of MPI_Test
    is written but never inspected — presumably the calls serve to progress
    the MPI engine (TODO confirm against callers).
    """
    flag = Symbol(name='flag')
    initflag = LocalExpression(DummyEq(flag, 0))
    body = [initflag]
    for msg in msgs:
        dim = Dimension(name='i')
        msgi = IndexedPointer(msg, dim)
        rrecv = Byref(FieldFromComposite(msg._C_field_rrecv, msgi))
        rsend = Byref(FieldFromComposite(msg._C_field_rsend, msgi))
        testrecv = Call('MPI_Test',
                        [rrecv, Byref(flag), Macro('MPI_STATUS_IGNORE')])
        testsend = Call('MPI_Test',
                        [rsend, Byref(flag), Macro('MPI_STATUS_IGNORE')])
        # One MPI_Test pair per peer of this msg
        body.append(Iteration([testsend, testrecv], dim, msg.npeers - 1))
    return make_efunc('pokempi%d' % key, body)
def _(iet):
    """
    Prepend to `iet` an OpenACC device-setup section. If an MPI communicator
    is among `iet`'s parameters, each rank falls back to device
    `rank % ngpus` when `deviceid` is -1 (i.e. unset); otherwise only the
    explicit-deviceid branch is emitted. Returns the rebuilt IET plus the
    new `deviceid` arg.
    """
    # TODO: we need to pick the rank from `comm_shm`, not `comm`,
    # so that we have nranks == ngpus (as long as the user has launched
    # the right number of MPI processes per node given the available
    # number of GPUs per node)
    objcomm = None
    for i in iet.parameters:
        if isinstance(i, MPICommObject):
            objcomm = i
            break
    deviceid = DeviceID()
    device_nvidia = Macro('acc_device_nvidia')
    if objcomm is not None:
        rank = Symbol(name='rank')
        rank_decl = LocalExpression(DummyEq(rank, 0))
        rank_init = Call('MPI_Comm_rank', [objcomm, Byref(rank)])
        ngpus = Symbol(name='ngpus')
        call = DefFunction('acc_get_num_devices', device_nvidia)
        ngpus_init = LocalExpression(DummyEq(ngpus, call))
        asdn_then = Call('acc_set_device_num', [deviceid, device_nvidia])
        asdn_else = Call('acc_set_device_num', [rank % ngpus, device_nvidia])
        body = [
            Call('acc_init', [device_nvidia]),
            Conditional(
                CondNe(deviceid, -1),
                asdn_then,
                List(body=[rank_decl, rank_init, ngpus_init, asdn_else]))
        ]
    else:
        body = [
            Call('acc_init', [device_nvidia]),
            Conditional(
                CondNe(deviceid, -1),
                Call('acc_set_device_num', [deviceid, device_nvidia]))
        ]
    init = List(header=c.Comment('Begin of OpenACC+MPI setup'), body=body,
                footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()))
    iet = iet._rebuild(body=(init, ) + iet.body)
    return iet, {'args': deviceid}
def create_ops_arg(p, accessible_origin, name_to_ops_dat, par_to_ops_stencil):
    """
    Translate parameter `p` into an OPS kernel argument: ops_arg_gbl for
    Constants, ops_arg_dat (with stencil and read/write flag) otherwise.
    """
    ctype = Literal('"%s"' % dtype_to_cstr(p.dtype))
    if p.is_Constant:
        # Read-only global; the leading '*' of the name is stripped
        return namespace['ops_arg_gbl'](Byref(Constant(name=p.name[1:])),
                                        1, ctype, namespace['ops_read'])

    info = accessible_origin[p.name]
    if info.time is None:
        dat = name_to_ops_dat[p.name]
    else:
        # Time-stepping function: pick the dat for the current time slot
        dat = name_to_ops_dat[info.origin_name].indexify([info.time])
    rw = namespace['ops_read'] if p.read_only else namespace['ops_write']
    return namespace['ops_arg_dat'](dat, 1, par_to_ops_stencil[p], ctype, rw)
def test_create_ops_dat_function(self):
    """
    create_ops_dat on a (non-time) Function must produce an OpsDatDecl
    with dim/base/d_p/d_m initializers and a single ops_decl_dat.
    """
    grid = Grid(shape=(4))
    u = Function(name='u', grid=grid, space_order=2)
    block = OpsBlock('block')
    name_to_ops_dat = {}
    ops_dat = create_ops_dat(u, name_to_ops_dat, block)
    assert name_to_ops_dat['u'].name == namespace['ops_dat_name'](u.name)
    assert name_to_ops_dat['u']._C_typename == namespace['ops_dat_type']
    # Metadata initializers (shape, base offsets, positive/negative pads)
    assert ops_dat.dim_val.expr.lhs.name == \
        namespace['ops_dat_dim'](u.name)
    assert ops_dat.dim_val.expr.rhs.params == \
        (Integer(4),)
    assert ops_dat.base_val.expr.lhs.name == \
        namespace['ops_dat_base'](u.name)
    assert ops_dat.base_val.expr.rhs.params == (Zero(),)
    assert ops_dat.d_p_val.expr.lhs.name == namespace['ops_dat_d_p'](u.name)
    assert ops_dat.d_p_val.expr.rhs.params == (Integer(2),)
    assert ops_dat.d_m_val.expr.lhs.name == namespace['ops_dat_d_m'](u.name)
    assert ops_dat.d_m_val.expr.rhs.params == (Integer(-2),)
    # The ops_decl_dat declaration itself
    assert ops_dat.ops_decl_dat.expr.lhs == name_to_ops_dat['u']
    assert type(ops_dat.ops_decl_dat.expr.rhs) == namespace['ops_decl_dat']
    assert ops_dat.ops_decl_dat.expr.rhs.args == (
        block, 1,
        Symbol(namespace['ops_dat_dim'](u.name)),
        Symbol(namespace['ops_dat_base'](u.name)),
        Symbol(namespace['ops_dat_d_m'](u.name)),
        Symbol(namespace['ops_dat_d_p'](u.name)),
        Byref(u.indexify((0,))),
        Literal('"%s"' % u._C_typedata),
        Literal('"u"')
    )
def _initialize(iet):
    """
    Prepend to `iet` an OpenACC setup section. If an MPI communicator is
    among `iet`'s parameters, each rank is bound to device `rank % ngpus`
    before `acc_init`; otherwise only `acc_init` is emitted.
    """
    # TODO: we need to pick the rank from `comm_shm`, not `comm`,
    # so that we have nranks == ngpus (as long as the user has launched
    # the right number of MPI processes per node given the available
    # number of GPUs per node)
    comm = None
    for i in iet.parameters:
        if isinstance(i, MPICommObject):
            comm = i
            break
    device_nvidia = Macro('acc_device_nvidia')
    body = Call('acc_init', [device_nvidia])
    if comm is not None:
        rank = Symbol(name='rank')
        rank_decl = LocalExpression(DummyEq(rank, 0))
        rank_init = Call('MPI_Comm_rank', [comm, Byref(rank)])
        ngpus = Symbol(name='ngpus')
        call = DefFunction('acc_get_num_devices', device_nvidia)
        ngpus_init = LocalExpression(DummyEq(ngpus, call))
        # Round-robin device assignment across ranks
        devicenum = Symbol(name='devicenum')
        devicenum_init = LocalExpression(DummyEq(devicenum, rank % ngpus))
        set_device_num = Call('acc_set_device_num', [devicenum, device_nvidia])
        body = [rank_decl, rank_init, ngpus_init, devicenum_init,
                set_device_num, body]
    init = List(header=c.Comment('Begin of OpenACC+MPI setup'), body=body,
                footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()))
    iet = iet._rebuild(body=(init, ) + iet.body)
    return iet
def test_make_cpp_parfor(): """ Test construction of a CPP parallel for. This excites the IET construction machinery in several ways, in particular by using Lambda nodes (to generate C++ lambda functions) and nested Calls. """ class STDVectorThreads(LocalObject): dtype = type('std::vector<std::thread>', (c_void_p, ), {}) def __init__(self): self.name = 'threads' class STDThread(LocalObject): dtype = type('std::thread&', (c_void_p, ), {}) def __init__(self, name): self.name = name class FunctionType(LocalObject): dtype = type('FuncType&&', (c_void_p, ), {}) def __init__(self, name): self.name = name # Basic symbols nthreads = Symbol(name='nthreads', is_const=True) threshold = Symbol(name='threshold', is_const=True) last = Symbol(name='last', is_const=True) first = Symbol(name='first', is_const=True) portion = Symbol(name='portion', is_const=True) # Composite symbols threads = STDVectorThreads() # Iteration helper symbols begin = Symbol(name='begin') l = Symbol(name='l') end = Symbol(name='end') # Functions stdmax = sympy.Function('std::max') # Construct the parallel-for body func = FunctionType('func') i = Dimension(name='i') threadobj = Call( 'std::thread', Lambda( Iteration(Call(func.name, i), i, (begin, end - 1, 1)), ['=', Byref(func.name)], )) threadpush = Call(FieldFromComposite('push_back', threads), threadobj) it = Dimension(name='it') iteration = Iteration([ LocalExpression(DummyEq(begin, it)), LocalExpression(DummyEq(l, it + portion)), LocalExpression(DummyEq(end, InlineIf(l > last, last, l))), threadpush ], it, (first, last, portion)) thread = STDThread('x') waitcall = Call('std::for_each', [ Call(FieldFromComposite('begin', threads)), Call(FieldFromComposite('end', threads)), Lambda(Call(FieldFromComposite('join', thread.name)), [], [thread]) ]) body = [ LocalExpression(DummyEq(threshold, 1)), LocalExpression( DummyEq(portion, stdmax(threshold, (last - first) / nthreads))), Call(FieldFromComposite('reserve', threads), nthreads), iteration, waitcall ] 
parfor = ElementalFunction('parallel_for', body, 'void', [first, last, func, nthreads]) assert str(parfor) == """\
def create_ops_dat(f, name_to_ops_dat, block):
    """
    Build the list of Expressions declaring the OPS metadata arrays
    (dim/base/d_p/d_m) and the ops_decl_dat call(s) for `f`. TimeFunctions
    get one ops_decl_dat per time slot, collected into an ops_dat array;
    other Functions get a single OpsDat. `name_to_ops_dat` is updated in
    place with the mapping(s) from (time-indexed) name to dat.
    """
    # The time dimension, if any, is not part of the OPS dat shape
    ndim = f.ndim - (1 if f.is_TimeFunction else 0)
    dim = Array(name=namespace['ops_dat_dim'](f.name),
                dimensions=(DefaultDimension(name='dim', default_value=ndim), ),
                dtype=np.int32, scope='stack')
    base = Array(name=namespace['ops_dat_base'](f.name),
                 dimensions=(DefaultDimension(name='base', default_value=ndim), ),
                 dtype=np.int32, scope='stack')
    d_p = Array(name=namespace['ops_dat_d_p'](f.name),
                dimensions=(DefaultDimension(name='d_p', default_value=ndim), ),
                dtype=np.int32, scope='stack')
    d_m = Array(name=namespace['ops_dat_d_m'](f.name),
                dimensions=(DefaultDimension(name='d_m', default_value=ndim), ),
                dtype=np.int32, scope='stack')
    res = []
    base_val = [Zero() for i in range(ndim)]
    # If f is a TimeFunction we need to create a ops_dat for each time stepping
    # variable (eg: t1, t2)
    if f.is_TimeFunction:
        time_pos = f._time_position
        time_index = f.indices[time_pos]
        time_dims = f.shape[time_pos]
        # Strip the time dimension from shape/padding/halo
        dim_shape = sympify(f.shape[:time_pos] + f.shape[time_pos + 1:])
        padding = f.padding[:time_pos] + f.padding[time_pos + 1:]
        halo = f.halo[:time_pos] + f.halo[time_pos + 1:]
        d_p_val = tuple(sympify([p[0] + h[0] for p, h in zip(padding, halo)]))
        d_m_val = tuple(
            sympify([-(p[1] + h[1]) for p, h in zip(padding, halo)]))
        ops_dat_array = Array(name=namespace['ops_dat_name'](f.name),
                              dimensions=(DefaultDimension(
                                  name='dat', default_value=time_dims), ),
                              dtype='ops_dat', scope='stack')
        dat_decls = []
        for i in range(time_dims):
            # e.g. 'ut0', 'ut1', ... one per time slot
            name = '%s%s%s' % (f.name, time_index, i)
            name_to_ops_dat[name] = ops_dat_array.indexify(
                [Symbol('%s%s' % (time_index, i))])
            dat_decls.append(namespace['ops_decl_dat'](
                block, 1, Symbol(dim.name), Symbol(base.name),
                Symbol(d_m.name), Symbol(d_p.name),
                Byref(f.indexify([i])),
                Literal('"%s"' % f._C_typedata), Literal('"%s"' % name)))
        ops_decl_dat = Expression(
            ClusterizedEq(Eq(ops_dat_array, ListInitializer(dat_decls))))
    else:
        ops_dat = OpsDat("%s_dat" % f.name)
        name_to_ops_dat[f.name] = ops_dat
        d_p_val = tuple(
            sympify([p[0] + h[0] for p, h in zip(f.padding, f.halo)]))
        d_m_val = tuple(
            sympify([-(p[1] + h[1]) for p, h in zip(f.padding, f.halo)]))
        dim_shape = sympify(f.shape)
        ops_decl_dat = Expression(
            ClusterizedEq(
                Eq(ops_dat,
                   namespace['ops_decl_dat'](
                       block, 1, Symbol(dim.name), Symbol(base.name),
                       Symbol(d_m.name), Symbol(d_p.name),
                       Byref(f.indexify([0])),
                       Literal('"%s"' % f._C_typedata),
                       Literal('"%s"' % f.name)))))
    # Emit the metadata initializers followed by the declaration(s)
    res.append(Expression(ClusterizedEq(Eq(dim, ListInitializer(dim_shape)))))
    res.append(Expression(ClusterizedEq(Eq(base, ListInitializer(base_val)))))
    res.append(Expression(ClusterizedEq(Eq(d_p, ListInitializer(d_p_val)))))
    res.append(Expression(ClusterizedEq(Eq(d_m, ListInitializer(d_m_val)))))
    res.append(ops_decl_dat)
    return res
def create_ops_dat(f, name_to_ops_dat, block):
    """
    Build an OpsDatDecl carrying the OPS metadata initializers
    (dim/base/d_p/d_m) and the ops_decl_dat call(s) for `f`. TimeFunctions
    get one ops_decl_dat per time slot, collected into an ops_dat array;
    other Functions get a single OpsDat. `name_to_ops_dat` is updated in
    place with the mapping from `f.name` to the dat (array).
    """
    # The time dimension, if any, is not part of the OPS dat shape
    ndim = f.ndim - (1 if f.is_TimeFunction else 0)
    dim = Array(name=namespace['ops_dat_dim'](f.name),
                dimensions=(DefaultDimension(name='dim', default_value=ndim), ),
                dtype=np.int32, scope='stack')
    base = Array(name=namespace['ops_dat_base'](f.name),
                 dimensions=(DefaultDimension(name='base', default_value=ndim), ),
                 dtype=np.int32, scope='stack')
    d_p = Array(name=namespace['ops_dat_d_p'](f.name),
                dimensions=(DefaultDimension(name='d_p', default_value=ndim), ),
                dtype=np.int32, scope='stack')
    d_m = Array(name=namespace['ops_dat_d_m'](f.name),
                dimensions=(DefaultDimension(name='d_m', default_value=ndim), ),
                dtype=np.int32, scope='stack')
    base_val = [Zero() for i in range(ndim)]
    # If f is a TimeFunction we need to create a ops_dat for each time stepping
    # variable (eg: t1, t2)
    if f.is_TimeFunction:
        time_pos = f._time_position
        time_index = f.indices[time_pos]
        time_dims = f.shape[time_pos]
        # Strip the time dimension from the shape and nodomain sizes
        dim_val = f.shape[:time_pos] + f.shape[time_pos + 1:]
        d_p_val = f._size_nodomain.left[time_pos + 1:]
        d_m_val = [-i for i in f._size_nodomain.right[time_pos + 1:]]
        ops_dat_array = Array(name=namespace['ops_dat_name'](f.name),
                              dimensions=(DefaultDimension(
                                  name='dat', default_value=time_dims), ),
                              dtype=namespace['ops_dat_type'], scope='stack')
        dat_decls = []
        for i in range(time_dims):
            # e.g. 'ut0', 'ut1', ... one per time slot
            name = '%s%s%s' % (f.name, time_index, i)
            dat_decls.append(namespace['ops_decl_dat'](
                block, 1, Symbol(dim.name), Symbol(base.name),
                Symbol(d_m.name), Symbol(d_p.name),
                Byref(f.indexify([i])),
                Literal('"%s"' % f._C_typedata), Literal('"%s"' % name)))
        ops_decl_dat = Expression(
            ClusterizedEq(Eq(ops_dat_array, ListInitializer(dat_decls))))
        # Inserting the ops_dat array in case of TimeFunction.
        name_to_ops_dat[f.name] = ops_dat_array
    else:
        ops_dat = OpsDat("%s_dat" % f.name)
        name_to_ops_dat[f.name] = ops_dat
        dim_val = f.shape
        d_p_val = f._size_nodomain.left
        d_m_val = [-i for i in f._size_nodomain.right]
        ops_decl_dat = Expression(
            ClusterizedEq(
                Eq(ops_dat,
                   namespace['ops_decl_dat'](
                       block, 1, Symbol(dim.name), Symbol(base.name),
                       Symbol(d_m.name), Symbol(d_p.name),
                       Byref(f.indexify([0])),
                       Literal('"%s"' % f._C_typedata),
                       Literal('"%s"' % f.name)))))
    # Wrap the metadata values into initializer Expressions
    dim_val = Expression(ClusterizedEq(Eq(dim, ListInitializer(dim_val))))
    base_val = Expression(ClusterizedEq(Eq(base, ListInitializer(base_val))))
    d_p_val = Expression(ClusterizedEq(Eq(d_p, ListInitializer(d_p_val))))
    d_m_val = Expression(ClusterizedEq(Eq(d_m, ListInitializer(d_m_val))))
    return OpsDatDecl(dim_val=dim_val, base_val=base_val, d_p_val=d_p_val,
                      d_m_val=d_m_val, ops_decl_dat=ops_decl_dat)
def test_create_ops_dat_function(self):
    """
    create_ops_dat on a (non-time) Function must produce the four metadata
    initializers plus a single ops_decl_dat declaration.
    """
    grid = Grid(shape=(4))
    u = Function(name='u', grid=grid, space_order=2)
    block = OpsBlock('block')
    name_to_ops_dat = {}
    result = create_ops_dat(u, name_to_ops_dat, block)
    assert name_to_ops_dat['u'].name == namespace['ops_dat_name'](u.name)
    assert name_to_ops_dat['u']._C_typename == namespace['ops_dat_type']
    # Metadata initializers (shape, base offsets, positive/negative pads)
    assert result[0].expr.lhs.name == namespace['ops_dat_dim'](u.name)
    assert result[0].expr.rhs.params == (Integer(4), )
    assert result[1].expr.lhs.name == namespace['ops_dat_base'](u.name)
    assert result[1].expr.rhs.params == (Zero(), )
    assert result[2].expr.lhs.name == namespace['ops_dat_d_p'](u.name)
    assert result[2].expr.rhs.params == (Integer(2), )
    assert result[3].expr.lhs.name == namespace['ops_dat_d_m'](u.name)
    assert result[3].expr.rhs.params == (Integer(-2), )
    # The ops_decl_dat declaration itself
    assert result[4].expr.lhs == name_to_ops_dat['u']
    assert result[4].expr.rhs.name == namespace['ops_decl_dat'].name
    assert result[4].expr.rhs.args == (
        block, 1,
        Symbol(namespace['ops_dat_dim'](u.name)),
        Symbol(namespace['ops_dat_base'](u.name)),
        Symbol(namespace['ops_dat_d_m'](u.name)),
        Symbol(namespace['ops_dat_d_p'](u.name)),
        Byref(u.indexify((0, ))),
        Literal('"%s"' % u._C_typedata),
        Literal('"u"'))

def test_create_ops_arg_constant(self):
    """
    A Constant becomes a read-only ops_arg_gbl passed by reference, with
    the leading '*' stripped from its name.
    """
    a = Constant(name='*a')
    res = create_ops_arg(a, {}, {})
    assert res.name == namespace['ops_arg_gbl'].name
    assert res.args == [
        Byref(Constant(name='a')),
        1,
        Literal('"%s"' % dtype_to_cstr(a.dtype)),
        namespace['ops_read']
    ]

@pytest.mark.parametrize('read', [True, False])
def test_create_ops_arg_function(self, read):
    """
    An OpsAccessible becomes an ops_arg_dat with its stencil and a
    read/write flag matching the accessible's read-only-ness.
    """
    u = OpsAccessible('u', np.float32, read)
    dat = OpsDat('u_dat')
    stencil = OpsStencil('stencil')
    res = create_ops_arg(u, {'u': dat}, {u: stencil})
    assert res.name == namespace['ops_arg_dat'].name
    assert res.args == [
        dat,
        1,
        stencil,
        Literal('"%s"' % dtype_to_cstr(u.dtype)),
        namespace['ops_read'] if read else namespace['ops_write']
    ]