def simple_function_fissionable(a, b, exprs, iters):
    """
    Build the Callable ``foo`` taking the functions behind ``a`` and ``b``.

    The intended layout of the generated code is::

        void foo(a, b)
          for i
            for j
              for k
                expr0
                expr2

    ``iters[0]``, ``iters[1]`` and ``iters[2]`` are called, outermost first,
    to wrap the body; ``exprs[0]`` and ``exprs[2]`` share the innermost level.
    """
    arguments = [operand.base.function for operand in (a, b)]
    outer, middle, inner = iters[0], iters[1], iters[2]
    nest = outer(middle(inner([exprs[0], exprs[2]])))
    return Callable('foo', nest, 'void', arguments, ())
def update_halo(f, fixed):
    """
    Build a Callable performing the halo exchange of a :class:`TensorFunction`.

    For every dimension of ``f`` not listed in ``fixed``, two guarded calls to
    ``sendrecv_<f.name>`` are emitted: one sending to the left neighbour (and
    receiving from the right one), one sending to the right neighbour (and
    receiving from the left one). Each call is wrapped in a Conditional whose
    condition is a dedicated mask Symbol, which becomes a parameter of the
    returned Callable.
    """
    # Requirements
    assert f.is_Function
    assert f.grid is not None

    distributor = f.grid.distributor
    nb = distributor._C_neighbours.obj
    comm = distributor._C_comm

    # One offset Symbol for each fixed dimension
    fixed = {d: Symbol(name="o%s" % d.root) for d in fixed}

    views = get_views(f, fixed)

    calls = []
    masks = []
    for d in f.dimensions:
        if d in fixed:
            continue

        rpeer = FieldFromPointer("%sright" % d, nb)
        lpeer = FieldFromPointer("%sleft" % d, nb)

        # (side sent to, side received from, mask name, first peer, second peer)
        exchanges = (
            (LEFT, RIGHT, 'm%sl' % d, rpeer, lpeer),
            (RIGHT, LEFT, 'm%sr' % d, lpeer, rpeer),
        )
        for toside, fromside, mask_name, peer0, peer1 in exchanges:
            owned_sizes, owned_offsets = views[(d, toside, OWNED)]
            halo_sizes, halo_offsets = views[(d, fromside, HALO)]
            assert owned_sizes == halo_sizes
            arguments = ([f] + list(f.symbolic_shape) + owned_sizes +
                         owned_offsets + halo_offsets + [peer0, peer1, comm])
            exchange = Call('sendrecv_%s' % f.name, arguments)
            mask = Symbol(name=mask_name)
            calls.append(Conditional(mask, exchange))
            masks.append(mask)

    sizes = [d.symbolic_size for d in f.dimensions]
    parameters = [f] + masks + [comm, nb] + list(fixed.values()) + sizes
    return Callable('halo_exchange_%s' % f.name, List(body=calls), 'void',
                    parameters, ('static',))
def simple_function_with_paddable_arrays(a_dense, b_dense, exprs, iters):
    """
    Build the Callable ``foo`` taking the functions behind ``a_dense`` and
    ``b_dense``, then resolve its time stepping.

    The intended layout of the generated code is::

        void foo(a_dense, b_dense)
          for i
            for j
              for k
                <exprs[6]>

    The Callable is post-processed with ``ResolveTimeStepping`` and the
    substitutions it yields are applied through ``SubstituteExpression``.
    """
    arguments = [operand.base.function for operand in (a_dense, b_dense)]
    outer, middle, inner = iters[0], iters[1], iters[2]
    nest = outer(middle(inner(exprs[6])))

    func = Callable('foo', nest, 'void', arguments, ())
    func, substitutions = ResolveTimeStepping().visit(func)
    return SubstituteExpression(subs=substitutions).visit(func)
def simple_function_fissionable(a, b, exprs, iters):
    """
    Build the Callable ``foo`` taking the functions behind ``a`` and ``b``,
    then resolve its time stepping.

    The intended layout of the generated code is::

        void foo(a, b)
          for i
            for j
              for k
                expr0
                expr2

    The Callable is post-processed with ``ResolveTimeStepping`` and the
    substitutions it yields are applied through ``SubstituteExpression``.
    """
    arguments = [operand.base.function for operand in (a, b)]
    outer, middle, inner = iters[0], iters[1], iters[2]
    nest = outer(middle(inner([exprs[0], exprs[2]])))

    func = Callable('foo', nest, 'void', arguments, ())
    func, substitutions = ResolveTimeStepping().visit(func)
    return SubstituteExpression(subs=substitutions).visit(func)
def _make_haloupdate(self, f, hse, key, msg=None):
    """
    Build the Callable ``haloupdate<key>`` looping over the entries of ``msg``.

    For each of the ``ncomms`` entries, the generated loop body posts an
    ``MPI_Irecv``, calls ``gather_<key>`` (skipped when the destination rank
    is ``MPI_PROC_NULL``) and posts an ``MPI_Isend``.
    """
    comm = f.grid.distributor._obj_comm

    # One offset Symbol for each dimension in `hse.loc_indices`
    fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices}

    # The loop index, also used to select the current entry of `msg`
    dim = Dimension(name='i')
    msgi = IndexedPointer(msg, dim)

    def entry(fname):
        return FieldFromComposite(fname, msgi)

    bufg = entry(msg._C_field_bufg)
    bufs = entry(msg._C_field_bufs)

    fromrank = entry(msg._C_field_from)
    torank = entry(msg._C_field_to)

    ndist = len(f._dist_dimensions)
    sizes = [entry('%s[%d]' % (msg._C_field_sizes, i)) for i in range(ndist)]
    msg_offsets = [entry('%s[%d]' % (msg._C_field_ofsg, i))
                   for i in range(ndist)]

    # Fixed dimensions use their own Symbol; the others consume, in order,
    # the offsets carried by `msg`
    ofsg = []
    for d in f.dimensions:
        ofsg.append(fixed.get(d) or msg_offsets.pop(0))

    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')),
                         Call('gather_%s' % key, [bufg] + sizes + [f] + ofsg))

    # Make Irecv/Isend
    count = reduce(mul, sizes, 1)
    rrecv = Byref(entry(msg._C_field_rrecv))
    rsend = Byref(entry(msg._C_field_rsend))
    recv_args = [bufs, count, Macro(dtype_to_mpitype(f.dtype)),
                 fromrank, Integer(13), comm, rrecv]
    recv = Call('MPI_Irecv', recv_args)
    send_args = [bufg, count, Macro(dtype_to_mpitype(f.dtype)),
                 torank, Integer(13), comm, rsend]
    send = Call('MPI_Isend', send_args)

    # The -1 below is because an Iteration, by default, generates <=
    ncomms = Symbol(name='ncomms')
    loop = Iteration([recv, gather, send], dim, ncomms - 1)

    parameters = [f, comm, msg, ncomms] + list(fixed.values())
    return Callable('haloupdate%d' % key, loop, 'void', parameters,
                    ('static',))
def sendrecv(f, fixed):
    """
    Build the Callable ``sendrecv_<f.name>`` exchanging halo data along an
    arbitrary dimension and side.

    The generated body posts an ``MPI_Irecv``, gathers the outgoing data into
    ``bufg``, posts an ``MPI_Isend``, waits on both requests and finally
    scatters ``bufs`` into ``dat`` (skipped when the sender is
    ``MPI_PROC_NULL``).
    """
    assert f.is_Function
    assert f.grid is not None

    comm = f.grid.distributor._C_comm

    # Send/receive buffers span the non-fixed dimensions only
    buf_dims = [Dimension(name='buf_%s' % d.root)
                for d in f.dimensions if d not in fixed]
    bufg = Array(name='bufg', dimensions=buf_dims, dtype=f.dtype,
                 scope='heap')
    bufs = Array(name='bufs', dimensions=buf_dims, dtype=f.dtype,
                 scope='heap')

    dat_dims = [Dimension(name='dat_%s' % d.root) for d in f.dimensions]
    dat = Array(name='dat', dimensions=dat_dims, dtype=f.dtype,
                scope='external')

    # Gather ("og") and scatter ("os") offsets, one per dimension
    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]
    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    gather_args = [bufg] + list(bufg.shape) + [dat] + list(dat.shape) + ofsg
    gather = Call('gather_%s' % f.name, gather_args)
    scatter_args = [bufs] + list(bufs.shape) + [dat] + list(dat.shape) + ofss
    scatter = Call('scatter_%s' % f.name, scatter_args)

    # The scatter must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    srecv = MPIStatusObject(name='srecv')
    rrecv = MPIRequestObject(name='rrecv')
    rsend = MPIRequestObject(name='rsend')

    count = reduce(mul, bufs.shape, 1)
    recv_args = [bufs, count, Macro(numpy_to_mpitypes(f.dtype)),
                 fromrank, '13', comm, rrecv]
    recv = Call('MPI_Irecv', recv_args)
    send_args = [bufg, count, Macro(numpy_to_mpitypes(f.dtype)),
                 torank, '13', comm, rsend]
    send = Call('MPI_Isend', send_args)

    waitrecv = Call('MPI_Wait', [rrecv, srecv])
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    exchange = List(body=[recv, gather, send, waitsend, waitrecv, scatter])
    body = List(body=[ArrayCast(dat), iet_insert_C_decls(exchange)])

    parameters = ([dat] + list(dat.shape) + list(bufs.shape) + ofsg + ofss +
                  [fromrank, torank, comm])
    return Callable('sendrecv_%s' % f.name, body, 'void', parameters,
                    ('static',))
def complex_function(a, b, c, d, exprs, iters):
    """
    Build the Callable ``foo`` taking the functions behind ``a``, ``b``,
    ``c`` and ``d``.

    The body is ``iters[0]`` wrapping three siblings, in this order::

        iters[0]
          iters[3]            -> exprs[2]
          iters[1]
            iters[2]          -> exprs[3], exprs[4]
          iters[4]            -> exprs[5]
    """
    arguments = [operand.base.function for operand in (a, b, c, d)]

    wrap = iters[0]
    head = iters[3](exprs[2])
    core = iters[1](iters[2]([exprs[3], exprs[4]]))
    tail = iters[4](exprs[5])
    nest = wrap([head, core, tail])

    return Callable('foo', nest, 'void', arguments, ())
def __make_tfunc(self, name, iet, root, threads):
    """
    Wrap ``iet`` into a Callable meant to be run by a thread, and build the
    SharedData object through which it receives its dynamic parameters.

    The parameters required by ``iet`` are split into those known to ``root``
    (plus shared-memory Arrays), which become plain Callable parameters, and
    the remaining ones, which become fields of the SharedData.

    Returns the pair ``(Callable, SharedData)``.
    """
    # Create the SharedData
    required = derive_parameters(iet)
    shared_arrays = tuple(i for i in required if i.is_Array and i._mem_shared)
    known = root.parameters + shared_arrays
    parameters, dynamic_parameters = split(required, lambda i: i in known)

    sdata = SharedData(name=self.sregistry.make_name(prefix='sdata'),
                       nthreads_std=threads.size,
                       fields=dynamic_parameters)
    parameters.append(sdata)

    def flag():
        return FieldFromPointer(sdata._field_flag, sdata.symbolic_base)

    # Prepend the unwinded SharedData fields, available upon thread activation
    preactions = []
    for i in dynamic_parameters:
        preactions.append(
            DummyExpr(i, FieldFromPointer(i.name, sdata.symbolic_base)))
    preactions.append(
        DummyExpr(sdata.symbolic_id,
                  FieldFromPointer(sdata._field_id, sdata.symbolic_base)))

    # Append the flag reset (the flag is set back to 1 once the work is done)
    reset = List(body=[BlankLine, DummyExpr(flag(), 1)])
    postactions = [reset]

    work = List(body=preactions + [iet] + postactions)

    # The thread has work to do when it receives the signal that all locks have
    # been set to 0 by the main thread
    work = Conditional(CondEq(flag(), 2), work)

    # The thread keeps spinning until the alive flag is set to 0 by the main
    # thread
    work = While(CondNe(flag(), 0), work)

    return Callable(name, work, 'void', parameters, 'static'), sdata
def _make_haloupdate(self, f, hse, key, **kwargs):
    """
    Build the Callable ``haloupdate<key>`` issuing one send/recv call for each
    halo of ``hse`` whose ``dim`` is a tuple.

    The send/recv routine is taken from ``self._cache_dims[f.dimensions][0]``
    and each call is produced by ``self._call_sendrecv``, which also receives
    the position of the call in the body as the ``haloid`` keyword argument.
    """
    distributor = f.grid.distributor
    nb = distributor._obj_neighborhood
    comm = distributor._obj_comm
    sendrecv = self._cache_dims[f.dimensions][0]

    # One offset Symbol for each dimension in `hse.loc_indices`
    fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices}

    # Only retain the halos required by the Diag scheme
    # Note: `sorted` is only for deterministic code generation
    halos = sorted(i for i in hse.halos if isinstance(i.dim, tuple))

    calls = []
    for haloid, (dims, tosides) in enumerate(halos):
        # Data to be sent: OWNED region along the `tosides`
        to_mapper = OrderedDict(zip(dims, tosides))
        sizes = [f._C_get_field(OWNED, d, s).size
                 for d, s in to_mapper.items()]
        to_name = ''.join(side.name[0] for side in to_mapper.values())
        torank = FieldFromPointer(to_name, nb)
        ofsg = [fixed.get(d, f._C_get_field(OWNED, d, to_mapper.get(d)).offset)
                for d in f.dimensions]

        # Data to be received: HALO region along the flipped sides
        from_mapper = OrderedDict(zip(dims, [side.flip() for side in tosides]))
        from_name = ''.join(side.name[0] for side in from_mapper.values())
        fromrank = FieldFromPointer(from_name, nb)
        ofss = [fixed.get(d, f._C_get_field(HALO, d, from_mapper.get(d)).offset)
                for d in f.dimensions]

        kwargs['haloid'] = haloid

        call = self._call_sendrecv(sendrecv.name, f, sizes, ofsg, ofss,
                                   fromrank, torank, comm, **kwargs)
        calls.append(call)

    parameters = [f, comm, nb] + list(fixed.values())
    return Callable('haloupdate%d' % key, List(body=calls), 'void',
                    parameters, ('static',))
def _make_copy(self, f, fixed, swap=False):
    """
    Build a Callable copying data between ``f`` and a contiguous Array.

    With ``swap=False`` (default) the Callable, named ``gather<ndim>d``,
    copies an arbitrary convex region of ``f`` into the Array ``buf``.
    Otherwise the Callable, named ``scatter<ndim>d``, copies ``buf`` into an
    arbitrary convex region of ``f``.

    Returns the pair ``(Callable, state.input)``, where ``state`` is the
    result of running the DLE ``'simd'`` transformation on the copy loop nest.
    """
    # The buffer only spans the non-fixed dimensions
    free_dims = [d for d in f.dimensions if d not in fixed]
    buf_dims = [Dimension(name='buf_%s' % d.root) for d in free_dims]
    buf_indices = [d.root for d in free_dims]
    buf = Array(name='buf', dimensions=buf_dims, dtype=f.dtype)

    # `f` is accessed at `offset + index`; fixed dimensions have no index
    f_offsets = [Symbol(name='o%s' % d.root) for d in f.dimensions]
    f_indices = [offset + (0 if d in fixed else d.root)
                 for offset, d in zip(f_offsets, f.dimensions)]

    if swap is False:
        name = 'gather%dd' % f.ndim
        lhs, rhs = buf[buf_indices], f[f_indices]
    else:
        name = 'scatter%dd' % f.ndim
        lhs, rhs = f[f_indices], buf[buf_indices]

    # Wrap the copy in a loop nest, building it from the innermost loop
    nest = Expression(DummyEq(lhs, rhs))
    for index, dim in zip(reversed(buf_indices), reversed(buf_dims)):
        # The -1 below is because an Iteration, by default, generates <=
        nest = Iteration(nest, index, dim.symbolic_size - 1,
                         properties=PARALLEL)
    nest = List(body=[ArrayCast(f), ArrayCast(buf), nest])

    # Optimize the memory copy with the DLE
    from devito.dle import transform
    state = transform(nest, 'simd', {'openmp': self._threaded})

    parameters = [buf] + list(buf.shape) + [f] + f_offsets + state.input
    func = Callable(name, state.nodes, 'void', parameters, ('static',))
    return func, state.input
def test_transformer_replace_function_body(block1, block2): """Create a Function and replace its body with another.""" args = FindSymbols().visit(block1) f = Callable('foo', block1, 'void', args) assert str(f.ccode) == """void foo() { for (int i = 0; i < 3; i += 1) { for (int j = 0; j < 5; j += 1) { for (int k = 0; k < 7; k += 1) { a[i] = a[i] + b[i] + 5.0F; } } } }""" f = Transformer({block1: block2}).visit(f) assert str(f.ccode) == """void foo()
def complex_function(a, b, c, d, exprs, iters):
    """
    Build the Callable ``foo`` taking the functions behind ``a``, ``b``,
    ``c`` and ``d``, then resolve its time stepping.

    The body is ``iters[0]`` wrapping three siblings, in this order::

        iters[0]
          iters[3]            -> exprs[2]
          iters[1]
            iters[2]          -> exprs[3], exprs[4]
          iters[4]            -> exprs[5]

    The Callable is post-processed with ``ResolveTimeStepping`` and the
    substitutions it yields are applied through ``SubstituteExpression``.
    """
    arguments = [operand.base.function for operand in (a, b, c, d)]

    wrap = iters[0]
    head = iters[3](exprs[2])
    core = iters[1](iters[2]([exprs[3], exprs[4]]))
    tail = iters[4](exprs[5])
    nest = wrap([head, core, tail])

    func = Callable('foo', nest, 'void', arguments, ())
    func, substitutions = ResolveTimeStepping().visit(func)
    return SubstituteExpression(subs=substitutions).visit(func)
def copy(f, fixed, swap=False):
    """
    Build a :class:`Callable` copying data between two Arrays shaped after
    ``f``.

    With ``swap=False`` (default) the Callable, named ``gather_<f.name>``,
    copies an arbitrary convex region of ``dat`` into the contiguous
    :class:`Array` ``buf``. Otherwise the Callable, named
    ``scatter_<f.name>``, copies ``buf`` into an arbitrary convex region of
    ``dat``.
    """
    # The buffer only spans the non-fixed dimensions
    free_dims = [d for d in f.dimensions if d not in fixed]
    buf_dims = [Dimension(name='buf_%s' % d.root) for d in free_dims]
    buf_indices = [d.root for d in free_dims]
    buf = Array(name='buf', dimensions=buf_dims, dtype=f.dtype)

    # `dat` is accessed at `offset + index`; fixed dimensions have no index
    dat_dims = [Dimension(name='dat_%s' % d.root) for d in f.dimensions]
    dat_offsets = [Symbol(name='o%s' % d.root) for d in f.dimensions]
    dat_indices = [offset + (0 if d in fixed else d.root)
                   for offset, d in zip(dat_offsets, f.dimensions)]
    dat = Array(name='dat', dimensions=dat_dims, dtype=f.dtype)

    if swap is False:
        name = 'gather_%s' % f.name
        lhs, rhs = buf[buf_indices], dat[dat_indices]
    else:
        name = 'scatter_%s' % f.name
        lhs, rhs = dat[dat_indices], buf[buf_indices]

    # Wrap the copy in a loop nest, building it from the innermost loop
    nest = Expression(DummyEq(lhs, rhs))
    for index, dim in zip(reversed(buf_indices), reversed(buf_dims)):
        # -1 as Iteration generates <=
        nest = Iteration(nest, index, dim.symbolic_size - 1)
    nest = List(body=[ArrayCast(dat), ArrayCast(buf), nest])

    parameters = ([buf] + list(buf.shape) + [dat] + list(dat.shape) +
                  dat_offsets)
    return Callable(name, nest, 'void', parameters, ('static',))
def _make_sendrecv(self, f, hse, key='', msg=None):
    """
    Build the Callable ``sendrecv<key>`` that starts one halo exchange.

    The generated body posts an ``MPI_Irecv`` into the ``bufs`` field of
    ``msg``, calls ``gather<key>`` to fill the ``bufg`` field (skipped when
    the destination rank is ``MPI_PROC_NULL``) and posts an ``MPI_Isend``.
    No wait is issued here.
    """
    comm = f.grid.distributor._obj_comm

    def field(fname):
        return FieldFromPointer(fname, msg)

    bufg = field(msg._C_field_bufg)
    bufs = field(msg._C_field_bufs)

    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    ndist = len(f._dist_dimensions)
    sizes = [field('%s[%d]' % (msg._C_field_sizes, i)) for i in range(ndist)]

    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Call('gather%s' % key, [bufg] + sizes + [f] + ofsg)
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather)

    count = reduce(mul, sizes, 1)
    rrecv = Byref(field(msg._C_field_rrecv))
    rsend = Byref(field(msg._C_field_rsend))
    recv_args = [bufs, count, Macro(dtype_to_mpitype(f.dtype)),
                 fromrank, Integer(13), comm, rrecv]
    recv = Call('MPI_Irecv', recv_args)
    send_args = [bufg, count, Macro(dtype_to_mpitype(f.dtype)),
                 torank, Integer(13), comm, rsend]
    send = Call('MPI_Isend', send_args)

    body = List(body=[recv, gather, send])
    body = List(body=iet_insert_C_decls(body))

    parameters = [f] + ofsg + [fromrank, torank, comm, msg]
    return Callable('sendrecv%s' % key, body, 'void', parameters,
                    ('static',))
def _make_halowait(self, f, hse, key, msg=None):
    """
    Build the Callable ``halowait<key>`` looping over the entries of ``msg``.

    For each of the ``ncomms`` entries, the generated loop body waits on the
    send request, waits on the receive request and then calls
    ``scatter_<key>`` (skipped when the sender rank is ``MPI_PROC_NULL``).
    """
    # One offset Symbol for each dimension in `hse.loc_indices`
    fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices}

    # The loop index, also used to select the current entry of `msg`
    dim = Dimension(name='i')
    msgi = IndexedPointer(msg, dim)

    def entry(fname):
        return FieldFromComposite(fname, msgi)

    bufs = entry(msg._C_field_bufs)
    fromrank = entry(msg._C_field_from)

    ndist = len(f._dist_dimensions)
    sizes = [entry('%s[%d]' % (msg._C_field_sizes, i)) for i in range(ndist)]
    msg_offsets = [entry('%s[%d]' % (msg._C_field_ofss, i))
                   for i in range(ndist)]

    # Fixed dimensions use their own Symbol; the others consume, in order,
    # the offsets carried by `msg`
    ofss = []
    for d in f.dimensions:
        ofss.append(fixed.get(d) or msg_offsets.pop(0))

    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Call('scatter_%s' % key, [bufs] + sizes + [f] + ofss)
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    rrecv = Byref(entry(msg._C_field_rrecv))
    waitrecv = Call('MPI_Wait', [rrecv, Macro('MPI_STATUS_IGNORE')])
    rsend = Byref(entry(msg._C_field_rsend))
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    # The -1 below is because an Iteration, by default, generates <=
    ncomms = Symbol(name='ncomms')
    loop = Iteration([waitsend, waitrecv, scatter], dim, ncomms - 1)

    parameters = [f] + list(fixed.values()) + [msg, ncomms]
    return Callable('halowait%d' % key, loop, 'void', parameters,
                    ('static',))
def _make_wait(self, f, hse, key, msg=None):
    """
    Build the Callable ``wait_<key>`` that completes one halo exchange.

    The generated body waits on the send request, waits on the receive
    request and then calls ``scatter<key>`` with the ``bufs`` field of
    ``msg`` (skipped when the sender rank is ``MPI_PROC_NULL``).
    """
    def field(fname):
        return FieldFromPointer(fname, msg)

    bufs = field(msg._C_field_bufs)

    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')

    ndist = len(f._dist_dimensions)
    sizes = [field('%s[%d]' % (msg._C_field_sizes, i)) for i in range(ndist)]

    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Call('scatter%s' % key, [bufs] + sizes + [f] + ofss)
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    rrecv = Byref(field(msg._C_field_rrecv))
    waitrecv = Call('MPI_Wait', [rrecv, Macro('MPI_STATUS_IGNORE')])
    rsend = Byref(field(msg._C_field_rsend))
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    body = List(body=[waitsend, waitrecv, scatter])

    parameters = [f] + ofss + [fromrank, msg]
    return Callable('wait_%s' % key, body, 'void', parameters, ('static',))
def _make_halowait(self, f, hse, key, msg=None):
    """
    Build the Callable ``halowait<key>`` issuing one wait call for each halo
    of ``hse`` whose ``dim`` is a tuple.

    The wait routine is taken from ``self._cache_dims[f.dimensions][2]``; the
    n-th call receives a reference to the n-th entry of ``msg``.
    """
    nb = f.grid.distributor._obj_neighborhood
    wait = self._cache_dims[f.dimensions][2]

    # One offset Symbol for each dimension in `hse.loc_indices`
    fixed = {d: Symbol(name="o%s" % d.root) for d in hse.loc_indices}

    # Only retain the halos required by the Diag scheme
    # Note: `sorted` is only for deterministic code generation
    halos = sorted(i for i in hse.halos if isinstance(i.dim, tuple))

    calls = []
    for position, (dims, tosides) in enumerate(halos):
        # Data is received from the flipped sides
        from_mapper = OrderedDict(zip(dims, [side.flip() for side in tosides]))
        from_name = ''.join(side.name[0] for side in from_mapper.values())
        fromrank = FieldFromPointer(from_name, nb)
        ofss = [fixed.get(d, f._C_get_field(HALO, d, from_mapper.get(d)).offset)
                for d in f.dimensions]

        msgi = Byref(IndexedPointer(msg, position))

        calls.append(Call(wait.name, [f] + ofss + [fromrank, msgi]))

    parameters = [f] + list(fixed.values()) + [nb, msg]
    return Callable('halowait%d' % key, List(body=calls), 'void',
                    parameters, ('static',))
def _make_haloupdate(self, f, fixed, mask, extra=None, uniquekey=None):
    """
    Build the Callable ``haloupdate<ndim>d<uniquekey>`` performing the halo
    exchange of ``f``.

    For every non-fixed dimension ``d``, a call to ``sendrecv<ndim>d`` is
    emitted for the LEFT and/or RIGHT side, depending on ``mask[(d, LEFT)]``
    and ``mask[(d, RIGHT)]``. ``extra`` is appended both to the arguments of
    each call and to the parameters of the Callable. If ``uniquekey`` is not
    given, it is derived from the values of ``mask``.
    """
    extra = extra or []
    distributor = f.grid.distributor
    nb = distributor._obj_neighborhood
    comm = distributor._obj_comm

    # One offset Symbol for each fixed dimension
    fixed = {d: Symbol(name="o%s" % d.root) for d in fixed}
    free_dims = [d for d in f.dimensions if d not in fixed]

    # Build a mapper `(dim, side, region) -> (size, ofs)` for `f`. `size` and
    # `ofs` are symbolic objects. This mapper tells what data values should be
    # sent (OWNED) or received (HALO) given dimension and side
    mapper = {}
    for d0, side, region in product(free_dims, (LEFT, RIGHT), (OWNED, HALO)):
        sizes, offsets = [], []
        for d1 in f.dimensions:
            if d1 in fixed:
                offsets.append(fixed[d1])
                continue
            meta = f._C_get_field(region if d0 is d1 else NOPAD, d1, side)
            offsets.append(meta.offset)
            sizes.append(meta.size)
        mapper[(d0, side, region)] = (sizes, offsets)

    calls = []
    for d in free_dims:
        rname = ''.join('r' if i is d else 'c' for i in distributor.dimensions)
        rpeer = FieldFromPointer(rname, nb)
        lname = ''.join('l' if i is d else 'c' for i in distributor.dimensions)
        lpeer = FieldFromPointer(lname, nb)

        # (side sent to, side received from, first peer, second peer)
        exchanges = (
            (LEFT, RIGHT, rpeer, lpeer),
            (RIGHT, LEFT, lpeer, rpeer),
        )
        for toside, fromside, peer0, peer1 in exchanges:
            if not mask[(d, toside)]:
                continue
            owned_sizes, owned_offsets = mapper[(d, toside, OWNED)]
            _, halo_offsets = mapper[(d, fromside, HALO)]
            args = ([f] + owned_sizes + owned_offsets + halo_offsets +
                    [peer0, peer1, comm] + extra)
            calls.append(Call('sendrecv%dd' % f.ndim, args))

    if uniquekey is None:
        uniquekey = ''.join(str(int(i)) for i in mask.values())
    name = 'haloupdate%dd%s' % (f.ndim, uniquekey)

    parameters = [f, comm, nb] + list(fixed.values()) + extra
    return Callable(name, List(body=calls), 'void', parameters, ('static',))
def _create_efuncs(self, nodes, state):
    """
    Move Iteration sub-trees into separate Callables, replacing them with
    Calls in the main tree.

    Currently, only tagged, elementizable Iteration objects are targeted.

    Returns the transformed ``nodes`` and a dict whose ``'efuncs'`` entry
    holds the generated Callables (one per distinct ``f_<tag>`` name).
    """
    noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

    efuncs = OrderedDict()
    replacements = {}
    for tree in retrieve_iteration_tree(nodes, mode='superset'):
        # Search an elementizable sub-tree (if any)
        tagged = filter_iterations(tree, lambda i: i.tag is not None, 'asap')
        if not tagged:
            continue
        root = tagged[0]
        if not root.is_Elementizable:
            continue
        target = tree[tree.index(root):]

        # Build a new Iteration/Expression tree with free bounds
        rebuilt = []
        defined_args = {}  # Map of argument values defined by loop bounds
        for iteration in target:
            dname = iteration.dim.name
            bounds = iteration.symbolic_bounds

            # Iteration bounds
            lower = Scalar(name='%sf_m' % dname, dtype=np.int32,
                           is_const=True)
            upper = Scalar(name='%sf_M' % dname, dtype=np.int32,
                           is_const=True)
            defined_args[lower.name] = bounds[0]
            defined_args[upper.name] = bounds[1]

            # Iteration unbounded indices
            ufunc = [Scalar(name='%s_ub%d' % (dname, n), dtype=np.int32)
                     for n in range(len(iteration.uindices))]
            for uf, uindex in zip(ufunc, iteration.uindices):
                defined_args[uf.name] = uindex.symbolic_min
            uindices = [
                IncrDimension(uindex.parent, iteration.dim + as_symbol(uf), 1,
                              uindex.name)
                for uindex, uf in zip(iteration.uindices, ufunc)
            ]

            rebuilt.append(iteration._rebuild(limits=(lower, upper, 1),
                                              offsets=None,
                                              uindices=uindices))

        # Construct elemental function body
        efunc_body = Transformer(dict(zip(target, rebuilt)),
                                 nested=True).visit(root)
        items = FindSymbols().visit(efunc_body)

        # Insert array casts
        casts = [ArrayCast(i) for i in items if i.is_Tensor]
        efunc_body = List(body=casts + [efunc_body])

        # Insert declarations
        external = [i for i in items if i.is_Array]
        efunc_body = iet_insert_C_decls(efunc_body, external)

        # Create the Callable
        name = "f_%d" % root.tag
        params = derive_parameters(efunc_body)
        efuncs.setdefault(name,
                          Callable(name, efunc_body, 'void', params, 'static'))

        # Create the Call; bound-derived parameters get their actual values
        args = [defined_args.get(p.name, p) for p in params]
        replacements[root] = List(header=noinline, body=Call(name, args))

    # Transform the main tree
    processed = Transformer(replacements).visit(nodes)

    return processed, {'efuncs': efuncs.values()}
def _create_elemental_functions(self, nodes, state):
    """
    Move :class:`Iteration` sub-trees into separate :class:`Callable`s,
    replacing them with Calls in the main tree.

    Currently, only tagged, elementizable Iteration objects are targeted.

    Returns the transformed ``nodes`` and a dict whose
    ``'elemental_functions'`` entry holds the generated Callables (one per
    distinct ``f_<tag>`` name).
    """
    noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

    functions = OrderedDict()
    replacements = {}
    for tree in retrieve_iteration_tree(nodes, mode='superset'):
        # Search an elementizable sub-tree (if any)
        tagged = filter_iterations(tree, lambda i: i.tag is not None, 'asap')
        if not tagged:
            continue
        root = tagged[0]
        if not root.is_Elementizable:
            continue
        target = tree[tree.index(root):]

        # Elemental function arguments, as (call-site value, parameter) pairs
        args = []  # Found so far (scalars, tensors)
        maybe_required = set()  # Scalars that *may* have to be passed in
        not_required = set()  # Elemental function locally declared scalars

        # Build a new Iteration/Expression tree with free bounds
        rebuilt = []
        for iteration in target:
            dname = iteration.dim.name
            bounds = iteration.bounds_symbolic

            # Iteration bounds
            start = Scalar(name='%s_start' % dname, dtype=np.int32)
            finish = Scalar(name='%s_finish' % dname, dtype=np.int32)
            args.extend(zip([ccode(b) for b in bounds], (start, finish)))

            # Iteration unbounded indices
            ufunc = [Scalar(name='%s_ub%d' % (dname, n), dtype=np.int32)
                     for n in range(len(iteration.uindices))]
            args.extend(zip([ccode(u.start) for u in iteration.uindices],
                            ufunc))

            limits = [Symbol(start.name), Symbol(finish.name), 1]
            uindices = [UnboundedIndex(u.index, iteration.dim + as_symbol(uf))
                        for u, uf in zip(iteration.uindices, ufunc)]
            rebuilt.append(iteration._rebuild(limits=limits, offsets=None,
                                              uindices=uindices))
            not_required.update({iteration.dim},
                                set(u.index for u in iteration.uindices))

        # Construct elemental function body, and inspect it
        efunc_body = NestedTransformer(dict(zip(target, rebuilt))).visit(root)
        expressions = FindNodes(Expression).visit(efunc_body)
        fsymbols = FindSymbols('symbolics').visit(efunc_body)

        # Add all definitely-required arguments
        not_required.update({e.output for e in expressions if e.is_scalar})
        for sym in fsymbols:
            if sym in not_required:
                continue
            if sym.is_Array:
                cast = "(%s*)%s" % (c.dtype_to_ctype(sym.dtype), sym.name)
                args.append((cast, sym))
            elif sym.is_TensorFunction:
                args.append(("%s_vec" % sym.name, sym))
            elif sym.is_Scalar:
                args.append((sym.name, sym))

        # Add all maybe-required arguments that turn out to be required
        maybe_required.update(
            set(FindSymbols(mode='free-symbols').visit(efunc_body)))
        for sym in fsymbols:
            not_required.update({as_symbol(sym), sym.indexify()})
            for extent in sym.symbolic_shape:
                maybe_required.update(extent.free_symbols)
        required = filter_sorted(maybe_required - not_required,
                                 key=attrgetter('name'))
        args.extend([(r.name, Scalar(name=r.name, dtype=r.dtype))
                     for r in required])

        call, params = zip(*args)
        handle = flatten([p.rtargs for p in params])
        name = "f_%d" % root.tag

        # Produce the new Call
        replacements[root] = List(header=noinline, body=Call(name, call))

        # Produce the new Callable
        functions.setdefault(
            name, Callable(name, efunc_body, 'void', handle, ('static',)))

    # Transform the main tree
    processed = Transformer(replacements).visit(nodes)

    return processed, {'elemental_functions': functions.values()}
def opsit(trees, count):
    """
    Translate the expressions found in ``trees`` into an OPS kernel plus the
    code needed to launch it.

    Parameters
    ----------
    trees : sequence
        Iteration trees; the expressions are collected from ``tree.inner``.
        Iterated twice, so it must not be a one-shot iterator. If several
        items are ``IterationTree`` instances, the iteration range and the
        number of dimensions are taken from the last one.
    count : int
        Identifier forwarded to the ``namespace`` name generators
        (``ops_block``, ``ops_kernel``, ``ops_range``).

    Returns
    -------
    tuple
        ``(callable_kernel, pre_loop, par_loop, it_dims)`` where
        ``callable_kernel`` is the kernel Callable, ``pre_loop`` is the list
        of nodes to be placed before the loop (range and block initializers,
        stencil initializers, dat declarations and the ``ops_partition``
        call), ``par_loop`` is a List wrapping the ``ops_par_loop`` Call and
        ``it_dims`` is the number of iteration dimensions.
    """
    node_factory = OPSNodeFactory()

    expressions = []
    for tree in trees:
        expressions.extend(FindNodes(Expression).visit(tree.inner))

    it_range = []
    it_dims = 0
    for tree in trees:
        if isinstance(tree, IterationTree):
            it_range = [it.bounds() for it in tree]
            it_dims = len(tree)

    block = OPSBlock(namespace['ops_block'](count))
    block_init = Element(cgen.Initializer(
        block,
        Call("ops_decl_block", [it_dims, String(block.name)], False)
    ))

    # The expressions are visited last-to-first; appending and reversing once
    # at the end yields the same list as inserting at the front at each step,
    # without the quadratic cost
    ops_expressions = []
    accesses = defaultdict(set)
    for i in reversed(expressions):
        extend_accesses(accesses, get_accesses(i.expr))
        ops_expressions.append(Expression(make_ops_ast(i.expr, node_factory)))
    ops_expressions.reverse()

    ops_stencils_initializers, ops_stencils = generate_ops_stencils(accesses)

    # Names defined within the expressions; only used for membership tests
    to_remove = {
        f.name for f in FindSymbols('defines').visit(List(body=expressions))
    }

    parameters = FindSymbols('symbolics').visit(List(body=ops_expressions))
    parameters = [
        p for p in parameters
        if p.name != 'OPS_ACC_size' and p.name not in to_remove
    ]
    parameters = sorted(parameters, key=lambda i: (i.is_Constant, i.name))

    arguments = FindSymbols('symbolics').visit(List(body=expressions))
    arguments = [a for a in arguments if a.name not in to_remove]
    arguments = sorted(arguments, key=lambda i: (i.is_Constant, i.name))

    # A fresh list of names is handed to each `fix_ops_acc` call
    ops_expressions = [
        Expression(fix_ops_acc(e.expr, [p.name for p in parameters]))
        for e in ops_expressions
    ]

    callable_kernel = Callable(namespace['ops_kernel'](count),
                               ops_expressions, "void", parameters)

    dat_declarations = []
    argname_to_dat = {}
    for a in arguments:
        if a.is_Constant:
            continue

        dat_dec, dat_sym = to_ops_dat(a, block)
        dat_declarations.extend(dat_dec)
        argname_to_dat.update(dat_sym)

    # Two entries (min, max) for each iteration dimension
    par_loop_range_arr = SymbolicArray(name=namespace['ops_range'](count),
                                       dimensions=(len(it_range) * 2,),
                                       dtype=np.int32)
    range_vals = []
    for mn, mx in it_range:
        range_vals.extend((mn, mx))
    par_loop_range_init = Expression(
        ClusterizedEq(Eq(par_loop_range_arr, ListInitializer(range_vals))))

    ops_args = get_ops_args(list(parameters), ops_stencils, argname_to_dat)

    par_loop = Call("ops_par_loop", [
        FunctionPointer(callable_kernel.name),
        String(callable_kernel.name),
        block,
        it_dims,
        par_loop_range_arr,
        *ops_args
    ])

    pre_loop = ([par_loop_range_init, block_init] +
                ops_stencils_initializers + dat_declarations +
                [Call("ops_partition", [String("")])])

    return (callable_kernel, pre_loop, List(body=[par_loop]), it_dims)
def _build(cls, expressions, **kwargs):
    """
    Build an Operator from ``expressions``.

    The input is lowered in stages -- expressions, Clusters, ScheduleTree,
    Iteration/Expression Tree -- through the ``cls._lower_*`` hooks; the
    resulting IET is then turned into an instance of ``cls`` and decorated
    with the metadata collected along the way.

    Raises ``InvalidOperator`` if any input is not a ``devito.Evaluable``.
    ``kwargs['compiler']`` must be provided.
    """
    expressions = as_tuple(expressions)

    # Input check
    for expr in expressions:
        if not isinstance(expr, Evaluable):
            raise InvalidOperator("Only `devito.Evaluable` are allowed.")

    # Python-level (i.e., compile time) and C-level (i.e., run time) performance
    profiler = create_profile('timers')

    # Lower input expressions
    expressions = cls._lower_exprs(expressions, **kwargs)

    # Group expressions based on iteration spaces and data dependences
    clusters = cls._lower_clusters(expressions, profiler, **kwargs)

    # Lower Clusters to a ScheduleTree
    stree = cls._lower_stree(clusters, **kwargs)

    # Lower ScheduleTree to an Iteration/Expression Tree
    iet, byproduct = cls._lower_iet(stree, profiler, **kwargs)

    # Make it an actual Operator
    op = Callable.__new__(cls, **iet.args)
    Callable.__init__(op, **op.args)

    # Header files, etc.
    headers = list(cls._default_headers)
    headers.extend(byproduct.headers)
    op._headers = headers
    op._globals = list(cls._default_globals)
    includes = list(cls._default_includes)
    includes.extend(profiler._default_includes)
    includes.extend(byproduct.includes)
    op._includes = includes

    # Required for the jit-compilation
    op._compiler = kwargs['compiler']
    op._lib = None
    op._cfunction = None

    # References to local or external routines
    func_table = OrderedDict()
    for ext_call in profiler._ext_calls:
        func_table[ext_call] = MetaCall(None, False)
    for func in byproduct.funcs:
        func_table[func.root.name] = func
    op._func_table = func_table

    # Internal state. May be used to store information about previous runs,
    # autotuning reports, etc
    op._state = cls._initialize_state(**kwargs)

    # Produced by the various compilation passes
    op._input = filter_sorted(flatten(e.reads + e.writes for e in expressions))
    op._output = filter_sorted(flatten(e.writes for e in expressions))
    dimensions = (flatten(cluster.dimensions for cluster in clusters) +
                  byproduct.dimensions)
    op._dimensions = sorted(set(dimensions), key=attrgetter('name'))
    op._dtype, op._dspace = clusters.meta
    op._profiler = profiler

    return op
def _build(cls, expressions, **kwargs):
    """
    Build an Operator from ``expressions``.

    The input equations are lowered to Clusters, then to a Schedule tree and
    to an Iteration/Expression tree (IET), which is instrumented for
    profiling, wrapped in a Callable and specialized through
    ``cls._specialize_iet``. The result is turned into an instance of ``cls``
    and decorated with the metadata collected along the way.

    Recognized ``kwargs``: ``name`` (default ``"Kernel"``) and ``dse``
    (default ``configuration['dse']``). Raises ``InvalidOperator`` if any
    input is not a ``devito.Eq``.
    """
    expressions = as_tuple(expressions)

    # Input check
    for expr in expressions:
        if not isinstance(expr, Eq):
            raise InvalidOperator("Only `devito.Eq` expressions are allowed.")

    name = kwargs.get("name", "Kernel")
    dse = kwargs.get("dse", configuration['dse'])

    # Python-level (i.e., compile time) and C-level (i.e., run time) performance
    profiler = create_profile('timers')

    # Lower input expressions to internal expressions (e.g., attaching metadata)
    expressions = cls._lower_exprs(expressions, **kwargs)

    # Group expressions based on their iteration space and data dependences
    # Several optimizations are applied (fusion, lifting, flop reduction via
    # DSE, ...)
    clusters = clusterize(expressions, dse_mode=set_dse_mode(dse))

    # Lower Clusters to a Schedule tree
    stree = st_build(clusters)

    # Lower Schedule tree to an Iteration/Expression tree (IET)
    iet = iet_build(stree)

    # Instrument the IET for C-level profiling
    iet = profiler.instrument(iet)

    # Wrap the IET with a Callable
    parameters = derive_parameters(iet, True)
    op = Callable(name, iet, 'int', parameters, ())

    # Lower IET to a Target-specific IET
    op, target_state = cls._specialize_iet(op, **kwargs)

    # Make it an actual Operator
    op = Callable.__new__(cls, **op.args)
    Callable.__init__(op, **op.args)

    # Header files, etc.
    headers = list(cls._default_headers)
    headers.extend(target_state.headers)
    op._headers = headers
    op._globals = list(cls._default_globals)
    includes = list(cls._default_includes)
    includes.extend(profiler._default_includes)
    includes.extend(target_state.includes)
    op._includes = includes

    # Required for the jit-compilation
    op._compiler = configuration['compiler']
    op._lib = None
    op._cfunction = None

    # References to local or external routines
    func_table = OrderedDict()
    for ext_call in profiler._ext_calls:
        func_table[ext_call] = MetaCall(None, False)
    for func in target_state.funcs:
        func_table[func.root.name] = func
    op._func_table = func_table

    # Internal state. May be used to store information about previous runs,
    # autotuning reports, etc
    op._state = cls._initialize_state(**kwargs)

    # Produced by the various compilation passes
    op._input = filter_sorted(flatten(e.reads + e.writes for e in expressions))
    op._output = filter_sorted(flatten(e.writes for e in expressions))
    dimensions = filter_sorted(flatten(e.dimensions for e in expressions))
    dimensions.extend(target_state.dimensions)
    op._dimensions = dimensions
    op._dtype, op._dspace = clusters.meta
    op._profiler = profiler

    return op
def _make_fetchprefetch(self, iet, sync_ops, pieces, root):
    """
    Wrap ``iet`` with the data transfers described by ``sync_ops``.

    For each sync operation three transfers are created: an initial fetch
    (collected into an ``init_device`` Callable invoked at initialization
    time), a "present" clause placed right before ``iet``, and a prefetch
    (collected into the body handed to ``make_thread_ctx``).

    ``pieces`` is updated in place (``funcs``, ``init``, ``finalize`` and
    ``objs``); the new IET wrapping the input ``iet`` is returned.
    """
    queue_id = SharedData._field_id

    def transfer_mask(sync, start):
        # `(start, size)` along the Dimension being streamed, FULL elsewhere
        return [(start, sync.size) if d.root is sync.dim.root else FULL
                for d in sync.dimensions]

    fetches, presents, prefetches = [], [], []
    for sync in sync_ops:
        function = sync.function

        # Construct init IET
        init_transfer = PragmaTransfer(self.lang._map_to, function,
                                       imask=transfer_mask(sync, sync.ifetch))
        fetches.append(Conditional(sync.fcond, init_transfer))

        # Construct present clauses
        presents.append(PragmaTransfer(self.lang._map_present, function,
                                       imask=transfer_mask(sync, sync.fetch)))

        # Construct prefetch IET
        ahead_transfer = PragmaTransfer(self.lang._map_to_wait, function,
                                        imask=transfer_mask(sync, sync.pfetch),
                                        queueid=queue_id)
        prefetches.append(Conditional(sync.pcond, ahead_transfer))

    # Turn init IET into a Callable
    functions = filter_ordered(s.function for s in sync_ops)
    init_name = self.sregistry.make_name(prefix='init_device')
    init_body = List(body=fetches)
    init_params = filter_sorted(functions + derive_parameters(init_body))
    pieces.funcs.append(
        Callable(init_name, init_body, 'void', init_params, 'static')
    )

    # Perform initial fetch by the main thread
    init_call = List(header=c.Comment("Initialize data stream"),
                     body=[Call(init_name, init_params), BlankLine])
    pieces.init.append(init_call)

    # Turn prefetch IET into a ThreadFunction
    thread_name = self.sregistry.make_name(prefix='prefetch_host_to_device')
    thread_body = List(header=c.Line(), body=prefetches)
    tctx = make_thread_ctx(thread_name, thread_body, root, None, sync_ops,
                           self.sregistry)
    pieces.funcs.extend(tctx.funcs)

    # Glue together all the IET pieces, including the activation logic
    sdata = tctx.sdata
    threads = tctx.threads
    flag = FieldFromComposite(sdata._field_flag, sdata[threads.index])
    glued = [BlankLine, BusyWait(CondNe(flag, 1))]
    glued.extend(presents)
    glued.extend([iet, tctx.activate])
    iet = List(body=glued)

    # Fire up the threads
    pieces.init.append(tctx.init)

    # Final wait before jumping back to Python land
    pieces.finalize.append(tctx.finalize)

    # Keep track of created objects
    pieces.objs.add(sync_ops, sdata, threads)

    return iet
def _make_fetchwaitprefetch(self, iet, sync_ops, pieces, root):
    """
    Wrap ``iet`` with the data transfers described by ``sync_ops``.

    For each sync operation, the initial fetch starts from the first
    iteration along ``s.dim`` (``symbolic_min`` if the direction is Forward,
    ``symbolic_max`` otherwise), while the prefetch targets the iteration
    one step further along the direction. Both are guarded by the condition
    returned by ``s.next_cbk``.

    ``pieces`` is updated in place (``funcs``, ``init``, ``threads`` and
    ``finalize``); the new IET wrapping the input ``iet`` is returned.
    """
    def transfer_mask(sync, start):
        # `(start, size)` along the Dimension being streamed, FULL elsewhere
        return [(start, sync.size) if d.root is sync.dim.root else FULL
                for d in sync.dimensions]

    fetches, presents, prefetches = [], [], []
    for sync in sync_ops:
        forward = sync.direction is Forward

        # Where the very first fetch takes place
        bound = sync.dim.symbolic_min if forward else sync.dim.symbolic_max
        fc = sync.fetch.subs(sync.dim, bound)
        fc_cond = sync.next_cbk(bound)

        # Where the prefetch takes place
        if forward:
            pfc = sync.fetch + 1
            pfc_cond = sync.next_cbk(sync.dim + 1)
        else:
            pfc = sync.fetch - 1
            pfc_cond = sync.next_cbk(sync.dim - 1)

        # Construct init IET
        init_pragmas = self.lang._map_to(sync.function, transfer_mask(sync, fc))
        init_transfer = PragmaList(init_pragmas,
                                   {sync.function} | fc.free_symbols)
        fetches.append(Conditional(fc_cond, init_transfer))

        # Construct present clauses
        present = self.lang._map_present(sync.function,
                                         transfer_mask(sync, sync.fetch))
        presents.extend(as_list(present))

        # Construct prefetch IET
        ahead_pragmas = self.lang._map_to_wait(sync.function,
                                               transfer_mask(sync, pfc),
                                               SharedData._field_id)
        ahead_transfer = PragmaList(ahead_pragmas,
                                    {sync.function} | pfc.free_symbols)
        prefetches.append(Conditional(pfc_cond, ahead_transfer))

    # Turn init IET into a Callable
    functions = filter_ordered(s.function for s in sync_ops)
    init_name = self.sregistry.make_name(prefix='init_device')
    init_body = List(body=fetches)
    init_params = filter_sorted(functions + derive_parameters(init_body))
    pieces.funcs.append(
        Callable(init_name, init_body, 'void', init_params, 'static')
    )

    # Perform initial fetch by the main thread
    init_call = List(header=c.Comment("Initialize data stream"),
                     body=[Call(init_name, init_params), BlankLine])
    pieces.init.append(init_call)

    # Turn prefetch IET into a ThreadFunction
    thread_name = self.sregistry.make_name(prefix='prefetch_host_to_device')
    thread_body = List(header=c.Line(), body=prefetches)
    tctx = make_thread_ctx(thread_name, thread_body, root, None, sync_ops,
                           self.sregistry)
    pieces.funcs.extend(tctx.funcs)

    # Glue together all the IET pieces, including the activation logic
    sdata = tctx.sdata
    threads = tctx.threads
    flag = FieldFromComposite(sdata._field_flag, sdata[threads.index])
    wait = BusyWait(CondNe(flag, 1))
    iet = List(body=[BlankLine, wait, List(header=presents), iet, tctx.activate])

    # Fire up the threads
    pieces.init.append(tctx.init)
    pieces.threads.append(threads)

    # Final wait before jumping back to Python land
    pieces.finalize.append(tctx.finalize)

    return iet
def _make_fetchwaitprefetch(self, iet, sync_ops, pieces, root):
    """
    Wrap ``iet`` with the data transfers described by ``sync_ops``.

    For each sync operation three pieces are created: an initial fetch, a
    "present" clause placed right before ``iet``, and a prefetch executed by
    a threaded Callable. Fetch and prefetch are guarded so that, going
    Forward, the last accessed index stays below the size of the streamed
    field and, going the other way, the first accessed index stays
    non-negative.

    ``pieces`` is updated in place (``funcs``, ``init``, ``threads`` and
    ``finalize``); the new IET wrapping the input ``iet`` is returned.
    """
    threads = self.__make_threads()

    def transfer_mask(sync, start):
        # `(start, size)` along the Dimension being streamed, FULL elsewhere
        return [(start, sync.size) if d.root is sync.dim.root else FULL
                for d in sync.dimensions]

    fetches, presents, prefetches = [], [], []
    for sync in sync_ops:
        if sync.direction is Forward:
            fc = sync.fetch.subs(sync.dim, sync.dim.symbolic_min)
            fsize = sync.function._C_get_field(FULL, sync.dim).size
            fc_cond = fc + (sync.size - 1) < fsize
            pfc = sync.fetch + 1
            pfc_cond = pfc + (sync.size - 1) < fsize
        else:
            fc = sync.fetch.subs(sync.dim, sync.dim.symbolic_max)
            fc_cond = fc >= 0
            pfc = sync.fetch - 1
            pfc_cond = pfc >= 0

        # Construct fetch IET
        init_pragmas = self._P._map_to(sync.function, transfer_mask(sync, fc))
        fetches.append(Conditional(fc_cond, List(header=init_pragmas)))

        # Construct present clauses
        present = self._P._map_present(sync.function,
                                       transfer_mask(sync, sync.fetch))
        presents.extend(as_list(present))

        # Construct prefetch IET
        ahead_pragmas = self._P._map_to_wait(sync.function,
                                             transfer_mask(sync, pfc),
                                             SharedData._field_id)
        prefetches.append(Conditional(pfc_cond, List(header=ahead_pragmas)))

    functions = filter_ordered(s.function for s in sync_ops)
    casts = [PointerCast(function) for function in functions]

    # Turn init IET into a Callable
    init_name = self.sregistry.make_name(prefix='init_device')
    init_body = List(body=casts + fetches)
    init_params = filter_sorted(functions + derive_parameters(init_body))
    func = Callable(init_name, init_body, 'void', init_params, 'static')
    pieces.funcs.append(func)

    # Perform initial fetch by the main thread
    banner = c.Comment("Initialize data stream for `%s`" % threads.name)
    pieces.init.append(
        List(header=banner, body=[Call(init_name, func.parameters), BlankLine])
    )

    # Turn prefetch IET into a threaded Callable
    thread_name = self.sregistry.make_name(prefix='prefetch_host_to_device')
    thread_body = List(header=c.Line(), body=casts + prefetches)
    tfunc, sdata = self.__make_tfunc(thread_name, thread_body, root, threads)
    pieces.funcs.append(tfunc)

    # Glue together all the IET pieces, including the activation bits
    flag = FieldFromComposite(sdata._field_flag, sdata[threads.index])
    wait = BusyWait(CondNe(flag, 1))
    present_clauses = List(header=presents)
    activation = self.__make_activate_thread(threads, sdata, sync_ops)
    iet = List(body=[BlankLine, wait, present_clauses, iet, activation])

    # Fire up the threads
    pieces.init.append(self.__make_init_threads(threads, sdata, tfunc, pieces))
    pieces.threads.append(threads)

    # Final wait before jumping back to Python land
    pieces.finalize.append(self.__make_finalize_threads(threads, sdata))

    return iet
def _create_elemental_functions(self, nodes, state):
    """
    Move :class:`Iteration` sub-trees into separate :class:`Callable`s.

    Only sub-trees rooted at a tagged, elementizable Iteration are
    considered. Each of them is rebuilt with symbolic loop bounds, turned
    into a static ``void`` Callable named ``f_<tag>``, and replaced in the
    main tree by a Call to it.

    ``state`` is accepted but not used by this method.

    Returns the transformed tree together with a dict mapping
    ``'elemental_functions'`` to the generated Callables.
    """
    noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

    functions = OrderedDict()
    mapper = {}
    for tree in retrieve_iteration_tree(nodes, mode='superset'):
        # Search an elementizable sub-tree (if any)
        tagged = filter_iterations(tree, lambda it: it.tag is not None, 'asap')
        if not tagged or not tagged[0].is_Elementizable:
            continue
        root = tagged[0]
        target = tree[tree.index(root):]

        # Map of argument values defined by loop bounds
        defined_args = {}

        # Build a new Iteration/Expression tree with free bounds
        rebuilt = []
        for iteration in target:
            dim_name = iteration.dim.name
            bounds = iteration.bounds_symbolic

            # Iteration bounds
            start = Scalar(name='%s_start' % dim_name, dtype=np.int32)
            finish = Scalar(name='%s_finish' % dim_name, dtype=np.int32)
            defined_args[start.name] = bounds[0]
            defined_args[finish.name] = bounds[1]

            # Iteration unbounded indices
            ub_scalars = [
                Scalar(name='%s_ub%d' % (dim_name, pos), dtype=np.int32)
                for pos, _ in enumerate(iteration.uindices)
            ]
            for ub_scalar, uindex in zip(ub_scalars, iteration.uindices):
                defined_args[ub_scalar.name] = uindex.start

            limits = [
                Scalar(name=start.name, dtype=np.int32),
                Scalar(name=finish.name, dtype=np.int32),
                1,
            ]
            uindices = [
                UnboundedIndex(uindex.index, iteration.dim + as_symbol(ub_scalar))
                for uindex, ub_scalar in zip(iteration.uindices, ub_scalars)
            ]
            rebuilt.append(
                iteration._rebuild(limits=limits, offsets=None, uindices=uindices)
            )

        # Construct elemental function body, and inspect it
        body = NestedTransformer(dict(zip(target, rebuilt))).visit(root)

        # Insert array casts for all non-defined tensors
        symbolics = FindSymbols('symbolics').visit(body)
        defines = [s.name for s in FindSymbols('defines').visit(body)]
        casts = [
            ArrayCast(symbol) for symbol in symbolics
            if symbol.is_Tensor and symbol.name not in defines
        ]
        body = (List(body=casts), body)

        def bind(param):
            # Pair the value passed by the caller with the callee's parameter
            if param.name in defined_args:
                return defined_args[param.name], param
            if param.is_Dimension:
                scalar = Scalar(name=param.name, dtype=param.dtype)
                return scalar, scalar
            return param, param

        # Elemental function arguments (scalars, tensors)
        call_args, params = zip(*[bind(p) for p in derive_parameters(body)])
        name = "f_%d" % root.tag

        # Produce the new Call
        mapper[root] = List(header=noinline, body=Call(name, call_args))

        # Produce the new Callable; the first one registered under a given
        # name is the one that is kept
        elemental = Callable(name, body, 'void', flatten(params), ('static', ))
        functions.setdefault(name, elemental)

    # Transform the main tree
    processed = Transformer(mapper).visit(nodes)

    return processed, {'elemental_functions': functions.values()}
def _make_sendrecv(self, f, fixed, extra=None):
    """
    Build the ``sendrecv<ndim>d`` Callable for the function ``f``.

    The generated body posts a non-blocking receive, gathers the data to be
    sent into a buffer, posts a non-blocking send, waits for both requests
    and finally scatters the received buffer into ``f``.

    Parameters
    ----------
    f
        The function whose data is exchanged.
    fixed
        Dimensions of ``f`` left out of the buffers' shape.
    extra : list, optional
        Additional arguments appended to the gather/scatter calls and to
        the Callable parameters. Defaults to an empty list.
    """
    extra = extra or []
    comm = f.grid.distributor._obj_comm
    ndim = f.ndim

    # Buffers for the outgoing (gather) and incoming (scatter) data
    buf_dims = [Dimension(name='buf_%s' % d.root)
                for d in f.dimensions if d not in fixed]
    bufg = Array(name='bufg', dimensions=buf_dims, dtype=f.dtype, scope='heap')
    bufs = Array(name='bufs', dimensions=buf_dims, dtype=f.dtype, scope='heap')

    # Offsets at which data is gathered from / scattered into `f`
    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]
    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    gather_call = Call('gather%dd' % ndim,
                       [bufg] + list(bufg.shape) + [f] + ofsg + extra)
    scatter_call = Call('scatter%dd' % ndim,
                        [bufs] + list(bufs.shape) + [f] + ofss + extra)

    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather_call)
    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter_call)

    srecv = MPIStatusObject(name='srecv')
    ssend = MPIStatusObject(name='ssend')

    rrecv = MPIRequestObject(name='rrecv')
    rsend = MPIRequestObject(name='rsend')

    # Number of items exchanged, i.e. the product of the buffer extents
    count = reduce(mul, bufs.shape, 1)
    mpitype = dtype_to_mpitype(f.dtype)
    recv_args = [bufs, count, Macro(mpitype), fromrank, Integer(13), comm, rrecv]
    recv = Call('MPI_Irecv', recv_args)
    send_args = [bufg, count, Macro(mpitype), torank, Integer(13), comm, rsend]
    send = Call('MPI_Isend', send_args)

    waitrecv = Call('MPI_Wait', [rrecv, srecv])
    waitsend = Call('MPI_Wait', [rsend, ssend])

    steps = [recv, gather, send, waitsend, waitrecv, scatter]
    iet = List(body=iet_insert_C_decls(List(body=steps)))

    parameters = [f] + list(bufs.shape) + ofsg + ofss
    parameters = parameters + [fromrank, torank, comm] + extra
    return Callable('sendrecv%dd' % ndim, iet, 'void', parameters, ('static', ))