def _create_efuncs(self, nodes, state):
    """
    Outline tagged, elementizable Iteration sub-trees into Callables.

    For each Iteration tree in ``nodes``, the first tagged Iteration is
    looked up; if it is elementizable, the sub-tree rooted there is copied
    with its loop bounds and unbounded indices replaced by free Scalars,
    wrapped into a Callable named ``f_<tag>``, and replaced in the main tree
    by a Call passing the original bounds as arguments.

    Returns the transformed tree together with a dict exposing the created
    Callables under the ``'efuncs'`` key.
    """
    noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

    efuncs = OrderedDict()
    mapper = {}
    for tree in retrieve_iteration_tree(nodes, mode='superset'):
        # Pick the first tagged Iteration; skip the tree if there is none
        # or if the one found cannot be elementized
        tagged = filter_iterations(tree, lambda it: it.tag is not None, 'asap')
        if not tagged:
            continue
        root = tagged[0]
        if not root.is_Elementizable:
            continue
        target = tree[tree.index(root):]

        # Values to be passed at the call site, keyed by parameter name
        call_values = {}

        # Rebuild every Iteration in `target` with free (symbolic) bounds
        unbounded = []
        for it in target:
            dname = it.dim.name
            bounds = it.symbolic_bounds

            # Free lower/upper bounds
            lower = Scalar(name='%sf_m' % dname, dtype=np.int32, is_const=True)
            upper = Scalar(name='%sf_M' % dname, dtype=np.int32, is_const=True)
            call_values[lower.name] = bounds[0]
            call_values[upper.name] = bounds[1]

            # One free Scalar per unbounded index
            ub_scalars = []
            for pos in range(len(it.uindices)):
                ub_scalars.append(
                    Scalar(name='%s_ub%d' % (dname, pos), dtype=np.int32))
            for ub, uidx in zip(ub_scalars, it.uindices):
                call_values[ub.name] = uidx.symbolic_min
            new_uindices = []
            for uidx, ub in zip(it.uindices, ub_scalars):
                new_uindices.append(
                    IncrDimension(uidx.parent, it.dim + as_symbol(ub), 1,
                                  uidx.name))

            unbounded.append(it._rebuild(limits=(lower, upper, 1),
                                         offsets=None,
                                         uindices=new_uindices))

        # Body of the elemental function
        substitutions = dict(zip(target, unbounded))
        efunc_body = Transformer(substitutions, nested=True).visit(root)
        symbols = FindSymbols().visit(efunc_body)

        # Prepend the array casts
        casts = [ArrayCast(s) for s in symbols if s.is_Tensor]
        efunc_body = List(body=casts + [efunc_body])

        # Add the declarations for the Arrays
        arrays = [s for s in symbols if s.is_Array]
        efunc_body = iet_insert_C_decls(efunc_body, arrays)

        # The Callable (only the first one built for a given name is kept)
        name = "f_%d" % root.tag
        params = derive_parameters(efunc_body)
        efuncs.setdefault(
            name, Callable(name, efunc_body, 'void', params, 'static'))

        # The Call: bound-derived parameters get the original bound values,
        # anything else is passed through as is
        call_args = [call_values.get(p.name, p) for p in params]
        mapper[root] = List(header=noinline, body=Call(name, call_args))

    # Swap the outlined sub-trees for their Calls
    processed = Transformer(mapper).visit(nodes)

    return processed, {'efuncs': efuncs.values()}
def _optimize_halospots(self, iet):
    """
    Optimize the HaloSpots in ``iet``.

    The pass goes through four steps:

        * HaloSpots flagged ``is_Useless`` are replaced by their body;
        * within each group returned by ``MapNodes(Iteration, HaloSpot)``,
          the hoistable part of every HaloSpot but the first is merged into
          the first one and dropped from where it was;
        * HaloSpots that are now empty are replaced by their body;
        * fully affine Iteration nests adjacent to a HaloSpot are moved
          into that HaloSpot's body.

    Returns the transformed IET and an empty dict.
    """
    # Step 1: get rid of the USELESS HaloSpots
    useless = {}
    for hs in FindNodes(HaloSpot).visit(iet):
        if hs.is_Useless:
            useless[hs] = hs.body
    iet = Transformer(useless, nested=True).visit(iet)

    # Step 2: merge what is hoistable into the leading HaloSpot of each group
    hoisted = {}
    for group in MapNodes(Iteration, HaloSpot).visit(iet).values():
        head = group[0]
        followers = group[1:]
        projected = [hs.halo_scheme.project(hs.hoistable) for hs in followers]
        hoisted[head] = head._rebuild(
            halo_scheme=head.halo_scheme.union(projected))
        for hs in followers:
            hoisted[hs] = hs._rebuild(
                halo_scheme=hs.halo_scheme.drop(hs.hoistable))
    iet = Transformer(hoisted, nested=True).visit(iet)

    # Step 3: after hoisting, a HaloSpot may require no communication at
    # all; such a HaloSpot is replaced by its own body
    #
    # <HaloSpot(u,v)>           HaloSpot(u,v)
    #   <A>                       <A>
    # <HaloSpot()>      ---->   <B>
    #   <B>
    empty = {}
    for hs in FindNodes(HaloSpot).visit(iet):
        if hs.is_empty:
            empty[hs] = hs.body
    iet = Transformer(empty, nested=True).visit(iet)

    # Step 4: pull HaloSpot-free Iteration nests into the preceding HaloSpot
    # so that more computation may be overlapped with communication
    #
    # <HaloSpot(u,v)>            HaloSpot(u,v)
    #   <A>             ---->      <A>
    # <B>              affine?     <B>
    #
    # <B> needs no halo exchange of its own, but it may consume the output
    # of <A>; if <A> gets split for computation/communication overlap, <B>
    # must be split too, which is only possible if <B> is fully affine
    absorbed = {}
    for groups in FindAdjacent((HaloSpot, Iteration)).visit(iet).values():
        for group in groups:
            anchor = None
            for node in group:
                if node.is_HaloSpot:
                    anchor = node
                    absorbed[anchor] = [anchor.body]
                    continue
                if anchor and all(it.is_Affine
                                  for it in FindNodes(Iteration).visit(node)):
                    absorbed[anchor].append(node)
                    # `None` makes the Transformer drop the original node
                    absorbed[node] = None
                else:
                    # A non-affine nest breaks the chain
                    anchor = None
    mapper = {}
    for node, bodies in absorbed.items():
        mapper[node] = node._rebuild(body=List(body=bodies)) if bodies else bodies
    iet = Transformer(mapper).visit(iet)

    return iet, {}
def _avoid_denormals(self, iet):
    """
    Wrap ``iet`` into a List whose header sets the flush-to-zero and
    denormals-are-zero modes.

    Returns the new IET and a dict listing, under ``'includes'``, the two
    headers returned alongside the emitted statements.
    """
    statements = (
        '_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)',
        '_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)',
    )
    header = [cgen.Comment('Flush denormal numbers to zero in hardware')]
    header.extend(cgen.Statement(s) for s in statements)

    wrapped = List(header=header, body=iet)

    return wrapped, {'includes': ('xmmintrin.h', 'pmmintrin.h')}
def _loop_blocking(self, iet):
    """
    Apply loop blocking to PARALLEL Iteration trees.

    Each blockable nest is rewritten as a blocked nest, moved into its own
    Callable (``bf<n>``), and replaced by a sequence of Calls covering the
    main blocked region and the remainder regions.

    Behaviour is driven by two entries in ``self.params``: ``blockinner``
    (also block the innermost PARALLEL Iteration) and ``blockalways``
    (bypass the heuristic that skips some nests).

    Returns the transformed IET and a dict with the created BlockDimensions
    (``'dimensions'``), the Callables (``'efuncs'``) and the block steps
    (``'args'``).
    """
    blockinner = bool(self.params.get('blockinner'))
    blockalways = bool(self.params.get('blockalways'))

    # Fold first, so that blocking spans as many Iterations as possible
    iet = fold_blockable_tree(iet, blockinner)

    mapper = {}
    efuncs = []
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Collect the PARALLEL Iterations; unless `blockinner`, the last
        # one is left alone
        iterations = filter_iterations(tree, lambda it: it.is_Parallel)
        if not blockinner:
            iterations = iterations[:-1]
        if len(iterations) <= 1:
            continue
        root = iterations[0]

        # Heuristic: a blockable tree that is not embedded in a sequential
        # loop (e.g., a timestepping loop) is not blocked, to limit code
        # size, JIT compilation time, and loss of readability
        embedded = tree.root.is_Sequential or iet.is_Callable
        if not embedded and not blockalways:
            continue

        # `mapper` is only updated at the end of the iteration, so this
        # number identifies the nest in all the names generated below
        nest_id = len(mapper)

        # One Iteration over blocks and one within a block, per dimension
        interb = []
        intrab = []
        for it in iterations:
            bd = BlockDimension(it.dim,
                                name="%s%d_blk" % (it.dim.name, nest_id))
            block_dims.append(bd)

            interb.append(
                Iteration([], bd, bd.symbolic_max, properties=PARALLEL))
            intrab.append(
                it._rebuild([], limits=(bd, bd + bd.step - 1, 1),
                            offsets=(0, 0)))

        # Assemble the blocked nest
        pieces = interb + intrab + [iterations[-1].nodes]
        blocked = unfold_blocked_tree(compose_nodes(pieces))

        # Move it into a separate Callable
        dynamic_parameters = flatten(
            (bi.dim, bi.dim.symbolic_size) for bi in interb)
        efunc = make_efunc("bf%d" % nest_id, blocked, dynamic_parameters)
        efuncs.append(efunc)

        # Per dimension: (min, max, step) of the main region followed by
        # (min, max, step) of the remainder region
        ranges = []
        for it, bi in zip(iterations, interb):
            step = bi.dim.step
            maxb = it.symbolic_max - (it.symbolic_size % step)
            main = (it.symbolic_min, maxb, step)
            leftover = (maxb + 1, it.symbolic_max, it.symbolic_max - maxb)
            ranges.append((main, leftover))

        # One Call per combination of main/remainder regions
        calls = []
        for combo in product(*ranges):
            dynamic_args = {}
            for bi, (lo, hi, step) in zip(interb, combo):
                dynamic_args[bi.dim] = (lo, hi)
                dynamic_args[bi.dim.step] = (step,)
            calls.append(List(body=efunc.make_call(dynamic_args)))

        mapper[root] = List(body=calls)

    iet = Transformer(mapper).visit(iet)

    summary = {
        'dimensions': block_dims,
        'efuncs': efuncs,
        'args': [bd.step for bd in block_dims],
    }
    return iet, summary
def make_blocking(self, iet):
    """
    Apply hierarchical loop blocking to PARALLEL, affine Iteration trees.

    Each blockable nest is rewritten using ``self.nlevels`` levels of
    blocking, moved into its own Callable (``bf<n>``), and replaced by a
    sequence of Calls covering the main blocked region and the remainder
    regions. ``self.nblocked`` is incremented for every blocked nest so that
    generated names stay unique.

    Returns the transformed IET and a dict with the created BlockDimensions
    (``'dimensions'``), the Callables (``'efuncs'``) and the block steps
    (``'args'``).
    """
    # Fold first, so that blocking spans as many Iterations as possible
    iet = fold_blockable_tree(iet, self.blockinner)

    mapper = {}
    efuncs = []
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Collect the candidates; unless `blockinner`, the last one is
        # left alone
        iterations = filter_iterations(
            tree, lambda it: it.is_Parallel and it.is_Affine)
        if not self.blockinner:
            iterations = iterations[:-1]
        if len(iterations) <= 1:
            continue
        root = iterations[0]

        if not self.blockalways:
            # Heuristics to skip nests that are deemed computationally
            # cheap, which helps with code size/readability, JIT time, and
            # auto-tuning time
            if not (tree.root.is_Sequential or iet.is_Callable):
                # E.g., not inside a time-stepping Iteration
                continue
            if any(it.dim.is_Sub and it.dim.local for it in tree):
                # An Iteration over a local SubDimension suggests that the
                # cost of this nest is negligible compared to that of the
                # "core" nest, which only uses non-local (Sub)Dimensions
                continue

        if not IsPerfectIteration().visit(root):
            # Non-perfect nests are not supported
            continue

        # Build the Iterations, level by level
        level_0 = []  # Outermost level of blocking
        level_i = [[] for _ in range(1, self.nlevels)]  # Inner levels
        intra = []  # Within the smallest block
        for it in iterations:
            # The trailing '%d' is left in place to be filled in with the
            # blocking level
            template = "%s%d_blk%s" % (it.dim.name, self.nblocked, '%d')
            if it.is_Affine:
                properties = (PARALLEL, AFFINE)
            else:
                properties = (PARALLEL,)

            # Iteration across the `level_0` blocks
            bd = BlockDimension(it.dim, name=template % 0)
            level_0.append(
                Iteration([], bd, bd.symbolic_max, properties=properties))

            # Iterations across the inner levels, each nested in the
            # BlockDimension of the level above
            for depth, bucket in enumerate(level_i, 1):
                inner = BlockDimension(bd, name=template % depth)
                bucket.append(
                    Iteration([], inner,
                              limits=(bd, bd + bd.step - 1, inner.step),
                              properties=properties))
                bd = inner

            # Iteration within the smallest block
            intra.append(
                it._rebuild([], limits=(bd, bd + bd.step - 1, 1),
                            offsets=(0, 0)))
        level_i = flatten(level_i)

        # Record all the BlockDimensions just created
        block_dims.extend(b.dim for b in level_0 + level_i)

        # Assemble the blocked nest
        pieces = level_0 + level_i + intra + [iterations[-1].nodes]
        blocked = unfold_blocked_tree(compose_nodes(pieces))

        # Move it into a separate Callable
        dynamic_parameters = flatten((l0.dim, l0.step) for l0 in level_0)
        dynamic_parameters.extend([li.step for li in level_i])
        efunc = make_efunc("bf%d" % self.nblocked, blocked,
                           dynamic_parameters)
        efuncs.append(efunc)

        # Per dimension: (min, max, step) of the main region followed by
        # (min, max, step) of the remainder region
        ranges = []
        for it, l0 in zip(iterations, level_0):
            maxb = it.symbolic_max - (it.symbolic_size % l0.step)
            main = (it.symbolic_min, maxb, l0.step)
            leftover = (maxb + 1, it.symbolic_max, it.symbolic_max - maxb)
            ranges.append((main, leftover))

        # One Call per combination of main/remainder regions
        calls = []
        for combo in product(*ranges):
            dynamic_args = {}
            for l0, (lo, hi, step) in zip(level_0, combo):
                dynamic_args[l0.dim] = (lo, hi)
                dynamic_args[l0.step] = (step,)
                # Inner levels sharing the same root Dimension keep their
                # own step in the main region, and take the remainder step
                # otherwise
                for li in level_i:
                    if li.dim.root is not l0.dim.root:
                        continue
                    if step is l0.step:
                        dynamic_args[li.step] = (li.step,)
                    else:
                        dynamic_args[li.step] = (step,)
            calls.append(List(body=efunc.make_call(dynamic_args)))

        mapper[root] = List(body=calls)

        # The next blocked nest gets different variable/function names
        self.nblocked += 1

    iet = Transformer(mapper).visit(iet)

    # Unfold whatever was folded but did not get blocked in the end
    iet = unfold_blocked_tree(iet)

    summary = {
        'dimensions': block_dims,
        'efuncs': efuncs,
        'args': [bd.step for bd in block_dims],
    }
    return iet, summary
def _create_elemental_functions(self, nodes, state):
    """
    Outline tagged, elementizable Iteration sub-trees into Callables.

    For each Iteration tree in ``nodes``, the first tagged Iteration is
    looked up; if it is elementizable, the sub-tree rooted there is copied
    with its loop bounds and unbounded indices replaced by free Scalars,
    wrapped into a Callable named ``f_<tag>``, and replaced in the main tree
    by a Call.

    Returns the transformed tree together with a dict exposing the created
    Callables under the ``'elemental_functions'`` key.
    """
    noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

    functions = OrderedDict()
    mapper = {}
    for tree in retrieve_iteration_tree(nodes, mode='superset'):
        # Pick the first tagged Iteration; skip the tree if there is none
        # or if the one found cannot be elementized
        tagged = filter_iterations(tree, lambda it: it.tag is not None, 'asap')
        if not tagged:
            continue
        root = tagged[0]
        if not root.is_Elementizable:
            continue
        target = tree[tree.index(root):]

        # Values to be passed at the call site, keyed by parameter name
        call_values = {}

        # Rebuild every Iteration in `target` with free (symbolic) bounds
        unbounded = []
        for it in target:
            dname = it.dim.name
            bounds = it.bounds_symbolic

            # Free lower/upper bounds
            start = Scalar(name='%s_start' % dname, dtype=np.int32)
            finish = Scalar(name='%s_finish' % dname, dtype=np.int32)
            call_values[start.name] = bounds[0]
            call_values[finish.name] = bounds[1]

            # One free Scalar per unbounded index
            ub_scalars = []
            for pos in range(len(it.uindices)):
                ub_scalars.append(
                    Scalar(name='%s_ub%d' % (dname, pos), dtype=np.int32))
            for ub, uidx in zip(ub_scalars, it.uindices):
                call_values[ub.name] = uidx.start

            limits = [
                Scalar(name=start.name, dtype=np.int32),
                Scalar(name=finish.name, dtype=np.int32),
                1,
            ]
            new_uindices = []
            for uidx, ub in zip(it.uindices, ub_scalars):
                new_uindices.append(
                    UnboundedIndex(uidx.index, it.dim + as_symbol(ub)))

            unbounded.append(
                it._rebuild(limits=limits, offsets=None,
                            uindices=new_uindices))

        # Body of the elemental function
        substitutions = dict(zip(target, unbounded))
        efunc_body = NestedTransformer(substitutions).visit(root)

        # Cast every tensor that the body uses but does not define
        used = FindSymbols('symbolics').visit(efunc_body)
        defined = [s.name for s in FindSymbols('defines').visit(efunc_body)]
        casts = []
        for f in used:
            if f.is_Tensor and f.name not in defined:
                casts.append(ArrayCast(f))
        efunc_body = (List(body=casts), efunc_body)

        # Pair up each parameter with the value passed at the call site
        pairs = []
        for p in derive_parameters(efunc_body):
            if p.name in call_values:
                pairs.append((call_values[p.name], p))
            elif p.is_Dimension:
                # Dimensions are passed as plain Scalars of the same name
                scalar = Scalar(name=p.name, dtype=p.dtype)
                pairs.append((scalar, scalar))
            else:
                pairs.append((p, p))
        call_args, params = zip(*pairs)

        name = "f_%d" % root.tag

        # The Call replacing the sub-tree
        mapper[root] = List(header=noinline, body=Call(name, call_args))

        # The Callable (only the first one built for a given name is kept)
        functions.setdefault(
            name,
            Callable(name, efunc_body, 'void', flatten(params), ('static',)))

    # Swap the outlined sub-trees for their Calls
    processed = Transformer(mapper).visit(nodes)

    return processed, {'elemental_functions': functions.values()}
def optimize_halospots(iet):
    """
    Optimize the HaloSpots in ``iet``.

    The pass goes through four steps:

        * the ``useless`` part of each HaloSpot's halo scheme is dropped;
        * within each Iteration tree, the ``hoistable`` part of every
          HaloSpot but the first is merged into the first one;
        * a hoistable HaloSpot is lifted above the enclosing Iteration when
          that Iteration's root Dimension is among the HaloSpot dimensions;
        * fully affine Iteration nests adjacent to a HaloSpot are moved
          into that HaloSpot's body.

    Returns the transformed IET and an empty dict.
    """
    # Step 1: drop the `useless` exchanges
    pruned = {}
    for hs in FindNodes(HaloSpot).visit(iet):
        pruned[hs] = hs._rebuild(halo_scheme=hs.halo_scheme.drop(hs.useless))
    iet = Transformer(pruned, nested=True).visit(iet)

    # Step 2: merge the `hoistable` exchanges into the first HaloSpot of
    # each tree, thus anticipating communications
    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        halo_spots = FindNodes(HaloSpot).visit(tree.root)
        if not halo_spots:
            continue
        head = halo_spots[0]
        if head in mapper:
            continue
        followers = halo_spots[1:]

        schemes = [head.halo_scheme]
        for hs in followers:
            schemes.append(hs.halo_scheme.project(hs.hoistable))
        try:
            mapper[head] = head._rebuild(halo_scheme=HaloScheme.union(schemes))
        except ValueError:
            # Non-matching `loc_indices`, the HaloSpots cannot be merged
            perf_adv("Found hoistable HaloSpots with disjoint loc_indices, "
                     "skipping optimization")
            continue

        for hs in followers:
            leftover = hs.halo_scheme.drop(hs.hoistable)
            if leftover.is_void:
                # Nothing left to exchange, the HaloSpot itself may go
                mapper[hs] = hs.body
            else:
                mapper[hs] = hs._rebuild(halo_scheme=leftover)
    iet = Transformer(mapper, nested=True).visit(iet)

    # Step 3: have the halo exchanges performed *before* the first
    # distributed Dimension, again to anticipate communications
    #
    # <Iteration x>                    <HaloSpot(u)>, in y
    #   <HaloSpot(u)>, in y    ---->     <Iteration x>
    #     <Iteration y>                    <Iteration y>
    mapper = {}
    for it, halo_spots in MapNodes(Iteration, HaloSpot).visit(iet).items():
        hoistable = [hs for hs in halo_spots if hs.hoistable]
        if not hoistable:
            continue
        if len(hoistable) > 1:
            # This should never happen, but there is no formal proof yet
            perf_adv(
                "Found multiple hoistable HaloSpots, skipping optimization")
            continue
        hs = hoistable.pop()
        if hs in mapper:
            continue
        if i_root_in_dimensions(it, hs):
            leftover = hs.halo_scheme.drop(hs.hoistable)
            if leftover.is_void:
                mapper[hs] = hs.body
            else:
                mapper[hs] = hs._rebuild(halo_scheme=leftover)

            lifted = hs.halo_scheme.project(hs.hoistable)
            mapper[it] = hs._rebuild(halo_scheme=lifted, body=it._rebuild())
    iet = Transformer(mapper, nested=True).visit(iet)

    # Step 4: pull HaloSpot-free Iteration nests into the preceding HaloSpot
    # so that more computation may be overlapped with communication
    #
    # <HaloSpot(u,v)>            HaloSpot(u,v)
    #   <A>             ---->      <A>
    # <B>              affine?     <B>
    #
    # <B> needs no halo exchange of its own, but it may consume the output
    # of <A>; if <A> gets split for computation/communication overlap, <B>
    # must be split too, which is only possible if <B> is fully affine
    absorbed = {}
    for groups in FindAdjacent((HaloSpot, Iteration)).visit(iet).values():
        for group in groups:
            anchor = None
            for node in group:
                if node.is_HaloSpot:
                    anchor = node
                    absorbed[anchor] = [anchor.body]
                    continue
                if anchor and all(j.is_Affine
                                  for j in FindNodes(Iteration).visit(node)):
                    absorbed[anchor].append(node)
                    # `None` makes the Transformer drop the original node
                    absorbed[node] = None
                else:
                    # A non-affine nest breaks the chain
                    anchor = None
    mapper = {}
    for node, bodies in absorbed.items():
        mapper[node] = node._rebuild(body=List(body=bodies)) if bodies else bodies
    iet = Transformer(mapper).visit(iet)

    return iet, {}


def i_root_in_dimensions(iteration, halo_spot):
    """True if the root Dimension of ``iteration`` is one of ``halo_spot``'s."""
    return iteration.dim.root in halo_spot.dimensions
def _loop_fission(self, nodes, state):
    """
    Apply loop fission to innermost Iteration objects.

    An innermost Iteration is split into several Iterations, each holding
    at most ``self.thresholds['min_fission']`` expressions. The pass is not
    applied to an Iteration if:

        * it is the only Iteration in its tree;
        * its body has fewer than ``self.thresholds['max_fission']``
          expressions;
        * its body contains anything other than expressions;
        * the expressions reference no functions at all;
        * the referenced functions do not all share the same last index.

    Returns the transformed tree and an empty dict.
    """
    mapper = {}
    for tree in retrieve_iteration_tree(nodes):
        if len(tree) <= 1:
            # Heuristically avoided
            continue

        candidate = tree[-1]
        expressions = [e for e in candidate.nodes if e.is_Expression]

        if len(expressions) < self.thresholds['max_fission']:
            # Heuristically avoided
            continue
        if len(expressions) != len(candidate.nodes):
            # Dangerous for correctness
            continue

        # `set().union(...)` rather than `set.union(*sets)`: the latter
        # raises a TypeError when there are no expressions at all, which
        # would prevent the guard right below from ever being reached
        functions = list(set().union(*(e.functions for e in expressions)))
        wrapped = [e.expr for e in expressions]

        if not functions or not wrapped:
            # Heuristically avoided
            continue

        # Promote temporaries from scalar to tensors. The reference
        # function is an arbitrary one (set iteration order); all functions
        # are then required to agree with it on the last index
        handle = functions[0]
        dim = handle.indices[-1]
        size = handle.shape[-1]
        if any(dim != f.indices[-1] for f in functions):
            # Dangerous for correctness
            continue

        wrapped = promote_scalar_expressions(wrapped, (size,), (dim,), True)
        assert len(wrapped) == len(expressions)
        rebuilt = [Expression(s, e.dtype) for s, e in zip(wrapped, expressions)]

        # Group statements
        # TODO: Need a heuristic here to maximize reuse
        args_frozen = candidate.args_frozen
        properties = as_tuple(args_frozen['properties']) + (ELEMENTAL,)
        args_frozen['properties'] = properties
        n = self.thresholds['min_fission']
        fissioned = [Iteration(g, **args_frozen) for g in grouper(rebuilt, n)]

        mapper[candidate] = List(body=fissioned)

    processed = Transformer(mapper).visit(nodes)

    return processed, {}
def make(self, hs, key):
    """
    Construct Callables and Calls implementing distributed-memory halo
    exchange for the HaloSpot ``hs``.

    For each ``(ndim, halo scheme entry)`` pair not seen before, the
    following Callables are created and recorded in ``self._efuncs``:

        * ``haloupdate``, which triggers the halo exchange;
        * ``sendrecv``, registered as depending on ``haloupdate``;
        * ``gather`` and ``scatter``, the two copy routines (the latter
          built with ``swap=True``);
        * ``halowait`` and ``wait``, only if the ``_make_halowait`` and
          ``_make_wait`` hooks return something other than None.

    A Callable for the computation and one for the remainder may also be
    created, depending on what ``_make_compute`` and ``_make_remainder``
    return.

    Returns a List with the Calls to the halo updates, the computation, the
    halo waits (if any) and the remainder (if any), in this order.
    """
    # Sanity check
    assert all(f.is_Function and f.grid is not None for f in hs.fmapper)

    # Callable for compute over the CORE region
    compute = self._make_compute(hs, key)
    if compute is not None:
        self._efuncs[compute] = [None]

    # Callables for send/recv/wait
    for f, hse in hs.fmapper.items():
        msgkey = '%d_%d' % (key, len(self.msgs))
        candidate = self._make_msg(f, hse, key=msgkey)
        # An already registered message for `(f, hse)` takes precedence
        msg = self._msgs.setdefault((f, hse), candidate)

        cache_key = (f.ndim, hse)
        if cache_key in self._cache:
            continue

        # The Callables are built on a stand-in Function named 'a'
        df = f.__class__.__base__(name='a', grid=f.grid,
                                  shape=f.shape_global,
                                  dimensions=f.dimensions)

        haloupdate = self._make_haloupdate(df, hse, key, msg=msg)
        sendrecv = self._make_sendrecv(df, hse, key, msg=msg)
        gather = self._make_copy(df, hse, key)
        self._efuncs[haloupdate] = None
        self._efuncs[sendrecv] = [haloupdate.name]
        self._efuncs[gather] = [sendrecv.name]

        halowait = self._make_halowait(df, hse, key, msg=msg)
        wait = self._make_wait(df, hse, key, msg=msg)
        scatter = self._make_copy(df, hse, key, swap=True)
        if halowait is not None:
            self._efuncs[halowait] = None
            self._efuncs[wait] = [halowait.name]
            self._efuncs[scatter] = [wait.name]
        else:
            # No wait routines at all, the scatter hangs off `sendrecv`
            assert wait is None
            self._efuncs[scatter] = [sendrecv.name]

        self._cache[cache_key] = (haloupdate, halowait)

    # Callable for compute over the OWNED region
    callcompute = self._call_compute(hs, compute)
    remainder = self._make_remainder(callcompute, hs, key)
    if remainder is not None:
        self._efuncs[remainder] = None
        self._efuncs.setdefault(callcompute, []).append(remainder.name)

    # Build up the HaloSpot body with explicit Calls to the Callables
    # above. Each halo update is placed in front of the ones already
    # collected, while the halo waits follow the computation in order
    updates = []
    waits = []
    for f, hse in hs.fmapper.items():
        msg = self._msgs[(f, hse)]
        haloupdate, halowait = self._cache[(f.ndim, hse)]
        updates.append(self._call_haloupdate(haloupdate.name, f, hse, msg))
        if halowait is not None:
            waits.append(self._call_halowait(halowait.name, f, hse, msg))
    body = updates[::-1] + [callcompute] + waits
    if remainder is not None:
        body.append(self._call_remainder(remainder))

    return List(body=body)
def make(self, hs):
    """
    Construct Callables and Calls implementing distributed-memory halo
    exchange for the HaloSpot ``hs``.

    Messages and halo-exchange routines are created on demand and cached
    in ``self._msgs`` and ``self._cache_halo``; the Callables for poking,
    computing and handling the remainder are appended to ``self._efuncs``
    whenever the corresponding ``_make_*`` hook returns a Callable.

    Returns a List made of a HaloUpdateList, the Call to the computation
    (if any), a HaloWaitList, and the Call to the remainder (if any).
    """
    # Sanity check
    assert all(f.is_Function and f.grid is not None for f in hs.fmapper)

    for f, hse in hs.fmapper.items():
        pair = (f, hse)

        # The MPIMsg is the data structure propagated across the various
        # halo exchange routines; build one unless already available
        if pair in self._msgs:
            msg = self._msgs[pair]
        else:
            key = self._gen_msgkey()
            msg = self._msgs.setdefault(pair, self._make_msg(f, hse, key))

        # Callables for send/recv/wait
        if (f.ndim, hse) not in self._cache_halo:
            self._make_all(f, hse, msg)

    msgs = [self._msgs[(f, hse)] for f, hse in hs.fmapper.items()]

    # Callable for poking the asynchronous progress engine
    key = self._gen_compkey()
    poke = self._make_poke(hs, key, msgs)
    if isinstance(poke, Callable):
        self._efuncs.append(poke)

    # Callable for compute over the CORE region
    callpoke = self._call_poke(poke)
    compute = self._make_compute(hs, key, msgs, callpoke)
    if isinstance(compute, Callable):
        self._efuncs.append(compute)

    # Callable for compute over the OWNED region; a region already
    # registered for `hs` takes precedence over the one just built
    region = self._regions.setdefault(hs, self._make_region(hs, key))
    callcompute = self._call_compute(hs, compute, msgs)
    remainder = self._make_remainder(hs, key, callcompute, region)
    if isinstance(remainder, Callable):
        self._efuncs.append(remainder)

    # Collect the Calls to the halo update/wait routines
    haloupdates = []
    halowaits = []
    for f, hse in hs.fmapper.items():
        msg = self._msgs[(f, hse)]
        haloupdate, halowait = self._cache_halo[(f.ndim, hse)]
        haloupdates.append(self._call_haloupdate(haloupdate.name, f, hse, msg))
        if halowait is not None:
            halowaits.append(self._call_halowait(halowait.name, f, hse, msg))

    # Build up the HaloSpot body
    body = [HaloUpdateList(body=haloupdates)]
    if callcompute is not None:
        body.append(callcompute)
    body.append(HaloWaitList(body=halowaits))
    if remainder is not None:
        body.append(self._call_remainder(remainder))

    return List(body=body)
def _minimize_remainders(self, nodes, state):
    """
    Reshape temporary tensors and adjust loop trip counts to prevent as many
    compiler-generated remainder loops as possible.

    Only Iteration trees with exactly one vectorizable Iteration, which must
    also be tagged, are considered. For these:

        * the last dimension of all Arrays written within the vectorizable
          Iteration is padded, provided that the same padding suits them
          all (note that the Arrays are updated in place);
        * the endpoint of the vectorizable Iteration, if a plain symbol, is
          replaced with a new symbol initialized, in the header of the
          rebuilt tree, to either the padded or the original endpoint.

    Returns the transformed tree and an empty dict.
    """
    mapper = {}
    for tree in retrieve_iteration_tree(nodes):
        vector_iterations = [i for i in tree if i.is_Vectorizable]
        if not vector_iterations or len(vector_iterations) > 1:
            continue
        root = vector_iterations[0]
        if root.tag is None:
            continue

        # Padding
        writes = [i for i in FindSymbols('symbolics-writes').visit(root)
                  if i.is_Array]
        padding = []
        for i in writes:
            try:
                simd_items = get_simd_items(i.dtype)
            except KeyError:
                # Fallback to the AVX512 register size (presumably expressed
                # in bytes -- TODO confirm). Floor division is required:
                # true division would yield a float, which would then leak
                # into the padded shape
                simd_items = simdinfo['avx512f'] // np.dtype(i.dtype).itemsize
            padding.append(simd_items - i.shape[-1] % simd_items)
        if len(set(padding)) == 1:
            padding = padding[0]
            for i in writes:
                i.update(shape=i.shape[:-1] + (i.shape[-1] + padding,))
        else:
            # Padding must be uniform -- not the case, so giving up
            continue

        # Dynamic trip count adjustment
        endpoint = root.end_symbolic
        if not endpoint.is_Symbol:
            continue
        condition = []
        externals = set(i.symbolic_shape[-1] for i in FindSymbols().visit(root))
        for i in root.uindices:
            for j in externals:
                condition.append(root.end_symbolic + padding < j)
        condition = ' || '.join(ccode(i) for i in condition)
        endpoint_padded = endpoint.func(name='_%s' % endpoint.name)
        # Pick the padded endpoint if `condition` holds, the original one
        # otherwise
        init = cgen.Initializer(
            cgen.Value("const int", endpoint_padded),
            cgen.Line('(%s) ? %s : %s' % (condition,
                                          ccode(endpoint + padding),
                                          endpoint)))

        # Update the Iteration bound
        limits = list(root.limits)
        limits[1] = endpoint_padded.func(endpoint_padded.name)
        rebuilt = list(tree)
        rebuilt[rebuilt.index(root)] = root._rebuild(limits=limits)

        mapper[tree[0]] = List(header=init, body=compose_nodes(rebuilt))

    processed = Transformer(mapper).visit(nodes)

    return processed, {}
def _make_fetchwaitprefetch(self, iet, sync_ops, pieces, root):
    """
    Wrap ``iet`` with the logic for fetching, waiting for, and prefetching
    the data described by ``sync_ops``.

    Two Callables are created and appended to ``pieces.funcs``: one for the
    initial fetch, called during initialization, and a threaded one for the
    prefetch. ``pieces.init``, ``pieces.threads`` and ``pieces.finalize``
    are updated as well.

    Returns a List made of a busy-wait on the thread's flag field, the
    present clauses, the original ``iet``, and the thread activation.
    """
    threads = self.__make_threads()

    fetches = []
    prefetches = []
    presents = []
    for s in sync_ops:
        # What to fetch initially (`fc`) and what to prefetch (`pfc`), each
        # with the condition under which the transfer may take place
        if s.direction is Forward:
            fc = s.fetch.subs(s.dim, s.dim.symbolic_min)
            fsize = s.function._C_get_field(FULL, s.dim).size
            fc_cond = fc + (s.size - 1) < fsize
            pfc = s.fetch + 1
            pfc_cond = pfc + (s.size - 1) < fsize
        else:
            fc = s.fetch.subs(s.dim, s.dim.symbolic_max)
            fc_cond = fc >= 0
            pfc = s.fetch - 1
            pfc_cond = pfc >= 0

        def make_imask(start, s=s):
            # `(start, size)` along the Dimensions sharing the root of
            # `s.dim`, FULL along all the others
            return [(start, s.size) if d.root is s.dim.root else FULL
                    for d in s.dimensions]

        # The fetch IET
        fetch = List(header=self._P._map_to(s.function, make_imask(fc)))
        fetches.append(Conditional(fc_cond, fetch))

        # The present clauses
        present = self._P._map_present(s.function, make_imask(s.fetch))
        presents.extend(as_list(present))

        # The prefetch IET
        prefetch = List(header=self._P._map_to_wait(
            s.function, make_imask(pfc), SharedData._field_id))
        prefetches.append(Conditional(pfc_cond, prefetch))

    functions = filter_ordered(s.function for s in sync_ops)
    casts = [PointerCast(f) for f in functions]

    # The initial fetch goes into a Callable
    name = self.sregistry.make_name(prefix='init_device')
    body = List(body=casts + fetches)
    parameters = filter_sorted(functions + derive_parameters(body))
    func = Callable(name, body, 'void', parameters, 'static')
    pieces.funcs.append(func)

    # The main thread performs the initial fetch
    comment = c.Comment("Initialize data stream for `%s`" % threads.name)
    pieces.init.append(
        List(header=comment, body=[Call(name, func.parameters), BlankLine]))

    # The prefetch goes into a threaded Callable
    name = self.sregistry.make_name(prefix='prefetch_host_to_device')
    body = List(header=c.Line(), body=casts + prefetches)
    tfunc, sdata = self.__make_tfunc(name, body, root, threads)
    pieces.funcs.append(tfunc)

    # Put all the IET pieces together, including the activation bits
    flag = FieldFromComposite(sdata._field_flag, sdata[threads.index])
    iet = List(body=[
        BlankLine,
        BusyWait(CondNe(flag, 1)),
        List(header=presents),
        iet,
        self.__make_activate_thread(threads, sdata, sync_ops)
    ])

    # Fire up the threads
    pieces.init.append(self.__make_init_threads(threads, sdata, tfunc, pieces))
    pieces.threads.append(threads)

    # Final wait before jumping back to Python land
    pieces.finalize.append(self.__make_finalize_threads(threads, sdata))

    return iet
def _make_fetchprefetch(self, iet, sync_ops, pieces, root):
    """
    Wrap ``iet`` with the logic for fetching and prefetching the data
    described by ``sync_ops``.

    A Callable for the initial fetch is created and appended to
    ``pieces.funcs``, along with the functions of the thread context built
    for the prefetch. ``pieces.init``, ``pieces.finalize`` and
    ``pieces.objs`` are updated as well.

    Returns a List made of a busy-wait on the thread's flag field, the
    present clauses, the original ``iet``, and the thread activation.
    """
    fid = SharedData._field_id

    fetches = []
    prefetches = []
    presents = []
    for s in sync_ops:
        f = s.function
        dimensions = s.dimensions
        fc = s.fetch
        ifc = s.ifetch
        pfc = s.pfetch
        fcond = s.fcond
        pcond = s.pcond

        def make_imask(start, s=s, dimensions=dimensions):
            # `(start, size)` along the Dimensions sharing the root of
            # `s.dim`, FULL along all the others
            return [(start, s.size) if d.root is s.dim.root else FULL
                    for d in dimensions]

        # The init IET
        fetch = PragmaTransfer(self.lang._map_to, f, imask=make_imask(ifc))
        fetches.append(Conditional(fcond, fetch))

        # The present clauses
        presents.append(
            PragmaTransfer(self.lang._map_present, f, imask=make_imask(fc)))

        # The prefetch IET
        prefetch = PragmaTransfer(self.lang._map_to_wait, f,
                                  imask=make_imask(pfc), queueid=fid)
        prefetches.append(Conditional(pcond, prefetch))

    # The init IET goes into a Callable
    functions = filter_ordered(s.function for s in sync_ops)
    name = self.sregistry.make_name(prefix='init_device')
    body = List(body=fetches)
    parameters = filter_sorted(functions + derive_parameters(body))
    func = Callable(name, body, 'void', parameters, 'static')
    pieces.funcs.append(func)

    # The main thread performs the initial fetch
    pieces.init.append(
        List(header=c.Comment("Initialize data stream"),
             body=[Call(name, parameters), BlankLine]))

    # The prefetch IET goes into a ThreadFunction
    name = self.sregistry.make_name(prefix='prefetch_host_to_device')
    body = List(header=c.Line(), body=prefetches)
    tctx = make_thread_ctx(name, body, root, None, sync_ops, self.sregistry)
    pieces.funcs.extend(tctx.funcs)

    # Put all the IET pieces together, including the activation logic
    sdata = tctx.sdata
    threads = tctx.threads
    flag = FieldFromComposite(sdata._field_flag, sdata[threads.index])
    head = [BlankLine, BusyWait(CondNe(flag, 1))]
    tail = [iet, tctx.activate]
    iet = List(body=head + presents + tail)

    # Fire up the threads
    pieces.init.append(tctx.init)

    # Final wait before jumping back to Python land
    pieces.finalize.append(tctx.finalize)

    # Keep track of created objects
    pieces.objs.add(sync_ops, sdata, threads)

    return iet
def _loop_blocking(self, iet):
    """
    Apply loop blocking to PARALLEL Iteration trees.

    Each blockable nest is rewritten as a blocked nest and moved into a
    Callable (``bf<n>``). The Calls to it, covering the main blocked region
    and the remainder regions, are in turn wrapped into a second Callable
    (``f<n>``), a Call to which replaces the original nest.

    Behaviour is driven by two entries in ``self.params``: ``blockinner``
    (also block the vectorizable PARALLEL Iterations) and ``blockalways``
    (bypass the heuristic that skips some nests).

    Returns the transformed IET and a dict with the created BlockDimensions
    (``'dimensions'``) and the Callables (``'efuncs'``), the latter mapped
    to the names of the Callables they are registered against.
    """
    blockinner = bool(self.params.get('blockinner'))
    blockalways = bool(self.params.get('blockalways'))
    noinline = self._compiler_decoration('noinline', cgen.Comment('noinline?'))

    # Fold first, so that blocking spans as many Iterations as possible
    iet = fold_blockable_tree(iet, blockinner)

    mapper = {}
    efuncs = OrderedDict()
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Collect the PARALLEL Iterations; unless `blockinner`, the
        # vectorizable ones are left alone
        candidates = [it for it in tree if it.is_Parallel]
        iterations = candidates
        if not blockinner:
            iterations = [it for it in candidates if not it.is_Vectorizable]
        if len(iterations) <= 1:
            continue
        root = iterations[0]
        if not IsPerfectIteration().visit(root):
            # Illegal/unsupported
            continue
        if not (tree.root.is_Sequential or blockalways):
            # Heuristic: a blockable tree that is not embedded in a
            # sequential loop (e.g., a timestepping loop) is not blocked,
            # to limit code size, JIT compilation time, and loss of
            # readability
            continue

        # `mapper` is only updated at the end of the iteration, so this
        # number identifies the nest in all the names generated below
        nest_id = len(mapper)

        # One Iteration over blocks and one within a block, per dimension
        interb = []
        intrab = []
        for it in iterations:
            bd = BlockDimension(it.dim,
                                name="%s%d_block" % (it.dim.name, nest_id))
            interb.append(Iteration([], bd, bd.symbolic_max,
                                    offsets=it.offsets, properties=PARALLEL))
            intrab.append(it._rebuild([], limits=(bd, bd + bd.step - 1, 1),
                                      offsets=(0, 0)))
            # Keep track of the new BlockDimension
            block_dims.append(bd)

        # Assemble the blocked nest
        pieces = interb + intrab + [iterations[-1].nodes]
        blocked = unfold_blocked_tree(compose_nodes(pieces))

        # Move it into a separate Callable
        dynamic_parameters = flatten(
            (bi.dim, bi.dim.symbolic_size) for bi in interb)
        efunc0 = make_efunc("bf%d" % nest_id, blocked, dynamic_parameters)

        # Per dimension: (min, max, step) of the main region followed by
        # (min, max, step) of the remainder region
        ranges = []
        for it, bi in zip(iterations, interb):
            step = bi.dim.step
            maxb = it.symbolic_max - (it.symbolic_size % step)
            main = (it.symbolic_min, maxb, step)
            leftover = (maxb + 1, it.symbolic_max, it.symbolic_max - maxb)
            ranges.append((main, leftover))

        # One Call to `efunc0` per combination of main/remainder regions
        calls = []
        for combo in product(*ranges):
            dynamic_args = {}
            for bi, (lo, hi, step) in zip(interb, combo):
                dynamic_args[bi.dim] = (lo, hi)
                dynamic_args[bi.dim.step] = (step,)
            calls.append(
                List(header=noinline, body=efunc0.make_call(dynamic_args)))

        # The Calls to `efunc0` are themselves wrapped into a Callable
        dynamic_parameters = [it.dim.root for it in candidates]
        dynamic_parameters.extend([bi.dim.step for bi in interb])
        efunc1 = make_efunc("f%d" % nest_id, calls, dynamic_parameters)

        # Record everything needed to ultimately transform the input `iet`
        mapper[root] = efunc1.make_call()
        efuncs[efunc1] = None
        efuncs[efunc0] = [efunc1.name]

    iet = Transformer(mapper).visit(iet)

    return iet, {'dimensions': block_dims, 'efuncs': efuncs}
def _loop_blocking(self, nodes, state):
    """
    Apply loop blocking to :class:`Iteration` trees.

    Only parallel iteration trees are blocked. Unless ``blockinner`` is set
    in ``self.params``, vectorizable Iterations are left out of the blocked
    set. Trees whose outermost Iteration is not sequential are skipped unless
    ``blockalways`` is set.

    The ``blockshape`` parameter selects the block sizes. Its entries are
    paired, in order, with the blocked Iterations; a value that has no
    ``len()`` is assigned to the first blocked Iteration only. When it is
    missing or empty, a trivial size heuristic is attached to every blocked
    Iteration. For example, for the :class:`Iteration` tree: ::

        for i
          for j
            for k
              ...

    with ``i`` and ``j`` blocked, a two-entry ``blockshape`` yields a
    2-dimensional block.

    Returns the processed tree and, if anything was blocked, a dict with the
    :class:`BlockingArg` list (``'arguments'``) and ``'flags': 'blocking'``.
    """
    exclude_innermost = not self.params.get('blockinner', False)
    ignore_heuristic = self.params.get('blockalways', False)

    # Make sure loop blocking will span as many Iterations as possible
    fold = fold_blockable_tree(nodes, exclude_innermost)

    mapper = {}
    blocked = OrderedDict()
    for tree in retrieve_iteration_tree(fold):
        # Is the Iteration tree blockable ?
        iterations = [it for it in tree if it.is_Parallel]
        if exclude_innermost:
            iterations = [it for it in iterations if not it.is_Vectorizable]
        if len(iterations) <= 1:
            continue
        root = iterations[0]
        if not IsPerfectIteration().visit(root):
            # Illegal/unsupported
            continue
        if not (tree[0].is_Sequential or ignore_heuristic):
            # Heuristic: do not pollute the generated code with blocked nests
            # (longer JIT compilation, worse readability) unless the blockable
            # tree is embedded in a sequential loop (e.g., a timestepping loop)
            continue

        # `mapper` is only updated at the end of this loop body
        nest_id = len(mapper)

        # Decorate intra-block iterations with an IterationProperty
        TAG = tagger(nest_id)

        # Build, one by one, the Iterations that will later be composed
        # to implement loop blocking
        inter_blocks = []
        intra_blocks = []
        remainders = []
        for it in iterations:
            name = "%s%d_block" % (it.dim.name, nest_id)

            # Iteration over blocks
            dim = blocked.setdefault(it, Dimension(name=name))
            bsize = dim.symbolic_size
            bstart = it.limits[0]
            binnersize = it.dim.symbolic_extent + (it.offsets[1] - it.offsets[0])
            bfinish = it.dim.symbolic_end - (binnersize % bsize) - 1
            inter_blocks.append(
                Iteration([], dim, [bstart, bfinish, bsize],
                          offsets=it.offsets, properties=PARALLEL)
            )

            # Iteration within a block
            intra_blocks.append(
                it._rebuild([], limits=(dim, dim + bsize - 1, 1), offsets=(0, 0),
                            properties=it.properties + (TAG, ELEMENTAL))
            )

            # Unitary-increment Iteration over the 'leftover' region, used
            # for the remainder loops that run when a dimension size is not
            # a multiple of the block size
            remainders.append(
                it._rebuild([], limits=[bfinish + 1, it.dim.symbolic_end, 1],
                            offsets=(it.offsets[1], it.offsets[1]))
            )

        innermost_body = iterations[-1].nodes

        # Build blocked Iteration nest
        blocked_tree = compose_nodes(inter_blocks + intra_blocks +
                                     [innermost_body])

        # Build remainder Iterations: one nest per non-empty subset of
        # the blocked dimensions
        remainder_trees = []
        for n in range(len(iterations)):
            for chosen in combinations([it.dim for it in iterations], n + 1):
                # First all inter-block Iterations
                pieces = [b._rebuild(properties=b.properties + (REMAINDER,))
                          for b, r in zip(inter_blocks, remainders)
                          if r.dim not in chosen]
                # Then intra-block or remainder, for each dim (in order)
                properties = (REMAINDER, TAG, ELEMENTAL)
                for b, r in zip(intra_blocks, remainders):
                    handle = r if b.dim in chosen else b
                    pieces.append(handle._rebuild(properties=properties))
                pieces.append(iterations[-1].nodes)
                remainder_trees.append(compose_nodes(pieces))

        # Will replace with blocked loop tree
        mapper[root] = List(body=[blocked_tree] + remainder_trees)

    rebuilt = Transformer(mapper).visit(fold)

    # Finish unrolling any previously folded Iterations
    processed = unfold_blocked_tree(rebuilt)

    # Nothing was blocked: no extra arguments or flags to report
    if not blocked:
        return processed, {}

    # Determine the block shape
    blockshape = self.params.get('blockshape')
    if not blockshape:
        # Use trivial heuristic for a suitable blockshape
        def heuristic(dim_size):
            ths = 8  # FIXME: This really needs to be improved
            return ths if dim_size > ths else 1
        blockshape = dict.fromkeys(blocked, heuristic)
    else:
        try:
            nitems, nrequired = len(blockshape), len(blocked)
            blockshape = dict(zip(blocked, blockshape))
            if nitems > nrequired:
                dle_warning("Provided 'blockshape' has more entries than "
                            "blocked loops; dropping entries ...")
            if nitems < nrequired:
                dle_warning("Provided 'blockshape' has fewer entries than "
                            "blocked loops; dropping dimensions ...")
        except TypeError:
            # `blockshape` has no length: apply it to the first blocked loop
            blockshape = {list(blocked)[0]: blockshape}
        # Blocked loops left without an entry get None
        for key in blocked:
            blockshape.setdefault(key, None)

    # Track any additional arguments required to execute /state.nodes/
    arguments = [BlockingArg(dim, it, blockshape[it])
                 for it, dim in blocked.items()]

    return processed, {'arguments': arguments, 'flags': 'blocking'}
def iet_make(clusters, dtype):
    """
    Build an Iteration/Expression tree (IET) from an iterable of
    :class:`Cluster`s.

    Consecutive clusters share the leading Iterations they have in common,
    unless the corresponding dimension is listed in ``cluster.atomics``.

    :param clusters: The iterable :class:`Cluster`s for which the IET is built.
    :param dtype: The data type of the scalar expressions.
    """
    processed = []
    schedule = OrderedDict()
    for cluster in clusters:
        if cluster.ispace.empty:
            # No Iterations are needed
            processed.extend([Expression(e, dtype) for e in cluster.exprs])
            continue

        root = None
        intervals = cluster.ispace.intervals

        # Can I reuse any of the previously scheduled Iterations ?
        index = 0
        for mine, scheduled in zip(intervals, list(schedule)):
            if mine != scheduled or mine.dim in cluster.atomics:
                break
            root = schedule[scheduled]
            index += 1
        needed = intervals[index:]

        # Build Expressions
        body = []
        for e in cluster.exprs:
            etype = np.int32 if cluster.trace.is_index(e.lhs) else dtype
            body.append(Expression(e, etype))
        if not needed:
            body = List(body=body)

        # Build Iterations, innermost first
        scheduling = []
        for i in reversed(needed):
            # Prepare any necessary unbounded index
            uindices = []
            for j, offs in cluster.ispace.sub_iterators.get(i.dim, []):
                modulo = len(offs)
                for n, o in enumerate(filter_ordered(offs)):
                    vname = Scalar(name="%s%d" % (j.name, n), dtype=np.int32)
                    value = (i.dim + o) % modulo
                    uindices.append(UnboundedIndex(vname, value, value, j, j + o))

            # Retrieve the iteration direction
            direction = cluster.ispace.directions[i.dim]

            # Update IET and scheduling
            guarded = i.dim in cluster.guards
            if guarded:
                # Must wrap within an if-then scope
                body = Conditional(cluster.guards[i.dim], body)
            iteration = Iteration(body, i.dim, i.dim.limits, offsets=i.limits,
                                  direction=direction, uindices=uindices)
            if guarded:
                # Adding (None, None) ensures that nested iterations won't
                # be reused by the next cluster
                scheduling.extend([(None, None), (i, iteration)])
            else:
                scheduling.append((i, iteration))

            # Prepare for next dimension
            body = iteration

        # If /needed/ is != [], root.dim might be a guarded dimension for /cluster/
        if root is not None and root.dim in cluster.guards:
            body = Conditional(cluster.guards[root.dim], body)

        # Update the current schedule (outermost first)
        scheduling = OrderedDict(reversed(scheduling))
        if root is None:
            processed.append(body)
            schedule = scheduling
            continue

        # Append the new body to the reused Iteration and propagate the
        # rebuilt nodes to both the processed trees and the schedule
        nodes = list(root.nodes) + [body]
        transformer = Transformer({root: root._rebuild(nodes, **root.args_frozen)})
        processed = list(transformer.visit(processed))
        kept = list(schedule.items())[:index]
        schedule = OrderedDict(kept + list(scheduling.items()))
        for k, v in list(schedule.items()):
            schedule[k] = transformer.rebuilt.get(v, v)

    return List(body=processed)
def _specialize_iet(self, iet, **kwargs):
    """
    Rewrite ``iet`` so that its affine Iteration trees are run through OPS.

    For each affine tree found by ``find_affine_trees``, an OPS kernel is
    generated via ``opsit`` and appended to ``self._ops_kernels``; the tree
    root is replaced by the corresponding OPS parallel-loop Call. The symbols
    used (and not defined) in those trees, except Constants, get an ``ops_dat``
    declaration before the tree and a fetch after it. The result is wrapped
    between ``ops_init``/``ops_partition``/``ops_exit`` Calls.

    If no affine tree is found, ``iet`` is returned untouched.

    :param iet: The input Iteration/Expression tree.
    :param kwargs: Unused here.
    :raises AssertionError: If the affine trees do not all have the same
                            number of dimensions.
    """
    warning("The OPS backend is still work-in-progress")

    affine_trees = find_affine_trees(iet).items()

    # If there is no affine trees, then there is no loop to be optimized using OPS.
    if not affine_trees:
        return iet

    ops_init = Call(namespace['ops_init'], [0, 0, 2])
    ops_partition = Call(namespace['ops_partition'], Literal('""'))
    ops_exit = Call(namespace['ops_exit'])

    # Extract all symbols that need to be converted to ops_dat
    dims = []
    to_dat = set()
    for _, tree in affine_trees:
        dims.append(len(tree[0].dimensions))
        symbols = set(FindSymbols('symbolics').visit(tree[0].root))
        symbols -= set(FindSymbols('defines').visit(tree[0].root))
        to_dat |= symbols

    # Everything below uses `dims[0]` for all kernels, so check the
    # assumption up front, before any state on `self` is modified.
    # NOTE: the condition must go through `all(...)`; a bare generator
    # expression is always truthy and would make the check a no-op.
    assert all(d == dims[0] for d in dims), \
        ("The OPS backend currently assumes that all kernels "
         "have the same number of dimensions")

    # Create the OPS block for this problem
    ops_block = OpsBlock('block')
    ops_block_init = Expression(ClusterizedEq(Eq(
        ops_block,
        namespace['ops_decl_block'](
            dims[0],
            Literal('"block"')
        )
    )))

    # To ensure deterministic code generation we order the datasets to
    # be generated (since a set is an unordered collection)
    to_dat = filter_sorted(to_dat)

    name_to_ops_dat = {}
    pre_time_loop = []
    after_time_loop = []
    for f in to_dat:
        if f.is_Constant:
            continue

        pre_time_loop.extend(list(create_ops_dat(f, name_to_ops_dat, ops_block)))
        # To return the result to Devito, it is necessary to copy the data
        # from the dat object back to the CPU memory.
        after_time_loop.extend(create_ops_fetch(f,
                                                name_to_ops_dat,
                                                self.time_dimension.extreme_max))

    # Generate ops kernels for each offloadable iteration tree
    mapper = {}
    for n, (_, tree) in enumerate(affine_trees):
        pre_loop, ops_kernel, ops_par_loop_call = opsit(
            tree, n, name_to_ops_dat, ops_block, dims[0]
        )

        pre_time_loop.extend(pre_loop)
        self._ops_kernels.append(ops_kernel)
        mapper[tree[0].root] = ops_par_loop_call
        # Roots not mapped above end up mapped to None (drop trees)
        mapper.update({i.root: mapper.get(i.root) for i in tree})

    iet = Transformer(mapper).visit(iet)

    self._headers.append(namespace['ops_define_dimension'](dims[0]))
    self._includes.extend(['stdio.h', 'ops_seq.h'])

    body = [ops_init, ops_block_init, *pre_time_loop, ops_partition, iet,
            *after_time_loop, ops_exit]

    return List(body=body)
def iet_insert_C_decls(iet, func_table=None):
    """
    Return a new Iteration/Expression tree in which the symbols used by
    ``iet`` are declared. Declarations are placed as close as possible to the
    first symbol use.

    :param iet: The input Iteration/Expression tree.
    :param func_table: (Optional) a mapper from callable names within ``iet``
                       to :class:`Callable`s. The entries flagged as local
                       are replaced, in place, with transformed copies.
    """
    func_table = func_table or {}
    allocator = Allocator()
    mapper = OrderedDict()

    def is_not_parallel(node):
        # Selection key used to pick the stack allocation site
        return not node.is_Parallel

    # Detect all IET nodes accessing symbols that need to be declared;
    # the bodies of local callables are inspected right before their Call
    scopes = []
    me = MapExpressions()
    for node, enclosing in me.visit(iet).items():
        if node.is_Call:
            func = func_table.get(node.name)
            if func is not None and func.local:
                scopes.extend(me.visit(func.root, queue=list(enclosing)).items())
        scopes.append((node, enclosing))

    # Classify, and then schedule declarations to stack/heap
    for node, enclosing in scopes:
        if node.is_Expression:
            if node.is_scalar:
                # Inline declaration
                mapper[node] = LocalExpression(**node.args)
                continue
            objs = [node.write]
        elif node.is_Call:
            objs = node.params
        else:
            raise NotImplementedError("Cannot schedule declarations for IET "
                                      "node of type `%s`" % type(node))
        for obj in objs:
            try:
                if obj.is_LocalObject:
                    # On the stack
                    site = enclosing[-1] if enclosing else iet
                    allocator.push_stack(site, obj)
                elif obj.is_Array:
                    if obj._mem_external:
                        # Nothing to do; e.g., a user-provided Function
                        pass
                    elif obj._mem_stack:
                        # On the stack
                        site = filter_iterations(enclosing, key=is_not_parallel,
                                                 stop='asap') or [iet]
                        allocator.push_stack(site[-1], obj)
                    else:
                        # On the heap, as a tensor that must be globally accessible
                        allocator.push_heap(obj)
            except AttributeError:
                # E.g., a generic SymPy expression
                pass

    # Introduce declarations on the stack
    for site, objs in allocator.onstack:
        mapper[site] = tuple(Element(obj) for obj in objs)
    iet = Transformer(mapper, nested=True).visit(iet)
    for name, meta in list(func_table.items()):
        if meta.local:
            func_table[name] = MetaCall(Transformer(mapper).visit(meta.root),
                                        meta.local)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet
def relax_incr_dimensions(iet, **kwargs):
    """
    Recast Iterations over IncrDimensions as ElementalFunctions; insert
    ElementalCalls to iterate over the "main" and "remainder" regions induced
    by the IncrDimensions.

    The ``sregistry`` keyword argument is required; it supplies the names of
    the generated functions.

    Returns the transformed IET and a dict with the generated ``'efuncs'``.
    """
    sregistry = kwargs['sregistry']

    efuncs = []
    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        iterations = [it for it in tree if it.dim.is_Incr]
        if not iterations:
            continue

        root = iterations[0]
        if root in mapper:
            # Already processed through another tree
            continue

        outer, inner = split(iterations, lambda i: not i.dim.parent.is_Incr)

        # Compute the iteration ranges: two (min, max, step) triples
        # per outer Iteration
        ranges = []
        for it in outer:
            step = it.dim.step
            maxb = it.symbolic_max - (it.symbolic_size % step)
            first = (it.symbolic_min, maxb, step)
            second = (maxb + 1, it.symbolic_max, it.symbolic_max - maxb)
            ranges.append((first, second))

        # Remove any offsets
        # E.g., `x = x_m + 2 to x_M - 2` --> `x = x_m to x_M`
        relaxed = []
        for it in outer:
            rdim = it.dim.root
            relaxed.append(it._rebuild(
                limits=(rdim.symbolic_min, rdim.symbolic_max, it.step)
            ))
        outer = relaxed

        # Create the ElementalFunction
        name = sregistry.make_name(prefix="bf")
        body = compose_nodes(outer)
        dynamic_parameters = flatten((it.symbolic_bounds, it.step) for it in outer)
        dynamic_parameters += [it.step for it in inner if not is_integer(it.step)]
        efunc = make_efunc(name, body, dynamic_parameters)

        efuncs.append(efunc)

        # Create the ElementalCalls, one per combination of ranges
        calls = []
        for combo in product(*ranges):
            dynamic_args_mapper = {}
            for it, (lower, upper, size) in zip(outer, combo):
                dynamic_args_mapper[it.symbolic_min] = lower
                dynamic_args_mapper[it.symbolic_max] = upper
                dynamic_args_mapper[it.step] = size
                for j in inner:
                    if j.dim.root is it.dim.root and not is_integer(j.step):
                        value = j.step if size is it.step else size
                        dynamic_args_mapper[j.step] = (value,)
            calls.append(efunc.make_call(dynamic_args_mapper))

        mapper[root] = List(body=calls)

    iet = Transformer(mapper).visit(iet)

    return iet, {'efuncs': efuncs}
def _create_elemental_functions(self, nodes, state):
    """
    Extract :class:`Iteration` sub-trees and move them into :class:`Callable`s.

    Currently, only tagged, elementizable Iteration objects are targeted.

    Returns the transformed tree, in which each extracted sub-tree is
    replaced by a Call, and a dict with the generated Callables under
    ``'elemental_functions'``.
    """
    noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

    functions = OrderedDict()
    mapper = {}
    for tree in retrieve_iteration_tree(nodes, mode='superset'):
        # Search an elementizable sub-tree (if any)
        tagged = filter_iterations(tree, lambda i: i.tag is not None, 'asap')
        if not tagged:
            continue
        root = tagged[0]
        if not root.is_Elementizable:
            continue
        target = tree[tree.index(root):]

        # Elemental function arguments, as (call argument, parameter) pairs
        args = []  # Found so far (scalars, tensors)
        maybe_required = set()  # Scalars that *may* have to be passed in
        not_required = set()  # Elemental function locally declared scalars

        # Build a new Iteration/Expression tree with free bounds
        free = []
        for it in target:
            name, bounds = it.dim.name, it.bounds_symbolic

            # Iteration bounds
            start = Scalar(name='%s_start' % name, dtype=np.int32)
            finish = Scalar(name='%s_finish' % name, dtype=np.int32)
            args.extend(zip([ccode(b) for b in bounds], (start, finish)))

            # Iteration unbounded indices
            ufunc = [Scalar(name='%s_ub%d' % (name, n), dtype=np.int32)
                     for n, _ in enumerate(it.uindices)]
            args.extend(zip([ccode(u.start) for u in it.uindices], ufunc))

            limits = [Symbol(start.name), Symbol(finish.name), 1]
            uindices = [UnboundedIndex(u.index, it.dim + as_symbol(k))
                        for u, k in zip(it.uindices, ufunc)]
            free.append(it._rebuild(limits=limits, offsets=None, uindices=uindices))

            not_required.add(it.dim)
            not_required.update(u.index for u in it.uindices)

        # Construct elemental function body, and inspect it
        free = NestedTransformer(dict(zip(target, free))).visit(root)
        expressions = FindNodes(Expression).visit(free)
        fsymbols = FindSymbols('symbolics').visit(free)

        # Add all definitely-required arguments
        not_required.update({e.output for e in expressions if e.is_scalar})
        for s in fsymbols:
            if s in not_required:
                continue
            if s.is_Array:
                args.append(("(%s*)%s" % (c.dtype_to_ctype(s.dtype), s.name), s))
            elif s.is_TensorFunction:
                args.append(("%s_vec" % s.name, s))
            elif s.is_Scalar:
                args.append((s.name, s))

        # Add all maybe-required arguments that turn out to be required
        maybe_required.update(FindSymbols(mode='free-symbols').visit(free))
        for s in fsymbols:
            not_required.update({as_symbol(s), s.indexify()})
            for extent in s.symbolic_shape:
                maybe_required.update(extent.free_symbols)
        required = filter_sorted(maybe_required - not_required,
                                 key=attrgetter('name'))
        args.extend([(s.name, Scalar(name=s.name, dtype=s.dtype))
                     for s in required])

        call, params = zip(*args)
        name = "f_%d" % root.tag

        # Produce the new Call
        mapper[root] = List(header=noinline, body=Call(name, call))

        # Produce the new Callable
        functions.setdefault(
            name, Callable(name, free, 'void', flatten(params), ('static',))
        )

    # Transform the main tree
    processed = Transformer(mapper).visit(nodes)

    return processed, {'elemental_functions': functions.values()}
def iet_insert_C_decls(iet, external=None):
    """
    Given an IET, build a new tree with the necessary symbol declarations.
    Declarations are placed as close as possible to the first symbol occurrence.

    Parameters
    ----------
    iet : Node
        The input Iteration/Expression tree.
    external : tuple, optional
        The symbols defined in some outer Callable, which therefore must not
        be re-defined.
    """
    external = external or []

    def is_not_parallel(node):
        # Selection key used to pick the stack allocation site
        return not node.is_Parallel

    # Classify and then schedule declarations to stack/heap
    allocator = Allocator()
    mapper = OrderedDict()
    for node, enclosing in MapExpressions().visit(iet).items():
        if node.is_Expression:
            if node.is_scalar_assign:
                # Inline declaration
                mapper[node] = LocalExpression(**node.args)
                continue
            objs = [node.write]
        elif node.is_Call:
            objs = node.arguments
        # NOTE(review): there is no `else` branch here; a node that is
        # neither an Expression nor a Call would reuse `objs` from the
        # previous iteration -- confirm MapExpressions never yields one

        for obj in objs:
            try:
                if obj.is_LocalObject:
                    # On the stack
                    site = enclosing[-1] if enclosing else iet
                    allocator.push_stack(site, obj)
                elif obj.is_Array:
                    if obj in external:
                        # The Array is to be defined in some foreign IET
                        pass
                    elif obj._mem_stack:
                        # On the stack
                        site = filter_iterations(enclosing, key=is_not_parallel,
                                                 stop='asap') or [iet]
                        allocator.push_stack(site[-1], obj)
                    else:
                        # On the heap, as a tensor that must be globally accessible
                        allocator.push_heap(obj)
            except AttributeError:
                # E.g., a generic SymPy expression
                pass

    # Introduce declarations on the stack
    for site, objs in allocator.onstack:
        mapper[site] = tuple(Element(obj) for obj in objs)
    iet = Transformer(mapper, nested=True).visit(iet)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet
def _build_casts(self, iet):
    """
    Introduce array casts.

    One ArrayCast is created for every entry of ``self.input`` that is a
    tensor with external memory; the casts are placed in front of ``iet``
    within a new List.
    """
    casts = []
    for f in self.input:
        if f.is_Tensor and f._mem_external:
            casts.append(ArrayCast(f))
    return List(body=casts + [iet])
def sendrecv(f, fixed):
    """
    Construct an IET performing a halo exchange along arbitrary dimension
    and side.

    The result is a static ``sendrecv_<f.name>`` Callable that posts a
    non-blocking receive, gathers the data to send, posts a non-blocking
    send, waits for both, and finally scatters the received data (unless
    the sender rank is ``MPI_PROC_NULL``).

    ``f`` must be a Function defined over a grid; the dimensions listed in
    ``fixed`` are excluded from the buffers.
    """
    assert f.is_Function
    assert f.grid is not None

    comm = f.grid.distributor._C_comm

    # Gather/scatter buffers span all non-fixed dimensions
    buf_dims = []
    for d in f.dimensions:
        if d not in fixed:
            buf_dims.append(Dimension(name='buf_%s' % d.root))
    bufg = Array(name='bufg', dimensions=buf_dims, dtype=f.dtype, scope='stack')
    bufs = Array(name='bufs', dimensions=buf_dims, dtype=f.dtype, scope='stack')

    # Stand-in for the data of `f`
    dat_dims = [Dimension(name='dat_%s' % d.root) for d in f.dimensions]
    dat = Array(name='dat', dimensions=dat_dims, dtype=f.dtype, scope='external')

    # Offsets for the gather (og*) and scatter (os*) calls
    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]
    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    gather_params = [bufg] + list(bufg.shape) + [dat] + list(dat.shape) + ofsg
    gather = Call('gather_%s' % f.name, gather_params)
    scatter_params = [bufs] + list(bufs.shape) + [dat] + list(dat.shape) + ofss
    scatter = Call('scatter_%s' % f.name, scatter_params)

    # The scatter must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    # Local MPI objects
    MPI_Status = type('MPI_Status', (c_void_p,), {})
    srecv = LocalObject(name='srecv', dtype=MPI_Status)

    MPI_Request = type('MPI_Request', (c_void_p,), {})
    rrecv = LocalObject(name='rrecv', dtype=MPI_Request)
    rsend = LocalObject(name='rsend', dtype=MPI_Request)

    # Number of items exchanged
    count = reduce(mul, bufs.shape, 1)
    recv_args = [bufs, count, Macro(numpy_to_mpitypes(f.dtype)),
                 fromrank, '13', comm, rrecv]
    recv = Call('MPI_Irecv', recv_args)
    send_args = [bufg, count, Macro(numpy_to_mpitypes(f.dtype)),
                 torank, '13', comm, rsend]
    send = Call('MPI_Isend', send_args)

    waitrecv = Call('MPI_Wait', [rrecv, srecv])
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    exchange = List(body=[recv, gather, send, waitsend, waitrecv, scatter])
    iet = List(body=[ArrayCast(dat), iet_insert_C_decls(exchange)])

    parameters = [dat] + list(dat.shape) + list(bufs.shape) + ofsg + ofss
    parameters = parameters + [fromrank, torank, comm]
    return Callable('sendrecv_%s' % f.name, iet, 'void', parameters, ('static',))
def test_create_efuncs_complex(complex_function):
    # Check the C code generated when the innermost Iteration of every tree
    # in `complex_function` is tagged, marked ELEMENTAL and then processed
    # with `transform(..., mode='split')`.

    # Innermost Iteration of each iteration tree
    roots = [i[-1] for i in retrieve_iteration_tree(complex_function)]
    # Tag each of them with a distinct tag ...
    retagged = [j._rebuild(properties=tagger(i)) for i, j in enumerate(roots)]
    # ... and additionally mark them as ELEMENTAL
    mapper = {
        i: j._rebuild(properties=(j.properties + (ELEMENTAL, )))
        for i, j in zip(roots, retagged)
    }
    function = Transformer(mapper).visit(complex_function)
    handle = transform(function, mode='split')
    block = List(body=[handle.nodes] + handle.efuncs)
    output = str(block.ccode)
    # Make output compiler independent
    output = [
        i for i in output.split('\n')
        if all([j not in i for j in ('#pragma', '/*')])
    ]
    # NOTE(review): the expected-output literals below are reproduced as they
    # appear in this copy of the file, where whitespace looks collapsed; since
    # `output` is re-joined with newlines, confirm the line breaks and
    # indentation against the actual generated code.
    assert '\n'.join(output) == \
        ("""void foo(float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec) { for (int i = 0; i <= 3; i += 1) { f_0((float *)a,(float *)b,i_size,i,4,0); for (int j = 0; j <= 5; j += 1) { f_1((float *)a,(float *)b,(float *)c,(float *)d,i_size,j_size,k_size,i,j,7,0); } f_2((float *)a,(float *)b,i_size,i,4,0); } } void f_0(float *restrict a_vec, float *restrict b_vec,"""
         """ const int i_size, const int i, const int sf_M, const int sf_m) { float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec; float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec; for (int s = sf_m; s <= sf_M; s += 1) { b[i] = a[i] + pow(b[i], 2) + 3; } } void f_1(float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec,"""
         """ const int i_size, const int j_size, const int k_size,"""
         """ const int i, const int j, const int kf_M, const int kf_m) { float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec; float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec; float (*restrict c)[j_size] __attribute__ ((aligned (64))) = (float (*)[j_size]) c_vec; float (*restrict d)[j_size][k_size] __attribute__ ((aligned (64))) ="""
         """ (float (*)[j_size][k_size]) d_vec; for (int k = kf_m; k <= kf_M; k += 1) { a[i] = a[i]*b[i]*c[i][j]*d[i][j][k]; a[i] = 4*(a[i] + c[i][j])*(b[i] + d[i][j][k]); } } void f_2(float *restrict a_vec, float *restrict b_vec,"""
         """ const int i_size, const int i, const int qf_M, const int qf_m) { float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec; float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec; for (int q = qf_m; q <= qf_M; q += 1) { a[i] = 8.0F*a[i] + 6.0F/b[i]; } }""")
def _make_fetchwaitprefetch(self, iet, sync_ops, pieces, root):
    """
    Wrap ``iet`` with the logic that fetches, waits for and prefetches the
    data of the functions in ``sync_ops``.

    Three groups of nodes are built out of ``sync_ops``: guarded initial
    fetches, which go into an ``init_device`` Callable called from
    ``pieces.init``; "present" clauses, placed right before ``iet``; and
    guarded prefetches, which go into a thread context created via
    ``make_thread_ctx``. ``pieces`` is updated in place with the generated
    functions, the initialization and finalization code, and the threads.

    Returns the new IET: a busy-wait on the thread flag, the present
    clauses, the input ``iet`` and the thread activation.
    """
    def region_mask(s, anchor):
        # `(anchor, size)` along the dimension being synchronized, FULL elsewhere
        return [(anchor, s.size) if d.root is s.dim.root else FULL
                for d in s.dimensions]

    fetches = []
    prefetches = []
    presents = []
    for s in sync_ops:
        # Initial fetch at the starting edge; prefetch one step ahead
        if s.direction is Forward:
            edge = s.dim.symbolic_min
            pfc = s.fetch + 1
            ahead = s.dim + 1
        else:
            edge = s.dim.symbolic_max
            pfc = s.fetch - 1
            ahead = s.dim - 1
        fc = s.fetch.subs(s.dim, edge)
        fc_cond = s.next_cbk(edge)
        pfc_cond = s.next_cbk(ahead)

        # Construct init IET
        fetch = PragmaList(self.lang._map_to(s.function, region_mask(s, fc)),
                           {s.function} | fc.free_symbols)
        fetches.append(Conditional(fc_cond, fetch))

        # Construct present clauses
        present = self.lang._map_present(s.function, region_mask(s, s.fetch))
        presents.extend(as_list(present))

        # Construct prefetch IET
        prefetch = PragmaList(
            self.lang._map_to_wait(s.function, region_mask(s, pfc),
                                   SharedData._field_id),
            {s.function} | pfc.free_symbols
        )
        prefetches.append(Conditional(pfc_cond, prefetch))

    # Turn init IET into a Callable
    functions = filter_ordered(s.function for s in sync_ops)
    name = self.sregistry.make_name(prefix='init_device')
    body = List(body=fetches)
    parameters = filter_sorted(functions + derive_parameters(body))
    pieces.funcs.append(Callable(name, body, 'void', parameters, 'static'))

    # Perform initial fetch by the main thread
    init_call = List(header=c.Comment("Initialize data stream"),
                     body=[Call(name, parameters), BlankLine])
    pieces.init.append(init_call)

    # Turn prefetch IET into a ThreadFunction
    name = self.sregistry.make_name(prefix='prefetch_host_to_device')
    body = List(header=c.Line(), body=prefetches)
    tctx = make_thread_ctx(name, body, root, None, sync_ops, self.sregistry)
    pieces.funcs.extend(tctx.funcs)

    # Glue together all the IET pieces, including the activation logic
    sdata = tctx.sdata
    threads = tctx.threads
    flag = FieldFromComposite(sdata._field_flag, sdata[threads.index])
    iet = List(body=[
        BlankLine,
        BusyWait(CondNe(flag, 1)),
        List(header=presents),
        iet,
        tctx.activate
    ])

    # Fire up the threads
    pieces.init.append(tctx.init)
    pieces.threads.append(threads)

    # Final wait before jumping back to Python land
    pieces.finalize.append(tctx.finalize)

    return iet
def _profile_sections(self, iet):
    """
    Introduce C-level profiling nodes within the Iteration/Expression tree.

    This implementation adds no profiling node: it just wraps ``iet`` into
    a List and returns it together with ``None``.
    """
    wrapped = List(body=iet)
    return wrapped, None
def _loop_blocking(self, iet):
    """
    Apply loop blocking to PARALLEL Iteration trees.

    Each blockable nest is rebuilt over new BlockDimensions and promoted to
    an elemental function (``bf<n>``); the original nest is replaced by a
    List of Calls to it, one per combination of the per-Iteration ranges
    computed below.

    The ``blockinner`` and ``blockalways`` entries of ``self.params`` are
    read as booleans. Without ``blockinner`` the last parallel Iteration of
    a tree is not blocked; with ``blockalways`` the cost heuristics are
    bypassed.

    Returns the transformed IET and a dict with the new BlockDimensions
    (``'dimensions'``), the elemental functions (``'efuncs'``) and the block
    steps (``'args'``).
    """
    blockinner = bool(self.params.get('blockinner'))
    blockalways = bool(self.params.get('blockalways'))

    # Make sure loop blocking will span as many Iterations as possible
    iet = fold_blockable_tree(iet, blockinner)

    mapper = {}
    efuncs = []
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Is the Iteration tree blockable ?
        iterations = filter_iterations(tree, lambda i: i.is_Parallel)
        if not blockinner:
            iterations = iterations[:-1]
        if len(iterations) <= 1:
            continue
        root = iterations[0]

        if not blockalways:
            # Heuristically bypass loop blocking if we think `tree`
            # won't be computationally expensive. This will help with code
            # size/readability, JIT time, and auto-tuning time
            if not tree.root.is_Sequential and not iet.is_Callable:
                # E.g., not inside a time-stepping Iteration
                continue
            if any(it.dim.is_Sub and it.dim.local for it in tree):
                # At least an outer Iteration is over a local SubDimension,
                # which suggests the computational cost of this Iteration
                # nest will be negligible w.r.t. the "core" Iteration nest
                # (making use of non-local (Sub)Dimensions only)
                continue

        if not IsPerfectIteration().visit(root):
            # Don't know how to block non-perfect nests
            continue

        # `mapper` only grows at the end of this loop body
        nest_id = len(mapper)

        # Apply loop blocking to `tree`
        interb, intrab = [], []
        for it in iterations:
            bdim = BlockDimension(it.dim,
                                  name="%s%d_blk" % (it.dim.name, nest_id))
            block_dims.append(bdim)
            # Build Iteration over blocks
            if it.is_Affine:
                properties = (PARALLEL,) + (AFFINE,)
            else:
                properties = (PARALLEL,) + ()
            interb.append(Iteration([], bdim, bdim.symbolic_max,
                                    properties=properties))
            # Build Iteration within a block
            intrab.append(it._rebuild([], limits=(bdim, bdim+bdim.step-1, 1),
                                      offsets=(0, 0)))

        # Construct the blocked tree
        blocked = unfold_blocked_tree(
            compose_nodes(interb + intrab + [iterations[-1].nodes])
        )

        # Promote to a separate Callable
        dynamic_parameters = flatten((bi.dim, bi.dim.symbolic_size)
                                     for bi in interb)
        efunc = make_efunc("bf%d" % nest_id, blocked, dynamic_parameters)
        efuncs.append(efunc)

        # Compute the iteration ranges: two (min, max, step) triples
        # per blocked Iteration
        ranges = []
        for it, bi in zip(iterations, interb):
            step = bi.dim.step
            maxb = it.symbolic_max - (it.symbolic_size % step)
            first = (it.symbolic_min, maxb, step)
            second = (maxb + 1, it.symbolic_max, it.symbolic_max - maxb)
            ranges.append((first, second))

        # Build Calls to the `efunc`
        body = []
        for combo in product(*ranges):
            dynamic_args_mapper = {}
            for bi, (lower, upper, size) in zip(interb, combo):
                dynamic_args_mapper[bi.dim] = (lower, upper)
                dynamic_args_mapper[bi.dim.step] = (size,)
            call = efunc.make_call(dynamic_args_mapper)
            body.append(List(body=call))

        mapper[root] = List(body=body)

    iet = Transformer(mapper).visit(iet)

    summary = {
        'dimensions': block_dims,
        'efuncs': efuncs,
        'args': [d.step for d in block_dims],
    }
    return iet, summary
def _loop_blocking(self, nodes, state):
    """
    Apply loop blocking to PARALLEL Iteration trees.

    Each blockable nest is replaced by a List made of the blocked nest
    followed by the remainder nests, one for every non-empty subset of the
    blocked dimensions.

    Returns the processed tree and a dict with the BlockDimensions that
    were created (``'dimensions'``).
    """
    exclude_innermost = not self.params.get('blockinner', False)
    ignore_heuristic = self.params.get('blockalways', False)

    # Make sure loop blocking will span as many Iterations as possible
    fold = fold_blockable_tree(nodes, exclude_innermost)

    mapper = {}
    blocked = OrderedDict()
    for tree in retrieve_iteration_tree(fold):
        # Is the Iteration tree blockable ?
        iterations = [it for it in tree if it.is_Parallel]
        if exclude_innermost:
            iterations = [it for it in iterations if not it.is_Vectorizable]
        if len(iterations) <= 1:
            continue
        root = iterations[0]
        if not IsPerfectIteration().visit(root):
            # Illegal/unsupported
            continue
        if not (tree.root.is_Sequential or ignore_heuristic):
            # Heuristic: do not pollute the generated code with blocked nests
            # (longer JIT compilation, worse readability) unless the blockable
            # tree is embedded in a sequential loop (e.g., a timestepping loop)
            continue

        # `mapper` is only updated at the end of this loop body
        nest_id = len(mapper)

        # Decorate intra-block iterations with an IterationProperty
        TAG = tagger(nest_id)

        # Build, one by one, the Iterations that will later be composed
        # to implement loop blocking
        inter_blocks = []
        intra_blocks = []
        remainders = []
        for it in iterations:
            # Iteration over blocks
            name = "%s%d_block" % (it.dim.name, nest_id)
            dim = blocked.setdefault(it, BlockDimension(it.dim, name=name))
            binnersize = it.symbolic_size + (it.offsets[1] - it.offsets[0])
            bmax = it.dim.symbolic_max - (binnersize % dim.step)
            inter_blocks.append(
                Iteration([], dim, bmax, offsets=it.offsets, properties=PARALLEL)
            )

            # Iteration within a block
            intra_blocks.append(
                it._rebuild([], limits=(dim, dim + dim.step - 1, 1),
                            offsets=(0, 0),
                            properties=it.properties + (TAG, ELEMENTAL))
            )

            # Unitary-increment Iteration over the 'leftover' region, used
            # for the remainder loops that run when a dimension size is not
            # a multiple of the block size
            remainders.append(
                it._rebuild([], limits=[bmax + 1, it.dim.symbolic_max, 1],
                            offsets=(it.offsets[1], it.offsets[1]))
            )

        # Build blocked Iteration nest
        blocked_tree = compose_nodes(inter_blocks + intra_blocks +
                                     [iterations[-1].nodes])

        # Build remainder Iterations
        remainder_trees = []
        for n in range(len(iterations)):
            for chosen in combinations([it.dim for it in iterations], n + 1):
                # First all inter-block Iterations
                pieces = [b._rebuild(properties=b.properties + (REMAINDER,))
                          for b, r in zip(inter_blocks, remainders)
                          if r.dim not in chosen]
                # Then intra-block or remainder, for each dim (in order)
                properties = (REMAINDER, TAG, ELEMENTAL)
                for b, r in zip(intra_blocks, remainders):
                    handle = r if b.dim in chosen else b
                    pieces.append(handle._rebuild(properties=properties))
                pieces.append(iterations[-1].nodes)
                remainder_trees.append(compose_nodes(pieces))

        # Will replace with blocked loop tree
        mapper[root] = List(body=[blocked_tree] + remainder_trees)

    rebuilt = Transformer(mapper).visit(fold)

    # Finish unrolling any previously folded Iterations
    processed = unfold_blocked_tree(rebuilt)

    return processed, {'dimensions': list(blocked.values())}
def _minimize_remainders(self, iet):
    """
    Reshape temporary tensors and adjust loop trip counts to prevent as many
    compiler-generated remainder loops as possible.

    Only trees with exactly one vectorizable Iteration are considered. The
    Arrays written within it get extra padding along their last dimension,
    and the Iteration upper bound is replaced by a new symbol, initialized
    in a header placed before the tree.

    Returns the processed tree and an empty dict. If the platform has no
    SIMD information for a written Array's dtype (``KeyError``), the input
    ``iet`` is returned as is.
    """
    # The innermost dimension is the one that might get padded
    p_dim = -1

    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        vector_iterations = [it for it in tree if it.is_Vectorizable]
        if len(vector_iterations) != 1:
            continue
        root = vector_iterations[0]

        # Padding
        writes = [e.write for e in FindNodes(Expression).visit(root)
                  if e.write.is_Array]
        padding = []
        for w in writes:
            try:
                simd_items = self.platform.simd_items_per_reg(w.dtype)
            except KeyError:
                return iet, {}
            padding.append(simd_items - w.shape[-1] % simd_items)
        if len(set(padding)) != 1:
            # Padding must be uniform -- not the case, so giving up
            continue
        padding = padding[0]
        for w in writes:
            left, right = w._padding[p_dim][0], w._padding[p_dim][1]
            padded = (left, right + padding)
            w.update(padding=w._padding[:p_dim] + (padded,))

        # Dynamic trip count adjustment
        endpoint = root.symbolic_max
        if not endpoint.is_Symbol:
            continue
        externals = set(s.symbolic_shape[-1] for s in FindSymbols().visit(root)
                        if s.is_Tensor)
        # The padded bound must stay within each tensor's last-dimension shape
        condition = [root.symbolic_max + padding < extent
                     for _ in root.uindices
                     for extent in externals]
        condition = ' && '.join(ccode(cond) for cond in condition)
        endpoint_padded = endpoint.func('_%s' % endpoint.name)
        padded_or_plain = '(%s) ? %s : %s' % (condition,
                                              ccode(endpoint + padding),
                                              endpoint)
        init = cgen.Initializer(cgen.Value("const int", endpoint_padded),
                                cgen.Line(padded_or_plain))

        # Update the Iteration bound
        limits = list(root.limits)
        limits[1] = endpoint_padded.func(endpoint_padded.name)
        rebuilt = list(tree)
        rebuilt[rebuilt.index(root)] = root._rebuild(limits=limits)

        mapper[tree[0]] = List(header=init, body=compose_nodes(rebuilt))

    processed = Transformer(mapper).visit(iet)

    return processed, {}
def iet_make(clusters, dtype):
    """
    Build an Iteration/Expression tree (IET) from an iterable of
    :class:`Cluster`s.

    Consecutive clusters share the leading Iterations they have in common,
    unless the corresponding dimension is listed in
    ``clusters.atomics[cluster]``.

    :param clusters: The iterable :class:`Cluster`s for which the IET is built.
    :param dtype: The data type of the scalar expressions.
    """
    processed = []
    schedule = OrderedDict()
    for cluster in clusters:
        if cluster.ispace.empty:
            # No Iterations are needed
            processed.extend([Expression(e, dtype) for e in cluster.exprs])
            continue

        root = None
        intervals = cluster.ispace.intervals

        # Can I reuse any of the previously scheduled Iterations ?
        index = 0
        for mine, scheduled in zip(intervals, list(schedule)):
            if mine != scheduled or mine.dim in clusters.atomics[cluster]:
                break
            root = schedule[scheduled]
            index += 1
        needed = intervals[index:]

        # Build Iterations, including any necessary unbounded index
        iters = []
        for i in needed:
            uindices = []
            for j, offs in cluster.ispace.sub_iterators.get(i.dim, []):
                for n, o in enumerate(filter_ordered(offs)):
                    vname = Scalar(name="%s%d" % (j.name, n), dtype=np.int32)
                    value = (i.dim + o) % j.modulo
                    uindices.append(UnboundedIndex(vname, value, value, j, j + o))
            iters.append(Iteration([], i.dim, i.dim.limits, offsets=i.limits,
                                   uindices=uindices))

        # Build Expressions
        exprs = []
        for k, v in cluster.trace.items():
            etype = np.int32 if cluster.trace.is_index(k) else dtype
            exprs.append(Expression(v, etype))

        # Compose Iterations and Expressions
        body, tree = compose_nodes(iters + [exprs], retrieve=True)

        # Update the current scheduling
        scheduling = OrderedDict(zip(needed, tree))
        if root is None:
            processed.append(body)
            schedule = scheduling
            continue

        # Append the new body to the reused Iteration and propagate the
        # rebuilt nodes to both the processed trees and the schedule
        nodes = list(root.nodes) + [body]
        transformer = Transformer({root: root._rebuild(nodes, **root.args_frozen)})
        processed = list(transformer.visit(processed))
        kept = list(schedule.items())[:index]
        schedule = OrderedDict(kept + list(scheduling.items()))
        for k, v in list(schedule.items()):
            schedule[k] = transformer.rebuilt.get(v, v)

    return List(body=processed)