def inject(self, field, expr, offset=0):
    """
    Generate equations injecting an arbitrary expression into a field.

    Parameters
    ----------
    field : Function
        Input field into which the injection is performed.
    expr : expr-like
        Injected expression.
    offset : int, optional
        Additional offset from the boundary.
    """
    expr = indexify(expr)
    field = indexify(field)

    p, _ = self.gridpoints.indices
    dim_subs = []
    coeffs = []
    for i, d in enumerate(self.grid.dimensions):
        rd = DefaultDimension(name="r%s" % d.name, default_value=self.r)
        dim_subs.append((d, INT(rd + self.gridpoints[p, i])))
        coeffs.append(self.interpolation_coeffs[p, i, rd])
    rhs = prod(coeffs) * expr
    field = field.subs(dim_subs)
    return [Eq(field, field + rhs.subs(dim_subs))]

def _make_symbolic_sections_from_imask(cls, f, imask):
    datasize = cls._map_data(f)
    if imask is None:
        imask = [FULL]*len(datasize)

    sections = []
    for i, j in zip(imask, datasize):
        if i is FULL:
            start, size = 0, j
        else:
            try:
                start, size = i
            except TypeError:
                start, size = i, 1
        sections.append((start, size))

    # Unroll (or "flatten") the remaining Dimensions not captured by `imask`
    if len(imask) < len(datasize):
        try:
            start, size = sections.pop(-1)
        except IndexError:
            start, size = (0, 1)
        remainder_size = prod(datasize[len(imask):])
        # The reason we may see a Wildcard is detailed in the `linearize_transfer`
        # pass, take a look there for more info. Basically, a Wildcard here means
        # that the symbol `start` is actually a temporary whose value already
        # represents the unrolled size
        if not isinstance(start, Wildcard):
            start *= remainder_size
        size *= remainder_size
        sections.append((start, size))

    return sections

def interpolate(self, expr, offset=0, increment=False, self_subs={}):
    """
    Generate equations interpolating an arbitrary expression into ``self``.

    Parameters
    ----------
    expr : expr-like
        Input expression to interpolate.
    offset : int, optional
        Additional offset from the boundary.
    increment: bool, optional
        If True, generate increments (Inc) rather than assignments (Eq).
    """
    expr = indexify(expr)

    p, _, _ = self.interpolation_coeffs.indices
    dim_subs = []
    coeffs = []
    for i, d in enumerate(self.grid.dimensions):
        rd = DefaultDimension(name="r%s" % d.name, default_value=self.r)
        dim_subs.append((d, INT(rd + self.gridpoints[p, i])))
        coeffs.append(self.interpolation_coeffs[p, i, rd])
    # Apply optional time symbol substitutions to lhs of assignment
    lhs = self.subs(self_subs)
    rhs = prod(coeffs) * expr.subs(dim_subs)

    return [Eq(lhs, lhs + rhs)]

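# --------------------------------------------------------------------------- #
# Illustrative usage sketch (not part of the original code). It assumes the
# public Devito API for `PrecomputedSparseFunction`, where `gridpoints` holds
# the reference grid indices of each sparse point and `interpolation_coeffs`
# holds `r` coefficients per point and per grid dimension, as consumed by the
# `interpolate`/`inject` methods above (list-of-Eq return style). Keyword
# argument names may differ across Devito versions.
# --------------------------------------------------------------------------- #
import numpy as np
from devito import Grid, TimeFunction, PrecomputedSparseFunction, Operator

grid = Grid(shape=(11, 11))
u = TimeFunction(name='u', grid=grid, space_order=2)

npoint, r = 1, 2  # one sparse point, 2-point interpolation per dimension
gridpoints = np.array([[5, 5]], dtype=np.int32)
coeffs = np.full((npoint, grid.dim, r), 0.5, dtype=np.float32)

sf = PrecomputedSparseFunction(name='sf', grid=grid, r=r, npoint=npoint,
                               gridpoints=gridpoints,
                               interpolation_coeffs=coeffs)

# `interpolate` reads from `u` into `sf`; `inject` writes a weighted
# contribution of `sf` back into `u.forward`
op = Operator(sf.interpolate(u) + sf.inject(u.forward, expr=sf))
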
def _make_guard(self, parregion, *args):
    partrees = FindNodes(ParallelTree).visit(parregion)
    if not any(isinstance(i.root, self.DeviceIteration) for i in partrees):
        return super()._make_guard(parregion, *args)

    cond = []

    # There must be at least one iteration or potential crash
    if not parregion.is_Affine:
        trees = retrieve_iteration_tree(parregion.root)
        tree = trees[0][:parregion.ncollapsed]
        cond.extend([i.symbolic_size > 0 for i in tree])

    # SparseFunctions may occasionally degenerate to zero-size arrays. In such
    # a case, a copy-in produces a `nil` pointer on the device. To fire up a
    # parallel loop we must ensure none of the SparseFunction pointers are `nil`
    symbols = FindSymbols().visit(parregion)
    sfs = [i for i in symbols if i.is_SparseFunction]
    if sfs:
        size = [prod(f._C_get_field(FULL, d).size for d in f.dimensions) for f in sfs]
        cond.extend([i > 0 for i in size])

    # Combine all cond elements
    if cond:
        parregion = List(body=[Conditional(And(*cond), parregion)])

    return parregion

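# --------------------------------------------------------------------------- #
# Illustrative sketch (not original code) of the guard condition assembled
# above, using plain SymPy symbols: the parallel region only runs if every
# collapsed Iteration has at least one iteration and no SparseFunction has
# degenerated to a zero-size array. The symbol names are hypothetical.
# --------------------------------------------------------------------------- #
from sympy import And, Symbol, ccode

x_size, y_size, nrec = [Symbol(n, integer=True) for n in ('x_size', 'y_size', 'nrec')]
cond = And(x_size > 0, y_size > 0, nrec > 0)
# Once lowered to C this becomes a conjunction such as
# "nrec > 0 && x_size > 0 && y_size > 0" wrapping the parallel region
print(ccode(cond))
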
def calculate_nblocks(tree, blockable):
    collapsed = tree[:(tree[0].ncollapsed or 1)]
    blocked = [i.dim for i in collapsed if i.dim in blockable]
    remainders = [(d.root.symbolic_max-d.root.symbolic_min+1) % d.step for d in blocked]
    niters = [d.root.symbolic_max - i for d, i in zip(blocked, remainders)]
    nblocks = prod((i - d.root.symbolic_min + 1) / d.step
                   for d, i in zip(blocked, niters))
    return nblocks

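# --------------------------------------------------------------------------- #
# Worked example (illustrative only) of the block-count arithmetic used above,
# with plain integers instead of symbolic dimension bounds. For a dimension
# spanning 0..99 blocked with step 16, four trailing iterations fall into the
# remainder loop and six full blocks are counted; `prod` then multiplies the
# per-dimension counts together.
# --------------------------------------------------------------------------- #
def nblocks_1d(sym_min, sym_max, step):
    remainder = (sym_max - sym_min + 1) % step
    niters = sym_max - remainder
    return (niters - sym_min + 1) // step

assert nblocks_1d(0, 99, 16) == 6                             # 4 iterations left over
assert nblocks_1d(0, 99, 16) * nblocks_1d(0, 99, 16) == 36    # two blocked dimensions
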
def _select_candidates(self, candidates):
    assert candidates

    if self.ncores < self.collapse_ncores:
        return candidates[0], []

    mapper = {}
    for n0, root in enumerate(candidates):
        collapsable = []
        for n, i in enumerate(candidates[n0+1:], n0+1):
            # The Iteration nest [root, ..., i] must be perfect
            if not IsPerfectIteration(depth=i).visit(root):
                break

            # Loops are collapsable only if none of the iteration variables appear
            # in initializer expressions. For example, the following two loops
            # cannot be collapsed
            #
            # for (i = ... )
            #   for (j = i ...)
            #     ...
            #
            # Here, we make sure this won't happen
            if any(j.dim in i.symbolic_min.free_symbols for j in candidates[n0:n]):
                break

            # Also, we do not want to collapse SIMD-vectorized Iterations
            if i.is_Vectorized:
                break

            # Would there be enough work per parallel iteration?
            nested = candidates[n+1:]
            if nested:
                try:
                    work = prod([int(j.dim.symbolic_size) for j in nested])
                    if work < self.collapse_work:
                        break
                except TypeError:
                    pass

            collapsable.append(i)

        # Give a score to this candidate, based on the number of fully-parallel
        # Iterations and their position (i.e. outermost to innermost) in the nest
        score = (
            int(root.is_ParallelNoAtomic),
            int(len([i for i in collapsable if i.is_ParallelNoAtomic]) >= 1),
            int(len([i for i in collapsable if i.is_ParallelRelaxed]) >= 1),
            -(n0 + 1)  # The outermost, the better
        )

        mapper[(root, tuple(collapsable))] = score

    # Retrieve the candidates with highest score
    root, collapsable = max(mapper, key=mapper.get)

    return root, list(collapsable)

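# --------------------------------------------------------------------------- #
# Illustrative sketch (not original code) of how the lexicographic tuple
# comparison in `_select_candidates` ranks candidates: a fully-parallel
# (no-atomic) root wins over a relaxed one, and among otherwise equal nests
# the outermost (smaller n0, hence larger -(n0 + 1)) is preferred.
# --------------------------------------------------------------------------- #
scores = {
    'outer_no_atomic': (1, 1, 1, -1),   # root is ParallelNoAtomic, outermost
    'inner_no_atomic': (1, 1, 1, -2),   # same properties, but one level deeper
    'outer_relaxed':   (0, 0, 1, -1),   # root only ParallelRelaxed
}
assert max(scores, key=scores.get) == 'outer_no_atomic'
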
def _make_partree(self, candidates, omp_pragma=None):
    """Parallelize `root` attaching a suitable OpenMP pragma."""
    assert candidates
    root = candidates[0]

    # Pick up an omp-pragma template
    # Caller-provided -> stick to it
    # Affine -> ... schedule(static,1) ...
    # Non-affine -> ... schedule(static) ...
    if omp_pragma is None:
        if all(i.is_Affine for i in candidates):
            omp_pragma = self.lang['for-static-1']
        else:
            omp_pragma = self.lang['for-static']

    # Get the collapsable Iterations
    collapsable = []
    if ncores() >= Ompizer.COLLAPSE_NCORES and IsPerfectIteration().visit(root):
        for n, i in enumerate(candidates[1:], 1):
            # The OpenMP specification forbids collapsed loops to use iteration
            # variables in initializer expressions. E.g., the following is forbidden:
            #
            # #pragma omp ... collapse(2)
            # for (i = ... )
            #   for (j = i ...)
            #     ...
            #
            # Here, we make sure this won't happen
            if any(j.dim in i.symbolic_min.free_symbols for j in candidates[:n]):
                break

            # Also, we do not want to collapse vectorizable Iterations
            if i.is_Vectorizable:
                break

            # Would there be enough work per parallel iteration?
            try:
                work = prod([int(j.dim.symbolic_size) for j in candidates[n+1:]])
                if work < Ompizer.COLLAPSE_WORK:
                    break
            except TypeError:
                pass

            collapsable.append(i)

    # Attach an OpenMP pragma-for with a collapse clause
    ncollapse = 1 + len(collapsable)
    partree = root._rebuild(pragmas=root.pragmas + (omp_pragma(ncollapse),),
                            properties=root.properties + (COLLAPSED(ncollapse),))

    collapsed = [partree] + collapsable

    return root, partree, collapsed

def calculate_nblocks(tree, blockable):
    block_indices = [n for n, i in enumerate(tree) if i.dim in blockable]
    index = block_indices[0]
    collapsed = tree[index:index + (tree[index].ncollapsed or index+1)]
    blocked = [i.dim for i in collapsed if i.dim in blockable]
    remainders = [(d.root.symbolic_max-d.root.symbolic_min+1) % d.step for d in blocked]
    niters = [d.root.symbolic_max - i for d, i in zip(blocked, remainders)]
    nblocks = prod((i - d.root.symbolic_min + 1) / d.step
                   for d, i in zip(blocked, niters))
    return nblocks

def _dist_alltoall(self):
    """
    The metadata necessary to perform an ``MPI_Alltoallv`` distributing the
    sparse data values across the MPI ranks needing them.
    """
    ssparse, rsparse = self._dist_count

    # Per-rank shape of send/recv data
    sshape = []
    rshape = []
    for s, r in zip(ssparse, rsparse):
        handle = list(self.shape)
        handle[self._sparse_position] = s
        sshape.append(tuple(handle))

        handle = list(self.shape)
        handle[self._sparse_position] = r
        rshape.append(tuple(handle))

    # Per-rank count of send/recv data
    scount = tuple(prod(i) for i in sshape)
    rcount = tuple(prod(i) for i in rshape)

    # Per-rank displacement of send/recv data (it's actually all contiguous,
    # but the Alltoallv needs this information anyway)
    sdisp = np.concatenate([[0], np.cumsum(scount)[:-1]])
    rdisp = np.concatenate([[0], tuple(np.cumsum(rcount))[:-1]])

    # Total shape of send/recv data
    sshape = list(self.shape)
    sshape[self._sparse_position] = sum(ssparse)
    rshape = list(self.shape)
    rshape[self._sparse_position] = sum(rsparse)

    # May have to swap axes, as `MPI_Alltoallv` expects contiguous data, and
    # the sparse dimension may not be the outermost
    sshape = tuple(sshape[i] for i in self._dist_reorder_mask)
    rshape = tuple(rshape[i] for i in self._dist_reorder_mask)

    return sshape, scount, sdisp, rshape, rcount, rdisp

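# --------------------------------------------------------------------------- #
# Illustrative sketch (not original code) of how the metadata returned above
# could drive an `MPI_Alltoallv` through mpi4py. The buffers, function name and
# datatype handling are hypothetical; this only shows the roles played by the
# shape/count/displacement tuples.
# --------------------------------------------------------------------------- #
import numpy as np
from mpi4py import MPI

def exchange(comm, senddata, sshape, scount, sdisp, rshape, rcount, rdisp):
    # `senddata` is assumed to be already packed contiguously, rank by rank
    sendbuf = np.ascontiguousarray(senddata).reshape(-1)
    recvbuf = np.empty(int(np.prod(rshape)), dtype=sendbuf.dtype)
    mpitype = MPI.FLOAT if sendbuf.dtype == np.float32 else MPI.DOUBLE
    comm.Alltoallv([sendbuf, list(scount), list(sdisp), mpitype],
                   [recvbuf, list(rcount), list(rdisp), mpitype])
    return recvbuf.reshape(rshape)
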
def make_calculate_parblocks(trees, blockable, nthreads):
    trees = [i for i in trees if any(d in i.dimensions for d in blockable)]
    nblocks_per_threads = []
    for tree, nt in product(trees, nthreads):
        collapsed = tree[:tree[0].ncollapsed]
        blocked = [i.dim for i in collapsed if i.dim in blockable]
        remainders = [(d.root.symbolic_max - d.root.symbolic_min + 1) % d.step
                      for d in blocked]
        niters = [d.root.symbolic_max - i for d, i in zip(blocked, remainders)]
        nblocks = prod((i - d.root.symbolic_min + 1) / d.step
                       for d, i in zip(blocked, niters))
        nblocks_per_threads.append(nblocks / nt)
    return nblocks_per_threads

def _make_partree(self, candidates, nthreads=None):
    """Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas."""
    assert candidates
    root = candidates[0]

    # Get the collapsable Iterations
    collapsable = self._find_collapsable(root, candidates)
    ncollapse = 1 + len(collapsable)

    # Prepare to build a ParallelTree
    if all(i.is_Affine for i in candidates):
        bundles = FindNodes(ExpressionBundle).visit(root)
        sops = sum(i.ops for i in bundles)
        if sops >= self.dynamic_work:
            schedule = 'dynamic'
        else:
            schedule = 'static'
        if nthreads is None:
            # pragma omp for ... schedule(..., 1)
            nthreads = self.nthreads
            body = OpenMPIteration(schedule=schedule, ncollapse=ncollapse,
                                   **root.args)
        else:
            # pragma omp parallel for ... schedule(..., 1)
            body = OpenMPIteration(schedule=schedule, parallel=True,
                                   ncollapse=ncollapse, nthreads=nthreads,
                                   **root.args)
        prefix = []
    else:
        # pragma omp for ... schedule(..., expr)
        assert nthreads is None
        nthreads = self.nthreads_nonaffine
        chunk_size = Symbol(name='chunk_size')
        body = OpenMPIteration(ncollapse=ncollapse, chunk_size=chunk_size,
                               **root.args)

        niters = prod([root.symbolic_size] + [j.symbolic_size for j in collapsable])
        value = INT(Max(niters / (nthreads * self.chunk_nonaffine), 1))
        prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]

    # Create a ParallelTree
    partree = ParallelTree(prefix, body, nthreads=nthreads)

    collapsed = [partree] + collapsable

    return root, partree, collapsed

def _alloc_array_on_high_bw_mem(self, site, obj, storage):
    """
    Allocate an Array in the high bandwidth memory.
    """
    size_trunkated = "".join("[%s]" % i for i in obj.symbolic_shape[1:])
    decl = c.Value(obj._C_typedata, "(*%s)%s" % (obj.name, size_trunkated))
    cast = "(%s (*)%s)" % (obj._C_typedata, size_trunkated)
    size_full = prod(obj.symbolic_shape)
    alloc = "%s acc_malloc(sizeof(%s[%s]))" % (cast, obj._C_typedata, size_full)
    init = c.Initializer(decl, alloc)

    free = c.Statement('acc_free(%s)' % obj.name)

    storage.update(obj, site, allocs=init, frees=free)

def _dist_subfunc_alltoall(self):
    ssparse, rsparse = self._dist_count

    # Per-rank shape of send/recv `coordinates`
    sshape = [(i, self.grid.dim) for i in ssparse]
    rshape = [(i, self.grid.dim) for i in rsparse]

    # Per-rank count of send/recv `coordinates`
    scount = [prod(i) for i in sshape]
    rcount = [prod(i) for i in rshape]

    # Per-rank displacement of send/recv `coordinates` (it's actually all
    # contiguous, but the Alltoallv needs this information anyway)
    sdisp = np.concatenate([[0], np.cumsum(scount)[:-1]])
    rdisp = np.concatenate([[0], tuple(np.cumsum(rcount))[:-1]])

    # Total shape of send/recv `coordinates`
    sshape = list(self.coordinates.shape)
    sshape[0] = sum(ssparse)
    rshape = list(self.coordinates.shape)
    rshape[0] = sum(rsparse)

    return sshape, scount, sdisp, rshape, rcount, rdisp

def _alloc_array_on_high_bw_mem(self, site, obj, storage, *args):
    """
    Allocate an Array in the high bandwidth memory.
    """
    decl = Definition(obj)

    memptr = VOID(Byref(obj._C_symbol), '**')
    alignment = obj._data_alignment
    size = SizeOf(obj._C_typedata) * prod(obj.symbolic_shape)
    alloc = self.lang['host-alloc'](memptr, alignment, size)

    free = self.lang['host-free'](obj._C_symbol)

    storage.update(obj, site, allocs=(decl, alloc), frees=free)

def callback():
    _expr = indexify(expr)
    _field = indexify(field)

    p, _ = self.obj.gridpoints.indices
    dim_subs = []
    coeffs = []
    for i, d in enumerate(self.obj.grid.dimensions):
        rd = DefaultDimension(name="r%s" % d.name, default_value=self.r)
        dim_subs.append((d, INT(rd + self.obj.gridpoints[p, i])))
        coeffs.append(self.obj.interpolation_coeffs[p, i, rd])
    rhs = prod(coeffs) * _expr
    _field = _field.subs(dim_subs)
    return [Eq(_field, _field + rhs.subs(dim_subs))]

def callback():
    _expr = indexify(expr)

    p, _, _ = self.obj.interpolation_coeffs.indices
    dim_subs = []
    coeffs = []
    for i, d in enumerate(self.obj.grid.dimensions):
        rd = DefaultDimension(name="r%s" % d.name, default_value=self.r)
        dim_subs.append((d, INT(rd + self.obj.gridpoints[p, i])))
        coeffs.append(self.obj.interpolation_coeffs[p, i, rd])
    # Apply optional time symbol substitutions to lhs of assignment
    lhs = self.obj.subs(self_subs)
    rhs = prod(coeffs) * _expr.subs(dim_subs)

    return [Eq(lhs, lhs + rhs)]

def test_guard_overflow(self):
    """
    A toy test showing how to create a Guard to prevent writes to buffers
    with not enough space to fit another snapshot.
    """
    grid = Grid(shape=(4, 4))

    f = Function(name='f', grid=grid)
    freespace = Scalar(name='freespace')

    size = prod(f.symbolic_shape)
    guard = GuardOverflow(freespace, size)

    assert ccode(guard) == 'freespace >= (x_size + 2)*(y_size + 2)'

def _alloc_array_on_high_bw_mem(self, site, obj, storage):
    if obj._mem_mapped:
        super()._alloc_array_on_high_bw_mem(site, obj, storage)
    else:
        # E.g., use `acc_malloc` or `omp_target_alloc` -- the Array only resides
        # on the device as it never needs to be accessed on the host
        assert obj._mem_local

        deviceid = DefFunction(self.lang['device-get'].name)
        doalloc = self.lang['device-alloc']
        dofree = self.lang['device-free']

        size = SizeOf(obj._C_typedata) * prod(obj.symbolic_shape)
        init = doalloc(size, deviceid, retobj=obj)

        free = dofree(obj._C_name, deviceid)

        storage.update(obj, site, allocs=init, frees=free)

def _alloc_pointed_array_on_high_bw_mem(self, site, obj, storage):
    """
    Allocate the following objects in the high bandwidth memory:

        * The pointer array `obj`;
        * The pointee Array `obj.array`

    If the pointer array is defined over `sregistry.threadid`, that is a thread
    Dimension, then each `obj.array` slice is allocated and freed individually
    by the owner thread.
    """
    # The pointer array
    decl = Definition(obj)

    memptr = VOID(Byref(obj._C_symbol), '**')
    alignment = obj._data_alignment
    size = SizeOf(Keyword('%s*' % obj._C_typedata)) * obj.dim.symbolic_size
    alloc0 = self.lang['host-alloc'](memptr, alignment, size)

    free0 = self.lang['host-free'](obj._C_symbol)

    # The pointee Array
    pobj = IndexedPointer(obj._C_symbol, obj.dim)
    memptr = VOID(Byref(pobj), '**')
    size = SizeOf(obj._C_typedata) * prod(obj.array.symbolic_shape)
    alloc1 = self.lang['host-alloc'](memptr, alignment, size)

    free1 = self.lang['host-free'](pobj)

    # Dump
    if obj.dim is self.sregistry.threadid:
        storage.update(obj, site, allocs=(decl, alloc0), frees=free0,
                       pallocs=(obj.dim, alloc1), pfrees=(obj.dim, free1))
    else:
        storage.update(obj, site, allocs=(decl, alloc0, alloc1),
                       frees=(free0, free1))

def _make_partree(self, candidates, nthreads=None):
    """Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas."""
    assert candidates
    root = candidates[0]

    # Get the collapsable Iterations
    collapsable = self._find_collapsable(root, candidates)
    ncollapse = 1 + len(collapsable)

    # Prepare to build a ParallelTree
    prefix = []
    if all(i.is_Affine for i in candidates):
        if nthreads is None:
            # pragma omp for ... schedule(..., 1)
            nthreads = self.nthreads
            omp_pragma = self.lang['for'](ncollapse, 1)
        else:
            # pragma omp parallel for ... schedule(..., 1)
            omp_pragma = self.lang['par-for'](ncollapse, 1, nthreads)
    else:
        # pragma omp for ... schedule(..., expr)
        assert nthreads is None
        nthreads = self.nthreads_nonaffine

        chunk_size = Symbol(name='chunk_size')
        omp_pragma = self.lang['for'](ncollapse, chunk_size)

        niters = prod([root.symbolic_size] + [j.symbolic_size for j in collapsable])
        value = INT(Max(niters / (nthreads * self.CHUNKSIZE_NONAFFINE), 1))
        prefix.append(Expression(DummyEq(chunk_size, value, dtype=np.int32)))

    # Create a ParallelTree
    body = root._rebuild(pragmas=root.pragmas + (omp_pragma,),
                         properties=root.properties + (COLLAPSED(ncollapse),))
    partree = ParallelTree(prefix, body, nthreads=nthreads)

    collapsed = [partree] + collapsable

    return root, partree, collapsed

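# --------------------------------------------------------------------------- #
# Worked example (illustrative only) of the non-affine chunk-size heuristic
# used above, with concrete numbers in place of symbolic iteration sizes. The
# values chosen for `nthreads` and the chunk divisor are hypothetical.
# --------------------------------------------------------------------------- #
def chunk_size(iter_sizes, nthreads, chunk_nonaffine):
    niters = 1
    for s in iter_sizes:
        niters *= s
    # One chunk per (thread * chunk_nonaffine), but never smaller than 1
    return max(niters // (nthreads * chunk_nonaffine), 1)

assert chunk_size([200, 200], nthreads=16, chunk_nonaffine=3) == 833
assert chunk_size([4], nthreads=16, chunk_nonaffine=3) == 1
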
def _alloc_array_on_high_bw_mem(self, site, obj, storage):
    if obj._mem_mapped:
        super()._alloc_array_on_high_bw_mem(site, obj, storage)
    else:
        # E.g., use `acc_malloc` or `omp_target_alloc` -- the Array only resides
        # on the device as it never needs to be accessed on the host
        assert obj._mem_default

        decl = c.Value(obj._C_typedata, "*%s" % obj._C_name)

        size = "sizeof(%s[%s])" % (obj._C_typedata, prod(obj.symbolic_shape))
        deviceid = self.lang['device-get']
        doalloc = self.lang['device-alloc']
        dofree = self.lang['device-free']
        alloc = "(%s*) %s" % (obj._C_typedata, doalloc(size, deviceid))
        init = c.Initializer(decl, alloc)

        free = c.Statement(dofree(obj._C_name, deviceid))

        storage.update(obj, site, allocs=init, frees=free)

def _alloc_array_on_high_bw_mem(self, site, obj, storage):
    if obj._mem_mapped:
        # posix_memalign + copy-to-device
        super()._alloc_array_on_high_bw_mem(site, obj, storage)
    else:
        # acc_malloc -- the Array only resides on the device, ie, it never
        # needs to be accessed on the host
        assert obj._mem_default
        size_trunkated = "".join("[%s]" % i for i in obj.symbolic_shape[1:])
        decl = c.Value(obj._C_typedata, "(*%s)%s" % (obj.name, size_trunkated))
        cast = "(%s (*)%s)" % (obj._C_typedata, size_trunkated)
        size_full = "sizeof(%s[%s])" % (obj._C_typedata, prod(obj.symbolic_shape))
        alloc = "%s %s" % (cast, self.lang['device-alloc'](size_full))
        init = c.Initializer(decl, alloc)

        free = c.Statement(self.lang['device-free'](obj.name))

        storage.update(obj, site, allocs=init, frees=free)

def _alloc_array_on_high_bw_mem(self, site, obj, storage):
    """
    Allocate an Array in the high bandwidth memory.
    """
    if obj._mem_mapped:
        # posix_memalign + copy-to-device
        super()._alloc_array_on_high_bw_mem(site, obj, storage)
    else:
        # acc_malloc -- the Array only resides on the device, ie, it never
        # needs to be accessed on the host
        assert obj._mem_default
        size_trunkated = "".join("[%s]" % i for i in obj.symbolic_shape[1:])
        decl = c.Value(obj._C_typedata, "(*%s)%s" % (obj.name, size_trunkated))
        cast = "(%s (*)%s)" % (obj._C_typedata, size_trunkated)
        size_full = prod(obj.symbolic_shape)
        alloc = "%s acc_malloc(sizeof(%s[%s]))" % (cast, obj._C_typedata, size_full)
        init = c.Initializer(decl, alloc)

        free = c.Statement('acc_free(%s)' % obj.name)

        storage.update(obj, site, allocs=init, frees=free)

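# --------------------------------------------------------------------------- #
# Illustrative sketch (not original code) of the C snippets the cgen-based
# variants above emit for a hypothetical `float` Array of symbolic shape
# (3, x_size, y_size): a pointer-to-sub-array declaration plus a casted
# `acc_malloc`. The "*".join below stands in for the symbolic `prod(...)`,
# which ccode would render the same way.
# --------------------------------------------------------------------------- #
typedata, name, symbolic_shape = "float", "a", ("3", "x_size", "y_size")

size_trunkated = "".join("[%s]" % i for i in symbolic_shape[1:])
decl = "%s (*%s)%s" % (typedata, name, size_trunkated)
cast = "(%s (*)%s)" % (typedata, size_trunkated)
size_full = "*".join(symbolic_shape)
alloc = "%s acc_malloc(sizeof(%s[%s]))" % (cast, typedata, size_full)

assert decl == "float (*a)[x_size][y_size]"
assert alloc == "(float (*)[x_size][y_size]) acc_malloc(sizeof(float[3*x_size*y_size]))"
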
def _find_collapsable(self, root, candidates):
    collapsable = []
    if ncores() >= self.collapse_ncores:
        for n, i in enumerate(candidates[1:], 1):
            # The Iteration nest [root, ..., i] must be perfect
            if not IsPerfectIteration(depth=i).visit(root):
                break

            # The OpenMP specification forbids collapsed loops to use iteration
            # variables in initializer expressions. E.g., the following is forbidden:
            #
            # #pragma omp ... collapse(2)
            # for (i = ... )
            #   for (j = i ...)
            #     ...
            #
            # Here, we make sure this won't happen
            if any(j.dim in i.symbolic_min.free_symbols for j in candidates[:n]):
                break

            # Also, we do not want to collapse vectorizable Iterations
            if i.is_Vectorized:
                break

            # Would there be enough work per parallel iteration?
            nested = candidates[n+1:]
            if nested:
                try:
                    work = prod([int(j.dim.symbolic_size) for j in nested])
                    if work < self.collapse_work:
                        break
                except TypeError:
                    pass

            collapsable.append(i)
    return collapsable

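# --------------------------------------------------------------------------- #
# Illustrative sketch (not original code) of the work heuristic applied above:
# an Iteration is only folded into the collapse clause if the loops nested
# inside it still carry enough iterations to keep every thread busy. The
# threshold used below is hypothetical.
# --------------------------------------------------------------------------- #
def would_collapse(nested_sizes, collapse_work=100):
    if not nested_sizes:
        return True  # innermost candidate: nothing left to check
    work = 1
    for s in nested_sizes:
        work *= s
    return work >= collapse_work

assert would_collapse([128, 128]) is True   # plenty of work left inside
assert would_collapse([8, 4]) is False      # 32 iterations -> not worth collapsing
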
def _find_collapsable(self, root, candidates):
    collapsable = []
    if self.ncores >= self.collapse_ncores:
        for n, i in enumerate(candidates[1:], 1):
            # The Iteration nest [root, ..., i] must be perfect
            if not IsPerfectIteration(depth=i).visit(root):
                break

            # Loops are collapsable only if none of the iteration variables appear
            # in initializer expressions. For example, the following two loops
            # cannot be collapsed
            #
            # for (i = ... )
            #   for (j = i ...)
            #     ...
            #
            # Here, we make sure this won't happen
            if any(j.dim in i.symbolic_min.free_symbols for j in candidates[:n]):
                break

            # Also, we do not want to collapse SIMD-vectorized Iterations
            if i.is_Vectorized:
                break

            # Would there be enough work per parallel iteration?
            nested = candidates[n+1:]
            if nested:
                try:
                    work = prod([int(j.dim.symbolic_size) for j in nested])
                    if work < self.collapse_work:
                        break
                except TypeError:
                    pass

            collapsable.append(i)
    return collapsable

def _make_partree(self, candidates, nthreads=None):
    """Parallelize `root` attaching a suitable OpenMP pragma."""
    assert candidates
    root = candidates[0]

    # Get the collapsable Iterations
    collapsable = []
    if ncores() >= Ompizer.COLLAPSE_NCORES and IsPerfectIteration().visit(root):
        for n, i in enumerate(candidates[1:], 1):
            # The OpenMP specification forbids collapsed loops to use iteration
            # variables in initializer expressions. E.g., the following is forbidden:
            #
            # #pragma omp ... collapse(2)
            # for (i = ... )
            #   for (j = i ...)
            #     ...
            #
            # Here, we make sure this won't happen
            if any(j.dim in i.symbolic_min.free_symbols for j in candidates[:n]):
                break

            # Also, we do not want to collapse vectorizable Iterations
            if i.is_Vectorizable:
                break

            # Would there be enough work per parallel iteration?
            try:
                work = prod([int(j.dim.symbolic_size) for j in candidates[n+1:]])
                if work < Ompizer.COLLAPSE_WORK:
                    break
            except TypeError:
                pass

            collapsable.append(i)
    ncollapse = 1 + len(collapsable)

    # Prepare to build a ParallelTree
    prefix = []
    if all(i.is_Affine for i in candidates):
        if nthreads is None:
            # pragma omp for ... schedule(..., 1)
            nthreads = self.nthreads
            omp_pragma = self.lang['for'](ncollapse, 1)
        else:
            # pragma omp parallel for ... schedule(..., 1)
            omp_pragma = self.lang['par-for'](ncollapse, 1, nthreads)
    else:
        # pragma omp for ... schedule(..., expr)
        assert nthreads is None
        nthreads = self.nthreads_nonaffine

        chunk_size = Symbol(name='chunk_size')
        omp_pragma = self.lang['for'](ncollapse, chunk_size)

        niters = prod([root.symbolic_size] + [j.symbolic_size for j in collapsable])
        value = INT(Max(niters / (nthreads * self.CHUNKSIZE_NONAFFINE), 1))
        prefix.append(Expression(DummyEq(chunk_size, value, dtype=np.int32)))

    # Create a ParallelTree
    body = root._rebuild(pragmas=root.pragmas + (omp_pragma,),
                         properties=root.properties + (COLLAPSED(ncollapse),))
    partree = ParallelTree(prefix, body, nthreads=nthreads)

    collapsed = [partree] + collapsable

    return root, partree, collapsed

def linearize_accesses(iet, key, cache, sregistry):
    """
    Turn Indexeds into FIndexeds and create the necessary access Macros.
    """
    # `functions` are all Functions that `iet` may need to linearize
    functions = [f for f in FindSymbols().visit(iet) if key(f) and f.ndim > 1]
    functions = sorted(functions, key=lambda f: len(f.dimensions), reverse=True)

    # `functions_unseen` are all Functions that `iet` may need to linearize
    # and have not been seen while processing other IETs
    functions_unseen = [f for f in functions if f not in cache]

    # Find unique sizes (unique -> minimize necessary registers)
    mapper = DefaultOrderedDict(list)
    for f in functions:
        # NOTE: the outermost dimension is unnecessary
        for d in f.dimensions[1:]:
            # TODO: same grid + same halo => same padding, however this is
            # never asserted throughout the compiler yet... maybe should do
            # it when in debug mode at `prepare_arguments` time, ie right
            # before jumping to C?
            mapper[(d, f._size_halo[d], getattr(f, 'grid', None))].append(f)

    # For all unseen Functions, build the size exprs. For example:
    # `x_fsz0 = u_vec->size[1]`
    imapper = DefaultOrderedDict(dict)
    for (d, halo, _), v in mapper.items():
        v_unseen = [f for f in v if f in functions_unseen]
        if not v_unseen:
            continue
        expr = _generate_fsz(v_unseen[0], d, sregistry)
        if expr:
            for f in v_unseen:
                imapper[f][d] = expr.write
                cache[f].stmts0.append(expr)

    # For all unseen Functions, build the stride exprs. For example:
    # `y_stride0 = y_fsz0*z_fsz0`
    built = {}
    mapper = DefaultOrderedDict(dict)
    for f, v in imapper.items():
        for d in v:
            n = f.dimensions.index(d)
            expr = prod(v[i] for i in f.dimensions[n:])
            try:
                stmt = built[expr]
            except KeyError:
                name = sregistry.make_name(prefix='%s_stride' % d.name)
                s = Symbol(name=name, dtype=np.int64, is_const=True)
                stmt = built[expr] = DummyExpr(s, expr, init=True)
            mapper[f][d] = stmt.write
            cache[f].stmts1.append(stmt)
    mapper.update([(f, {}) for f in functions_unseen if f not in mapper])

    # For all unseen Functions, build defines. For example:
    # `#define uL(t, x, y, z) u[(t)*t_stride0 + (x)*x_stride0 + (y)*y_stride0 + (z)]`
    headers = []
    findexeds = {}
    for f in functions:
        if cache[f].cbk is None:
            header, cbk = _generate_macro(f, mapper[f], sregistry)
            headers.append(header)
            cache[f].cbk = findexeds[f] = cbk
        else:
            findexeds[f] = cache[f].cbk

    # Build "functional" Indexeds. For example:
    # `u[t2, x+8, y+9, z+7] => uL(t2, x+8, y+9, z+7)`
    mapper = {}
    indexeds = FindSymbols('indexeds').visit(iet)
    for i in indexeds:
        try:
            mapper[i] = findexeds[i.function](i)
        except KeyError:
            pass

    # Introduce the linearized expressions
    iet = Uxreplace(mapper).visit(iet)

    # All Functions that actually require linearization in `iet`
    candidates = []

    candidates.extend(filter_ordered(i.function for i in indexeds))

    calls = FindNodes(Call).visit(iet)
    cfuncs = filter_ordered(flatten(i.functions for i in calls))
    candidates.extend(i for i in cfuncs if i.function.is_DiscreteFunction)

    # All Functions that can be linearized in `iet`
    defines = FindSymbols('defines-aliases').visit(iet)

    # Place the linearization expressions or delegate to ancestor efunc
    stmts0 = []
    stmts1 = []
    args = []
    for f in candidates:
        if f in defines:
            stmts0.extend(cache[f].stmts0)
            stmts1.extend(cache[f].stmts1)
        else:
            args.extend([e.write for e in cache[f].stmts1])
    if stmts0:
        assert len(stmts1) > 0
        stmts0 = filter_ordered(stmts0) + [BlankLine]
        stmts1 = filter_ordered(stmts1) + [BlankLine]
        body = iet.body._rebuild(body=tuple(stmts0) + tuple(stmts1) + iet.body.body)
        iet = iet._rebuild(body=body)
    else:
        assert len(stmts0) == 0

    return iet, headers, args

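# --------------------------------------------------------------------------- #
# Worked example (illustrative only) of the linearization the generated macro
# performs: a multi-dimensional access such as `u[t, x, y, z]` becomes a single
# flat offset built from per-dimension strides, each stride being the product
# of the sizes of the trailing dimensions -- exactly what the
# `y_stride0 = y_fsz0*z_fsz0`-style expressions precompute.
# --------------------------------------------------------------------------- #
import numpy as np

def strides_of(sizes):
    # stride of dimension i = product of the sizes of dimensions i+1..end;
    # the outermost size never enters a stride, matching the
    # "outermost dimension is unnecessary" note above
    strides = [1] * len(sizes)
    for i in range(len(sizes) - 2, -1, -1):
        strides[i] = strides[i + 1] * sizes[i + 1]
    return strides

def flat_index(indices, sizes):
    return sum(i * s for i, s in zip(indices, strides_of(sizes)))

a = np.arange(2 * 3 * 4 * 5).reshape(2, 3, 4, 5)
assert a[1, 2, 3, 4] == a.reshape(-1)[flat_index((1, 2, 3, 4), [2, 3, 4, 5])]
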
def linearize_accesses(iet, cache, sregistry):
    """
    Turn Indexeds into FIndexeds and create the necessary access Macros.
    """
    # Find all objects amenable to linearization
    symbol_names = {i.name for i in FindSymbols('indexeds').visit(iet)}
    functions = [f for f in FindSymbols().visit(iet)
                 if ((f.is_DiscreteFunction or f.is_Array) and
                     f.ndim > 1 and
                     f.name in symbol_names)]
    functions = sorted(functions, key=lambda f: len(f.dimensions), reverse=True)

    # Find unique sizes (unique -> minimize necessary registers)
    mapper = DefaultOrderedDict(list)
    for f in functions:
        if f not in cache:
            # NOTE: the outermost dimension is unnecessary
            for d in f.dimensions[1:]:
                # TODO: same grid + same halo => same padding, however this is
                # never asserted throughout the compiler yet... maybe should do
                # it when in debug mode at `prepare_arguments` time, ie right
                # before jumping to C?
                mapper[(d, f._size_halo[d], getattr(f, 'grid', None))].append(f)

    # Build all exprs such as `x_fsz0 = u_vec->size[1]`
    imapper = DefaultOrderedDict(list)
    for (d, halo, _), v in mapper.items():
        name = sregistry.make_name(prefix='%s_fsz' % d.name)
        s = Symbol(name=name, dtype=np.int32, is_const=True)
        try:
            expr = DummyExpr(s, v[0]._C_get_field(FULL, d).size, init=True)
        except AttributeError:
            assert v[0].is_Array
            expr = DummyExpr(s, v[0].symbolic_shape[d], init=True)
        for f in v:
            imapper[f].append((d, s))
            cache[f].stmts0.append(expr)

    # Build all exprs such as `y_slc0 = y_fsz0*z_fsz0`
    built = {}
    mapper = DefaultOrderedDict(list)
    for f, v in imapper.items():
        for n, (d, _) in enumerate(v):
            expr = prod(list(zip(*v[n:]))[1])
            try:
                stmt = built[expr]
            except KeyError:
                name = sregistry.make_name(prefix='%s_slc' % d.name)
                s = Symbol(name=name, dtype=np.int32, is_const=True)
                stmt = built[expr] = DummyExpr(s, expr, init=True)
            mapper[f].append(stmt.write)
            cache[f].stmts1.append(stmt)
    mapper.update([(f, []) for f in functions if f not in mapper])

    # Build defines. For example:
    # `define uL(t, x, y, z) u[(t)*t_slice_sz + (x)*x_slice_sz + (y)*y_slice_sz + (z)]`
    headers = []
    findexeds = {}
    for f, szs in mapper.items():
        if cache[f].cbk is not None:
            # Perhaps we've already built an access macro for `f` through another efunc
            findexeds[f] = cache[f].cbk
        else:
            assert len(szs) == len(f.dimensions) - 1
            pname = sregistry.make_name(prefix='%sL' % f.name)

            expr = sum([MacroArgument(d.name)*s for d, s in zip(f.dimensions, szs)])
            expr += MacroArgument(f.dimensions[-1].name)
            expr = Indexed(IndexedData(f.name, None, f), expr)
            define = DefFunction(pname, f.dimensions)
            headers.append((ccode(define), ccode(expr)))

            cache[f].cbk = findexeds[f] = lambda i, pname=pname: FIndexed(i, pname)

    # Build "functional" Indexeds. For example:
    # `u[t2, x+8, y+9, z+7] => uL(t2, x+8, y+9, z+7)`
    mapper = {}
    for n in FindNodes(Expression).visit(iet):
        subs = {}
        for i in retrieve_indexed(n.expr):
            try:
                subs[i] = findexeds[i.function](i)
            except KeyError:
                pass
        mapper[n] = n._rebuild(expr=uxreplace(n.expr, subs))

    # Put together all of the necessary exprs for `y_fsz0`, ..., `y_slc0`, ...
    stmts0 = filter_ordered(flatten(cache[f].stmts0 for f in functions))
    if stmts0:
        stmts0.append(BlankLine)
    stmts1 = filter_ordered(flatten(cache[f].stmts1 for f in functions))
    if stmts1:
        stmts1.append(BlankLine)

    iet = Transformer(mapper).visit(iet)

    body = iet.body._rebuild(body=tuple(stmts0) + tuple(stmts1) + iet.body.body)
    iet = iet._rebuild(body=body)

    return iet, headers