class OmpBB(PragmaLangBB):

    mapper = {
        # Misc
        'name': 'OpenMP',
        'header': 'omp.h',
        # Platform mapping
        AMDGPUX: None,
        NVIDIAX: None,
        # Runtime library
        'init': None,
        'thread-num': DefFunction('omp_get_thread_num'),
        'num-devices': lambda args:
            DefFunction('omp_get_num_devices', args),
        'set-device': lambda args:
            Call('omp_set_default_device', args),
        # Pragmas
        'simd-for': c.Pragma('omp simd'),
        'simd-for-aligned': lambda i, j:
            c.Pragma('omp simd aligned(%s:%d)' % (i, j)),
        'atomic': c.Pragma('omp atomic update'),
        'map-enter-to': lambda i, j:
            c.Pragma('omp target enter data map(to: %s%s)' % (i, j)),
        'map-enter-alloc': lambda i, j:
            c.Pragma('omp target enter data map(alloc: %s%s)' % (i, j)),
        'map-update': lambda i, j:
            c.Pragma('omp target update from(%s%s)' % (i, j)),
        'map-update-host': lambda i, j:
            c.Pragma('omp target update from(%s%s)' % (i, j)),
        'map-update-device': lambda i, j:
            c.Pragma('omp target update to(%s%s)' % (i, j)),
        'map-release': lambda i, j, k:
            c.Pragma('omp target exit data map(release: %s%s)%s' % (i, j, k)),
        'map-exit-delete': lambda i, j, k:
            c.Pragma('omp target exit data map(delete: %s%s)%s' % (i, j, k)),
    }
    mapper.update(CBB.mapper)

    Region = OmpRegion
    HostIteration = OmpIteration
    DeviceIteration = DeviceOmpIteration
    Prodder = ThreadedProdder

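# A minimal sketch of what the pragma factories in the mapper produce (cgen
# assumed installed; the name `u` and the section string are hypothetical).
# The lambda below duplicates the 'map-enter-to' entry for illustration:
import cgen as c

map_enter_to = lambda i, j: c.Pragma('omp target enter data map(to: %s%s)' % (i, j))
print(map_enter_to('u', '[0:nx+1]'))
# -> #pragma omp target enter data map(to: u[0:nx+1])
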
def __init__(self, prodder):
    # Atomic-ize any single-thread Prodders in the parallel tree
    condition = CondEq(DefFunction('omp_get_thread_num'), 0)

    # Prod within a while loop until all communications have completed
    # In other words, the thread delegated to prodding is entrapped for as long
    # as it's required
    prod_until = Not(DefFunction(prodder.name, [i.name for i in prodder.arguments]))
    then_body = List(header=c.Comment('Entrap thread until comms have completed'),
                     body=While(prod_until))

    Conditional.__init__(self, condition, then_body)
    Prodder.__init__(self, prodder.name, prodder.arguments, periodic=prodder.periodic)

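# For reference, a ThreadedProdder built this way roughly lowers to C of the
# following shape (sketch; `prod0` and its arguments are hypothetical):
#
#   if (omp_get_thread_num() == 0)
#   {
#     /* Entrap thread until comms have completed */
#     while (!prod0(a, b)) ;
#   }
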
def _(f, szs, sregistry):
    assert len(szs) == len(f.dimensions) - 1

    pname = sregistry.make_name(prefix='%sL' % f.name)
    cbk = lambda i, pname=pname: FIndexed(i, pname)

    expr = sum([MacroArgument(d0.name)*szs[d1]
                for d0, d1 in zip(f.dimensions, f.dimensions[1:])])
    expr += MacroArgument(f.dimensions[-1].name)
    expr = Indexed(IndexedData(f.name, None, f), expr)
    define = DefFunction(pname, f.dimensions)
    header = (ccode(define), ccode(expr))

    return header, cbk

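# Example (illustrative; the Function `u(t, x, y)` and the `*_slc0` size
# symbols are hypothetical, named after the `_slc` prefix used by callers):
# the returned `header` pair is emitted as a C access macro such as
#
#   #define uL(t, x, y) u[(t)*x_slc0 + (x)*y_slc0 + (y)]
#
# while `cbk` rewrites each Indexed access to `u` into a call-style `uL(...)`.
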
def _(iet):
    # TODO: we need to pick the rank from `comm_shm`, not `comm`,
    # so that we have nranks == ngpus (as long as the user has launched
    # the right number of MPI processes per node given the available
    # number of GPUs per node)
    objcomm = None
    for i in iet.parameters:
        if isinstance(i, MPICommObject):
            objcomm = i
            break

    deviceid = DeviceID()
    device_nvidia = Macro('acc_device_nvidia')
    if objcomm is not None:
        rank = Symbol(name='rank')
        rank_decl = LocalExpression(DummyEq(rank, 0))
        rank_init = Call('MPI_Comm_rank', [objcomm, Byref(rank)])

        ngpus = Symbol(name='ngpus')
        call = DefFunction('acc_get_num_devices', device_nvidia)
        ngpus_init = LocalExpression(DummyEq(ngpus, call))

        asdn_then = Call('acc_set_device_num', [deviceid, device_nvidia])
        asdn_else = Call('acc_set_device_num', [rank % ngpus, device_nvidia])

        body = [Call('acc_init', [device_nvidia]),
                Conditional(
                    CondNe(deviceid, -1),
                    asdn_then,
                    List(body=[rank_decl, rank_init, ngpus_init, asdn_else])
                )]
    else:
        body = [Call('acc_init', [device_nvidia]),
                Conditional(
                    CondNe(deviceid, -1),
                    Call('acc_set_device_num', [deviceid, device_nvidia])
                )]

    init = List(header=c.Comment('Begin of OpenACC+MPI setup'),
                body=body,
                footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()))
    iet = iet._rebuild(body=(init,) + iet.body)

    return iet, {'args': deviceid}

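# The `init` List built above roughly renders as the following C (sketch;
# symbol names follow the code above):
#
#   /* Begin of OpenACC+MPI setup */
#   acc_init(acc_device_nvidia);
#   if (deviceid != -1)
#   {
#     acc_set_device_num(deviceid, acc_device_nvidia);
#   }
#   else
#   {
#     int rank = 0;
#     MPI_Comm_rank(comm, &rank);
#     int ngpus = acc_get_num_devices(acc_device_nvidia);
#     acc_set_device_num(rank % ngpus, acc_device_nvidia);
#   }
#   /* End of OpenACC+MPI setup */
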
def _alloc_array_on_high_bw_mem(self, site, obj, storage):
    if obj._mem_mapped:
        super()._alloc_array_on_high_bw_mem(site, obj, storage)
    else:
        # E.g., use `acc_malloc` or `omp_target_alloc` -- the Array only resides
        # on the device as it never needs to be accessed on the host
        assert obj._mem_local

        deviceid = DefFunction(self.lang['device-get'].name)
        doalloc = self.lang['device-alloc']
        dofree = self.lang['device-free']

        size = SizeOf(obj._C_typedata) * prod(obj.symbolic_shape)
        init = doalloc(size, deviceid, retobj=obj)

        free = dofree(obj._C_name, deviceid)

        storage.update(obj, site, allocs=init, frees=free)

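# With the OpenACC bindings (see AccBB below), this device-only path roughly
# lowers to (sketch; `a` is a hypothetical device-only Array of floats):
#
#   float * a = (float *) acc_malloc(sizeof(float)*a_size);
#   ...
#   acc_free(a);
#
# whereas an OpenMP offloading backend would use `omp_target_alloc` and the
# matching free, parameterized by the device id.
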
def _initialize(iet):
    # TODO: we need to pick the rank from `comm_shm`, not `comm`,
    # so that we have nranks == ngpus (as long as the user has launched
    # the right number of MPI processes per node given the available
    # number of GPUs per node)
    comm = None
    for i in iet.parameters:
        if isinstance(i, MPICommObject):
            comm = i
            break

    device_nvidia = Macro('acc_device_nvidia')
    body = Call('acc_init', [device_nvidia])

    if comm is not None:
        rank = Symbol(name='rank')
        rank_decl = LocalExpression(DummyEq(rank, 0))
        rank_init = Call('MPI_Comm_rank', [comm, Byref(rank)])

        ngpus = Symbol(name='ngpus')
        call = DefFunction('acc_get_num_devices', device_nvidia)
        ngpus_init = LocalExpression(DummyEq(ngpus, call))

        devicenum = Symbol(name='devicenum')
        devicenum_init = LocalExpression(DummyEq(devicenum, rank % ngpus))

        set_device_num = Call('acc_set_device_num', [devicenum, device_nvidia])

        body = [rank_decl, rank_init, ngpus_init, devicenum_init,
                set_device_num, body]

    init = List(header=c.Comment('Begin of OpenACC+MPI setup'),
                body=body,
                footer=(c.Comment('End of OpenACC+MPI setup'), c.Line()))

    iet = iet._rebuild(body=(init,) + iet.body)

    return iet

def test_symbolics():
    a = Symbol('a')

    id = IntDiv(a, 3)
    pkl_id = pickle.dumps(id)
    new_id = pickle.loads(pkl_id)
    assert id == new_id

    ffp = CallFromPointer('foo', a, ['b', 'c'])
    pkl_ffp = pickle.dumps(ffp)
    new_ffp = pickle.loads(pkl_ffp)
    assert ffp == new_ffp

    li = ListInitializer(['a', 'b'])
    pkl_li = pickle.dumps(li)
    new_li = pickle.loads(pkl_li)
    assert li == new_li

    df = DefFunction('f', ['a', 1, 2])
    pkl_df = pickle.dumps(df)
    new_df = pickle.loads(pkl_df)
    assert df == new_df
    assert df.arguments == new_df.arguments

class Ompizer(object):

    lang = {
        'simd-for': c.Pragma('omp simd'),
        'simd-for-aligned': lambda i, j: c.Pragma('omp simd aligned(%s:%d)' % (i, j)),
        'atomic': c.Pragma('omp atomic update'),
        'thread-num': DefFunction('omp_get_thread_num')
    }
    """
    Shortcuts for the OpenMP language.
    """

    _Region = OpenMPRegion
    _Iteration = OpenMPIteration

    def __init__(self, sregistry, options, key=None):
        """
        Parameters
        ----------
        sregistry : SymbolRegistry
            The symbol registry, to quickly access the special symbols that may
            appear in the IET (e.g., `sregistry.threadid`, `sregistry.nthreads`).
        options : dict
            The optimization options. Accepted: ['par-collapse-ncores',
            'par-collapse-work', 'par-chunk-nonaffine', 'par-dynamic-work',
            'par-nested'].
            * 'par-collapse-ncores': use a collapse clause if the number of
              available physical cores is greater than this threshold.
            * 'par-collapse-work': use a collapse clause if the trip count of
              the collapsable Iterations is statically known to exceed this
              threshold.
            * 'par-chunk-nonaffine': coefficient to adjust the chunk size in
              non-affine parallel Iterations.
            * 'par-dynamic-work': use dynamic scheduling if the operation count
              per iteration exceeds this threshold. Otherwise, use static
              scheduling.
            * 'par-nested': use nested parallelism if the number of hyperthreads
              per core is greater than this threshold.
        key : callable, optional
            Return True if an Iteration can be parallelized, False otherwise.
        """
        self.sregistry = sregistry

        self.collapse_ncores = options['par-collapse-ncores']
        self.collapse_work = options['par-collapse-work']
        self.chunk_nonaffine = options['par-chunk-nonaffine']
        self.dynamic_work = options['par-dynamic-work']
        self.nested = options['par-nested']

        if key is not None:
            self.key = key
        else:
            self.key = lambda i: i.is_ParallelRelaxed and not i.is_Vectorized

    @property
    def nthreads(self):
        return self.sregistry.nthreads

    @property
    def nthreads_nested(self):
        return self.sregistry.nthreads_nested

    @property
    def nthreads_nonaffine(self):
        return self.sregistry.nthreads_nonaffine

    @property
    def threadid(self):
        return self.sregistry.threadid

    def _find_collapsable(self, root, candidates):
        collapsable = []
        if ncores() >= self.collapse_ncores:
            for n, i in enumerate(candidates[1:], 1):
                # The Iteration nest [root, ..., i] must be perfect
                if not IsPerfectIteration(depth=i).visit(root):
                    break

                # The OpenMP specification forbids collapsed loops to use
                # iteration variables in initializer expressions. E.g., the
                # following is forbidden:
                #
                # #pragma omp ... collapse(2)
                # for (i = ... )
                #   for (j = i ...)
                #     ...
                #
                # Here, we make sure this won't happen
                if any(j.dim in i.symbolic_min.free_symbols for j in candidates[:n]):
                    break

                # Also, we do not want to collapse vectorizable Iterations
                if i.is_Vectorized:
                    break

                # Would there be enough work per parallel iteration?
                nested = candidates[n+1:]
                if nested:
                    try:
                        work = prod([int(j.dim.symbolic_size) for j in nested])
                        if work < self.collapse_work:
                            break
                    except TypeError:
                        pass

                collapsable.append(i)
        return collapsable

    @classmethod
    def _make_tid(cls, tid):
        return c.Initializer(c.Value(tid._C_typedata, tid.name),
                             cls.lang['thread-num'])

    def _make_reductions(self, partree, collapsed):
        if not any(i.is_ParallelAtomic for i in collapsed):
            return partree

        # Collect expressions inducing reductions
        exprs = FindNodes(Expression).visit(partree)
        exprs = [i for i in exprs if i.is_Increment and not i.is_ForeignExpression]

        reduction = [i.output for i in exprs]
        if (all(i.is_Affine for i in collapsed)
                or all(not i.is_Indexed for i in reduction)):
            # Introduce reduction clause
            mapper = {partree.root: partree.root._rebuild(reduction=reduction)}
        else:
            # Introduce one `omp atomic` pragma for each increment
            mapper = {i: List(header=self.lang['atomic'], body=i) for i in exprs}

        partree = Transformer(mapper).visit(partree)

        return partree

    def _make_threaded_prodders(self, partree):
        mapper = {i: ThreadedProdder(i) for i in FindNodes(Prodder).visit(partree)}
        partree = Transformer(mapper).visit(partree)
        return partree

    def _make_partree(self, candidates, nthreads=None):
        """Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas."""
        assert candidates
        root = candidates[0]

        # Get the collapsable Iterations
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = 1 + len(collapsable)

        # Prepare to build a ParallelTree
        if all(i.is_Affine for i in candidates):
            bundles = FindNodes(ExpressionBundle).visit(root)
            sops = sum(i.ops for i in bundles)
            if sops >= self.dynamic_work:
                schedule = 'dynamic'
            else:
                schedule = 'static'
            if nthreads is None:
                # pragma omp for ... schedule(..., 1)
                nthreads = self.nthreads
                body = OpenMPIteration(schedule=schedule, ncollapse=ncollapse,
                                       **root.args)
            else:
                # pragma omp parallel for ... schedule(..., 1)
                body = OpenMPIteration(schedule=schedule, parallel=True,
                                       ncollapse=ncollapse, nthreads=nthreads,
                                       **root.args)
            prefix = []
        else:
            # pragma omp for ... schedule(..., expr)
            assert nthreads is None
            nthreads = self.nthreads_nonaffine
            chunk_size = Symbol(name='chunk_size')
            body = OpenMPIteration(ncollapse=ncollapse, chunk_size=chunk_size,
                                   **root.args)

            niters = prod([root.symbolic_size] +
                          [j.symbolic_size for j in collapsable])
            value = INT(Max(niters / (nthreads*self.chunk_nonaffine), 1))
            prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]

        # Create a ParallelTree
        partree = ParallelTree(prefix, body, nthreads=nthreads)

        collapsed = [partree] + collapsable

        return root, partree, collapsed

    def _make_parregion(self, partree, parrays):
        arrays = [i for i in FindSymbols().visit(partree) if i.is_Array]

        # Detect thread-private arrays on the heap and "map" them to shared
        # vector-expanded (one entry per thread) Arrays
        heap_private = [i for i in arrays if i._mem_heap and i._mem_local]
        heap_globals = []
        for i in heap_private:
            if i in parrays:
                pi = parrays[i]
            else:
                pi = parrays.setdefault(
                    i, PointerArray(name=self.sregistry.make_name(),
                                    dimensions=(self.threadid,), array=i))
            heap_globals.append(Dereference(i, pi))
        if heap_globals:
            body = List(header=self._make_tid(self.threadid),
                        body=heap_globals + [partree], footer=c.Line())
        else:
            body = partree

        return OpenMPRegion(body, partree.nthreads)

    def _make_guard(self, partree, collapsed):
        # Do not enter the parallel region if the step increment is 0; this
        # would raise a `Floating point exception (core dumped)` in some OpenMP
        # implementations. Note that using an OpenMP `if` clause won't work
        cond = [CondEq(i.step, 0) for i in collapsed if isinstance(i.step, Symbol)]
        cond = Or(*cond)

        if cond != False:  # noqa: `cond` may be a sympy.False which would be == False
            partree = List(body=[Conditional(cond, Return()), partree])

        return partree

    def _make_nested_partree(self, partree):
        # Apply heuristic
        if nhyperthreads() <= self.nested:
            return partree

        # Note: there might be multiple sub-trees amenable to nested parallelism,
        # hence we loop over all of them
        #
        # for (i = ... )  // outer parallelism
        #   for (j0 = ...)  // first source of nested parallelism
        #     ...
        #   for (j1 = ...)  // second source of nested parallelism
        #     ...
        mapper = {}
        for tree in retrieve_iteration_tree(partree):
            outer = tree[:partree.ncollapsed]
            inner = tree[partree.ncollapsed:]

            # Heuristic: nested parallelism is applied only if the top nested
            # parallel Iteration iterates *within* the top outer parallel
            # Iteration (i.e., the outer is a loop over blocks, while the
            # nested is a loop within a block)
            candidates = []
            for i in inner:
                if self.key(i) and \
                        any(is_integer(j.step - i.symbolic_size) for j in outer):
                    candidates.append(i)
                elif candidates:
                    # If there's at least one candidate but `i` doesn't honor the
                    # heuristic above, then we break, as the candidates must be
                    # perfectly nested
                    break
            if not candidates:
                continue

            # Introduce nested parallelism
            subroot, subpartree, _ = self._make_partree(candidates,
                                                        self.nthreads_nested)

            mapper[subroot] = subpartree

        partree = Transformer(mapper).visit(partree)

        return partree

    def _make_parallel(self, iet):
        mapper = {}
        parrays = {}
        for tree in retrieve_iteration_tree(iet):
            # Get the omp-parallelizable Iterations in `tree`
            candidates = filter_iterations(tree, key=self.key)
            if not candidates:
                continue

            # Outer parallelism
            root, partree, collapsed = self._make_partree(candidates)
            if root in mapper:
                continue

            # Nested parallelism
            partree = self._make_nested_partree(partree)

            # Handle reductions
            partree = self._make_reductions(partree, collapsed)

            # Atomicize and optimize single-thread prodders
            partree = self._make_threaded_prodders(partree)

            # Wrap within a parallel region, declaring private and shared variables
            parregion = self._make_parregion(partree, parrays)

            # Protect the parallel region in case of 0-valued step increments
            parregion = self._make_guard(parregion, collapsed)

            mapper[root] = parregion

        iet = Transformer(mapper).visit(iet)

        # The new arguments introduced by this pass
        args = [i for i in FindSymbols().visit(iet) if isinstance(i, NThreadsMixin)]
        for n in FindNodes(Dereference).visit(iet):
            args.extend([(n.array, True), n.parray])

        return iet, {'args': args, 'includes': ['omp.h']}

    @iet_pass
    def make_parallel(self, iet):
        """
        Create a new IET with shared-memory parallelism via OpenMP pragmas.
        """
        return self._make_parallel(iet)

    @iet_pass
    def make_simd(self, iet, **kwargs):
        """
        Create a new IET with SIMD parallelism via OpenMP pragmas.
""" simd_reg_size = kwargs.pop('simd_reg_size') mapper = {} for tree in retrieve_iteration_tree(iet): candidates = [i for i in tree if i.is_Parallel] # As long as there's an outer level of parallelism, the innermost # PARALLEL Iteration gets vectorized if len(candidates) < 2: continue candidate = candidates[-1] # Construct OpenMP SIMD pragma aligned = [ j for j in FindSymbols('symbolics').visit(candidate) if j.is_DiscreteFunction ] if aligned: simd = self.lang['simd-for-aligned'] simd = as_tuple( simd(','.join([j.name for j in aligned]), simd_reg_size)) else: simd = as_tuple(self.lang['simd-for']) pragmas = candidate.pragmas + simd # Add VECTORIZED property properties = list(candidate.properties) + [VECTORIZED] mapper[candidate] = candidate._rebuild(pragmas=pragmas, properties=properties) iet = Transformer(mapper).visit(iet) return iet, {}
def linearize_accesses(iet, cache, sregistry):
    """
    Turn Indexeds into FIndexeds and create the necessary access Macros.
    """
    # Find all objects amenable to linearization
    symbol_names = {i.name for i in FindSymbols('indexeds').visit(iet)}
    functions = [f for f in FindSymbols().visit(iet)
                 if ((f.is_DiscreteFunction or f.is_Array) and
                     f.ndim > 1 and
                     f.name in symbol_names)]
    functions = sorted(functions, key=lambda f: len(f.dimensions), reverse=True)

    # Find unique sizes (unique -> minimize necessary registers)
    mapper = DefaultOrderedDict(list)
    for f in functions:
        if f not in cache:
            # NOTE: the outermost dimension is unnecessary
            for d in f.dimensions[1:]:
                # TODO: same grid + same halo => same padding, however this is
                # never asserted throughout the compiler yet... maybe should do
                # it when in debug mode at `prepare_arguments` time, ie right
                # before jumping to C?
                mapper[(d, f._size_halo[d], getattr(f, 'grid', None))].append(f)

    # Build all exprs such as `x_fsz0 = u_vec->size[1]`
    imapper = DefaultOrderedDict(list)
    for (d, halo, _), v in mapper.items():
        name = sregistry.make_name(prefix='%s_fsz' % d.name)
        s = Symbol(name=name, dtype=np.int32, is_const=True)
        try:
            expr = DummyExpr(s, v[0]._C_get_field(FULL, d).size, init=True)
        except AttributeError:
            assert v[0].is_Array
            expr = DummyExpr(s, v[0].symbolic_shape[d], init=True)
        for f in v:
            imapper[f].append((d, s))
            cache[f].stmts0.append(expr)

    # Build all exprs such as `y_slc0 = y_fsz0*z_fsz0`
    built = {}
    mapper = DefaultOrderedDict(list)
    for f, v in imapper.items():
        for n, (d, _) in enumerate(v):
            expr = prod(list(zip(*v[n:]))[1])
            try:
                stmt = built[expr]
            except KeyError:
                name = sregistry.make_name(prefix='%s_slc' % d.name)
                s = Symbol(name=name, dtype=np.int32, is_const=True)
                stmt = built[expr] = DummyExpr(s, expr, init=True)
            mapper[f].append(stmt.write)
            cache[f].stmts1.append(stmt)
    mapper.update([(f, []) for f in functions if f not in mapper])

    # Build defines. For example:
    # `define uL(t, x, y, z) u[(t)*t_slice_sz + (x)*x_slice_sz + (y)*y_slice_sz + (z)]`
    headers = []
    findexeds = {}
    for f, szs in mapper.items():
        if cache[f].cbk is not None:
            # Perhaps we've already built an access macro for `f` through another efunc
            findexeds[f] = cache[f].cbk
        else:
            assert len(szs) == len(f.dimensions) - 1
            pname = sregistry.make_name(prefix='%sL' % f.name)

            expr = sum([MacroArgument(d.name)*s for d, s in zip(f.dimensions, szs)])
            expr += MacroArgument(f.dimensions[-1].name)
            expr = Indexed(IndexedData(f.name, None, f), expr)
            define = DefFunction(pname, f.dimensions)
            headers.append((ccode(define), ccode(expr)))

            cache[f].cbk = findexeds[f] = lambda i, pname=pname: FIndexed(i, pname)

    # Build "functional" Indexeds. For example:
    # `u[t2, x+8, y+9, z+7] => uL(t2, x+8, y+9, z+7)`
    mapper = {}
    for n in FindNodes(Expression).visit(iet):
        subs = {}
        for i in retrieve_indexed(n.expr):
            try:
                subs[i] = findexeds[i.function](i)
            except KeyError:
                pass
        mapper[n] = n._rebuild(expr=uxreplace(n.expr, subs))

    # Put together all of the necessary exprs for `y_fsz0`, ..., `y_slc0`, ...
    stmts0 = filter_ordered(flatten(cache[f].stmts0 for f in functions))
    if stmts0:
        stmts0.append(BlankLine)
    stmts1 = filter_ordered(flatten(cache[f].stmts1 for f in functions))
    if stmts1:
        stmts1.append(BlankLine)

    iet = Transformer(mapper).visit(iet)

    body = iet.body._rebuild(body=tuple(stmts0) + tuple(stmts1) + iet.body.body)
    iet = iet._rebuild(body=body)

    return iet, headers

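# A standalone sanity check of the linearization arithmetic (pure Python; the
# shape values are arbitrary): for a row-major array of shape (nt, nx, ny, nz),
# the strides used in the macro are the trailing "slice" products, so
# u[t, x, y, z] maps to u_flat[t*(nx*ny*nz) + x*(ny*nz) + y*nz + z].
nt, nx, ny, nz = 2, 3, 4, 5
flat = lambda t, x, y, z: t*(nx*ny*nz) + x*(ny*nz) + y*nz + z
assert flat(1, 2, 3, 4) == 1*60 + 2*20 + 3*5 + 4  # == 119
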
class AccBB(PragmaLangBB):

    mapper = {
        # Misc
        'name': 'OpenACC',
        'header': 'openacc.h',
        # Platform mapping
        AMDGPUX: Macro('acc_device_radeon'),
        NVIDIAX: Macro('acc_device_nvidia'),
        # Runtime library
        'init': lambda args:
            Call('acc_init', args),
        'num-devices': lambda args:
            DefFunction('acc_get_num_devices', args),
        'set-device': lambda args:
            Call('acc_set_device_num', args),
        # Pragmas
        'atomic': c.Pragma('acc atomic update'),
        'map-enter-to': lambda i, j:
            c.Pragma('acc enter data copyin(%s%s)' % (i, j)),
        'map-enter-to-wait': lambda i, j, k:
            (c.Pragma('acc enter data copyin(%s%s) async(%s)' % (i, j, k)),
             c.Pragma('acc wait(%s)' % k)),
        'map-enter-alloc': lambda i, j:
            c.Pragma('acc enter data create(%s%s)' % (i, j)),
        'map-present': lambda i, j:
            c.Pragma('acc data present(%s%s)' % (i, j)),
        'map-wait': lambda i:
            c.Pragma('acc wait(%s)' % i),
        'map-update': lambda i, j:
            c.Pragma('acc exit data copyout(%s%s)' % (i, j)),
        'map-update-host': lambda i, j:
            c.Pragma('acc update self(%s%s)' % (i, j)),
        'map-update-host-async': lambda i, j, k:
            c.Pragma('acc update self(%s%s) async(%s)' % (i, j, k)),
        'map-update-device': lambda i, j:
            c.Pragma('acc update device(%s%s)' % (i, j)),
        'map-update-device-async': lambda i, j, k:
            c.Pragma('acc update device(%s%s) async(%s)' % (i, j, k)),
        'map-release': lambda i, j, k:
            c.Pragma('acc exit data delete(%s%s)%s' % (i, j, k)),
        'map-exit-delete': lambda i, j, k:
            c.Pragma('acc exit data delete(%s%s)%s' % (i, j, k)),
        'memcpy-to-device': lambda i, j, k:
            Call('acc_memcpy_to_device', [i, j, k]),
        'memcpy-to-device-wait': lambda i, j, k, l:
            List(body=[Call('acc_memcpy_to_device_async', [i, j, k, l]),
                       Call('acc_wait', [l])]),
        'device-get':
            Call('acc_get_device_num'),
        'device-alloc': lambda i, *a, retobj=None:
            Call('acc_malloc', (i,), retobj=retobj, cast=True),
        'device-free': lambda i, *a:
            Call('acc_free', (i,))
    }
    mapper.update(CBB.mapper)

    Region = OmpRegion
    HostIteration = OmpIteration  # Host parallelism still goes via OpenMP
    DeviceIteration = DeviceAccIteration

    @classmethod
    def _map_to_wait(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.mapper['map-enter-to-wait'](f.name, sections, queueid)

    @classmethod
    def _map_present(cls, f, imask=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.mapper['map-present'](f.name, sections)

    @classmethod
    def _map_delete(cls, f, imask=None, devicerm=None):
        sections = cls._make_sections_from_imask(f, imask)
        if devicerm is not None:
            cond = ' if(%s)' % devicerm.name
        else:
            cond = ''
        return cls.mapper['map-exit-delete'](f.name, sections, cond)

    @classmethod
    def _map_update_host_async(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.mapper['map-update-host-async'](f.name, sections, queueid)

    @classmethod
    def _map_update_device_async(cls, f, imask=None, queueid=None):
        sections = cls._make_sections_from_imask(f, imask)
        return cls.mapper['map-update-device-async'](f.name, sections, queueid)
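
# Usage sketch (the function name `u`, the section string and the queue id are
# hypothetical; the exact section syntax comes from `_make_sections_from_imask`):
#
#   AccBB._map_update_host_async(u, imask=None, queueid=0)
#   # -> #pragma acc update self(u[0:nx]) async(0)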