def _make_parallel_tree(self, root, candidates):
    ncollapse = self._ncollapse(root, candidates)
    parallel = self.lang['for'](ncollapse)

    yask_add = namespace['code-grid-add']

    # Introduce the `omp for` pragma
    mapper = OrderedDict()
    if root.is_ParallelAtomic:
        # Turn increments into atomic increments
        subs = {}
        for e in FindNodes(Expression).visit(root):
            if not e.is_Increment:
                continue
            # Try getting the increment components
            try:
                target, value, indices = split_increment(e.expr)
            except (AttributeError, ValueError):
                warning("Found a parallelizable tree, but couldn't ompize it "
                        "because the increment %s could not be understood" % e.expr)
                return mapper
            # All good, can atomicize the increment
            subs[e] = e._rebuild(expr=e.expr.func(yask_add, target, (value, indices)))
        handle = Transformer(subs).visit(root)
        mapper[root] = handle._rebuild(pragmas=root.pragmas + (parallel,))
    else:
        mapper[root] = root._rebuild(pragmas=root.pragmas + (parallel,))

    return mapper

def process(func, state):
    """
    Apply ``func`` to the IETs in ``state._efuncs``, and update ``state``
    accordingly.
    """
    # Create a Call graph. `func` will be applied to each node in the Call graph.
    # `func` might change an `efunc` signature; the Call graph will be used to
    # propagate such change through the `efunc` callers
    dag = DAG(nodes=['root'])
    queue = ['root']
    while queue:
        caller = queue.pop(0)
        callees = FindNodes(Call).visit(state._efuncs[caller])
        for callee in filter_ordered([i.name for i in callees]):
            if callee in state._efuncs:  # Exclude foreign Calls, e.g., MPI calls
                try:
                    dag.add_node(callee)
                    queue.append(callee)
                except KeyError:
                    # `callee` already in `dag`
                    pass
                dag.add_edge(callee, caller)
    assert dag.size == len(state._efuncs)

    # Apply `func`
    for i in dag.topological_sort():
        state._efuncs[i], metadata = func(state._efuncs[i])

        # Track any new Dimensions introduced by `func`
        state._dimensions.extend(list(metadata.get('dimensions', [])))

        # Track any new #include required by `func`
        state._includes.extend(list(metadata.get('includes', [])))
        state._includes = filter_ordered(state._includes)

        # Track any new ElementalFunctions
        state._efuncs.update(OrderedDict([(i.name, i)
                                          for i in metadata.get('efuncs', [])]))

        # If there's a change to the `args` and the `iet` is an efunc, then
        # we must update the call sites as well, as the arguments dropped down
        # to the efunc have just increased
        args = as_tuple(metadata.get('args'))
        if args:
            # `extif` avoids redundant updates to the parameters list, due
            # to multiple children wanting to add the same input argument
            extif = lambda v: list(v) + [e for e in args if e not in v]
            stack = [i] + dag.all_downstreams(i)
            for n in stack:
                efunc = state._efuncs[n]
                calls = [c for c in FindNodes(Call).visit(efunc) if c.name in stack]
                mapper = {c: c._rebuild(arguments=extif(c.arguments)) for c in calls}
                efunc = Transformer(mapper).visit(efunc)
                if efunc.is_Callable:
                    efunc = efunc._rebuild(parameters=extif(efunc.parameters))
                state._efuncs[n] = efunc

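# NOTE: a minimal, self-contained sketch (names invented for illustration, not
# part of the original module) of the deduplicating extension performed by the
# `extif` lambda above: newly-required arguments are appended to a parameter
# list only if not already present, so multiple children requesting the same
# argument result in a single update.
def _extend_if_absent(current, new):
    return list(current) + [a for a in new if a not in current]

assert _extend_if_absent(['x', 'y'], ['y', 'nthreads']) == ['x', 'y', 'nthreads']
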
def _make_omp_parallel_tree(self, root, candidates):
    """Return a mapper to parallelize the :class:`Iteration`s within ``root``."""
    mapper = OrderedDict()

    # Heuristic: if at least two parallel loops are available and the
    # physical core count is greater than COLLAPSE, then omp-collapse them
    nparallel = len(candidates)
    if psutil.cpu_count(logical=False) < Ompizer.COLLAPSE or nparallel < 2:
        parallel = self.lang['for']
    else:
        parallel = self.lang['collapse'](nparallel)

    # Introduce the `omp parallel` pragma
    if root.is_ParallelAtomic:
        # Introduce the `omp atomic` pragmas
        exprs = FindNodes(Expression).visit(root)
        subs = {i: List(header=self.lang['atomic'], body=i)
                for i in exprs if i.is_Increment}
        handle = Transformer(subs).visit(root)
        mapper[root] = handle._rebuild(pragmas=root.pragmas + (parallel,))
    else:
        mapper[root] = root._rebuild(pragmas=root.pragmas + (parallel,))

    return mapper

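# NOTE: a hedged, standalone sketch of the collapse heuristic above; the
# threshold and pragma strings are placeholders (the real ones live in
# `Ompizer.COLLAPSE` and `self.lang`), but the decision logic is the same:
# collapse only with at least two parallel loops and enough physical cores.
import psutil

COLLAPSE_THRESHOLD = 32  # hypothetical minimum physical-core count

def choose_omp_pragma(nparallel):
    if psutil.cpu_count(logical=False) < COLLAPSE_THRESHOLD or nparallel < 2:
        return '#pragma omp for'
    return '#pragma omp for collapse(%d)' % nparallel
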
def _make_parallel_tree(self, root, candidates):
    """Parallelize the IET rooted in `root`."""
    ncollapse = self._ncollapse(root, candidates)
    parallel = self.lang['for'](ncollapse)

    pragmas = root.pragmas + (parallel,)
    properties = root.properties + (COLLAPSED(ncollapse),)

    # Introduce the `omp for` pragma
    mapper = OrderedDict()
    if root.is_ParallelAtomic:
        # Introduce the `omp atomic` pragmas
        exprs = FindNodes(Expression).visit(root)
        subs = {i: List(header=self.lang['atomic'], body=i)
                for i in exprs if i.is_Increment}
        handle = Transformer(subs).visit(root)
        mapper[root] = handle._rebuild(pragmas=pragmas, properties=properties)
    else:
        mapper[root] = root._rebuild(pragmas=pragmas, properties=properties)

    root = Transformer(mapper).visit(root)

    return root

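# NOTE: an illustrative sketch (plain strings, not IET nodes) of what the
# atomicization above aims for in the generated C: each increment statement
# gets an `omp atomic` pragma attached as a header, while the enclosing loop
# carries the (possibly collapsed) `omp for` pragma.
increments = ['a[i] += b[i];', 'c[i] += d[i];']
atomicized = ['#pragma omp atomic\n%s' % stmt for stmt in increments]
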
def process(func, state):
    """
    Apply ``func`` to the IETs in ``state._efuncs``, and update ``state``
    accordingly.
    """
    # Create a Call graph. `func` will be applied to each node in the Call graph.
    # `func` might change an `efunc` signature; the Call graph will be used to
    # propagate such change through the `efunc` callers
    dag = DAG(nodes=['root'])
    queue = ['root']
    while queue:
        caller = queue.pop(0)
        callees = FindNodes(Call).visit(state._efuncs[caller])
        for callee in filter_ordered([i.name for i in callees]):
            if callee in state._efuncs:  # Exclude foreign Calls, e.g., MPI calls
                try:
                    dag.add_node(callee)
                    queue.append(callee)
                except KeyError:
                    # `callee` already in `dag`
                    pass
                dag.add_edge(callee, caller)
    assert dag.size == len(state._efuncs)

    # Apply `func`
    for i in dag.topological_sort():
        state._efuncs[i], metadata = func(state._efuncs[i])

        # Track any new Dimensions introduced by `func`
        state._dimensions.extend(list(metadata.get('dimensions', [])))

        # Track any new #include required by `func`
        state._includes.extend(list(metadata.get('includes', [])))
        state._includes = filter_ordered(state._includes)

        # Track any new ElementalFunctions
        state._efuncs.update(OrderedDict([(i.name, i)
                                          for i in metadata.get('efuncs', [])]))

        # If there's a change to the `args` and the `iet` is an efunc, then
        # we must update the call sites as well, as the arguments dropped down
        # to the efunc have just increased
        args = as_tuple(metadata.get('args'))
        if args:
            # `extif` avoids redundant updates to the parameters list, due
            # to multiple children wanting to add the same input argument
            extif = lambda v: list(v) + [e for e in args if e not in v]
            stack = [i] + dag.all_downstreams(i)
            for n in stack:
                efunc = state._efuncs[n]
                calls = [c for c in FindNodes(Call).visit(efunc) if c.name in stack]
                mapper = {c: c._rebuild(arguments=extif(c.arguments)) for c in calls}
                efunc = Transformer(mapper).visit(efunc)
                if efunc.is_Callable:
                    efunc = efunc._rebuild(parameters=extif(efunc.parameters))
                state._efuncs[n] = efunc

def apply(self, func, **kwargs):
    """
    Apply ``func`` to all nodes in the Graph. This changes the state of the Graph.
    """
    dag = self._create_call_graph()

    # Apply `func`
    for i in dag.topological_sort():
        self.efuncs[i], metadata = func(self.efuncs[i], **kwargs)

        # Track any new Dimensions introduced by `func`
        self.dimensions.extend(list(metadata.get('dimensions', [])))

        # Track any new #include and #define required by `func`
        self.includes.extend(list(metadata.get('includes', [])))
        self.includes = filter_ordered(self.includes)
        self.headers.extend(list(metadata.get('headers', [])))
        self.headers = filter_ordered(self.headers)

        # Track any new external function
        self.ffuncs.extend(list(metadata.get('ffuncs', [])))
        self.ffuncs = filter_ordered(self.ffuncs)

        # Track any new ElementalFunctions
        self.efuncs.update(OrderedDict([(i.name, i)
                                        for i in metadata.get('efuncs', [])]))

        # If there's a change to the `args` and the `iet` is an efunc, then
        # we must update the call sites as well, as the arguments dropped down
        # to the efunc have just increased
        args = as_tuple(metadata.get('args'))
        if args:
            # `extif` avoids redundant updates to the parameters list, due
            # to multiple children wanting to add the same input argument
            extif = lambda v: list(v) + [e for e in args if e not in v]
            stack = [i] + dag.all_downstreams(i)
            for n in stack:
                efunc = self.efuncs[n]
                calls = [c for c in FindNodes(Call).visit(efunc) if c.name in stack]
                mapper = {c: c._rebuild(arguments=extif(c.arguments)) for c in calls}
                efunc = Transformer(mapper).visit(efunc)
                if efunc.is_Callable:
                    efunc = efunc._rebuild(parameters=extif(efunc.parameters))
                self.efuncs[n] = efunc

    # Apply `func` to the external functions
    for i in range(len(self.ffuncs)):
        self.ffuncs[i], _ = func(self.ffuncs[i], **kwargs)

def process(self, iet):
    sync_spots = FindNodes(SyncSpot).visit(iet)
    if not sync_spots:
        return iet, {}

    def key(s):
        # The SyncOps are to be processed in the following order
        return [WaitLock,
                WithLock,
                Delete,
                FetchUpdate,
                FetchPrefetch,
                PrefetchUpdate,
                WaitPrefetch].index(s)

    callbacks = {
        WaitLock: self._make_waitlock,
        WithLock: self._make_withlock,
        Delete: self._make_delete,
        FetchUpdate: self._make_fetchupdate,
        FetchPrefetch: self._make_fetchprefetch,
        PrefetchUpdate: self._make_prefetchupdate
    }
    postponed_callbacks = {WaitPrefetch: self._make_waitprefetch}
    all_callbacks = [callbacks, postponed_callbacks]

    pieces = namedtuple('Pieces', 'init finalize funcs objs')([], [], [], Objs())

    # The processing is a two-step procedure; first, we apply the `callbacks`;
    # then, the `postponed_callbacks`, as these depend on objects produced by the
    # `callbacks`
    subs = {}
    for cbks in all_callbacks:
        for n in sync_spots:
            mapper = as_mapper(n.sync_ops, lambda i: type(i))
            for _type in sorted(mapper, key=key):
                try:
                    subs[n] = cbks[_type](subs.get(n, n), mapper[_type], pieces, iet)
                except KeyError:
                    pass

    iet = Transformer(subs).visit(iet)

    # Add initialization and finalization code
    init = List(body=pieces.init, footer=c.Line())
    finalize = List(header=c.Line(), body=pieces.finalize)
    body = iet.body._rebuild(body=(init,) + iet.body.body + (finalize,))
    iet = iet._rebuild(body=body)

    return iet, {'efuncs': pieces.funcs,
                 'includes': ['pthread.h'],
                 'args': [i.size for i in pieces.objs.threads
                          if not is_integer(i.size)]}

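# NOTE: a minimal demonstration (dummy classes, invented for illustration) of
# the ordering mechanism used by `key` above: each SyncOp type is ranked by
# its position in a fixed priority list, so `sorted` visits, e.g., WaitLock
# before WithLock before Delete.
class WaitLock(object): pass
class WithLock(object): pass
class Delete(object): pass

priority = [WaitLock, WithLock, Delete]
key = lambda s: priority.index(s)

assert sorted([Delete, WaitLock, WithLock], key=key) == [WaitLock, WithLock, Delete]
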
def _make_parallel_tree(self, root, candidates):
    """Return a mapper to parallelize the :class:`Iteration`s within ``root``."""
    parallel = self._pragma_for(root, candidates)

    # Introduce the `omp for` pragma
    mapper = OrderedDict()
    if root.is_ParallelAtomic:
        # Introduce the `omp atomic` pragmas
        exprs = FindNodes(Expression).visit(root)
        subs = {i: List(header=self.lang['atomic'], body=i)
                for i in exprs if i.is_Increment}
        handle = Transformer(subs).visit(root)
        mapper[root] = handle._rebuild(pragmas=root.pragmas + (parallel,))
    else:
        mapper[root] = root._rebuild(pragmas=root.pragmas + (parallel,))

    return mapper

def process(self, iet):

    def key(s):
        # The SyncOps are to be processed in the following order
        return [WaitLock, WithLock, Delete, FetchWait, FetchWaitPrefetch].index(s)

    callbacks = {
        WaitLock: self._make_waitlock,
        WithLock: self._make_withlock,
        FetchWait: self._make_fetchwait,
        FetchWaitPrefetch: self._make_fetchwaitprefetch,
        Delete: self._make_delete
    }

    sync_spots = FindNodes(SyncSpot).visit(iet)
    if not sync_spots:
        return iet, {}

    pieces = namedtuple('Pieces', 'init finalize funcs threads')([], [], [], [])

    subs = {}
    for n in sync_spots:
        mapper = as_mapper(n.sync_ops, lambda i: type(i))
        for _type in sorted(mapper, key=key):
            subs[n] = callbacks[_type](subs.get(n, n), mapper[_type], pieces, iet)
    iet = Transformer(subs).visit(iet)

    # Add initialization and finalization code
    init = List(body=pieces.init, footer=c.Line())
    finalize = List(header=c.Line(), body=pieces.finalize)
    iet = iet._rebuild(body=(init,) + iet.body + (finalize,))

    return iet, {'efuncs': pieces.funcs,
                 'includes': ['pthread.h'],
                 'args': [i.size for i in pieces.threads if not is_integer(i.size)]}

def _process(self, func):
    """Apply ``func`` to all tracked ``IETs``."""
    for i in self._call_graph.topological_sort():
        self._efuncs[i], metadata = func(self._efuncs[i])

        # Track any new Dimensions and includes introduced by `func`
        self._dimensions.extend(list(metadata.get('dimensions', [])))
        self._includes.extend(list(metadata.get('includes', [])))

        # If there's a change to the `input` and the `iet` is an efunc, then
        # we must update the call sites as well, as the arguments dropped down
        # to the efunc have just increased
        _input = as_tuple(metadata.get('input'))
        if _input:
            # `extif` avoids redundant updates to the parameters list, due
            # to multiple children wanting to add the same input argument
            extif = lambda v: list(v) + [e for e in _input if e not in v]
            stack = [i] + self._call_graph.all_downstreams(i)
            for n in stack:
                efunc = self._efuncs[n]
                calls = [c for c in FindNodes(Call).visit(efunc) if c.name in stack]
                mapper = {c: c._rebuild(arguments=extif(c.arguments)) for c in calls}
                efunc = Transformer(mapper).visit(efunc)
                if efunc.is_Callable:
                    efunc = efunc._rebuild(parameters=extif(efunc.parameters))
                self._efuncs[n] = efunc
            self._input.extend(list(_input))

        for k, v in metadata.get('efuncs', {}).items():
            # Update the efuncs
            if k.is_Callable:
                self._efuncs[k.name] = k
            # Update the call graph
            self._call_graph.add_node(k.name, ignore_existing=True)
            for target in (v or [None]):
                self._call_graph.add_edge(k.name, target or 'main', force_add=True)

def apply(self, func, **kwargs):
    """
    Apply ``func`` to all nodes in the Graph. This changes the state of the Graph.
    """
    dag = self._create_call_graph()

    # Apply `func`
    for i in dag.topological_sort():
        self.efuncs[i], metadata = func(self.efuncs[i], **kwargs)

        # Track any new Dimensions introduced by `func`
        self.dimensions.extend(list(metadata.get('dimensions', [])))

        # Track any new #include and #define required by `func`
        self.includes.extend(list(metadata.get('includes', [])))
        self.includes = filter_ordered(self.includes)
        self.headers.extend(list(metadata.get('headers', [])))
        self.headers = filter_ordered(self.headers, key=str)

        # Track any new external function
        self.ffuncs.extend(list(metadata.get('ffuncs', [])))
        self.ffuncs = filter_ordered(self.ffuncs)

        # Track any new ElementalFunctions
        self.efuncs.update(OrderedDict([(i.name, i)
                                        for i in metadata.get('efuncs', [])]))

        # If there's a change to the `args` and the `iet` is an efunc, then
        # we must update the call sites as well, as the arguments dropped down
        # to the efunc have just increased
        args = as_tuple(metadata.get('args'))
        if not args:
            continue

        def filter_args(v, efunc=None):
            processed = list(v)
            for _a in args:
                try:
                    # Should the arg actually be dropped?
                    a, drop = _a
                    if drop:
                        if a in processed:
                            processed.remove(a)
                        continue
                except (TypeError, ValueError, IndexError):
                    a = _a

                if a in processed:
                    # A child efunc trying to add a symbol already added by a
                    # sibling efunc
                    continue

                if efunc is self.root and not (a.is_Input or a.is_Object):
                    # Temporaries (i.e., Symbol, Arrays) *cannot* be args in `root`
                    continue

                processed.append(a)

            return processed

        stack = [i] + dag.all_downstreams(i)
        for n in stack:
            efunc = self.efuncs[n]

            mapper = {}
            for c in FindNodes(Call).visit(efunc):
                if c.name not in stack:
                    continue
                mapper[c] = c._rebuild(arguments=filter_args(c.arguments))

            parameters = filter_args(efunc.parameters, efunc)

            efunc = Transformer(mapper).visit(efunc)
            efunc = efunc._rebuild(parameters=parameters)

            self.efuncs[n] = efunc

    # Apply `func` to the external functions
    for i in range(len(self.ffuncs)):
        self.ffuncs[i], _ = func(self.ffuncs[i], **kwargs)

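# NOTE: a self-contained sketch of the `(arg, drop)` convention handled by
# `filter_args` above (symbol names invented for illustration): an update may
# be a plain argument to append, or a 2-tuple whose truthy second element
# requests removal instead.
def _apply_arg_updates(params, updates):
    processed = list(params)
    for u in updates:
        try:
            a, drop = u
            if drop:
                if a in processed:
                    processed.remove(a)
                continue
        except (TypeError, ValueError):
            a = u
        if a not in processed:
            processed.append(a)
    return processed

assert _apply_arg_updates(['x', 'y'], ['nthreads', ('y', True)]) == ['x', 'nthreads']
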
def make_yask_kernels(iet, **kwargs):
    yk_solns = kwargs.pop('yk_solns')

    mapper = {}
    for n, (section, trees) in enumerate(find_affine_trees(iet).items()):
        dimensions = tuple(filter_ordered(i.dim.root for i in flatten(trees)))

        # Retrieve the section dtype
        exprs = FindNodes(Expression).visit(section)
        dtypes = {e.dtype for e in exprs}
        if len(dtypes) != 1:
            log("Unable to offload in presence of mixed-precision arithmetic")
            continue
        dtype = dtypes.pop()

        context = contexts.fetch(dimensions, dtype)

        # A unique name for the 'real' compiler and kernel solutions
        name = namespace['jit-soln'](Signer._digest(configuration,
                                                    *[i.root for i in trees]))

        # Create a YASK compiler solution for this Operator
        yc_soln = context.make_yc_solution(name)

        try:
            # Generate YASK vars and populate `yc_soln` with equations
            local_vars = yaskit(trees, yc_soln)

            # Build the new IET nodes
            yk_soln_obj = YASKSolnObject(namespace['code-soln-name'](n))
            funcall = make_sharedptr_funcall(namespace['code-soln-run'],
                                             ['time'], yk_soln_obj)
            funcall = Offloaded(funcall, dtype)
            mapper[trees[0].root] = funcall
            mapper.update({i.root: mapper.get(i.root) for i in trees})  # Drop trees

            # JIT-compile the newly-created YASK kernel
            yk_soln = context.make_yk_solution(name, yc_soln, local_vars)
            yk_solns[(dimensions, yk_soln_obj)] = yk_soln

            # Print some useful information about the newly constructed solution
            log("Solution '%s' contains %d var(s) and %d equation(s)." %
                (yc_soln.get_name(), yc_soln.get_num_vars(),
                 yc_soln.get_num_equations()))
        except NotImplementedError as e:
            log("Unable to offload a candidate tree. Reason: [%s]" % str(e))
    iet = Transformer(mapper).visit(iet)

    if not yk_solns:
        log("No offloadable trees found")

    # Some Iteration/Expression trees are not offloaded to YASK and may
    # require further processing to be executed through YASK, due to the
    # different storage layout
    yk_var_objs = {i.name: YASKVarObject(i.name)
                   for i in FindSymbols().visit(iet) if i.from_YASK}
    yk_var_objs.update({i: YASKVarObject(i) for i in get_local_vars(yk_solns)})
    iet = make_var_accesses(iet, yk_var_objs)

    # The signature needs to be updated
    # TODO: this could be done automagically through the iet pass engine, but
    # currently it only supports *appending* to the parameters list. While here
    # we actually need to change it as some parameters may disappear (x_m, x_M, ...)
    parameters = derive_parameters(iet, True)
    iet = iet._rebuild(parameters=parameters)

    return iet, {}

def _specialize_iet(cls, iet, **kwargs):
    """
    Transform the Iteration/Expression tree to offload the computation of one
    or more loop nests onto YASK. This involves calling the YASK compiler to
    generate YASK code. Such YASK code is then called from within the
    transformed Iteration/Expression tree.
    """
    mapper = {}

    yk_solns = kwargs.pop('yk_solns')
    for n, (section, trees) in enumerate(find_affine_trees(iet).items()):
        dimensions = tuple(filter_ordered(i.dim.root for i in flatten(trees)))

        # Retrieve the section dtype
        exprs = FindNodes(Expression).visit(section)
        dtypes = {e.dtype for e in exprs}
        if len(dtypes) != 1:
            log("Unable to offload in presence of mixed-precision arithmetic")
            continue
        dtype = dtypes.pop()

        context = contexts.fetch(dimensions, dtype)

        # A unique name for the 'real' compiler and kernel solutions
        name = namespace['jit-soln'](Signer._digest(configuration,
                                                    *[i.root for i in trees]))

        # Create a YASK compiler solution for this Operator
        yc_soln = context.make_yc_solution(name)

        try:
            # Generate YASK vars and populate `yc_soln` with equations
            local_vars = yaskit(trees, yc_soln)

            # Build the new IET nodes
            yk_soln_obj = YaskSolnObject(namespace['code-soln-name'](n))
            funcall = make_sharedptr_funcall(namespace['code-soln-run'],
                                             ['time'], yk_soln_obj)
            funcall = Offloaded(funcall, dtype)
            mapper[trees[0].root] = funcall
            mapper.update({i.root: mapper.get(i.root) for i in trees})  # Drop trees

            # JIT-compile the newly-created YASK kernel
            yk_soln = context.make_yk_solution(name, yc_soln, local_vars)
            yk_solns[(dimensions, yk_soln_obj)] = yk_soln

            # Print some useful information about the newly constructed solution
            log("Solution '%s' contains %d var(s) and %d equation(s)." %
                (yc_soln.get_name(), yc_soln.get_num_vars(),
                 yc_soln.get_num_equations()))
        except NotImplementedError as e:
            log("Unable to offload a candidate tree. Reason: [%s]" % str(e))
    iet = Transformer(mapper).visit(iet)

    if not yk_solns:
        log("No offloadable trees found")

    # Some Iteration/Expression trees are not offloaded to YASK and may
    # require further processing to be executed through YASK, due to the
    # different storage layout
    yk_var_objs = {i.name: YaskVarObject(i.name)
                   for i in FindSymbols().visit(iet) if i.from_YASK}
    yk_var_objs.update({i: YaskVarObject(i)
                        for i in cls._get_local_vars(yk_solns)})
    iet = make_var_accesses(iet, yk_var_objs)

    # The signature needs to be updated
    parameters = derive_parameters(iet, True)
    iet = iet._rebuild(parameters=parameters)

    return super(OperatorYASK, cls)._specialize_iet(iet, **kwargs)

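# NOTE: the mixed-precision guard above in isolation -- a minimal, hedged
# sketch (the `Expr` namedtuple is invented for illustration): a section is
# offloadable only if all of its expressions share a single dtype.
from collections import namedtuple
import numpy as np

Expr = namedtuple('Expr', 'dtype')

def uniform_dtype(exprs):
    """Return the unique dtype of `exprs`, or None if mixed-precision."""
    dtypes = {e.dtype for e in exprs}
    return dtypes.pop() if len(dtypes) == 1 else None

assert uniform_dtype([Expr(np.float32), Expr(np.float32)]) is np.float32
assert uniform_dtype([Expr(np.float32), Expr(np.float64)]) is None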