def make_parallel(self, iet):
    """Transform ``iet`` by introducing shared-memory parallelism."""
    replacements = OrderedDict()
    for tree in retrieve_iteration_tree(iet):
        # Pick out the omp-parallelizable Iterations within `tree`
        parallelizable = filter_iterations(tree, key=self.key)
        if not parallelizable:
            continue

        # Introduce outer parallelism
        root, ptree, collapsed = self._make_partree(parallelizable, self.lang['for'])

        # Introduce nested parallelism, then make increments and
        # single-thread prodders atomic
        ptree = self._make_nested_partree(ptree)
        ptree = self._make_atomic_incs(ptree)
        ptree = self._make_atomic_prodders(ptree)

        # Embed within a parallel region (declaring private and shared
        # variables) and guard against 0-valued step increments
        region = self._make_parregion(ptree)
        region = self._make_guard(region, collapsed)

        replacements[root] = region

    iet = Transformer(replacements).visit(iet)

    # `nthreads` is only needed if at least one parallel region was created
    extra_args = [self.nthreads] if replacements else []

    return iet, {'args': extra_args, 'includes': ['omp.h']}
def _make_parallel(self, iet):
    """
    Transform ``iet`` by introducing shared-memory parallelism.

    Returns the transformed IET along with the metadata produced by this
    pass: the new arguments (the `nthreads` symbols actually used) and
    the headers required by the generated code.
    """
    subs = OrderedDict()
    for tree in retrieve_iteration_tree(iet):
        # Which Iterations in `tree` are omp-parallelizable?
        parallelizable = filter_iterations(tree, key=self.key)
        if not parallelizable:
            continue

        # Outer parallelism first...
        root, ptree, collapsed = self._make_partree(parallelizable)

        # ... then nested parallelism, reductions, and single-thread prodders
        ptree = self._make_nested_partree(ptree)
        ptree = self._make_reductions(ptree, collapsed)
        ptree = self._make_threaded_prodders(ptree)

        # Embed within a parallel region (declaring private and shared
        # variables), guarding against 0-valued step increments
        region = self._make_parregion(ptree)
        region = self._make_guard(region, collapsed)

        subs[root] = region

    iet = Transformer(subs).visit(iet)

    # Collect the `nthreads` arguments actually used by the transformed IET
    symbols = FindSymbols().visit(iet)
    args = [s for s in symbols if isinstance(s, NThreadsMixin)]

    return iet, {'args': args, 'includes': ['omp.h']}
def _make_parallel(self, iet):
    """
    Transform ``iet`` by introducing shared-memory parallelism.

    Returns the transformed IET along with the metadata produced by this
    pass: the new arguments (the `nthreads` symbols plus any pointers
    introduced by Dereferences) and the headers required by the generated
    code.
    """
    subs = {}
    parrays = {}
    for tree in retrieve_iteration_tree(iet):
        # Which Iterations in `tree` are omp-parallelizable?
        parallelizable = filter_iterations(tree, key=self.key)
        if not parallelizable:
            continue

        # Outer parallelism first...
        root, ptree, collapsed = self._make_partree(parallelizable)
        # Skip when nothing parallel was produced, or when this subtree
        # has already been handled
        if ptree is None or root in subs:
            continue

        # ... then nested parallelism, reductions, and single-thread prodders
        ptree = self._make_nested_partree(ptree)
        ptree = self._make_reductions(ptree, collapsed)
        ptree = self._make_threaded_prodders(ptree)

        # Embed within a parallel region (declaring private and shared
        # variables), guarding against 0-valued step increments
        region = self._make_parregion(ptree, parrays)
        region = self._make_guard(region, collapsed)

        subs[root] = region

    iet = Transformer(subs).visit(iet)

    # The new arguments introduced by this pass
    args = [s for s in FindSymbols().visit(iet) if isinstance(s, NThreadsMixin)]
    for deref in FindNodes(Dereference).visit(iet):
        args.append((deref.array, True))
        args.append(deref.parray)

    return iet, {'args': args, 'includes': ['omp.h']}
def _make_parallel(self, iet):
    """
    Transform ``iet`` by introducing shared-memory parallelism.

    Returns the transformed IET along with the metadata produced by this
    pass: the new arguments (the `nthreads` symbols plus any pointers
    introduced by VExpanded nodes) and the language-specific header.
    """
    subs = {}
    parrays = {}
    for tree in retrieve_iteration_tree(iet):
        # Which Iterations in `tree` are parallelizable?
        parallelizable = filter_iterations(tree, key=self.key)
        if not parallelizable:
            continue

        # Outer parallelism first...
        root, ptree = self._make_partree(parallelizable)
        # Skip when nothing parallel was produced, or when this subtree
        # has already been handled
        if ptree is None or root in subs:
            continue

        # ... then nested parallelism, reductions, and single-thread prodders
        ptree = self._make_nested_partree(ptree)
        ptree = self._make_reductions(ptree)
        ptree = self._make_threaded_prodders(ptree)

        # Embed within a parallel region, guarding it if necessary
        region = self._make_parregion(ptree, parrays)
        region = self._make_guard(region)

        subs[root] = region

    iet = Transformer(subs).visit(iet)

    # The new arguments introduced by this pass
    args = [s for s in FindSymbols().visit(iet) if isinstance(s, NThreadsBase)]
    for vexp in FindNodes(VExpanded).visit(iet):
        args.append((vexp.pointee, True))
        args.append(vexp.pointer)

    return iet, {'args': args, 'includes': [self.lang['header']]}
def hoist_prodders(iet):
    """
    Move Prodders within the outer levels of an Iteration tree.

    Each periodic Prodder is detached from its current position and
    re-attached to the innermost Iteration over a non-unit-step Incr
    Dimension (falling back to the tree root when no such Iteration
    exists).

    Returns the transformed IET and an (empty) metadata dict, in keeping
    with the other passes.
    """
    def _is_hoist_target(i):
        # Hoisting destination: an Iteration over a stepped (blocked)
        # Incr Dimension
        return i.dim.is_Incr and i.dim.step != 1

    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        for prodder in FindNodes(Prodder).visit(tree.root):
            if not prodder._periodic:
                continue
            try:
                candidate = filter_iterations(tree, _is_hoist_target)[-1]
            except IndexError:
                # Fallback: use the outermost Iteration
                candidate = tree.root
            # Rebuild from any previously-rebuilt version of `candidate`, so
            # that multiple Prodders hoisted to the same Iteration all
            # survive (rebuilding from the original `candidate` each time
            # would silently drop the earlier hoists)
            hoisted = mapper.get(candidate, candidate)
            mapper[candidate] = hoisted._rebuild(
                nodes=(hoisted.nodes + (prodder._rebuild(),)))
            # Remove the Prodder from its original position
            mapper[prodder] = None
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet, {}
def make_blocking(self, iet):
    """
    Apply loop blocking to PARALLEL Iteration trees.

    Each blockable (perfect, tilable) Iteration nest is rewritten as a
    hierarchy of `self.nlevels` levels of BlockDimensions, outlined into a
    separate Callable (`bf<n>`), and replaced by a sequence of Calls to that
    Callable covering both the bulk (whole-block) and remainder iteration
    ranges. Returns the transformed ``iet`` plus metadata: the new
    BlockDimensions, the generated Callables, and the new arguments (the
    block steps).
    """
    # Make sure loop blocking will span as many Iterations as possible
    iet = fold_blockable_tree(iet, self.blockinner)

    mapper = {}
    efuncs = []
    block_dims = []
    for tree in retrieve_iteration_tree(iet):
        # Is the Iteration tree blockable ?
        iterations = filter_iterations(tree, lambda i: i.is_Tilable)
        if not self.blockinner:
            # The innermost Iteration is left unblocked
            iterations = iterations[:-1]
        if len(iterations) <= 1:
            # Not enough Iterations for blocking to pay off
            continue
        root = iterations[0]
        if not IsPerfectIteration().visit(root):
            # Don't know how block non-perfect Iteration nests
            continue

        # Apply hierarchical loop blocking to `tree`
        level_0 = []  # Outermost level of blocking
        level_i = [[] for i in range(1, self.nlevels)]  # Inner levels of blocking
        intra = []  # Within the smallest block
        for i in iterations:
            # Unique naming scheme per blocked nest, e.g. "x0_blk0", "x0_blk1"
            template = "%s%d_blk%s" % (i.dim.name, self.nblocked, '%d')
            properties = (PARALLEL,) + ((AFFINE,) if i.is_Affine else ())

            # Build Iteration across `level_0` blocks
            d = BlockDimension(i.dim, name=template % 0)
            level_0.append(Iteration([], d, d.symbolic_max, properties=properties))

            # Build Iteration across all `level_i` blocks, `i` in (1, self.nlevels]
            # Each level iterates within the extent of its parent block
            for n, li in enumerate(level_i, 1):
                di = BlockDimension(d, name=template % n)
                li.append(Iteration([], di, limits=(d, d+d.step-1, di.step),
                                    properties=properties))
                d = di

            # Build Iteration within the smallest block (unit stride)
            intra.append(i._rebuild([], limits=(d, d+d.step-1, 1), offsets=(0, 0)))
        level_i = flatten(level_i)

        # Track all constructed BlockDimensions
        block_dims.extend(i.dim for i in level_0 + level_i)

        # Construct the blocked tree, re-attaching the innermost loop body
        blocked = compose_nodes(level_0 + level_i + intra + [iterations[-1].nodes])
        blocked = unfold_blocked_tree(blocked)

        # Promote to a separate Callable; block bounds/steps stay dynamic so
        # the same Callable serves both bulk and remainder regions
        dynamic_parameters = flatten((l0.dim, l0.step) for l0 in level_0)
        dynamic_parameters.extend([li.step for li in level_i])
        efunc = make_efunc("bf%d" % self.nblocked, blocked, dynamic_parameters)
        efuncs.append(efunc)

        # Compute the iteration ranges: per dimension, a bulk range covering
        # whole blocks up to `maxb`, then a remainder range for the leftover
        ranges = []
        for i, l0 in zip(iterations, level_0):
            maxb = i.symbolic_max - (i.symbolic_size % l0.step)
            ranges.append(((i.symbolic_min, maxb, l0.step),
                           (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

        # Build Calls to the `efunc`, one per combination of bulk/remainder
        # ranges across the blocked dimensions
        body = []
        for p in product(*ranges):
            dynamic_args_mapper = {}
            for l0, (m, M, b) in zip(level_0, p):
                dynamic_args_mapper[l0.dim] = (m, M)
                dynamic_args_mapper[l0.step] = (b,)
                for li in level_i:
                    if li.dim.root is l0.dim.root:
                        # In a remainder region the inner block step shrinks
                        # to the remainder extent `b`
                        value = li.step if b is l0.step else b
                        dynamic_args_mapper[li.step] = (value,)
            call = efunc.make_call(dynamic_args_mapper)
            body.append(List(body=call))

        mapper[root] = List(body=body)

        # Next blockable nest, use different (unique) variable/function names
        self.nblocked += 1

    iet = Transformer(mapper).visit(iet)

    # Force-unfold if some folded Iterations haven't been blocked in the end
    iet = unfold_blocked_tree(iet)

    return iet, {'dimensions': block_dims,
                 'efuncs': efuncs,
                 'args': [i.step for i in block_dims]}