def _make_partree(self, candidates, nthreads=None):
    """
    Parallelize the `candidates` Iterations. In particular:

        * All parallel Iterations not *writing* to a host Function, that is
          a Function `f` such that `is_on_device(f) == False`, are offloaded
          to the device.
        * The remaining ones, that is those writing to a host Function, are
          parallelized on the host.
    """
    assert candidates
    root = candidates[0]

    if is_on_device(root, self.gpu_fit, only_writes=True):
        # The typical case: all written Functions are device Functions, that is
        # they're mapped in the device memory. Then we offload `root` to the device

        # Get the collapsable Iterations
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = 1 + len(collapsable)

        body = self.DeviceIteration(gpu_fit=self.gpu_fit, ncollapse=ncollapse,
                                    **root.args)
        partree = ParallelTree([], body, nthreads=nthreads)

        return root, partree
    elif not self.par_disabled:
        # Resort to host parallelism
        return super()._make_partree(candidates, nthreads)
    else:
        return root, None

def _make_partree(self, candidates, nthreads=None):
    """
    Parallelize the `candidates` Iterations. In particular:

        * A PARALLEL Iteration writing (reading) a mapped Array while
          reading (writing) a host Function (that is, all Functions `f`
          such that `is_on_device(f)` gives False) is parallelized on the
          host. These are essentially the Iterations that initialize or
          dump the Devito-created buffers.
        * All other PARALLEL Iterations (typically, the majority) are
          offloaded to the device.
    """
    assert candidates

    root, collapsable = self._select_candidates(candidates)

    if self._is_offloadable(root):
        body = self.DeviceIteration(gpu_fit=self.gpu_fit,
                                    ncollapse=len(collapsable) + 1,
                                    **root.args)
        partree = ParallelTree([], body, nthreads=nthreads)

        return root, partree
    elif not self.par_disabled:
        # Resort to host parallelism
        return super()._make_partree(candidates, nthreads)
    else:
        return root, None
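
# A minimal, self-contained sketch of the host-vs-device decision described in
# the two docstrings above: a nest is offloaded only if every Function it
# writes is device-mapped, otherwise it stays on the host (e.g. nests that
# initialize or dump host-side buffers). `is_offloadable_sketch` and its
# arguments are hypothetical names for illustration, not Devito API.

def is_offloadable_sketch(written_functions, is_on_device):
    # Offload only if all written Functions live in device memory
    return all(is_on_device(f) for f in written_functions)

# Example: `u`, `v` are device-mapped, `host_buf` is not
on_device = lambda f: f in {'u', 'v'}
assert is_offloadable_sketch(['u', 'v'], on_device)        # offloaded
assert not is_offloadable_sketch(['host_buf'], on_device)  # host-parallelized
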
def _make_partree(self, candidates, nthreads=None):
    assert candidates

    root, collapsable = self._select_candidates(candidates)
    ncollapsable = len(collapsable)

    if self._is_offloadable(root) and \
       all(i.is_Affine for i in [root] + collapsable) and \
       self.par_tile:
        # TODO: still unable to exploit multiple par-tiles (one per nest)
        # This will require unconditionally applying blocking, and then infer
        # the tile clause shape from the BlockDimensions' step
        tile = self.par_tile[0]
        assert isinstance(tile, tuple)
        nremainder = (ncollapsable + 1) - len(tile)
        if nremainder >= 0:
            tile += (tile[-1],)*nremainder
        else:
            tile = tile[:ncollapsable + 1]

        body = self.DeviceIteration(gpu_fit=self.gpu_fit, tile=tile,
                                    **root.args)
        partree = ParallelTree([], body, nthreads=nthreads)

        return root, partree
    else:
        return super()._make_partree(candidates, nthreads)
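
# A worked sketch of the tile-shape adjustment above: the user-provided
# par-tile is padded with its last entry (or truncated) so that its length
# matches the number of collapsed loops, i.e. `ncollapsable + 1`.
# `fit_tile` is a hypothetical standalone rendition of the same arithmetic.

def fit_tile(tile, ncollapsable):
    nremainder = (ncollapsable + 1) - len(tile)
    if nremainder >= 0:
        tile += (tile[-1],)*nremainder
    else:
        tile = tile[:ncollapsable + 1]
    return tile

assert fit_tile((32, 4), 3) == (32, 4, 4, 4)   # padded to 4 entries
assert fit_tile((32, 8, 8, 8), 1) == (32, 8)   # truncated to 2 entries
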
def _make_partree(self, candidates, nthreads=None):
    assert candidates
    root = candidates[0]

    # Get the collapsable Iterations
    collapsable = self._find_collapsable(root, candidates)
    ncollapse = 1 + len(collapsable)

    # Prepare to build a ParallelTree
    if all(i.is_Affine for i in candidates):
        bundles = FindNodes(ExpressionBundle).visit(root)
        sops = sum(i.ops for i in bundles)
        if sops >= self.dynamic_work:
            schedule = 'dynamic'
        else:
            schedule = 'static'

        if nthreads is None:
            # pragma ... for ... schedule(..., 1)
            nthreads = self.nthreads
            body = self.HostIteration(schedule=schedule, ncollapse=ncollapse,
                                      **root.args)
        else:
            # pragma ... parallel for ... schedule(..., 1)
            body = self.HostIteration(schedule=schedule, parallel=True,
                                      ncollapse=ncollapse, nthreads=nthreads,
                                      **root.args)
        prefix = []
    else:
        # pragma ... for ... schedule(..., expr)
        assert nthreads is None

        nthreads = self.nthreads_nonaffine
        chunk_size = Symbol(name='chunk_size')
        body = self.HostIteration(ncollapse=ncollapse, chunk_size=chunk_size,
                                  **root.args)

        niters = prod([root.symbolic_size] + [j.symbolic_size for j in collapsable])
        value = INT(Max(niters / (nthreads * self.chunk_nonaffine), 1))
        prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]

    # Create a ParallelTree
    partree = ParallelTree(prefix, body, nthreads=nthreads)

    collapsed = [partree] + collapsable

    return root, partree, collapsed
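
# A worked sketch of the non-affine `chunk_size` computed above: the total
# iteration count is divided across `nthreads * chunk_nonaffine` chunks, with
# a floor of 1. `chunk_size_sketch` and the numbers below are illustrative
# only, not Devito API.

def chunk_size_sketch(niters, nthreads, chunk_nonaffine):
    return int(max(niters / (nthreads*chunk_nonaffine), 1))

assert chunk_size_sketch(1_000_000, 8, 3) == 41666  # ~41666 iterations per chunk
assert chunk_size_sketch(10, 8, 3) == 1             # the Max(..., 1) floor kicks in
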
def _make_partree(self, candidates, nthreads=None):
    assert candidates

    root = candidates[0]
    collapsable = self._find_collapsable(root, candidates)
    ncollapsable = len(collapsable)

    if self._is_offloadable(root) and \
       all(i.is_Affine for i in [root] + collapsable) and \
       self.par_tile:
        if isinstance(self.par_tile, tuple):
            tile = self.par_tile[:ncollapsable + 1]
        else:
            # (32, 4, 4, ...) is typically a decent choice
            tile = (32,) + (4,)*ncollapsable

        body = self.DeviceIteration(gpu_fit=self.gpu_fit, tile=tile,
                                    **root.args)
        partree = ParallelTree([], body, nthreads=nthreads)

        return root, partree
    else:
        return super()._make_partree(candidates, nthreads)
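
# A sketch of the tile selection in this last variant: an explicit par-tile
# tuple is truncated to the collapsed-loop count, otherwise the (32, 4, 4, ...)
# default is used. `select_tile` is a hypothetical standalone rendition, not
# Devito API.

def select_tile(par_tile, ncollapsable):
    if isinstance(par_tile, tuple):
        return par_tile[:ncollapsable + 1]
    # (32, 4, 4, ...) is typically a decent choice
    return (32,) + (4,)*ncollapsable

assert select_tile((16, 8, 8, 8), 1) == (16, 8)  # explicit tuple, truncated
assert select_tile(True, 2) == (32, 4, 4)        # fall back to the default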