Esempio n. 1
0
    def _make_partree(self, candidates, nthreads=None):
        """
        Build the parallel tree for the first of the `candidates` Iterations.

        The outcome depends on what the outermost candidate writes to:

            * If `is_on_device(root, self.gpu_fit, only_writes=True)` holds,
              that is no host Function (a Function `f` such that
              `is_on_device(f) == False`) is written, the Iteration is
              offloaded to the device, collapsing it with the Iterations
              returned by `self._find_collapsable`.
            * Otherwise, the Iteration is parallelized on the host by the
              parent class, unless `self.par_disabled` is set, in which case
              no parallel tree is created and `(root, None)` is returned.
        """
        assert candidates
        outermost = candidates[0]

        if not is_on_device(outermost, self.gpu_fit, only_writes=True):
            # At least one written Function lives on the host
            if self.par_disabled:
                return outermost, None
            # Fall back to host parallelism
            return super()._make_partree(candidates, nthreads)

        # The common case: everything written is mapped in the device memory,
        # so `outermost` gets offloaded along with its collapsable Iterations
        nested = self._find_collapsable(outermost, candidates)
        depth = 1 + len(nested)

        device_loop = self.DeviceIteration(gpu_fit=self.gpu_fit, ncollapse=depth,
                                           **outermost.args)

        return outermost, ParallelTree([], device_loop, nthreads=nthreads)
Esempio n. 2
0
    def _make_partree(self, candidates, nthreads=None):
        """
        Build the parallel tree for the Iteration selected among `candidates`.

        The root Iteration and its collapsable companions are chosen by
        `self._select_candidates`. Then:

            * If `self._is_offloadable(root)` holds (typically, for the
              majority of the PARALLEL Iterations), the root is offloaded
              to the device.
            * Otherwise -- e.g., a PARALLEL Iteration writing (reading) a
              mapped Array while reading (writing) a host Function, such as
              those initializing or dumping the Devito-created buffers --
              the root is parallelized on the host by the parent class,
              unless `self.par_disabled` is set, in which case
              `(root, None)` is returned.
        """
        assert candidates

        root, collapsable = self._select_candidates(candidates)

        if not self._is_offloadable(root):
            if self.par_disabled:
                # Nothing to parallelize with
                return root, None
            # Fall back to host parallelism
            return super()._make_partree(candidates, nthreads)

        device_loop = self.DeviceIteration(gpu_fit=self.gpu_fit,
                                           ncollapse=len(collapsable) + 1,
                                           **root.args)

        return root, ParallelTree([], device_loop, nthreads=nthreads)
Esempio n. 3
0
    def _make_partree(self, candidates, nthreads=None):
        """
        Build the parallel tree for the Iteration selected among `candidates`,
        attaching a tile shape when offloading is possible.

        A tiled DeviceIteration is created only if the root is offloadable,
        the root and all of its collapsable Iterations are affine, and
        `self.par_tile` is non-empty. In any other case the construction
        is delegated to the parent class.

        The tile shape is derived from the first entry of `self.par_tile`,
        adjusted to have exactly one item per collapsed Iteration: it is
        padded by repeating its last item if too short, truncated if
        too long.
        """
        assert candidates

        root, collapsable = self._select_candidates(candidates)
        depth = len(collapsable) + 1

        if not (self._is_offloadable(root) and
                all(i.is_Affine for i in [root] + collapsable) and
                self.par_tile):
            return super()._make_partree(candidates, nthreads)

        # TODO: still unable to exploit multiple par-tiles (one per nest)
        # This will require unconditionally applying blocking, and then infer
        # the tile clause shape from the BlockDimensions' step
        shape = self.par_tile[0]
        assert isinstance(shape, tuple)

        missing = depth - len(shape)
        if missing < 0:
            # More items than Iterations: drop the trailing ones
            shape = shape[:depth]
        else:
            # Fewer items than Iterations: replicate the last one
            shape = shape + (shape[-1],) * missing

        device_loop = self.DeviceIteration(gpu_fit=self.gpu_fit,
                                           tile=shape,
                                           **root.args)

        return root, ParallelTree([], device_loop, nthreads=nthreads)
Esempio n. 4
0
    def _make_partree(self, candidates, nthreads=None):
        """
        Build a host ParallelTree for the first of the `candidates` Iterations.

        Two scenarios are distinguished:

            * All `candidates` are affine. The schedule is 'dynamic' if the
              total `ops` of the ExpressionBundles found under the root
              reaches `self.dynamic_work`, 'static' otherwise. If `nthreads`
              is not provided, `self.nthreads` is used; otherwise, the
              HostIteration is also created with `parallel=True` and the
              given `nthreads`.
            * At least one candidate is non-affine. `nthreads` must be None
              and `self.nthreads_nonaffine` is used. The HostIteration takes
              a symbolic `chunk_size`, whose value is computed by an
              Expression placed in the ParallelTree prefix.

        Returns the tuple `(root, partree, collapsed)`, where `collapsed`
        is the ParallelTree followed by the collapsable Iterations.
        """
        assert candidates
        root = candidates[0]

        # The Iterations that can be collapsed together with `root`
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = 1 + len(collapsable)

        if not all(i.is_Affine for i in candidates):
            # pragma ... for ... schedule(..., expr)
            assert nthreads is None
            nthreads = self.nthreads_nonaffine
            chunk_size = Symbol(name='chunk_size')
            body = self.HostIteration(ncollapse=ncollapse,
                                      chunk_size=chunk_size,
                                      **root.args)

            # The chunk size is the overall iteration count divided by
            # `nthreads * self.chunk_nonaffine`, but never less than 1
            sizes = [root.symbolic_size] + [j.symbolic_size for j in collapsable]
            niters = prod(sizes)
            value = INT(Max(niters / (nthreads * self.chunk_nonaffine), 1))
            prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]
        else:
            # Pick the schedule based on the amount of work under `root`
            bundles = FindNodes(ExpressionBundle).visit(root)
            sops = sum(i.ops for i in bundles)
            schedule = 'dynamic' if sops >= self.dynamic_work else 'static'

            if nthreads is not None:
                # pragma ... parallel for ... schedule(..., 1)
                body = self.HostIteration(schedule=schedule,
                                          parallel=True,
                                          ncollapse=ncollapse,
                                          nthreads=nthreads,
                                          **root.args)
            else:
                # pragma ... for ... schedule(..., 1)
                nthreads = self.nthreads
                body = self.HostIteration(schedule=schedule,
                                          ncollapse=ncollapse,
                                          **root.args)
            prefix = []

        # Assemble the ParallelTree
        partree = ParallelTree(prefix, body, nthreads=nthreads)

        return root, partree, [partree] + collapsable
Esempio n. 5
0
    def _make_partree(self, candidates, nthreads=None):
        """
        Build the parallel tree for the first of the `candidates` Iterations,
        attaching a tile shape when offloading is possible.

        A tiled DeviceIteration is created only if the root is offloadable,
        the root and all of its collapsable Iterations are affine, and
        `self.par_tile` is truthy. In any other case the construction is
        delegated to the parent class.

        If `self.par_tile` is a tuple, its leading items (at most one per
        collapsed Iteration) make up the tile shape; otherwise a default
        shape `(32, 4, 4, ...)` is used.
        """
        assert candidates
        root = candidates[0]

        collapsable = self._find_collapsable(root, candidates)
        nnested = len(collapsable)

        if not (self._is_offloadable(root) and
                all(i.is_Affine for i in [root] + collapsable) and
                self.par_tile):
            return super()._make_partree(candidates, nthreads)

        # (32,4,4,...) is typically a decent choice when no explicit
        # tile shape is available
        tile = (self.par_tile[:nnested + 1]
                if isinstance(self.par_tile, tuple)
                else (32,) + (4,)*nnested)

        device_loop = self.DeviceIteration(gpu_fit=self.gpu_fit, tile=tile,
                                           **root.args)

        return root, ParallelTree([], device_loop, nthreads=nthreads)