Exemple #1
0
 def run(expr):
     """
     Recursively translate the SymPy expression ``expr`` into a YASK AST node.

     Integer and float literals become constant nodes; ``Indexed`` objects
     become grid points expressed relative to the indices of the accessed
     function; ``Add`` and ``Mul`` are handed to ``nary2binary`` together
     with the matching node constructor; a ``Pow`` whose numerator is 1
     becomes a division node; an equality becomes an equation node.

     Raises
     ------
     NotImplementedError
         If ``expr`` has no handler, including a ``Pow`` that is not a plain
         reciprocal (previously this case silently returned ``None``).
     """
     if expr.is_Integer:
         return fac.new_const_number_node(int(expr))
     elif expr.is_Float:
         return fac.new_const_number_node(float(expr))
     elif isinstance(expr, Indexed):
         function = expr.base.function
         assert function.name in self.grids
         # Offset of each access index w.r.t. the function's own index
         indices = [
             int((i.origin if isinstance(i, LoweredDimension) else i) -
                 j) for i, j in zip(expr.indices, function.indices)
         ]
         return self.grids[function.name].new_relative_grid_point(
             *indices)
     elif expr.is_Add:
         return nary2binary(expr.args, fac.new_add_node)
     elif expr.is_Mul:
         return nary2binary(expr.args, fac.new_multiply_node)
     elif expr.is_Pow:
         num, den = expr.as_numer_denom()
         if num == 1:
             return fac.new_divide_node(run(num), run(den))
         # Any other power is unsupported: fall through to the error below
         # instead of implicitly returning None
     elif expr.is_Equality:
         return fac.new_equation_node(*[run(i) for i in expr.args])

     # Every supported case has returned by now
     dle_warning("Missing handler in Devito-YASK translation")
     raise NotImplementedError
Exemple #2
0
def transform(iet, mode='basic', options=None):
    """
    Run an Iteration/Expression tree (IET) through the DLE to generate
    optimized C code.

    Parameters
    ----------
    iet : Node
        The root of the IET to be transformed.
    mode : str, optional
        The transformation mode.
        - ``noop``: Do nothing.
        - ``basic``: Add instructions to avoid denormal numbers and create elemental
                     functions for quicker JIT-compilation.
        - ``advanced``: 'basic', vectorization, loop blocking.
        - ``speculative``: Apply all of the 'advanced' transformations, plus other
                           transformations that might increase (or possibly decrease)
                           performance.
        Any other value is handed to ``CustomRewriter``.
    options : dict, optional
        - ``openmp``: Enable/disable OpenMP. Defaults to `configuration['openmp']`.
        - ``mpi``: Enable/disable MPI. Defaults to `configuration['mpi']`.
        - ``blockinner``: Enable/disable blocking of innermost loops. By default,
                          this is disabled to maximize SIMD vectorization. Pass True
                          to override this heuristic.
        - ``blockalways``: Pass True to unconditionally apply loop blocking, even when
                           the compiler heuristically thinks that it might not be
                           profitable and/or dangerous for performance.

    Returns
    -------
    The pair ``(iet, State(iet))`` when nothing is done (``noop``, ``None`` or
    an unknown mode); otherwise whatever the selected rewriter's ``run`` returns.
    """
    assert isinstance(iet, Node)

    # Start from the user-provided options, discarding (with a warning) the
    # ones the DLE does not know about
    options = options or {}
    params = options.copy()
    unknown = [name for name in options if name not in all_options]
    for name in unknown:
        dle_warning("Illegal DLE option '%s'" % name)
        params.pop(name)

    # Global options only fill the gaps: local values take precedence
    for name, value in configuration['dle-options'].items():
        params.setdefault(name, value)
    for name in ('openmp', 'mpi'):
        params.setdefault(name, configuration[name])

    # Requesting parallelism overrides 'noop': OpenMP must still be applied
    if mode == 'noop' and params['openmp'] is True:
        mode = 'openmp'

    # Nothing to do
    if mode is None or mode == 'noop':
        return iet, State(iet)

    # Preset pipeline
    if mode in default_modes:
        return default_modes[mode](params).run(iet)

    # Anything else is interpreted as a custom sequence of passes
    try:
        return CustomRewriter(mode, params).run(iet)
    except DLEException:
        dle_warning("Unknown transformer mode(s) %s" % mode)
        return iet, State(iet)
Exemple #3
0
 def _pipeline(self, state):
     """
     Run the YASK sequence of passes on ``state``.

     If ``yask`` is ``None``, a warning is emitted and the pipeline of the
     parent class is executed instead.
     """
     if yask is not None:
         self._avoid_denormals(state)
         self._yaskize(state)
         self._create_elemental_functions(state)
         return

     # YASK unavailable: defer to the parent's pipeline
     dle_warning("Cannot find YASK. Skipping DLE optimizations...")
     super(YaskRewriter, self)._pipeline(state)
Exemple #4
0
def nhyperthreads():
    """
    Return the number of logical CPUs per physical core.

    The logical CPU count is taken from ``configuration['cross-compile']``
    when that object provides ``cpu_count``, and from ``psutil`` otherwise.
    If the logical count is not a multiple of ``ncores()``, a warning is
    emitted and 1 is returned.
    """
    try:
        logical = configuration['cross-compile'].cpu_count(logical=True)
    except AttributeError:
        logical = psutil.cpu_count(logical=True)

    per_core, leftover = divmod(logical, ncores())
    if leftover > 0:
        dle_warning(
            "Couldn't detect number of hyperthreads per core, assuming 1")
        return 1
    return per_core
Exemple #5
0
    def _yaskize(self, state):
        """
        Create a YASK representation of this Iteration/Expression tree.

        For the innermost node of every Iteration tree found in
        ``state.nodes``, a YASK stencil solution and its grids are set up and
        each expression is translated with ``sympy2yask``; the resulting AST
        is printed in its simple (scalar) format. Expressions that cannot be
        translated are reported through ``dle_warning`` and skipped.

        The input nodes are returned untouched, as ``{'nodes': state.nodes}``.
        """

        dle_warning("Be patient! The YASK backend is still a WIP")
        dle_warning("This is the YASK AST that the Devito DLE can build")

        for node in state.nodes:
            for tree in retrieve_iteration_tree(node):
                candidate = tree[-1]

                # Set up the YASK solution
                soln = cfac.new_stencil_solution("solution")

                # Set up the YASK grids, one per symbol found in the candidate
                symbols = FindSymbols(mode='symbolics').visit(candidate)
                grids = {
                    g.name: soln.new_grid(g.name, *[str(i) for i in g.indices])
                    for g in symbols
                }

                # Vector folding API usage example
                soln.set_fold_len('x', 1)
                soln.set_fold_len('y', 1)
                soln.set_fold_len('z', 8)

                # Perform the translation on an expression basis
                transformer = sympy2yask(grids)
                expressions = [e for e in candidate.nodes if e.is_Expression]
                for i in expressions:
                    try:
                        ast = transformer(i.stencil)
                        # Scalar
                        print(ast.format_simple())

                        # AVX2 intrinsics
                        # print soln.format('avx2')

                        # AVX2 intrinsics to file
                        # import os
                        # path = os.path.join(os.environ.get('YASK_HOME', '.'), 'src')
                        # soln.write(os.path.join(path, 'stencil_code.hpp'), 'avx2')
                    except Exception as e:
                        # Best-effort translation: keep going, but say what was
                        # skipped. Unlike a bare ``except``, this lets
                        # KeyboardInterrupt and SystemExit propagate.
                        dle_warning("Cannot translate expression to YASK (%s: %s)"
                                    % (type(e).__name__, e))

        dle_warning("Falling back to basic DLE optimizations...")

        return {'nodes': state.nodes}
Exemple #6
0
def transform(node, mode='basic', options=None):
    """
    Run one or more Iteration/Expression trees through the DLE to generate
    highly optimized C code.

    :param node: The Iteration/Expression tree to be transformed, or an iterable
                 of Iteration/Expression trees.
    :param mode: A string selecting the optimization pipeline to be applied.
    :param options: A dictionary with additional information to drive the DLE.

    Accepted values for ``mode``: ::

        * 'noop': Do nothing.
        * 'basic': Add instructions to avoid denormal numbers and create elemental
                   functions for rapid JIT-compilation.
        * 'advanced': 'basic', vectorization, loop blocking.
        * 'speculative': Apply all of the 'advanced' transformations, plus other
                         transformations that might increase (or possibly decrease)
                         performance.

    Accepted keys for ``options``: ::

        * 'blockshape': The block shape for loop blocking (a tuple).
        * 'blockinner': By default, loop blocking is not applied to the innermost
                        dimension of an Iteration/Expression tree (to maximize
                        vectorization). Set this flag to True to override this
                        heuristic.
        * 'blockalways': Apply blocking even though the DLE thinks it's not
                         worthwhile applying it.

    :raises ValueError: if ``mode`` is neither None nor a string, or if ``node``
                        is neither a sequence nor a node.
    """
    # Validate the input
    if mode is not None and not isinstance(mode, str):
        raise ValueError("Parameter 'mode' should be a string, not %s." % type(mode))

    if isinstance(node, Sequence):
        assert all(n.is_Node for n in node)
        trees = list(node)
    elif node.is_Node:
        trees = [node]
    else:
        raise ValueError("Got illegal node of type %s." % type(node))

    # Local options first, minus the ones the DLE does not recognize
    options = options or {}
    params = options.copy()
    rejected = [name for name in options if name not in default_options]
    for name in rejected:
        dle_warning("Illegal DLE parameter '%s'" % name)
        params.pop(name)

    # Then the global options, then the defaults; neither overrides what is
    # already there
    for layer in (configuration['dle_options'], default_options):
        for name, value in layer.items():
            params.setdefault(name, value)

    # These two always come from the global configuration
    params['compiler'] = configuration['compiler']
    params['openmp'] = configuration['openmp']

    # Dispatch to the requested pipeline
    if mode is None or mode == 'noop':
        return State(trees)
    if mode in modes:
        return modes[mode](trees, params).run()
    try:
        return DevitoCustomRewriter(trees, mode, params).run()
    except DLEException:
        dle_warning("Unknown transformer mode(s) %s" % mode)
        return State(trees)
Exemple #7
0
    def _loop_blocking(self, state, **kwargs):
        """
        Apply loop blocking to :class:`Iteration` trees.

        Blocking is applied to parallel iteration trees. Heuristically, innermost
        dimensions are not blocked to maximize the trip count of the SIMD loops.

        Different heuristics may be specified by passing the keywords ``blockshape``
        and ``blockinner`` to the DLE. The former, a dictionary, is used to indicate
        a specific block size for each blocked dimension. For example, for the
        :class:`Iteration` tree: ::

            for i
              for j
                for k
                  ...

        one may provide ``blockshape = {i: 4, j: 7}``, in which case the
        two outer loops will be blocked, and the resulting 2-dimensional block will
        have size 4x7. The latter may be set to True to also block innermost parallel
        :class:`Iteration` objects.

        The ``blockalways`` option, if True, disables the heuristic that skips
        trees whose outermost Iteration is not sequential.

        :param state: Object whose ``nodes`` attribute is iterated over; each
                      node is processed independently.
        :param kwargs: Not used by this method.
        :returns: A dict with key ``'nodes'`` (the processed trees) and, only if
                  at least one Iteration was blocked, ``'arguments'`` (a list
                  of ``BlockingArg``) and ``'flags'`` (``'blocking'``).
        """
        exclude_innermost = not self.params.get('blockinner', False)
        ignore_heuristic = self.params.get('blockalways', False)

        # Maps each blocked Iteration to the Dimension used to iterate over its
        # blocks; shared across all nodes in /state.nodes/
        blocked = OrderedDict()
        processed = []
        for node in state.nodes:
            # Make sure loop blocking will span as many Iterations as possible
            fold = fold_blockable_tree(node, exclude_innermost)

            # Maps the root of each blockable nest to its blocked replacement
            mapper = {}
            for tree in retrieve_iteration_tree(fold):
                # Is the Iteration tree blockable ?
                iterations = [i for i in tree if i.is_Parallel]
                if exclude_innermost:
                    iterations = [
                        i for i in iterations if not i.is_Vectorizable
                    ]
                if len(iterations) <= 1:
                    # Fewer than two candidate Iterations: skip this tree
                    continue
                root = iterations[0]
                if not IsPerfectIteration().visit(root):
                    # Illegal/unsupported
                    continue
                if not tree[0].is_Sequential and not ignore_heuristic:
                    # Heuristic: avoid polluting the generated code with blocked
                    # nests (thus increasing JIT compilation time and affecting
                    # readability) if the blockable tree isn't embedded in a
                    # sequential loop (e.g., a timestepping loop)
                    continue

                # Decorate intra-block iterations with an IterationProperty
                TAG = tagger(len(mapper))

                # Build all necessary Iteration objects, individually. These will
                # subsequently be composed to implement loop blocking.
                inter_blocks = []
                intra_blocks = []
                remainders = []
                for i in iterations:
                    # Build Iteration over blocks
                    dim = blocked.setdefault(
                        i, Dimension("%s_block" % i.dim.name))
                    block_size = dim.symbolic_size
                    iter_size = i.dim.size or i.dim.symbolic_size
                    start = i.limits[0] - i.offsets[0]
                    finish = iter_size - i.offsets[1]
                    innersize = iter_size - (-i.offsets[0] + i.offsets[1])
                    # Trim the upper bound by the part of the iteration space
                    # that does not fill a whole block
                    finish = finish - (innersize % block_size)
                    inter_block = Iteration([],
                                            dim, [start, finish, block_size],
                                            properties=PARALLEL)
                    inter_blocks.append(inter_block)

                    # Build Iteration within a block
                    start = inter_block.dim
                    finish = start + block_size
                    intra_block = i._rebuild([],
                                             limits=[start, finish, 1],
                                             offsets=None,
                                             properties=i.properties +
                                             (TAG, ELEMENTAL))
                    intra_blocks.append(intra_block)

                    # Build unitary-increment Iteration over the 'leftover' region.
                    # This will be used for remainder loops, executed when any
                    # dimension size is not a multiple of the block size.
                    # It starts where the inter-block Iteration ends.
                    start = inter_block.limits[1]
                    finish = iter_size - i.offsets[1]
                    remainder = i._rebuild([],
                                           limits=[start, finish, 1],
                                           offsets=None)
                    remainders.append(remainder)

                # Build blocked Iteration nest
                blocked_tree = compose_nodes(inter_blocks + intra_blocks +
                                             [iterations[-1].nodes])

                # Build remainder Iterations: one nest for each non-empty subset
                # /c/ of the blocked dimensions; the dimensions in /c/ use their
                # remainder loop, the others their inter-/intra-block loops
                remainder_trees = []
                for n in range(len(iterations)):
                    for c in combinations([i.dim for i in iterations], n + 1):
                        # First all inter-block Iterations
                        nodes = [
                            b._rebuild(properties=b.properties + (REMAINDER, ))
                            for b, r in zip(inter_blocks, remainders)
                            if r.dim not in c
                        ]
                        # Then intra-block or remainder, for each dim (in order)
                        properties = (REMAINDER, TAG, ELEMENTAL)
                        for b, r in zip(intra_blocks, remainders):
                            handle = r if b.dim in c else b
                            nodes.append(
                                handle._rebuild(properties=properties))
                        nodes.extend([iterations[-1].nodes])
                        remainder_trees.append(compose_nodes(nodes))

                # Will replace with blocked loop tree
                mapper[root] = List(body=[blocked_tree] + remainder_trees)

            rebuilt = Transformer(mapper).visit(fold)

            # Finish unrolling any previously folded Iterations
            processed.append(unfold_blocked_tree(rebuilt))

        # All blocked dimensions
        if not blocked:
            # Nothing was blocked, so no extra arguments or flags are needed
            return {'nodes': processed}

        # Determine the block shape
        blockshape = self.params.get('blockshape')
        if not blockshape:
            # Use trivial heuristic for a suitable blockshape
            def heuristic(dim_size):
                ths = 8  # FIXME: This really needs to be improved
                return ths if dim_size > ths else 1

            # NOTE: the callable itself (not a number) is stored for each key
            blockshape = {k: heuristic for k in blocked.keys()}
        else:
            try:
                # Pair the provided entries with the blocked Iterations, in order
                nitems, nrequired = len(blockshape), len(blocked)
                blockshape = {k: v for k, v in zip(blocked, blockshape)}
                if nitems > nrequired:
                    dle_warning("Provided 'blockshape' has more entries than "
                                "blocked loops; dropping entries ...")
                if nitems < nrequired:
                    dle_warning("Provided 'blockshape' has fewer entries than "
                                "blocked loops; dropping dimensions ...")
            except TypeError:
                # /blockshape/ does not support len() (presumably a scalar):
                # use it for the first blocked Iteration only
                blockshape = {list(blocked)[0]: blockshape}
            # Blocked Iterations left without a block size get None
            blockshape.update(
                {k: None
                 for k in blocked.keys() if k not in blockshape})

        # Track any additional arguments required to execute /state.nodes/
        arguments = [
            BlockingArg(v, k, blockshape[k]) for k, v in blocked.items()
        ]

        return {
            'nodes': processed,
            'arguments': arguments,
            'flags': 'blocking'
        }
Exemple #8
0
def transform(node, mode='basic', options=None):
    """
    Transform Iteration/Expression trees to generate highly optimized C code.

    :param node: The Iteration/Expression tree to be transformed, or an iterable
                 of Iteration/Expression trees.
    :param mode: Drive the tree transformation. ``mode`` is a string indicating
                 a certain optimization pipeline. The following values are accepted: ::

                     * 'noop': Do nothing.
                     * 'basic': Add instructions to avoid denormal numbers and create
                                elemental functions for rapid JIT-compilation.
                     * 'advanced': 'basic', vectorization, loop blocking.
                     * '3D-advanced': Like 'advanced', but attempt 3D loop blocking.
                     * 'speculative': Apply all of the 'advanced' transformations,
                                      plus other transformations that might increase
                                      (or possibly decrease) performance.
                     * 'yask': Optimize by offloading to the YASK optimizer. Still
                               work-in-progress; should only be used by developers.
    :param options: A dictionary with additional information to drive the DLE. The
                    following values are accepted: ::

                        * 'blockshape': A tuple representing the shape of a block created
                                        by loop blocking.
                        * 'blockinner': By default, loop blocking is not applied to the
                                        innermost dimension of an Iteration/Expression
                                        tree to maximize vectorization. Set this flag to
                                        True to override this heuristic.
    :returns: ``State(node)`` when nothing is done ('noop', None, or an unknown
              mode); otherwise whatever the selected rewriter's ``run`` returns.
    :raises ValueError: if ``mode`` is neither None nor a string, or if ``node``
                        is neither a sequence nor a node.
    """
    from devito.parameters import configuration

    # Check input parameters
    if not (mode is None or isinstance(mode, str)):
        raise ValueError("Parameter 'mode' should be a string, not %s." %
                         type(mode))

    if isinstance(node, Sequence):
        assert all(n.is_Node for n in node)
        node = list(node)
    elif node.is_Node:
        node = [node]
    else:
        raise ValueError("Got illegal node of type %s." % type(node))

    # Parse options
    options = options or {}
    params = options.copy()
    for i in options:
        if i not in ('blockshape', 'blockinner'):
            dle_warning("Illegal DLE parameter '%s'" % i)
            params.pop(i)
    params['compiler'] = configuration['compiler']
    params['openmp'] = configuration['openmp']

    # '3D-advanced' is 'advanced' with blocking of the innermost loop enabled.
    # Resolve the alias *before* dispatching: when this was a branch of the
    # if/elif chain below, the function fell off the end and returned None
    # without running any rewriter.
    if mode == '3D-advanced':
        params['blockinner'] = True
        mode = 'advanced'

    # Process the Iteration/Expression tree through the DLE
    if mode is None or mode == 'noop':
        return State(node)
    elif mode not in modes:
        try:
            rewriter = DevitoCustomRewriter(node, mode, params)
            return rewriter.run()
        except DLEException:
            dle_warning("Unknown transformer mode(s) %s" % mode)
            return State(node)
    else:
        rewriter = modes[mode](node, params)
        return rewriter.run()
def transform(iet, mode='advanced', options=None):
    """
    Run an Iteration/Expression tree (IET) through the DLE to generate
    optimized C code.

    Parameters
    ----------
    iet : Node
        The root of the IET to be transformed.
    mode : str, optional
        The transformation mode.
        - ``noop``: Do nothing.
        - ``advanced``: flush denormals, vectorization, loop blocking, OpenMP-related
                        optimizations (collapsing, nested parallelism, ...), MPI-related
                        optimizations (aggregation of communications, reshuffling of
                        communications, ...).
        - ``speculative``: Apply all of the 'advanced' transformations, plus other
                           transformations that might increase (or possibly decrease)
                           performance.
        Any other value is first looked up among the preset modes and then
        handed to ``CustomRewriter``.
    options : dict, optional
        - ``openmp``: Enable/disable OpenMP. Defaults to `configuration['openmp']`.
        - ``mpi``: Enable/disable MPI. Defaults to `configuration['mpi']`.
        - ``blockinner``: Enable/disable blocking of innermost loops. By default,
                          this is disabled to maximize SIMD vectorization. Pass True
                          to override this heuristic.
        - ``blockalways``: Pass True to unconditionally apply loop blocking, even when
                           the compiler heuristically thinks that it might not be
                           profitable and/or dangerous for performance.

    Returns
    -------
    The pair ``(iet, State(iet))`` when nothing is done (``noop``, ``None`` or
    an unknown mode); otherwise whatever the selected rewriter's ``run`` returns.
    """
    assert isinstance(iet, Node)

    # Defaults, taken from the global configuration
    params = {
        'blockinner': configuration['dle-options'].get('blockinner', False),
        'blockalways': configuration['dle-options'].get('blockalways', False),
        'openmp': configuration['openmp'],
        'mpi': configuration['mpi'],
    }

    # User-provided options replace the defaults; unknown ones are ignored
    for name, value in (options or {}).items():
        if name in params:
            params[name] = value
        else:
            dle_warning("Illegal DLE option '%s'" % name)

    # Requesting parallelism overrides 'noop': OpenMP must still be applied
    if mode == 'noop' and params['openmp'] is True:
        mode = 'openmp'

    # The platform the optimizations are targeted at
    platform = configuration['platform']

    # No-op case
    if mode is None or mode == 'noop':
        return iet, State(iet)

    # Typical use case: a preset rewriter for this (platform, mode) pair
    try:
        return modes.fetch(platform, mode)(params, platform).run(iet)
    except KeyError:
        pass

    # Fallback: `mode` is a specific sequence of transformation passes
    try:
        return CustomRewriter(mode, params, platform).run(iet)
    except DLEException:
        dle_warning("Unknown transformer mode(s) %s" % mode)
        return iet, State(iet)
Exemple #10
0
    def _loop_blocking(self, nodes, state):
        """
        Apply loop blocking to PARALLEL :class:`Iteration` trees.

        The ``blockinner`` option (default False) also allows blocking of
        vectorizable Iterations; ``blockalways`` (default False) disables the
        heuristic that skips trees whose root is not sequential; ``blockshape``
        optionally provides the block sizes.

        :param nodes: The tree(s) to be blocked, passed to
                      ``fold_blockable_tree``.
        :param state: Not used by this method.
        :returns: A 2-tuple ``(processed, extra)``. ``extra`` is an empty dict
                  if nothing was blocked; otherwise it has keys ``'arguments'``
                  (a list of ``BlockingArg``) and ``'flags'`` (``'blocking'``).
        """
        exclude_innermost = not self.params.get('blockinner', False)
        ignore_heuristic = self.params.get('blockalways', False)

        # Make sure loop blocking will span as many Iterations as possible
        fold = fold_blockable_tree(nodes, exclude_innermost)

        # /mapper/: root of each blockable nest -> its blocked replacement
        # /blocked/: each blocked Iteration -> the Dimension iterating over blocks
        mapper = {}
        blocked = OrderedDict()
        for tree in retrieve_iteration_tree(fold):
            # Is the Iteration tree blockable ?
            iterations = [i for i in tree if i.is_Parallel]
            if exclude_innermost:
                iterations = [i for i in iterations if not i.is_Vectorizable]
            if len(iterations) <= 1:
                # Fewer than two candidate Iterations: skip this tree
                continue
            root = iterations[0]
            if not IsPerfectIteration().visit(root):
                # Illegal/unsupported
                continue
            if not tree.root.is_Sequential and not ignore_heuristic:
                # Heuristic: avoid polluting the generated code with blocked
                # nests (thus increasing JIT compilation time and affecting
                # readability) if the blockable tree isn't embedded in a
                # sequential loop (e.g., a timestepping loop)
                continue

            # Decorate intra-block iterations with an IterationProperty
            TAG = tagger(len(mapper))

            # Build all necessary Iteration objects, individually. These will
            # subsequently be composed to implement loop blocking.
            inter_blocks = []
            intra_blocks = []
            remainders = []
            for i in iterations:
                # The number of nests blocked so far is part of the name
                name = "%s%d_block" % (i.dim.name, len(mapper))

                # Build Iteration over blocks
                dim = blocked.setdefault(i, Dimension(name=name))
                bsize = dim.symbolic_size
                bstart = i.limits[0]
                binnersize = i.symbolic_extent + (i.offsets[1] - i.offsets[0])
                # Trim the upper bound by the part of the iteration space that
                # does not fill a whole block
                bfinish = i.dim.symbolic_end - (binnersize % bsize)
                inter_block = Iteration([],
                                        dim, [bstart, bfinish, bsize],
                                        offsets=i.offsets,
                                        properties=PARALLEL)
                inter_blocks.append(inter_block)

                # Build Iteration within a block
                limits = (dim, dim + bsize - 1, 1)
                intra_block = i._rebuild([],
                                         limits=limits,
                                         offsets=(0, 0),
                                         properties=i.properties +
                                         (TAG, ELEMENTAL))
                intra_blocks.append(intra_block)

                # Build unitary-increment Iteration over the 'leftover' region.
                # This will be used for remainder loops, executed when any
                # dimension size is not a multiple of the block size.
                # It starts right after the last point of the inter-block loop.
                remainder = i._rebuild(
                    [],
                    limits=[bfinish + 1, i.dim.symbolic_end, 1],
                    offsets=(i.offsets[1], i.offsets[1]))
                remainders.append(remainder)

            # Build blocked Iteration nest
            blocked_tree = compose_nodes(inter_blocks + intra_blocks +
                                         [iterations[-1].nodes])

            # Build remainder Iterations: one nest for each non-empty subset /c/
            # of the blocked dimensions; the dimensions in /c/ use their
            # remainder loop, the others their inter-/intra-block loops
            remainder_trees = []
            for n in range(len(iterations)):
                for c in combinations([i.dim for i in iterations], n + 1):
                    # First all inter-block Iterations
                    # NOTE: this rebinds the /nodes/ parameter, which is no
                    # longer needed once /fold/ has been computed
                    nodes = [
                        b._rebuild(properties=b.properties + (REMAINDER, ))
                        for b, r in zip(inter_blocks, remainders)
                        if r.dim not in c
                    ]
                    # Then intra-block or remainder, for each dim (in order)
                    properties = (REMAINDER, TAG, ELEMENTAL)
                    for b, r in zip(intra_blocks, remainders):
                        handle = r if b.dim in c else b
                        nodes.append(handle._rebuild(properties=properties))
                    nodes.extend([iterations[-1].nodes])
                    remainder_trees.append(compose_nodes(nodes))

            # Will replace with blocked loop tree
            mapper[root] = List(body=[blocked_tree] + remainder_trees)

        rebuilt = Transformer(mapper).visit(fold)

        # Finish unrolling any previously folded Iterations
        processed = unfold_blocked_tree(rebuilt)

        # All blocked dimensions
        if not blocked:
            # Nothing was blocked, so no extra arguments or flags are needed
            return processed, {}

        # Determine the block shape
        blockshape = self.params.get('blockshape')
        if not blockshape:
            # Use trivial heuristic for a suitable blockshape
            def heuristic(dim_size):
                ths = 8  # FIXME: This really needs to be improved
                return ths if dim_size > ths else 1

            # NOTE: the callable itself (not a number) is stored for each key
            blockshape = {k: heuristic for k in blocked.keys()}
        else:
            try:
                # Pair the provided entries with the blocked Iterations, in order
                nitems, nrequired = len(blockshape), len(blocked)
                blockshape = {k: v for k, v in zip(blocked, blockshape)}
                if nitems > nrequired:
                    dle_warning("Provided 'blockshape' has more entries than "
                                "blocked loops; dropping entries ...")
                if nitems < nrequired:
                    dle_warning("Provided 'blockshape' has fewer entries than "
                                "blocked loops; dropping dimensions ...")
            except TypeError:
                # /blockshape/ does not support len() (presumably a scalar):
                # use it for the first blocked Iteration only
                blockshape = {list(blocked)[0]: blockshape}
            # Blocked Iterations left without a block size get None
            blockshape.update(
                {k: None
                 for k in blocked.keys() if k not in blockshape})

        # Track any additional arguments required to execute /state.nodes/
        arguments = [
            BlockingArg(v, k, blockshape[k]) for k, v in blocked.items()
        ]

        return processed, {'arguments': arguments, 'flags': 'blocking'}
Exemple #11
0
def transform(iet, mode='advanced', options=None):
    """
    Transform Iteration/Expression trees (IET) to generate optimized C code.

    Parameters
    ----------
    iet : Node
        The root of the IET to be transformed.
    mode : str, optional
        The transformation mode.
        - ``noop``: Do nothing.
        - ``advanced``: flush denormals, vectorization, loop blocking, OpenMP-related
                        optimizations (collapsing, nested parallelism, ...), MPI-related
                        optimizations (aggregation of communications, reshuffling of
                        communications, ...).
        - ``speculative``: Apply all of the 'advanced' transformations, plus other
                           transformations that might increase (or possibly decrease)
                           performance.
        Any other value is handed to ``CustomRewriter`` as a specific sequence
        of transformation passes.
    options : dict, optional
        - ``openmp``: Enable/disable OpenMP. Defaults to `configuration['openmp']`.
        - ``mpi``: Enable/disable MPI. Defaults to `configuration['mpi']`.
        - ``blockinner``: Enable/disable blocking of innermost loops. By default,
                          this is disabled to maximize SIMD vectorization. Pass True
                          to override this heuristic.
        - ``blockalways``: Pass True to unconditionally apply loop blocking, even when
                           the compiler heuristically thinks that it might not be
                           profitable and/or dangerous for performance.
        Unrecognized keys trigger a warning and are ignored.

    Returns
    -------
    ``(iet, State(iet))`` when nothing is applied (``mode`` is None/'noop', or
    the mode is unknown); otherwise whatever the selected rewriter's ``run``
    returns.
    """
    assert isinstance(iet, Node)

    # Default options
    dle_options = configuration['dle-options']
    params = {
        'blockinner': dle_options.get('blockinner', False),
        'blockalways': dle_options.get('blockalways', False),
        'openmp': configuration['openmp'],
        'mpi': configuration['mpi'],
    }

    # Parse input options (potentially replacing defaults)
    for k, v in (options or {}).items():
        if k not in params:
            dle_warning("Illegal DLE option '%s'" % k)
        else:
            params[k] = v

    # Force OpenMP if parallelism was requested, even though mode is 'noop'
    if mode == 'noop' and params['openmp'] is True:
        mode = 'openmp'

    # What is the target platform for which the optimizations are applied?
    platform = configuration['platform']

    # Run the DLE
    if mode is None or mode == 'noop':
        # No-op case
        return iet, State(iet)

    # Preset rewriter (typical use case). Only the lookup is guarded: a
    # KeyError raised while a rewriter is actually running is a genuine error
    # and must not be mistaken for "no such preset".
    try:
        preset = modes.fetch(platform, mode)
    except KeyError:
        pass
    else:
        return preset(params, platform).run(iet)

    try:
        # Fallback: custom rewriter -- `mode` is a specific sequence of
        # transformation passes
        rewriter = CustomRewriter(mode, params, platform)
        return rewriter.run(iet)
    except DLEException:
        # Wrap `mode` in a 1-tuple: it may itself be a tuple of pass names,
        # which `%` would otherwise unpack as multiple format arguments
        dle_warning("Unknown transformer mode(s) %s" % (mode,))
        return iet, State(iet)
Exemple #12
0
    def _loop_blocking(self, state, **kwargs):
        """
        Apply loop blocking to :class:`Iteration` trees.

        By default, the blocked :class:`Iteration` objects and the block size are
        determined heuristically. The heuristic consists of searching the deepest
        Iteration/Expression tree and blocking all dimensions except:

            * The innermost (eg, to retain SIMD vectorization);
            * Those dimensions inducing loop-carried dependencies.

        The caller may take over the heuristic through ``kwargs['blocking']``,
        a dictionary indicating the block size of each blocked dimension. For
        example, for the :class:`Iteration` tree below: ::

            for i
              for j
                for k
                  ...

        one may pass in ``kwargs['blocking'] = {i: 4, j: 7}``, in which case the
        two outer loops would be blocked, and the resulting 2-dimensional block
        would be of size 4x7.
        """
        Region = namedtuple('Region', 'main leftover')

        blocked = OrderedDict()
        processed = []
        for node in state.nodes:
            mapper = {}
            for tree in retrieve_iteration_tree(node):
                # Is the Iteration tree blockable ?
                iterations = [i for i in tree if i.is_Parallel]
                if 'blockinner' not in self.params:
                    iterations = [
                        i for i in iterations if not i.is_Vectorizable
                    ]
                if not iterations:
                    continue
                root = iterations[0]
                if not IsPerfectIteration().visit(root):
                    continue

                # Construct the blocked loop nest, as well as all necessary
                # remainder loops
                regions = OrderedDict()
                blocked_iterations = []
                for i in iterations:
                    # Build Iteration over blocks
                    dim = blocked.setdefault(
                        i, Dimension("%s_block" % i.dim.name))
                    block_size = dim.symbolic_size
                    iter_size = i.dim.size or i.dim.symbolic_size
                    start = i.limits[0] - i.offsets[0]
                    finish = iter_size - i.offsets[1]
                    finish = finish - ((finish - i.offsets[1]) % block_size)
                    inter_block = Iteration([],
                                            dim, [start, finish, block_size],
                                            properties=as_tuple('parallel'))

                    # Build Iteration within a block
                    start = inter_block.dim
                    finish = start + block_size
                    properties = 'vector-dim' if i.is_Vectorizable else None
                    intra_block = Iteration([],
                                            i.dim, [start, finish, 1],
                                            i.index,
                                            properties=as_tuple(properties))

                    blocked_iterations.append((inter_block, intra_block))

                    # Build unitary-increment Iteration over the 'main' region
                    # (the one blocked); necessary to generate code iterating over
                    # non-blocked ("remainder") iterations.
                    start = inter_block.limits[0]
                    finish = inter_block.limits[1]
                    main = Iteration([],
                                     i.dim, [start, finish, 1],
                                     i.index,
                                     properties=i.properties)

                    # Build unitary-increment Iteration over the 'leftover' region:
                    # again as above, this may be necessary when the dimension size
                    # is not a multiple of the block size.
                    start = inter_block.limits[1]
                    finish = iter_size - i.offsets[1]
                    leftover = Iteration([],
                                         i.dim, [start, finish, 1],
                                         i.index,
                                         properties=i.properties)

                    regions[i] = Region(main, leftover)

                blocked_tree = list(flatten(zip(*blocked_iterations)))
                blocked_tree = compose_nodes(blocked_tree +
                                             [iterations[-1].nodes])

                # Build remainder loops
                remainder_tree = []
                for n in range(len(iterations)):
                    for i in combinations(iterations, n + 1):
                        nodes = [
                            v.leftover if k in i else v.main
                            for k, v in regions.items()
                        ]
                        nodes += [iterations[-1].nodes]
                        remainder_tree.append(compose_nodes(nodes))

                # Will replace with blocked loop tree
                mapper[root] = List(body=[blocked_tree] + remainder_tree)

            rebuilt = Transformer(mapper).visit(node)

            processed.append(rebuilt)

        # All blocked dimensions
        if not blocked:
            return {'nodes': processed}

        # Determine the block shape
        blockshape = self.params.get('blockshape')
        if not blockshape:
            # Use trivial heuristic for a suitable blockshape
            def heuristic(dim_size):
                ths = 8  # FIXME: This really needs to be improved
                return ths if dim_size > ths else 1

            blockshape = {k: heuristic for k in blocked.keys()}
        else:
            try:
                nitems, nrequired = len(blockshape), len(blocked)
                blockshape = {k: v for k, v in zip(blocked, blockshape)}
                if nitems > nrequired:
                    dle_warning("Provided 'blockshape' has more entries than "
                                "blocked loops; dropping entries ...")
                if nitems < nrequired:
                    dle_warning("Provided 'blockshape' has fewer entries than "
                                "blocked loops; dropping dimensions ...")
            except TypeError:
                blockshape = {list(blocked)[0]: blockshape}
            blockshape.update(
                {k: None
                 for k in blocked.keys() if k not in blockshape})

        # Track any additional arguments required to execute /state.nodes/
        arguments = [
            BlockingArg(v, k, blockshape[k]) for k, v in blocked.items()
        ]

        return {
            'nodes': processed,
            'arguments': arguments,
            'flags': 'blocking'
        }