Ejemplo n.º 1
0
def find_offloadable_trees(nodes):
    """
    Return the trees within ``nodes`` that can be computed by YASK.

    A tree is "offloadable to YASK" if it is embedded in a time stepping loop
    *and* all of the grids accessed by the enclosed equations are homogeneous
    (i.e., same dimensions and data type).
    """
    offloadable = []
    for tree in retrieve_iteration_tree(nodes):
        parallel = filter_iterations(tree, lambda i: i.is_Parallel)
        if not parallel:
            # Cannot offload non-parallel loops
            continue
        if not (IsPerfectIteration().visit(tree)
                and all(i.is_Expression for i in tree[-1].nodes)):
            # Don't know how to offload this Iteration/Expression to YASK
            continue
        functions = flatten(i.functions for i in tree[-1].nodes)
        keys = set((i.grid, i.dtype) for i in functions if i.is_TimeFunction)
        if len(keys) == 0:
            continue
        elif len(keys) > 1:
            exit("Cannot handle Operators w/ heterogeneous grids")
        grid, dtype = keys.pop()
        # Is this a "complete" tree iterating over the entire grid?
        dims = [i.dim for i in tree]
        if all(i in dims for i in grid.dimensions) and\
                any(j in dims for j in [grid.time_dim, grid.stepping_dim]):
            offloadable.append((tree, grid, dtype))
    return offloadable
Ejemplo n.º 2
0
def find_offloadable_trees(nodes):
    """
    Return the trees within ``nodes`` that can be computed by YASK.

    A tree is "offloadable to YASK" if it is embedded in a time stepping loop
    *and* all of the grids accessed by the enclosed equations are homogeneous
    (i.e., same dimensions, shape, data type).
    """
    offloadable = []
    for tree in retrieve_iteration_tree(nodes):
        parallel = filter_iterations(tree, lambda i: i.is_Parallel)
        if not parallel:
            # Cannot offload non-parallel loops
            continue
        if not (IsPerfectIteration().visit(tree)
                and all(i.is_Expression for i in tree[-1].nodes)):
            # Don't know how to offload this Iteration/Expression to YASK
            continue
        functions = flatten(i.functions for i in tree[-1].nodes)
        keys = set((i.indices, i.shape, i.dtype) for i in functions
                   if i.is_TimeFunction)
        if len(keys) == 0:
            continue
        elif len(keys) > 1:
            exit("Cannot handle Operators w/ heterogeneous grids")
        dimensions, shape, dtype = keys.pop()
        if len(dimensions) == len(tree) and\
                all(i.dim == j for i, j in zip(tree, dimensions)):
            # Detected a "full" Iteration/Expression tree (over both
            # time and space dimensions)
            offloadable.append((tree, dimensions, shape, dtype))
    return offloadable
Ejemplo n.º 3
0
 def _ncollapse(self, root, candidates):
     # Heuristic: if at least two parallel loops are available and the
     # physical core count is greater than COLLAPSE, then omp-collapse them
     nparallel = len(candidates)
     isperfect = IsPerfectIteration().visit(root)
     if ncores() < Ompizer.COLLAPSE or nparallel < 2 or not isperfect:
         return 1
     else:
         return nparallel
Ejemplo n.º 4
0
def fold_blockable_tree(iet, blockinner=True):
    """
    Create IterationFolds from sequences of nested Iterations.
    """
    mapper = {}
    for k, sequence in FindAdjacent(Iteration).visit(iet).items():
        # Group based on Dimension
        groups = []
        for subsequence in sequence:
            for _, v in groupby(subsequence, lambda i: i.dim):
                i = list(v)
                if len(i) >= 2:
                    groups.append(i)
        for i in groups:
            # Pre-condition: they all must be perfect iterations
            if any(not IsPerfectIteration().visit(j) for j in i):
                continue
            # Only retain consecutive trees having same depth
            trees = [retrieve_iteration_tree(j)[0] for j in i]
            handle = []
            for j in trees:
                if len(j) != len(trees[0]):
                    break
                handle.append(j)
            trees = handle
            if not trees:
                continue
            # Check foldability
            pairwise_folds = list(zip(*reversed(trees)))
            if any(not is_foldable(j) for j in pairwise_folds):
                continue
            # Maybe heuristically exclude innermost Iteration
            if blockinner is False:
                pairwise_folds = pairwise_folds[:-1]
            # Perhaps there's nothing to fold
            if len(pairwise_folds) == 0:
                continue
            # TODO: we do not currently support blocking if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(
                FindNodes(Expression).visit(j.root) for j in trees[:-1])
            if any(j.write.is_Input for j in exprs):
                continue
            # Perform folding
            for j in pairwise_folds:
                r, remainder = j[0], j[1:]
                folds = [(tuple(y - x for x, y in zip(i.offsets, r.offsets)),
                          i.nodes) for i in remainder]
                mapper[r] = IterationFold(folds=folds, **r.args)
                for k in remainder:
                    mapper[k] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    iet = Transformer(mapper, nested=True).visit(iet)

    return iet
Ejemplo n.º 5
0
 def _pragma_for(self, root, candidates):
     # Heuristic: if at least two parallel loops are available and the
     # physical core count is greater than COLLAPSE, then omp-collapse them
     nparallel = len(candidates)
     if (psutil.cpu_count(logical=False) < Ompizer.COLLAPSE
             or nparallel < 2
             or not IsPerfectIteration().visit(root)):
         return self.lang['for']
     else:
         return self.lang['collapse'](nparallel)
Ejemplo n.º 6
0
def fold_blockable_tree(node, exclude_innermost=False):
    """
    Create IterationFolds from sequences of nested Iterations.
    """
    found = FindAdjacent(Iteration).visit(node)

    mapper = {}
    for k, v in found.items():
        for i in v:
            # Pre-condition: they all must be perfect iterations
            assert len(i) > 1
            if any(not IsPerfectIteration().visit(j) for j in i):
                continue
            # Only retain consecutive trees having same depth
            trees = [retrieve_iteration_tree(j)[0] for j in i]
            handle = []
            for j in trees:
                if len(j) != len(trees[0]):
                    break
                handle.append(j)
            trees = handle
            if not trees:
                continue
            # Check foldability
            pairwise_folds = list(zip(*reversed(trees)))
            if any(not is_foldable(j) for j in pairwise_folds):
                continue
            # Maybe heuristically exclude innermost Iteration
            if exclude_innermost is True:
                pairwise_folds = pairwise_folds[:-1]
            # Perhaps there's nothing to fold
            if len(pairwise_folds) == 1:
                continue
            # TODO: we do not currently support blocking if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(
                FindNodes(Expression).visit(j.root) for j in trees[:-1])
            if any(j.write.is_Input for j in exprs):
                continue
            # Perform folding
            for j in pairwise_folds:
                root, remainder = j[0], j[1:]
                folds = [(tuple(y - x
                                for x, y in zip(i.offsets, root.offsets)),
                          i.nodes) for i in remainder]
                mapper[root] = IterationFold(folds=folds, **root.args)
                for k in remainder:
                    mapper[k] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    processed = Transformer(mapper, nested=True).visit(node)

    return processed
Ejemplo n.º 7
0
def fold_blockable_tree(node, exclude_innermost=False):
    """
    Create :class:`IterationFold`s from sequences of nested :class:`Iteration`.
    """
    found = FindAdjacentIterations().visit(node)
    found.pop('seen_iteration')

    mapper = {}
    for k, v in found.items():
        for i in v:
            # Pre-condition: they all must be perfect iterations
            assert len(i) > 1
            if any(not IsPerfectIteration().visit(j) for j in i):
                continue
            # Only retain consecutive trees having same depth
            trees = [retrieve_iteration_tree(j)[0] for j in i]
            handle = []
            for j in trees:
                if len(j) != len(trees[0]):
                    break
                handle.append(j)
            trees = handle
            if not trees:
                continue
            # Check foldability
            pairwise_folds = list(zip(*reversed(trees)))
            if any(not is_foldable(j) for j in pairwise_folds):
                continue
            # Maybe heuristically exclude innermost Iteration
            if exclude_innermost is True:
                pairwise_folds = pairwise_folds[:-1]
            # Perhaps there's nothing to fold
            if len(pairwise_folds) == 1:
                continue
            # Perform folding
            for j in pairwise_folds:
                root, remainder = j[0], j[1:]
                folds = [(tuple(y - x
                                for x, y in zip(i.offsets, root.offsets)),
                          i.nodes) for i in remainder]
                mapper[root] = IterationFold(folds=folds, **root.args)
                for k in remainder:
                    mapper[k] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    processed = NestedTransformer(mapper).visit(node)

    return processed
Ejemplo n.º 8
0
def unfold_blocked_tree(node):
    """
    Unfold nested IterationFolds.

    Examples
    --------

    Given a section of Iteration/Expression tree as below: ::

        for i = 1 to N-1  // folded
          for j = 1 to N-1  // folded
            foo1()

    Assuming a fold with offset 1 in both /i/ and /j/ and body ``foo2()``, create: ::

        for i = 1 to N-1
          for j = 1 to N-1
            foo1()
        for i = 2 to N-2
          for j = 2 to N-2
            foo2()
    """
    # Search the unfolding candidates
    candidates = []
    for tree in retrieve_iteration_tree(node):
        handle = tuple(i for i in tree if i.is_IterationFold)
        if handle:
            # Sanity check
            assert IsPerfectIteration().visit(handle[0])
            candidates.append(handle)

    # Perform unfolding
    tag = ntags()
    mapper = {}
    for tree in candidates:
        trees = list(zip(*[i.unfold() for i in tree]))
        # Update tag
        for i, _tree in enumerate(list(trees)):
            trees[i] = tuple(j.retag(tag + i) for j in _tree)
        trees = optimize_unfolded_tree(trees[:-1], trees[-1])
        mapper[tree[0]] = List(body=trees)

    # Insert the unfolded Iterations in the Iteration/Expression tree
    processed = Transformer(mapper).visit(node)

    return processed
Ejemplo n.º 9
0
    def test_consistency_coupled_w_ofs(self, exprs, ti0, ti1, ti3):
        """
        Test that no matter what is the order in which the equations are
        provided to an Operator, the resulting loop nest is the same.
        The array accesses in the equations may or may not use offsets;
        these impact the loop bounds, but not the resulting tree
        structure.
        """
        eq1, eq2, eq3 = EVAL(exprs, ti0.base, ti1.base, ti3.base)
        op1 = Operator([eq1, eq2, eq3], dse='noop', dle='noop')
        op2 = Operator([eq2, eq1, eq3], dse='noop', dle='noop')
        op3 = Operator([eq3, eq2, eq1], dse='noop', dle='noop')

        trees = [retrieve_iteration_tree(i) for i in [op1, op2, op3]]
        assert all(len(i) == 1 for i in trees)
        trees = [i[0] for i in trees]
        for tree in trees:
            assert IsPerfectIteration().visit(tree[0])
            exprs = FindNodes(Expression).visit(tree[-1])
            assert len(exprs) == 3
Ejemplo n.º 10
0
    def test_consistency_coupled_wo_ofs(self, tu, tv, ti0, t0, t1):
        """
        Test that no matter what is the order in which the equations are
        provided to an Operator, the resulting loop nest is the same.
        None of the array accesses in the equations use offsets.
        """
        eq1 = Eq(tu, tv*ti0*t0 + ti0*t1)
        eq2 = Eq(ti0, tu + t0*3.)
        eq3 = Eq(tv, ti0*tu)
        op1 = Operator([eq1, eq2, eq3], dse='noop', dle='noop')
        op2 = Operator([eq2, eq1, eq3], dse='noop', dle='noop')
        op3 = Operator([eq3, eq2, eq1], dse='noop', dle='noop')

        trees = [retrieve_iteration_tree(i) for i in [op1, op2, op3]]
        assert all(len(i) == 1 for i in trees)
        trees = [i[0] for i in trees]
        for tree in trees:
            assert IsPerfectIteration().visit(tree[0])
            exprs = FindNodes(Expression).visit(tree[-1])
            assert len(exprs) == 3
Ejemplo n.º 11
0
def detect_innermost_unitstride(tree, deps_graph, mapper=None):
    """
    Update ``mapper``, a dictionary from :class:`Iteration`s to
    :class:`IterationProperty`s, by annotating the innermost Iteration as
    vectorizable if all array accesses along its dimension turn out to
    be unit-strided.
    """
    if mapper is None:
        mapper = OrderedDict()
    innermost = tree[-1]
    if not IsPerfectIteration().visit(innermost):
        return mapper
    if len(tree) == 1 or SEQUENTIAL in mapper.get(tree[-2], []):
        # Heuristic: there should be at least an outer parallel Iteration
        # to mark /innermost/ as vectorizable, otherwise it is preferable
        # to save it for shared-memory parallelism
        return mapper
    is_US = True
    for k, v in deps_graph.items():
        is_US &= all(k.indices[-1] == i.indices[-1] for i in v)
    if is_US or PARALLEL in mapper.get(innermost, []):
        mapper.setdefault(innermost, []).append(VECTOR)
    return mapper
Ejemplo n.º 12
0
 def _ncollapse(self, root, candidates):
     # The OpenMP specification forbids collapsed loops to use iteration variables
     # in initializer expressions. For example, the following is forbidden:
     #
     # #pragma omp ... collapse(2)
     # for (int i = ... )
     #   for (int j = i ...)
     #     ...
     #
     # Below, we make sure this won't happen
     for n, i in enumerate(candidates):
         if any(j.dim in i.symbolic_min.free_symbols
                for j in candidates[:n]):
             break
     candidates = candidates[:n]
     # Heuristic: if at least two parallel loops are available and the
     # physical core count is greater than COLLAPSE, then omp-collapse them
     nparallel = len(candidates)
     isperfect = IsPerfectIteration().visit(root)
     if ncores() < Ompizer.COLLAPSE or nparallel < 2 or not isperfect:
         return 1
     else:
         return nparallel
Ejemplo n.º 13
0
    def make_blocking(self, iet):
        """
        Apply loop blocking to PARALLEL Iteration trees.
        """
        # Make sure loop blocking will span as many Iterations as possible
        iet = fold_blockable_tree(iet, self.blockinner)

        mapper = {}
        efuncs = []
        block_dims = []
        for tree in retrieve_iteration_tree(iet):
            # Is the Iteration tree blockable ?
            iterations = filter_iterations(tree, lambda i: i.is_Parallel and i.is_Affine)
            if not self.blockinner:
                iterations = iterations[:-1]
            if len(iterations) <= 1:
                continue
            root = iterations[0]
            if not self.blockalways:
                # Heuristically bypass loop blocking if we think `tree`
                # won't be computationally expensive. This will help with code
                # size/readbility, JIT time, and auto-tuning time
                if not (tree.root.is_Sequential or iet.is_Callable):
                    # E.g., not inside a time-stepping Iteration
                    continue
                if any(i.dim.is_Sub and i.dim.local for i in tree):
                    # At least an outer Iteration is over a local SubDimension,
                    # which suggests the computational cost of this Iteration
                    # nest will be negligible w.r.t. the "core" Iteration nest
                    # (making use of non-local (Sub)Dimensions only)
                    continue
            if not IsPerfectIteration().visit(root):
                # Don't know how to block non-perfect nests
                continue

            # Apply hierarchical loop blocking to `tree`
            level_0 = []  # Outermost level of blocking
            level_i = [[] for i in range(1, self.nlevels)]  # Inner levels of blocking
            intra = []  # Within the smallest block
            for i in iterations:
                template = "%s%d_blk%s" % (i.dim.name, self.nblocked, '%d')
                properties = (PARALLEL,) + ((AFFINE,) if i.is_Affine else ())

                # Build Iteration across `level_0` blocks
                d = BlockDimension(i.dim, name=template % 0)
                level_0.append(Iteration([], d, d.symbolic_max, properties=properties))

                # Build Iteration across all `level_i` blocks, `i` in (1, self.nlevels]
                for n, li in enumerate(level_i, 1):
                    di = BlockDimension(d, name=template % n)
                    li.append(Iteration([], di, limits=(d, d+d.step-1, di.step),
                                        properties=properties))
                    d = di

                # Build Iteration within the smallest block
                intra.append(i._rebuild([], limits=(d, d+d.step-1, 1), offsets=(0, 0)))
            level_i = flatten(level_i)

            # Track all constructed BlockDimensions
            block_dims.extend(i.dim for i in level_0 + level_i)

            # Construct the blocked tree
            blocked = compose_nodes(level_0 + level_i + intra + [iterations[-1].nodes])
            blocked = unfold_blocked_tree(blocked)

            # Promote to a separate Callable
            dynamic_parameters = flatten((l0.dim, l0.step) for l0 in level_0)
            dynamic_parameters.extend([li.step for li in level_i])
            efunc = make_efunc("bf%d" % self.nblocked, blocked, dynamic_parameters)
            efuncs.append(efunc)

            # Compute the iteration ranges
            ranges = []
            for i, l0 in zip(iterations, level_0):
                maxb = i.symbolic_max - (i.symbolic_size % l0.step)
                ranges.append(((i.symbolic_min, maxb, l0.step),
                               (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

            # Build Calls to the `efunc`
            body = []
            for p in product(*ranges):
                dynamic_args_mapper = {}
                for l0, (m, M, b) in zip(level_0, p):
                    dynamic_args_mapper[l0.dim] = (m, M)
                    dynamic_args_mapper[l0.step] = (b,)
                    for li in level_i:
                        if li.dim.root is l0.dim.root:
                            value = li.step if b is l0.step else b
                            dynamic_args_mapper[li.step] = (value,)
                call = efunc.make_call(dynamic_args_mapper)
                body.append(List(body=call))

            mapper[root] = List(body=body)

            # Next blockable nest, use different (unique) variable/function names
            self.nblocked += 1

        iet = Transformer(mapper).visit(iet)

        # Force-unfold if some folded Iterations haven't been blocked in the end
        iet = unfold_blocked_tree(iet)

        return iet, {'dimensions': block_dims,
                     'efuncs': efuncs,
                     'args': [i.step for i in block_dims]}
Ejemplo n.º 14
0
def test_is_perfect_iteration(block1, block2, block3, block4):
    checker = IsPerfectIteration()

    assert checker.visit(block1) is True
    assert checker.visit(block1.nodes[0]) is True
    assert checker.visit(block1.nodes[0].nodes[0]) is True

    assert checker.visit(block2) is False
    assert checker.visit(block2.nodes[1]) is True
    assert checker.visit(block2.nodes[1].nodes[0]) is True

    assert checker.visit(block3) is False
    assert checker.visit(block3.nodes[0]) is True
    assert checker.visit(block3.nodes[1]) is True
    assert checker.visit(block3.nodes[2]) is True

    assert checker.visit(block4) is False
    assert checker.visit(block4.nodes[0]) is False
    assert checker.visit(block4.nodes[0].then_body) is True
Ejemplo n.º 15
0
    def _loop_blocking(self, nodes, state):
        """Apply loop blocking to PARALLEL Iteration trees."""
        exclude_innermost = not self.params.get('blockinner', False)
        ignore_heuristic = self.params.get('blockalways', False)

        # Make sure loop blocking will span as many Iterations as possible
        fold = fold_blockable_tree(nodes, exclude_innermost)

        mapper = {}
        blocked = OrderedDict()
        for tree in retrieve_iteration_tree(fold):
            # Is the Iteration tree blockable ?
            iterations = [i for i in tree if i.is_Parallel]
            if exclude_innermost:
                iterations = [i for i in iterations if not i.is_Vectorizable]
            if len(iterations) <= 1:
                continue
            root = iterations[0]
            if not IsPerfectIteration().visit(root):
                # Illegal/unsupported
                continue
            if not tree.root.is_Sequential and not ignore_heuristic:
                # Heuristic: avoid polluting the generated code with blocked
                # nests (thus increasing JIT compilation time and affecting
                # readability) if the blockable tree isn't embedded in a
                # sequential loop (e.g., a timestepping loop)
                continue

            # Decorate intra-block iterations with an IterationProperty
            TAG = tagger(len(mapper))

            # Build all necessary Iteration objects, individually. These will
            # subsequently be composed to implement loop blocking.
            inter_blocks = []
            intra_blocks = []
            remainders = []
            for i in iterations:
                # Build Iteration over blocks
                name = "%s%d_block" % (i.dim.name, len(mapper))
                dim = blocked.setdefault(i, BlockDimension(i.dim, name=name))
                binnersize = i.symbolic_size + (i.offsets[1] - i.offsets[0])
                bmax = i.dim.symbolic_max - (binnersize % dim.step)
                inter_block = Iteration([], dim, bmax, offsets=i.offsets,
                                        properties=PARALLEL)
                inter_blocks.append(inter_block)

                # Build Iteration within a block
                limits = (dim, dim + dim.step - 1, 1)
                intra_block = i._rebuild([], limits=limits, offsets=(0, 0),
                                         properties=i.properties + (TAG, ELEMENTAL))
                intra_blocks.append(intra_block)

                # Build unitary-increment Iteration over the 'leftover' region.
                # This will be used for remainder loops, executed when any
                # dimension size is not a multiple of the block size.
                remainder = i._rebuild([], limits=[bmax + 1, i.dim.symbolic_max, 1],
                                       offsets=(i.offsets[1], i.offsets[1]))
                remainders.append(remainder)

            # Build blocked Iteration nest
            blocked_tree = compose_nodes(inter_blocks + intra_blocks +
                                         [iterations[-1].nodes])

            # Build remainder Iterations
            remainder_trees = []
            for n in range(len(iterations)):
                for c in combinations([i.dim for i in iterations], n + 1):
                    # First all inter-block Interations
                    nodes = [b._rebuild(properties=b.properties + (REMAINDER,))
                             for b, r in zip(inter_blocks, remainders)
                             if r.dim not in c]
                    # Then intra-block or remainder, for each dim (in order)
                    properties = (REMAINDER, TAG, ELEMENTAL)
                    for b, r in zip(intra_blocks, remainders):
                        handle = r if b.dim in c else b
                        nodes.append(handle._rebuild(properties=properties))
                    nodes.extend([iterations[-1].nodes])
                    remainder_trees.append(compose_nodes(nodes))

            # Will replace with blocked loop tree
            mapper[root] = List(body=[blocked_tree] + remainder_trees)

        rebuilt = Transformer(mapper).visit(fold)

        # Finish unrolling any previously folded Iterations
        processed = unfold_blocked_tree(rebuilt)

        return processed, {'dimensions': list(blocked.values())}
Ejemplo n.º 16
0
    def _loop_blocking(self, iet):
        """
        Apply loop blocking to PARALLEL Iteration trees.
        """
        blockinner = bool(self.params.get('blockinner'))
        blockalways = bool(self.params.get('blockalways'))

        # Make sure loop blocking will span as many Iterations as possible
        iet = fold_blockable_tree(iet, blockinner)

        mapper = {}
        efuncs = []
        block_dims = []
        for tree in retrieve_iteration_tree(iet):
            # Is the Iteration tree blockable ?
            iterations = filter_iterations(tree, lambda i: i.is_Parallel)
            if not blockinner:
                iterations = iterations[:-1]
            if len(iterations) <= 1:
                continue
            root = iterations[0]
            if not blockalways:
                # Heuristically bypass loop blocking if we think `tree`
                # won't be computationally expensive. This will help with code
                # size/redability, JIT time, and auto-tuning time
                if not (tree.root.is_Sequential or iet.is_Callable):
                    # E.g., not inside a time-stepping Iteration
                    continue
                if any(i.dim.is_Sub and i.dim.local for i in tree):
                    # At least an outer Iteration is over a local SubDimension,
                    # which suggests the computational cost of this Iteration
                    # nest will be negligible w.r.t. the "core" Iteration nest
                    # (making use of non-local (Sub)Dimensions only)
                    continue
            if not IsPerfectIteration().visit(root):
                # Don't know how to block non-perfect nests
                continue

            # Apply loop blocking to `tree`
            interb = []
            intrab = []
            for i in iterations:
                d = BlockDimension(i.dim, name="%s%d_blk" % (i.dim.name, len(mapper)))
                block_dims.append(d)
                # Build Iteration over blocks
                properties = (PARALLEL,) + ((AFFINE,) if i.is_Affine else ())
                interb.append(Iteration([], d, d.symbolic_max, properties=properties))
                # Build Iteration within a block
                intrab.append(i._rebuild([], limits=(d, d+d.step-1, 1), offsets=(0, 0)))

            # Construct the blocked tree
            blocked = compose_nodes(interb + intrab + [iterations[-1].nodes])
            blocked = unfold_blocked_tree(blocked)

            # Promote to a separate Callable
            dynamic_parameters = flatten((bi.dim, bi.dim.symbolic_size) for bi in interb)
            efunc = make_efunc("bf%d" % len(mapper), blocked, dynamic_parameters)
            efuncs.append(efunc)

            # Compute the iteration ranges
            ranges = []
            for i, bi in zip(iterations, interb):
                maxb = i.symbolic_max - (i.symbolic_size % bi.dim.step)
                ranges.append(((i.symbolic_min, maxb, bi.dim.step),
                               (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

            # Build Calls to the `efunc`
            body = []
            for p in product(*ranges):
                dynamic_args_mapper = {}
                for bi, (m, M, b) in zip(interb, p):
                    dynamic_args_mapper[bi.dim] = (m, M)
                    dynamic_args_mapper[bi.dim.step] = (b,)
                call = efunc.make_call(dynamic_args_mapper)
                body.append(List(body=call))

            mapper[root] = List(body=body)

        iet = Transformer(mapper).visit(iet)

        return iet, {'dimensions': block_dims, 'efuncs': efuncs,
                     'args': [i.step for i in block_dims]}
Ejemplo n.º 17
0
    def _loop_blocking(self, nodes, state):
        """
        Apply loop blocking to :class:`Iteration` trees.

        Blocking is applied to parallel iteration trees. Heuristically, innermost
        dimensions are not blocked to maximize the trip count of the SIMD loops.

        Different heuristics may be specified by passing the keywords ``blockshape``
        and ``blockinner`` to the DLE. The former, a dictionary, is used to indicate
        a specific block size for each blocked dimension. For example, for the
        :class:`Iteration` tree: ::

            for i
              for j
                for k
                  ...

        one may provide ``blockshape = {i: 4, j: 7}``, in which case the
        two outer loops will blocked, and the resulting 2-dimensional block will
        have size 4x7. The latter may be set to True to also block innermost parallel
        :class:`Iteration` objects.
        """
        exclude_innermost = not self.params.get('blockinner', False)
        ignore_heuristic = self.params.get('blockalways', False)

        # Make sure loop blocking will span as many Iterations as possible
        fold = fold_blockable_tree(nodes, exclude_innermost)

        mapper = {}
        blocked = OrderedDict()
        for tree in retrieve_iteration_tree(fold):
            # Is the Iteration tree blockable ?
            iterations = [i for i in tree if i.is_Parallel]
            if exclude_innermost:
                iterations = [i for i in iterations if not i.is_Vectorizable]
            if len(iterations) <= 1:
                continue
            root = iterations[0]
            if not IsPerfectIteration().visit(root):
                # Illegal/unsupported
                continue
            if not tree[0].is_Sequential and not ignore_heuristic:
                # Heuristic: avoid polluting the generated code with blocked
                # nests (thus increasing JIT compilation time and affecting
                # readability) if the blockable tree isn't embedded in a
                # sequential loop (e.g., a timestepping loop)
                continue

            # Decorate intra-block iterations with an IterationProperty
            TAG = tagger(len(mapper))

            # Build all necessary Iteration objects, individually. These will
            # subsequently be composed to implement loop blocking.
            inter_blocks = []
            intra_blocks = []
            remainders = []
            for i in iterations:
                name = "%s%d_block" % (i.dim.name, len(mapper))

                # Build Iteration over blocks
                dim = blocked.setdefault(i, Dimension(name=name))
                bsize = dim.symbolic_size
                bstart = i.limits[0]
                binnersize = i.dim.symbolic_extent + (i.offsets[1] -
                                                      i.offsets[0])
                bfinish = i.dim.symbolic_end - (binnersize % bsize) - 1
                inter_block = Iteration([],
                                        dim, [bstart, bfinish, bsize],
                                        offsets=i.offsets,
                                        properties=PARALLEL)
                inter_blocks.append(inter_block)

                # Build Iteration within a block
                limits = (dim, dim + bsize - 1, 1)
                intra_block = i._rebuild([],
                                         limits=limits,
                                         offsets=(0, 0),
                                         properties=i.properties +
                                         (TAG, ELEMENTAL))
                intra_blocks.append(intra_block)

                # Build unitary-increment Iteration over the 'leftover' region.
                # This will be used for remainder loops, executed when any
                # dimension size is not a multiple of the block size.
                remainder = i._rebuild(
                    [],
                    limits=[bfinish + 1, i.dim.symbolic_end, 1],
                    offsets=(i.offsets[1], i.offsets[1]))
                remainders.append(remainder)

            # Build blocked Iteration nest
            blocked_tree = compose_nodes(inter_blocks + intra_blocks +
                                         [iterations[-1].nodes])

            # Build remainder Iterations
            remainder_trees = []
            for n in range(len(iterations)):
                for c in combinations([i.dim for i in iterations], n + 1):
                    # First all inter-block Interations
                    nodes = [
                        b._rebuild(properties=b.properties + (REMAINDER, ))
                        for b, r in zip(inter_blocks, remainders)
                        if r.dim not in c
                    ]
                    # Then intra-block or remainder, for each dim (in order)
                    properties = (REMAINDER, TAG, ELEMENTAL)
                    for b, r in zip(intra_blocks, remainders):
                        handle = r if b.dim in c else b
                        nodes.append(handle._rebuild(properties=properties))
                    nodes.extend([iterations[-1].nodes])
                    remainder_trees.append(compose_nodes(nodes))

            # Will replace with blocked loop tree
            mapper[root] = List(body=[blocked_tree] + remainder_trees)

        rebuilt = Transformer(mapper).visit(fold)

        # Finish unrolling any previously folded Iterations
        processed = unfold_blocked_tree(rebuilt)

        # All blocked dimensions
        if not blocked:
            return processed, {}

        # Determine the block shape
        blockshape = self.params.get('blockshape')
        if not blockshape:
            # Use trivial heuristic for a suitable blockshape
            def heuristic(dim_size):
                ths = 8  # FIXME: This really needs to be improved
                return ths if dim_size > ths else 1

            blockshape = {k: heuristic for k in blocked.keys()}
        else:
            try:
                nitems, nrequired = len(blockshape), len(blocked)
                blockshape = {k: v for k, v in zip(blocked, blockshape)}
                if nitems > nrequired:
                    dle_warning("Provided 'blockshape' has more entries than "
                                "blocked loops; dropping entries ...")
                if nitems < nrequired:
                    dle_warning("Provided 'blockshape' has fewer entries than "
                                "blocked loops; dropping dimensions ...")
            except TypeError:
                blockshape = {list(blocked)[0]: blockshape}
            blockshape.update(
                {k: None
                 for k in blocked.keys() if k not in blockshape})

        # Track any additional arguments required to execute /state.nodes/
        arguments = [
            BlockingArg(v, k, blockshape[k]) for k, v in blocked.items()
        ]

        return processed, {'arguments': arguments, 'flags': 'blocking'}
Ejemplo n.º 18
0
    def _loop_blocking(self, iet):
        """
        Apply loop blocking to PARALLEL Iteration trees.
        """
        blockinner = bool(self.params.get('blockinner'))
        blockalways = bool(self.params.get('blockalways'))
        noinline = self._compiler_decoration('noinline', cgen.Comment('noinline?'))

        # Make sure loop blocking will span as many Iterations as possible
        iet = fold_blockable_tree(iet, blockinner)

        mapper = {}
        efuncs = OrderedDict()
        block_dims = []
        for tree in retrieve_iteration_tree(iet):
            # Is the Iteration tree blockable ?
            candidates = [i for i in tree if i.is_Parallel]
            if blockinner:
                iterations = candidates
            else:
                iterations = [i for i in candidates if not i.is_Vectorizable]
            if len(iterations) <= 1:
                continue
            root = iterations[0]
            if not IsPerfectIteration().visit(root):
                # Illegal/unsupported
                continue
            if not tree.root.is_Sequential and not blockalways:
                # Heuristic: avoid polluting the generated code with blocked
                # nests (thus increasing JIT compilation time and affecting
                # readability) if the blockable tree isn't embedded in a
                # sequential loop (e.g., a timestepping loop)
                continue

            # Apply loop blocking to `tree`
            interb = []
            intrab = []
            for i in iterations:
                d = BlockDimension(i.dim, name="%s%d_block" % (i.dim.name, len(mapper)))
                # Build Iteration over blocks
                interb.append(Iteration([], d, d.symbolic_max, offsets=i.offsets,
                                        properties=PARALLEL))
                # Build Iteration within a block
                intrab.append(i._rebuild([], limits=(d, d+d.step-1, 1), offsets=(0, 0)))
                # Record that a new BlockDimension has been introduced
                block_dims.append(d)

            # Construct the blocked tree
            blocked = compose_nodes(interb + intrab + [iterations[-1].nodes])
            blocked = unfold_blocked_tree(blocked)

            # Promote to a separate Callable
            dynamic_parameters = flatten((bi.dim, bi.dim.symbolic_size) for bi in interb)
            efunc0 = make_efunc("bf%d" % len(mapper), blocked, dynamic_parameters)

            # Compute the iteration ranges
            ranges = []
            for i, bi in zip(iterations, interb):
                maxb = i.symbolic_max - (i.symbolic_size % bi.dim.step)
                ranges.append(((i.symbolic_min, maxb, bi.dim.step),
                               (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

            # Build Calls to the `efunc`
            body = []
            for p in product(*ranges):
                dynamic_args_mapper = {}
                for bi, (m, M, b) in zip(interb, p):
                    dynamic_args_mapper[bi.dim] = (m, M)
                    dynamic_args_mapper[bi.dim.step] = (b,)
                call = efunc0.make_call(dynamic_args_mapper)
                body.append(List(header=noinline, body=call))

            # Build indirect Call to the `efunc0` Calls
            dynamic_parameters = [i.dim.root for i in candidates]
            dynamic_parameters.extend([bi.dim.step for bi in interb])
            efunc1 = make_efunc("f%d" % len(mapper), body, dynamic_parameters)

            # Track everything to ultimately transform the input `iet`
            mapper[root] = efunc1.make_call()
            efuncs[efunc1] = None
            efuncs[efunc0] = [efunc1.name]

        iet = Transformer(mapper).visit(iet)

        return iet, {'dimensions': block_dims, 'efuncs': efuncs}