Example #1
0
    def _schedule_expressions(self, clusters):
        """Wrap :class:`Expression` objects, already grouped in :class:`Cluster`
        objects, within nested :class:`Iteration` objects (representing loops),
        according to dimensions and stencils.

        :param clusters: The clusters to be scheduled. Each cluster is read
                         through its ``stencil``, ``trace`` and ``atomics``
                         attributes.
        :returns: A :class:`List` whose body is the sequence of scheduled
                  Iteration trees and loop-free Expressions.
        """

        # Topologically sort Iterations
        ordering = partial_order([i.stencil.dimensions for i in clusters])
        # Put the parent of each buffered dimension right before the dimension
        # itself. The position is looked up in the list being modified (and
        # not taken from a snapshot of it), so previous insertions cannot
        # shift the insertion point
        for d in list(ordering):
            if d.is_Buffered:
                ordering.insert(ordering.index(d), d.parent)

        # Build the Iteration/Expression tree
        processed = []
        # Maps the stencil entries of the latest scheduled tree to Iterations
        schedule = OrderedDict()
        # Atomic dimensions of the previously scheduled cluster
        atomics = ()
        for i in clusters:
            # Build the Expression objects to be inserted within an Iteration tree
            expressions = [Expression(v, np.int32 if i.trace.is_index(k) else self.dtype)
                           for k, v in i.trace.items()]

            if not i.stencil.empty:
                root = None
                entries = i.stencil.entries

                # Reorder based on the globally-established loop ordering
                entries = sorted(entries, key=lambda e: ordering.index(e.dim))

                # Can I reuse any of the previously scheduled Iterations ?
                # /index/ counts the leading entries that can be reused, while
                # /root/ is the innermost reused Iteration (if any)
                index = 0
                for j0, j1 in zip(entries, list(schedule)):
                    if j0 != j1 or j0.dim in atomics:
                        break
                    root = schedule[j1]
                    index += 1
                needed = entries[index:]

                # Build and insert the required Iterations
                iters = [Iteration([], j.dim, j.dim.size, offsets=j.ofs) for j in needed]
                body, tree = compose_nodes(iters + [expressions], retrieve=True)
                scheduling = OrderedDict(zip(needed, tree))
                if root is None:
                    # Nothing could be reused: start a new tree
                    processed.append(body)
                    schedule = scheduling
                else:
                    # Append the new sub-tree to the nodes of the reused Iteration
                    nodes = list(root.nodes) + [body]
                    mapper = {root: root._rebuild(nodes, **root.args_frozen)}
                    transformer = Transformer(mapper)
                    processed = list(transformer.visit(processed))
                    schedule = OrderedDict(list(schedule.items())[:index] +
                                           list(scheduling.items()))
                    # Make the schedule point to the rebuilt Iterations
                    for k, v in list(schedule.items()):
                        schedule[k] = transformer.rebuilt.get(v, v)
            else:
                # No Iterations are needed
                processed.extend(expressions)

            # Track dimensions that cannot be fused at next stage
            atomics = i.atomics

        return List(body=processed)
Example #2
0
    def _schedule_expressions(self, clusters, ordering):
        """Arrange the :class:`Expression` objects of each :class:`Cluster`
        into nested :class:`Iteration` objects (that is, loops), as dictated
        by the stencil entries of the clusters. Return the scheduled nodes
        as a list."""

        processed = []
        schedule = OrderedDict()
        for cluster in clusters:
            # Expressions to be placed within the Iteration tree
            expressions = []
            for key, value in cluster.trace.items():
                dtype = np.int32 if cluster.trace.is_index(key) else self.dtype
                expressions.append(Expression(value, dtype))

            if cluster.stencil.empty:
                # Loop-free cluster: no Iterations are needed
                processed.extend(expressions)
                continue

            entries = cluster.stencil.entries

            # Count the leading entries matching previously scheduled
            # Iterations; /root/ ends up being the last one that matched
            root = None
            index = 0
            for entry, scheduled in zip(entries, list(schedule)):
                if entry != scheduled:
                    break
                root = schedule[scheduled]
                index += 1
            needed = entries[index:]

            # Build the Iterations that could not be reused
            iters = []
            for entry in needed:
                iters.append(Iteration([], entry.dim, entry.dim.size,
                                       offsets=entry.ofs))
            body, tree = compose_nodes(iters + [expressions], retrieve=True)
            scheduling = OrderedDict(zip(needed, tree))

            if root is None:
                # Nothing to reuse: a new tree is started
                processed.append(body)
                schedule = scheduling
                continue

            # Append the new sub-tree to the nodes of the reused Iteration
            rebuilt_root = root._rebuild(list(root.nodes) + [body],
                                         **root.args_frozen)
            transformer = Transformer({root: rebuilt_root})
            processed = list(transformer.visit(processed))
            reused = list(schedule.items())[:index]
            schedule = OrderedDict(reused + list(scheduling.items()))
            # Make the schedule point to the rebuilt Iterations
            for entry, iteration in list(schedule.items()):
                schedule[entry] = transformer.rebuilt.get(iteration, iteration)

        return processed
Example #3
0
def test_transformer_replace(exprs, block1, block2, block3):
    """Replace an expression with a comment line and check the generated code."""
    marker = '// Replaced expression'
    transformer = Transformer({exprs[0]: Block(c.Line(marker))})

    for original in (block1, block2, block3):
        transformed = transformer.visit(original)
        generated = str(transformed.ccode)
        # Number of lines of code before and after the transformation
        lines_before = str(original.ccode).count('\n') + 1
        lines_after = generated.count('\n') + 1
        assert lines_after >= lines_before
        assert marker in generated
        # NOTE(review): confirm that the fixtures use the index name `i0`,
        # otherwise this check is trivially satisfied
        assert "a[i0] = a[i0] + b[i0] + 5.0F;" not in generated
Example #4
0
    def _insert_declarations(self, dle_state, parameters):
        """Add to the Operator's body the array and variable declarations
        needed to generate a legal C file. Return the updated nodes together
        with the updated elemental functions."""

        nodes = dle_state.nodes

        # Function calls are resolved first: the scopes found within the
        # called function take the place of the call itself
        scopes = []
        for expr, iterations in FindScopes().visit(nodes).items():
            if not expr.is_FunCall:
                scopes.append((expr, iterations))
                continue
            callee = dle_state.func_table[expr.name]
            inner = FindScopes().visit(callee, queue=list(iterations))
            scopes.extend(inner.items())

        # Work out which declarations are required
        allocator = Allocator()
        mapper = OrderedDict()
        for expr, iterations in scopes:
            if expr.is_scalar:
                # Scalars are declared inline
                mapper[expr] = LocalExpression(**expr.args)
                continue
            function = expr.output_function
            if function._mem_external:
                # Passed in as kernel argument, nothing to declare
                continue
            if function._mem_stack:
                # Stack allocation, as established by the DLE
                site = filter_iterations(
                    iterations,
                    key=lambda it: it.dim not in function.indices,
                    stop='consecutive')
                allocator.push_stack(site[-1], function)
            else:
                # Heap allocation, for tensors that must be globally accessible
                allocator.push_heap(function)

        # Stack declarations are prepended to the nodes of their Iteration
        for iteration, functions in allocator.onstack:
            allocs = as_tuple([Element(f) for f in functions])
            mapper[iteration] = Iteration(allocs + iteration.nodes,
                                          **iteration.args_frozen)
        nodes = Transformer(mapper).visit(nodes)
        elemental_functions = Transformer(mapper).visit(
            dle_state.elemental_functions)

        # Heap declarations (if any) wrap the whole body
        if allocator.onheap:
            decls, allocs, frees = zip(*allocator.onheap)
            nodes = List(header=decls + allocs, body=nodes, footer=frees)

        return nodes, elemental_functions
Example #5
0
 def decorate(nodes):
     """Return the transformed ``nodes``, in which each vectorizable
     Iteration is rebuilt with the ``ignore_deps`` and OpenMP SIMD pragmas
     appended to its own pragmas."""
     processed = []
     for node in nodes:
         mapper = {}
         for tree in retrieve_iteration_tree(node):
             for iteration in tree:
                 if not iteration.is_Vectorizable:
                     continue
                 symbols = FindSymbols('symbolics').visit(iteration)
                 # Tensors whose last shape entry is a multiple of
                 # ``get_simd_items(dtype)``; none if a KeyError is raised
                 aligned = []
                 try:
                     for s in symbols:
                         if not s.is_Tensor:
                             continue
                         if s.shape[-1] % get_simd_items(s.dtype) == 0:
                             aligned.append(s)
                 except KeyError:
                     aligned = []
                 if aligned:
                     aligned_pragma = omplang['simd-for-aligned']
                     names = ','.join([s.name for s in aligned])
                     simd = as_tuple(
                         aligned_pragma(names, simdinfo[get_simd_flag()]))
                 else:
                     simd = as_tuple(omplang['simd-for'])
                 pragmas = iteration.pragmas + ignore_deps + simd
                 mapper[iteration] = iteration._rebuild(pragmas=pragmas)
         processed.append(Transformer(mapper).visit(node))
     return processed
Example #6
0
def test_transformer_wrap(exprs, block1, block2, block3):
    """Wrap an expression between two comment lines and check the generated code."""
    opening = '// This is the opening comment'
    closing = '// This is the closing comment'
    target = exprs[0]
    wrapped = Block(c.Line(opening), target, c.Line(closing))
    transformer = Transformer({target: wrapped})

    for original in (block1, block2, block3):
        transformed = transformer.visit(original)
        generated = str(transformed.ccode)
        # Number of lines of code before and after the transformation
        lines_before = str(original.ccode).count('\n') + 1
        lines_after = generated.count('\n') + 1
        # The two comment lines must have been added
        assert lines_after >= lines_before + 2
        assert opening in generated
        assert closing in generated
        # The wrapped expression must still be there
        assert "a[i] = a[i] + b[i] + 5.0F;" in generated
Example #7
0
def test_transformer_add_replace(exprs, block2, block3):
    """Replace one expression, put a comment line before another one, and
    check the generated code."""
    replaced_line = '// Replaced expression'
    added_line = '// Adding a simple line'
    mapper = {
        exprs[0]: Block(c.Line(replaced_line)),
        exprs[1]: Block(c.Line(added_line), exprs[1]),
    }
    transformer = Transformer(mapper)

    for original in (block2, block3):
        transformed = transformer.visit(original)
        generated = str(transformed.ccode)
        # Number of lines of code before and after the transformation
        lines_before = str(original.ccode).count('\n') + 1
        lines_after = generated.count('\n') + 1
        assert lines_after >= lines_before + 1
        assert replaced_line in generated
        assert added_line in generated
        assert "a[i0] = a[i0] + b[i0] + 5.0F;" not in generated
Example #8
0
 def decorate(nodes):
     """Return the transformed ``nodes``: in each Iteration tree, the first
     parallel Iteration is wrapped in a List having ``fence`` as footer; then,
     each vectorizable Iteration is wrapped in a List having ``pragma`` as
     header."""
     processed = []
     for node in nodes:
         # First pass: fence after the first parallel Iteration of each tree
         fenced = {}
         for tree in retrieve_iteration_tree(node):
             parallel = next((i for i in tree if i.is_Parallel), None)
             if parallel is not None:
                 fenced[parallel] = List(body=parallel, footer=fence)
         transformed = Transformer(fenced).visit(node)

         # Second pass, on the transformed tree: pragma before each
         # vectorizable Iteration
         vectorized = {}
         for tree in retrieve_iteration_tree(transformed):
             for i in tree:
                 if not i.is_Vectorizable:
                     continue
                 vectorized[i] = List(header=pragma, body=i)
         processed.append(Transformer(vectorized).visit(transformed))
     return processed
Example #9
0
    def __init__(self, expressions, **kwargs):
        """Build the Operator, then shrink its sequential loops and mark the
        entry point of each non-sequential Iteration nest with a print."""
        super(OperatorDebug, self).__init__(expressions, **kwargs)
        self._includes.append('stdio.h')

        # Minimize the trip count of the sequential loops
        shrunk = {}
        for iteration in set(flatten(retrieve_iteration_tree(self.body))):
            if iteration.is_Sequential:
                shrunk[iteration] = iteration._rebuild(
                    limits=max(iteration.offsets) + 2)
        self.body = Transformer(shrunk).visit(self.body)

        # Collect, for each Iteration tree in the body, the first of the
        # non-sequential Iterations returned by the filter (if any)
        roots = []
        for tree in retrieve_iteration_tree(self.body):
            nonsequential = filter_iterations(
                tree, lambda i: not i.is_Sequential, 'any')
            if nonsequential:
                roots.append(nonsequential[0])

        # Mark the collected Iterations with a numbered print
        marked = {}
        for n, root in enumerate(roots):
            marked[root] = List(header=printmark('In nest %d' % n), body=root)
        self.body = Transformer(marked).visit(self.body)
Example #10
0
    def _insert_declarations(self, nodes):
        """Populate the Operator's body with the required array and variable
        declarations, to generate a legal C file.

        :param nodes: The Iteration/Expression tree to be populated.
        :returns: The transformed tree. As a side effect, the local entries
                  of ``self.func_table`` are replaced with transformed ones.
        """

        # Resolve function calls first
        scopes = []
        for k, v in FindScopes().visit(nodes).items():
            if k.is_FunCall:
                func = self.func_table[k.name]
                # Calls to non-local functions contribute no scopes
                if func.local:
                    scopes.extend(FindScopes().visit(func.root,
                                                     queue=list(v)).items())
            else:
                scopes.append((k, v))

        # Determine all required declarations
        allocator = Allocator()
        mapper = OrderedDict()
        for k, v in scopes:
            if k.is_scalar:
                # Inline declaration
                mapper[k] = LocalExpression(**k.args)
            elif k.output_function._mem_external:
                # Nothing to do, variable passed as kernel argument
                continue
            elif k.output_function._mem_stack:
                # On the stack, as established by the DLE
                key = lambda i: not i.is_Parallel
                # Fall back to the whole tree if no Iteration is selected
                site = filter_iterations(v, key=key, stop='asap') or [nodes]
                allocator.push_stack(site[-1], k.output_function)
            else:
                # On the heap, as a tensor that must be globally accessible
                allocator.push_heap(k.output_function)

        # Introduce declarations on the stack
        for k, v in allocator.onstack:
            mapper[k] = tuple(Element(i) for i in v)
        nodes = NestedTransformer(mapper).visit(nodes)
        # Apply the same mapper to the local functions; iterate over a copy
        # of the items since the table is updated within the loop
        for k, v in list(self.func_table.items()):
            if v.local:
                self.func_table[k] = FunMeta(
                    Transformer(mapper).visit(v.root), v.local)

        # Introduce declarations on the heap (if any)
        if allocator.onheap:
            decls, allocs, frees = zip(*allocator.onheap)
            nodes = List(header=decls + allocs, body=nodes, footer=frees)

        return nodes
Example #11
0
def unfold_blocked_tree(node):
    """
    Replace nested :class:`IterationFold` objects with their unfolded form.

    :Example:

    Take the following section of an Iteration/Expression tree: ::

        for i = 1 to N-1  // folded
          for j = 1 to N-1  // folded
            foo1()

    If both /i/ and /j/ carry a fold with offset 1 and body ``foo2()``, the
    result is: ::

        for i = 1 to N-1
          for j = 1 to N-1
            foo1()
        for i = 2 to N-2
          for j = 2 to N-2
            foo2()
    """
    # Collect, tree by tree, the IterationFolds to be unfolded
    folded = []
    for tree in retrieve_iteration_tree(node):
        folds = tuple(filter(lambda i: i.is_IterationFold, tree))
        if not folds:
            continue
        # Sanity check
        assert IsPerfectIteration().visit(folds[0])
        folded.append(folds)

    # Unfold, giving the n-th unfolded tree the tag ``base_tag + n``
    base_tag = ntags()
    mapper = {}
    for folds in folded:
        unfolded = zip(*[i.unfold() for i in folds])
        retagged = [tuple(j.retag(base_tag + n) for j in group)
                    for n, group in enumerate(unfolded)]
        # The last unfolded tree acts as root for the optimization
        optimized = optimize_unfolded_tree(retagged[:-1], retagged[-1])
        mapper[folds[0]] = List(body=optimized)

    # Plug the unfolded Iterations into the Iteration/Expression tree
    return Transformer(mapper).visit(node)
Example #12
0
    def _ompize(self, state, **kwargs):
        """
        Add OpenMP pragmas to the Iteration/Expression tree to emit parallel code.

        :param state: Object whose ``nodes`` attribute holds the trees to be
                      processed.
        :param kwargs: Not used by this method.
        :returns: A dictionary ``{'nodes': processed}``, where ``processed``
                  is the list of transformed trees.
        """

        # The Denormals nodes are searched across all of ``state.nodes``, so
        # the search is independent of the tree being processed
        denormals = FindNodes(Denormals).visit(state.nodes)

        # ``psutil.cpu_count(logical=False)`` returns None if the number of
        # physical cores cannot be determined
        ncores = psutil.cpu_count(logical=False)

        def key(i):
            return i.is_Parallel and not i.is_Vectorizable

        processed = []
        for node in state.nodes:

            # Reset denormals flag each time a parallel region is entered
            mapper = {
                i: List(c.Comment('DLE: moved denormals flag'))
                for i in denormals
            }

            # Handle parallelizable loops
            for tree in retrieve_iteration_tree(node):
                # Determine the number of consecutive parallelizable Iterations
                candidates = filter_iterations(tree,
                                               key=key,
                                               stop='consecutive')
                if not candidates:
                    continue

                # Heuristic: if at least two parallel loops are available and the
                # physical core count is at least self.thresholds['collapse'],
                # then omp-collapse the loops. An unknown core count (None)
                # never triggers the collapse, rather than raising a
                # TypeError upon comparison
                nparallel = len(candidates)
                if ncores is None or ncores < self.thresholds['collapse'] or\
                        nparallel < 2:
                    parallelism = omplang['for']
                else:
                    parallelism = omplang['collapse'](nparallel)

                root = candidates[0]
                mapper[root] = Block(header=omplang['par-region'],
                                     body=denormals +
                                     [Element(parallelism), root])

            processed.append(Transformer(mapper).visit(node))

        return {'nodes': processed}
Example #13
0
def test_create_elemental_functions_simple(simple_function):
    """Tag as elemental the last Iteration of each iteration tree in
    ``simple_function``, run ``transform`` in ``'split'`` mode, and compare
    the generated C code against the expected one."""
    # Last Iteration of each iteration tree
    roots = [i[-1] for i in retrieve_iteration_tree(simple_function)]
    retagged = [i._rebuild(properties=tagger(0)) for i in roots]
    # Replace each root with its retagged version, marked as ELEMENTAL
    mapper = {i: j._rebuild(properties=(j.properties + (ELEMENTAL,)))
              for i, j in zip(roots, retagged)}
    function = Transformer(mapper).visit(simple_function)
    handle = transform(function, mode='split')
    block = List(body=handle.nodes + handle.elemental_functions)
    output = str(block.ccode)
    # Make output compiler independent
    # (lines containing '#pragma' or '/*' are dropped)
    output = [i for i in output.split('\n')
              if all([j not in i for j in ('#pragma', '/*')])]
    assert '\n'.join(output) == \
        ("""void foo(float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  float (*restrict c)[5] __attribute__((aligned(64))) = (float (*)[5]) c_vec;
  float (*restrict d)[5][7] __attribute__((aligned(64))) = (float (*)[5][7]) d_vec;
  for (int i = 0; i < 3; i += 1)
  {
    for (int j = 0; j < 5; j += 1)
    {
      f_0(0,7,(float*)a,(float*)b,(float*)c,(float*)d,i,j);
    }
  }
}
void f_0(const int k_start, const int k_finish,"""
         """ float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec, const int i, const int j)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  float (*restrict c)[5] __attribute__((aligned(64))) = (float (*)[5]) c_vec;
  float (*restrict d)[5][7] __attribute__((aligned(64))) = (float (*)[5][7]) d_vec;
  for (int k = k_start; k < k_finish; k += 1)
  {
    a[i] = a[i] + b[i] + 5.0F;
    a[i] = -a[i]*c[i][j] + b[i]*d[i][j][k];
  }
}""")
Example #14
0
def test_transformer_replace_function_body(block1, block2):
    """Create a Function and replace its body with another."""
    args = FindSymbols().visit(block1)
    f = Function('foo', block1, 'void', args)
    assert str(f.ccode) == """void foo()
{
  for (int i = 0; i < 3; i += 1)
  {
    for (int j = 0; j < 5; j += 1)
    {
      for (int k = 0; k < 7; k += 1)
      {
        a[i] = a[i] + b[i] + 5.0F;
      }
    }
  }
}"""

    f = Transformer({block1: block2}).visit(f)
    assert str(f.ccode) == """void foo()
Example #15
0
def make_grid_accesses(node):
    """
    Build a new Iteration/Expression tree out of ``node``, in which every
    :class:`interfaces.Indexed` access has been turned into a YASK grid
    access.
    """

    def make_grid_gets(expr):
        # Only the Indexeds whose function comes from YASK are translated
        data_carriers = [i for i in retrieve_indexed(expr)
                         if i.base.function.from_YASK]
        replacements = {}
        for indexed in data_carriers:
            name = namespace['code-grid-name'](indexed.base.function.name)
            # The indices may in turn contain grid accesses
            args = [INT(make_grid_gets(index)) for index in indexed.indices]
            replacements[indexed] = make_sharedptr_funcall(
                namespace['code-grid-get'], args, name)
        return expr.xreplace(replacements)

    mapper = {}
    for e in FindNodes(Expression).visit(node):
        lhs, rhs = e.expr.args

        # Translate the right-hand side
        rhs = make_grid_gets(rhs)

        # Translate the left-hand side
        if not e.output_function.from_YASK:
            # Writing to a scalar temporary
            mapper[e] = Expression(e.expr.func(lhs, rhs), dtype=e.dtype)
            continue
        name = namespace['code-grid-name'](e.output_function.name)
        args = [rhs] + [INT(make_grid_gets(index)) for index in lhs.indices]
        handle = make_sharedptr_funcall(namespace['code-grid-put'], args, name)
        mapper[e] = Element(c.Statement(ccode(handle)))

    return Transformer(mapper).visit(node)
Example #16
0
    def _profile_sections(self, nodes):
        """Add C-level profiling nodes to the Iteration/Expression tree: in
        each section, the first perfect Iteration is wrapped in a timer."""
        mapper = {}
        for node in nodes:
            for itspace in FindSections().visit(node).keys():
                target = next((i for i in itspace
                               if IsPerfectIteration().visit(i)), None)
                if target is None:
                    continue

                # Insert `TimedList` block. This should come from
                # the profiler, but we do this manually for now.
                lname = 'loop_%s_%d' % (target.index, len(mapper))
                mapper[target] = TimedList(gname=self.profiler.varname,
                                           lname=lname,
                                           body=target)
                self.profiler.add(lname)

                # Computational properties of the timed section
                # (operational intensity, memory accesses)
                expressions = FindNodes(Expression).visit(target)
                ops = estimate_cost([e.expr for e in expressions])
                memory = estimate_memory([e.expr for e in expressions])
                self.sections[itspace] = Profile(lname, ops, memory)
        return Transformer(mapper).visit(List(body=nodes))
Example #17
0
def create_profile(node):
    """
    Build a :class:`Profiler` for the Iteration/Expression tree ``node``, and
    return it together with the tree instrumented with C-level timers.
    These are the profiled code sections: ::

        * ``node`` as a whole;
        * Sequences of perfectly nested loops sharing the :class:`Iteration`
          dimensions, even if the extents differ. For example: ::

            for x = 0 to N
              ..
            for x = 1 to N-1
              ..

          The two Iterations have different extents, but since both have
          dimension ``x`` they are profiled as a single section.
        * Any perfectly nested loops.
    """
    profiler = Profiler()

    # Bucket the iteration spaces by their root Iteration
    by_root = OrderedDict()
    for itspace in FindSections().visit(node):
        by_root.setdefault(itspace[0], []).append(itspace)

    def dims(itspace):
        return {i.dim for i in itspace}

    # Within each bucket, group the consecutive iteration spaces having the
    # same set of dimensions
    found = []
    for itspaces in by_root.values():
        group = []
        for itspace in itspaces:
            if group and dims(itspace) != dims(group[0]):
                # Found a timing section
                found.append(tuple(group))
                group = []
            group.append(itspace)
        if group:
            found.append(tuple(group))

    # Create and track C-level timers
    mapper = OrderedDict()
    for n, group in enumerate(found):
        name = 'section_%d' % n
        section = group[0]

        # The timer goes around the second Iteration of the section if the
        # first one is not perfect (and a second one exists)
        if len(section) > 1 and not IsPerfectIteration().visit(section[0]):
            depth = 1
        else:
            depth = 0
        root = section[depth]

        # Prepare to transform the Iteration/Expression tree: the timed body
        # replaces /root/, whereas the other grouped Iterations are dropped
        body = tuple(itspace[depth] for itspace in group)
        mapper[root] = TimedList(gname=profiler.varname, lname=name, body=body)
        for itspace in group[1:]:
            mapper[itspace[depth]] = None

        # Computational properties of the profiled section
        expressions = FindNodes(Expression).visit(body)
        ops = estimate_cost([e.expr for e in expressions])
        memory = estimate_memory([e.expr for e in expressions])

        # Keep track of the new profiled section
        profiler.add(name, section, ops, memory)

    # Introduce the C-level timers in the Iteration/Expression tree
    return Transformer(mapper).visit(node), profiler
Example #18
0
def optimize_unfolded_tree(unfolded, root):
    """
    Transform folded trees to reduce the memory footprint.

    :param unfolded: Sequence of folded trees; each tree is a sequence of
                     Iterations as long as ``root``.
    :param root: Sequence of Iterations representing the root tree.
    :returns: A list with the transformed folded trees followed by the
              transformed root.

    Examples
    ========
    Given:

        .. code-block::
            for i = 1 to N - 1  # Folded tree
              for j = 1 to N - 1
                tmp[i,j] = ...
            for i = 2 to N - 2  # Root
              for j = 2 to N - 2
                ... = ... tmp[i,j] ...

    The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the
    iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the
    ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting
    iteration tree becomes:

        .. code-block::
            for i = 1 to i_bs + 1  # Folded tree
              for j = 1 to j_bs + 1
                i' = i + i_block - 2
                j' = j + j_block - 2
                tmp[i,j] = ... # use i' and j'
            for i = i_block to i_block + i_bs  # Root
              for j = j_block to j_block + j_bs
                i' = i - i_block
                j' = j - j_block
                ... = ... tmp[i',j'] ...
    """
    processed = []
    for i, tree in enumerate(unfolded):
        assert len(tree) == len(root)
        modified_tree = []
        modified_root = []
        # Maps each dimension of /tree/ to the new iteration variable
        mapper = {}

        # "Shrink" the iteration space
        for t1, t2 in zip(tree, root):
            start, end, incr = t1.args['limits']
            # The new variable is named after the Iteration index and the
            # position of the folded tree
            index = Symbol('%ss%d' % (t1.index, i))

            t1_uindex = (UnboundedIndex(index, start), )
            t2_uindex = (UnboundedIndex(index, -start), )

            # The folded Iteration now starts at 0, retaining its extent
            modified_tree.append(
                t1._rebuild(limits=[0, end - start, incr],
                            uindices=t1.uindices + t1_uindex))
            modified_root.append(t2._rebuild(uindices=t2.uindices + t2_uindex))

            mapper[t1.dim] = index

        # Temporary arrays can now be moved onto the stack
        exprs = FindNodes(Expression).visit(modified_tree[-1])
        if all(not j.is_Remainder for j in modified_tree):
            shape = tuple(j.bounds_symbolic[1] for j in modified_tree)
            for j in exprs:
                # The leading dimensions take the shrunk extents, the
                # remaining ones are left untouched
                j_shape = shape + j.output_function.shape[len(modified_tree):]
                j.output_function.update(shape=j_shape, onstack=True)

        # Substitute iteration variables within the folded trees
        modified_tree = compose_nodes(modified_tree)
        replaced = xreplace_indices([j.expr for j in exprs],
                                    mapper,
                                    only_rhs=True)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        processed.append(
            Transformer(dict(zip(exprs, subs))).visit(modified_tree))

        # Introduce the new iteration variables within /root/
        modified_root = compose_nodes(modified_root)
        exprs = FindNodes(Expression).visit(modified_root)
        candidates = [j.output for j in subs]
        replaced = xreplace_indices([j.expr for j in exprs], mapper,
                                    candidates)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        # /root/ is rebound here, so the next folded tree (if any) works on
        # the root as transformed so far
        root = Transformer(dict(zip(exprs, subs))).visit(modified_root)

    return processed + [root]
Example #19
0
    def _create_elemental_functions(self, state, **kwargs):
        """
        Extract :class:`Iteration` sub-trees and move them into :class:`Function`s.

        Currently, only tagged, elementizable Iteration objects are targeted.

        :param state: Object whose ``nodes`` attribute holds the trees to be
                      processed.
        :returns: A dictionary with the transformed trees (``'nodes'``) and
                  the generated functions (``'elemental_functions'``).
        """
        noinline = self._compiler_decoration('noinline',
                                             c.Comment('noinline?'))

        # Maps function names to the generated Functions
        functions = OrderedDict()
        processed = []
        for node in state.nodes:
            # Maps the roots of the extracted sub-trees to function calls
            mapper = {}
            for tree in retrieve_iteration_tree(node, mode='superset'):
                # Search an elementizable sub-tree (if any)
                tagged = filter_iterations(tree, lambda i: i.tag is not None,
                                           'asap')
                if not tagged:
                    continue
                root = tagged[0]
                if not root.is_Elementizable:
                    continue
                # The sub-tree spans from /root/ to the end of /tree/
                target = tree[tree.index(root):]

                # Elemental function arguments
                # (/args/ collects pairs: what is passed at the call site,
                # and the corresponding function parameter)
                args = []  # Found so far (scalars, tensors)
                maybe_required = set(
                )  # Scalars that *may* have to be passed in
                not_required = set(
                )  # Elemental function locally declared scalars

                # Build a new Iteration/Expression tree with free bounds
                free = []
                for i in target:
                    name, bounds = i.dim.name, i.bounds_symbolic
                    # Iteration bounds
                    start = ScalarFunction(name='%s_start' % name,
                                           dtype=np.int32)
                    finish = ScalarFunction(name='%s_finish' % name,
                                            dtype=np.int32)
                    args.extend(
                        zip([ccode(j) for j in bounds], (start, finish)))
                    # Iteration unbounded indices
                    ufunc = [
                        ScalarFunction(name='%s_ub%d' % (name, j),
                                       dtype=np.int32)
                        for j in range(len(i.uindices))
                    ]
                    args.extend(
                        zip([ccode(j.start) for j in i.uindices], ufunc))
                    # The new Iteration is bounded by the function parameters
                    limits = [Symbol(start.name), Symbol(finish.name), 1]
                    uindices = [
                        UnboundedIndex(j.index, i.dim + as_symbol(k))
                        for j, k in zip(i.uindices, ufunc)
                    ]
                    free.append(
                        i._rebuild(limits=limits,
                                   offsets=None,
                                   uindices=uindices))
                    not_required.update({i.dim},
                                        set(j.index for j in i.uindices))

                # Construct elemental function body, and inspect it
                free = NestedTransformer(dict((zip(target, free)))).visit(root)
                expressions = FindNodes(Expression).visit(free)
                fsymbols = FindSymbols('symbolics').visit(free)

                # Retrieve symbolic arguments
                for i in fsymbols:
                    if i.is_TensorFunction:
                        # Passed with a cast to a pointer to the data type
                        args.append(
                            ("(%s*)%s" % (c.dtype_to_ctype(i.dtype), i.name),
                             i))
                    elif i.is_TensorData:
                        args.append(("%s_vec" % i.name, i))
                    elif i.is_ConstantData:
                        args.append((i.name, i))

                # Retrieve scalar arguments
                not_required.update(
                    {i.output
                     for i in expressions if i.is_scalar})
                maybe_required.update(
                    set(FindSymbols(mode='free-symbols').visit(free)))
                for i in fsymbols:
                    not_required.update({as_symbol(i), i.indexify()})
                    for j in i.symbolic_shape:
                        maybe_required.update(j.free_symbols)
                # Whatever may be required and is not already available
                # gets passed in as an integer scalar, sorted by name
                required = filter_sorted(maybe_required - not_required,
                                         key=attrgetter('name'))
                args.extend([(i.name,
                              ScalarFunction(name=i.name, dtype=np.int32))
                             for i in required])

                call, params = zip(*args)
                handle = flatten([p.rtargs for p in params])
                # The function is named after the tag of /root/
                name = "f_%d" % root.tag

                # Produce the new FunCall
                mapper[root] = List(header=noinline, body=FunCall(name, call))

                # Produce the new Function
                # (only the first Function built for a given name is kept)
                functions.setdefault(
                    name, Function(name, free, 'void', handle, ('static', )))

            # Transform the main tree
            processed.append(Transformer(mapper).visit(node))

        return {'nodes': processed, 'elemental_functions': functions.values()}
Example #20
0
    def _loop_blocking(self, state, **kwargs):
        """
        Apply loop blocking to :class:`Iteration` trees.

        Every Iteration tree found in ``state.nodes`` is inspected. The parallel
        Iterations are the blocking candidates; those flagged as vectorizable
        are discarded unless ``'blockinner'`` is a key of ``self.params``. A tree
        is skipped when no candidate is left or when the outermost candidate
        does not root a perfect loop nest.

        Each candidate is split into an Iteration over blocks and an Iteration
        within a block. For example, blocking the two outer loops of: ::

            for i
              for j
                for k
                  ...

        yields a 2-dimensional block. Additional unit-stride loop nests are
        emitted to cover the iterations left out when a dimension size is not a
        multiple of its block size.

        The block sizes are taken from ``self.params['blockshape']``: a sized
        iterable is matched positionally against the blocked Iterations, while
        a value for which the matching raises ``TypeError`` (e.g., a plain
        integer) is assigned to the first blocked Iteration only. If no
        ``blockshape`` is supplied, a callable heuristic is attached to every
        blocked Iteration instead.

        :param state: Object whose ``nodes`` attribute provides the trees to be
                      transformed.
        :param kwargs: Not used by this method.
        :returns: A dictionary with the transformed ``'nodes'``; if at least one
                  Iteration was blocked, also ``'arguments'`` (a list of
                  :class:`BlockingArg`) and ``'flags'`` (``'blocking'``).
        """
        Region = namedtuple('Region', 'main leftover')

        # Blocked Iteration -> Dimension used to iterate over its blocks
        blocked = OrderedDict()
        processed = []
        for node in state.nodes:
            mapper = {}
            for tree in retrieve_iteration_tree(node):
                # Pick the blocking candidates, if any
                candidates = [it for it in tree if it.is_Parallel]
                if 'blockinner' not in self.params:
                    candidates = [
                        it for it in candidates if not it.is_Vectorizable
                    ]
                if not candidates:
                    continue
                outermost = candidates[0]
                if not IsPerfectIteration().visit(outermost):
                    continue

                # For each candidate, create the (over blocks, within block)
                # pair as well as the unit-stride loops used by the remainders
                regions = OrderedDict()
                pairs = []
                for it in candidates:
                    # Iteration over blocks
                    block_dim = blocked.setdefault(
                        it, Dimension("%s_block" % it.dim.name))
                    step = block_dim.symbolic_size
                    extent = it.dim.size or it.dim.symbolic_size
                    lower = it.limits[0] - it.offsets[0]
                    upper = extent - it.offsets[1]
                    upper = upper - ((upper - it.offsets[1]) % step)
                    over_blocks = Iteration([],
                                            block_dim, [lower, upper, step],
                                            properties=as_tuple('parallel'))

                    # Iteration within a block
                    lower = over_blocks.dim
                    upper = lower + step
                    if it.is_Vectorizable:
                        properties = 'vector-dim'
                    else:
                        properties = None
                    within_block = Iteration([],
                                             it.dim, [lower, upper, 1],
                                             it.index,
                                             properties=as_tuple(properties))

                    pairs.append((over_blocks, within_block))

                    # Unit-stride Iteration spanning the blocked ('main')
                    # region; required by the remainder loop nests
                    lower = over_blocks.limits[0]
                    upper = over_blocks.limits[1]
                    main = Iteration([],
                                     it.dim, [lower, upper, 1],
                                     it.index,
                                     properties=it.properties)

                    # Unit-stride Iteration spanning the 'leftover' region,
                    # non-empty when the size is not a multiple of the block
                    lower = over_blocks.limits[1]
                    upper = extent - it.offsets[1]
                    leftover = Iteration([],
                                         it.dim, [lower, upper, 1],
                                         it.index,
                                         properties=it.properties)

                    regions[it] = Region(main=main, leftover=leftover)

                # All Iterations over blocks first, then all within-block ones
                ordered = list(flatten(zip(*pairs)))
                blocked_tree = compose_nodes(ordered + [candidates[-1].nodes])

                # One remainder nest for each non-empty subset of candidates
                # taking their 'leftover' region
                remainder_tree = []
                for size in range(1, len(candidates) + 1):
                    for chosen in combinations(candidates, size):
                        nest = []
                        for it, region in regions.items():
                            if it in chosen:
                                nest.append(region.leftover)
                            else:
                                nest.append(region.main)
                        nest.append(candidates[-1].nodes)
                        remainder_tree.append(compose_nodes(nest))

                # The original nest gets replaced by blocked + remainder nests
                mapper[outermost] = List(body=[blocked_tree] + remainder_tree)

            processed.append(Transformer(mapper).visit(node))

        # Nothing was blocked, nothing else to report
        if not blocked:
            return {'nodes': processed}

        # Work out the block shape
        blockshape = self.params.get('blockshape')
        if blockshape:
            try:
                nitems, nrequired = len(blockshape), len(blocked)
                blockshape = dict(zip(blocked, blockshape))
                if nitems > nrequired:
                    dle_warning("Provided 'blockshape' has more entries than "
                                "blocked loops; dropping entries ...")
                if nitems < nrequired:
                    dle_warning("Provided 'blockshape' has fewer entries than "
                                "blocked loops; dropping dimensions ...")
            except TypeError:
                blockshape = {list(blocked)[0]: blockshape}
            # Blocked Iterations without a user-provided size get None
            for it in blocked.keys():
                blockshape.setdefault(it, None)
        else:
            # Use trivial heuristic for a suitable blockshape
            def heuristic(dim_size):
                ths = 8  # FIXME: This really needs to be improved
                return ths if dim_size > ths else 1

            blockshape = dict.fromkeys(blocked, heuristic)

        # Additional arguments required to execute the transformed nodes
        arguments = []
        for it, block_dim in blocked.items():
            arguments.append(BlockingArg(block_dim, it, blockshape[it]))

        return {
            'nodes': processed,
            'arguments': arguments,
            'flags': 'blocking'
        }
Example #21
0
    def _specialize(self, nodes, parameters):
        """
        Create a YASK representation of this Iteration/Expression tree.

        The Iteration trees in ``nodes`` are scanned for a candidate to offload:
        a tree with parallel Iterations, forming a perfect nest whose innermost
        body contains Expressions only, and whose Iterations match, one by one,
        the dimensions of the (unique) time-dependent grid accessed.

        If exactly one candidate is found, its expressions are translated into
        a YASK solution, which is then JIT-compiled; the second Iteration of the
        tree is replaced by a call running that solution. Should anything go
        wrong during this step, the original ``nodes`` are retained and a null
        YASK solution is used.

        :param nodes: The Iteration/Expression tree to be specialized.
        :param parameters: List of Operator parameters. It is modified in-place
                           adding YASK-related arguments (the solution pointer,
                           if any, and one object per parameter that has a
                           truthy ``from_YASK`` attribute).
        :returns: The (possibly transformed) Iteration/Expression tree.
        """

        log("Specializing a Devito Operator for YASK...")

        # Find offloadable Iteration/Expression trees
        offloadable = []
        for tree in retrieve_iteration_tree(nodes):
            parallel = filter_iterations(tree, lambda i: i.is_Parallel)
            if not parallel:
                # Cannot offload non-parallel loops
                continue
            if not (IsPerfectIteration().visit(tree)
                    and all(i.is_Expression for i in tree[-1].nodes)):
                # Don't know how to offload this Iteration/Expression to YASK
                continue
            functions = flatten(i.functions for i in tree[-1].nodes)
            keys = set((i.indices, i.shape, i.dtype) for i in functions
                       if i.is_TimeData)
            if not keys:
                continue
            elif len(keys) > 1:
                exit("Cannot handle Operators w/ heterogeneous grids")
            dimensions, shape, dtype = keys.pop()
            if len(dimensions) == len(tree) and\
                    all(i.dim == j for i, j in zip(tree, dimensions)):
                # Detected a "full" Iteration/Expression tree (over both
                # time and space dimensions)
                offloadable.append((tree, dimensions, shape, dtype))

        # Construct YASK ASTs given Devito expressions. New grids may be allocated.
        if not offloadable:
            # No offloadable trees found
            self.context = YaskNullContext()
            self.yk_soln = YaskNullSolution()
            processed = nodes
            log("No offloadable trees found")
        elif len(offloadable) == 1:
            # Found *the* offloadable tree for this Operator
            tree, dimensions, shape, dtype = offloadable[0]
            self.context = contexts.fetch(dimensions, shape, dtype)

            # Create a YASK compiler solution for this Operator
            # Note: this can be dropped as soon as the kernel has been built
            yc_soln = self.context.make_yc_solution(namespace['jit-yc-soln'])

            transform = sympy2yask(self.context, yc_soln)
            try:
                for i in tree[-1].nodes:
                    transform(i.expr)

                funcall = make_sharedptr_funcall(namespace['code-soln-run'],
                                                 ['time'],
                                                 namespace['code-soln-name'])
                funcall = Element(c.Statement(ccode(funcall)))
                # NOTE(review): tree[1] is presumably the outermost space
                # loop, nested in the time loop -- confirm
                processed = Transformer({tree[1]: funcall}).visit(nodes)

                # Track this is an external function call
                self.func_table[namespace['code-soln-run']] = FunMeta(
                    None, False)

                # JIT-compile the newly-created YASK kernel
                self.yk_soln = self.context.make_yk_solution(
                    namespace['jit-yk-soln'], yc_soln)

                # Now we must drop a pointer to the YASK solution down to C-land
                parameters.append(
                    Object(namespace['code-soln-name'],
                           namespace['type-solution'],
                           self.yk_soln.rawpointer))

                # Print some useful information about the newly constructed solution
                log("Solution '%s' contains %d grid(s) and %d equation(s)." %
                    (yc_soln.get_name(), yc_soln.get_num_grids(),
                     yc_soln.get_num_equations()))
            except Exception:
                # Best-effort offloading: on any failure fall back to the
                # untouched tree. ``Exception`` (rather than a bare ``except``)
                # lets KeyboardInterrupt and SystemExit propagate.
                self.yk_soln = YaskNullSolution()
                processed = nodes
                log("Unable to offload a candidate tree.")
        else:
            exit("Found more than one offloadable trees in a single Operator")

        # Some Iteration/Expression trees are not offloaded to YASK and may
        # require further processing to be executed in YASK, due to the differences
        # in storage layout employed by Devito and YASK
        processed = make_grid_accesses(processed)

        # Update the parameters list adding all necessary YASK grids
        # (iterate over a copy, since the list grows while being traversed)
        for i in list(parameters):
            try:
                if i.from_YASK:
                    parameters.append(
                        Object(namespace['code-grid-name'](i.name),
                               namespace['type-grid'], i.data.rawpointer))
            except AttributeError:
                # Ignore e.g. Dimensions
                pass

        log("Specialization successfully performed!")

        return processed
Example #22
0
    def _loop_blocking(self, state, **kwargs):
        """
        Apply loop blocking to :class:`Iteration` trees.

        Blocking is applied to parallel iteration trees. Heuristically, innermost
        dimensions are not blocked to maximize the trip count of the SIMD loops.

        Different heuristics may be specified through the DLE parameters
        ``blockshape``, ``blockinner`` and ``blockalways`` (all read from
        ``self.params``). For example, for the :class:`Iteration` tree: ::

            for i
              for j
                for k
                  ...

        one may request that the two outer loops be blocked with a 4x7 block
        through ``blockshape``. ``blockinner`` may be set to True to also block
        innermost parallel :class:`Iteration` objects (by default, vectorizable
        Iterations are excluded). ``blockalways`` may be set to True to block a
        tree even when its outermost Iteration is not sequential.

        NOTE(review): ``blockshape`` is matched positionally against the blocked
        Iterations (via ``zip``), or assigned to the first blocked Iteration if
        that raises ``TypeError``; a dictionary would therefore contribute its
        keys rather than its values -- confirm the expected format.

        :param state: Object whose ``nodes`` attribute provides the trees to be
                      transformed.
        :param kwargs: Not used by this method.
        :returns: A dictionary with the transformed ``'nodes'``; if at least one
                  Iteration was blocked, also ``'arguments'`` (a list of
                  :class:`BlockingArg`) and ``'flags'`` (``'blocking'``).
        """
        exclude_innermost = not self.params.get('blockinner', False)
        ignore_heuristic = self.params.get('blockalways', False)

        # Blocked Iteration -> Dimension used to iterate over its blocks
        blocked = OrderedDict()
        processed = []
        for node in state.nodes:
            # Make sure loop blocking will span as many Iterations as possible
            fold = fold_blockable_tree(node, exclude_innermost)

            mapper = {}
            for tree in retrieve_iteration_tree(fold):
                # Is the Iteration tree blockable ?
                iterations = [i for i in tree if i.is_Parallel]
                if exclude_innermost:
                    iterations = [
                        i for i in iterations if not i.is_Vectorizable
                    ]
                # At least two candidate Iterations are required
                if len(iterations) <= 1:
                    continue
                root = iterations[0]
                if not IsPerfectIteration().visit(root):
                    # Illegal/unsupported
                    continue
                if not tree[0].is_Sequential and not ignore_heuristic:
                    # Heuristic: avoid polluting the generated code with blocked
                    # nests (thus increasing JIT compilation time and affecting
                    # readability) if the blockable tree isn't embedded in a
                    # sequential loop (e.g., a timestepping loop)
                    continue

                # Decorate intra-block iterations with an IterationProperty
                # (one tag per tree blocked so far within this node)
                TAG = tagger(len(mapper))

                # Build all necessary Iteration objects, individually. These will
                # subsequently be composed to implement loop blocking.
                # The three lists are kept aligned: entry k of each refers to
                # the k-th Iteration in /iterations/.
                inter_blocks = []
                intra_blocks = []
                remainders = []
                for i in iterations:
                    # Build Iteration over blocks
                    dim = blocked.setdefault(
                        i, Dimension("%s_block" % i.dim.name))
                    block_size = dim.symbolic_size
                    iter_size = i.dim.size or i.dim.symbolic_size
                    start = i.limits[0] - i.offsets[0]
                    finish = iter_size - i.offsets[1]
                    # Shrink /finish/ so that the blocked extent is a multiple
                    # of the block size; the rest is left to remainder loops
                    innersize = iter_size - (-i.offsets[0] + i.offsets[1])
                    finish = finish - (innersize % block_size)
                    inter_block = Iteration([],
                                            dim, [start, finish, block_size],
                                            properties=PARALLEL)
                    inter_blocks.append(inter_block)

                    # Build Iteration within a block
                    start = inter_block.dim
                    finish = start + block_size
                    intra_block = i._rebuild([],
                                             limits=[start, finish, 1],
                                             offsets=None,
                                             properties=i.properties +
                                             (TAG, ELEMENTAL))
                    intra_blocks.append(intra_block)

                    # Build unitary-increment Iteration over the 'leftover' region.
                    # This will be used for remainder loops, executed when any
                    # dimension size is not a multiple of the block size.
                    start = inter_block.limits[1]
                    finish = iter_size - i.offsets[1]
                    remainder = i._rebuild([],
                                           limits=[start, finish, 1],
                                           offsets=None)
                    remainders.append(remainder)

                # Build blocked Iteration nest: all inter-block Iterations
                # first, then all intra-block ones, then the original body
                blocked_tree = compose_nodes(inter_blocks + intra_blocks +
                                             [iterations[-1].nodes])

                # Build remainder Iterations: one nest for each non-empty
                # subset /c/ of the blocked dimensions
                remainder_trees = []
                for n in range(len(iterations)):
                    for c in combinations([i.dim for i in iterations], n + 1):
                        # First all inter-block Iterations (only for those
                        # dimensions that are not in /c/)
                        nodes = [
                            b._rebuild(properties=b.properties + (REMAINDER, ))
                            for b, r in zip(inter_blocks, remainders)
                            if r.dim not in c
                        ]
                        # Then intra-block or remainder, for each dim (in order)
                        properties = (REMAINDER, TAG, ELEMENTAL)
                        for b, r in zip(intra_blocks, remainders):
                            handle = r if b.dim in c else b
                            nodes.append(
                                handle._rebuild(properties=properties))
                        nodes.extend([iterations[-1].nodes])
                        remainder_trees.append(compose_nodes(nodes))

                # Will replace with blocked loop tree
                mapper[root] = List(body=[blocked_tree] + remainder_trees)

            rebuilt = Transformer(mapper).visit(fold)

            # Finish unrolling any previously folded Iterations
            processed.append(unfold_blocked_tree(rebuilt))

        # All blocked dimensions
        if not blocked:
            return {'nodes': processed}

        # Determine the block shape
        blockshape = self.params.get('blockshape')
        if not blockshape:
            # Use trivial heuristic for a suitable blockshape
            def heuristic(dim_size):
                ths = 8  # FIXME: This really needs to be improved
                return ths if dim_size > ths else 1

            # The callable itself (not a value) is attached to each Iteration
            blockshape = {k: heuristic for k in blocked.keys()}
        else:
            try:
                nitems, nrequired = len(blockshape), len(blocked)
                # Positional matching; zip() stops at the shorter sequence
                blockshape = {k: v for k, v in zip(blocked, blockshape)}
                if nitems > nrequired:
                    dle_warning("Provided 'blockshape' has more entries than "
                                "blocked loops; dropping entries ...")
                if nitems < nrequired:
                    dle_warning("Provided 'blockshape' has fewer entries than "
                                "blocked loops; dropping dimensions ...")
            except TypeError:
                # E.g., /blockshape/ has no len(): use it for the first
                # blocked Iteration only
                blockshape = {list(blocked)[0]: blockshape}
            # Blocked Iterations without a user-provided size get None
            blockshape.update(
                {k: None
                 for k in blocked.keys() if k not in blockshape})

        # Track any additional arguments required to execute /state.nodes/
        arguments = [
            BlockingArg(v, k, blockshape[k]) for k, v in blocked.items()
        ]

        return {
            'nodes': processed,
            'arguments': arguments,
            'flags': 'blocking'
        }
Example #23
0
    def _ompize(self, state, **kwargs):
        """
        Add OpenMP pragmas to the Iteration/Expression tree to emit parallel code.

        For each node in ``state.nodes``, the Iteration trees are grouped by
        their outermost run of parallel Iterations (elementizable and
        vectorizable ones excluded); each group is embedded in one OpenMP
        parallel region, in which every root Iteration is decorated with either
        an ``omp for`` or an ``omp for collapse`` pragma.

        :param state: Object whose ``nodes`` attribute provides the trees to be
                      transformed.
        :param kwargs: Not used by this method.
        :returns: A dictionary with the transformed ``'nodes'``; nodes for which
                  the transformation produces ``None`` are dropped.
        """

        processed = []
        for node in state.nodes:

            # Reset denormals flag each time a parallel region is entered
            # (Denormals nodes are searched in the whole /state.nodes/, mapped
            # to None here, and placed at the head of each parallel region below)
            denormals = FindNodes(Denormals).visit(state.nodes)
            mapper = OrderedDict([(i, None) for i in denormals])

            # Group by outer loop so that we can embed within the same parallel region
            was_tagged = False
            groups = OrderedDict()
            for tree in retrieve_iteration_tree(node):
                # Determine the number of consecutive parallelizable Iterations
                key = lambda i: i.is_Parallel and\
                    not (i.is_Elementizable or i.is_Vectorizable)
                candidates = filter_iterations(tree, key=key, stop='asap')
                if not candidates:
                    was_tagged = False
                    continue
                # Consecutive tagged Iteration go in the same group
                is_tagged = any(i.tag is not None for i in tree)
                # NOTE: /key/ is reused, now as the group identifier: the last
                # group if both this and the previous tree are tagged, a new
                # group otherwise
                key = len(groups) - (is_tagged & was_tagged)
                handle = groups.setdefault(key, OrderedDict())
                handle[candidates[0]] = candidates
                was_tagged = is_tagged

            # Handle parallelizable loops
            for group in groups.values():
                private = []
                for root, tree in group.items():
                    # Heuristic: if at least two parallel loops are available and the
                    # physical core count is not lower than self.thresholds['collapse'],
                    # then omp-collapse the loops
                    nparallel = len(tree)
                    if psutil.cpu_count(logical=False) < self.thresholds['collapse'] or\
                            nparallel < 2:
                        parallel = omplang['for']
                    else:
                        parallel = omplang['collapse'](nparallel)

                    mapper[root] = root._rebuild(pragmas=root.pragmas +
                                                 (parallel, ))

                    # Track the thread-private and thread-shared variables
                    private.extend([
                        i for i in FindSymbols('symbolics').visit(root)
                        if i.is_TensorFunction and i._mem_stack
                    ])

                # Build the parallel region
                # (names are de-duplicated and sorted before being joined)
                private = sorted(set([i.name for i in private]))
                private = ('private(%s)' %
                           ','.join(private)) if private else ''
                rebuilt = [v for k, v in mapper.items() if k in group]
                par_region = Block(header=omplang['par-region'](private),
                                   body=denormals + rebuilt)
                # Every Iteration still stored in /mapper/ (i.e., those rebuilt
                # for this group) is now mapped to the whole parallel region,
                # or to None if it is a remainder Iteration
                for k, v in list(mapper.items()):
                    if isinstance(v, Iteration):
                        mapper[k] = None if v.is_Remainder else par_region

            handle = Transformer(mapper).visit(node)
            if handle is not None:
                processed.append(handle)

        return {'nodes': processed}
Example #24
0
    def _loop_fission(self, state, **kwargs):
        """
        Apply loop fission to innermost :class:`Iteration` objects.

        An innermost Iteration is split only if it is nested in at least one
        other Iteration, its body consists of Expressions only, and the number
        of such Expressions is not lower than ``self.thresholds['max_fission']``.
        Furthermore, all functions appearing in the body must share the same
        last index. The Expressions are rewritten through
        ``promote_scalar_expressions`` and then distributed over new Iterations
        in groups of ``self.thresholds['min_fission']``.

        :param state: Object whose ``nodes`` attribute provides the trees to be
                      transformed.
        :param kwargs: Not used by this method.
        :returns: A dictionary with the transformed ``'nodes'``.
        """

        processed = []
        for node in state.nodes:
            mapper = {}
            for tree in retrieve_iteration_tree(node):
                # Heuristic: leave single-loop trees alone
                if len(tree) <= 1:
                    continue

                innermost = tree[-1]
                exprs = [
                    stmt for stmt in innermost.nodes if stmt.is_Expression
                ]

                # Heuristic: too few statements to be worth splitting
                if len(exprs) < self.thresholds['max_fission']:
                    continue
                # Correctness: bodies with non-Expression nodes are not handled
                if len(exprs) != len(innermost.nodes):
                    continue

                touched = list(
                    set.union(*[set(stmt.functions) for stmt in exprs]))
                symbolic = [stmt.expr for stmt in exprs]

                # Heuristic: nothing to work on
                if not (touched and symbolic):
                    continue

                # Correctness: all functions must agree on the last index
                reference = touched[0]
                dim = reference.indices[-1]
                size = reference.shape[-1]
                if any(dim != f.indices[-1] for f in touched):
                    continue

                # Turn scalar temporaries into tensors over the last dimension
                symbolic = promote_scalar_expressions(symbolic, (size, ),
                                                      (dim, ), True)

                assert len(symbolic) == len(exprs)
                promoted = []
                for sym, stmt in zip(symbolic, exprs):
                    promoted.append(Expression(sym, stmt.dtype))

                # Each new Iteration inherits the arguments of the original
                # one, plus the ELEMENTAL property
                # TODO: Need a heuristic here to maximize reuse
                iter_args = innermost.args_frozen
                iter_args['properties'] = as_tuple(
                    iter_args['properties']) + (ELEMENTAL, )
                group_size = self.thresholds['min_fission']
                pieces = []
                for group in grouper(promoted, group_size):
                    pieces.append(Iteration(group, **iter_args))

                mapper[innermost] = List(body=pieces)

            processed.append(Transformer(mapper).visit(node))

        return {'nodes': processed}
Example #25
0
    def _minimize_remainders(self, state, **kwargs):
        """
        Reshape temporary tensors and adjust loop trip counts to prevent as many
        compiler-generated remainder loops as possible.

        The Iteration trees in ``state.nodes`` and ``state.elemental_functions``
        are inspected; only those having a tagged vectorizable Iteration are
        processed. For such an Iteration:

            * The innermost dimension of the tensors it writes to is padded,
              based on the number of items fitting in a SIMD register. The
              padding must be the same for all tensors, otherwise the tree is
              skipped.
            * If the end point of the Iteration is a symbol, a new (padded) end
              point is declared ahead of the tree and used as loop bound.

        :param state: Object providing the ``nodes`` and ``elemental_functions``
                      to be transformed.
        :param kwargs: Not used by this method.
        :returns: A dictionary with the transformed ``'nodes'`` and
                  ``'elemental_functions'``.
        """

        mapper = {}
        for tree in retrieve_iteration_tree(state.nodes +
                                            state.elemental_functions):
            vector_iterations = [i for i in tree if i.is_Vectorizable]
            if not vector_iterations:
                continue
            assert len(vector_iterations) == 1
            root = vector_iterations[0]
            if root.tag is None:
                continue

            # Padding
            writes = [
                i for i in FindSymbols('symbolics-writes').visit(root)
                if i.is_TensorFunction
            ]
            padding = []
            for i in writes:
                try:
                    simd_items = get_simd_items(i.dtype)
                except KeyError:
                    # Fallback to 16 (maximum expectable padding, for AVX512 registers)
                    # Floor division keeps the item count an integer; true
                    # division would turn it (and hence the padding and the
                    # padded shape) into a float under Python 3
                    # NOTE(review): assumes simdinfo['avx512f'] is an integer
                    # register size in bytes -- confirm
                    simd_items = simdinfo['avx512f'] // np.dtype(
                        i.dtype).itemsize
                padding.append(simd_items - i.shape[-1] % simd_items)
            if len(set(padding)) == 1:
                padding = padding[0]
                for i in writes:
                    i.update(shape=i.shape[:-1] + (i.shape[-1] + padding, ))
            else:
                # Padding must be uniform -- not the case, so giving up
                # (this also covers the case in which there are no writes)
                continue

            # Dynamic trip count adjustment
            endpoint = root.end_symbolic
            if not endpoint.is_Symbol:
                continue
            # The padded end point can be used only if it does not exceed the
            # innermost extent of the symbols accessed within /root/
            condition = []
            externals = set(i.symbolic_shape[-1]
                            for i in FindSymbols().visit(root))
            for i in root.uindices:
                for j in externals:
                    condition.append(root.end_symbolic + padding < j)
            condition = ' || '.join(ccode(i) for i in condition)
            endpoint_padded = endpoint.func(name='_%s' % endpoint.name)
            init = cgen.Initializer(
                cgen.Value("const int", endpoint_padded),
                cgen.Line('(%s) ? %s : %s' %
                          (condition, ccode(endpoint + padding), endpoint)))

            # Update the Iteration bound
            limits = list(root.limits)
            limits[1] = endpoint_padded.func(endpoint_padded.name)
            rebuilt = list(tree)
            rebuilt[rebuilt.index(root)] = root._rebuild(limits=limits)

            # The declaration of the padded end point precedes the whole tree
            mapper[tree[0]] = List(header=init, body=compose_nodes(rebuilt))

        nodes = Transformer(mapper).visit(state.nodes)
        elemental_functions = Transformer(mapper).visit(
            state.elemental_functions)

        return {'nodes': nodes, 'elemental_functions': elemental_functions}