Example 1
    def _insert_declarations(self, nodes):
        """Populate the Operator's body with the required array and variable
        declarations, to generate a legal C file.

        :param nodes: The Iteration/Expression tree to be populated.
        :returns: The transformed tree; if heap allocations are required, the
                  tree is wrapped in a :class:`List` carrying the allocation
                  headers and the free footers.
        """
        # Resolve function calls first: declarations required by a locally
        # defined callee are collected from within the callee's own tree
        scopes = []
        for k, v in FindScopes().visit(nodes).items():
            if k.is_FunCall:
                func = self.func_table[k.name]
                if func.local:
                    scopes.extend(FindScopes().visit(func.root,
                                                     queue=list(v)).items())
            else:
                scopes.append((k, v))

        # The filter predicate does not depend on the loop variables, so it is
        # defined once here rather than rebuilt as a lambda on each iteration
        def key(i):
            return not i.is_Parallel

        # Determine all required declarations
        allocator = Allocator()
        mapper = OrderedDict()
        for k, v in scopes:
            if k.is_scalar:
                # Inline declaration
                mapper[k] = LocalExpression(**k.args)
            elif k.output_function._mem_external:
                # Nothing to do, variable passed as kernel argument
                continue
            elif k.output_function._mem_stack:
                # On the stack, as established by the DLE; fall back to the
                # tree root if no suitable declaration site is found
                site = filter_iterations(v, key=key, stop='asap') or [nodes]
                allocator.push_stack(site[-1], k.output_function)
            else:
                # On the heap, as a tensor that must be globally accessible
                allocator.push_heap(k.output_function)

        # Introduce declarations on the stack, both in the main tree and in
        # any locally defined functions
        for k, v in allocator.onstack:
            mapper[k] = tuple(Element(i) for i in v)
        nodes = NestedTransformer(mapper).visit(nodes)
        for k, v in list(self.func_table.items()):
            if v.local:
                self.func_table[k] = FunMeta(
                    Transformer(mapper).visit(v.root), v.local)

        # Introduce declarations on the heap (if any)
        if allocator.onheap:
            decls, allocs, frees = zip(*allocator.onheap)
            nodes = List(header=decls + allocs, body=nodes, footer=frees)

        return nodes
Example 2
    def _insert_declarations(self, dle_state, parameters):
        """Populate the Operator's body with the required array and
        variable declarations, to generate a legal C file.

        :param dle_state: Carrier of the Iteration/Expression tree
                          (``dle_state.nodes``), the function table, and the
                          elemental functions.
        :param parameters: The kernel parameters (unused here; kept for
                           interface compatibility).
        :returns: A 2-tuple ``(nodes, elemental_functions)`` with the
                  transformed trees.
        """
        nodes = dle_state.nodes

        # Resolve function calls first: declarations required by a callee are
        # collected from within the callee's own tree
        scopes = []
        for k, v in FindScopes().visit(nodes).items():
            if k.is_FunCall:
                function = dle_state.func_table[k.name]
                scopes.extend(FindScopes().visit(function,
                                                 queue=list(v)).items())
            else:
                scopes.append((k, v))

        # Determine all required declarations
        allocator = Allocator()
        mapper = OrderedDict()
        for k, v in scopes:
            if k.is_scalar:
                # Inline declaration
                mapper[k] = LocalExpression(**k.args)
            elif k.output_function._mem_external:
                # Nothing to do, variable passed as kernel argument
                continue
            elif k.output_function._mem_stack:
                # On the stack, as established by the DLE. The lambda captures
                # ``k``, so it cannot be hoisted out of the loop
                key = lambda i: i.dim not in k.output_function.indices
                # Fall back to the tree root if no suitable declaration site
                # is found, rather than raising IndexError on ``site[-1]``
                site = filter_iterations(v, key=key,
                                         stop='consecutive') or [nodes]
                allocator.push_stack(site[-1], k.output_function)
            else:
                # On the heap, as a tensor that must be globally accessible
                allocator.push_heap(k.output_function)

        # Introduce declarations on the stack
        for k, v in allocator.onstack:
            allocs = as_tuple([Element(i) for i in v])
            mapper[k] = Iteration(allocs + k.nodes, **k.args_frozen)
        nodes = Transformer(mapper).visit(nodes)
        elemental_functions = Transformer(mapper).visit(
            dle_state.elemental_functions)

        # Introduce declarations on the heap (if any)
        if allocator.onheap:
            decls, allocs, frees = zip(*allocator.onheap)
            nodes = List(header=decls + allocs, body=nodes, footer=frees)

        return nodes, elemental_functions
Example 3
    def _ompize(self, state, **kwargs):
        """
        Add OpenMP pragmas to the Iteration/Expression tree to emit parallel code.

        :param state: Carrier of the Iteration/Expression trees
                      (``state.nodes``).
        :returns: A dict mapping ``'nodes'`` to the transformed trees.
        """
        # Loop invariants hoisted out of the per-node loop: the denormals
        # search visits the whole of ``state.nodes`` regardless of the current
        # node, and the filter predicate has no loop-dependent captures
        denormals = FindNodes(Denormals).visit(state.nodes)

        def key(i):
            return i.is_Parallel and not i.is_Vectorizable

        processed = []
        for node in state.nodes:

            # Reset denormals flag each time a parallel region is entered;
            # a fresh mapper is needed per node since roots are added below
            mapper = {
                i: List(c.Comment('DLE: moved denormals flag'))
                for i in denormals
            }

            # Handle parallelizable loops
            for tree in retrieve_iteration_tree(node):
                # Determine the number of consecutive parallelizable Iterations
                candidates = filter_iterations(tree,
                                               key=key,
                                               stop='consecutive')
                if not candidates:
                    continue

                # Heuristic: if at least two parallel loops are available and the
                # physical core count is greater than self.thresholds['collapse'],
                # then omp-collapse the loops
                nparallel = len(candidates)
                if psutil.cpu_count(logical=False) < self.thresholds['collapse'] or\
                        nparallel < 2:
                    parallelism = omplang['for']
                else:
                    parallelism = omplang['collapse'](nparallel)

                # Wrap the outermost candidate in a parallel region, with the
                # denormals flags re-issued at the top of the region
                root = candidates[0]
                mapper[root] = Block(header=omplang['par-region'],
                                     body=denormals +
                                     [Element(parallelism), root])

            processed.append(Transformer(mapper).visit(node))

        return {'nodes': processed}
Example 4
    def __init__(self, expressions, **kwargs):
        """Build the Operator, then instrument its body for debugging:
        sequential loops get a minimal trip count, and each non-sequential
        Iteration nest is prefixed with a printed entry marker."""
        super(OperatorDebug, self).__init__(expressions, **kwargs)
        self._includes.append('stdio.h')

        # Minimize the trip count of the sequential loops
        sequential = {
            i for i in flatten(retrieve_iteration_tree(self.body))
            if i.is_Sequential
        }
        mapper = {i: i._rebuild(limits=(max(i.offsets) + 2))
                  for i in sequential}
        self.body = Transformer(mapper).visit(self.body)

        # Mark entry/exit points of each non-sequential Iteration tree in the body
        entry_points = []
        for tree in retrieve_iteration_tree(self.body):
            found = filter_iterations(tree, lambda j: not j.is_Sequential,
                                      'any')
            if found:
                entry_points.append(found[0])
        mapper = {
            node: List(header=printmark('In nest %d' % n), body=node)
            for n, node in enumerate(entry_points)
        }
        self.body = Transformer(mapper).visit(self.body)
Example 5
    def _ompize(self, state, **kwargs):
        """
        Add OpenMP pragmas to the Iteration/Expression tree to emit parallel code

        Consecutive tagged Iteration trees are grouped so that they can be
        embedded within a single parallel region.
        """

        processed = []
        for node in state.nodes:

            # Reset denormals flag each time a parallel region is entered
            denormals = FindNodes(Denormals).visit(state.nodes)
            # Map each denormals statement to None (i.e., removal); they are
            # re-issued at the top of each parallel region built below
            mapper = OrderedDict([(i, None) for i in denormals])

            # Group by outer loop so that we can embed within the same parallel region
            was_tagged = False
            groups = OrderedDict()
            for tree in retrieve_iteration_tree(node):
                # Determine the number of consecutive parallelizable Iterations
                key = lambda i: i.is_Parallel and\
                    not (i.is_Elementizable or i.is_Vectorizable)
                candidates = filter_iterations(tree, key=key, stop='asap')
                if not candidates:
                    was_tagged = False
                    continue
                # Consecutive tagged Iteration go in the same group
                is_tagged = any(i.tag is not None for i in tree)
                # If both this tree and the previous one are tagged, reuse the
                # last group's key; otherwise open a new group
                key = len(groups) - (is_tagged & was_tagged)
                handle = groups.setdefault(key, OrderedDict())
                handle[candidates[0]] = candidates
                was_tagged = is_tagged

            # Handle parallelizable loops
            for group in groups.values():
                private = []
                for root, tree in group.items():
                    # Heuristic: if at least two parallel loops are available and the
                    # physical core count is greater than self.thresholds['collapse'],
                    # then omp-collapse the loops
                    nparallel = len(tree)
                    if psutil.cpu_count(logical=False) < self.thresholds['collapse'] or\
                            nparallel < 2:
                        parallel = omplang['for']
                    else:
                        parallel = omplang['collapse'](nparallel)

                    # Attach the work-sharing pragma directly to the root loop
                    mapper[root] = root._rebuild(pragmas=root.pragmas +
                                                 (parallel, ))

                    # Track the thread-private and thread-shared variables
                    private.extend([
                        i for i in FindSymbols('symbolics').visit(root)
                        if i.is_TensorFunction and i._mem_stack
                    ])

                # Build the parallel region
                private = sorted(set([i.name for i in private]))
                private = ('private(%s)' %
                           ','.join(private)) if private else ''
                rebuilt = [v for k, v in mapper.items() if k in group]
                par_region = Block(header=omplang['par-region'](private),
                                   body=denormals + rebuilt)
                # Replace each rebuilt root with the enclosing parallel region;
                # remainder loops are dropped (mapped to None) instead
                for k, v in list(mapper.items()):
                    if isinstance(v, Iteration):
                        mapper[k] = None if v.is_Remainder else par_region

            handle = Transformer(mapper).visit(node)
            if handle is not None:
                processed.append(handle)

        return {'nodes': processed}
Example 6
    def _specialize(self, nodes, parameters):
        """
        Create a YASK representation of this Iteration/Expression tree.

        ``parameters`` is modified in-place adding YASK-related arguments.

        :param nodes: The Iteration/Expression tree to be specialized.
        :param parameters: The kernel parameters list; YASK solution and grid
                           pointers are appended to it.
        :returns: The transformed tree, with the offloaded sub-tree (if any)
                  replaced by a call into the YASK solution.
        """

        log("Specializing a Devito Operator for YASK...")

        # Find offloadable Iteration/Expression trees
        offloadable = []
        for tree in retrieve_iteration_tree(nodes):
            parallel = filter_iterations(tree, lambda i: i.is_Parallel)
            if not parallel:
                # Cannot offload non-parallel loops
                continue
            if not (IsPerfectIteration().visit(tree)
                    and all(i.is_Expression for i in tree[-1].nodes)):
                # Don't know how to offload this Iteration/Expression to YASK
                continue
            functions = flatten(i.functions for i in tree[-1].nodes)
            keys = set((i.indices, i.shape, i.dtype) for i in functions
                       if i.is_TimeData)
            if len(keys) == 0:
                continue
            elif len(keys) > 1:
                exit("Cannot handle Operators w/ heterogeneous grids")
            dimensions, shape, dtype = keys.pop()
            if len(dimensions) == len(tree) and\
                    all(i.dim == j for i, j in zip(tree, dimensions)):
                # Detected a "full" Iteration/Expression tree (over both
                # time and space dimensions)
                offloadable.append((tree, dimensions, shape, dtype))

        # Construct YASK ASTs given Devito expressions. New grids may be allocated.
        if len(offloadable) == 0:
            # No offloadable trees found
            self.context = YaskNullContext()
            self.yk_soln = YaskNullSolution()
            processed = nodes
            log("No offloadable trees found")
        elif len(offloadable) == 1:
            # Found *the* offloadable tree for this Operator
            tree, dimensions, shape, dtype = offloadable[0]
            self.context = contexts.fetch(dimensions, shape, dtype)

            # Create a YASK compiler solution for this Operator
            # Note: this can be dropped as soon as the kernel has been built
            yc_soln = self.context.make_yc_solution(namespace['jit-yc-soln'])

            transform = sympy2yask(self.context, yc_soln)
            try:
                for i in tree[-1].nodes:
                    transform(i.expr)

                funcall = make_sharedptr_funcall(namespace['code-soln-run'],
                                                 ['time'],
                                                 namespace['code-soln-name'])
                funcall = Element(c.Statement(ccode(funcall)))
                processed = Transformer({tree[1]: funcall}).visit(nodes)

                # Track this is an external function call
                self.func_table[namespace['code-soln-run']] = FunMeta(
                    None, False)

                # JIT-compile the newly-created YASK kernel
                self.yk_soln = self.context.make_yk_solution(
                    namespace['jit-yk-soln'], yc_soln)

                # Now we must drop a pointer to the YASK solution down to C-land
                parameters.append(
                    Object(namespace['code-soln-name'],
                           namespace['type-solution'],
                           self.yk_soln.rawpointer))

                # Print some useful information about the newly constructed solution
                log("Solution '%s' contains %d grid(s) and %d equation(s)." %
                    (yc_soln.get_name(), yc_soln.get_num_grids(),
                     yc_soln.get_num_equations()))
            except Exception:
                # Narrowed from a bare ``except:``, which would also swallow
                # SystemExit and KeyboardInterrupt. The fallback is deliberate
                # best-effort: run the unmodified tree without YASK offloading
                self.yk_soln = YaskNullSolution()
                processed = nodes
                log("Unable to offload a candidate tree.")
        else:
            exit("Found more than one offloadable trees in a single Operator")

        # Some Iteration/Expression trees are not offloaded to YASK and may
        # require further processing to be executed in YASK, due to the differences
        # in storage layout employed by Devito and YASK
        processed = make_grid_accesses(processed)

        # Update the parameters list adding all necessary YASK grids
        for i in list(parameters):
            try:
                if i.from_YASK:
                    parameters.append(
                        Object(namespace['code-grid-name'](i.name),
                               namespace['type-grid'], i.data.rawpointer))
            except AttributeError:
                # Ignore e.g. Dimensions
                pass

        log("Specialization successfully performed!")

        return processed
Example 7
    def _create_elemental_functions(self, state, **kwargs):
        """
        Extract :class:`Iteration` sub-trees and move them into :class:`Function`s.

        Currently, only tagged, elementizable Iteration objects are targeted.

        Each extracted sub-tree is replaced in the main tree by a
        :class:`FunCall` to a newly built ``static`` function; the functions
        are returned under the ``'elemental_functions'`` key.
        """
        noinline = self._compiler_decoration('noinline',
                                             c.Comment('noinline?'))

        functions = OrderedDict()
        processed = []
        for node in state.nodes:
            mapper = {}
            for tree in retrieve_iteration_tree(node, mode='superset'):
                # Search an elementizable sub-tree (if any)
                tagged = filter_iterations(tree, lambda i: i.tag is not None,
                                           'asap')
                if not tagged:
                    continue
                root = tagged[0]
                if not root.is_Elementizable:
                    continue
                # The sub-tree to be outlined, rooted at ``root``
                target = tree[tree.index(root):]

                # Elemental function arguments
                args = []  # Found so far (scalars, tensors)
                maybe_required = set(
                )  # Scalars that *may* have to be passed in
                not_required = set(
                )  # Elemental function locally declared scalars

                # Build a new Iteration/Expression tree with free bounds
                free = []
                for i in target:
                    name, bounds = i.dim.name, i.bounds_symbolic
                    # Iteration bounds become start/finish parameters of the
                    # elemental function
                    start = ScalarFunction(name='%s_start' % name,
                                           dtype=np.int32)
                    finish = ScalarFunction(name='%s_finish' % name,
                                            dtype=np.int32)
                    args.extend(
                        zip([ccode(j) for j in bounds], (start, finish)))
                    # Iteration unbounded indices, likewise passed in as
                    # scalar parameters
                    ufunc = [
                        ScalarFunction(name='%s_ub%d' % (name, j),
                                       dtype=np.int32)
                        for j in range(len(i.uindices))
                    ]
                    args.extend(
                        zip([ccode(j.start) for j in i.uindices], ufunc))
                    limits = [Symbol(start.name), Symbol(finish.name), 1]
                    uindices = [
                        UnboundedIndex(j.index, i.dim + as_symbol(k))
                        for j, k in zip(i.uindices, ufunc)
                    ]
                    # Rebuild the Iteration with the freed (parametric) bounds
                    free.append(
                        i._rebuild(limits=limits,
                                   offsets=None,
                                   uindices=uindices))
                    # Dimension and unbounded indices are declared locally,
                    # hence never passed in as arguments
                    not_required.update({i.dim},
                                        set(j.index for j in i.uindices))

                # Construct elemental function body, and inspect it
                free = NestedTransformer(dict((zip(target, free)))).visit(root)
                expressions = FindNodes(Expression).visit(free)
                fsymbols = FindSymbols('symbolics').visit(free)

                # Retrieve symbolic arguments
                for i in fsymbols:
                    if i.is_TensorFunction:
                        args.append(
                            ("(%s*)%s" % (c.dtype_to_ctype(i.dtype), i.name),
                             i))
                    elif i.is_TensorData:
                        args.append(("%s_vec" % i.name, i))
                    elif i.is_ConstantData:
                        args.append((i.name, i))

                # Retrieve scalar arguments: required = (free symbols that may
                # be needed) minus (symbols declared within the function)
                not_required.update(
                    {i.output
                     for i in expressions if i.is_scalar})
                maybe_required.update(
                    set(FindSymbols(mode='free-symbols').visit(free)))
                for i in fsymbols:
                    not_required.update({as_symbol(i), i.indexify()})
                    for j in i.symbolic_shape:
                        maybe_required.update(j.free_symbols)
                required = filter_sorted(maybe_required - not_required,
                                         key=attrgetter('name'))
                args.extend([(i.name,
                              ScalarFunction(name=i.name, dtype=np.int32))
                             for i in required])

                # ``call``: actual arguments at the call site;
                # ``params``: formal parameters of the elemental function
                call, params = zip(*args)
                handle = flatten([p.rtargs for p in params])
                name = "f_%d" % root.tag

                # Produce the new FunCall
                mapper[root] = List(header=noinline, body=FunCall(name, call))

                # Produce the new Function (setdefault: trees sharing a tag
                # reuse the first function built for that tag)
                functions.setdefault(
                    name, Function(name, free, 'void', handle, ('static', )))

            # Transform the main tree
            processed.append(Transformer(mapper).visit(node))

        return {'nodes': processed, 'elemental_functions': functions.values()}