Ejemplo n.º 1
0
    def _specialize_iet(self, iet, **kwargs):
        warning("The OPS backend is still work-in-progress")

        affine_trees = find_affine_trees(iet).items()

        # If there is no affine trees, then there is no loop to be optimized using OPS.
        if not affine_trees:
            return iet

        ops_init = Call(namespace['ops_init'], [0, 0, 2])
        ops_partition = Call(namespace['ops_partition'], Literal('""'))
        ops_exit = Call(namespace['ops_exit'])

        # Extract all symbols that need to be converted to ops_dat
        dims = []
        to_dat = set()
        for _, tree in affine_trees:
            dims.append(len(tree[0].dimensions))
            symbols = set(FindSymbols('symbolics').visit(tree[0].root))
            symbols -= set(FindSymbols('defines').visit(tree[0].root))
            to_dat |= symbols

        # Create the OPS block for this problem
        ops_block = OpsBlock('block')
        ops_block_init = Expression(
            ClusterizedEq(
                Eq(ops_block,
                   namespace['ops_decl_block'](dims[0], Literal('"block"')))))

        # To ensure deterministic code generation we order the datasets to
        # be generated (since a set is an unordered collection)
        to_dat = filter_sorted(to_dat)

        name_to_ops_dat = {}
        pre_time_loop = []
        after_time_loop = []
        for f in to_dat:
            if f.is_Constant:
                continue

            pre_time_loop.extend(
                list(create_ops_dat(f, name_to_ops_dat, ops_block)))
            # To return the result to Devito, it is necessary to copy the data
            # from the dat object back to the CPU memory.
            after_time_loop.extend(
                create_ops_fetch(f, name_to_ops_dat,
                                 self.time_dimension.extreme_max))

        # Generate ops kernels for each offloadable iteration tree
        mapper = {}
        for n, (_, tree) in enumerate(affine_trees):
            pre_loop, ops_kernel, ops_par_loop_call = opsit(
                tree, n, name_to_ops_dat, ops_block, dims[0])

            pre_time_loop.extend(pre_loop)
            self._func_table[namespace['ops_kernel_file'](ops_kernel.name)] = \
                MetaCall(ops_kernel, False)
            mapper[tree[0].root] = ops_par_loop_call
            mapper.update({i.root: mapper.get(i.root)
                           for i in tree})  # Drop trees

        iet = Transformer(mapper).visit(iet)

        assert (d == dims[0] for d in dims), \
            "The OPS backend currently assumes that all kernels \
            have the same number of dimensions"

        self._headers.append(namespace['ops_define_dimension'](dims[0]))
        self._includes.extend(['stdio.h', 'ops_seq.h'])

        body = [
            ops_init, ops_block_init, *pre_time_loop, ops_partition, iet,
            *after_time_loop, ops_exit
        ]

        return List(body=body)
Ejemplo n.º 2
0
    def __init__(self, expressions, **kwargs):
        expressions = as_tuple(expressions)

        # Input check
        if any(not isinstance(i, sympy.Eq) for i in expressions):
            raise InvalidOperator("Only SymPy expressions are allowed.")

        self.name = kwargs.get("name", "Kernel")
        subs = kwargs.get("subs", {})
        time_axis = kwargs.get("time_axis", Forward)
        dse = kwargs.get("dse", configuration['dse'])
        dle = kwargs.get("dle", configuration['dle'])

        # Header files, etc.
        self._headers = list(self._default_headers)
        self._includes = list(self._default_includes)
        self._globals = list(self._default_globals)

        # Required for compilation
        self._compiler = configuration['compiler']
        self._lib = None
        self._cfunction = None

        # References to local or external routines
        self.func_table = OrderedDict()

        # Expression lowering and analysis
        expressions = [LoweredEq(e, subs=subs) for e in expressions]
        self.dtype = retrieve_dtype(expressions)
        self.input, self.output, self.dimensions = retrieve_symbols(
            expressions)

        # Set the direction of time acoording to the given TimeAxis
        for time in [d for d in self.dimensions if d.is_Time]:
            if not time.is_Stepping:
                time.reverse = time_axis == Backward

        # Parameters of the Operator (Dimensions necessary for data casts)
        parameters = self.input + self.dimensions

        # Group expressions based on their iteration space and data dependences,
        # and apply the Devito Symbolic Engine (DSE) for flop optimization
        clusters = clusterize(expressions)
        clusters = rewrite(clusters, mode=set_dse_mode(dse))

        # Lower Clusters to an Iteration/Expression tree (IET)
        nodes = iet_build(clusters, self.dtype)

        # Introduce C-level profiling infrastructure
        nodes, self.profiler = self._profile_sections(nodes, parameters)

        # Translate into backend-specific representation (e.g., GPU, Yask)
        nodes = self._specialize(nodes, parameters)

        # Apply the Devito Loop Engine (DLE) for loop optimization
        dle_state = transform(nodes, *set_dle_mode(dle))

        # Update the Operator state based on the DLE
        self.dle_arguments = dle_state.arguments
        self.dle_flags = dle_state.flags
        self.func_table.update(
            OrderedDict([(i.name, MetaCall(i, True))
                         for i in dle_state.elemental_functions]))
        parameters.extend([i.argument for i in self.dle_arguments])
        self.dimensions.extend([
            i.argument for i in self.dle_arguments
            if isinstance(i.argument, Dimension)
        ])
        self._includes.extend(list(dle_state.includes))

        # Introduce the required symbol declarations
        nodes = iet_insert_C_decls(dle_state.nodes, self.func_table)

        # Initialise ArgumentEngine
        self.argument_engine = ArgumentEngine(clusters.ispace, parameters,
                                              self.dle_arguments)

        parameters = self.argument_engine.arguments

        # Finish instantiation
        super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())
Ejemplo n.º 3
0
    def _build(cls, expressions, **kwargs):
        expressions = as_tuple(expressions)

        # Input check
        if any(not isinstance(i, Evaluable) for i in expressions):
            raise InvalidOperator("Only `devito.Evaluable` are allowed.")

        # Python-level (i.e., compile time) and C-level (i.e., run time) performance
        profiler = create_profile('timers')

        # Lower input expressions
        expressions = cls._lower_exprs(expressions, **kwargs)

        # Group expressions based on iteration spaces and data dependences
        clusters = cls._lower_clusters(expressions, profiler, **kwargs)

        # Lower Clusters to a ScheduleTree
        stree = cls._lower_stree(clusters, **kwargs)

        # Lower ScheduleTree to an Iteration/Expression Tree
        iet, byproduct = cls._lower_iet(stree, profiler, **kwargs)

        # Make it an actual Operator
        op = Callable.__new__(cls, **iet.args)
        Callable.__init__(op, **op.args)

        # Header files, etc.
        op._headers = list(cls._default_headers)
        op._headers.extend(byproduct.headers)
        op._globals = list(cls._default_globals)
        op._includes = list(cls._default_includes)
        op._includes.extend(profiler._default_includes)
        op._includes.extend(byproduct.includes)

        # Required for the jit-compilation
        op._compiler = kwargs['compiler']
        op._lib = None
        op._cfunction = None

        # References to local or external routines
        op._func_table = OrderedDict()
        op._func_table.update(
            OrderedDict([(i, MetaCall(None, False))
                         for i in profiler._ext_calls]))
        op._func_table.update(
            OrderedDict([(i.root.name, i) for i in byproduct.funcs]))

        # Internal state. May be used to store information about previous runs,
        # autotuning reports, etc
        op._state = cls._initialize_state(**kwargs)

        # Produced by the various compilation passes
        op._input = filter_sorted(
            flatten(e.reads + e.writes for e in expressions))
        op._output = filter_sorted(flatten(e.writes for e in expressions))
        op._dimensions = flatten(c.dimensions
                                 for c in clusters) + byproduct.dimensions
        op._dimensions = sorted(set(op._dimensions), key=attrgetter('name'))
        op._dtype, op._dspace = clusters.meta
        op._profiler = profiler

        return op
Ejemplo n.º 4
0
    def _specialize_iet(self, iet, **kwargs):
        """
        Transform the Iteration/Expression tree to offload the computation of
        one or more loop nests onto YASK. This involves calling the YASK compiler
        to generate YASK code. Such YASK code is then called from within the
        transformed Iteration/Expression tree.
        """
        mapper = {}
        self.yk_solns = OrderedDict()
        for n, (section, trees) in enumerate(find_affine_trees(iet).items()):
            dimensions = tuple(filter_ordered(i.dim.root for i in flatten(trees)))
            context = contexts.fetch(dimensions, self._dtype)

            # A unique name for the 'real' compiler and kernel solutions
            name = namespace['jit-soln'](Signer._digest(configuration,
                                                        *[i.root for i in trees]))

            # Create a YASK compiler solution for this Operator
            yc_soln = context.make_yc_solution(name)

            try:
                # Generate YASK vars and populate `yc_soln` with equations
                local_vars = yaskit(trees, yc_soln)

                # Build the new IET nodes
                yk_soln_obj = YaskSolnObject(namespace['code-soln-name'](n))
                funcall = make_sharedptr_funcall(namespace['code-soln-run'],
                                                 ['time'], yk_soln_obj)
                funcall = Offloaded(funcall, self._dtype)
                mapper[trees[0].root] = funcall
                mapper.update({i.root: mapper.get(i.root) for i in trees})  # Drop trees

                # Mark `funcall` as an external function call
                self._func_table[namespace['code-soln-run']] = MetaCall(None, False)

                # JIT-compile the newly-created YASK kernel
                yk_soln = context.make_yk_solution(name, yc_soln, local_vars)
                self.yk_solns[(dimensions, yk_soln_obj)] = yk_soln

                # Print some useful information about the newly constructed solution
                log("Solution '%s' contains %d var(s) and %d equation(s)." %
                    (yc_soln.get_name(), yc_soln.get_num_vars(),
                     yc_soln.get_num_equations()))
            except NotImplementedError as e:
                log("Unable to offload a candidate tree. Reason: [%s]" % str(e))
        iet = Transformer(mapper).visit(iet)

        if not self.yk_solns:
            log("No offloadable trees found")

        # Some Iteration/Expression trees are not offloaded to YASK and may
        # require further processing to be executed in YASK, due to the differences
        # in storage layout employed by Devito and YASK
        yk_var_objs = {i.name: YaskVarObject(i.name)
                       for i in self._input
                       if i.from_YASK}
        yk_var_objs.update({i: YaskVarObject(i) for i in self._local_vars})
        iet = make_var_accesses(iet, yk_var_objs)

        # Finally optimize all non-yaskized loops
        iet = super(OperatorYASK, self)._specialize_iet(iet, **kwargs)

        return iet
Ejemplo n.º 5
0
    def _build(cls, expressions, **kwargs):
        expressions = as_tuple(expressions)

        # Input check
        if any(not isinstance(i, Eq) for i in expressions):
            raise InvalidOperator("Only `devito.Eq` expressions are allowed.")

        name = kwargs.get("name", "Kernel")
        dse = kwargs.get("dse", configuration['dse'])

        # Python-level (i.e., compile time) and C-level (i.e., run time) performance
        profiler = create_profile('timers')

        # Lower input expressions to internal expressions (e.g., attaching metadata)
        expressions = cls._lower_exprs(expressions, **kwargs)

        # Group expressions based on their iteration space and data dependences
        # Several optimizations are applied (fusion, lifting, flop reduction via DSE, ...)
        clusters = clusterize(expressions, dse_mode=set_dse_mode(dse))

        # Lower Clusters to a Schedule tree
        stree = st_build(clusters)

        # Lower Schedule tree to an Iteration/Expression tree (IET)
        iet = iet_build(stree)

        # Instrument the IET for C-level profiling
        iet = profiler.instrument(iet)

        # Wrap the IET with a Callable
        parameters = derive_parameters(iet, True)
        op = Callable(name, iet, 'int', parameters, ())

        # Lower IET to a Target-specific IET
        op, target_state = cls._specialize_iet(op, **kwargs)

        # Make it an actual Operator
        op = Callable.__new__(cls, **op.args)
        Callable.__init__(op, **op.args)

        # Header files, etc.
        op._headers = list(cls._default_headers)
        op._headers.extend(target_state.headers)
        op._globals = list(cls._default_globals)
        op._includes = list(cls._default_includes)
        op._includes.extend(profiler._default_includes)
        op._includes.extend(target_state.includes)

        # Required for the jit-compilation
        op._compiler = configuration['compiler']
        op._lib = None
        op._cfunction = None

        # References to local or external routines
        op._func_table = OrderedDict()
        op._func_table.update(
            OrderedDict([(i, MetaCall(None, False))
                         for i in profiler._ext_calls]))
        op._func_table.update(
            OrderedDict([(i.root.name, i) for i in target_state.funcs]))

        # Internal state. May be used to store information about previous runs,
        # autotuning reports, etc
        op._state = cls._initialize_state(**kwargs)

        # Produced by the various compilation passes
        op._input = filter_sorted(
            flatten(e.reads + e.writes for e in expressions))
        op._output = filter_sorted(flatten(e.writes for e in expressions))
        op._dimensions = filter_sorted(
            flatten(e.dimensions for e in expressions))
        op._dimensions.extend(target_state.dimensions)
        op._dtype, op._dspace = clusters.meta
        op._profiler = profiler

        return op
Ejemplo n.º 6
0
 def funcs(self):
     retval = [
         MetaCall(v, True) for k, v in self.efuncs.items() if k != 'root'
     ]
     retval.extend([MetaCall(i, False) for i in self.ffuncs])
     return tuple(retval)
Ejemplo n.º 7
0
    def __init__(self, expressions, **kwargs):
        expressions = as_tuple(expressions)

        # Input check
        if any(not isinstance(i, sympy.Eq) for i in expressions):
            raise InvalidOperator("Only SymPy expressions are allowed.")

        self.name = kwargs.get("name", "Kernel")
        subs = kwargs.get("subs", {})
        dse = kwargs.get("dse", configuration['dse'])
        dle = kwargs.get("dle", configuration['dle'])

        # Header files, etc.
        self._headers = list(self._default_headers)
        self._includes = list(self._default_includes)
        self._globals = list(self._default_globals)

        # Required for compilation
        self._compiler = configuration['compiler']
        self._lib = None
        self._cfunction = None

        # References to local or external routines
        self.func_table = OrderedDict()

        # Expression lowering: indexification, substitution rules, specialization
        expressions = [indexify(i) for i in expressions]
        expressions = [i.xreplace(subs) for i in expressions]
        expressions = self._specialize_exprs(expressions)

        # Expression analysis
        self.input = filter_sorted(flatten(e.reads for e in expressions))
        self.output = filter_sorted(flatten(e.writes for e in expressions))
        self.dimensions = filter_sorted(flatten(e.dimensions for e in expressions))

        # Group expressions based on their iteration space and data dependences,
        # and apply the Devito Symbolic Engine (DSE) for flop optimization
        clusters = clusterize(expressions)
        clusters = rewrite(clusters, mode=set_dse_mode(dse))
        self._dtype, self._dspace = clusters.meta

        # Lower Clusters to an Iteration/Expression tree (IET)
        nodes = iet_build(clusters)

        # Introduce C-level profiling infrastructure
        nodes, self.profiler = self._profile_sections(nodes)

        # Translate into backend-specific representation (e.g., GPU, Yask)
        nodes = self._specialize_iet(nodes)

        # Apply the Devito Loop Engine (DLE) for loop optimization
        dle_state = transform(nodes, *set_dle_mode(dle))

        # Update the Operator state based on the DLE
        self.dle_args = dle_state.arguments
        self.dle_flags = dle_state.flags
        self.func_table.update(OrderedDict([(i.name, MetaCall(i, True))
                                            for i in dle_state.elemental_functions]))
        self.dimensions.extend([i.argument for i in self.dle_args
                                if isinstance(i.argument, Dimension)])
        self._includes.extend(list(dle_state.includes))

        # Introduce the required symbol declarations
        nodes = iet_insert_C_decls(dle_state.nodes, self.func_table)

        # Insert data and pointer casts for array parameters and profiling structs
        nodes = self._build_casts(nodes)

        # Derive parameters as symbols not defined in the kernel itself
        parameters = self._build_parameters(nodes)

        # Finish instantiation
        super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())
Ejemplo n.º 8
0
    def _specialize_iet(self, iet, **kwargs):
        """
        Transform the Iteration/Expression tree to offload the computation of
        one or more loop nests onto YASK. This involves calling the YASK compiler
        to generate YASK code. Such YASK code is then called from within the
        transformed Iteration/Expression tree.
        """
        offloadable = find_offloadable_trees(iet)

        if len(offloadable.trees) == 0:
            self.yk_soln = YaskNullKernel()

            log("No offloadable trees found")
        else:
            context = contexts.fetch(offloadable.grid, offloadable.dtype)

            # A unique name for the 'real' compiler and kernel solutions
            name = namespace['jit-soln'](Signer._digest(iet, configuration))

            # Create a YASK compiler solution for this Operator
            yc_soln = context.make_yc_solution(name)

            try:
                trees = offloadable.trees

                # Generate YASK grids and populate `yc_soln` with equations
                mapper = yaskizer(trees, yc_soln)
                local_grids = [i for i in mapper if i.is_Array]

                # Transform the IET
                funcall = make_sharedptr_funcall(namespace['code-soln-run'],
                                                 ['time'],
                                                 namespace['code-soln-name'])
                funcall = Element(c.Statement(ccode(funcall)))
                mapper = {trees[0].root: funcall}
                mapper.update({i.root: mapper.get(i.root)
                               for i in trees})  # Drop trees
                iet = Transformer(mapper).visit(iet)

                # Mark `funcall` as an external function call
                self.func_table[namespace['code-soln-run']] = MetaCall(
                    None, False)

                # JIT-compile the newly-created YASK kernel
                self.yk_soln = context.make_yk_solution(
                    name, yc_soln, local_grids)

                # Print some useful information about the newly constructed solution
                log("Solution '%s' contains %d grid(s) and %d equation(s)." %
                    (yc_soln.get_name(), yc_soln.get_num_grids(),
                     yc_soln.get_num_equations()))
            except NotImplementedError as e:
                self.yk_soln = YaskNullKernel()

                log("Unable to offload a candidate tree. Reason: [%s]" %
                    str(e))

        # Some Iteration/Expression trees are not offloaded to YASK and may
        # require further processing to be executed in YASK, due to the differences
        # in storage layout employed by Devito and YASK
        iet = make_grid_accesses(iet)

        # Finally optimize all non-yaskized loops
        iet = super(Operator, self)._specialize_iet(iet, **kwargs)

        return iet
Ejemplo n.º 9
0
def iet_insert_C_decls(iet, func_table=None):
    """
    Given an Iteration/Expression tree ``iet``, build a new tree with the
    necessary symbol declarations. Declarations are placed as close as
    possible to the first symbol use.

    :param iet: The input Iteration/Expression tree.
    :param func_table: (Optional) a mapper from callable names within ``iet``
                       to :class:`Callable`s.
    """
    func_table = func_table or {}
    allocator = Allocator()
    mapper = OrderedDict()

    # Detect all IET nodes accessing symbols that need to be declared
    scopes = []
    me = MapExpressions()
    for k, v in me.visit(iet).items():
        if k.is_Call:
            func = func_table.get(k.name)
            if func is not None and func.local:
                scopes.extend(me.visit(func.root, queue=list(v)).items())
        scopes.append((k, v))

    # Classify, and then schedule declarations to stack/heap
    for k, v in scopes:
        if k.is_Expression:
            if k.is_scalar:
                # Inline declaration
                mapper[k] = LocalExpression(**k.args)
                continue
            objs = [k.write]
        elif k.is_Call:
            objs = k.params
        else:
            raise NotImplementedError("Cannot schedule declarations for IET "
                                      "node of type `%s`" % type(k))
        for i in objs:
            try:
                if i.is_LocalObject:
                    # On the stack
                    site = v[-1] if v else iet
                    allocator.push_stack(site, i)
                elif i.is_Array:
                    if i._mem_external:
                        # Nothing to do; e.g., a user-provided Function
                        continue
                    elif i._mem_stack:
                        # On the stack
                        key = lambda i: not i.is_Parallel
                        site = filter_iterations(v, key=key,
                                                 stop='asap') or [iet]
                        allocator.push_stack(site[-1], i)
                    else:
                        # On the heap, as a tensor that must be globally accessible
                        allocator.push_heap(i)
            except AttributeError:
                # E.g., a generic SymPy expression
                pass

    # Introduce declarations on the stack
    for k, v in allocator.onstack:
        mapper[k] = tuple(Element(i) for i in v)
    iet = Transformer(mapper, nested=True).visit(iet)
    for k, v in list(func_table.items()):
        if v.local:
            func_table[k] = MetaCall(
                Transformer(mapper).visit(v.root), v.local)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet
Ejemplo n.º 10
0
def iet_insert_C_decls(iet, func_table=None):
    """
    Given an Iteration/Expression tree ``iet``, build a new tree with the
    necessary symbol declarations. Declarations are placed as close as
    possible to the first symbol use.

    :param iet: The input Iteration/Expression tree.
    :param func_table: (Optional) a mapper from callable names within ``iet``
                       to :class:`Callable`s.
    """
    func_table = func_table or {}
    allocator = Allocator()
    mapper = OrderedDict()

    # First, schedule declarations for Expressions
    scopes = []
    me = MapExpressions()
    for k, v in me.visit(iet).items():
        if k.is_Call:
            func = func_table.get(k.name)
            if func is not None and func.local:
                scopes.extend(me.visit(func.root, queue=list(v)).items())
        else:
            scopes.append((k, v))
    for k, v in scopes:
        if k.is_scalar:
            # Inline declaration
            mapper[k] = LocalExpression(**k.args)
        elif k.write is None or k.write._mem_external:
            # Nothing to do, e.g., variable passed as kernel argument
            continue
        elif k.write._mem_stack:
            # On the stack
            key = lambda i: not i.is_Parallel
            site = filter_iterations(v, key=key, stop='asap') or [iet]
            allocator.push_stack(site[-1], k.write)
        else:
            # On the heap, as a tensor that must be globally accessible
            allocator.push_heap(k.write)

    # Then, schedule declarations callables arguments passed by reference/pointer
    # (as modified internally by the callable)
    scopes = [(k, v) for k, v in me.visit(iet).items() if k.is_Call]
    for k, v in scopes:
        site = v[-1] if v else iet
        for i in k.params:
            try:
                if i.is_LocalObject:
                    # On the stack
                    allocator.push_stack(site, i)
                elif i.is_Array:
                    if i._mem_stack:
                        # On the stack
                        allocator.push_stack(site, i)
                    elif i._mem_heap:
                        # On the heap
                        allocator.push_heap(i)
            except AttributeError:
                # E.g., a generic SymPy expression
                pass

    # Introduce declarations on the stack
    for k, v in allocator.onstack:
        mapper[k] = tuple(Element(i) for i in v)
    iet = NestedTransformer(mapper).visit(iet)
    for k, v in list(func_table.items()):
        if v.local:
            func_table[k] = MetaCall(Transformer(mapper).visit(v.root), v.local)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet