Esempio n. 1
0
    def _create_efuncs(self, nodes, state):
        """
        Extract Iteration sub-trees and turn them into Calls+Callables.

        Currently, only tagged, elementizable Iteration objects are targeted.
        """
        noinline = self._compiler_decoration('noinline',
                                             c.Comment('noinline?'))

        efuncs = OrderedDict()
        mapper = {}
        for tree in retrieve_iteration_tree(nodes, mode='superset'):
            # Search an elementizable sub-tree (if any)
            tagged = filter_iterations(tree, lambda i: i.tag is not None,
                                       'asap')
            if not tagged:
                continue
            root = tagged[0]
            if not root.is_Elementizable:
                continue
            target = tree[tree.index(root):]

            # Build a new Iteration/Expression tree with free bounds
            free = []
            defined_args = {}  # Map of argument values defined by loop bounds
            for i in target:
                name, bounds = i.dim.name, i.symbolic_bounds
                # Iteration bounds
                _min = Scalar(name='%sf_m' % name,
                              dtype=np.int32,
                              is_const=True)
                _max = Scalar(name='%sf_M' % name,
                              dtype=np.int32,
                              is_const=True)
                defined_args[_min.name] = bounds[0]
                defined_args[_max.name] = bounds[1]

                # Iteration unbounded indices
                ufunc = [
                    Scalar(name='%s_ub%d' % (name, j), dtype=np.int32)
                    for j in range(len(i.uindices))
                ]
                defined_args.update({
                    uf.name: j.symbolic_min
                    for uf, j in zip(ufunc, i.uindices)
                })
                uindices = [
                    IncrDimension(j.parent, i.dim + as_symbol(k), 1, j.name)
                    for j, k in zip(i.uindices, ufunc)
                ]
                free.append(
                    i._rebuild(limits=(_min, _max, 1),
                               offsets=None,
                               uindices=uindices))

            # Construct elemental function body
            free = Transformer(dict((zip(target, free))),
                               nested=True).visit(root)
            items = FindSymbols().visit(free)

            # Insert array casts
            casts = [ArrayCast(i) for i in items if i.is_Tensor]
            free = List(body=casts + [free])

            # Insert declarations
            external = [i for i in items if i.is_Array]
            free = iet_insert_C_decls(free, external)

            # Create the Callable
            name = "f_%d" % root.tag
            params = derive_parameters(free)
            efuncs.setdefault(name,
                              Callable(name, free, 'void', params, 'static'))

            # Create the Call
            args = [defined_args.get(i.name, i) for i in params]
            mapper[root] = List(header=noinline, body=Call(name, args))

        # Transform the main tree
        processed = Transformer(mapper).visit(nodes)

        return processed, {'efuncs': efuncs.values()}
Esempio n. 2
0
    def _optimize_halospots(self, iet):
        """
        Optimize the HaloSpots in ``iet``.

        * Remove all USELESS HaloSpots;
        * Merge all hoistable HaloSpots with their root HaloSpot, thus
          removing redundant communications and anticipating communications
          that will be required by later Iterations.
        """
        # Drop USELESS HaloSpots
        mapper = {hs: hs.body for hs in FindNodes(HaloSpot).visit(iet) if hs.is_Useless}
        iet = Transformer(mapper, nested=True).visit(iet)

        # Handle `hoistable` HaloSpots
        mapper = {}
        for halo_spots in MapNodes(Iteration, HaloSpot).visit(iet).values():
            root = halo_spots[0]
            halo_schemes = [hs.halo_scheme.project(hs.hoistable) for hs in halo_spots[1:]]
            mapper[root] = root._rebuild(halo_scheme=root.halo_scheme.union(halo_schemes))
            mapper.update({hs: hs._rebuild(halo_scheme=hs.halo_scheme.drop(hs.hoistable))
                           for hs in halo_spots[1:]})
        iet = Transformer(mapper, nested=True).visit(iet)

        # At this point, some HaloSpots may have become empty (i.e., requiring
        # no communications), hence they can be removed
        #
        # <HaloSpot(u,v)>           HaloSpot(u,v)
        #   <A>                       <A>
        # <HaloSpot()>      ---->   <B>
        #   <B>
        mapper = {i: i.body for i in FindNodes(HaloSpot).visit(iet) if i.is_empty}
        iet = Transformer(mapper, nested=True).visit(iet)

        # Finally, we try to move HaloSpot-free Iteration nests within HaloSpot
        # subtrees, to overlap as much computation as possible. The HaloSpot-free
        # Iteration nests must be fully affine, otherwise we wouldn't be able to
        # honour the data dependences along the halo
        #
        # <HaloSpot(u,v)>            HaloSpot(u,v)
        #   <A>             ---->      <A>
        # <B>              affine?     <B>
        #
        # Here, <B> doesn't require any halo exchange, but it might still need the
        # output of <A>; thus, if we do computation/communication overlap over <A>
        # *and* want to embed <B> within the HaloSpot, then <B>'s iteration space
        # will have to be split as well. For this, <B> must be affine.
        mapper = {}
        for v in FindAdjacent((HaloSpot, Iteration)).visit(iet).values():
            for g in v:
                root = None
                for i in g:
                    if i.is_HaloSpot:
                        root = i
                        mapper[root] = [root.body]
                    elif root and all(j.is_Affine for j in FindNodes(Iteration).visit(i)):
                        mapper[root].append(i)
                        mapper[i] = None
                    else:
                        root = None
        mapper = {k: k._rebuild(body=List(body=v)) if v else v for k, v in mapper.items()}
        iet = Transformer(mapper).visit(iet)

        return iet, {}
Esempio n. 3
0
 def _avoid_denormals(self, iet):
     header = [cgen.Comment('Flush denormal numbers to zero in hardware'),
               cgen.Statement('_MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON)'),
               cgen.Statement('_MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON)')]
     iet = List(header=header, body=iet)
     return iet, {'includes': ('xmmintrin.h', 'pmmintrin.h')}
Esempio n. 4
0
    def _loop_blocking(self, iet):
        """
        Apply loop blocking to PARALLEL Iteration trees.
        """
        blockinner = bool(self.params.get('blockinner'))
        blockalways = bool(self.params.get('blockalways'))

        # Make sure loop blocking will span as many Iterations as possible
        iet = fold_blockable_tree(iet, blockinner)

        mapper = {}
        efuncs = []
        block_dims = []
        for tree in retrieve_iteration_tree(iet):
            # Is the Iteration tree blockable ?
            iterations = filter_iterations(tree, lambda i: i.is_Parallel)
            if not blockinner:
                iterations = iterations[:-1]
            if len(iterations) <= 1:
                continue
            root = iterations[0]
            if not (tree.root.is_Sequential
                    or iet.is_Callable) and not blockalways:
                # Heuristic: avoid polluting the generated code with blocked
                # nests (thus increasing JIT compilation time and affecting
                # readability) if the blockable tree isn't embedded in a
                # sequential loop (e.g., a timestepping loop)
                continue

            # Apply loop blocking to `tree`
            interb = []
            intrab = []
            for i in iterations:
                d = BlockDimension(i.dim,
                                   name="%s%d_blk" % (i.dim.name, len(mapper)))
                block_dims.append(d)
                # Build Iteration over blocks
                interb.append(
                    Iteration([], d, d.symbolic_max, properties=PARALLEL))
                # Build Iteration within a block
                intrab.append(
                    i._rebuild([],
                               limits=(d, d + d.step - 1, 1),
                               offsets=(0, 0)))

            # Construct the blocked tree
            blocked = compose_nodes(interb + intrab + [iterations[-1].nodes])
            blocked = unfold_blocked_tree(blocked)

            # Promote to a separate Callable
            dynamic_parameters = flatten(
                (bi.dim, bi.dim.symbolic_size) for bi in interb)
            efunc = make_efunc("bf%d" % len(mapper), blocked,
                               dynamic_parameters)
            efuncs.append(efunc)

            # Compute the iteration ranges
            ranges = []
            for i, bi in zip(iterations, interb):
                maxb = i.symbolic_max - (i.symbolic_size % bi.dim.step)
                ranges.append(
                    ((i.symbolic_min, maxb, bi.dim.step),
                     (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

            # Build Calls to the `efunc`
            body = []
            for p in product(*ranges):
                dynamic_args_mapper = {}
                for bi, (m, M, b) in zip(interb, p):
                    dynamic_args_mapper[bi.dim] = (m, M)
                    dynamic_args_mapper[bi.dim.step] = (b, )
                call = efunc.make_call(dynamic_args_mapper)
                body.append(List(body=call))

            mapper[root] = List(body=body)

        iet = Transformer(mapper).visit(iet)

        return iet, {
            'dimensions': block_dims,
            'efuncs': efuncs,
            'args': [i.step for i in block_dims]
        }
Esempio n. 5
0
    def make_blocking(self, iet):
        """
        Apply loop blocking to PARALLEL Iteration trees.
        """
        # Make sure loop blocking will span as many Iterations as possible
        iet = fold_blockable_tree(iet, self.blockinner)

        mapper = {}
        efuncs = []
        block_dims = []
        for tree in retrieve_iteration_tree(iet):
            # Is the Iteration tree blockable ?
            iterations = filter_iterations(
                tree, lambda i: i.is_Parallel and i.is_Affine)
            if not self.blockinner:
                iterations = iterations[:-1]
            if len(iterations) <= 1:
                continue
            root = iterations[0]
            if not self.blockalways:
                # Heuristically bypass loop blocking if we think `tree`
                # won't be computationally expensive. This will help with code
                # size/readbility, JIT time, and auto-tuning time
                if not (tree.root.is_Sequential or iet.is_Callable):
                    # E.g., not inside a time-stepping Iteration
                    continue
                if any(i.dim.is_Sub and i.dim.local for i in tree):
                    # At least an outer Iteration is over a local SubDimension,
                    # which suggests the computational cost of this Iteration
                    # nest will be negligible w.r.t. the "core" Iteration nest
                    # (making use of non-local (Sub)Dimensions only)
                    continue
            if not IsPerfectIteration().visit(root):
                # Don't know how to block non-perfect nests
                continue

            # Apply hierarchical loop blocking to `tree`
            level_0 = []  # Outermost level of blocking
            level_i = [[] for i in range(1, self.nlevels)
                       ]  # Inner levels of blocking
            intra = []  # Within the smallest block
            for i in iterations:
                template = "%s%d_blk%s" % (i.dim.name, self.nblocked, '%d')
                properties = (PARALLEL, ) + ((AFFINE, ) if i.is_Affine else ())

                # Build Iteration across `level_0` blocks
                d = BlockDimension(i.dim, name=template % 0)
                level_0.append(
                    Iteration([], d, d.symbolic_max, properties=properties))

                # Build Iteration across all `level_i` blocks, `i` in (1, self.nlevels]
                for n, li in enumerate(level_i, 1):
                    di = BlockDimension(d, name=template % n)
                    li.append(
                        Iteration([],
                                  di,
                                  limits=(d, d + d.step - 1, di.step),
                                  properties=properties))
                    d = di

                # Build Iteration within the smallest block
                intra.append(
                    i._rebuild([],
                               limits=(d, d + d.step - 1, 1),
                               offsets=(0, 0)))
            level_i = flatten(level_i)

            # Track all constructed BlockDimensions
            block_dims.extend(i.dim for i in level_0 + level_i)

            # Construct the blocked tree
            blocked = compose_nodes(level_0 + level_i + intra +
                                    [iterations[-1].nodes])
            blocked = unfold_blocked_tree(blocked)

            # Promote to a separate Callable
            dynamic_parameters = flatten((l0.dim, l0.step) for l0 in level_0)
            dynamic_parameters.extend([li.step for li in level_i])
            efunc = make_efunc("bf%d" % self.nblocked, blocked,
                               dynamic_parameters)
            efuncs.append(efunc)

            # Compute the iteration ranges
            ranges = []
            for i, l0 in zip(iterations, level_0):
                maxb = i.symbolic_max - (i.symbolic_size % l0.step)
                ranges.append(
                    ((i.symbolic_min, maxb, l0.step),
                     (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

            # Build Calls to the `efunc`
            body = []
            for p in product(*ranges):
                dynamic_args_mapper = {}
                for l0, (m, M, b) in zip(level_0, p):
                    dynamic_args_mapper[l0.dim] = (m, M)
                    dynamic_args_mapper[l0.step] = (b, )
                    for li in level_i:
                        if li.dim.root is l0.dim.root:
                            value = li.step if b is l0.step else b
                            dynamic_args_mapper[li.step] = (value, )
                call = efunc.make_call(dynamic_args_mapper)
                body.append(List(body=call))

            mapper[root] = List(body=body)

            # Next blockable nest, use different (unique) variable/function names
            self.nblocked += 1

        iet = Transformer(mapper).visit(iet)

        # Force-unfold if some folded Iterations haven't been blocked in the end
        iet = unfold_blocked_tree(iet)

        return iet, {
            'dimensions': block_dims,
            'efuncs': efuncs,
            'args': [i.step for i in block_dims]
        }
Esempio n. 6
0
    def _create_elemental_functions(self, nodes, state):
        """
        Extract :class:`Iteration` sub-trees and move them into :class:`Callable`s.

        Currently, only tagged, elementizable Iteration objects are targeted.
        """
        noinline = self._compiler_decoration('noinline',
                                             c.Comment('noinline?'))

        functions = OrderedDict()
        mapper = {}
        for tree in retrieve_iteration_tree(nodes, mode='superset'):
            # Search an elementizable sub-tree (if any)
            tagged = filter_iterations(tree, lambda i: i.tag is not None,
                                       'asap')
            if not tagged:
                continue
            root = tagged[0]
            if not root.is_Elementizable:
                continue
            target = tree[tree.index(root):]

            # Elemental function arguments
            args = []  # Found so far (scalars, tensors)
            defined_args = {}  # Map of argument values defined by loop bounds

            # Build a new Iteration/Expression tree with free bounds
            free = []
            for i in target:
                name, bounds = i.dim.name, i.bounds_symbolic
                # Iteration bounds
                start = Scalar(name='%s_start' % name, dtype=np.int32)
                finish = Scalar(name='%s_finish' % name, dtype=np.int32)
                defined_args[start.name] = bounds[0]
                defined_args[finish.name] = bounds[1]

                # Iteration unbounded indices
                ufunc = [
                    Scalar(name='%s_ub%d' % (name, j), dtype=np.int32)
                    for j in range(len(i.uindices))
                ]
                defined_args.update(
                    {uf.name: j.start
                     for uf, j in zip(ufunc, i.uindices)})
                limits = [
                    Scalar(name=start.name, dtype=np.int32),
                    Scalar(name=finish.name, dtype=np.int32), 1
                ]
                uindices = [
                    UnboundedIndex(j.index, i.dim + as_symbol(k))
                    for j, k in zip(i.uindices, ufunc)
                ]
                free.append(
                    i._rebuild(limits=limits, offsets=None, uindices=uindices))

            # Construct elemental function body, and inspect it
            free = NestedTransformer(dict((zip(target, free)))).visit(root)

            # Insert array casts for all non-defined
            f_symbols = FindSymbols('symbolics').visit(free)
            defines = [s.name for s in FindSymbols('defines').visit(free)]
            casts = [
                ArrayCast(f) for f in f_symbols
                if f.is_Tensor and f.name not in defines
            ]
            free = (List(body=casts), free)

            for i in derive_parameters(free):
                if i.name in defined_args:
                    args.append((defined_args[i.name], i))
                elif i.is_Dimension:
                    d = Scalar(name=i.name, dtype=i.dtype)
                    args.append((d, d))
                else:
                    args.append((i, i))

            call, params = zip(*args)
            name = "f_%d" % root.tag

            # Produce the new Call
            mapper[root] = List(header=noinline, body=Call(name, call))

            # Produce the new Callable
            functions.setdefault(
                name,
                Callable(name, free, 'void', flatten(params), ('static', )))

        # Transform the main tree
        processed = Transformer(mapper).visit(nodes)

        return processed, {'elemental_functions': functions.values()}
Esempio n. 7
0
def optimize_halospots(iet):
    """
    Optimize the HaloSpots in ``iet``.

    * Remove all ``useless`` HaloSpots;
    * Merge all ``hoistable`` HaloSpots with their root HaloSpot, thus
      removing redundant communications and anticipating communications
      that will be required by later Iterations.
    """
    # Drop `useless` HaloSpots
    mapper = {
        hs: hs._rebuild(halo_scheme=hs.halo_scheme.drop(hs.useless))
        for hs in FindNodes(HaloSpot).visit(iet)
    }
    iet = Transformer(mapper, nested=True).visit(iet)

    # Handle `hoistable` HaloSpots
    # First, we merge `hoistable` HaloSpots together, to anticipate communications
    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        halo_spots = FindNodes(HaloSpot).visit(tree.root)
        if not halo_spots:
            continue
        root = halo_spots[0]
        if root in mapper:
            continue
        hss = [root.halo_scheme]
        hss.extend(
            [hs.halo_scheme.project(hs.hoistable) for hs in halo_spots[1:]])
        try:
            mapper[root] = root._rebuild(halo_scheme=HaloScheme.union(hss))
        except ValueError:
            # HaloSpots have non-matching `loc_indices` and therefore can't be merged
            perf_adv("Found hoistable HaloSpots with disjoint loc_indices, "
                     "skipping optimization")
            continue
        for hs in halo_spots[1:]:
            halo_scheme = hs.halo_scheme.drop(hs.hoistable)
            if halo_scheme.is_void:
                mapper[hs] = hs.body
            else:
                mapper[hs] = hs._rebuild(halo_scheme=halo_scheme)
    iet = Transformer(mapper, nested=True).visit(iet)

    # Then, we make sure the halo exchanges get performed *before*
    # the first distributed Dimension. Again, we do this to anticipate
    # communications, which hopefully has a pay off in performance
    #
    # <Iteration x>                    <HaloSpot(u)>, in y
    #   <HaloSpot(u)>, in y    ---->   <Iteration x>
    #   <Iteration y>                    <Iteration y>
    mapper = {}
    for i, halo_spots in MapNodes(Iteration, HaloSpot).visit(iet).items():
        hoistable = [hs for hs in halo_spots if hs.hoistable]
        if not hoistable:
            continue
        elif len(hoistable) > 1:
            # We should never end up here, but for now we can't prove it formally
            perf_adv(
                "Found multiple hoistable HaloSpots, skipping optimization")
            continue
        hs = hoistable.pop()
        if hs in mapper:
            continue
        if i.dim.root in hs.dimensions:
            halo_scheme = hs.halo_scheme.drop(hs.hoistable)
            if halo_scheme.is_void:
                mapper[hs] = hs.body
            else:
                mapper[hs] = hs._rebuild(halo_scheme=halo_scheme)

            halo_scheme = hs.halo_scheme.project(hs.hoistable)
            mapper[i] = hs._rebuild(halo_scheme=halo_scheme, body=i._rebuild())
    iet = Transformer(mapper, nested=True).visit(iet)

    # Finally, we try to move HaloSpot-free Iteration nests within HaloSpot
    # subtrees, to overlap as much computation as possible. The HaloSpot-free
    # Iteration nests must be fully affine, otherwise we wouldn't be able to
    # honour the data dependences along the halo
    #
    # <HaloSpot(u,v)>            HaloSpot(u,v)
    #   <A>             ---->      <A>
    # <B>              affine?     <B>
    #
    # Here, <B> doesn't require any halo exchange, but it might still need the
    # output of <A>; thus, if we do computation/communication overlap over <A>
    # *and* want to embed <B> within the HaloSpot, then <B>'s iteration space
    # will have to be split as well. For this, <B> must be affine.
    mapper = {}
    for v in FindAdjacent((HaloSpot, Iteration)).visit(iet).values():
        for g in v:
            root = None
            for i in g:
                if i.is_HaloSpot:
                    root = i
                    mapper[root] = [root.body]
                elif root and all(j.is_Affine
                                  for j in FindNodes(Iteration).visit(i)):
                    mapper[root].append(i)
                    mapper[i] = None
                else:
                    root = None
    mapper = {
        k: k._rebuild(body=List(body=v)) if v else v
        for k, v in mapper.items()
    }
    iet = Transformer(mapper).visit(iet)

    return iet, {}
Esempio n. 8
0
    def _loop_fission(self, nodes, state):
        """
        Apply loop fission to innermost :class:`Iteration` objects. This pass
        is not applied if the number of statements in an Iteration's body is
        lower than ``self.thresholds['fission'].``
        """

        mapper = {}
        for tree in retrieve_iteration_tree(nodes):
            if len(tree) <= 1:
                # Heuristically avoided
                continue

            candidate = tree[-1]
            expressions = [e for e in candidate.nodes if e.is_Expression]

            if len(expressions) < self.thresholds['max_fission']:
                # Heuristically avoided
                continue
            if len(expressions) != len(candidate.nodes):
                # Dangerous for correctness
                continue

            functions = list(
                set.union(*[set(e.functions) for e in expressions]))
            wrapped = [e.expr for e in expressions]

            if not functions or not wrapped:
                # Heuristically avoided
                continue

            # Promote temporaries from scalar to tensors
            handle = functions[0]
            dim = handle.indices[-1]
            size = handle.shape[-1]
            if any(dim != i.indices[-1] for i in functions):
                # Dangerous for correctness
                continue

            wrapped = promote_scalar_expressions(wrapped, (size, ), (dim, ),
                                                 True)

            assert len(wrapped) == len(expressions)
            rebuilt = [
                Expression(s, e.dtype) for s, e in zip(wrapped, expressions)
            ]

            # Group statements
            # TODO: Need a heuristic here to maximize reuse
            args_frozen = candidate.args_frozen
            properties = as_tuple(args_frozen['properties']) + (ELEMENTAL, )
            args_frozen['properties'] = properties
            n = self.thresholds['min_fission']
            fissioned = [
                Iteration(g, **args_frozen) for g in grouper(rebuilt, n)
            ]

            mapper[candidate] = List(body=fissioned)

        processed = Transformer(mapper).visit(nodes)

        return processed, {}
Esempio n. 9
0
    def make(self, hs, key):
        """
        Construct Callables and Calls implementing distributed-memory halo
        exchange for the HaloSpot ``hs``.

        At least three Callables are constructed:

            * ``update_halo``, to be called to trigger the halo exchange,
            * ``sendrecv``, called from within ``update_halo``.
            * ``copy``, called from within ``sendrecv``, to implement, for example,
              data gathering prior to an MPI_Send, and data scattering following
              an MPI recv.

        Additional Callables may be constructed if the halo exchange is asynchronous
        (which depends on the specific HaloExchangeBuilder implementation).
        """
        # Sanity check
        assert all(f.is_Function and f.grid is not None for f in hs.fmapper)

        # Callable for compute over the CORE region
        compute = self._make_compute(hs, key)
        if compute is not None:
            self._efuncs[compute] = [None]

        # Callables for send/recv/wait
        for f, hse in hs.fmapper.items():
            msg = self._make_msg(f, hse, key='%d_%d' % (key, len(self.msgs)))
            msg = self._msgs.setdefault((f, hse), msg)
            if (f.ndim, hse) not in self._cache:
                df = f.__class__.__base__(name='a',
                                          grid=f.grid,
                                          shape=f.shape_global,
                                          dimensions=f.dimensions)

                haloupdate = self._make_haloupdate(df, hse, key, msg=msg)
                sendrecv = self._make_sendrecv(df, hse, key, msg=msg)
                gather = self._make_copy(df, hse, key)

                self._efuncs[haloupdate] = None
                self._efuncs[sendrecv] = [haloupdate.name]
                self._efuncs[gather] = [sendrecv.name]

                halowait = self._make_halowait(df, hse, key, msg=msg)
                wait = self._make_wait(df, hse, key, msg=msg)
                scatter = self._make_copy(df, hse, key, swap=True)

                if halowait is None:
                    assert wait is None
                    self._efuncs[scatter] = [sendrecv.name]
                else:
                    self._efuncs[halowait] = None
                    self._efuncs[wait] = [halowait.name]
                    self._efuncs[scatter] = [wait.name]

                self._cache[(f.ndim, hse)] = (haloupdate, halowait)

        # Callable for compute over the OWNED region
        callcompute = self._call_compute(hs, compute)
        remainder = self._make_remainder(callcompute, hs, key)
        if remainder is not None:
            self._efuncs[remainder] = None
            self._efuncs.setdefault(callcompute, []).append(remainder.name)

        # Now build up the HaloSpot body, with explicit Calls to the constructed Callables
        body = [callcompute]
        for f, hse in hs.fmapper.items():
            msg = self._msgs[(f, hse)]
            haloupdate, halowait = self._cache[(f.ndim, hse)]
            body.insert(0, self._call_haloupdate(haloupdate.name, f, hse, msg))
            if halowait is not None:
                body.append(self._call_halowait(halowait.name, f, hse, msg))
        if remainder is not None:
            body.append(self._call_remainder(remainder))

        return List(body=body)
Esempio n. 10
0
    def make(self, hs):
        """
        Construct Callables and Calls implementing distributed-memory halo
        exchange for the HaloSpot ``hs``.
        """
        # Sanity check
        assert all(f.is_Function and f.grid is not None for f in hs.fmapper)

        for f, hse in hs.fmapper.items():
            # Build an MPIMsg, a data structure to be propagated across the
            # various halo exchange routines
            if (f, hse) not in self._msgs:
                key = self._gen_msgkey()
                msg = self._msgs.setdefault((f, hse), self._make_msg(f, hse, key))
            else:
                msg = self._msgs[(f, hse)]

            # Callables for send/recv/wait
            if (f.ndim, hse) not in self._cache_halo:
                self._make_all(f, hse, msg)

        msgs = [self._msgs[(f, hse)] for f, hse in hs.fmapper.items()]

        # Callable for poking the asynchronous progress engine
        key = self._gen_compkey()
        poke = self._make_poke(hs, key, msgs)
        if isinstance(poke, Callable):
            self._efuncs.append(poke)

        # Callable for compute over the CORE region
        callpoke = self._call_poke(poke)
        compute = self._make_compute(hs, key, msgs, callpoke)
        if isinstance(compute, Callable):
            self._efuncs.append(compute)

        # Callable for compute over the OWNED region
        region = self._make_region(hs, key)
        region = self._regions.setdefault(hs, region)
        callcompute = self._call_compute(hs, compute, msgs)
        remainder = self._make_remainder(hs, key, callcompute, region)
        if isinstance(remainder, Callable):
            self._efuncs.append(remainder)

        # Now build up the HaloSpot body, with explicit Calls to the constructed Callables
        haloupdates = []
        halowaits = []
        for i, (f, hse) in enumerate(hs.fmapper.items()):
            msg = self._msgs[(f, hse)]
            haloupdate, halowait = self._cache_halo[(f.ndim, hse)]
            haloupdates.append(self._call_haloupdate(haloupdate.name, f, hse, msg))
            if halowait is not None:
                halowaits.append(self._call_halowait(halowait.name, f, hse, msg))
        body = []
        body.append(HaloUpdateList(body=haloupdates))
        if callcompute is not None:
            body.append(callcompute)
        body.append(HaloWaitList(body=halowaits))
        if remainder is not None:
            body.append(self._call_remainder(remainder))

        return List(body=body)
Esempio n. 11
0
    def _minimize_remainders(self, nodes, state):
        """
        Reshape temporary tensors and adjust loop trip counts to prevent as many
        compiler-generated remainder loops as possible.
        """
        mapper = {}
        for tree in retrieve_iteration_tree(nodes):
            vector_iterations = [i for i in tree if i.is_Vectorizable]
            if not vector_iterations or len(vector_iterations) > 1:
                continue
            root = vector_iterations[0]
            if root.tag is None:
                continue

            # Padding
            writes = [
                i for i in FindSymbols('symbolics-writes').visit(root)
                if i.is_Array
            ]
            padding = []
            for i in writes:
                try:
                    simd_items = get_simd_items(i.dtype)
                except KeyError:
                    # Fallback to 16 (maximum expectable padding, for AVX512 registers)
                    simd_items = simdinfo['avx512f'] / np.dtype(
                        i.dtype).itemsize
                padding.append(simd_items - i.shape[-1] % simd_items)
            if len(set(padding)) == 1:
                padding = padding[0]
                for i in writes:
                    i.update(shape=i.shape[:-1] + (i.shape[-1] + padding, ))
            else:
                # Padding must be uniform -- not the case, so giving up
                continue

            # Dynamic trip count adjustment
            endpoint = root.end_symbolic
            if not endpoint.is_Symbol:
                continue
            condition = []
            externals = set(i.symbolic_shape[-1]
                            for i in FindSymbols().visit(root))
            for i in root.uindices:
                for j in externals:
                    condition.append(root.end_symbolic + padding < j)
            condition = ' || '.join(ccode(i) for i in condition)
            endpoint_padded = endpoint.func(name='_%s' % endpoint.name)
            init = cgen.Initializer(
                cgen.Value("const int", endpoint_padded),
                cgen.Line('(%s) ? %s : %s' %
                          (condition, ccode(endpoint + padding), endpoint)))

            # Update the Iteration bound
            limits = list(root.limits)
            limits[1] = endpoint_padded.func(endpoint_padded.name)
            rebuilt = list(tree)
            rebuilt[rebuilt.index(root)] = root._rebuild(limits=limits)

            mapper[tree[0]] = List(header=init, body=compose_nodes(rebuilt))

        processed = Transformer(mapper).visit(nodes)

        return processed, {}
Esempio n. 12
0
    def _make_fetchwaitprefetch(self, iet, sync_ops, pieces, root):
        threads = self.__make_threads()

        fetches = []
        prefetches = []
        presents = []
        for s in sync_ops:
            if s.direction is Forward:
                fc = s.fetch.subs(s.dim, s.dim.symbolic_min)
                fsize = s.function._C_get_field(FULL, s.dim).size
                fc_cond = fc + (s.size - 1) < fsize
                pfc = s.fetch + 1
                pfc_cond = pfc + (s.size - 1) < fsize
            else:
                fc = s.fetch.subs(s.dim, s.dim.symbolic_max)
                fc_cond = fc >= 0
                pfc = s.fetch - 1
                pfc_cond = pfc >= 0

            # Construct fetch IET
            imask = [(fc, s.size) if d.root is s.dim.root else FULL
                     for d in s.dimensions]
            fetch = List(header=self._P._map_to(s.function, imask))
            fetches.append(Conditional(fc_cond, fetch))

            # Construct present clauses
            imask = [(s.fetch, s.size) if d.root is s.dim.root else FULL
                     for d in s.dimensions]
            presents.extend(as_list(self._P._map_present(s.function, imask)))

            # Construct prefetch IET
            imask = [(pfc, s.size) if d.root is s.dim.root else FULL
                     for d in s.dimensions]
            prefetch = List(header=self._P._map_to_wait(
                s.function, imask, SharedData._field_id))
            prefetches.append(Conditional(pfc_cond, prefetch))

        functions = filter_ordered(s.function for s in sync_ops)
        casts = [PointerCast(f) for f in functions]

        # Turn init IET into a Callable
        name = self.sregistry.make_name(prefix='init_device')
        body = List(body=casts + fetches)
        parameters = filter_sorted(functions + derive_parameters(body))
        func = Callable(name, body, 'void', parameters, 'static')
        pieces.funcs.append(func)

        # Perform initial fetch by the main thread
        pieces.init.append(
            List(header=c.Comment("Initialize data stream for `%s`" %
                                  threads.name),
                 body=[Call(name, func.parameters), BlankLine]))

        # Turn prefetch IET into a threaded Callable
        name = self.sregistry.make_name(prefix='prefetch_host_to_device')
        body = List(header=c.Line(), body=casts + prefetches)
        tfunc, sdata = self.__make_tfunc(name, body, root, threads)
        pieces.funcs.append(tfunc)

        # Glue together all the IET pieces, including the activation bits
        iet = List(body=[
            BlankLine,
            BusyWait(
                CondNe(
                    FieldFromComposite(sdata._field_flag, sdata[
                        threads.index]), 1)),
            List(header=presents), iet,
            self.__make_activate_thread(threads, sdata, sync_ops)
        ])

        # Fire up the threads
        pieces.init.append(
            self.__make_init_threads(threads, sdata, tfunc, pieces))
        pieces.threads.append(threads)

        # Final wait before jumping back to Python land
        pieces.finalize.append(self.__make_finalize_threads(threads, sdata))

        return iet
Esempio n. 13
0
    def _make_fetchprefetch(self, iet, sync_ops, pieces, root):
        fid = SharedData._field_id

        fetches = []
        prefetches = []
        presents = []
        for s in sync_ops:
            f = s.function
            dimensions = s.dimensions
            fc = s.fetch
            ifc = s.ifetch
            pfc = s.pfetch
            fcond = s.fcond
            pcond = s.pcond

            # Construct init IET
            imask = [(ifc, s.size) if d.root is s.dim.root else FULL
                     for d in dimensions]
            fetch = PragmaTransfer(self.lang._map_to, f, imask=imask)
            fetches.append(Conditional(fcond, fetch))

            # Construct present clauses
            imask = [(fc, s.size) if d.root is s.dim.root else FULL
                     for d in dimensions]
            presents.append(
                PragmaTransfer(self.lang._map_present, f, imask=imask))

            # Construct prefetch IET
            imask = [(pfc, s.size) if d.root is s.dim.root else FULL
                     for d in dimensions]
            prefetch = PragmaTransfer(self.lang._map_to_wait,
                                      f,
                                      imask=imask,
                                      queueid=fid)
            prefetches.append(Conditional(pcond, prefetch))

        # Turn init IET into a Callable
        functions = filter_ordered(s.function for s in sync_ops)
        name = self.sregistry.make_name(prefix='init_device')
        body = List(body=fetches)
        parameters = filter_sorted(functions + derive_parameters(body))
        func = Callable(name, body, 'void', parameters, 'static')
        pieces.funcs.append(func)

        # Perform initial fetch by the main thread
        pieces.init.append(
            List(header=c.Comment("Initialize data stream"),
                 body=[Call(name, parameters), BlankLine]))

        # Turn prefetch IET into a ThreadFunction
        name = self.sregistry.make_name(prefix='prefetch_host_to_device')
        body = List(header=c.Line(), body=prefetches)
        tctx = make_thread_ctx(name, body, root, None, sync_ops,
                               self.sregistry)
        pieces.funcs.extend(tctx.funcs)

        # Glue together all the IET pieces, including the activation logic
        sdata = tctx.sdata
        threads = tctx.threads
        iet = List(body=[
            BlankLine,
            BusyWait(
                CondNe(
                    FieldFromComposite(sdata._field_flag, sdata[
                        threads.index]), 1))
        ] + presents + [iet, tctx.activate])

        # Fire up the threads
        pieces.init.append(tctx.init)

        # Final wait before jumping back to Python land
        pieces.finalize.append(tctx.finalize)

        # Keep track of created objects
        pieces.objs.add(sync_ops, sdata, threads)

        return iet
Esempio n. 14
0
    def _loop_blocking(self, iet):
        """
        Apply loop blocking to PARALLEL Iteration trees.
        """
        blockinner = bool(self.params.get('blockinner'))
        blockalways = bool(self.params.get('blockalways'))
        noinline = self._compiler_decoration('noinline', cgen.Comment('noinline?'))

        # Make sure loop blocking will span as many Iterations as possible
        iet = fold_blockable_tree(iet, blockinner)

        mapper = {}
        efuncs = OrderedDict()
        block_dims = []
        for tree in retrieve_iteration_tree(iet):
            # Is the Iteration tree blockable ?
            candidates = [i for i in tree if i.is_Parallel]
            if blockinner:
                iterations = candidates
            else:
                iterations = [i for i in candidates if not i.is_Vectorizable]
            if len(iterations) <= 1:
                continue
            root = iterations[0]
            if not IsPerfectIteration().visit(root):
                # Illegal/unsupported
                continue
            if not tree.root.is_Sequential and not blockalways:
                # Heuristic: avoid polluting the generated code with blocked
                # nests (thus increasing JIT compilation time and affecting
                # readability) if the blockable tree isn't embedded in a
                # sequential loop (e.g., a timestepping loop)
                continue

            # Apply loop blocking to `tree`
            interb = []
            intrab = []
            for i in iterations:
                d = BlockDimension(i.dim, name="%s%d_block" % (i.dim.name, len(mapper)))
                # Build Iteration over blocks
                interb.append(Iteration([], d, d.symbolic_max, offsets=i.offsets,
                                        properties=PARALLEL))
                # Build Iteration within a block
                intrab.append(i._rebuild([], limits=(d, d+d.step-1, 1), offsets=(0, 0)))
                # Record that a new BlockDimension has been introduced
                block_dims.append(d)

            # Construct the blocked tree
            blocked = compose_nodes(interb + intrab + [iterations[-1].nodes])
            blocked = unfold_blocked_tree(blocked)

            # Promote to a separate Callable
            dynamic_parameters = flatten((bi.dim, bi.dim.symbolic_size) for bi in interb)
            efunc0 = make_efunc("bf%d" % len(mapper), blocked, dynamic_parameters)

            # Compute the iteration ranges
            ranges = []
            for i, bi in zip(iterations, interb):
                maxb = i.symbolic_max - (i.symbolic_size % bi.dim.step)
                ranges.append(((i.symbolic_min, maxb, bi.dim.step),
                               (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

            # Build Calls to the `efunc`
            body = []
            for p in product(*ranges):
                dynamic_args_mapper = {}
                for bi, (m, M, b) in zip(interb, p):
                    dynamic_args_mapper[bi.dim] = (m, M)
                    dynamic_args_mapper[bi.dim.step] = (b,)
                call = efunc0.make_call(dynamic_args_mapper)
                body.append(List(header=noinline, body=call))

            # Build indirect Call to the `efunc0` Calls
            dynamic_parameters = [i.dim.root for i in candidates]
            dynamic_parameters.extend([bi.dim.step for bi in interb])
            efunc1 = make_efunc("f%d" % len(mapper), body, dynamic_parameters)

            # Track everything to ultimately transform the input `iet`
            mapper[root] = efunc1.make_call()
            efuncs[efunc1] = None
            efuncs[efunc0] = [efunc1.name]

        iet = Transformer(mapper).visit(iet)

        return iet, {'dimensions': block_dims, 'efuncs': efuncs}
Esempio n. 15
0
    def _loop_blocking(self, nodes, state):
        """
        Apply loop blocking to :class:`Iteration` trees.

        Blocking is applied to parallel iteration trees. Heuristically, innermost
        dimensions are not blocked to maximize the trip count of the SIMD loops.

        Different heuristics may be specified by passing the keywords ``blockshape``
        and ``blockinner`` to the DLE. The former, a dictionary, is used to indicate
        a specific block size for each blocked dimension. For example, for the
        :class:`Iteration` tree: ::

            for i
              for j
                for k
                  ...

        one may provide ``blockshape = {i: 4, j: 7}``, in which case the
        two outer loops will blocked, and the resulting 2-dimensional block will
        have size 4x7. The latter may be set to True to also block innermost parallel
        :class:`Iteration` objects.
        """
        exclude_innermost = not self.params.get('blockinner', False)
        ignore_heuristic = self.params.get('blockalways', False)

        # Make sure loop blocking will span as many Iterations as possible
        fold = fold_blockable_tree(nodes, exclude_innermost)

        mapper = {}
        blocked = OrderedDict()
        for tree in retrieve_iteration_tree(fold):
            # Is the Iteration tree blockable ?
            iterations = [i for i in tree if i.is_Parallel]
            if exclude_innermost:
                iterations = [i for i in iterations if not i.is_Vectorizable]
            if len(iterations) <= 1:
                continue
            root = iterations[0]
            if not IsPerfectIteration().visit(root):
                # Illegal/unsupported
                continue
            if not tree[0].is_Sequential and not ignore_heuristic:
                # Heuristic: avoid polluting the generated code with blocked
                # nests (thus increasing JIT compilation time and affecting
                # readability) if the blockable tree isn't embedded in a
                # sequential loop (e.g., a timestepping loop)
                continue

            # Decorate intra-block iterations with an IterationProperty
            TAG = tagger(len(mapper))

            # Build all necessary Iteration objects, individually. These will
            # subsequently be composed to implement loop blocking.
            inter_blocks = []
            intra_blocks = []
            remainders = []
            for i in iterations:
                name = "%s%d_block" % (i.dim.name, len(mapper))

                # Build Iteration over blocks
                dim = blocked.setdefault(i, Dimension(name=name))
                bsize = dim.symbolic_size
                bstart = i.limits[0]
                binnersize = i.dim.symbolic_extent + (i.offsets[1] -
                                                      i.offsets[0])
                bfinish = i.dim.symbolic_end - (binnersize % bsize) - 1
                inter_block = Iteration([],
                                        dim, [bstart, bfinish, bsize],
                                        offsets=i.offsets,
                                        properties=PARALLEL)
                inter_blocks.append(inter_block)

                # Build Iteration within a block
                limits = (dim, dim + bsize - 1, 1)
                intra_block = i._rebuild([],
                                         limits=limits,
                                         offsets=(0, 0),
                                         properties=i.properties +
                                         (TAG, ELEMENTAL))
                intra_blocks.append(intra_block)

                # Build unitary-increment Iteration over the 'leftover' region.
                # This will be used for remainder loops, executed when any
                # dimension size is not a multiple of the block size.
                remainder = i._rebuild(
                    [],
                    limits=[bfinish + 1, i.dim.symbolic_end, 1],
                    offsets=(i.offsets[1], i.offsets[1]))
                remainders.append(remainder)

            # Build blocked Iteration nest
            blocked_tree = compose_nodes(inter_blocks + intra_blocks +
                                         [iterations[-1].nodes])

            # Build remainder Iterations
            remainder_trees = []
            for n in range(len(iterations)):
                for c in combinations([i.dim for i in iterations], n + 1):
                    # First all inter-block Interations
                    nodes = [
                        b._rebuild(properties=b.properties + (REMAINDER, ))
                        for b, r in zip(inter_blocks, remainders)
                        if r.dim not in c
                    ]
                    # Then intra-block or remainder, for each dim (in order)
                    properties = (REMAINDER, TAG, ELEMENTAL)
                    for b, r in zip(intra_blocks, remainders):
                        handle = r if b.dim in c else b
                        nodes.append(handle._rebuild(properties=properties))
                    nodes.extend([iterations[-1].nodes])
                    remainder_trees.append(compose_nodes(nodes))

            # Will replace with blocked loop tree
            mapper[root] = List(body=[blocked_tree] + remainder_trees)

        rebuilt = Transformer(mapper).visit(fold)

        # Finish unrolling any previously folded Iterations
        processed = unfold_blocked_tree(rebuilt)

        # All blocked dimensions
        if not blocked:
            return processed, {}

        # Determine the block shape
        blockshape = self.params.get('blockshape')
        if not blockshape:
            # Use trivial heuristic for a suitable blockshape
            def heuristic(dim_size):
                ths = 8  # FIXME: This really needs to be improved
                return ths if dim_size > ths else 1

            blockshape = {k: heuristic for k in blocked.keys()}
        else:
            try:
                nitems, nrequired = len(blockshape), len(blocked)
                blockshape = {k: v for k, v in zip(blocked, blockshape)}
                if nitems > nrequired:
                    dle_warning("Provided 'blockshape' has more entries than "
                                "blocked loops; dropping entries ...")
                if nitems < nrequired:
                    dle_warning("Provided 'blockshape' has fewer entries than "
                                "blocked loops; dropping dimensions ...")
            except TypeError:
                blockshape = {list(blocked)[0]: blockshape}
            blockshape.update(
                {k: None
                 for k in blocked.keys() if k not in blockshape})

        # Track any additional arguments required to execute /state.nodes/
        arguments = [
            BlockingArg(v, k, blockshape[k]) for k, v in blocked.items()
        ]

        return processed, {'arguments': arguments, 'flags': 'blocking'}
Esempio n. 16
0
def iet_make(clusters, dtype):
    """
    Create an Iteration/Expression tree (IET) given an iterable of :class:`Cluster`s.

    :param clusters: The iterable :class:`Cluster`s for which the IET is built.
    :param dtype: The data type of the scalar expressions.
    """
    processed = []
    schedule = OrderedDict()
    for cluster in clusters:
        if not cluster.ispace.empty:
            root = None
            intervals = cluster.ispace.intervals

            # Can I reuse any of the previously scheduled Iterations ?
            index = 0
            for i0, i1 in zip(intervals, list(schedule)):
                if i0 != i1 or i0.dim in cluster.atomics:
                    break
                root = schedule[i1]
                index += 1
            needed = intervals[index:]

            # Build Expressions
            body = [
                Expression(
                    e, np.int32 if cluster.trace.is_index(e.lhs) else dtype)
                for e in cluster.exprs
            ]
            if not needed:
                body = List(body=body)

            # Build Iterations
            scheduling = []
            for i in reversed(needed):
                # Prepare any necessary unbounded index
                uindices = []
                for j, offs in cluster.ispace.sub_iterators.get(i.dim, []):
                    modulo = len(offs)
                    for n, o in enumerate(filter_ordered(offs)):
                        name = "%s%d" % (j.name, n)
                        vname = Scalar(name=name, dtype=np.int32)
                        value = (i.dim + o) % modulo
                        uindices.append(
                            UnboundedIndex(vname, value, value, j, j + o))

                # Retrieve the iteration direction
                direction = cluster.ispace.directions[i.dim]

                # Update IET and scheduling
                if i.dim in cluster.guards:
                    # Must wrap within an if-then scope
                    body = Conditional(cluster.guards[i.dim], body)
                    iteration = Iteration(body,
                                          i.dim,
                                          i.dim.limits,
                                          offsets=i.limits,
                                          direction=direction,
                                          uindices=uindices)
                    # Adding (None, None) ensures that nested iterations won't
                    # be reused by the next cluster
                    scheduling.extend([(None, None), (i, iteration)])
                else:
                    iteration = Iteration(body,
                                          i.dim,
                                          i.dim.limits,
                                          offsets=i.limits,
                                          direction=direction,
                                          uindices=uindices)
                    scheduling.append((i, iteration))

                # Prepare for next dimension
                body = iteration

            # If /needed/ is != [], root.dim might be a guarded dimension for /cluster/
            if root is not None and root.dim in cluster.guards:
                body = Conditional(cluster.guards[root.dim], body)

            # Update the current schedule
            scheduling = OrderedDict(reversed(scheduling))
            if root is None:
                processed.append(body)
                schedule = scheduling
            else:
                nodes = list(root.nodes) + [body]
                mapper = {root: root._rebuild(nodes, **root.args_frozen)}
                transformer = Transformer(mapper)
                processed = list(transformer.visit(processed))
                schedule = OrderedDict(
                    list(schedule.items())[:index] + list(scheduling.items()))
                for k, v in list(schedule.items()):
                    schedule[k] = transformer.rebuilt.get(v, v)
        else:
            # No Iterations are needed
            processed.extend([Expression(e, dtype) for e in cluster.exprs])

    return List(body=processed)
Esempio n. 17
0
    def _specialize_iet(self, iet, **kwargs):
        warning("The OPS backend is still work-in-progress")

        affine_trees = find_affine_trees(iet).items()

        # If there is no affine trees, then there is no loop to be optimized using OPS.
        if not affine_trees:
            return iet

        ops_init = Call(namespace['ops_init'], [0, 0, 2])
        ops_partition = Call(namespace['ops_partition'], Literal('""'))
        ops_exit = Call(namespace['ops_exit'])

        # Extract all symbols that need to be converted to ops_dat
        dims = []
        to_dat = set()
        for _, tree in affine_trees:
            dims.append(len(tree[0].dimensions))
            symbols = set(FindSymbols('symbolics').visit(tree[0].root))
            symbols -= set(FindSymbols('defines').visit(tree[0].root))
            to_dat |= symbols

        # Create the OPS block for this problem
        ops_block = OpsBlock('block')
        ops_block_init = Expression(ClusterizedEq(Eq(
            ops_block,
            namespace['ops_decl_block'](
                dims[0],
                Literal('"block"')
            )
        )))

        # To ensure deterministic code generation we order the datasets to
        # be generated (since a set is an unordered collection)
        to_dat = filter_sorted(to_dat)

        name_to_ops_dat = {}
        pre_time_loop = []
        after_time_loop = []
        for f in to_dat:
            if f.is_Constant:
                continue

            pre_time_loop.extend(list(create_ops_dat(f, name_to_ops_dat, ops_block)))
            # To return the result to Devito, it is necessary to copy the data
            # from the dat object back to the CPU memory.
            after_time_loop.extend(create_ops_fetch(f,
                                                    name_to_ops_dat,
                                                    self.time_dimension.extreme_max))

        # Generate ops kernels for each offloadable iteration tree
        mapper = {}
        for n, (_, tree) in enumerate(affine_trees):
            pre_loop, ops_kernel, ops_par_loop_call = opsit(
                tree, n, name_to_ops_dat, ops_block, dims[0]
            )

            pre_time_loop.extend(pre_loop)
            self._ops_kernels.append(ops_kernel)
            mapper[tree[0].root] = ops_par_loop_call
            mapper.update({i.root: mapper.get(i.root) for i in tree})  # Drop trees

        iet = Transformer(mapper).visit(iet)

        assert (d == dims[0] for d in dims), \
            "The OPS backend currently assumes that all kernels \
            have the same number of dimensions"

        self._headers.append(namespace['ops_define_dimension'](dims[0]))
        self._includes.extend(['stdio.h', 'ops_seq.h'])

        body = [ops_init, ops_block_init, *pre_time_loop,
                ops_partition, iet, *after_time_loop, ops_exit]

        return List(body=body)
Esempio n. 18
0
def iet_insert_C_decls(iet, func_table=None):
    """
    Given an Iteration/Expression tree ``iet``, build a new tree with the
    necessary symbol declarations. Declarations are placed as close as
    possible to the first symbol use.

    :param iet: The input Iteration/Expression tree.
    :param func_table: (Optional) a mapper from callable names within ``iet``
                       to :class:`Callable`s.
    """
    func_table = func_table or {}
    allocator = Allocator()
    mapper = OrderedDict()

    # Detect all IET nodes accessing symbols that need to be declared
    scopes = []
    me = MapExpressions()
    for k, v in me.visit(iet).items():
        if k.is_Call:
            func = func_table.get(k.name)
            if func is not None and func.local:
                scopes.extend(me.visit(func.root, queue=list(v)).items())
        scopes.append((k, v))

    # Classify, and then schedule declarations to stack/heap
    for k, v in scopes:
        if k.is_Expression:
            if k.is_scalar:
                # Inline declaration
                mapper[k] = LocalExpression(**k.args)
                continue
            objs = [k.write]
        elif k.is_Call:
            objs = k.params
        else:
            raise NotImplementedError("Cannot schedule declarations for IET "
                                      "node of type `%s`" % type(k))
        for i in objs:
            try:
                if i.is_LocalObject:
                    # On the stack
                    site = v[-1] if v else iet
                    allocator.push_stack(site, i)
                elif i.is_Array:
                    if i._mem_external:
                        # Nothing to do; e.g., a user-provided Function
                        continue
                    elif i._mem_stack:
                        # On the stack
                        key = lambda i: not i.is_Parallel
                        site = filter_iterations(v, key=key,
                                                 stop='asap') or [iet]
                        allocator.push_stack(site[-1], i)
                    else:
                        # On the heap, as a tensor that must be globally accessible
                        allocator.push_heap(i)
            except AttributeError:
                # E.g., a generic SymPy expression
                pass

    # Introduce declarations on the stack
    for k, v in allocator.onstack:
        mapper[k] = tuple(Element(i) for i in v)
    iet = Transformer(mapper, nested=True).visit(iet)
    for k, v in list(func_table.items()):
        if v.local:
            func_table[k] = MetaCall(
                Transformer(mapper).visit(v.root), v.local)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet
Esempio n. 19
0
def relax_incr_dimensions(iet, **kwargs):
    """
    Recast Iterations over IncrDimensions as ElementalFunctions; insert
    ElementalCalls to iterate over the "main" and "remainder" regions induced
    by the IncrDimensions.
    """
    sregistry = kwargs['sregistry']

    efuncs = []
    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        iterations = [i for i in tree if i.dim.is_Incr]
        if not iterations:
            continue

        root = iterations[0]
        if root in mapper:
            continue

        outer, inner = split(iterations, lambda i: not i.dim.parent.is_Incr)

        # Compute the iteration ranges
        ranges = []
        for i in outer:
            maxb = i.symbolic_max - (i.symbolic_size % i.dim.step)
            ranges.append(((i.symbolic_min, maxb, i.dim.step),
                           (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

        # Remove any offsets
        # E.g., `x = x_m + 2 to x_M - 2` --> `x = x_m to x_M`
        outer = [
            i._rebuild(limits=(i.dim.root.symbolic_min,
                               i.dim.root.symbolic_max, i.step)) for i in outer
        ]

        # Create the ElementalFunction
        name = sregistry.make_name(prefix="bf")
        body = compose_nodes(outer)
        dynamic_parameters = flatten(
            (i.symbolic_bounds, i.step) for i in outer)
        dynamic_parameters.extend(
            [i.step for i in inner if not is_integer(i.step)])
        efunc = make_efunc(name, body, dynamic_parameters)

        efuncs.append(efunc)

        # Create the ElementalCalls
        calls = []
        for p in product(*ranges):
            dynamic_args_mapper = {}
            for i, (m, M, b) in zip(outer, p):
                dynamic_args_mapper[i.symbolic_min] = m
                dynamic_args_mapper[i.symbolic_max] = M
                dynamic_args_mapper[i.step] = b
                for j in inner:
                    if j.dim.root is i.dim.root and not is_integer(j.step):
                        value = j.step if b is i.step else b
                        dynamic_args_mapper[j.step] = (value, )
            calls.append(efunc.make_call(dynamic_args_mapper))

        mapper[root] = List(body=calls)

    iet = Transformer(mapper).visit(iet)

    return iet, {'efuncs': efuncs}
Esempio n. 20
0
    def _create_elemental_functions(self, nodes, state):
        """
        Extract :class:`Iteration` sub-trees and move them into :class:`Callable`s.

        Currently, only tagged, elementizable Iteration objects are targeted.
        """
        noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

        functions = OrderedDict()
        mapper = {}
        for tree in retrieve_iteration_tree(nodes, mode='superset'):
            # Search an elementizable sub-tree (if any)
            tagged = filter_iterations(tree, lambda i: i.tag is not None, 'asap')
            if not tagged:
                continue
            root = tagged[0]
            if not root.is_Elementizable:
                continue
            target = tree[tree.index(root):]

            # Elemental function arguments
            args = []  # Found so far (scalars, tensors)
            maybe_required = set()  # Scalars that *may* have to be passed in
            not_required = set()  # Elemental function locally declared scalars

            # Build a new Iteration/Expression tree with free bounds
            free = []
            for i in target:
                name, bounds = i.dim.name, i.bounds_symbolic
                # Iteration bounds
                start = Scalar(name='%s_start' % name, dtype=np.int32)
                finish = Scalar(name='%s_finish' % name, dtype=np.int32)
                args.extend(zip([ccode(j) for j in bounds], (start, finish)))
                # Iteration unbounded indices
                ufunc = [Scalar(name='%s_ub%d' % (name, j), dtype=np.int32)
                         for j in range(len(i.uindices))]
                args.extend(zip([ccode(j.start) for j in i.uindices], ufunc))
                limits = [Symbol(start.name), Symbol(finish.name), 1]
                uindices = [UnboundedIndex(j.index, i.dim + as_symbol(k))
                            for j, k in zip(i.uindices, ufunc)]
                free.append(i._rebuild(limits=limits, offsets=None, uindices=uindices))
                not_required.update({i.dim}, set(j.index for j in i.uindices))

            # Construct elemental function body, and inspect it
            free = NestedTransformer(dict((zip(target, free)))).visit(root)
            expressions = FindNodes(Expression).visit(free)
            fsymbols = FindSymbols('symbolics').visit(free)

            # Add all definitely-required arguments
            not_required.update({i.output for i in expressions if i.is_scalar})
            for i in fsymbols:
                if i in not_required:
                    continue
                elif i.is_Array:
                    args.append(("(%s*)%s" % (c.dtype_to_ctype(i.dtype), i.name), i))
                elif i.is_TensorFunction:
                    args.append(("%s_vec" % i.name, i))
                elif i.is_Scalar:
                    args.append((i.name, i))

            # Add all maybe-required arguments that turn out to be required
            maybe_required.update(set(FindSymbols(mode='free-symbols').visit(free)))
            for i in fsymbols:
                not_required.update({as_symbol(i), i.indexify()})
                for j in i.symbolic_shape:
                    maybe_required.update(j.free_symbols)
            required = filter_sorted(maybe_required - not_required,
                                     key=attrgetter('name'))
            args.extend([(i.name, Scalar(name=i.name, dtype=i.dtype)) for i in required])

            call, params = zip(*args)
            name = "f_%d" % root.tag

            # Produce the new Call
            mapper[root] = List(header=noinline, body=Call(name, call))

            # Produce the new Callable
            functions.setdefault(name, Callable(name, free, 'void', flatten(params),
                                                ('static',)))

        # Transform the main tree
        processed = Transformer(mapper).visit(nodes)

        return processed, {'elemental_functions': functions.values()}
Esempio n. 21
0
def iet_insert_C_decls(iet, external=None):
    """
    Given an IET, build a new tree with the necessary symbol declarations.
    Declarations are placed as close as possible to the first symbol occurrence.

    Parameters
    ----------
    iet : Node
        The input Iteration/Expression tree.
    external : tuple, optional
        The symbols defined in some outer Callable, which therefore must not
        be re-defined.
    """
    external = external or []

    # Classify and then schedule declarations to stack/heap
    allocator = Allocator()
    mapper = OrderedDict()
    for k, v in MapExpressions().visit(iet).items():
        if k.is_Expression:
            if k.is_scalar_assign:
                # Inline declaration
                mapper[k] = LocalExpression(**k.args)
                continue
            objs = [k.write]
        elif k.is_Call:
            objs = k.arguments

        for i in objs:
            try:
                if i.is_LocalObject:
                    # On the stack
                    site = v[-1] if v else iet
                    allocator.push_stack(site, i)
                elif i.is_Array:
                    if i in external:
                        # The Array is to be defined in some foreign IET
                        continue
                    elif i._mem_stack:
                        # On the stack
                        key = lambda i: not i.is_Parallel
                        site = filter_iterations(v, key=key, stop='asap') or [iet]
                        allocator.push_stack(site[-1], i)
                    else:
                        # On the heap, as a tensor that must be globally accessible
                        allocator.push_heap(i)
            except AttributeError:
                # E.g., a generic SymPy expression
                pass

    # Introduce declarations on the stack
    for k, v in allocator.onstack:
        mapper[k] = tuple(Element(i) for i in v)
    iet = Transformer(mapper, nested=True).visit(iet)

    # Introduce declarations on the heap (if any)
    if allocator.onheap:
        decls, allocs, frees = zip(*allocator.onheap)
        iet = List(header=decls + allocs, body=iet, footer=frees)

    return iet
Esempio n. 22
0
 def _build_casts(self, iet):
     """Introduce array casts."""
     casts = [
         ArrayCast(f) for f in self.input if f.is_Tensor and f._mem_external
     ]
     return List(body=casts + [iet])
Esempio n. 23
0
def sendrecv(f, fixed):
    """Construct an IET performing a halo exchange along arbitrary
    dimension and side."""
    assert f.is_Function
    assert f.grid is not None

    comm = f.grid.distributor._C_comm

    buf_dims = [
        Dimension(name='buf_%s' % d.root) for d in f.dimensions
        if d not in fixed
    ]
    bufg = Array(name='bufg',
                 dimensions=buf_dims,
                 dtype=f.dtype,
                 scope='stack')
    bufs = Array(name='bufs',
                 dimensions=buf_dims,
                 dtype=f.dtype,
                 scope='stack')

    dat_dims = [Dimension(name='dat_%s' % d.root) for d in f.dimensions]
    dat = Array(name='dat',
                dimensions=dat_dims,
                dtype=f.dtype,
                scope='external')

    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]
    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    parameters = [bufg] + list(bufg.shape) + [dat] + list(dat.shape) + ofsg
    gather = Call('gather_%s' % f.name, parameters)
    parameters = [bufs] + list(bufs.shape) + [dat] + list(dat.shape) + ofss
    scatter = Call('scatter_%s' % f.name, parameters)

    # The scatter must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    MPI_Status = type('MPI_Status', (c_void_p, ), {})
    srecv = LocalObject(name='srecv', dtype=MPI_Status)

    MPI_Request = type('MPI_Request', (c_void_p, ), {})
    rrecv = LocalObject(name='rrecv', dtype=MPI_Request)
    rsend = LocalObject(name='rsend', dtype=MPI_Request)

    count = reduce(mul, bufs.shape, 1)
    recv = Call('MPI_Irecv', [
        bufs, count,
        Macro(numpy_to_mpitypes(f.dtype)), fromrank, '13', comm, rrecv
    ])
    send = Call('MPI_Isend', [
        bufg, count,
        Macro(numpy_to_mpitypes(f.dtype)), torank, '13', comm, rsend
    ])

    waitrecv = Call('MPI_Wait', [rrecv, srecv])
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    iet = List(body=[recv, gather, send, waitsend, waitrecv, scatter])
    iet = List(body=[ArrayCast(dat), iet_insert_C_decls(iet)])
    parameters = ([dat] + list(dat.shape) + list(bufs.shape) + ofsg + ofss +
                  [fromrank, torank, comm])
    return Callable('sendrecv_%s' % f.name, iet, 'void', parameters,
                    ('static', ))
Esempio n. 24
0
def test_create_efuncs_complex(complex_function):
    roots = [i[-1] for i in retrieve_iteration_tree(complex_function)]
    retagged = [j._rebuild(properties=tagger(i)) for i, j in enumerate(roots)]
    mapper = {
        i: j._rebuild(properties=(j.properties + (ELEMENTAL, )))
        for i, j in zip(roots, retagged)
    }
    function = Transformer(mapper).visit(complex_function)
    handle = transform(function, mode='split')
    block = List(body=[handle.nodes] + handle.efuncs)
    output = str(block.ccode)
    # Make output compiler independent
    output = [
        i for i in output.split('\n')
        if all([j not in i for j in ('#pragma', '/*')])
    ]
    assert '\n'.join(output) == \
        ("""void foo(float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec)
{
  for (int i = 0; i <= 3; i += 1)
  {
    f_0((float *)a,(float *)b,i_size,i,4,0);
    for (int j = 0; j <= 5; j += 1)
    {
      f_1((float *)a,(float *)b,(float *)c,(float *)d,i_size,j_size,k_size,i,j,7,0);
    }
    f_2((float *)a,(float *)b,i_size,i,4,0);
  }
}
void f_0(float *restrict a_vec, float *restrict b_vec,"""
         """ const int i_size, const int i, const int sf_M, const int sf_m)
{
  float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec;
  for (int s = sf_m; s <= sf_M; s += 1)
  {
    b[i] = a[i] + pow(b[i], 2) + 3;
  }
}
void f_1(float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec,"""
         """ const int i_size, const int j_size, const int k_size,"""
         """ const int i, const int j, const int kf_M, const int kf_m)
{
  float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec;
  float (*restrict c)[j_size] __attribute__ ((aligned (64))) = (float (*)[j_size]) c_vec;
  float (*restrict d)[j_size][k_size] __attribute__ ((aligned (64))) ="""
         """ (float (*)[j_size][k_size]) d_vec;
  for (int k = kf_m; k <= kf_M; k += 1)
  {
    a[i] = a[i]*b[i]*c[i][j]*d[i][j][k];
    a[i] = 4*(a[i] + c[i][j])*(b[i] + d[i][j][k]);
  }
}
void f_2(float *restrict a_vec, float *restrict b_vec,"""
         """ const int i_size, const int i, const int qf_M, const int qf_m)
{
  float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec;
  for (int q = qf_m; q <= qf_M; q += 1)
  {
    a[i] = 8.0F*a[i] + 6.0F/b[i];
  }
}""")
Esempio n. 25
0
    def _make_fetchwaitprefetch(self, iet, sync_ops, pieces, root):
        fetches = []
        prefetches = []
        presents = []
        for s in sync_ops:
            if s.direction is Forward:
                fc = s.fetch.subs(s.dim, s.dim.symbolic_min)
                pfc = s.fetch + 1
                fc_cond = s.next_cbk(s.dim.symbolic_min)
                pfc_cond = s.next_cbk(s.dim + 1)
            else:
                fc = s.fetch.subs(s.dim, s.dim.symbolic_max)
                pfc = s.fetch - 1
                fc_cond = s.next_cbk(s.dim.symbolic_max)
                pfc_cond = s.next_cbk(s.dim - 1)

            # Construct init IET
            imask = [(fc, s.size) if d.root is s.dim.root else FULL for d in s.dimensions]
            fetch = PragmaList(self.lang._map_to(s.function, imask),
                               {s.function} | fc.free_symbols)
            fetches.append(Conditional(fc_cond, fetch))

            # Construct present clauses
            imask = [(s.fetch, s.size) if d.root is s.dim.root else FULL
                     for d in s.dimensions]
            presents.extend(as_list(self.lang._map_present(s.function, imask)))

            # Construct prefetch IET
            imask = [(pfc, s.size) if d.root is s.dim.root else FULL
                     for d in s.dimensions]
            prefetch = PragmaList(self.lang._map_to_wait(s.function, imask,
                                                         SharedData._field_id),
                                  {s.function} | pfc.free_symbols)
            prefetches.append(Conditional(pfc_cond, prefetch))

        # Turn init IET into a Callable
        functions = filter_ordered(s.function for s in sync_ops)
        name = self.sregistry.make_name(prefix='init_device')
        body = List(body=fetches)
        parameters = filter_sorted(functions + derive_parameters(body))
        func = Callable(name, body, 'void', parameters, 'static')
        pieces.funcs.append(func)

        # Perform initial fetch by the main thread
        pieces.init.append(List(
            header=c.Comment("Initialize data stream"),
            body=[Call(name, parameters), BlankLine]
        ))

        # Turn prefetch IET into a ThreadFunction
        name = self.sregistry.make_name(prefix='prefetch_host_to_device')
        body = List(header=c.Line(), body=prefetches)
        tctx = make_thread_ctx(name, body, root, None, sync_ops, self.sregistry)
        pieces.funcs.extend(tctx.funcs)

        # Glue together all the IET pieces, including the activation logic
        sdata = tctx.sdata
        threads = tctx.threads
        iet = List(body=[
            BlankLine,
            BusyWait(CondNe(FieldFromComposite(sdata._field_flag,
                                               sdata[threads.index]), 1)),
            List(header=presents),
            iet,
            tctx.activate
        ])

        # Fire up the threads
        pieces.init.append(tctx.init)
        pieces.threads.append(threads)

        # Final wait before jumping back to Python land
        pieces.finalize.append(tctx.finalize)

        return iet
Esempio n. 26
0
 def _profile_sections(self, iet):
     """Introduce C-level profiling nodes within the Iteration/Expression tree."""
     return List(body=iet), None
Esempio n. 27
0
    def _loop_blocking(self, iet):
        """
        Apply loop blocking to PARALLEL Iteration trees.
        """
        blockinner = bool(self.params.get('blockinner'))
        blockalways = bool(self.params.get('blockalways'))

        # Make sure loop blocking will span as many Iterations as possible
        iet = fold_blockable_tree(iet, blockinner)

        mapper = {}
        efuncs = []
        block_dims = []
        for tree in retrieve_iteration_tree(iet):
            # Is the Iteration tree blockable ?
            iterations = filter_iterations(tree, lambda i: i.is_Parallel)
            if not blockinner:
                iterations = iterations[:-1]
            if len(iterations) <= 1:
                continue
            root = iterations[0]
            if not blockalways:
                # Heuristically bypass loop blocking if we think `tree`
                # won't be computationally expensive. This will help with code
                # size/redability, JIT time, and auto-tuning time
                if not (tree.root.is_Sequential or iet.is_Callable):
                    # E.g., not inside a time-stepping Iteration
                    continue
                if any(i.dim.is_Sub and i.dim.local for i in tree):
                    # At least an outer Iteration is over a local SubDimension,
                    # which suggests the computational cost of this Iteration
                    # nest will be negligible w.r.t. the "core" Iteration nest
                    # (making use of non-local (Sub)Dimensions only)
                    continue
            if not IsPerfectIteration().visit(root):
                # Don't know how to block non-perfect nests
                continue

            # Apply loop blocking to `tree`
            interb = []
            intrab = []
            for i in iterations:
                d = BlockDimension(i.dim, name="%s%d_blk" % (i.dim.name, len(mapper)))
                block_dims.append(d)
                # Build Iteration over blocks
                properties = (PARALLEL,) + ((AFFINE,) if i.is_Affine else ())
                interb.append(Iteration([], d, d.symbolic_max, properties=properties))
                # Build Iteration within a block
                intrab.append(i._rebuild([], limits=(d, d+d.step-1, 1), offsets=(0, 0)))

            # Construct the blocked tree
            blocked = compose_nodes(interb + intrab + [iterations[-1].nodes])
            blocked = unfold_blocked_tree(blocked)

            # Promote to a separate Callable
            dynamic_parameters = flatten((bi.dim, bi.dim.symbolic_size) for bi in interb)
            efunc = make_efunc("bf%d" % len(mapper), blocked, dynamic_parameters)
            efuncs.append(efunc)

            # Compute the iteration ranges
            ranges = []
            for i, bi in zip(iterations, interb):
                maxb = i.symbolic_max - (i.symbolic_size % bi.dim.step)
                ranges.append(((i.symbolic_min, maxb, bi.dim.step),
                               (maxb + 1, i.symbolic_max, i.symbolic_max - maxb)))

            # Build Calls to the `efunc`
            body = []
            for p in product(*ranges):
                dynamic_args_mapper = {}
                for bi, (m, M, b) in zip(interb, p):
                    dynamic_args_mapper[bi.dim] = (m, M)
                    dynamic_args_mapper[bi.dim.step] = (b,)
                call = efunc.make_call(dynamic_args_mapper)
                body.append(List(body=call))

            mapper[root] = List(body=body)

        iet = Transformer(mapper).visit(iet)

        return iet, {'dimensions': block_dims, 'efuncs': efuncs,
                     'args': [i.step for i in block_dims]}
Esempio n. 28
0
    def _loop_blocking(self, nodes, state):
        """Apply loop blocking to PARALLEL Iteration trees."""
        exclude_innermost = not self.params.get('blockinner', False)
        ignore_heuristic = self.params.get('blockalways', False)

        # Make sure loop blocking will span as many Iterations as possible
        fold = fold_blockable_tree(nodes, exclude_innermost)

        mapper = {}
        blocked = OrderedDict()
        for tree in retrieve_iteration_tree(fold):
            # Is the Iteration tree blockable ?
            iterations = [i for i in tree if i.is_Parallel]
            if exclude_innermost:
                iterations = [i for i in iterations if not i.is_Vectorizable]
            if len(iterations) <= 1:
                continue
            root = iterations[0]
            if not IsPerfectIteration().visit(root):
                # Illegal/unsupported
                continue
            if not tree.root.is_Sequential and not ignore_heuristic:
                # Heuristic: avoid polluting the generated code with blocked
                # nests (thus increasing JIT compilation time and affecting
                # readability) if the blockable tree isn't embedded in a
                # sequential loop (e.g., a timestepping loop)
                continue

            # Decorate intra-block iterations with an IterationProperty
            TAG = tagger(len(mapper))

            # Build all necessary Iteration objects, individually. These will
            # subsequently be composed to implement loop blocking.
            inter_blocks = []
            intra_blocks = []
            remainders = []
            for i in iterations:
                # Build Iteration over blocks
                name = "%s%d_block" % (i.dim.name, len(mapper))
                dim = blocked.setdefault(i, BlockDimension(i.dim, name=name))
                binnersize = i.symbolic_size + (i.offsets[1] - i.offsets[0])
                bmax = i.dim.symbolic_max - (binnersize % dim.step)
                inter_block = Iteration([], dim, bmax, offsets=i.offsets,
                                        properties=PARALLEL)
                inter_blocks.append(inter_block)

                # Build Iteration within a block
                limits = (dim, dim + dim.step - 1, 1)
                intra_block = i._rebuild([], limits=limits, offsets=(0, 0),
                                         properties=i.properties + (TAG, ELEMENTAL))
                intra_blocks.append(intra_block)

                # Build unitary-increment Iteration over the 'leftover' region.
                # This will be used for remainder loops, executed when any
                # dimension size is not a multiple of the block size.
                remainder = i._rebuild([], limits=[bmax + 1, i.dim.symbolic_max, 1],
                                       offsets=(i.offsets[1], i.offsets[1]))
                remainders.append(remainder)

            # Build blocked Iteration nest
            blocked_tree = compose_nodes(inter_blocks + intra_blocks +
                                         [iterations[-1].nodes])

            # Build remainder Iterations
            remainder_trees = []
            for n in range(len(iterations)):
                for c in combinations([i.dim for i in iterations], n + 1):
                    # First all inter-block Interations
                    nodes = [b._rebuild(properties=b.properties + (REMAINDER,))
                             for b, r in zip(inter_blocks, remainders)
                             if r.dim not in c]
                    # Then intra-block or remainder, for each dim (in order)
                    properties = (REMAINDER, TAG, ELEMENTAL)
                    for b, r in zip(intra_blocks, remainders):
                        handle = r if b.dim in c else b
                        nodes.append(handle._rebuild(properties=properties))
                    nodes.extend([iterations[-1].nodes])
                    remainder_trees.append(compose_nodes(nodes))

            # Will replace with blocked loop tree
            mapper[root] = List(body=[blocked_tree] + remainder_trees)

        rebuilt = Transformer(mapper).visit(fold)

        # Finish unrolling any previously folded Iterations
        processed = unfold_blocked_tree(rebuilt)

        return processed, {'dimensions': list(blocked.values())}
Esempio n. 29
0
    def _minimize_remainders(self, iet):
        """
        Reshape temporary tensors and adjust loop trip counts to prevent as many
        compiler-generated remainder loops as possible.
        """
        # The innermost dimension is the one that might get padded
        p_dim = -1

        mapper = {}
        for tree in retrieve_iteration_tree(iet):
            vector_iterations = [i for i in tree if i.is_Vectorizable]
            if not vector_iterations or len(vector_iterations) > 1:
                continue
            root = vector_iterations[0]

            # Padding
            writes = [i.write for i in FindNodes(Expression).visit(root)
                      if i.write.is_Array]
            padding = []
            for i in writes:
                try:
                    simd_items = self.platform.simd_items_per_reg(i.dtype)
                except KeyError:
                    return iet, {}
                padding.append(simd_items - i.shape[-1] % simd_items)
            if len(set(padding)) == 1:
                padding = padding[0]
                for i in writes:
                    padded = (i._padding[p_dim][0], i._padding[p_dim][1] + padding)
                    i.update(padding=i._padding[:p_dim] + (padded,))
            else:
                # Padding must be uniform -- not the case, so giving up
                continue

            # Dynamic trip count adjustment
            endpoint = root.symbolic_max
            if not endpoint.is_Symbol:
                continue
            condition = []
            externals = set(i.symbolic_shape[-1] for i in FindSymbols().visit(root)
                            if i.is_Tensor)
            for i in root.uindices:
                for j in externals:
                    condition.append(root.symbolic_max + padding < j)
            condition = ' && '.join(ccode(i) for i in condition)
            endpoint_padded = endpoint.func('_%s' % endpoint.name)
            init = cgen.Initializer(
                cgen.Value("const int", endpoint_padded),
                cgen.Line('(%s) ? %s : %s' % (condition,
                                              ccode(endpoint + padding),
                                              endpoint))
            )

            # Update the Iteration bound
            limits = list(root.limits)
            limits[1] = endpoint_padded.func(endpoint_padded.name)
            rebuilt = list(tree)
            rebuilt[rebuilt.index(root)] = root._rebuild(limits=limits)

            mapper[tree[0]] = List(header=init, body=compose_nodes(rebuilt))

        processed = Transformer(mapper).visit(iet)

        return processed, {}
Esempio n. 30
0
def iet_make(clusters, dtype):
    """
    Create an Iteration/Expression tree (IET) given an iterable of :class:`Cluster`s.

    :param clusters: The iterable :class:`Cluster`s for which the IET is built.
    :param dtype: The data type of the scalar expressions.
    """
    processed = []
    schedule = OrderedDict()
    for cluster in clusters:
        if not cluster.ispace.empty:
            root = None
            intervals = cluster.ispace.intervals

            # Can I reuse any of the previously scheduled Iterations ?
            index = 0
            for i0, i1 in zip(intervals, list(schedule)):
                if i0 != i1 or i0.dim in clusters.atomics[cluster]:
                    break
                root = schedule[i1]
                index += 1
            needed = intervals[index:]

            # Build Iterations, including any necessary unbounded index
            iters = []
            for i in needed:
                uindices = []
                for j, offs in cluster.ispace.sub_iterators.get(i.dim, []):
                    for n, o in enumerate(filter_ordered(offs)):
                        name = "%s%d" % (j.name, n)
                        vname = Scalar(name=name, dtype=np.int32)
                        value = (i.dim + o) % j.modulo
                        uindices.append(UnboundedIndex(vname, value, value, j, j + o))
                iters.append(Iteration([], i.dim, i.dim.limits, offsets=i.limits,
                                       uindices=uindices))

            # Build Expressions
            exprs = [Expression(v, np.int32 if cluster.trace.is_index(k) else dtype)
                     for k, v in cluster.trace.items()]

            # Compose Iterations and Expressions
            body, tree = compose_nodes(iters + [exprs], retrieve=True)

            # Update the current scheduling
            scheduling = OrderedDict(zip(needed, tree))
            if root is None:
                processed.append(body)
                schedule = scheduling
            else:
                nodes = list(root.nodes) + [body]
                mapper = {root: root._rebuild(nodes, **root.args_frozen)}
                transformer = Transformer(mapper)
                processed = list(transformer.visit(processed))
                schedule = OrderedDict(list(schedule.items())[:index] +
                                       list(scheduling.items()))
                for k, v in list(schedule.items()):
                    schedule[k] = transformer.rebuilt.get(v, v)
        else:
            # No Iterations are needed
            processed.extend([Expression(e, dtype) for e in cluster.exprs])

    return List(body=processed)