Example #1
0
    def _make_partree(self, candidates, nthreads=None):
        """
        Parallelize the `candidates` Iterations attaching suitable OpenMP pragmas.

        The first candidate is the root of the parallel loop nest. The Iterations
        returned by `_find_collapsable` contribute to the `ncollapse` value.

        Returns the triple `(root, partree, collapsed)`, where `collapsed` is the
        list made of `partree` followed by the collapsable Iterations.
        """
        assert candidates
        root = candidates[0]

        # Iterations that can be collapsed together with `root`
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = len(collapsable) + 1

        # Build the ingredients of the ParallelTree
        if not all(i.is_Affine for i in candidates):
            # pragma omp for ... schedule(..., expr)
            assert nthreads is None
            nthreads = self.nthreads_nonaffine
            chunk_size = Symbol(name='chunk_size')
            body = ParallelIteration(ncollapse=ncollapse, chunk_size=chunk_size,
                                     **root.args)

            # The chunk size is computed at runtime, right before the loop nest
            sizes = [root.symbolic_size] + [j.symbolic_size for j in collapsable]
            niters = prod(sizes)
            value = INT(Max(niters / (nthreads*self.CHUNKSIZE_NONAFFINE), 1))
            prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]
        else:
            # The scheduling policy depends on the operation count of the loop nest
            bundles = FindNodes(ExpressionBundle).visit(root)
            total_ops = sum(b.ops for b in bundles)
            schedule = 'dynamic' if total_ops >= self.DYNAMIC_WORK else 'static'

            if nthreads is not None:
                # pragma omp parallel for ... schedule(..., 1)
                body = ParallelIteration(schedule=schedule, parallel=True,
                                         ncollapse=ncollapse, nthreads=nthreads,
                                         **root.args)
            else:
                # pragma omp for ... schedule(..., 1)
                nthreads = self.nthreads
                body = ParallelIteration(schedule=schedule, ncollapse=ncollapse,
                                         **root.args)
            prefix = []

        partree = ParallelTree(prefix, body, nthreads=nthreads)
        collapsed = [partree] + collapsable

        return root, partree, collapsed
Example #2
0
    def _make_reductions(self, partree):
        """
        Handle the increments found within `partree`.

        Nothing is done unless at least one of the collapsed Iterations is
        ParallelAtomic. Otherwise, either a reduction is attached to the root
        of `partree` or each increment is rebuilt with the atomic pragma.
        """
        if all(not i.is_ParallelAtomic for i in partree.collapsed):
            return partree

        increments = [e for e in FindNodes(Expression).visit(partree)
                      if e.is_Increment]
        outputs = [e.output for e in increments]

        all_affine = all(i.is_Affine for i in partree.collapsed)
        if all_affine or not any(o.is_Indexed for o in outputs):
            # Implement reduction
            root = partree.root
            mapper = {root: root._rebuild(reduction=outputs)}
        else:
            # Make sure the increment is atomic
            mapper = {e: e._rebuild(pragmas=self.lang['atomic']) for e in increments}

        return Transformer(mapper).visit(partree)
Example #3
0
def test_codegen_quality0():
    """Check the structure of the code generated with the `linearize` option."""
    grid = Grid(shape=(4, 4))
    u = TimeFunction(name='u', grid=grid, space_order=2)

    op = Operator(Eq(u.forward, u.dx2 + 1.), opt=('advanced', {'linearize': True}))

    # The access macro for `u` must appear in the generated code
    assert 'uL0' in str(op)

    # Six expressions in total; all but the last two must be `const long` ones
    expressions = FindNodes(Expression).visit(op)
    assert len(expressions) == 6
    for e in expressions[:-2]:
        assert 'const long' in str(e)

    # Only four access macros necessary, namely `uL0`, `aL0`, `bufL0`, `bufL1` (the
    # other three obviously are _POSIX_C_SOURCE, START_TIMER, STOP_TIMER)
    assert len(op._headers) == 7
Example #4
0
    def _make_partree(self, candidates, nthreads=None):
        """
        Build a ParallelTree rooted at the first of the `candidates` Iterations.

        Returns the pair `(root, partree)`.
        """
        assert candidates
        root = candidates[0]

        # Iterations that can be collapsed together with `root`
        collapsable = self._find_collapsable(root, candidates)
        ncollapse = len(collapsable) + 1

        # Build the ingredients of the ParallelTree
        if not all(i.is_Affine for i in candidates):
            # pragma ... for ... schedule(..., expr)
            assert nthreads is None
            nthreads = self.nthreads_nonaffine
            chunk_size = Symbol(name='chunk_size')
            body = self.HostIteration(ncollapse=ncollapse, chunk_size=chunk_size,
                                      **root.args)

            # The chunk size is computed at runtime, right before the loop nest
            sizes = [root.symbolic_size] + [j.symbolic_size for j in collapsable]
            niters = prod(sizes)
            value = INT(Max(niters / (nthreads*self.chunk_nonaffine), 1))
            prefix = [Expression(DummyEq(chunk_size, value, dtype=np.int32))]
        else:
            # The scheduling policy depends on the operation count of the loop nest
            bundles = FindNodes(ExpressionBundle).visit(root)
            total_ops = sum(b.ops for b in bundles)
            schedule = 'dynamic' if total_ops >= self.dynamic_work else 'static'

            if nthreads is not None:
                # pragma ... parallel for ... schedule(..., 1)
                body = self.HostIteration(schedule=schedule, parallel=True,
                                          ncollapse=ncollapse, nthreads=nthreads,
                                          **root.args)
            else:
                # pragma ... for ... schedule(..., 1)
                nthreads = self.nthreads
                body = self.HostIteration(schedule=schedule, ncollapse=ncollapse,
                                          **root.args)
            prefix = []

        return root, ParallelTree(prefix, body, nthreads=nthreads)
Example #5
0
    def _make_parallel(self, iet):
        """
        Replace the parallelizable Iteration trees in `iet` with parallel regions.

        Returns the transformed IET and a dict carrying the new arguments
        (`'args'`) and the required header files (`'includes'`).
        """
        mapper = {}
        parrays = {}
        for tree in retrieve_iteration_tree(iet):
            # Get the parallelizable Iterations in `tree`
            candidates = filter_iterations(tree, key=self.key)
            if not candidates:
                continue

            # Outer parallelism
            root, partree, collapsed = self._make_partree(candidates)
            if partree is None or root in mapper:
                continue

            # Nested parallelism, then reductions, then single-thread prodders
            partree = self._make_nested_partree(partree)
            partree = self._make_reductions(partree, collapsed)
            partree = self._make_threaded_prodders(partree)

            # Wrap within a parallel region, protecting it if necessary
            parregion = self._make_parregion(partree, parrays)
            mapper[root] = self._make_guard(parregion, collapsed)

        iet = Transformer(mapper).visit(iet)

        # The new arguments introduced by this pass
        args = [s for s in FindSymbols().visit(iet) if isinstance(s, NThreadsMixin)]
        for node in FindNodes(HeapGlobal).visit(iet):
            args.append((node.array, True))
            args.append(node.parray)

        return iet, {'args': args, 'includes': [self.lang['header']]}
Example #6
0
    def _make_parallel(self, iet):
        """
        Replace the omp-parallelizable Iteration trees in `iet` with parallel regions.

        Returns the transformed IET and a dict carrying the new arguments
        (`'args'`) and the required header files (`'includes'`).
        """
        mapper = {}
        parrays = {}
        for tree in retrieve_iteration_tree(iet):
            # Get the omp-parallelizable Iterations in `tree`
            candidates = filter_iterations(tree, key=self.key)
            if not candidates:
                continue

            # Outer parallelism
            root, partree, collapsed = self._make_partree(candidates)
            if root in mapper:
                continue

            # Nested parallelism, then reductions, then single-thread prodders
            partree = self._make_nested_partree(partree)
            partree = self._make_reductions(partree, collapsed)
            partree = self._make_threaded_prodders(partree)

            # Wrap within a parallel region, declaring private and shared variables
            parregion = self._make_parregion(partree, parrays)

            # Protect the parallel region in case of 0-valued step increments
            mapper[root] = self._make_guard(parregion, collapsed)

        iet = Transformer(mapper).visit(iet)

        # The new arguments introduced by this pass
        args = [s for s in FindSymbols().visit(iet) if isinstance(s, NThreadsMixin)]
        for node in FindNodes(Dereference).visit(iet):
            args.append((node.array, True))
            args.append(node.parray)

        return iet, {'args': args, 'includes': ['omp.h']}
Example #7
0
    def test_tasking_lock_placement(self):
        """Check where the wait-lock ends up in the code generated with tasking."""
        grid = Grid(shape=(10, 10, 10))

        f = Function(name='f', grid=grid, space_order=2)
        u = TimeFunction(name='u', grid=grid)
        usave = TimeFunction(name='usave', grid=grid, save=10)

        op = Operator([Eq(f, u + 1), Eq(u.forward, f.dx + u + 1), Eq(usave, u)],
                      opt=('tasking', 'orchestrate'))

        # Check generated code -- the wait-lock is expected in section1
        assert len(retrieve_iteration_tree(op)) == 5

        locks = [s for s in FindSymbols().visit(op) if isinstance(s, Lock)]
        assert len(locks) == 1

        sections = FindNodes(Section).visit(op)
        assert len(sections) == 3
        assert sections[0].body[0].body[0].body[0].is_Iteration

        waitlock = sections[1].body[0].body[0].body[0].body[0]
        assert str(waitlock) == 'while(lock0[t1] == 0);'
Example #8
0
    def _make_reductions(self, partree, collapsed):
        """
        Handle the increments found within `partree`.

        Nothing is done unless `partree` is ParallelAtomic. Otherwise, either a
        reduction clause is attached to the root of `partree` or each increment
        is wrapped in a List carrying the atomic pragma as header.
        """
        if not partree.is_ParallelAtomic:
            return partree

        # Collect expressions inducing reductions
        increments = [e for e in FindNodes(Expression).visit(partree)
                      if e.is_Increment and not e.is_ForeignExpression]
        outputs = [e.output for e in increments]

        needs_atomic = (any(not i.is_Affine for i in collapsed)
                        and any(o.is_Indexed for o in outputs))
        if needs_atomic:
            # Introduce one `omp atomic` pragma for each increment
            mapper = {e: List(header=self.lang['atomic'], body=e) for e in increments}
        else:
            # Introduce reduction clause
            mapper = {partree.root: partree.root._rebuild(reduction=outputs)}

        return Transformer(mapper).visit(partree)
Example #9
0
    def place_casts(self, iet):
        """
        Create a new IET with the necessary type casts.

        Parameters
        ----------
        iet : Callable
            The input Iteration/Expression tree.

        Returns
        -------
        tuple
            The rebuilt IET, whose body starts with the casts, and an empty dict.
        """
        # Make the generated code less verbose: if a non-Array parameter does not
        # appear in any Expression, that is, if the parameter is merely propagated
        # down to another Call, then there's no need to cast it
        need_cast = set()
        for expr in FindNodes(Expression).visit(iet):
            need_cast.update(f for f in expr.functions if f.is_Tensor)

        # Array parameters are always cast
        need_cast.update(p for p in iet.parameters if p.is_Array)

        # Emit the casts following the order of the parameters
        casts = tuple(ArrayCast(p) for p in iet.parameters if p in need_cast)

        return iet._rebuild(body=casts + iet.body), {}
Example #10
0
    def test_tasking_fused(self):
        """Check generated code and numerical output when tasks get fused."""
        nt = 10
        bundle0 = Bundle()
        grid = Grid(shape=(10, 10, 10), subdomains=bundle0)

        tmp = Function(name='tmp', grid=grid)
        u = TimeFunction(name='u', grid=grid, save=nt)
        v = TimeFunction(name='v', grid=grid, save=nt)
        w = TimeFunction(name='w', grid=grid)

        eqns = [Eq(w.forward, w + 1),
                Eq(tmp, w.forward),
                Eq(u.forward, tmp, subdomain=bundle0),
                Eq(v.forward, tmp, subdomain=bundle0)]

        op = Operator(eqns, opt=('tasking', 'fuse', 'orchestrate'))

        # Check generated code
        assert len(retrieve_iteration_tree(op)) == 5

        # Only 1 because it's only `tmp` that needs protection
        locks = [s for s in FindSymbols().visit(op) if isinstance(s, Lock)]
        assert len(locks) == 1

        assert len(op._func_table) == 2
        efunc = op._func_table['copy_device_to_host0'].root
        exprs = FindNodes(Expression).visit(efunc)
        assert len(exprs) == 20

        # The tail of the function: unpack the shared data, take the lock, perform
        # the two writes, release the lock, signal completion
        assert str(exprs[12]) == 'int id = sdata0->id;'
        assert str(exprs[13]) == 'int deviceid = sdata0->deviceid;'
        assert str(exprs[14]) == 'const int time = sdata0->time;'
        assert str(exprs[15]) == 'lock0[0] = 1;'
        assert exprs[16].write is u
        assert exprs[17].write is v
        assert str(exprs[18]) == 'lock0[0] = 2;'
        assert str(exprs[19]) == 'sdata0->flag = 1;'

        op.apply(time_M=nt - 2)

        for func in (u, v):
            assert np.all(func.data[nt - 1] == 9)
Example #11
0
File: misc.py Project: ofmla/devito
def hoist_prodders(iet):
    """
    Move Prodders within the outer levels of an Iteration tree.

    Each periodic Prodder is removed from its original position and a rebuilt
    copy is appended to the nodes of the chosen Iteration.

    Returns the transformed IET and an empty dict.
    """
    def is_target(i):
        # Iterations over an incremental Dimension with non-unitary step
        return i.dim.is_Incr and i.dim.step != 1

    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        for prodder in FindNodes(Prodder).visit(tree.root):
            if not prodder._periodic:
                continue

            try:
                # The innermost among the target Iterations, if any
                candidate = filter_iterations(tree, is_target)[-1]
            except IndexError:
                # Fallback: use the outermost Iteration
                candidate = tree.root

            hoisted = prodder._rebuild()
            mapper[candidate] = candidate._rebuild(nodes=candidate.nodes + (hoisted,))
            mapper[prodder] = None

    return Transformer(mapper, nested=True).visit(iet), {}
Example #12
0
def test_tti_clusters_to_graph():
    """Check the clusters, and the trace of the main one, for the TTI forward operator."""
    solver = tti_operator()

    # Expressions are searched in both the elemental functions and the operator
    roots = (solver.op_fwd('centered').elemental_functions +
             (solver.op_fwd('centered'), ))
    expressions = [node.expr for node in FindNodes(Expression).visit(roots)]
    stencils = solver.op_fwd('centered')._retrieve_stencils(expressions)

    clusters = clusterize(expressions, stencils)
    assert len(clusters) == 3

    main_cluster = clusters[0]
    n_output_tensors = len(main_cluster.trace)

    rewritten = rewrite([main_cluster], mode='basic')
    assert len(rewritten) == 1

    graph = rewritten[0].trace
    tensors = [v for v in graph.values() if v.is_tensor]
    assert len(tensors) == n_output_tensors  # u and v
    assert all(v.reads or v.readby for v in graph.values())
Example #13
0
def test_codegen_quality1():
    """Check the structure of the code generated with `linearize` and `cire-mingain`."""
    grid = Grid(shape=(4, 4, 4))
    u = TimeFunction(name='u', grid=grid, space_order=2)

    opt = ('advanced', {'linearize': True, 'cire-mingain': 0})
    op = Operator(Eq(u.forward, u.dy.dy + 1.), opt=opt)

    # The access macro for `u` must appear in the generated code
    assert 'uL0' in str(op)

    # 11 expressions in total are expected, 8 of which are for the linearized accesses
    exprs = FindNodes(Expression).visit(op)
    assert len(exprs) == 11
    leading, trailing = exprs[:-3], exprs[-3:]
    for e in leading:
        assert 'const long' in str(e)
    for e in trailing:
        assert 'const long' not in str(e)

    # Only two access macros necessary, namely `uL0` and `r1L0` (the other five
    # obviously are _POSIX_C_SOURCE, MIN, MAX, START_TIMER, STOP_TIMER)
    assert len(op._headers) == 7
Example #14
0
    def test_skewing_codegen(self, expr, expected, skewing, blockinner):
        """Tests code generation on skewed indices."""
        grid = Grid(shape=(3, 3, 3))
        x, y, z = grid.dimensions
        time = grid.time_dim

        # NOTE: `expr` is a string evaluated in this scope, so the local names
        # below (presumably referenced by `expr`) must not be renamed or removed
        u = TimeFunction(name='u', grid=grid)  # noqa
        v = TimeFunction(name='v', grid=grid)  # noqa
        eqn = eval(expr)

        opt = ('blocking', {'blocklevels': 0,
                            'skewing': skewing,
                            'blockinner': blockinner})
        op = Operator(eqn, opt=opt)
        op.apply(time_M=5)

        # The loop nest must be `time -> x -> y -> z`
        iters = FindNodes(Iteration).visit(op)
        assert len(iters) == 4
        for it, dim in zip(iters, (time, x, y, z)):
            assert it.dim is dim

        skewed = [i.expr for i in FindNodes(Expression).visit(op)]

        # For each of the x, y, z Iterations, whether its bounds are expected to
        # be shifted by `time`. Nothing is checked without skewing but with
        # `blockinner`
        if skewing:
            shifted = (True, True, bool(blockinner))
        elif not blockinner:
            shifted = (False, False, False)
        else:
            shifted = ()

        for it, is_shifted in zip(iters[1:], shifted):
            lower = it.dim.symbolic_min
            upper = it.dim.symbolic_max
            if is_shifted:
                lower = lower + time
                upper = upper + time
            assert it.symbolic_min == lower
            assert it.symbolic_max == upper

        assert str(skewed[0]).replace(' ', '') == expected
 def _make_atomic_prodders(self, partree):
     """
     Replace each Prodder in `partree` with a SingleThreadProdder built from it.
     """
     mapper = {}
     for prodder in FindNodes(Prodder).visit(partree):
         mapper[prodder] = SingleThreadProdder(prodder)
     return Transformer(mapper).visit(partree)
Example #16
0
    def place_definitions(self, iet, **kwargs):
        """
        Create a new IET with symbols allocated/deallocated in some memory space.

        Parameters
        ----------
        iet : Callable
            The input Iteration/Expression tree.
        **kwargs
            The optional ``efuncs`` entry is an iterable of IETs whose Expressions
            are scanned for written/read Functions to be mapped onto the high
            bandwidth memory. It is only consulted when ``iet`` is not an
            elemental function.

        Returns
        -------
        tuple
            The rebuilt IET and an empty dict.
        """
        storage = Storage()

        # Collect and declare symbols
        for k, v in MapExprStmts().visit(iet).items():
            if k.is_Expression:
                if k.is_definition:
                    site = v[-1] if v else iet
                    self._alloc_scalar_on_low_lat_mem(site, k, storage)
                    continue
                objs = [k.write]
            elif k.is_Call:
                objs = k.arguments
            else:
                # Neither an Expression nor a Call: nothing to allocate. Without
                # this, `objs` would be unbound or stale from a previous statement
                continue

            for i in objs:
                try:
                    if i.is_LocalObject:
                        site = v[-1] if v else iet
                        self._alloc_object_on_low_lat_mem(site, i, storage)
                    elif i.is_Array:
                        if i in iet.parameters:
                            # The Array is passed as a Callable argument
                            continue
                        elif i._mem_stack:
                            self._alloc_array_on_low_lat_mem(iet, i, storage)
                        else:
                            self._alloc_array_on_high_bw_mem(i, storage)
                except AttributeError:
                    # E.g., a generic SymPy expression
                    pass

        # Place symbols in a memory space
        if not iet.is_ElementalFunction:
            writes = set()
            reads = set()
            for efunc in kwargs.get('efuncs', []):
                for i in FindNodes(Expression).visit(efunc):
                    if i.write.is_Function:
                        writes.add(i.write)
                    # A Function that is written anywhere is never treated as
                    # read-only, hence the subtraction at each step
                    reads = (reads | {r for r in i.reads if r.is_Function}) - writes

            for i in filter_sorted(writes):
                self._map_function_on_high_bw_mem(i, storage)
            for i in filter_sorted(reads):
                self._map_function_on_high_bw_mem(i, storage, read_only=True)

        # Introduce symbol definitions going in the low latency memory
        mapper = dict(storage._on_low_lat_mem)
        iet = Transformer(mapper, nested=True).visit(iet)

        # Introduce symbol definitions going in the high bandwidth memory
        header = []
        footer = []
        for decl, alloc, free in storage._on_high_bw_mem:
            if decl is None:
                header.append(alloc)
            else:
                header.extend([decl, alloc])
            footer.append(free)
        if header or footer:
            # Allocations go before the original body, deallocations after it
            body = List(header=header, body=iet.body, footer=footer)
            iet = iet._rebuild(body=body)

        return iet, {}
Example #17
0
 def _make_threaded_prodders(self, partree):
     """
     Replace each Prodder in `partree` with the output of `self.Prodder`.
     """
     mapper = {}
     for prodder in FindNodes(Prodder).visit(partree):
         mapper[prodder] = self.Prodder(prodder)
     return Transformer(mapper).visit(partree)
Example #18
0
def optimize_unfolded_tree(unfolded, root):
    """
    Transform folded trees to reduce the memory footprint.

    Parameters
    ----------
    unfolded : iterable
        The folded trees; each tree is a sequence of Iterations as long as
        ``root``.
    root : sequence
        The Iterations of the root tree, which reads the temporaries computed
        by the folded trees.

    Returns
    -------
    list
        The processed folded trees followed by the (possibly transformed) root.

    Examples
    --------
    Given:

        .. code-block::
            for i = 1 to N - 1  # Folded tree
              for j = 1 to N - 1
                tmp[i,j] = ...
            for i = 2 to N - 2  # Root
              for j = 2 to N - 2
                ... = ... tmp[i,j] ...

    The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the
    iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the
    ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting
    iteration tree becomes:

        .. code-block::
            for i = 1 to i_bs + 1  # Folded tree
              for j = 1 to j_bs + 1
                i' = i + i_block - 2
                j' = j + j_block - 2
                tmp[i,j] = ... # use i' and j'
            for i = i_block to i_block + i_bs  # Root
              for j = j_block to j_block + j_bs
                i' = i - i_block
                j' = j - j_block
                ... = ... tmp[i',j'] ...
    """
    processed = []
    for i, tree in enumerate(unfolded):
        assert len(tree) == len(root)

        # We can optimize the folded trees only iff:
        # test0 := they compute temporary arrays, but not if they compute input data
        # test1 := the outer Iterations have actually been blocked
        # NOTE: the flags below are the *negation* of these conditions, that is
        # either of them being True means the tree must be left as is
        exprs = FindNodes(Expression).visit(tree)
        writes = [j.write for j in exprs if j.is_tensor]
        test0 = not all(j.is_Array for j in writes)
        test1 = any(not isinstance(j.limits[0], BlockDimension) for j in root)
        if test0 or test1:
            # No optimization: just compose the Iterations back into nested trees
            processed.append(compose_nodes(tree))
            root = compose_nodes(root)
            continue

        # Shrink the iteration space
        modified_tree = []
        modified_root = []
        modified_dims = {}
        mapper = {}
        for t, r in zip(tree, root):
            # The folded Iteration now starts from 0; the additional index `udim0`
            # starts from the original lower bound
            udim0 = IncrDimension(t.dim, t.symbolic_min, 1, "%ss%d" % (t.index, i))
            modified_tree.append(t._rebuild(limits=(0, t.limits[1] - t.limits[0], t.step),
                                            uindices=t.uindices + (udim0,)))

            mapper[t.dim] = udim0

            # The root Iteration keeps its bounds and gets an additional index
            # `udim1` starting from 0
            udim1 = IncrDimension(t.dim, 0, 1, "%ss%d" % (t.index, i))
            modified_root.append(r._rebuild(uindices=r.uindices + (udim1,)))

            # Track which BlockDimension replaces each root Dimension
            d = r.limits[0]
            assert isinstance(d, BlockDimension)
            modified_dims[d.root] = d

        # Temporary arrays can now be moved onto the stack
        for w in writes:
            dims = tuple(modified_dims.get(d, d) for d in w.dimensions)
            shape = tuple(d.symbolic_size for d in dims)
            w.update(shape=shape, dimensions=dims, scope='stack')

        # Substitute iteration variables within the folded trees
        modified_tree = compose_nodes(modified_tree)
        replaced = xreplace_indices([j.expr for j in exprs], mapper,
                                    lambda i: i.function not in writes, True)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        processed.append(Transformer(dict(zip(exprs, subs))).visit(modified_tree))

        # Introduce the new iteration variables within /root/
        # NOTE: `root` is rebound here, so the next folded tree (if any) is
        # processed against the transformed root
        modified_root = compose_nodes(modified_root)
        exprs = FindNodes(Expression).visit(modified_root)
        candidates = [as_symbol(j.output) for j in subs]
        replaced = xreplace_indices([j.expr for j in exprs], mapper, candidates)
        subs = [j._rebuild(expr=k) for j, k in zip(exprs, replaced)]
        root = Transformer(dict(zip(exprs, subs))).visit(modified_root)

    return processed + [root]
Example #19
0
def linearize_accesses(iet, cache, sregistry):
    """
    Turn Indexeds into FIndexeds and create the necessary access Macros.

    Parameters
    ----------
    iet : Callable
        The input Iteration/Expression tree.
    cache : mapper
        Maps functions to records carrying the `stmts0`, `stmts1` and `cbk`
        fields, which are updated in place. Presumably a defaultdict-like
        object shared across IETs, since records are accessed for functions that
        fail the `f not in cache` test -- TODO confirm.
    sregistry : object
        Used, via `make_name`, to generate the names of the new symbols and
        macros.

    Returns
    -------
    tuple
        The transformed IET and the list of `(define, expr)` string pairs
        describing the access macros built by this call.
    """
    # Find all objects amenable to linearization
    symbol_names = {i.name for i in FindSymbols('indexeds').visit(iet)}
    functions = [f for f in FindSymbols().visit(iet)
                 if ((f.is_DiscreteFunction or f.is_Array) and
                     f.ndim > 1 and
                     f.name in symbol_names)]
    functions = sorted(functions, key=lambda f: len(f.dimensions), reverse=True)

    # Find unique sizes (unique -> minimize necessary registers)
    mapper = DefaultOrderedDict(list)
    for f in functions:
        if f not in cache:
            # NOTE: the outermost dimension is unnecessary
            for d in f.dimensions[1:]:
                # TODO: same grid + same halo => same padding, however this is
                # never asserted throughout the compiler yet... maybe should do
                # it when in debug mode at `prepare_arguments` time, ie right
                # before jumping to C?
                mapper[(d, f._size_halo[d], getattr(f, 'grid', None))].append(f)

    # Build all exprs such as `x_fsz0 = u_vec->size[1]`
    # One expr per unique (dimension, halo, grid) key, shared by all functions
    # mapped to that key
    imapper = DefaultOrderedDict(list)
    for (d, halo, _), v in mapper.items():
        name = sregistry.make_name(prefix='%s_fsz' % d.name)
        s = Symbol(name=name, dtype=np.int32, is_const=True)
        try:
            expr = DummyExpr(s, v[0]._C_get_field(FULL, d).size, init=True)
        except AttributeError:
            # Fallback for Arrays: use the symbolic shape along `d`
            assert v[0].is_Array
            expr = DummyExpr(s, v[0].symbolic_shape[d], init=True)
        for f in v:
            imapper[f].append((d, s))
            cache[f].stmts0.append(expr)

    # Build all exprs such as `y_slc0 = y_fsz0*z_fsz0`
    # `built` lets functions with the same size product share a single expr
    built = {}
    mapper = DefaultOrderedDict(list)
    for f, v in imapper.items():
        for n, (d, _) in enumerate(v):
            # Product of the size symbols from the n-th entry onwards
            expr = prod(list(zip(*v[n:]))[1])
            try:
                stmt = built[expr]
            except KeyError:
                name = sregistry.make_name(prefix='%s_slc' % d.name)
                s = Symbol(name=name, dtype=np.int32, is_const=True)
                stmt = built[expr] = DummyExpr(s, expr, init=True)
            mapper[f].append(stmt.write)
            cache[f].stmts1.append(stmt)
    # Functions without any new slice symbol still need an entry for the
    # macro-building step below
    mapper.update([(f, []) for f in functions if f not in mapper])

    # Build defines. For example:
    # `define uL(t, x, y, z) u[(t)*t_slice_sz + (x)*x_slice_sz + (y)*y_slice_sz + (z)]`
    headers = []
    findexeds = {}
    for f, szs in mapper.items():
        if cache[f].cbk is not None:
            # Perhaps we've already built an access macro for `f` through another efunc
            findexeds[f] = cache[f].cbk
        else:
            assert len(szs) == len(f.dimensions) - 1
            pname = sregistry.make_name(prefix='%sL' % f.name)

            expr = sum([MacroArgument(d.name)*s for d, s in zip(f.dimensions, szs)])
            expr += MacroArgument(f.dimensions[-1].name)
            expr = Indexed(IndexedData(f.name, None, f), expr)
            define = DefFunction(pname, f.dimensions)
            headers.append((ccode(define), ccode(expr)))

            # `pname` is bound as a default argument so that each callback keeps
            # its own macro name regardless of later loop iterations
            cache[f].cbk = findexeds[f] = lambda i, pname=pname: FIndexed(i, pname)

    # Build "functional" Indexeds. For example:
    # `u[t2, x+8, y+9, z+7] => uL(t2, x+8, y+9, z+7)`
    mapper = {}
    for n in FindNodes(Expression).visit(iet):
        subs = {}
        for i in retrieve_indexed(n.expr):
            try:
                subs[i] = findexeds[i.function](i)
            except KeyError:
                # No access macro for this function, leave the Indexed untouched
                pass
        mapper[n] = n._rebuild(expr=uxreplace(n.expr, subs))

    # Put together all of the necessary exprs for `y_fsz0`, ..., `y_slc0`, ...
    stmts0 = filter_ordered(flatten(cache[f].stmts0 for f in functions))
    if stmts0:
        stmts0.append(BlankLine)
    stmts1 = filter_ordered(flatten(cache[f].stmts1 for f in functions))
    if stmts1:
        stmts1.append(BlankLine)

    # Replace the Expressions, then prepend the size and slice exprs to the body
    iet = Transformer(mapper).visit(iet)
    body = iet.body._rebuild(body=tuple(stmts0) + tuple(stmts1) + iet.body.body)
    iet = iet._rebuild(body=body)

    return iet, headers
Example #20
0
def linearize_accesses(iet, key, cache, sregistry):
    """
    Turn Indexeds into FIndexeds and create the necessary access Macros.

    Parameters
    ----------
    iet
        The tree to process. It is searched for symbols, Indexeds and Calls,
        and its `body` is rebuilt when linearization statements are placed.
    key : callable
        Predicate selecting the Functions to linearize; only Functions with
        `ndim > 1` are considered.
    cache
        Mapping from Function to an entry exposing `stmts0` (size
        expressions), `stmts1` (stride expressions) and `cbk` (the callback
        turning an Indexed into its linearized form). It is expected to
        create entries on first access; it carries state across the
        processing of different IETs.
    sregistry
        Registry used to generate unique symbol names.

    Returns
    -------
    iet
        The tree with linearized accesses and, for the Functions it defines,
        the size and stride expressions prepended to its body.
    headers : list
        The access macros generated during this call.
    args : list
        The stride symbols of the Functions accessed, but not defined, in
        `iet`; placing the corresponding expressions is delegated to an
        ancestor efunc.
    """
    # `functions` are all Functions that `iet` may need to linearize
    functions = [f for f in FindSymbols().visit(iet) if key(f) and f.ndim > 1]
    functions = sorted(functions, key=lambda f: len(f.dimensions), reverse=True)

    # `functions_unseen` are all Functions that `iet` may need to linearize
    # and have not been seen while processing other IETs
    functions_unseen = [f for f in functions if f not in cache]

    # Find unique sizes (unique -> minimize necessary registers)
    mapper = DefaultOrderedDict(list)
    for f in functions:
        # NOTE: the outermost dimension is unnecessary
        for d in f.dimensions[1:]:
            # TODO: same grid + same halo => same padding, however this is
            # never asserted throughout the compiler yet... maybe should do
            # it when in debug mode at `prepare_arguments` time, ie right
            # before jumping to C?
            mapper[(d, f._size_halo[d], getattr(f, 'grid', None))].append(f)

    # For all unseen Functions, build the size exprs. For example:
    # `x_fsz0 = u_vec->size[1]`
    imapper = DefaultOrderedDict(dict)
    for (d, halo, _), v in mapper.items():
        v_unseen = [f for f in v if f in functions_unseen]
        if not v_unseen:
            continue
        # One size expression is shared by all Functions in the same group
        expr = _generate_fsz(v_unseen[0], d, sregistry)
        if expr:
            for f in v_unseen:
                imapper[f][d] = expr.write
                cache[f].stmts0.append(expr)

    # For all unseen Functions, build the stride exprs. For example:
    # `y_stride0 = y_fsz0*z_fsz0`
    built = {}
    mapper = DefaultOrderedDict(dict)
    for f, v in imapper.items():
        for d in v:
            n = f.dimensions.index(d)
            expr = prod(v[i] for i in f.dimensions[n:])
            try:
                # Reuse the stride symbol if the same product was already built
                stmt = built[expr]
            except KeyError:
                name = sregistry.make_name(prefix='%s_stride' % d.name)
                s = Symbol(name=name, dtype=np.int64, is_const=True)
                stmt = built[expr] = DummyExpr(s, expr, init=True)
            mapper[f][d] = stmt.write
            cache[f].stmts1.append(stmt)
    mapper.update([(f, {}) for f in functions_unseen if f not in mapper])

    # For all unseen Functions, build defines. For example:
    # `#define uL(t, x, y, z) u[(t)*t_stride0 + (x)*x_stride0 + (y)*y_stride0 + (z)]`
    headers = []
    findexeds = {}
    for f in functions:
        if cache[f].cbk is None:
            header, cbk = _generate_macro(f, mapper[f], sregistry)
            headers.append(header)
            cache[f].cbk = findexeds[f] = cbk
        else:
            # The access macro was already built while processing another IET
            findexeds[f] = cache[f].cbk

    # Build "functional" Indexeds. For example:
    # `u[t2, x+8, y+9, z+7] => uL(t2, x+8, y+9, z+7)`
    mapper = {}
    indexeds = FindSymbols('indexeds').visit(iet)
    for i in indexeds:
        try:
            mapper[i] = findexeds[i.function](i)
        except KeyError:
            # Not a Function to be linearized, leave the Indexed untouched
            pass

    # Introduce the linearized expressions
    iet = Uxreplace(mapper).visit(iet)

    # All Functions that actually require linearization in `iet`
    candidates = []

    candidates.extend(filter_ordered(i.function for i in indexeds))

    calls = FindNodes(Call).visit(iet)
    cfuncs = filter_ordered(flatten(i.functions for i in calls))
    candidates.extend(i for i in cfuncs if i.function.is_DiscreteFunction)

    # All Functions that can be linearized in `iet`
    defines = FindSymbols('defines-aliases').visit(iet)

    # Place the linearization expressions or delegate to ancestor efunc
    stmts0 = []
    stmts1 = []
    args = []
    for f in candidates:
        if f in defines:
            stmts0.extend(cache[f].stmts0)
            stmts1.extend(cache[f].stmts1)
        else:
            args.extend([e.write for e in cache[f].stmts1])
    if stmts0:
        assert len(stmts1) > 0
        stmts0 = filter_ordered(stmts0) + [BlankLine]
        stmts1 = filter_ordered(stmts1) + [BlankLine]
        body = iet.body._rebuild(body=tuple(stmts0) + tuple(stmts1) + iet.body.body)
        iet = iet._rebuild(body=body)
    else:
        # Sizes and strides are generated together, so without size exprs
        # there must be no stride exprs either (they would be silently lost)
        assert len(stmts1) == 0

    return iet, headers, args
Example #21
0
    def make_simd(self, iet):
        """
        Attach a SIMD pragma and the VECTORIZED property to the innermost
        fully-parallel Iteration of each Iteration tree in `iet`, provided
        that at least one outer level of parallelism is available.

        Returns the transformed `iet` and an empty dict.
        """
        vectorized = {}
        for tree in retrieve_iteration_tree(iet):
            relaxed = [it for it in tree if it.is_ParallelRelaxed]

            # Vectorize the innermost PARALLEL Iteration only if there is at
            # least one outer level of parallelism on top of it
            if len(relaxed) < 2:
                continue
            *outer, innermost = relaxed

            # ParallelRelaxed might not be enough -- only fully-parallel
            # Iterations get SIMD-ized
            if not innermost.is_Parallel:
                continue

            # Discard an Iteration that shows up as the vectorizable candidate
            # in tree A while having less priority than a candidate in another
            # tree B.
            #
            # Example:
            #
            # for (i = ... ) (End of tree A - i is the candidate for tree A)
            #   Expr1
            #   for (j = ...) (End of tree B - j is the candidate for tree B)
            #     Expr2
            #     ...
            perfect = IsPerfectIteration(depth=outer[-1]).visit(innermost)
            if not perfect:
                continue

            # With an array reduction, the backend compiler must actually
            # support it. For example, it may be possible to
            #
            # #pragma parallel reduction(a[...])
            # for (i = ...)
            #   #pragma simd
            #   for (j = ...)
            #     a[j] += ...
            #
            # While the following could be unsupported
            #
            # #pragma parallel  // compiler doesn't support array reduction
            # for (i = ...)
            #   #pragma simd
            #   for (j = ...)
            #     #pragma atomic  // cannot nest simd and atomic
            #     a[j] += ...
            if any(it.is_ParallelAtomic for it in outer) and \
               not self._support_array_reduction(self.compiler):
                increments = [e.output for e in FindNodes(Expression).visit(innermost)
                              if e.is_Increment]
                if any(out.is_Indexed for out in increments):
                    continue

            # Build the SIMD pragma, listing the accessed arrays when there
            # are DiscreteFunctions among them
            names = {i.name for i in FindSymbols('indexeds').visit(innermost)
                     if i.function.is_DiscreteFunction}
            if names:
                make_pragma = self.lang['simd-for-aligned']
                pragma = make_pragma(','.join(sorted(names)), self.simd_reg_size)
            else:
                pragma = self.lang['simd-for']

            # Rebuild the Iteration with the extra pragma and VECTORIZED property
            vectorized[innermost] = innermost._rebuild(
                pragmas=innermost.pragmas + as_tuple(pragma),
                properties=[*innermost.properties, VECTORIZED]
            )

        iet = Transformer(vectorized).visit(iet)

        return iet, {}