Exemple #1
0
def test_two_homogeneous_buffers():
    """
    Buffer two saved TimeFunctions that are updated through the very same
    right-hand side, and check that the buffered Operator reproduces the
    output of the unoptimized one.
    """
    nt = 10
    grid = Grid(shape=(4, 4))

    # `u1`/`v1` are same-named twins of `u`/`v`, supplied as runtime overrides
    u = TimeFunction(name='u', grid=grid, save=nt)
    u1 = TimeFunction(name='u', grid=grid, save=nt)
    v = TimeFunction(name='v', grid=grid, save=nt)
    v1 = TimeFunction(name='v', grid=grid, save=nt)

    rhs = u + v + u.backward + v.backward + 1.
    eqns = [Eq(target.forward, rhs) for target in (u, v)]

    reference = Operator(eqns, opt='noop')
    buffered = Operator(eqns, opt='buffering')
    fused = Operator(eqns, opt=('buffering', 'fuse'))

    # Check generated code
    for op in (buffered, fused):
        assert len(retrieve_iteration_tree(op)) == 2
    arrays = [s for s in FindSymbols().visit(buffered) if s.is_Array]
    assert len(arrays) == 2

    last = nt - 2
    reference.apply(time_M=last)
    buffered.apply(time_M=last, u=u1, v=v1)

    # Check numerical output
    for expected, actual in ((u, u1), (v, v1)):
        assert np.all(expected.data == actual.data)
Exemple #2
0
    def test_from_different_nests(self):
        """
        Two sets of equations, A and B, are linked by a flow dependence and
        both give rise to aliases. Check that the aliases are scheduled within
        A's and B's loop nests respectively.
        """
        grid = Grid(shape=(3, 3, 3))
        x, y, z = grid.dimensions  # noqa
        t = grid.stepping_dim
        i = Dimension(name='i')

        f = Function(name='f', grid=grid)
        f.data_with_halo[:] = 1.
        g = Function(name='g', shape=(3, ), dimensions=(i, ))
        g.data[:] = 2.
        u = TimeFunction(name='u', grid=grid, space_order=3)
        v = TimeFunction(name='v', grid=grid, space_order=3)

        def diagonal_stencil(func, weight):
            # Leads to 3D aliases
            near = func[t, x, y, z] + func[t, x + 1, y + 1, z + 1]
            far = func[t, x + 2, y + 2, z + 2] + func[t, x + 3, y + 3, z + 3]
            return near * 3 * weight + far * 3 * weight + 1

        eqns = [
            Eq(u.forward, diagonal_stencil(u, f)),
            Inc(u[t + 1, i, i, i], g + 1),
            Eq(v.forward, diagonal_stencil(v, u.forward)),
        ]
        op0 = Operator(eqns, dse='noop', dle=('noop', {'openmp': True}))
        op1 = Operator(eqns, dse='aggressive',
                       dle=('advanced', {'openmp': True}))

        # Check code generation: in each of the two functions, the first tree
        # writes to an Array and the second one to the actual TimeFunction
        funcs = op1._func_table
        assert 'bf0' in funcs
        assert 'bf1' in funcs
        for name, target in (('bf0', u), ('bf1', v)):
            trees = retrieve_iteration_tree(funcs[name].root)
            assert len(trees) == 2
            assert trees[0][-1].nodes[0].body[0].write.is_Array
            assert trees[1][-1].nodes[0].body[0].write is target

        # Check numerical output
        op0(time_M=1)
        expected = np.copy(u.data[:])
        u.data_with_halo[:] = 0.
        op1(time_M=1)
        assert np.all(u.data == expected)
Exemple #3
0
def test_over_injection():
    """
    Buffer a saved TimeFunction that is also the target of a sparse injection
    and the operand of a sparse interpolation.
    """
    nt = 10
    grid = Grid(shape=(4, 4))

    src = SparseTimeFunction(name='src', grid=grid, npoint=1, nt=nt)
    rec = SparseTimeFunction(name='rec', grid=grid, npoint=1, nt=nt)
    u_kwargs = dict(name="u", grid=grid, time_order=2, space_order=2, save=nt)
    u = TimeFunction(**u_kwargs)
    u1 = TimeFunction(**u_kwargs)

    src.data[:] = 1.

    update = [Eq(u.forward, u + 1)]
    injection = src.inject(field=u.forward, expr=src)
    interpolation = rec.interpolate(expr=u.forward)
    eqns = update + injection + interpolation

    op0 = Operator(eqns, opt='noop')
    op1 = Operator(eqns, opt='buffering')

    # Check generated code; one more tree is expected if the language isn't C
    expected_ntrees = 5 + bool(configuration['language'] != 'C')
    assert len(retrieve_iteration_tree(op1)) == expected_ntrees
    arrays = [s for s in FindSymbols().visit(op1) if s.is_Array]
    assert len(arrays) == 1

    last = nt - 2
    op0.apply(time_M=last)
    op1.apply(time_M=last, u=u1)

    assert np.all(u.data == u1.data)
Exemple #4
0
def test_read_only_backwards_unstructured():
    """
    Instead of the classic `time-1`, `time`, and `time+1`, here we access the
    buffered Function via `time-2`, `time-1` and `time+2`.
    """
    nt = 10
    grid = Grid(shape=(2, 2))

    u = TimeFunction(name='u', grid=grid, save=nt)
    v = TimeFunction(name='v', grid=grid)
    v1 = TimeFunction(name='v', grid=grid)

    # The n-th saved snapshot of `u` is filled with the value `n`
    for step in range(nt):
        u.data[step, :] = step

    u_m2 = u.backward.backward
    u_m1 = u.backward
    u_p2 = u.forward.forward
    eqns = [Eq(v.backward, v + u_m2 + u_m1 + u_p2 + 1.)]

    op0 = Operator(eqns, opt='noop')
    op1 = Operator(eqns, opt='buffering')

    # Check generated code
    ntrees = len(retrieve_iteration_tree(op1))
    assert ntrees == 2
    arrays = [s for s in FindSymbols().visit(op1) if s.is_Array]
    assert len(arrays) == 1

    op0.apply(time_m=2)
    op1.apply(time_m=2, v=v1)

    assert np.all(v.data == v1.data)
Exemple #5
0
def test_read_only():
    """
    Buffer a saved TimeFunction that is only read (at `time-1`, `time` and
    `time+1`), and compare against the unoptimized Operator.
    """
    nt = 10
    grid = Grid(shape=(2, 2))

    u = TimeFunction(name='u', grid=grid, save=nt)
    v = TimeFunction(name='v', grid=grid)
    v1 = TimeFunction(name='v', grid=grid)

    # The n-th saved snapshot of `u` is filled with the value `n`
    for step in range(nt):
        u.data[step, :] = step

    rhs = v + u.backward + u + u.forward + 1.
    eqns = [Eq(v.forward, rhs)]

    op0 = Operator(eqns, opt='noop')
    op1 = Operator(eqns, opt='buffering')

    # Check generated code
    ntrees = len(retrieve_iteration_tree(op1))
    assert ntrees == 2
    arrays = [s for s in FindSymbols().visit(op1) if s.is_Array]
    assert len(arrays) == 1

    last = nt - 2
    op0.apply(time_M=last)
    op1.apply(time_M=last, v=v1)

    assert np.all(v.data == v1.data)
Exemple #6
0
    def _make_guard(self, parregion, *args):
        """
        Protect `parregion` with a runtime Conditional when it contains a
        ParallelTree rooted at a `self.DeviceIteration`; otherwise defer to
        the parent class' implementation.

        Returns either `parregion` itself (no guard needed) or a List
        wrapping a Conditional around it.
        """
        partrees = FindNodes(ParallelTree).visit(parregion)
        has_device_tree = any(isinstance(pt.root, self.DeviceIteration)
                              for pt in partrees)
        if not has_device_tree:
            return super()._make_guard(parregion, *args)

        guards = []

        # There must be at least one iteration or potential crash
        if not parregion.is_Affine:
            first_tree = retrieve_iteration_tree(parregion.root)[0]
            collapsed = first_tree[:parregion.ncollapsed]
            guards.extend(it.symbolic_size > 0 for it in collapsed)

        # SparseFunctions may occasionally degenerate to zero-size arrays. In such
        # a case, a copy-in produces a `nil` pointer on the device. To fire up a
        # parallel loop we must ensure none of the SparseFunction pointers are `nil`
        for f in FindSymbols().visit(parregion):
            if f.is_SparseFunction:
                nelems = prod(f._C_get_field(FULL, d).size for d in f.dimensions)
                guards.append(nelems > 0)

        # Drop dynamically evaluated conditions (e.g. because the `symbolic_size`
        # is an integer value rather than a symbol). This avoids ugly and
        # unnecessary conditionals such as `if (true) { ...}`
        guards = [g for g in guards if g != true]

        if not guards:
            return parregion

        # Combine all surviving guards into a single condition
        return List(body=[Conditional(And(*guards), parregion)])
Exemple #7
0
def test_conddim_backwards():
    """
    Buffer a TimeFunction saved along a ConditionalDimension while the
    equation writes to `v.backward`.
    """
    nt = 10
    grid = Grid(shape=(4, 4))
    time_dim = grid.time_dim
    x, y = grid.dimensions  # noqa

    factor = Constant(name='factor', value=2, dtype=np.int32)
    time_sub = ConditionalDimension(name="time_sub", parent=time_dim, factor=factor)

    u = TimeFunction(name='u', grid=grid, time_order=0, save=nt, time_dim=time_sub)
    v = TimeFunction(name='v', grid=grid)
    v1 = TimeFunction(name='v', grid=grid)

    # The n-th saved snapshot of `u` is filled with the value `n`
    for step in range(u.save):
        u.data[step, :] = step

    rhs = v.backward + v + u + 1.
    eqns = [Eq(v.backward, rhs)]

    op0 = Operator(eqns, opt='noop')
    op1 = Operator(eqns, opt='buffering')

    # Check generated code
    ntrees = len(retrieve_iteration_tree(op1))
    assert ntrees == 3
    arrays = [s for s in FindSymbols().visit(op1) if s.is_Array]
    assert len(arrays) == 1

    bounds = dict(time_m=1, time_M=9)
    op0.apply(**bounds)
    op1.apply(v=v1, **bounds)

    assert np.all(v.data == v1.data)
Exemple #8
0
    def test_unread_buffered_function(self):
        """
        The saved TimeFunction `u` only appears on the left-hand side of the
        equations; check that it still gets exactly one buffer and that the
        numerical output is unaffected.
        """
        nt = 10
        grid = Grid(shape=(4, 4))
        time = grid.time_dim

        u = TimeFunction(name='u', grid=grid, save=nt)
        u1 = TimeFunction(name='u', grid=grid, save=nt)
        v = TimeFunction(name='v', grid=grid)
        v1 = TimeFunction(name='v', grid=grid)

        advance_v = Eq(v.forward, v + 1, implicit_dims=time)
        save_v = Eq(u, v)
        eqns = [advance_v, save_v]

        op0 = Operator(eqns, opt='noop')
        op1 = Operator(eqns, opt='buffering')

        # Check generated code
        ntrees = len(retrieve_iteration_tree(op1))
        assert ntrees == 1
        arrays = [s for s in FindSymbols().visit(op1) if s.is_Array]
        assert len(arrays) == 1

        last = nt - 2
        op0.apply(time_M=last)
        op1.apply(time_M=last, u=u1, v=v1)

        for expected, actual in ((u, u1), (v, v1)):
            assert np.all(expected.data == actual.data)
Exemple #9
0
    def test_subdimensions(self):
        """
        Buffer a saved TimeFunction whose update is restricted, along each
        space Dimension, to a middle SubDimension of thickness 2 on both sides.
        """
        nt = 10
        grid = Grid(shape=(10, 10, 10))
        x, y, z = grid.dimensions

        # Map each space Dimension to its middle SubDimension
        subs = {}
        for parent, name in ((x, 'xi'), (y, 'yi'), (z, 'zi')):
            subs[parent] = SubDimension.middle(name=name, parent=parent,
                                               thickness_left=2,
                                               thickness_right=2)

        u = TimeFunction(name='u', grid=grid, save=nt)
        u1 = TimeFunction(name='u', grid=grid, save=nt)

        eqn = Eq(u.forward, u + 1).xreplace(subs)

        op0 = Operator(eqn, opt='noop')
        op1 = Operator(eqn, opt='buffering')

        # Check generated code
        ntrees = len(retrieve_iteration_tree(op1))
        assert ntrees == 2
        arrays = [s for s in FindSymbols().visit(op1) if s.is_Array]
        assert len(arrays) == 1

        last = nt - 2
        op0.apply(time_M=last)
        op1.apply(time_M=last, u=u1)

        assert np.all(u.data == u1.data)
Exemple #10
0
    def test_async_degree(self, async_degree):
        """
        Check that the `buf-async-degree` option determines the size of the
        buffer along its first dimension, without affecting the output.
        """
        nt = 10
        grid = Grid(shape=(4, 4))

        u = TimeFunction(name='u', grid=grid, save=nt)
        u1 = TimeFunction(name='u', grid=grid, save=nt)

        eqn = Eq(u.forward, u + 1)

        options = {'buf-async-degree': async_degree}
        op0 = Operator(eqn, opt='noop')
        op1 = Operator(eqn, opt=('buffering', options))

        # Check generated code
        ntrees = len(retrieve_iteration_tree(op1))
        assert ntrees == 2
        arrays = [s for s in FindSymbols().visit(op1) if s.is_Array]
        assert len(arrays) == 1
        assert arrays[0].symbolic_shape[0] == async_degree

        last = nt - 2
        op0.apply(time_M=last)
        op1.apply(time_M=last, u=u1)

        assert np.all(u.data == u1.data)
Exemple #11
0
    def test_hoisting_if_coupled(self):
        """
        Test that coupled aliases are successfully hoisted out of the time loop.
        """
        grid = Grid((10, 10))

        a = Function(name="a", grid=grid, space_order=4)
        b = Function(name="b", grid=grid, space_order=4)

        e = TimeFunction(name="e", grid=grid, space_order=4)
        f = TimeFunction(name="f", grid=grid, space_order=4)

        # Two time-invariant sub-expressions, the second built upon the first
        root_term = sqrt(1. + 1. / a)
        coupled_term = 1 / (8. * root_term - 8. / b)
        eqns = [
            Eq(e.forward, e + 1),
            Eq(f.forward, f * root_term - f * coupled_term + e.forward.dx)
        ]

        op = Operator(eqns)

        trees = retrieve_iteration_tree(op)
        assert len(trees) == 3

        # The first tree is expected to host exactly two Arrays
        hoisted = [s for s in FindSymbols().visit(trees[0].root) if s.is_Array]
        assert len(hoisted) == 2
        for array in hoisted:
            assert array._mem_heap and not array._mem_external
Exemple #12
0
def relax_incr_dimensions(iet, **kwargs):
    """
    Adjust the bounds of blocked Iterations so that the "remainder regions"
    are included.

    Without this relaxation, the only way to iterate over the entire iteration
    space is to have step increments that are perfect divisors of the
    iteration space (e.g. with an iteration space of size 67 and block size 8,
    only 64 iterations would be computed, as `67 - 67mod8 = 64`).

    A simple 1D example: nested Iterations are transformed from:

    <Iteration x0_blk0; (x_m, x_M, x0_blk0_size)>
        <Iteration x; (x0_blk0, x0_blk0 + x0_blk0_size - 1, 1)>

    to:

    <Iteration x0_blk0; (x_m, x_M, x0_blk0_size)>
        <Iteration x; (x0_blk0, MIN(x_M, x0_blk0 + x0_blk0_size - 1), 1)>

    Returns the (possibly transformed) `iet` together with a dict whose
    'headers' entry carries the MIN/MAX macro definitions; the list is empty
    if no Iteration was rebuilt.
    """
    rebuilt = {}
    for tree in retrieve_iteration_tree(iet):
        blocked = [it for it in tree if it.dim.is_Block]

        # Skip trees without blocked Iterations, as well as trees whose first
        # blocked Iteration has been rebuilt already
        if not blocked or blocked[0] in rebuilt:
            continue

        assert all(it.direction is Forward for it in blocked)
        outer, inner = split(blocked, lambda it: not it.dim.parent.is_Block)

        # Get root's `symbolic_max` out of each outer Dimension
        roots_max = {it.dim.root: it.symbolic_max for it in outer}

        # Process inner iterations and adjust their bounds
        for it in inner:
            # The Iteration's maximum is the MIN of (a) the `symbolic_max` of
            # the current Iteration, e.g. `x0_blk0 + x0_blk0_size - 1`, and
            # (b) the `symbolic_max` of the current Iteration's root Dimension,
            # e.g. `x_M`. The generated maximum will then be
            # `MIN(x0_blk0 + x0_blk0_size - 1, x_M)`

            # In some corner cases an offset may be added (e.g. after CIRE
            # passes). E.g. assume `it.symbolic_max = x0_blk0 + x0_blk0_size + 1`
            # and `it.dim.symbolic_max = x0_blk0 + x0_blk0_size - 1`; then the
            # generated maximum will be `MIN(x0_blk0 + x0_blk0_size + 1, x_M + 2)`
            root_max = roots_max[it.dim.root] + it.symbolic_max - it.dim.symbolic_max
            relaxed_max = evalrel(min, [it.symbolic_max, root_max])
            rebuilt[it] = it._rebuild(limits=(it.symbolic_min, relaxed_max, it.step))

    if not rebuilt:
        return iet, {'headers': []}

    iet = Transformer(rebuilt, nested=True).visit(iet)

    # The relaxed bounds rely on the MIN/MAX macros
    headers = [('%s(a,b)' % MIN.name, ('(((a) < (b)) ? (a) : (b))')),
               ('%s(a,b)' % MAX.name, ('(((a) > (b)) ? (a) : (b))'))]

    return iet, {'headers': headers}
Exemple #13
0
    def test_minimize_remainders_due_to_autopadding(self):
        """
        Check that the bounds of the Iteration computing the DSE-captured aliasing
        expressions are relaxed (i.e., slightly larger) so that backend-compiler-generated
        remainder loops are avoided.
        """
        grid = Grid(shape=(3, 3, 3))
        x, y, z = grid.dimensions  # noqa
        t = grid.stepping_dim

        f = Function(name='f', grid=grid)
        f.data_with_halo[:] = 1.
        u = TimeFunction(name='u', grid=grid, space_order=3)
        u.data_with_halo[:] = 0.5

        # Leads to 3D aliases
        near = u[t, x, y, z] + u[t, x + 1, y + 1, z + 1]
        far = u[t, x + 2, y + 2, z + 2] + u[t, x + 3, y + 3, z + 3]
        eqn = Eq(u.forward, near * 3 * f + far * 3 * f + 1)

        dle = ('advanced', {'openmp': False})
        op0 = Operator(eqn, dse='noop', dle=('advanced', {'openmp': False}))
        op1 = Operator(eqn, dse='aggressive', dle=dle)

        x0_blk_size, y0_blk_size = op1.parameters[-2], op1.parameters[-1]
        z_size = op1.parameters[4]

        bf0_root = op1._func_table['bf0'].root

        # Check Array shape
        arrays = [s for s in FindSymbols().visit(bf0_root) if s.is_Array]
        assert len(arrays) == 1
        a = arrays[0]
        assert len(a.dimensions) == 3
        assert a.halo == ((1, 1), (1, 1), (1, 1))
        assert a.padding == ((0, 0), (0, 0), (0, 30))
        expected_shape = (x0_blk_size + 2, y0_blk_size + 2, z_size + 32)
        for n, expected in enumerate(expected_shape):
            assert Add(*a.symbolic_shape[n].args) == expected

        # Check loop bounds
        trees = retrieve_iteration_tree(bf0_root)
        assert len(trees) == 2
        rounded = trees[0].inner
        z_extent = z.symbolic_max - z.symbolic_min
        assert rounded.symbolic_max == z.symbolic_max + (z_extent + 3) % 16 + 1

        # Check numerical output
        op0(time_M=1)
        expected = np.copy(u.data[:])
        u.data_with_halo[:] = 0.5
        op1(time_M=1)
        assert np.all(u.data == expected)
Exemple #14
0
def fold_blockable_tree(iet, blockinner=True):
    """
    Create IterationFolds from sequences of nested Iterations.

    Adjacent Iterations over the same Dimension are grouped; for each group
    satisfying the foldability checks, the Iterations of the last tree are
    turned into IterationFolds that absorb the corresponding Iterations of
    the other trees, which are in turn dropped from the tree.

    Parameters
    ----------
    iet : Node
        The Iteration/Expression tree to be transformed.
    blockinner : bool, optional
        If False, the innermost Iteration is left out of the folding.
    """
    replacements = {}
    for _, sequence in FindAdjacent(Iteration).visit(iet).items():
        # Group based on Dimension; only runs of two or more Iterations count
        candidates = []
        for subsequence in sequence:
            for _, run in groupby(subsequence, lambda it: it.dim):
                run = list(run)
                if len(run) >= 2:
                    candidates.append(run)

        for group in candidates:
            # Pre-condition: they all must be perfect iterations
            if not all(IsPerfectIteration().visit(it) for it in group):
                continue

            # Only retain consecutive trees having same depth
            trees = [retrieve_iteration_tree(it)[0] for it in group]
            depth = len(trees[0])
            same_depth = []
            for tree in trees:
                if len(tree) != depth:
                    break
                same_depth.append(tree)
            trees = same_depth
            if not trees:
                continue

            # Check foldability, level by level; within each level, the
            # Iteration of the last tree comes first
            levels = list(zip(*reversed(trees)))
            if not all(is_foldable(level) for level in levels):
                continue

            # Maybe heuristically exclude innermost Iteration
            if blockinner is False:
                levels = levels[:-1]

            # Perhaps there's nothing to fold
            if not levels:
                continue

            # TODO: we do not currently support blocking if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(FindNodes(Expression).visit(tree.root)
                            for tree in trees[:-1])
            if any(expr.write.is_Input for expr in exprs):
                continue

            # Perform folding
            for level in levels:
                head, rest = level[0], level[1:]
                folds = []
                for it in rest:
                    shift = tuple(b - a for a, b in zip(it.offsets, head.offsets))
                    folds.append((shift, it.nodes))
                replacements[head] = IterationFold(folds=folds, **head.args)
                for it in rest:
                    replacements[it] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    return Transformer(replacements, nested=True).visit(iet)
Exemple #15
0
def test_conddim_backwards_unstructured():
    """
    Buffer a TimeFunction saved along a ConditionalDimension and accessed at
    `time_sub-2`, `time_sub-1` and `time_sub+2`, while the equation writes to
    `v.backward`.
    """
    nt = 10
    grid = Grid(shape=(4, 4))
    time_dim = grid.time_dim
    x, y = grid.dimensions

    factor = Constant(name='factor', value=2, dtype=np.int32)
    time_sub = ConditionalDimension(name="time_sub", parent=time_dim, factor=factor)

    u = TimeFunction(name='u', grid=grid, time_order=0, save=nt, time_dim=time_sub)
    v = TimeFunction(name='v', grid=grid)
    v1 = TimeFunction(name='v', grid=grid)

    # The n-th saved snapshot of `u` is filled with the value `n`
    for step in range(u.save):
        u.data[step, :] = step

    ub = u[time_sub - 1, x, y]
    ubb = u[time_sub - 2, x, y]
    uff = u[time_sub + 2, x, y]

    rhs = v.backward + v + ubb + ub + uff + 1.
    eqns = [Eq(v.backward, rhs)]

    op0 = Operator(eqns, opt='noop')
    op1 = Operator(eqns, opt='buffering')

    # Check generated code
    ntrees = len(retrieve_iteration_tree(op1))
    assert ntrees == 3
    arrays = [s for s in FindSymbols().visit(op1) if s.is_Array]
    assert len(arrays) == 1

    # Note 1: cannot use time_m<4 or time_M>14 or there would be OOB accesses
    # due to `ubb` and `uff`, which read two steps away from the current point,
    # while `u` has in total `nt=10` entries (so last one has index 9). In
    # particular, at `time_M=14` we will read from `uff = u[time/factor + 2] =
    # u[14/2+2] = u[9]`, which is the last available entry in `u`. Likewise,
    # at `time_m=4` we will read from `ubb = u[time/factor - 2] = u[4/2 - 2] =
    # u[0]`, which is clearly the last accessible entry in `u` while iterating
    # in the backward direction
    # Note 2: Given `factor=2`, we always write to `v` when `time % 2 == 0`, which
    # means that we always write to `v[t1] = v[(time+1)%2] = v[1]`, while `v[0]`
    # remains zero-valued. So the fact that the Eq is also reading from `v` is
    # only relevant to induce the backward iteration direction
    bounds = dict(time_m=4, time_M=14)
    op0.apply(**bounds)
    op1.apply(v=v1, **bounds)

    assert np.all(v.data == v1.data)
Exemple #16
0
def test_issue_1901():
    """
    With buffering enabled, the second iteration tree must be rooted at a
    sequential (hence non-parallel) Iteration over the time Dimension.
    """
    grid = Grid(shape=(2, 2))
    time = grid.time_dim
    x, y = grid.dimensions

    usave = TimeFunction(name='usave', grid=grid, save=10)
    v = TimeFunction(name='v', grid=grid)

    # Write `usave` into `v`, indexing `v` explicitly via `time`
    eq = [Eq(v[time, x, y], usave)]

    op = Operator(eq, opt='buffering')

    trees = retrieve_iteration_tree(op)
    assert len(trees) == 2

    time_loop = trees[1].root
    assert time_loop.dim is time
    assert not time_loop.is_Parallel
    assert time_loop.is_Sequential  # Obv
Exemple #17
0
def test_time_dependent_split(dse, dle):
    """
    The update of `v` reads space derivatives of `u.forward`, so two separate
    iteration trees are expected; the numerical output is checked as well.
    """
    grid = Grid(shape=(10, 10))
    u = TimeFunction(name='u', grid=grid, time_order=2, space_order=2, save=3)
    v = TimeFunction(name='v', grid=grid, time_order=2, space_order=0, save=3)

    # The second equation needs a full loop over x/y for u, then
    # a full one over x/y for v
    update_u = Eq(u.forward, 2 + grid.time_dim)
    update_v = Eq(v.forward, u.forward.dx + u.forward.dy + 1)
    op = Operator([update_u, update_v], dse=dse, dle=dle)

    ntrees = len(retrieve_iteration_tree(op))
    assert ntrees == 2

    op()

    assert np.allclose(u.data[2, :, :], 3.0)
    assert np.allclose(v.data[1, 1:-1, 1:-1], 1.0)
Exemple #18
0
def test_scheduling_after_rewrite():
    """Tests loop scheduling after DSE-induced expression hoisting."""
    grid = Grid((10, 10))
    u1 = TimeFunction(name="u1", grid=grid, save=10, time_order=2)
    u2 = TimeFunction(name="u2", grid=grid, time_order=2)
    sf1 = SparseTimeFunction(name='sf1', grid=grid, npoint=1, nt=10)
    const = Function(name="const", grid=grid, space_order=2)

    # NOTE(review): the original comment stated that the injection
    # deliberately targets `u1`, rather than `u1.forward`, to create a WAR;
    # the code, however, injects into `u1.forward` -- confirm which is intended
    update_u1 = Eq(u1.forward, u1 + sin(const))
    injection = sf1.inject(u1.forward, expr=sf1)
    update_u2 = Eq(u2.forward, u2 - u1.dt2 + sin(const))

    op = Operator([update_u1] + injection + [update_u2])
    trees = retrieve_iteration_tree(op)

    # Check loop nest structure
    assert len(trees) == 4

    # The first tree is time invariant: it only spans the space Dimensions
    for iteration, dim in zip(trees[0], grid.dimensions):
        assert iteration.dim == dim

    # All of the remaining trees are rooted at the time Dimension
    chain = [trees[1][0].dim, trees[2][0].dim, trees[3][0].dim, grid.time_dim]
    assert all(lhs == rhs for lhs, rhs in zip(chain, chain[1:]))
Exemple #19
0
def autotune(operator, args, level, mode):
    """
    Operator autotuning.

    Parameters
    ----------
    operator : Operator
        Input Operator.
    args : dict_like
        The runtime arguments with which `operator` is run.
    level : str
        The autotuning aggressiveness (basic, aggressive, max). A more
        aggressive autotuning might eventually result in higher runtime
        performance, but the autotuning phase will take longer.
    mode : str
        The autotuning mode (preemptive, runtime). In preemptive mode, the
        output runtime values supplied by the user to `operator.apply` are
        replaced with shadow copies.

    Returns
    -------
    args : dict_like
        The input `args`, updated in place with the tuned values if at least
        one autotuning run was performed.
    summary : dict
        The number of runs ('runs'), the timesteps per run ('tpr') and the
        selected values ('tuned'); empty if autotuning was skipped.

    Raises
    ------
    ValueError
        If `[level, mode]` is not among the accepted autotuning combinations.
    """
    key = [level, mode]
    accepted = configuration._accepted['autotuning']
    if key not in accepted:
        raise ValueError("The accepted `(level, mode)` combinations are `%s`; "
                         "provided `%s` instead" % (accepted, key))

    # We get passed all the arguments, but the cfunction only requires a subset
    at_args = OrderedDict([(p.name, args[p.name]) for p in operator.parameters])

    # User-provided output data won't be altered in `preemptive` mode
    if mode == 'preemptive':
        output = {i.name: i for i in operator.output}
        copies = {k: output[k]._C_as_ndarray(v).copy()
                  for k, v in args.items() if k in output}
        # WARNING: `copies` keeps references to numpy arrays, which is required
        # to avoid garbage collection to kick in during autotuning and prematurely
        # free the shadow copies handed over to C-land
        at_args.update({k: output[k]._C_make_dataobj(v) for k, v in copies.items()})

    # Disable halo exchanges through MPI_PROC_NULL
    if mode in ['preemptive', 'destructive']:
        for p in operator.parameters:
            if isinstance(p, MPINeighborhood):
                at_args.update(MPINeighborhood(p.fields)._arg_values())
                for i in p.fields:
                    setattr(at_args[p.name]._obj, i, MPI.PROC_NULL)
            elif isinstance(p, MPIMsgEnriched):
                at_args.update(MPIMsgEnriched(p.name, p.function, p.halos)._arg_values())
                for i in at_args[p.name]:
                    i.fromrank = MPI.PROC_NULL
                    i.torank = MPI.PROC_NULL

    roots = [operator.body] + [i.root for i in operator._func_table.values()]
    trees = filter_ordered(retrieve_iteration_tree(roots), key=lambda i: i.root)

    # Detect the time-stepping Iteration; shrink its iteration range so that
    # each autotuning run only takes a few iterations
    steppers = {i for i in flatten(trees) if i.dim.is_Time}
    if len(steppers) == 0:
        stepper = None
        timesteps = 1
    elif len(steppers) == 1:
        stepper = steppers.pop()
        timesteps = init_time_bounds(stepper, at_args)
        if timesteps is None:
            return args, {}
    else:
        warning("cannot perform autotuning unless there is one time loop; skipping")
        return args, {}

    # Perform autotuning
    timings = {}
    for n, tree in enumerate(trees):
        blockable = [i.dim for i in tree if isinstance(i.dim, BlockDimension)]

        # Tunable arguments
        try:
            tunable = []
            tunable.append(generate_block_shapes(blockable, args, level))
            tunable.append(generate_nthreads(operator.nthreads, args, level))
            tunable = list(product(*tunable))
        except ValueError:
            # Some arguments are compulsory, otherwise autotuning is skipped
            continue

        # Symbolic number of loop-blocking blocks per thread
        nblocks_per_thread = calculate_nblocks(tree, blockable) / operator.nthreads

        for bs, nt in tunable:
            # Can we safely autotune over the given time range?
            if not check_time_bounds(stepper, at_args, args, mode):
                break

            # Update `at_args` to use the new tunable arguments
            run = [(k, v) for k, v in bs + nt if k in at_args]
            at_args.update(dict(run))

            # Drop run if not at least one block per thread
            if not configuration['develop-mode'] and nblocks_per_thread.subs(at_args) < 1:
                continue

            # Make sure we remain within stack bounds, otherwise skip run
            try:
                stack_footprint = operator._mem_summary['stack']
                if int(evaluate(stack_footprint, **at_args)) > options['stack_limit']:
                    continue
            except TypeError:
                # Report the run being skipped. The message used to format a
                # stale/unbound loop variable, which could raise a NameError
                # while handling the TypeError
                warning("couldn't determine stack size; skipping run <%s>" %
                        ','.join('%s=%s' % kv for kv in run))
                continue
            except AttributeError:
                assert stack_footprint == 0

            # Run the Operator
            operator.cfunction(*list(at_args.values()))
            elapsed = operator._profiler.timer.total

            timings.setdefault(nt, OrderedDict()).setdefault(n, {})[bs] = elapsed
            log("run <%s> took %f (s) in %d timesteps" %
                (','.join('%s=%s' % i for i in run), elapsed, timesteps))

            # Prepare for the next autotuning run
            update_time_bounds(stepper, at_args, timesteps, mode)

            # Reset profiling timers
            operator._profiler.timer.reset()

    # The best variant is the one that for a given number of threads had the minimum
    # turnaround time
    try:
        runs = 0
        mapper = {}
        for k, v in timings.items():
            for i in v.values():
                runs += len(i)
                record = mapper.setdefault(k, Record())
                record.add(min(i, key=i.get), min(i.values()))
        # `min` raises ValueError on an empty `mapper`, i.e. if no run succeeded
        best = min(mapper, key=mapper.get)
        best = OrderedDict(best + tuple(mapper[best].args))
        best.pop(None, None)
        log("selected <%s>" % (','.join('%s=%s' % i for i in best.items())))
    except ValueError:
        warning("couldn't perform any runs")
        return args, {}

    # Update the argument list with the tuned arguments
    args.update(best)

    # In `runtime` mode, some timesteps have been executed already, so we must
    # adjust the time range
    finalize_time_bounds(stepper, at_args, args, mode)

    # Autotuning summary
    summary = {}
    summary['runs'] = runs
    summary['tpr'] = timesteps  # tpr -> timesteps per run
    summary['tuned'] = dict(best)

    return args, summary