def test_two_homogeneous_buffers():
    """
    Buffer two saved TimeFunctions that are advanced through the same
    right-hand side.

    The generated code is inspected for the 'buffering' Operator, with and
    without the 'fuse' pass; the 'buffering' Operator is then applied and
    compared against the 'noop' one.
    """
    nsteps = 10
    grid = Grid(shape=(4, 4))

    # `u`/`v` are written by the reference Operator; `u1`/`v1` carry the same
    # names and are passed as overrides when applying the buffered Operator
    u = TimeFunction(name='u', grid=grid, save=nsteps)
    u1 = TimeFunction(name='u', grid=grid, save=nsteps)
    v = TimeFunction(name='v', grid=grid, save=nsteps)
    v1 = TimeFunction(name='v', grid=grid, save=nsteps)

    # Both fields are updated with the very same expression
    rhs = u + v + u.backward + v.backward + 1.
    eqns = [Eq(u.forward, rhs), Eq(v.forward, rhs)]

    reference = Operator(eqns, opt='noop')
    buffered = Operator(eqns, opt='buffering')
    fused = Operator(eqns, opt=('buffering', 'fuse'))

    # Check generated code (`fused` is only inspected, never applied)
    for op in (buffered, fused):
        assert len(retrieve_iteration_tree(op)) == 2
    arrays = [s for s in FindSymbols().visit(buffered) if s.is_Array]
    assert len(arrays) == 2

    last = nsteps - 2
    reference.apply(time_M=last)
    buffered.apply(time_M=last, u=u1, v=v1)

    for expected, found in ((u, u1), (v, v1)):
        assert np.all(expected.data == found.data)
def test_from_different_nests(self):
    """
    Check that aliases arising from two sets of equations A and B,
    characterized by a flow dependence, are scheduled within A's and B's
    loop nests respectively.
    """
    grid = Grid(shape=(3, 3, 3))
    x, y, z = grid.dimensions  # noqa
    t = grid.stepping_dim
    i = Dimension(name='i')

    f = Function(name='f', grid=grid)
    f.data_with_halo[:] = 1.
    g = Function(name='g', shape=(3, ), dimensions=(i, ))
    g.data[:] = 2.
    u = TimeFunction(name='u', grid=grid, space_order=3)
    v = TimeFunction(name='v', grid=grid, space_order=3)

    def stencil(w, coeff):
        # Two pairs of diagonally-shifted accesses, each scaled by `3*coeff`
        near = w[t, x, y, z] + w[t, x + 1, y + 1, z + 1]
        far = w[t, x + 2, y + 2, z + 2] + w[t, x + 3, y + 3, z + 3]
        return (near * 3 * coeff + far * 3 * coeff + 1)

    # Leads to 3D aliases. The Inc in between writes to `u.forward`, which
    # the last equation reads
    eqns = [
        Eq(u.forward, stencil(u, f)),
        Inc(u[t + 1, i, i, i], g + 1),
        Eq(v.forward, stencil(v, u.forward)),
    ]
    reference = Operator(eqns, dse='noop', dle=('noop', {'openmp': True}))
    optimized = Operator(eqns, dse='aggressive',
                         dle=('advanced', {'openmp': True}))

    # Check code generation
    assert 'bf0' in optimized._func_table
    assert 'bf1' in optimized._func_table
    for name, written in (('bf0', u), ('bf1', v)):
        trees = retrieve_iteration_tree(optimized._func_table[name].root)
        assert len(trees) == 2
        # The first nest writes to an Array, the second one to the
        # user-provided TimeFunction
        assert trees[0][-1].nodes[0].body[0].write.is_Array
        assert trees[1][-1].nodes[0].body[0].write is written

    # Check numerical output
    reference(time_M=1)
    expected = np.copy(u.data[:])
    u.data_with_halo[:] = 0.
    optimized(time_M=1)
    assert np.all(u.data == expected)
def test_over_injection():
    """
    Buffer a saved TimeFunction that is written by a stencil update, injected
    into by a SparseTimeFunction and read by an interpolation, then compare
    the result against the 'noop' Operator.
    """
    nsteps = 10
    grid = Grid(shape=(4, 4))

    src = SparseTimeFunction(name='src', grid=grid, npoint=1, nt=nsteps)
    rec = SparseTimeFunction(name='rec', grid=grid, npoint=1, nt=nsteps)

    field_kwargs = dict(grid=grid, time_order=2, space_order=2, save=nsteps)
    u = TimeFunction(name="u", **field_kwargs)
    u1 = TimeFunction(name="u", **field_kwargs)

    src.data[:] = 1.

    # Stencil update, followed by the injection and the interpolation
    eqns = [Eq(u.forward, u + 1)]
    eqns.extend(src.inject(field=u.forward, expr=src))
    eqns.extend(rec.interpolate(expr=u.forward))

    reference = Operator(eqns, opt='noop')
    buffered = Operator(eqns, opt='buffering')

    # Check generated code; one extra iteration tree is expected whenever the
    # configured language is not 'C'
    expected_ntrees = 6 if configuration['language'] != 'C' else 5
    assert len(retrieve_iteration_tree(buffered)) == expected_ntrees
    arrays = [s for s in FindSymbols().visit(buffered) if s.is_Array]
    assert len(arrays) == 1

    last = nsteps - 2
    reference.apply(time_M=last)
    buffered.apply(time_M=last, u=u1)

    assert np.all(u.data == u1.data)
def test_read_only_backwards_unstructured():
    """
    Instead of the classic `time-1`, `time`, and `time+1`, here we access the
    buffered Function via `time-2`, `time-1` and `time+2`.
    """
    nsteps = 10
    grid = Grid(shape=(2, 2))

    u = TimeFunction(name='u', grid=grid, save=nsteps)
    v = TimeFunction(name='v', grid=grid)
    v1 = TimeFunction(name='v', grid=grid)

    # The n-th time slice of `u` is filled with the value `n`
    for step in range(nsteps):
        u.data[step, :] = step

    two_back = u.backward.backward
    one_back = u.backward
    two_ahead = u.forward.forward
    eqns = [Eq(v.backward, v + two_back + one_back + two_ahead + 1.)]

    reference = Operator(eqns, opt='noop')
    buffered = Operator(eqns, opt='buffering')

    # Check generated code
    assert len(retrieve_iteration_tree(buffered)) == 2
    arrays = [s for s in FindSymbols().visit(buffered) if s.is_Array]
    assert len(arrays) == 1

    reference.apply(time_m=2)
    buffered.apply(time_m=2, v=v1)

    assert np.all(v.data == v1.data)
def test_read_only():
    """
    Buffer a saved TimeFunction that is only read, at `time-1`, `time` and
    `time+1`, and compare the result against the 'noop' Operator.
    """
    nsteps = 10
    grid = Grid(shape=(2, 2))

    u = TimeFunction(name='u', grid=grid, save=nsteps)
    v = TimeFunction(name='v', grid=grid)
    v1 = TimeFunction(name='v', grid=grid)

    # The n-th time slice of `u` is filled with the value `n`
    for step in range(nsteps):
        u.data[step, :] = step

    rhs = v + u.backward + u + u.forward + 1.
    eqns = [Eq(v.forward, rhs)]

    reference = Operator(eqns, opt='noop')
    buffered = Operator(eqns, opt='buffering')

    # Check generated code
    assert len(retrieve_iteration_tree(buffered)) == 2
    arrays = [s for s in FindSymbols().visit(buffered) if s.is_Array]
    assert len(arrays) == 1

    last = nsteps - 2
    reference.apply(time_M=last)
    buffered.apply(time_M=last, v=v1)

    assert np.all(v.data == v1.data)
def _make_guard(self, parregion, *args):
    """
    Guard `parregion` with a Conditional when it contains device Iterations.

    If none of the ParallelTrees found in `parregion` is rooted in a
    `self.DeviceIteration`, the parent class' `_make_guard` is used instead.
    Otherwise `parregion` is returned wrapped in a `List` holding a single
    `Conditional`, or unchanged if no symbolic condition is left to check.
    """
    partrees = FindNodes(ParallelTree).visit(parregion)
    if not any(isinstance(pt.root, self.DeviceIteration) for pt in partrees):
        return super()._make_guard(parregion, *args)

    guards = []

    # There must be at least one iteration or potential crash
    if not parregion.is_Affine:
        first_tree = retrieve_iteration_tree(parregion.root)[0]
        collapsed = first_tree[:parregion.ncollapsed]
        guards += [it.symbolic_size > 0 for it in collapsed]

    # SparseFunctions may occasionally degenerate to zero-size arrays. In such
    # a case, a copy-in produces a `nil` pointer on the device. To fire up a
    # parallel loop we must ensure none of the SparseFunction pointers are `nil`
    for sf in FindSymbols().visit(parregion):
        if not sf.is_SparseFunction:
            continue
        nelements = prod(sf._C_get_field(FULL, d).size for d in sf.dimensions)
        guards.append(nelements > 0)

    # Drop dynamically evaluated conditions (e.g. because the `symbolic_size`
    # is an integer value rather than a symbol). This avoids ugly and
    # unnecessary conditionals such as `if (true) { ...}`
    guards = [g for g in guards if g != true]

    if not guards:
        return parregion

    # Combine all the surviving conditions into a single guard
    return List(body=[Conditional(And(*guards), parregion)])
def test_conddim_backwards():
    """
    Buffer a TimeFunction saved along a ConditionalDimension (factor 2) that
    is read by an equation writing to `v.backward`, and compare the result
    against the 'noop' Operator.
    """
    nsteps = 10
    grid = Grid(shape=(4, 4))

    factor = Constant(name='factor', value=2, dtype=np.int32)
    time_sub = ConditionalDimension(name="time_sub", parent=grid.time_dim,
                                    factor=factor)

    u = TimeFunction(name='u', grid=grid, time_order=0, save=nsteps,
                     time_dim=time_sub)
    v = TimeFunction(name='v', grid=grid)
    v1 = TimeFunction(name='v', grid=grid)

    # The n-th saved slice of `u` is filled with the value `n`
    for step in range(u.save):
        u.data[step, :] = step

    eqns = [Eq(v.backward, v.backward + v + u + 1.)]

    reference = Operator(eqns, opt='noop')
    buffered = Operator(eqns, opt='buffering')

    # Check generated code
    assert len(retrieve_iteration_tree(buffered)) == 3
    arrays = [s for s in FindSymbols().visit(buffered) if s.is_Array]
    assert len(arrays) == 1

    bounds = {'time_m': 1, 'time_M': 9}
    reference.apply(**bounds)
    buffered.apply(**bounds, v=v1)

    assert np.all(v.data == v1.data)
def test_unread_buffered_function(self):
    """
    Buffer a saved TimeFunction `u` that is written but never read, and
    compare both `u` and `v` against the 'noop' Operator.
    """
    nsteps = 10
    grid = Grid(shape=(4, 4))
    time = grid.time_dim

    u = TimeFunction(name='u', grid=grid, save=nsteps)
    u1 = TimeFunction(name='u', grid=grid, save=nsteps)
    v = TimeFunction(name='v', grid=grid)
    v1 = TimeFunction(name='v', grid=grid)

    update_v = Eq(v.forward, v + 1, implicit_dims=time)
    save_v = Eq(u, v)
    eqns = [update_v, save_v]

    reference = Operator(eqns, opt='noop')
    buffered = Operator(eqns, opt='buffering')

    # Check generated code
    assert len(retrieve_iteration_tree(buffered)) == 1
    arrays = [s for s in FindSymbols().visit(buffered) if s.is_Array]
    assert len(arrays) == 1

    last = nsteps - 2
    reference.apply(time_M=last)
    buffered.apply(time_M=last, u=u1, v=v1)

    for expected, found in ((u, u1), (v, v1)):
        assert np.all(expected.data == found.data)
def test_subdimensions(self):
    """
    Buffer a saved TimeFunction whose update is restricted to "middle"
    SubDimensions (thickness 2 on each side), and compare the result against
    the 'noop' Operator.
    """
    nsteps = 10
    grid = Grid(shape=(10, 10, 10))
    x, y, z = grid.dimensions

    # Map each grid Dimension to its interior SubDimension
    interior = {}
    for parent, name in ((x, 'xi'), (y, 'yi'), (z, 'zi')):
        interior[parent] = SubDimension.middle(name=name, parent=parent,
                                               thickness_left=2,
                                               thickness_right=2)

    u = TimeFunction(name='u', grid=grid, save=nsteps)
    u1 = TimeFunction(name='u', grid=grid, save=nsteps)

    eqn = Eq(u.forward, u + 1).xreplace(interior)

    reference = Operator(eqn, opt='noop')
    buffered = Operator(eqn, opt='buffering')

    # Check generated code
    assert len(retrieve_iteration_tree(buffered)) == 2
    arrays = [s for s in FindSymbols().visit(buffered) if s.is_Array]
    assert len(arrays) == 1

    last = nsteps - 2
    reference.apply(time_M=last)
    buffered.apply(time_M=last, u=u1)

    assert np.all(u.data == u1.data)
def test_async_degree(self, async_degree):
    """
    Check that the 'buf-async-degree' option determines the size of the
    buffer along its first dimension, and that the numerical output still
    matches the 'noop' Operator.

    `async_degree` is supplied by the caller (presumably through a test
    parametrization that is not visible here).
    """
    nsteps = 10
    grid = Grid(shape=(4, 4))

    u = TimeFunction(name='u', grid=grid, save=nsteps)
    u1 = TimeFunction(name='u', grid=grid, save=nsteps)

    eqn = Eq(u.forward, u + 1)

    reference = Operator(eqn, opt='noop')
    options = {'buf-async-degree': async_degree}
    buffered = Operator(eqn, opt=('buffering', options))

    # Check generated code
    assert len(retrieve_iteration_tree(buffered)) == 2
    arrays = [s for s in FindSymbols().visit(buffered) if s.is_Array]
    assert len(arrays) == 1
    buffer = arrays[0]
    assert buffer.symbolic_shape[0] == async_degree

    last = nsteps - 2
    reference.apply(time_M=last)
    buffered.apply(time_M=last, u=u1)

    assert np.all(u.data == u1.data)
def test_hoisting_if_coupled(self):
    """
    Test that coupled aliases are successfully hoisted out of the time loop.
    """
    grid = Grid((10, 10))

    a = Function(name="a", grid=grid, space_order=4)
    b = Function(name="b", grid=grid, space_order=4)

    e = TimeFunction(name="e", grid=grid, space_order=4)
    f = TimeFunction(name="f", grid=grid, space_order=4)

    # `subexpr1` is built on top of `subexpr0`, hence the two are coupled
    subexpr0 = sqrt(1. + 1. / a)
    subexpr1 = 1 / (8. * subexpr0 - 8. / b)

    update_e = Eq(e.forward, e + 1)
    update_f = Eq(f.forward, f * subexpr0 - f * subexpr1 + e.forward.dx)

    op = Operator([update_e, update_f])

    trees = retrieve_iteration_tree(op)
    assert len(trees) == 3

    # The first iteration tree must use exactly two Arrays
    hoisted = [s for s in FindSymbols().visit(trees[0].root) if s.is_Array]
    assert len(hoisted) == 2
    assert not any(not arr._mem_heap or arr._mem_external for arr in hoisted)
def relax_incr_dimensions(iet, **kwargs):
    """
    This pass adjusts the bounds of blocked Iterations in order to include the
    "remainder regions". Without the relaxation that occurs in this pass, the
    only way to iterate over the entire iteration space is to have step
    increments that are perfect divisors of the iteration space (e.g. in case
    of an iteration space of size 67 and block size 8 only 64 iterations would
    be computed, as `67 - 67mod8 = 64`).

    A simple 1D example: nested Iterations are transformed from:

    <Iteration x0_blk0; (x_m, x_M, x0_blk0_size)>
        <Iteration x; (x0_blk0, x0_blk0 + x0_blk0_size - 1, 1)>

    to:

    <Iteration x0_blk0; (x_m, x_M, x0_blk0_size)>
        <Iteration x; (x0_blk0, MIN(x_M, x0_blk0 + x0_blk0_size - 1)), 1)>

    Returns
    -------
    tuple
        The (possibly transformed) `iet` and a dict whose 'headers' entry
        lists the MIN/MAX macro definitions; the list is empty if no
        Iteration was relaxed.
    """
    relaxed = {}
    for tree in retrieve_iteration_tree(iet):
        blocked = [it for it in tree if it.dim.is_Block]

        # Nothing to do without blocked Iterations, or if the outermost
        # blocked Iteration has been recorded already
        if not blocked or blocked[0] in relaxed:
            continue

        assert all(it.direction is Forward for it in blocked)
        outer, inner = split(blocked, lambda it: not it.dim.parent.is_Block)

        # Get root's `symbolic_max` out of each outer Dimension
        roots_max = {it.dim.root: it.symbolic_max for it in outer}

        # Process inner iterations and adjust their bounds
        for it in inner:
            # The Iteration's maximum is the MIN of (a) the `symbolic_max` of
            # the current Iteration, e.g. `x0_blk0 + x0_blk0_size - 1`, and
            # (b) the `symbolic_max` of the current Iteration's root
            # Dimension, e.g. `x_M`. The generated maximum will be
            # `MIN(x0_blk0 + x0_blk0_size - 1, x_M)`

            # In some corner cases an offset may be added (e.g. after CIRE
            # passes). E.g. assume
            # `i.symbolic_max = x0_blk0 + x0_blk0_size + 1` and
            # `i.dim.symbolic_max = x0_blk0 + x0_blk0_size - 1`, then the
            # generated maximum will be
            # `MIN(x0_blk0 + x0_blk0_size + 1, x_M + 2)`
            root_max = roots_max[it.dim.root] + it.symbolic_max - it.dim.symbolic_max
            upper = evalrel(min, [it.symbolic_max, root_max])

            relaxed[it] = it._rebuild(limits=(it.symbolic_min, upper, it.step))

    if not relaxed:
        return iet, {'headers': []}

    iet = Transformer(relaxed, nested=True).visit(iet)

    # The relaxed bounds may rely on the MIN/MAX macros
    headers = [('%s(a,b)' % MIN.name, ('(((a) < (b)) ? (a) : (b))')),
               ('%s(a,b)' % MAX.name, ('(((a) > (b)) ? (a) : (b))'))]

    return iet, {'headers': headers}
def test_minimize_remainders_due_to_autopadding(self):
    """
    Check that the bounds of the Iteration computing the DSE-captured aliasing
    expressions are relaxed (i.e., slightly larger) so that
    backend-compiler-generated remainder loops are avoided.
    """
    grid = Grid(shape=(3, 3, 3))
    x, y, z = grid.dimensions  # noqa
    t = grid.stepping_dim

    f = Function(name='f', grid=grid)
    f.data_with_halo[:] = 1.
    u = TimeFunction(name='u', grid=grid, space_order=3)
    u.data_with_halo[:] = 0.5

    # Leads to 3D aliases
    near = u[t, x, y, z] + u[t, x + 1, y + 1, z + 1]
    far = u[t, x + 2, y + 2, z + 2] + u[t, x + 3, y + 3, z + 3]
    eqn = Eq(u.forward, (near * 3 * f + far * 3 * f + 1))

    reference = Operator(eqn, dse='noop', dle=('advanced', {'openmp': False}))
    optimized = Operator(eqn, dse='aggressive',
                         dle=('advanced', {'openmp': False}))

    params = optimized.parameters
    x0_blk_size = params[-2]
    y0_blk_size = params[-1]
    z_size = params[4]

    bf0_root = optimized._func_table['bf0'].root

    # Check Array shape
    arrays = [s for s in FindSymbols().visit(bf0_root) if s.is_Array]
    assert len(arrays) == 1
    array = arrays[0]
    assert len(array.dimensions) == 3
    assert array.halo == ((1, 1), (1, 1), (1, 1))
    assert array.padding == ((0, 0), (0, 0), (0, 30))
    expected_shape = (x0_blk_size + 2, y0_blk_size + 2, z_size + 32)
    for n, expected in enumerate(expected_shape):
        assert Add(*array.symbolic_shape[n].args) == expected

    # Check loop bounds
    trees = retrieve_iteration_tree(bf0_root)
    assert len(trees) == 2
    rounded = trees[0].inner
    extent = z.symbolic_max - z.symbolic_min + 3
    assert rounded.symbolic_max == z.symbolic_max + extent % 16 + 1

    # Check numerical output
    reference(time_M=1)
    expected = np.copy(u.data[:])
    u.data_with_halo[:] = 0.5
    optimized(time_M=1)
    assert np.all(u.data == expected)
def fold_blockable_tree(iet, blockinner=True):
    """
    Create IterationFolds from sequences of nested Iterations.

    Parameters
    ----------
    iet : Node
        The Iteration/Expression tree to be processed.
    blockinner : bool, optional
        If False, the innermost Iteration of each foldable nest is left out
        of the folding.

    Returns
    -------
    The tree obtained by visiting `iet` with a Transformer that maps the
    leading Iterations to IterationFolds and the folded ones to None.
    """
    folded = {}
    for sequence in FindAdjacent(Iteration).visit(iet).values():
        # Group based on Dimension; only runs of two or more adjacent
        # Iterations over the same Dimension are candidates for folding
        candidates = []
        for subsequence in sequence:
            for _, run in groupby(subsequence, lambda it: it.dim):
                run = list(run)
                if len(run) >= 2:
                    candidates.append(run)

        for group in candidates:
            # Pre-condition: they all must be perfect iterations
            if not all(IsPerfectIteration().visit(it) for it in group):
                continue

            # Only retain consecutive trees having same depth as the first one
            all_trees = [retrieve_iteration_tree(it)[0] for it in group]
            depth = len(all_trees[0])
            trees = []
            for tree in all_trees:
                if len(tree) != depth:
                    break
                trees.append(tree)
            if not trees:
                continue

            # Check foldability, level by level
            pairwise_folds = list(zip(*reversed(trees)))
            if not all(is_foldable(fold) for fold in pairwise_folds):
                continue

            # Maybe heuristically exclude innermost Iteration
            if blockinner is False:
                pairwise_folds = pairwise_folds[:-1]

            # Perhaps there's nothing to fold
            if not pairwise_folds:
                continue

            # TODO: we do not currently support blocking if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(FindNodes(Expression).visit(tree.root)
                            for tree in trees[:-1])
            if any(expr.write.is_Input for expr in exprs):
                continue

            # Perform folding: the first Iteration of each level absorbs the
            # others, which are recorded along with their relative offsets
            for fold in pairwise_folds:
                head, remainder = fold[0], fold[1:]
                folds = [(tuple(b - a for a, b in zip(it.offsets, head.offsets)),
                          it.nodes)
                         for it in remainder]
                folded[head] = IterationFold(folds=folds, **head.args)
                for it in remainder:
                    folded[it] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    return Transformer(folded, nested=True).visit(iet)
def test_conddim_backwards_unstructured():
    """
    Like a backward test over a ConditionalDimension, but the buffered
    Function is accessed at `time_sub-2`, `time_sub-1` and `time_sub+2`.
    """
    nsteps = 10
    grid = Grid(shape=(4, 4))
    x, y = grid.dimensions

    factor = Constant(name='factor', value=2, dtype=np.int32)
    time_sub = ConditionalDimension(name="time_sub", parent=grid.time_dim,
                                    factor=factor)

    u = TimeFunction(name='u', grid=grid, time_order=0, save=nsteps,
                     time_dim=time_sub)
    v = TimeFunction(name='v', grid=grid)
    v1 = TimeFunction(name='v', grid=grid)

    # The n-th saved slice of `u` is filled with the value `n`
    for step in range(u.save):
        u.data[step, :] = step

    one_back = u[time_sub - 1, x, y]
    two_back = u[time_sub - 2, x, y]
    two_ahead = u[time_sub + 2, x, y]

    eqns = [Eq(v.backward, v.backward + v + two_back + one_back + two_ahead + 1.)]

    reference = Operator(eqns, opt='noop')
    buffered = Operator(eqns, opt='buffering')

    # Check generated code
    assert len(retrieve_iteration_tree(buffered)) == 3
    arrays = [s for s in FindSymbols().visit(buffered) if s.is_Array]
    assert len(arrays) == 1

    # Note 1: cannot use time_m<4 or time_M>14 or there would be OOB accesses
    # due to `two_back` and `two_ahead`, which read two steps away from the
    # current point, while `u` has in total `nt=10` entries (so the last one
    # has index 9). In particular, at `time_M=14` we will read from
    # `two_ahead = u[time/factor + 2] = u[14/2 + 2] = u[9]`, which is the last
    # available entry in `u`. Likewise, at `time_m=4` we will read from
    # `two_back = u[time/factor - 2] = u[4/2 - 2] = u[0]`, which is clearly
    # the last accessible entry in `u` while iterating in the backward
    # direction

    # Note 2: Given `factor=2`, we always write to `v` when `time % 2 == 0`,
    # which means that we always write to `v[t1] = v[(time+1)%2] = v[1]`,
    # while `v[0]` remains zero-valued. So the fact that the Eq is also
    # reading from `v` is only relevant to induce the backward iteration
    # direction
    bounds = {'time_m': 4, 'time_M': 14}
    reference.apply(**bounds)
    buffered.apply(**bounds, v=v1)

    assert np.all(v.data == v1.data)
def test_issue_1901():
    """
    Check that, with buffering, the time loop of the second iteration tree
    is scheduled as sequential rather than parallel.
    """
    grid = Grid(shape=(2, 2))
    time = grid.time_dim
    x, y = grid.dimensions

    usave = TimeFunction(name='usave', grid=grid, save=10)
    v = TimeFunction(name='v', grid=grid)

    # `v` is indexed directly through `time`
    op = Operator([Eq(v[time, x, y], usave)], opt='buffering')

    trees = retrieve_iteration_tree(op)
    assert len(trees) == 2

    time_loop = trees[1].root
    assert time_loop.dim is time
    assert not time_loop.is_Parallel
    assert time_loop.is_Sequential  # Obv
def test_time_dependent_split(dse, dle):
    """
    Check that two equations, the second of which takes derivatives of the
    field written by the first, end up in two separate iteration trees and
    produce the expected values.
    """
    grid = Grid(shape=(10, 10))
    u = TimeFunction(name='u', grid=grid, time_order=2, space_order=2, save=3)
    v = TimeFunction(name='v', grid=grid, time_order=2, space_order=0, save=3)

    # The second equation needs a full loop over x/y for u, then
    # a full one over x/y for v
    u_next = u.forward
    update_u = Eq(u_next, 2 + grid.time_dim)
    update_v = Eq(v.forward, u_next.dx + u_next.dy + 1)
    op = Operator([update_u, update_v], dse=dse, dle=dle)

    assert len(retrieve_iteration_tree(op)) == 2

    op()

    assert np.allclose(u.data[2, :, :], 3.0)
    assert np.allclose(v.data[1, 1:-1, 1:-1], 1.0)
def test_scheduling_after_rewrite():
    """Tests loop scheduling after DSE-induced expression hoisting."""
    grid = Grid((10, 10))
    u1 = TimeFunction(name="u1", grid=grid, save=10, time_order=2)
    u2 = TimeFunction(name="u2", grid=grid, time_order=2)
    sf1 = SparseTimeFunction(name='sf1', grid=grid, npoint=1, nt=10)
    const = Function(name="const", grid=grid, space_order=2)

    # The injection targets `u1.forward`, that is the same field written by
    # the first equation; the last equation then reads `u1` through `u1.dt2`
    # NOTE(review): an earlier comment claimed the injection was into `u1`
    # (to create a WAR), which does not match the code -- confirm the intent
    update_u1 = Eq(u1.forward, u1 + sin(const))
    injection = sf1.inject(u1.forward, expr=sf1)
    update_u2 = Eq(u2.forward, u2 - u1.dt2 + sin(const))

    op = Operator([update_u1] + injection + [update_u2])
    trees = retrieve_iteration_tree(op)

    # Check loop nest structure
    assert len(trees) == 4
    # The first tree is time invariant: it only spans the grid Dimensions
    for iteration, dim in zip(trees[0], grid.dimensions):
        assert iteration.dim == dim
    # All the other trees are rooted in the time Dimension
    first, second, third = (tree[0].dim for tree in trees[1:4])
    assert first == second == third == grid.time_dim
def autotune(operator, args, level, mode):
    """
    Operator autotuning.

    Parameters
    ----------
    operator : Operator
        Input Operator.
    args : dict_like
        The runtime arguments with which `operator` is run.
    level : str
        The autotuning aggressiveness (basic, aggressive, max). A more
        aggressive autotuning might eventually result in higher runtime
        performance, but the autotuning phase will take longer.
    mode : str
        The autotuning mode (preemptive, runtime). In preemptive mode, the
        output runtime values supplied by the user to `operator.apply` are
        replaced with shadow copies.

    Returns
    -------
    args : dict_like
        The input `args`, updated in place with the tuned arguments if at
        least one autotuning run was performed.
    summary : dict
        Empty if autotuning was skipped; otherwise it provides the number of
        runs ('runs'), the timesteps per run ('tpr') and the selected
        arguments ('tuned').

    Raises
    ------
    ValueError
        If `[level, mode]` is not among the accepted autotuning combinations.
    """
    key = [level, mode]
    accepted = configuration._accepted['autotuning']
    if key not in accepted:
        raise ValueError("The accepted `(level, mode)` combinations are `%s`; "
                         "provided `%s` instead" % (accepted, key))

    # We get passed all the arguments, but the cfunction only requires a subset
    at_args = OrderedDict([(p.name, args[p.name]) for p in operator.parameters])

    # User-provided output data won't be altered in `preemptive` mode
    if mode == 'preemptive':
        output = {i.name: i for i in operator.output}
        copies = {k: output[k]._C_as_ndarray(v).copy()
                  for k, v in args.items() if k in output}
        # WARNING: `copies` keeps references to numpy arrays, which is required
        # to avoid garbage collection to kick in during autotuning and prematurely
        # free the shadow copies handed over to C-land
        at_args.update({k: output[k]._C_make_dataobj(v) for k, v in copies.items()})

    # Disable halo exchanges through MPI_PROC_NULL
    # NOTE(review): 'destructive' is not listed among the modes in the
    # docstring -- confirm against the accepted autotuning modes
    if mode in ['preemptive', 'destructive']:
        for p in operator.parameters:
            if isinstance(p, MPINeighborhood):
                at_args.update(MPINeighborhood(p.fields)._arg_values())
                for i in p.fields:
                    setattr(at_args[p.name]._obj, i, MPI.PROC_NULL)
            elif isinstance(p, MPIMsgEnriched):
                at_args.update(MPIMsgEnriched(p.name, p.function, p.halos)._arg_values())
                for i in at_args[p.name]:
                    i.fromrank = MPI.PROC_NULL
                    i.torank = MPI.PROC_NULL

    roots = [operator.body] + [i.root for i in operator._func_table.values()]
    trees = filter_ordered(retrieve_iteration_tree(roots), key=lambda i: i.root)

    # Detect the time-stepping Iteration; shrink its iteration range so that
    # each autotuning run only takes a few iterations
    steppers = {i for i in flatten(trees) if i.dim.is_Time}
    if len(steppers) == 0:
        stepper = None
        timesteps = 1
    elif len(steppers) == 1:
        stepper = steppers.pop()
        timesteps = init_time_bounds(stepper, at_args)
        if timesteps is None:
            return args, {}
    else:
        warning("cannot perform autotuning unless there is one time loop; skipping")
        return args, {}

    # Perform autotuning
    timings = {}
    for n, tree in enumerate(trees):
        blockable = [i.dim for i in tree if isinstance(i.dim, BlockDimension)]

        # Tunable arguments
        try:
            tunable = []
            tunable.append(generate_block_shapes(blockable, args, level))
            tunable.append(generate_nthreads(operator.nthreads, args, level))
            tunable = list(product(*tunable))
        except ValueError:
            # Some arguments are compulsory, otherwise autotuning is skipped
            continue

        # Symbolic number of loop-blocking blocks per thread
        nblocks_per_thread = calculate_nblocks(tree, blockable) / operator.nthreads

        for bs, nt in tunable:
            # Can we safely autotune over the given time range?
            if not check_time_bounds(stepper, at_args, args, mode):
                break

            # Update `at_args` to use the new tunable arguments
            run = [(k, v) for k, v in bs + nt if k in at_args]
            at_args.update(dict(run))

            # Drop run if not at least one block per thread
            if not configuration['develop-mode'] and nblocks_per_thread.subs(at_args) < 1:
                continue

            # Make sure we remain within stack bounds, otherwise skip run
            try:
                stack_footprint = operator._mem_summary['stack']
                if int(evaluate(stack_footprint, **at_args)) > options['stack_limit']:
                    continue
            except TypeError:
                # Report the run being skipped. No loop index is in scope at
                # this point, so the description is built out of `run`
                warning("couldn't determine stack size; skipping run %s"
                        % ','.join('%s=%s' % kv for kv in run))
                continue
            except AttributeError:
                # Presumably raised when the footprint is a plain number
                # rather than a symbolic expression -- TODO confirm
                assert stack_footprint == 0

            # Run the Operator
            operator.cfunction(*list(at_args.values()))
            elapsed = operator._profiler.timer.total

            timings.setdefault(nt, OrderedDict()).setdefault(n, {})[bs] = elapsed
            log("run <%s> took %f (s) in %d timesteps" %
                (','.join('%s=%s' % i for i in run), elapsed, timesteps))

            # Prepare for the next autotuning run
            update_time_bounds(stepper, at_args, timesteps, mode)

            # Reset profiling timers
            operator._profiler.timer.reset()

    # The best variant is the one that for a given number of threads had the
    # minimum turnaround time
    try:
        runs = 0
        mapper = {}
        for k, v in timings.items():
            for i in v.values():
                runs += len(i)
                record = mapper.setdefault(k, Record())
                record.add(min(i, key=i.get), min(i.values()))
        # `min` raises ValueError if no run was recorded at all
        best = min(mapper, key=mapper.get)
        best = OrderedDict(best + tuple(mapper[best].args))
        best.pop(None, None)
        log("selected <%s>" % (','.join('%s=%s' % i for i in best.items())))
    except ValueError:
        warning("couldn't perform any runs")
        return args, {}

    # Update the argument list with the tuned arguments
    args.update(best)

    # In `runtime` mode, some timesteps have been executed already, so we must
    # adjust the time range
    finalize_time_bounds(stepper, at_args, args, mode)

    # Autotuning summary
    summary = {}
    summary['runs'] = runs
    summary['tpr'] = timesteps  # tpr -> timesteps per run
    summary['tuned'] = dict(best)

    return args, summary