def test_equations_mixed_densedata_timedata(self, shape, dimensions):
    """
    Test that equations using a mixture of DenseData and TimeData objects
    are embedded within the same time loop.
    """
    # TimeData with a rotating time buffer (save=False)
    a = TimeData(name='a', shape=shape, time_order=2,
                 dimensions=dimensions, space_order=2, time_dim=6, save=False)
    p_aux = Dimension(name='p_aux', size=10)
    # Same auxiliary dimension, appended (b) vs prepended (b2) to the shape
    b = DenseData(name='b', shape=shape + (10,),
                  dimensions=dimensions + (p_aux,), space_order=2)
    b.data[:] = 1.0
    b2 = DenseData(name='b2', shape=(10,) + shape,
                   dimensions=(p_aux,) + dimensions, space_order=2)
    b2.data[:] = 1.0
    eqns = [Eq(a.forward, a.laplace + 1.), Eq(b, time*b*a + b)]
    eqns2 = [Eq(a.forward, a.laplace + 1.), Eq(b2, time*b2*a + b2)]
    subs = {x.spacing: 2.5, y.spacing: 1.5, z.spacing: 2.0}

    op = Operator(eqns, subs=subs, dle='noop')
    trees = retrieve_iteration_tree(op)
    assert len(trees) == 2
    # The first three Iterations (time + space) must be shared by both trees
    assert all(trees[0][i] is trees[1][i] for i in range(3))

    op2 = Operator(eqns2, subs=subs, dle='noop')
    trees = retrieve_iteration_tree(op2)
    assert len(trees) == 2
    assert all(trees[0][i] is trees[1][i] for i in range(3))

    # Verify both operators produce the same result
    op()
    op2()
    # b places p_aux last, b2 places it first; slicing accordingly must
    # give identical values
    assert(np.allclose(b2.data[2, ...].reshape(-1) -
                       b.data[..., 2].reshape(-1), 0.))
def decorate(nodes):
    """
    Attach SIMD pragmas to all vectorizable Iterations found in ``nodes``.

    Tensors whose innermost dimension size is a multiple of the SIMD
    vector length receive an aligned-vectorization pragma; otherwise a
    plain simd-for pragma is emitted.
    """
    processed = []
    for node in nodes:
        mapper = {}
        for tree in retrieve_iteration_tree(node):
            vector_iterations = [i for i in tree if i.is_Vectorizable]
            for i in vector_iterations:
                handle = FindSymbols('symbolics').visit(i)
                try:
                    aligned = [j for j in handle if j.is_Tensor and
                               j.shape[-1] % get_simd_items(j.dtype) == 0]
                except KeyError:
                    # Unknown dtype: cannot prove alignment, fall back to
                    # the unaligned pragma
                    aligned = []
                if aligned:
                    simd = omplang['simd-for-aligned']
                    simd = as_tuple(simd(','.join([j.name for j in aligned]),
                                         simdinfo[get_simd_flag()]))
                else:
                    simd = as_tuple(omplang['simd-for'])
                mapper[i] = i._rebuild(pragmas=i.pragmas + ignore_deps + simd)
        processed.append(Transformer(mapper).visit(node))
    return processed
def test_multiple_loop_nests(self):
    """
    Compute a simple stencil S, preceded by an "initialization loop" I and
    followed by a "random loop" R.

        * S is the trivial equation ``u[t+1,x,y,z] = u[t,x,y,z] + 1``;
        * I initializes ``u`` to 0;
        * R adds 2 to another field ``v`` along the ``z`` dimension but only
          over the planes ``[x=0, y=2]`` and ``[x=0, y=5]``.

    Out of these three loop nests, only S should be "offloaded" to YASK;
    indeed, I is outside the time loop, while R does not loop over space
    dimensions. This test checks that S is the only loop nest "offloaded"
    to YASK, and that the numerical output is correct.
    """
    u = TimeData(name='yu4D', shape=(12, 12, 12), dimensions=(x, y, z),
                 space_order=0)
    v = TimeData(name='yv4D', shape=(12, 12, 12), dimensions=(x, y, z),
                 space_order=0)
    v.data[:] = 0.
    eqs = [Eq(u.indexed[0, x, y, z], 0),
           Eq(u.indexed[1, x, y, z], 0),
           Eq(u.forward, u + 1.),
           Eq(v.indexed[t + 1, 0, 2, z], v.indexed[t + 1, 0, 2, z] + 2.),
           Eq(v.indexed[t + 1, 0, 5, z], v.indexed[t + 1, 0, 5, z] + 2.)]
    op = Operator(eqs, subs={t.spacing: 1})
    op(yu4D=u, yv4D=v, t=1)
    # The YASK kernel must have been invoked for S
    assert 'run_solution' in str(op)
    # I, S, R: three distinct loop nests overall
    assert len(retrieve_iteration_tree(op)) == 3
    assert np.all(u.data[0] == 0.)
    assert np.all(u.data[1] == 1.)
    assert np.all(v.data[0] == 0.)
    assert np.all(v.data[1, 0, 2] == 2.)
    assert np.all(v.data[1, 0, 5] == 2.)
def test_different_section_nests(self, tu, ti0, t0, exprs_t1=None, t1=None):
    """Equations over unrelated functions are placed in two distinct
    iteration trees, each carrying its own right-hand side."""
    first = Eq(ti0, t0*3.)
    second = Eq(tu, ti0 + t1*3.)
    operator = Operator([first, second], dse='noop', dle='noop')
    nests = retrieve_iteration_tree(operator)
    assert len(nests) == 2
    for nest, equation in zip(nests, (first, second)):
        assert nest[-1].nodes[0].expr.rhs == equation.rhs
def decorate(nodes):
    """Insert a fence after each outermost parallel Iteration and prepend
    a vectorization pragma to each vectorizable Iteration."""
    processed = []
    for node in nodes:
        # First pass: fence the outermost parallel Iteration of every tree
        fenced = {}
        for tree in retrieve_iteration_tree(node):
            for iteration in tree:
                if iteration.is_Parallel:
                    fenced[iteration] = List(body=iteration, footer=fence)
                    break
        transformed = Transformer(fenced).visit(node)
        # Second pass: decorate the vectorizable Iterations
        decorated = {i: List(header=pragma, body=i)
                     for tree in retrieve_iteration_tree(transformed)
                     for i in tree if i.is_Vectorizable}
        processed.append(Transformer(decorated).visit(transformed))
    return processed
def test_directly_indexed_expression(self, fa, ti0, t0, exprs):
    """
    Emulates a potential implementation of boundary condition loops
    """
    equations = EVAL(exprs, ti0.base, t0)
    operator = Operator(equations, dse='noop', dle='noop')
    nests = retrieve_iteration_tree(operator)
    assert len(nests) == 2
    # Each nest must carry the rhs of the corresponding input equation
    for nest, equation in zip(nests, equations):
        assert nest[-1].nodes[0].expr.rhs == equation.rhs
def test_expressions_imperfect_loops(self, ti0, ti1, ti2, t0):
    shallow = Eq(ti2, t0 * 3.)
    deep = Eq(ti0, ti1 + 4. + ti2 * 5.)
    op = Operator([shallow, deep], dse='noop', dle='noop')
    trees = retrieve_iteration_tree(op)
    assert len(trees) == 2
    outer_tree, inner_tree = trees
    # The shallower nest is a strict prefix of the deeper one
    assert len(outer_tree) == 2
    assert len(inner_tree) == 3
    assert all(a == b for a, b in zip(outer_tree, inner_tree[:-1]))
    assert outer_tree[-1].nodes[0].expr.rhs == shallow.rhs
    assert inner_tree[-1].nodes[0].expr.rhs == deep.rhs
def test_directly_indexed_expression(self, fa, ti0, t0, exprs):
    """
    Test that equations using integer indices are inserted in the right
    loop nest, at the right loop nest depth.
    """
    built = EVAL(exprs, ti0.base, t0)
    op = Operator(built, dse='noop', dle='noop')
    found = retrieve_iteration_tree(op)
    assert len(found) == 2
    # Nest k must embed equation k, in order
    for tree, equation in zip(found, built):
        assert tree[-1].nodes[0].expr.rhs == equation.rhs
def _yaskize(self, state):
    """
    Create a YASK representation of this Iteration/Expression tree.

    This pass is work-in-progress: the YASK AST is built and printed for
    inspection, but the input tree is returned unmodified so that the
    standard DLE pipeline can proceed.
    """
    dle_warning("Be patient! The YASK backend is still a WIP")
    dle_warning("This is the YASK AST that the Devito DLE can build")

    for node in state.nodes:
        for tree in retrieve_iteration_tree(node):
            candidate = tree[-1]

            # Set up the YASK solution
            soln = cfac.new_stencil_solution("solution")

            # Set up the YASK grids
            grids = FindSymbols(mode='symbolics').visit(candidate)
            grids = {g.name: soln.new_grid(g.name,
                                           *[str(i) for i in g.indices])
                     for g in grids}

            # Vector folding API usage example
            soln.set_fold_len('x', 1)
            soln.set_fold_len('y', 1)
            soln.set_fold_len('z', 8)

            # Perform the translation on an expression basis
            transformer = sympy2yask(grids)
            expressions = [e for e in candidate.nodes if e.is_Expression]
            for i in expressions:
                try:
                    ast = transformer(i.stencil)
                    # Scalar
                    print(ast.format_simple())
                    # To emit AVX2 intrinsics instead:
                    #   print(soln.format('avx2'))
                    # or write them to file:
                    #   path = os.path.join(os.environ.get('YASK_HOME', '.'), 'src')
                    #   soln.write(os.path.join(path, 'stencil_code.hpp'), 'avx2')
                except Exception:
                    # Best-effort: expressions YASK cannot translate are
                    # simply skipped. Narrowed from a bare ``except`` so
                    # that KeyboardInterrupt/SystemExit still propagate.
                    pass

    dle_warning("Falling back to basic DLE optimizations...")

    return {'nodes': state.nodes}
def __init__(self, expressions, **kwargs):
    """
    Build a debug Operator: identical to the parent Operator, except that
    sequential loops are shrunk to a minimal trip count and a printf
    marker is emitted at the entry of each non-sequential Iteration nest.
    """
    super(OperatorDebug, self).__init__(expressions, **kwargs)
    self._includes.append('stdio.h')  # needed by printmark's printf

    # Minimize the trip count of the sequential loops
    iterations = set(flatten(retrieve_iteration_tree(self.body)))
    mapper = {i: i._rebuild(limits=(max(i.offsets) + 2))
              for i in iterations if i.is_Sequential}
    self.body = Transformer(mapper).visit(self.body)

    # Mark entry/exit points of each non-sequential Iteration tree in the body
    iterations = [filter_iterations(i, lambda i: not i.is_Sequential, 'any')
                  for i in retrieve_iteration_tree(self.body)]
    # Keep only the outermost non-sequential Iteration of each tree
    iterations = [i[0] for i in iterations if i]
    mapper = {t: List(header=printmark('In nest %d' % i), body=t)
              for i, t in enumerate(iterations)}
    self.body = Transformer(mapper).visit(self.body)
def test_cache_blocking_structure(blockinner, expected): _, op = _new_operator1((10, 31, 45), dle=('blocking', {'blockalways': True, 'blockshape': (2, 9, 2), 'blockinner': blockinner})) # Check presence of remainder loops iterations = retrieve_iteration_tree(op) assert len(iterations) == expected assert not iterations[0][0].is_Remainder assert all(i[0].is_Remainder for i in iterations[1:]) # Check presence of openmp pragmas at the right place _, op = _new_operator1((10, 31, 45), dle=('blocking,openmp', {'blockalways': True, 'blockshape': (2, 9, 2), 'blockinner': blockinner})) iterations = retrieve_iteration_tree(op) assert len(iterations) == expected # All iterations except the last one an outermost parallel loop over blocks assert not iterations[-1][0].is_Parallel for i in iterations[:-1]: outermost = i[0] assert len(outermost.pragmas) == 1 assert 'omp for' in outermost.pragmas[0].value
def fold_blockable_tree(node, exclude_innermost=False):
    """
    Create :class:`IterationFold`s from sequences of nested :class:`Iteration`.

    :param node: the Iteration/Expression tree to transform.
    :param exclude_innermost: if True, the innermost Iterations are not
                              folded (e.g., to preserve the trip count of
                              SIMD loops).
    """
    found = FindAdjacentIterations().visit(node)
    found.pop('seen_iteration')

    mapper = {}
    for k, v in found.items():
        for i in v:
            # Pre-condition: they all must be perfect iterations
            assert len(i) > 1
            if any(not IsPerfectIteration().visit(j) for j in i):
                continue
            # Only retain consecutive trees having same depth
            trees = [retrieve_iteration_tree(j)[0] for j in i]
            handle = []
            for j in trees:
                if len(j) != len(trees[0]):
                    break
                handle.append(j)
            trees = handle
            if not trees:
                continue
            # Check foldability
            pairwise_folds = list(zip(*reversed(trees)))
            if any(not is_foldable(j) for j in pairwise_folds):
                continue
            # Maybe heuristically exclude innermost Iteration
            # (idiom fix: truthiness test instead of ``is True``)
            if exclude_innermost:
                pairwise_folds = pairwise_folds[:-1]
            # Perhaps there's nothing to fold
            if len(pairwise_folds) == 1:
                continue
            # Perform folding
            for j in pairwise_folds:
                root, remainder = j[0], j[1:]
                folds = [(tuple(y - x for x, y in zip(i.offsets, root.offsets)),
                          i.nodes) for i in remainder]
                mapper[root] = IterationFold(folds=folds, **root.args)
                # Drop the folded-away Iterations (renamed from ``k`` to
                # avoid shadowing the outer loop variable)
                for folded in remainder:
                    mapper[folded] = None

    # Insert the IterationFolds in the Iteration/Expression tree
    processed = NestedTransformer(mapper).visit(node)

    return processed
def test_consistency_perfect_loops(self, tu, tv, ti0, t0, t1):
    eq1 = Eq(tu, tv * ti0 * t0 + ti0 * t1)
    eq2 = Eq(ti0, tu + t0 * 3.)
    eq3 = Eq(tv, ti0 * tu)
    orderings = ([eq1, eq2, eq3], [eq2, eq1, eq3], [eq3, eq2, eq1])
    forests = [retrieve_iteration_tree(Operator(i, dse='noop', dle='noop'))
               for i in orderings]
    # Each Operator must yield exactly one loop nest...
    assert all(len(i) == 1 for i in forests)
    trees = [i[0] for i in forests]
    for tree in trees:
        # ... which is perfect and embeds all three expressions
        assert IsPerfectIteration().visit(tree[0])
        assert len(tree[-1].nodes) == 3
    # And the embedded expressions are identical across orderings
    reference = set([i.expr for i in trees[0][-1].nodes])
    assert all(set([i.expr for i in tree[-1].nodes]) == reference
               for tree in trees)
def test_equations_emulate_bc(self, t0):
    """
    Test that bc-like equations get inserted into the same loop nest as the
    "main" equations.
    """
    shape = (3, 3, 3)
    a = DenseData(name='a', shape=shape).indexed
    b = TimeData(name='b', shape=shape, save=True, time_dim=6).indexed
    main = Eq(b[time + 1, x, y, z],
              b[time - 1, x, y, z] + a[x, y, z] + 3.*t0)
    bcs = [Eq(b[time, 0, y, z], 0.),
           Eq(b[time, x, 0, z], 0.),
           Eq(b[time, x, y, 0], 0.)]
    op = Operator([main] + bcs, dse='noop', dle='noop')
    trees = retrieve_iteration_tree(op)
    assert len(trees) == 4
    # All four expressions hang off one and the same (outer) time Iteration
    time_loop = trees[0][0]
    assert all(tree[0] is time_loop for tree in trees)
def test_expressions_imperfect_loops(self, ti0, ti1, ti2, t0):
    """
    Test that equations depending only on a subset of all indices appearing
    across all equations are placed within earlier loops in the loop nest tree.
    """
    shallow = Eq(ti2, t0*3.)
    deep = Eq(ti0, ti1 + 4. + ti2*5.)
    op = Operator([shallow, deep], dse='noop', dle='noop')
    trees = retrieve_iteration_tree(op)
    assert len(trees) == 2
    outer, inner = trees
    assert len(outer) == 2 and len(inner) == 3
    # The shallower nest is a strict prefix of the deeper one
    assert all(a == b for a, b in zip(outer, inner[:-1]))
    assert outer[-1].nodes[0].expr.rhs == shallow.rhs
    assert inner[-1].nodes[0].expr.rhs == deep.rhs
def unfold_blocked_tree(node): """ Unfold nested :class:`IterationFold`. :Example: Given a section of Iteration/Expression tree as below: :: for i = 1 to N-1 // folded for j = 1 to N-1 // folded foo1() Assuming a fold with offset 1 in both /i/ and /j/ and body ``foo2()``, create: :: for i = 1 to N-1 for j = 1 to N-1 foo1() for i = 2 to N-2 for j = 2 to N-2 foo2() """ # Search the unfolding candidates candidates = [] for tree in retrieve_iteration_tree(node): handle = tuple(i for i in tree if i.is_IterationFold) if handle: # Sanity check assert IsPerfectIteration().visit(handle[0]) candidates.append(handle) # Perform unfolding tag = ntags() mapper = {} for tree in candidates: trees = list(zip(*[i.unfold() for i in tree])) # Update tag for i, _tree in enumerate(list(trees)): trees[i] = tuple(j.retag(tag + i) for j in _tree) trees = optimize_unfolded_tree(trees[:-1], trees[-1]) mapper[tree[0]] = List(body=trees) # Insert the unfolded Iterations in the Iteration/Expression tree processed = Transformer(mapper).visit(node) return processed
def _ompize(self, state, **kwargs):
    """
    Add OpenMP pragmas to the Iteration/Expression tree to emit parallel code
    """
    processed = []
    for node in state.nodes:

        # Reset denormals flag each time a parallel region is entered
        denormals = FindNodes(Denormals).visit(state.nodes)
        mapper = {i: List(c.Comment('DLE: moved denormals flag'))
                  for i in denormals}

        # Handle parallelizable loops
        for tree in retrieve_iteration_tree(node):
            # Determine the number of consecutive parallelizable Iterations
            key = lambda i: i.is_Parallel and not i.is_Vectorizable
            candidates = filter_iterations(tree, key=key, stop='consecutive')
            if not candidates:
                continue

            # Heuristic: if at least two parallel loops are available and the
            # physical core count is greater than self.thresholds['collapse'],
            # then omp-collapse the loops
            nparallel = len(candidates)
            if psutil.cpu_count(logical=False) < self.thresholds['collapse'] or\
                    nparallel < 2:
                parallelism = omplang['for']
            else:
                parallelism = omplang['collapse'](nparallel)

            # Wrap the outermost parallel Iteration in a parallel region,
            # preceded by the (relocated) denormals-reset statements
            root = candidates[0]
            mapper[root] = Block(header=omplang['par-region'],
                                 body=denormals + [Element(parallelism), root])

        processed.append(Transformer(mapper).visit(node))

    return {'nodes': processed}
def test_create_elemental_functions_simple(simple_function):
    """
    Check that a tagged, elementizable Iteration sub-tree is outlined into a
    standalone (static) elemental function, invoked from the main loop nest.
    The expected C code below was reconstructed from the collapsed source;
    exact whitespace of the reference string should be confirmed against
    the generated ccode.
    """
    roots = [i[-1] for i in retrieve_iteration_tree(simple_function)]
    retagged = [i._rebuild(properties=tagger(0)) for i in roots]
    mapper = {i: j._rebuild(properties=(j.properties + (ELEMENTAL,)))
              for i, j in zip(roots, retagged)}
    function = Transformer(mapper).visit(simple_function)
    handle = transform(function, mode='split')
    block = List(body=handle.nodes + handle.elemental_functions)
    output = str(block.ccode)
    # Make output compiler independent
    output = [i for i in output.split('\n')
              if all([j not in i for j in ('#pragma', '/*')])]
    assert '\n'.join(output) == \
        ("""void foo(float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  float (*restrict c)[5] __attribute__((aligned(64))) = (float (*)[5]) c_vec;
  float (*restrict d)[5][7] __attribute__((aligned(64))) = (float (*)[5][7]) d_vec;
  for (int i = 0; i < 3; i += 1)
  {
    for (int j = 0; j < 5; j += 1)
    {
      f_0(0,7,(float*)a,(float*)b,(float*)c,(float*)d,i,j);
    }
  }
}
void f_0(const int k_start, const int k_finish,"""
         """ float *restrict a_vec, float *restrict b_vec,"""
         """ float *restrict c_vec, float *restrict d_vec,"""
         """ const int i, const int j)
{
  float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec;
  float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec;
  float (*restrict c)[5] __attribute__((aligned(64))) = (float (*)[5]) c_vec;
  float (*restrict d)[5][7] __attribute__((aligned(64))) = (float (*)[5][7]) d_vec;
  for (int k = k_start; k < k_finish; k += 1)
  {
    a[i] = a[i] + b[i] + 5.0F;
    a[i] = -a[i]*c[i][j] + b[i]*d[i][j][k];
  }
}""")
def test_consistency_coupled_w_ofs(self, exprs, ti0, ti1, ti3):
    """
    Test that no matter what is the order in which the equations are
    provided to an Operator, the resulting loop nest is the same.
    The array accesses in the equations may or may not use offsets;
    these impact the loop bounds, but not the resulting tree structure.
    """
    eq1, eq2, eq3 = EVAL(exprs, ti0.base, ti1.base, ti3.base)
    orderings = ([eq1, eq2, eq3], [eq2, eq1, eq3], [eq3, eq2, eq1])
    forests = [retrieve_iteration_tree(Operator(i, dse='noop', dle='noop'))
               for i in orderings]
    # A single loop nest, regardless of equation ordering
    assert all(len(i) == 1 for i in forests)
    trees = [i[0] for i in forests]
    for tree in trees:
        assert IsPerfectIteration().visit(tree[0])
        assert len(tree[-1].nodes) == 3
    # The embedded expressions are identical across orderings
    reference = set([i.expr for i in trees[0][-1].nodes])
    assert all(set([i.expr for i in tree[-1].nodes]) == reference
               for tree in trees)
def test_consistency_coupled_wo_ofs(self, tu, tv, ti0, t0, t1):
    """
    Test that no matter what is the order in which the equations are
    provided to an Operator, the resulting loop nest is the same.
    None of the array accesses in the equations use offsets.
    """
    eq1 = Eq(tu, tv*ti0*t0 + ti0*t1)
    eq2 = Eq(ti0, tu + t0*3.)
    eq3 = Eq(tv, ti0*tu)
    orderings = ([eq1, eq2, eq3], [eq2, eq1, eq3], [eq3, eq2, eq1])
    forests = [retrieve_iteration_tree(Operator(i, dse='noop', dle='noop'))
               for i in orderings]
    # A single loop nest, regardless of equation ordering
    assert all(len(i) == 1 for i in forests)
    trees = [i[0] for i in forests]
    for tree in trees:
        assert IsPerfectIteration().visit(tree[0])
        assert len(tree[-1].nodes) == 3
    # The embedded expressions are identical across orderings
    reference = set([i.expr for i in trees[0][-1].nodes])
    assert all(set([i.expr for i in tree[-1].nodes]) == reference
               for tree in trees)
def _loop_fission(self, state, **kwargs):
    """
    Apply loop fission to innermost :class:`Iteration` objects. This pass
    is not applied if the number of statements in an Iteration's body is
    lower than ``self.thresholds['fission'].``
    """
    processed = []
    for node in state.nodes:
        mapper = {}
        for tree in retrieve_iteration_tree(node):
            if len(tree) <= 1:
                # Heuristically avoided
                continue

            candidate = tree[-1]
            expressions = [e for e in candidate.nodes if e.is_Expression]

            if len(expressions) < self.thresholds['max_fission']:
                # Heuristically avoided
                continue
            if len(expressions) != len(candidate.nodes):
                # Dangerous for correctness
                continue

            functions = list(set.union(*[set(e.functions)
                                         for e in expressions]))
            wrapped = [e.expr for e in expressions]

            if not functions or not wrapped:
                # Heuristically avoided
                continue

            # Promote temporaries from scalar to tensors
            handle = functions[0]
            dim = handle.indices[-1]
            size = handle.shape[-1]
            if any(dim != i.indices[-1] for i in functions):
                # Dangerous for correctness
                continue

            wrapped = promote_scalar_expressions(wrapped, (size,), (dim,), True)

            assert len(wrapped) == len(expressions)
            rebuilt = [Expression(s, e.dtype)
                       for s, e in zip(wrapped, expressions)]

            # Group statements
            # TODO: Need a heuristic here to maximize reuse
            args_frozen = candidate.args_frozen
            properties = as_tuple(args_frozen['properties']) + (ELEMENTAL,)
            args_frozen['properties'] = properties
            n = self.thresholds['min_fission']
            # One new (smaller) Iteration per group of statements
            fissioned = [Iteration(g, **args_frozen)
                         for g in grouper(rebuilt, n)]

            mapper[candidate] = List(body=fissioned)

        processed.append(Transformer(mapper).visit(node))

    return {'nodes': processed}
def _minimize_remainders(self, state, **kwargs):
    """
    Reshape temporary tensors and adjust loop trip counts to prevent as many
    compiler-generated remainder loops as possible.
    """
    mapper = {}
    for tree in retrieve_iteration_tree(state.nodes +
                                        state.elemental_functions):
        vector_iterations = [i for i in tree if i.is_Vectorizable]
        if not vector_iterations:
            continue
        assert len(vector_iterations) == 1
        root = vector_iterations[0]
        if root.tag is None:
            continue

        # Padding: round the innermost dimension of each written tensor up
        # to a multiple of the SIMD vector length
        writes = [i for i in FindSymbols('symbolics-writes').visit(root)
                  if i.is_TensorFunction]
        padding = []
        for i in writes:
            try:
                simd_items = get_simd_items(i.dtype)
            except KeyError:
                # Fallback to 16 (maximum expectable padding, for AVX512
                # registers)
                simd_items = simdinfo['avx512f'] / np.dtype(i.dtype).itemsize
            padding.append(simd_items - i.shape[-1] % simd_items)
        if len(set(padding)) == 1:
            padding = padding[0]
            for i in writes:
                i.update(shape=i.shape[:-1] + (i.shape[-1] + padding,))
        else:
            # Padding must be uniform -- not the case, so giving up
            continue

        # Dynamic trip count adjustment
        endpoint = root.end_symbolic
        if not endpoint.is_Symbol:
            continue
        condition = []
        externals = set(i.symbolic_shape[-1]
                        for i in FindSymbols().visit(root))
        for i in root.uindices:
            for j in externals:
                condition.append(root.end_symbolic + padding < j)
        condition = ' || '.join(ccode(i) for i in condition)
        endpoint_padded = endpoint.func(name='_%s' % endpoint.name)
        # Emit: const int _end = (cond) ? end+padding : end;
        # (string reconstructed across the source-line break)
        init = cgen.Initializer(
            cgen.Value("const int", endpoint_padded),
            cgen.Line('(%s) ? %s : %s' % (condition,
                                          ccode(endpoint + padding),
                                          endpoint)))

        # Update the Iteration bound
        limits = list(root.limits)
        limits[1] = endpoint_padded.func(endpoint_padded.name)
        rebuilt = list(tree)
        rebuilt[rebuilt.index(root)] = root._rebuild(limits=limits)

        mapper[tree[0]] = List(header=init, body=compose_nodes(rebuilt))

    nodes = Transformer(mapper).visit(state.nodes)
    elemental_functions = Transformer(mapper).visit(state.elemental_functions)

    return {'nodes': nodes, 'elemental_functions': elemental_functions}
def _ompize(self, state, **kwargs):
    """
    Add OpenMP pragmas to the Iteration/Expression tree to emit parallel code
    """
    processed = []
    for node in state.nodes:

        # Reset denormals flag each time a parallel region is entered
        denormals = FindNodes(Denormals).visit(state.nodes)
        mapper = OrderedDict([(i, None) for i in denormals])

        # Group by outer loop so that we can embed within the same parallel
        # region
        was_tagged = False
        groups = OrderedDict()
        for tree in retrieve_iteration_tree(node):
            # Determine the number of consecutive parallelizable Iterations
            key = lambda i: i.is_Parallel and\
                not (i.is_Elementizable or i.is_Vectorizable)
            candidates = filter_iterations(tree, key=key, stop='asap')
            if not candidates:
                was_tagged = False
                continue
            # Consecutive tagged Iteration go in the same group
            is_tagged = any(i.tag is not None for i in tree)
            key = len(groups) - (is_tagged & was_tagged)
            handle = groups.setdefault(key, OrderedDict())
            handle[candidates[0]] = candidates
            was_tagged = is_tagged

        # Handle parallelizable loops
        for group in groups.values():
            private = []
            for root, tree in group.items():
                # Heuristic: if at least two parallel loops are available and
                # the physical core count is greater than
                # self.thresholds['collapse'], then omp-collapse the loops
                nparallel = len(tree)
                if psutil.cpu_count(logical=False) < \
                        self.thresholds['collapse'] or nparallel < 2:
                    parallel = omplang['for']
                else:
                    parallel = omplang['collapse'](nparallel)

                mapper[root] = root._rebuild(pragmas=root.pragmas +
                                             (parallel,))

                # Track the thread-private and thread-shared variables
                private.extend([i for i in
                                FindSymbols('symbolics').visit(root)
                                if i.is_TensorFunction and i._mem_stack])

            # Build the parallel region
            private = sorted(set([i.name for i in private]))
            private = ('private(%s)' % ','.join(private)) if private else ''
            rebuilt = [v for k, v in mapper.items() if k in group]
            par_region = Block(header=omplang['par-region'](private),
                               body=denormals + rebuilt)
            # Replace the first Iteration of the group with the whole
            # parallel region; drop the others (except remainder loops)
            for k, v in list(mapper.items()):
                if isinstance(v, Iteration):
                    mapper[k] = None if v.is_Remainder else par_region

        handle = Transformer(mapper).visit(node)
        if handle is not None:
            processed.append(handle)

    return {'nodes': processed}
def _loop_blocking(self, state, **kwargs):
    """
    Apply loop blocking to :class:`Iteration` trees.

    Blocking is applied to parallel iteration trees. Heuristically, innermost
    dimensions are not blocked to maximize the trip count of the SIMD loops.

    Different heuristics may be specified by passing the keywords
    ``blockshape`` and ``blockinner`` to the DLE. The former, a dictionary,
    is used to indicate a specific block size for each blocked dimension.
    For example, for the :class:`Iteration` tree: ::

        for i
          for j
            for k
              ...

    one may provide ``blockshape = {i: 4, j: 7}``, in which case the two
    outer loops will blocked, and the resulting 2-dimensional block will
    have size 4x7. The latter may be set to True to also block innermost
    parallel :class:`Iteration` objects.
    """
    exclude_innermost = not self.params.get('blockinner', False)
    ignore_heuristic = self.params.get('blockalways', False)

    blocked = OrderedDict()
    processed = []
    for node in state.nodes:
        # Make sure loop blocking will span as many Iterations as possible
        fold = fold_blockable_tree(node, exclude_innermost)

        mapper = {}
        for tree in retrieve_iteration_tree(fold):
            # Is the Iteration tree blockable ?
            iterations = [i for i in tree if i.is_Parallel]
            if exclude_innermost:
                iterations = [i for i in iterations if not i.is_Vectorizable]
            if len(iterations) <= 1:
                continue
            root = iterations[0]
            if not IsPerfectIteration().visit(root):
                # Illegal/unsupported
                continue
            if not tree[0].is_Sequential and not ignore_heuristic:
                # Heuristic: avoid polluting the generated code with blocked
                # nests (thus increasing JIT compilation time and affecting
                # readability) if the blockable tree isn't embedded in a
                # sequential loop (e.g., a timestepping loop)
                continue

            # Decorate intra-block iterations with an IterationProperty
            TAG = tagger(len(mapper))

            # Build all necessary Iteration objects, individually. These will
            # subsequently be composed to implement loop blocking.
            inter_blocks = []
            intra_blocks = []
            remainders = []
            for i in iterations:
                # Build Iteration over blocks
                dim = blocked.setdefault(i,
                                         Dimension("%s_block" % i.dim.name))
                block_size = dim.symbolic_size
                iter_size = i.dim.size or i.dim.symbolic_size
                start = i.limits[0] - i.offsets[0]
                finish = iter_size - i.offsets[1]
                # Shrink the blocked region so it holds a whole number of
                # blocks; the leftover is handled by the remainder loops
                innersize = iter_size - (-i.offsets[0] + i.offsets[1])
                finish = finish - (innersize % block_size)
                inter_block = Iteration([], dim,
                                        [start, finish, block_size],
                                        properties=PARALLEL)
                inter_blocks.append(inter_block)

                # Build Iteration within a block
                start = inter_block.dim
                finish = start + block_size
                intra_block = i._rebuild([], limits=[start, finish, 1],
                                         offsets=None,
                                         properties=i.properties +
                                         (TAG, ELEMENTAL))
                intra_blocks.append(intra_block)

                # Build unitary-increment Iteration over the 'leftover'
                # region. This will be used for remainder loops, executed
                # when any dimension size is not a multiple of the block size.
                start = inter_block.limits[1]
                finish = iter_size - i.offsets[1]
                remainder = i._rebuild([], limits=[start, finish, 1],
                                       offsets=None)
                remainders.append(remainder)

            # Build blocked Iteration nest
            blocked_tree = compose_nodes(inter_blocks + intra_blocks +
                                         [iterations[-1].nodes])

            # Build remainder Iterations
            remainder_trees = []
            for n in range(len(iterations)):
                for c in combinations([i.dim for i in iterations], n + 1):
                    # First all inter-block Interations
                    nodes = [b._rebuild(properties=b.properties +
                                        (REMAINDER,))
                             for b, r in zip(inter_blocks, remainders)
                             if r.dim not in c]
                    # Then intra-block or remainder, for each dim (in order)
                    properties = (REMAINDER, TAG, ELEMENTAL)
                    for b, r in zip(intra_blocks, remainders):
                        handle = r if b.dim in c else b
                        nodes.append(handle._rebuild(properties=properties))
                    nodes.extend([iterations[-1].nodes])
                    remainder_trees.append(compose_nodes(nodes))

            # Will replace with blocked loop tree
            mapper[root] = List(body=[blocked_tree] + remainder_trees)

        rebuilt = Transformer(mapper).visit(fold)

        # Finish unrolling any previously folded Iterations
        processed.append(unfold_blocked_tree(rebuilt))

    # All blocked dimensions
    if not blocked:
        return {'nodes': processed}

    # Determine the block shape
    blockshape = self.params.get('blockshape')
    if not blockshape:
        # Use trivial heuristic for a suitable blockshape
        def heuristic(dim_size):
            ths = 8  # FIXME: This really needs to be improved
            return ths if dim_size > ths else 1
        blockshape = {k: heuristic for k in blocked.keys()}
    else:
        try:
            nitems, nrequired = len(blockshape), len(blocked)
            blockshape = {k: v for k, v in zip(blocked, blockshape)}
            if nitems > nrequired:
                dle_warning("Provided 'blockshape' has more entries than "
                            "blocked loops; dropping entries ...")
            if nitems < nrequired:
                dle_warning("Provided 'blockshape' has fewer entries than "
                            "blocked loops; dropping dimensions ...")
        except TypeError:
            # A scalar blockshape: apply it to the first blocked dimension
            blockshape = {list(blocked)[0]: blockshape}
        blockshape.update({k: None for k in blocked.keys()
                           if k not in blockshape})

    # Track any additional arguments required to execute /state.nodes/
    arguments = [BlockingArg(v, k, blockshape[k])
                 for k, v in blocked.items()]

    return {'nodes': processed, 'arguments': arguments, 'flags': 'blocking'}
def _specialize(self, nodes, parameters):
    """
    Create a YASK representation of this Iteration/Expression tree.

    ``parameters`` is modified in-place adding YASK-related arguments.
    """
    log("Specializing a Devito Operator for YASK...")

    # Find offloadable Iteration/Expression trees
    offloadable = []
    for tree in retrieve_iteration_tree(nodes):
        parallel = filter_iterations(tree, lambda i: i.is_Parallel)
        if not parallel:
            # Cannot offload non-parallel loops
            continue
        if not (IsPerfectIteration().visit(tree) and
                all(i.is_Expression for i in tree[-1].nodes)):
            # Don't know how to offload this Iteration/Expression to YASK
            continue
        functions = flatten(i.functions for i in tree[-1].nodes)
        keys = set((i.indices, i.shape, i.dtype) for i in functions
                   if i.is_TimeData)
        if len(keys) == 0:
            continue
        elif len(keys) > 1:
            exit("Cannot handle Operators w/ heterogeneous grids")
        dimensions, shape, dtype = keys.pop()
        if len(dimensions) == len(tree) and\
                all(i.dim == j for i, j in zip(tree, dimensions)):
            # Detected a "full" Iteration/Expression tree (over both
            # time and space dimensions)
            offloadable.append((tree, dimensions, shape, dtype))

    # Construct YASK ASTs given Devito expressions. New grids may be
    # allocated.
    if len(offloadable) == 0:
        # No offloadable trees found
        self.context = YaskNullContext()
        self.yk_soln = YaskNullSolution()
        processed = nodes
        log("No offloadable trees found")
    elif len(offloadable) == 1:
        # Found *the* offloadable tree for this Operator
        tree, dimensions, shape, dtype = offloadable[0]
        self.context = contexts.fetch(dimensions, shape, dtype)

        # Create a YASK compiler solution for this Operator
        # Note: this can be dropped as soon as the kernel has been built
        yc_soln = self.context.make_yc_solution(namespace['jit-yc-soln'])

        transform = sympy2yask(self.context, yc_soln)
        try:
            for i in tree[-1].nodes:
                transform(i.expr)

            funcall = make_sharedptr_funcall(namespace['code-soln-run'],
                                             ['time'],
                                             namespace['code-soln-name'])
            funcall = Element(c.Statement(ccode(funcall)))
            processed = Transformer({tree[1]: funcall}).visit(nodes)

            # Track this is an external function call
            self.func_table[namespace['code-soln-run']] = FunMeta(None, False)

            # JIT-compile the newly-created YASK kernel
            self.yk_soln = self.context.make_yk_solution(
                namespace['jit-yk-soln'], yc_soln)

            # Now we must drop a pointer to the YASK solution down to C-land
            parameters.append(Object(namespace['code-soln-name'],
                                     namespace['type-solution'],
                                     self.yk_soln.rawpointer))

            # Print some useful information about the newly constructed
            # solution
            log("Solution '%s' contains %d grid(s) and %d equation(s)." %
                (yc_soln.get_name(), yc_soln.get_num_grids(),
                 yc_soln.get_num_equations()))
        except Exception:
            # Narrowed from a bare ``except``: failed translations fall back
            # to the plain Devito implementation, while system-exiting
            # exceptions (KeyboardInterrupt, SystemExit) now propagate
            self.yk_soln = YaskNullSolution()
            processed = nodes
            log("Unable to offload a candidate tree.")
    else:
        exit("Found more than one offloadable trees in a single Operator")

    # Some Iteration/Expression trees are not offloaded to YASK and may
    # require further processing to be executed in YASK, due to the
    # differences in storage layout employed by Devito and YASK
    processed = make_grid_accesses(processed)

    # Update the parameters list adding all necessary YASK grids
    for i in list(parameters):
        try:
            if i.from_YASK:
                parameters.append(Object(namespace['code-grid-name'](i.name),
                                         namespace['type-grid'],
                                         i.data.rawpointer))
        except AttributeError:
            # Ignore e.g. Dimensions
            pass

    log("Specialization successfully performed!")

    return processed
def _create_elemental_functions(self, state, **kwargs):
    """
    Extract :class:`Iteration` sub-trees and move them into :class:`Function`s.

    Currently, only tagged, elementizable Iteration objects are targeted.

    :param state: Object carrying the Iteration/Expression trees to process
                  in ``state.nodes``.
    :returns: A dict with the transformed ``'nodes'`` (each outlined sub-tree
              replaced by a call) and the newly created
              ``'elemental_functions'``.
    """
    # Optional compiler-specific "do not inline" decoration, attached as a
    # header to every generated call site.
    noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

    functions = OrderedDict()
    processed = []
    for node in state.nodes:
        mapper = {}
        for tree in retrieve_iteration_tree(node, mode='superset'):
            # Search an elementizable sub-tree (if any): the outermost
            # tagged Iteration ('asap') that is also marked Elementizable
            tagged = filter_iterations(tree, lambda i: i.tag is not None, 'asap')
            if not tagged:
                continue
            root = tagged[0]
            if not root.is_Elementizable:
                continue
            # The sub-tree to be outlined: the tagged root and everything below it
            target = tree[tree.index(root):]

            # Elemental function arguments
            args = []  # Found so far (scalars, tensors)
            maybe_required = set()  # Scalars that *may* have to be passed in
            not_required = set()  # Elemental function locally declared scalars

            # Build a new Iteration/Expression tree with free bounds: each
            # loop's limits become scalar parameters of the new function
            free = []
            for i in target:
                name, bounds = i.dim.name, i.bounds_symbolic
                # Iteration bounds: caller passes the concrete start/finish
                start = ScalarFunction(name='%s_start' % name, dtype=np.int32)
                finish = ScalarFunction(name='%s_finish' % name, dtype=np.int32)
                args.extend(zip([ccode(j) for j in bounds], (start, finish)))
                # Iteration unbounded indices: one scalar parameter per
                # unbounded index carried by this Iteration
                ufunc = [ScalarFunction(name='%s_ub%d' % (name, j), dtype=np.int32)
                         for j in range(len(i.uindices))]
                args.extend(zip([ccode(j.start) for j in i.uindices], ufunc))
                limits = [Symbol(start.name), Symbol(finish.name), 1]
                uindices = [UnboundedIndex(j.index, i.dim + as_symbol(k))
                            for j, k in zip(i.uindices, ufunc)]
                free.append(i._rebuild(limits=limits, offsets=None,
                                       uindices=uindices))
                # Loop dimensions and unbounded indices are declared inside
                # the elemental function, so they must not become parameters
                not_required.update({i.dim}, set(j.index for j in i.uindices))

            # Construct elemental function body, and inspect it
            free = NestedTransformer(dict((zip(target, free)))).visit(root)
            expressions = FindNodes(Expression).visit(free)
            fsymbols = FindSymbols('symbolics').visit(free)

            # Retrieve symbolic arguments; the call-site spelling depends on
            # the symbol kind (raw cast, "_vec" suffix, or plain name)
            for i in fsymbols:
                if i.is_TensorFunction:
                    args.append(("(%s*)%s" % (c.dtype_to_ctype(i.dtype),
                                              i.name), i))
                elif i.is_TensorData:
                    args.append(("%s_vec" % i.name, i))
                elif i.is_ConstantData:
                    args.append((i.name, i))

            # Retrieve scalar arguments: free symbols appearing in the body
            # minus anything the function declares or receives as a tensor
            not_required.update({i.output for i in expressions if i.is_scalar})
            maybe_required.update(set(FindSymbols(mode='free-symbols').visit(free)))
            for i in fsymbols:
                not_required.update({as_symbol(i), i.indexify()})
                # Symbols appearing in a tensor's shape may be needed to
                # compute array strides inside the function
                for j in i.symbolic_shape:
                    maybe_required.update(j.free_symbols)
            # filter_sorted by name: deterministic parameter ordering
            required = filter_sorted(maybe_required - not_required,
                                     key=attrgetter('name'))
            args.extend([(i.name, ScalarFunction(name=i.name, dtype=np.int32))
                         for i in required])

            call, params = zip(*args)
            handle = flatten([p.rtargs for p in params])
            # Function name derived from the Iteration tag, so identical tags
            # share one Function via setdefault below
            name = "f_%d" % root.tag

            # Produce the new FunCall
            mapper[root] = List(header=noinline, body=FunCall(name, call))

            # Produce the new Function
            functions.setdefault(name, Function(name, free, 'void', handle,
                                                ('static',)))

        # Transform the main tree
        processed.append(Transformer(mapper).visit(node))

    return {'nodes': processed, 'elemental_functions': functions.values()}
def _loop_blocking(self, state, **kwargs):
    """
    Apply loop blocking to :class:`Iteration` trees.

    By default, the blocked :class:`Iteration` objects and the block size are
    determined heuristically. The heuristic consists of searching the deepest
    Iteration/Expression tree and blocking all dimensions except:

        * The innermost (eg, to retain SIMD vectorization);
        * Those dimensions inducing loop-carried dependencies.

    The caller may take over the heuristic through ``kwargs['blocking']``,
    a dictionary indicating the block size of each blocked dimension. For
    example, for the :class:`Iteration` tree below: ::

        for i
          for j
            for k
              ...

    one may pass in ``kwargs['blocking'] = {i: 4, j: 7}``, in which case the
    two outer loops would be blocked, and the resulting 2-dimensional block
    would be of size 4x7.
    """
    # Each blocked Iteration gets a (main, leftover) pair of unit-stride
    # loops; 'leftover' covers the tail when size % block_size != 0
    Region = namedtuple('Region', 'main leftover')

    blocked = OrderedDict()  # blocked Iteration -> its block Dimension
    processed = []
    for node in state.nodes:
        mapper = {}
        for tree in retrieve_iteration_tree(node):
            # Is the Iteration tree blockable ?
            iterations = [i for i in tree if i.is_Parallel]
            # Unless 'blockinner' was requested, keep the vectorizable
            # (innermost) loop out of the blocking
            if 'blockinner' not in self.params:
                iterations = [i for i in iterations if not i.is_Vectorizable]
            if not iterations:
                continue
            root = iterations[0]
            # Only perfect nests are blocked
            if not IsPerfectIteration().visit(root):
                continue

            # Construct the blocked loop nest, as well as all necessary
            # remainder loops
            regions = OrderedDict()
            blocked_iterations = []
            for i in iterations:
                # Build Iteration over blocks
                dim = blocked.setdefault(i, Dimension("%s_block" % i.dim.name))
                block_size = dim.symbolic_size
                iter_size = i.dim.size or i.dim.symbolic_size
                start = i.limits[0] - i.offsets[0]
                finish = iter_size - i.offsets[1]
                # Round 'finish' down so the blocked region holds a whole
                # number of blocks; the remainder is handled by 'leftover'
                finish = finish - ((finish - i.offsets[1]) % block_size)
                inter_block = Iteration([], dim, [start, finish, block_size],
                                        properties=as_tuple('parallel'))

                # Build Iteration within a block
                start = inter_block.dim
                finish = start + block_size
                properties = 'vector-dim' if i.is_Vectorizable else None
                intra_block = Iteration([], i.dim, [start, finish, 1], i.index,
                                        properties=as_tuple(properties))

                blocked_iterations.append((inter_block, intra_block))

                # Build unitary-increment Iteration over the 'main' region
                # (the one blocked); necessary to generate code iterating over
                # non-blocked ("remainder") iterations.
                start = inter_block.limits[0]
                finish = inter_block.limits[1]
                main = Iteration([], i.dim, [start, finish, 1], i.index,
                                 properties=i.properties)

                # Build unitary-increment Iteration over the 'leftover' region:
                # again as above, this may be necessary when the dimension size
                # is not a multiple of the block size.
                start = inter_block.limits[1]
                finish = iter_size - i.offsets[1]
                leftover = Iteration([], i.dim, [start, finish, 1], i.index,
                                     properties=i.properties)

                regions[i] = Region(main, leftover)

            # Interleave inter/intra-block loops and graft the innermost
            # body underneath
            blocked_tree = list(flatten(zip(*blocked_iterations)))
            blocked_tree = compose_nodes(blocked_tree + [iterations[-1].nodes])

            # Build remainder loops: one nest per non-empty subset of the
            # blocked dimensions, using 'leftover' for dimensions in the
            # subset and 'main' for the others
            remainder_tree = []
            for n in range(len(iterations)):
                for i in combinations(iterations, n + 1):
                    nodes = [v.leftover if k in i else v.main
                             for k, v in regions.items()]
                    nodes += [iterations[-1].nodes]
                    remainder_tree.append(compose_nodes(nodes))

            # Will replace with blocked loop tree
            mapper[root] = List(body=[blocked_tree] + remainder_tree)

        rebuilt = Transformer(mapper).visit(node)

        processed.append(rebuilt)

    # All blocked dimensions
    if not blocked:
        return {'nodes': processed}

    # Determine the block shape
    blockshape = self.params.get('blockshape')
    if not blockshape:
        # Use trivial heuristic for a suitable blockshape
        def heuristic(dim_size):
            ths = 8  # FIXME: This really needs to be improved
            return ths if dim_size > ths else 1
        # NOTE(review): this stores the heuristic *function* as each value,
        # not a size; presumably a downstream consumer (e.g. BlockingArg)
        # calls it — confirm against the argument-processing code
        blockshape = {k: heuristic for k in blocked.keys()}
    else:
        try:
            nitems, nrequired = len(blockshape), len(blocked)
            # Pair user-provided sizes with blocked dimensions positionally
            blockshape = {k: v for k, v in zip(blocked, blockshape)}
            if nitems > nrequired:
                dle_warning("Provided 'blockshape' has more entries than "
                            "blocked loops; dropping entries ...")
            if nitems < nrequired:
                dle_warning("Provided 'blockshape' has fewer entries than "
                            "blocked loops; dropping dimensions ...")
        except TypeError:
            # 'blockshape' was a single scalar, not an iterable: apply it
            # to the first blocked dimension only
            blockshape = {list(blocked)[0]: blockshape}
        # Dimensions without an explicit size fall back to None
        blockshape.update({k: None for k in blocked.keys()
                           if k not in blockshape})

    # Track any additional arguments required to execute /state.nodes/
    arguments = [BlockingArg(v, k, blockshape[k]) for k, v in blocked.items()]

    return {'nodes': processed, 'arguments': arguments, 'flags': 'blocking'}