def _ompize(self, state, **kwargs):
    """
    Add OpenMP pragmas to the Iteration/Expression tree to emit parallel code.

    For each Iteration tree found in each node of ``state.nodes``, the
    outermost run of consecutive Iterations that are parallel and not
    vectorizable is selected. Its first Iteration is replaced by a
    :class:`Block` whose header is ``omplang['par-region']`` and whose body
    consists of the Denormals nodes, the ``omp for`` (or ``omp collapse``)
    element, and the Iteration itself.

    :param state: Object exposing ``nodes``, the trees to be transformed.
    :param kwargs: Not used by this method.
    :returns: A dict ``{'nodes': processed}`` with one transformed tree per
              input node.
    """
    # The search spans all of ``state.nodes``, hence its outcome does not
    # depend on the node being processed: perform it only once
    denormals = FindNodes(Denormals).visit(state.nodes)

    # ``psutil.cpu_count(logical=False)`` returns None if the number of
    # physical cores cannot be determined. Comparing None with an integer
    # raises a TypeError on Python 3, so an unknown core count is handled
    # explicitly below (conservatively: no collapsing)
    ncores = psutil.cpu_count(logical=False)

    def is_candidate(i):
        return i.is_Parallel and not i.is_Vectorizable

    processed = []
    for node in state.nodes:
        # Reset denormals flag each time a parallel region is entered
        mapper = {i: List(c.Comment('DLE: moved denormals flag'))
                  for i in denormals}

        # Handle parallelizable loops
        for tree in retrieve_iteration_tree(node):
            # Determine the number of consecutive parallelizable Iterations
            candidates = filter_iterations(tree, key=is_candidate,
                                           stop='consecutive')
            if not candidates:
                continue

            # Heuristic: if at least two parallel loops are available and the
            # physical core count is not lower than
            # self.thresholds['collapse'], then omp-collapse the loops
            nparallel = len(candidates)
            if (ncores is None or
                    ncores < self.thresholds['collapse'] or
                    nparallel < 2):
                parallelism = omplang['for']
            else:
                parallelism = omplang['collapse'](nparallel)

            root = candidates[0]
            mapper[root] = Block(header=omplang['par-region'],
                                 body=denormals + [Element(parallelism), root])

        processed.append(Transformer(mapper).visit(node))

    return {'nodes': processed}
def test_tti_clusters_to_graph():
    """
    Clusterize the expressions of the TTI forward operator's elemental
    functions, rewrite the first cluster in 'basic' mode, and check that the
    resulting trace has as many tensor nodes as the original trace had
    entries, with every node either reading or being read.
    """
    solver = tti_operator()

    found = FindNodes(Expression).visit(solver.op_fwd.elemental_functions)
    exprs = [node.expr for node in found]
    clusters = clusterize(exprs, solver.op_fwd._retrieve_stencils(exprs))
    assert len(clusters) == 3

    original = clusters[0]
    n_output_tensors = len(original.trace)

    rewritten = rewrite([original], mode='basic')
    assert len(rewritten) == 1

    graph = rewritten[0].trace
    tensors = [v for v in graph.values() if v.is_tensor]
    assert len(tensors) == n_output_tensors  # u and v
    assert all(v.reads or v.readby for v in graph.values())
def test_loops_ompized(fa, fb, fc, fd, t0, t1, t2, t3, exprs, expected, iters):
    """
    Build a loop nest around ``exprs``, apply the 'openmp' transformation and
    check, Iteration by Iteration, that an ``omp for`` pragma is attached
    exactly where ``expected`` says so.
    """
    scope = [fa, fb, fc, fd, t0, t1, t2, t3]
    body = [Expression(EVAL(expr, *scope)) for expr in exprs]
    tree = iters[6](iters[7](body))

    transformed = transform(tree, mode='openmp').nodes
    assert len(transformed) == 1

    iterations = FindNodes(Iteration).visit(transformed[0])
    assert len(iterations) == len(expected)

    # Check for presence of pragma omp
    for iteration, is_parallel in zip(iterations, expected):
        pragmas = iteration.pragmas
        if is_parallel is True:
            assert len(pragmas) == 1
            assert 'omp for' in pragmas[0].value
        else:
            assert all('omp for' not in pragma.value for pragma in pragmas)
def make_grid_accesses(node):
    """
    Construct a new Iteration/Expression based on ``node``, in which all
    :class:`interfaces.Indexed` accesses have been converted into YASK grid
    accesses.

    :param node: The Iteration/Expression tree to be translated.
    :returns: The tree obtained by visiting ``node`` with a
              :class:`Transformer` that replaces each Expression.
    """

    def to_grid_gets(expr):
        # Replace, recursively through the indices, each Indexed whose
        # function is flagged ``from_YASK`` with a grid-get call
        carriers = [i for i in retrieve_indexed(expr)
                    if i.base.function.from_YASK]
        subs = {}
        for access in carriers:
            grid = namespace['code-grid-name'](access.base.function.name)
            indices = [INT(to_grid_gets(j)) for j in access.indices]
            subs[access] = make_sharedptr_funcall(namespace['code-grid-get'],
                                                  indices, grid)
        return expr.xreplace(subs)

    mapper = {}
    for e in FindNodes(Expression).visit(node):
        lhs, rhs = e.expr.args

        # RHS translation
        rhs = to_grid_gets(rhs)

        # LHS translation
        if not e.output_function.from_YASK:
            # Writing to a scalar temporary
            mapper[e] = Expression(e.expr.func(lhs, rhs), dtype=e.dtype)
            continue

        grid = namespace['code-grid-name'](e.output_function.name)
        args = [rhs] + [INT(to_grid_gets(i)) for i in lhs.indices]
        put = make_sharedptr_funcall(namespace['code-grid-put'], args, grid)
        mapper[e] = Element(c.Statement(ccode(put)))

    return Transformer(mapper).visit(node)
def _profile_sections(self, nodes):
    """
    Introduce C-level profiling nodes within the Iteration/Expression tree.

    For each section found in ``nodes``, the first Iteration that is a
    perfect loop nest is wrapped in a :class:`TimedList`; the timer is
    registered with ``self.profiler`` and a :class:`Profile` is stored in
    ``self.sections``, keyed by the section.

    :param nodes: The Iteration/Expression trees to be instrumented.
    :returns: The result of transforming ``List(body=nodes)``.
    """
    timed = {}
    for node in nodes:
        for itspace in FindSections().visit(node):
            for iteration in itspace:
                if not IsPerfectIteration().visit(iteration):
                    continue

                # Insert `TimedList` block. This should come from
                # the profiler, but we do this manually for now.
                lname = 'loop_%s_%d' % (iteration.index, len(timed))
                timed[iteration] = TimedList(gname=self.profiler.varname,
                                             lname=lname, body=iteration)
                self.profiler.add(lname)

                # Estimate computational properties of the timed section
                # (operational intensity, memory accesses)
                found = FindNodes(Expression).visit(iteration)
                ops = estimate_cost([e.expr for e in found])
                memory = estimate_memory([e.expr for e in found])
                self.sections[itspace] = Profile(lname, ops, memory)

                # Only the outermost perfect nest of a section is timed
                break

    return Transformer(timed).visit(List(body=nodes))
def _padding(self, state, **kwargs):
    """
    Introduce temporary buffers padded to the nearest multiple of the vector
    length, to maximize data alignment. At the bottom of the kernel, the
    values in the padded temporaries will be copied back into the input arrays.

    :param state: Object exposing ``nodes``, the trees to be transformed.
    :param kwargs: Not used by this method.
    :returns: A dict ``{'nodes': processed}``, where ``processed`` is the
              concatenation of the initialization trees, the input trees
              with the padded buffers substituted in, and the copy-back
              trees.
    """
    mapper = OrderedDict()
    for node in state.nodes:
        # Assess feasibility of the transformation
        handle = FindSymbols('symbolics-writes').visit(node)
        if not handle:
            continue

        shape = max([i.shape for i in handle], key=len)
        if not shape:
            continue

        # An object with an empty shape has no innermost dimension to pad;
        # skip it rather than indexing into the empty shape
        candidates = [i for i in handle
                      if i.shape and i.shape[-1] == shape[-1]]
        if not candidates:
            continue

        # Retrieve the maximum number of items in a SIMD register when
        # processing the expressions in /node/
        exprs = FindNodes(Expression).visit(node)
        exprs = [e for e in exprs if e.output_function in candidates]
        assert len(exprs) > 0
        dtype = exprs[0].dtype
        assert all(e.dtype == dtype for e in exprs)
        try:
            simd_items = get_simd_items(dtype)
        except KeyError:
            # Fallback to 16 (maximum expectable padding, for AVX512
            # registers). Floor division is used so that the item count, and
            # in turn the padded shape, stays integral under true division
            # NOTE(review): assumes simdinfo values are integers -- confirm
            simd_items = simdinfo['avx512f'] // np.dtype(dtype).itemsize

        shapes = {k: k.shape[:-1] + (roundm(k.shape[-1], simd_items),)
                  for k in candidates}
        for k in candidates:
            padded = TensorFunction(name='p%s' % k.name, shape=shapes[k],
                                    dimensions=k.indices, onstack=k._mem_stack)
            mapper[k.indexed] = padded.indexed

    # Substitute original arrays with padded buffers
    processed = [SubstituteExpression(mapper).visit(n) for n in state.nodes]

    # Build Iteration trees for initialization and copy-back of padded arrays
    mapper = OrderedDict([(k, v) for k, v in mapper.items()
                          if k.function.is_SymbolicData])
    init = copy_arrays(mapper, reverse=True)
    copyback = copy_arrays(mapper)

    processed = init + as_tuple(processed) + copyback

    return {'nodes': processed}
def _ompize(self, state, **kwargs):
    """
    Add OpenMP pragmas to the Iteration/Expression tree to emit parallel code.

    Iteration trees are first grouped, so that consecutive tagged trees end
    up in the same OpenMP parallel region. The first parallelizable Iteration
    of each tree gets an ``omp for`` (or ``omp collapse``) pragma; then, each
    group is replaced by a single :class:`Block` whose header is the
    parallel-region pragma, carrying a ``private`` clause for the
    stack-allocated tensor temporaries.

    :param state: Object exposing ``nodes``, the trees to be transformed.
    :param kwargs: Not used by this method.
    :returns: A dict ``{'nodes': processed}`` with the transformed trees;
              trees for which the transformation returns None are dropped.
    """
    # The search spans all of ``state.nodes``, hence its outcome does not
    # depend on the node being processed: perform it only once
    denormals = FindNodes(Denormals).visit(state.nodes)

    # ``psutil.cpu_count(logical=False)`` returns None if the number of
    # physical cores cannot be determined. Comparing None with an integer
    # raises a TypeError on Python 3, so an unknown core count is handled
    # explicitly below (conservatively: no collapsing)
    ncores = psutil.cpu_count(logical=False)

    def is_candidate(i):
        return i.is_Parallel and not (i.is_Elementizable or i.is_Vectorizable)

    processed = []
    for node in state.nodes:
        # Reset denormals flag each time a parallel region is entered
        mapper = OrderedDict([(i, None) for i in denormals])

        # Group by outer loop so that we can embed within the same parallel
        # region
        was_tagged = False
        groups = OrderedDict()
        for tree in retrieve_iteration_tree(node):
            # Determine the number of consecutive parallelizable Iterations
            candidates = filter_iterations(tree, key=is_candidate, stop='asap')
            if not candidates:
                was_tagged = False
                continue
            # Consecutive tagged Iteration go in the same group: the group
            # index is that of the last group if both this and the previous
            # tree are tagged, otherwise a fresh one
            is_tagged = any(i.tag is not None for i in tree)
            group_id = len(groups) - (is_tagged & was_tagged)
            group = groups.setdefault(group_id, OrderedDict())
            group[candidates[0]] = candidates
            was_tagged = is_tagged

        # Handle parallelizable loops
        for group in groups.values():
            private = []
            for root, tree in group.items():
                # Heuristic: if at least two parallel loops are available and
                # the physical core count is not lower than
                # self.thresholds['collapse'], then omp-collapse the loops
                nparallel = len(tree)
                if (ncores is None or
                        ncores < self.thresholds['collapse'] or
                        nparallel < 2):
                    parallel = omplang['for']
                else:
                    parallel = omplang['collapse'](nparallel)

                mapper[root] = root._rebuild(pragmas=root.pragmas + (parallel,))

                # Track the thread-private and thread-shared variables
                private.extend([i for i in FindSymbols('symbolics').visit(root)
                                if i.is_TensorFunction and i._mem_stack])

            # Build the parallel region
            names = sorted(set(i.name for i in private))
            clause = ('private(%s)' % ','.join(names)) if names else ''
            rebuilt = [v for k, v in mapper.items() if k in group]
            par_region = Block(header=omplang['par-region'](clause),
                               body=denormals + rebuilt)
            # Entries still holding an Iteration belong to the group just
            # processed (earlier groups already hold a Block): point them to
            # the parallel region, or to None if they are remainder loops
            for k, v in list(mapper.items()):
                if isinstance(v, Iteration):
                    mapper[k] = None if v.is_Remainder else par_region

        handle = Transformer(mapper).visit(node)
        if handle is not None:
            processed.append(handle)

    return {'nodes': processed}
def autotune(operator, arguments, tunable, mode='basic'):
    """
    Acting as a high-order function, take as input an operator and a list of
    operator arguments to perform empirical autotuning. Some of the operator
    arguments are marked as tunable.

    :param operator: The operator to be tuned; it is run once for each legal
                     block size.
    :param arguments: Mapping from argument names to values. It is not
                      modified: a copy is used for the tuning runs, with the
                      values of the operator's outputs copied too.
    :param tunable: The tunable arguments (block sizes).
    :param mode: If ``'aggressive'``, more block sizes are attempted.
    :returns: An ordered mapping like ``arguments``, with the tunable entries
              set to the fastest block size. If no legal block size is found,
              ``arguments`` itself is returned.
    """
    at_arguments = arguments.copy()

    # User-provided output data must not be altered
    output = [i.name for i in operator.output]
    for k, v in arguments.items():
        if k in output:
            at_arguments[k] = v.copy()

    # Squeeze dimensions to minimize auto-tuning time
    iterations = FindNodes(Iteration).visit(operator.body)
    squeezable = [i.dim.parent.name for i in iterations
                  if i.is_Sequential and i.dim.is_Buffered]

    # Attempted block sizes
    mapper = OrderedDict([(i.argument.name, i) for i in tunable])
    blocksizes = [OrderedDict([(i, v) for i in mapper])
                  for v in options['at_blocksize']]
    if mode == 'aggressive':
        blocksizes = more_heuristic_attempts(blocksizes)

    # Note: there is only a single loop over 'blocksize' because only
    # square blocks are tested
    timings = OrderedDict()
    for blocksize in blocksizes:
        illegal = False
        # Only values of existing keys are overwritten, so updating
        # ``at_arguments`` while iterating over it is safe
        for k in at_arguments:
            if k in blocksize:
                val = blocksize[k]
                handle = at_arguments.get(mapper[k].original_dim.name)
                if val <= mapper[k].iteration.end(handle):
                    at_arguments[k] = val
                else:
                    # Block size cannot be larger than actual dimension
                    illegal = True
                    break
            elif k in squeezable:
                at_arguments[k] = options['at_squeezer']
        if illegal:
            continue

        # Add profiler structs
        at_arguments.update(operator._extra_arguments())

        operator.cfunction(*list(at_arguments.values()))
        elapsed = sum(operator.profiler.timings.values())
        timings[tuple(blocksize.items())] = elapsed
        info_at("<%s>: %f" % (','.join('%d' % i for i in blocksize.values()),
                              elapsed))

    # All attempted block sizes may have been illegal, in which case there
    # is nothing to pick from (``min`` would raise a ValueError)
    if not timings:
        info("Auto-tuning request, but couldn't find legal block sizes")
        return arguments

    best = dict(min(timings, key=timings.get))
    info('Auto-tuned block shape: %s' % best)

    # Build the new argument list
    tuned = OrderedDict()
    for k, v in arguments.items():
        tuned[k] = best[k] if k in mapper else v

    return tuned
def _create_elemental_functions(self, state, **kwargs):
    """
    Extract :class:`Iteration` sub-trees and move them into :class:`Function`s.

    Currently, only tagged, elementizable Iteration objects are targeted.

    :param state: Object exposing ``nodes``, the trees to be transformed.
    :param kwargs: Not used by this method.
    :returns: A dict with the transformed trees under ``'nodes'`` and the
              values of the name-to-Function mapping under
              ``'elemental_functions'``.
    """
    noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

    functions = OrderedDict()
    processed = []
    for node in state.nodes:
        mapper = {}
        for tree in retrieve_iteration_tree(node, mode='superset'):
            # Search an elementizable sub-tree (if any)
            tagged = filter_iterations(tree, lambda i: i.tag is not None, 'asap')
            if not tagged:
                continue
            root = tagged[0]
            if not root.is_Elementizable:
                continue
            target = tree[tree.index(root):]

            # Elemental function arguments, as (call-site code, parameter)
            args = []  # Found so far (scalars, tensors)
            maybe_required = set()  # Scalars that *may* have to be passed in
            not_required = set()  # Elemental function locally declared scalars

            # Build a new Iteration/Expression tree with free bounds
            free = []
            for iteration in target:
                name, bounds = iteration.dim.name, iteration.bounds_symbolic

                # Iteration bounds
                start = ScalarFunction(name='%s_start' % name, dtype=np.int32)
                finish = ScalarFunction(name='%s_finish' % name, dtype=np.int32)
                args.extend(zip([ccode(b) for b in bounds], (start, finish)))

                # Iteration unbounded indices
                ufunc = [ScalarFunction(name='%s_ub%d' % (name, n), dtype=np.int32)
                         for n in range(len(iteration.uindices))]
                args.extend(zip([ccode(u.start) for u in iteration.uindices],
                                ufunc))

                limits = [Symbol(start.name), Symbol(finish.name), 1]
                uindices = [UnboundedIndex(u.index, iteration.dim + as_symbol(f))
                            for u, f in zip(iteration.uindices, ufunc)]
                free.append(iteration._rebuild(limits=limits, offsets=None,
                                               uindices=uindices))

                not_required.add(iteration.dim)
                not_required.update(u.index for u in iteration.uindices)

            # Construct elemental function body, and inspect it
            free = NestedTransformer(dict(zip(target, free))).visit(root)
            expressions = FindNodes(Expression).visit(free)
            fsymbols = FindSymbols('symbolics').visit(free)

            # Retrieve symbolic arguments
            for f in fsymbols:
                if f.is_TensorFunction:
                    cast = "(%s*)%s" % (c.dtype_to_ctype(f.dtype), f.name)
                    args.append((cast, f))
                elif f.is_TensorData:
                    args.append(("%s_vec" % f.name, f))
                elif f.is_ConstantData:
                    args.append((f.name, f))

            # Retrieve scalar arguments
            not_required.update(e.output for e in expressions if e.is_scalar)
            maybe_required.update(FindSymbols(mode='free-symbols').visit(free))
            for f in fsymbols:
                not_required.add(as_symbol(f))
                not_required.add(f.indexify())
                for size in f.symbolic_shape:
                    maybe_required.update(size.free_symbols)
            required = filter_sorted(maybe_required - not_required,
                                     key=attrgetter('name'))
            args.extend((s.name, ScalarFunction(name=s.name, dtype=np.int32))
                        for s in required)

            call, params = zip(*args)
            parameters = flatten([p.rtargs for p in params])
            name = "f_%d" % root.tag

            # Produce the new FunCall
            mapper[root] = List(header=noinline, body=FunCall(name, call))

            # Produce the new Function; if one with this name has already
            # been produced, the existing one is retained
            function = Function(name, free, 'void', parameters, ('static',))
            functions.setdefault(name, function)

        # Transform the main tree
        processed.append(Transformer(mapper).visit(node))

    return {'nodes': processed, 'elemental_functions': functions.values()}
def autotune(operator, arguments, tunable):
    """
    Acting as a high-order function, take as input an operator and a list of
    operator arguments to perform empirical autotuning. Some of the operator
    arguments are marked as tunable.

    :param operator: The operator to be tuned; it is run once for each block
                     size that is legal and fits the stack limit.
    :param arguments: Mapping from argument names to values. It is not
                      modified: a copy is used for the tuning runs, with the
                      values of the operator's outputs copied too.
    :param tunable: The tunable arguments (block sizes).
    :returns: An ordered mapping like ``arguments``, with the tunable entries
              set to the fastest block size and a fresh profiling struct.
              ``arguments`` itself is returned if the loop structure is not
              understood or no legal block size is found.
    """
    at_arguments = arguments.copy()

    # User-provided output data must not be altered
    output = [i.name for i in operator.output]
    for name, value in arguments.items():
        if name in output:
            at_arguments[name] = value.copy()

    iterations = FindNodes(Iteration).visit(operator.body)
    dim_mapper = {i.dim.name: i.dim for i in iterations}

    # Shrink the iteration space of sequential dimensions so that auto-tuner
    # runs take a negligible amount of time
    sequentials = [i for i in iterations if i.is_Sequential]
    if len(sequentials) > 1:
        info_at("Couldn't understand loop structure, giving up auto-tuning")
        return arguments
    if not sequentials:
        timesteps = 1
    else:
        sequential = sequentials[0]
        if sequential.dim.is_Buffered:
            squeeze = sequential.dim.parent
        else:
            squeeze = sequential.dim
        timesteps = sequential.extent(finish=options['at_squeezer'])
        if timesteps < 0:
            timesteps = options['at_squeezer'] - timesteps + 1
            info_at("Adjusted auto-tuning timestep to %d" % timesteps)
        at_arguments[squeeze.symbolic_size.name] = timesteps

    # Attempted block sizes
    mapper = OrderedDict([(i.argument.symbolic_size.name, i) for i in tunable])
    blocksizes = [OrderedDict([(name, size) for name in mapper])
                  for size in options['at_blocksize']]
    if configuration['autotuning'] == 'aggressive':
        blocksizes = more_heuristic_attempts(blocksizes)

    # How many temporaries are allocated on the stack?
    # Will drop block sizes that might lead to a stack overflow
    functions = FindSymbols('symbolics').visit(operator.body +
                                               operator.elemental_functions)
    stack_shapes = [f.shape for f in functions
                    if f.is_TensorFunction and f._mem_stack]
    stack_items = sum(reduce(mul, shape, 1) for shape in stack_shapes)
    stack_space = stack_items*operator.dtype().itemsize

    # Note: there is only a single loop over 'blocksize' because only
    # square blocks are tested
    timings = OrderedDict()
    for bs in blocksizes:
        illegal = False
        # Only values of existing keys are overwritten while iterating
        for name in at_arguments:
            if name not in bs:
                continue
            size = bs[name]
            tunee = mapper[name]
            extent = at_arguments.get(tunee.original_dim.symbolic_size.name)
            if size <= tunee.iteration.end(extent):
                at_arguments[name] = size
            else:
                # Block size cannot be larger than actual dimension
                illegal = True
                break
        if illegal:
            continue

        # Make sure we remain within stack bounds, otherwise skip block size
        dim_sizes = {}
        for name, value in at_arguments.items():
            if name in bs:
                dim_sizes[mapper[name].argument.symbolic_size] = bs[name]
            elif name in dim_mapper:
                dim_sizes[dim_mapper[name].symbolic_size] = value
        try:
            bs_stack_space = stack_space.xreplace(dim_sizes)
        except AttributeError:
            # ``stack_space`` has no ``xreplace``: use it as is
            bs_stack_space = stack_space
        try:
            if int(bs_stack_space) > options['at_stack_limit']:
                continue
        except TypeError:
            # We should never get here
            info_at("Couldn't determine stack size, skipping block size %s" % str(bs))
            continue

        # Use AT-specific profiler structs
        at_arguments[operator.profiler.varname] = operator.profiler.setup()

        operator.cfunction(*list(at_arguments.values()))
        elapsed = sum(operator.profiler.timings.values())
        timings[tuple(bs.items())] = elapsed
        shape = ','.join('%d' % i for i in bs.values())
        info_at("Block shape <%s> took %f (s) in %d time steps" %
                (shape, elapsed, timesteps))

    try:
        best = dict(min(timings, key=timings.get))
        info("Auto-tuned block shape: %s" % best)
    except ValueError:
        info("Auto-tuning request, but couldn't find legal block sizes")
        return arguments

    # Build the new argument list
    tuned = OrderedDict()
    for name, value in arguments.items():
        tuned[name] = best[name] if name in mapper else value

    # Reset the profiling struct
    assert operator.profiler.varname in tuned
    tuned[operator.profiler.varname] = operator.profiler.setup()

    return tuned
def create_profile(node):
    """
    Create a :class:`Profiler` for the Iteration/Expression tree ``node``.
    The following code sections are profiled: ::

        * The whole ``node``;
        * A sequence of perfectly nested loops that have common :class:`Iteration`
          dimensions, but possibly different extent. For example: ::

            for x = 0 to N
              ..
            for x = 1 to N-1
              ..

          Both Iterations have dimension ``x``, and will be profiled as a single
          section, though their extent is different.
        * Any perfectly nested loops.

    :param node: The Iteration/Expression tree to be profiled.
    :returns: A 2-tuple ``(processed, profiler)``: the tree with the C-level
              timers introduced, and the :class:`Profiler` tracking them.
    """
    profiler = Profiler()

    # Group by root Iteration
    by_root = OrderedDict()
    for itspace in FindSections().visit(node):
        by_root.setdefault(itspace[0], []).append(itspace)

    def dims(itspace):
        return {i.dim for i in itspace}

    # Group sections if their iteration spaces overlap
    found = []
    for itspaces in by_root.values():
        current = []
        for itspace in itspaces:
            if current and dims(itspace) != dims(current[0]):
                # Found a timing section
                found.append(tuple(current))
                current = []
            current.append(itspace)
        if current:
            found.append(tuple(current))

    # Create and track C-level timers
    mapper = OrderedDict()
    for n, group in enumerate(found):
        name = 'section_%d' % n
        section = group[0]

        # Skip the outermost Iteration if it is not a perfect loop nest
        # (and there is an inner one to fall back to)
        if len(section) > 1 and not IsPerfectIteration().visit(section[0]):
            index = 1
        else:
            index = 0
        root = section[index]

        # Prepare to transform the Iteration/Expression tree
        body = tuple(itspace[index] for itspace in group)
        mapper[root] = TimedList(gname=profiler.varname, lname=name, body=body)
        for itspace in group[1:]:
            mapper[itspace[index]] = None

        # Estimate computational properties of the profiled section
        exprs = [e.expr for e in FindNodes(Expression).visit(body)]
        ops = estimate_cost(exprs)
        memory = estimate_memory(list(exprs))

        # Keep track of the new profiled section
        profiler.add(name, section, ops, memory)

    # Transform the Iteration/Expression tree introducing the C-level timers
    processed = Transformer(mapper).visit(node)

    return processed, profiler
def optimize_unfolded_tree(unfolded, root):
    """
    Transform folded trees to reduce the memory footprint.

    Examples
    ========
    Given:

        .. code-block::
            for i = 1 to N - 1  # Folded tree
              for j = 1 to N - 1
                tmp[i,j] = ...
            for i = 2 to N - 2  # Root
              for j = 2 to N - 2
                ... = ... tmp[i,j] ...

    The temporary ``tmp`` has shape ``(N-1, N-1)``. However, as soon as the
    iteration space is blocked, with blocks of shape ``(i_bs, j_bs)``, the
    ``tmp`` shape can be shrunk to ``(i_bs-1, j_bs-1)``. The resulting
    iteration tree becomes:

        .. code-block::
            for i = 1 to i_bs + 1  # Folded tree
              for j = 1 to j_bs + 1
                i' = i + i_block - 2
                j' = j + j_block - 2
                tmp[i,j] = ... # use i' and j'
            for i = i_block to i_block + i_bs  # Root
              for j = j_block to j_block + j_bs
                i' = i - i_block
                j' = j - j_block
                ... = ... tmp[i',j'] ...

    :param unfolded: The folded trees, each a sequence of Iterations as long
                     as ``root``.
    :param root: The sequence of Iterations of the root tree.
    :returns: A list with the transformed folded trees followed by the
              transformed root.
    """
    processed = []
    for n, tree in enumerate(unfolded):
        # NOTE(review): ``root`` is rebound at the bottom of this loop to a
        # transformed tree; with more than one folded tree, later iterations
        # would use that instead of the input sequence -- confirm intent
        assert len(tree) == len(root)

        shrunk_tree = []
        shifted_root = []
        mapper = {}

        # "Shrink" the iteration space
        for folded, rooted in zip(tree, root):
            start, end, incr = folded.args['limits']
            index = Symbol('%ss%d' % (folded.index, n))

            forward = (UnboundedIndex(index, start),)
            backward = (UnboundedIndex(index, -start),)

            shrunk_tree.append(
                folded._rebuild(limits=[0, end - start, incr],
                                uindices=folded.uindices + forward))
            shifted_root.append(
                rooted._rebuild(uindices=rooted.uindices + backward))

            mapper[folded.dim] = index

        # Temporary arrays can now be moved onto the stack
        exprs = FindNodes(Expression).visit(shrunk_tree[-1])
        if not any(it.is_Remainder for it in shrunk_tree):
            depth = len(shrunk_tree)
            shape = tuple(it.bounds_symbolic[1] for it in shrunk_tree)
            for e in exprs:
                new_shape = shape + e.output_function.shape[depth:]
                e.output_function.update(shape=new_shape, onstack=True)

        # Substitute iteration variables within the folded trees
        shrunk_tree = compose_nodes(shrunk_tree)
        replaced = xreplace_indices([e.expr for e in exprs], mapper,
                                    only_rhs=True)
        subs = [e._rebuild(expr=r) for e, r in zip(exprs, replaced)]
        processed.append(Transformer(dict(zip(exprs, subs))).visit(shrunk_tree))

        # Introduce the new iteration variables within /root/
        shifted_root = compose_nodes(shifted_root)
        exprs = FindNodes(Expression).visit(shifted_root)
        candidates = [e.output for e in subs]
        replaced = xreplace_indices([e.expr for e in exprs], mapper, candidates)
        subs = [e._rebuild(expr=r) for e, r in zip(exprs, replaced)]
        root = Transformer(dict(zip(exprs, subs))).visit(shifted_root)

    return processed + [root]