def derive_parameters(iet, drop_locals=False):
    """
    Collect the symbols that ``iet`` needs as call arguments.

    A symbol qualifies when it is used in ``iet`` but is not defined
    anywhere inside it.

    Parameters
    ----------
    iet
        The tree to inspect.
    drop_locals : bool, optional
        If True, Array and LocalObject instances are discarded too.
        Defaults to False.

    Returns
    -------
    list
        The symbolic functions first, followed by the remaining free symbols.
    """
    symbolics = FindSymbols('symbolics').visit(iet)
    free = FindSymbols('free-symbols').visit(iet)

    # A free symbol named like a function (or like its C-level name) is only
    # that function's base symbol: the function object itself is kept instead
    shadowed = set(flatten([(f.name, f._C_name) for f in symbolics]))
    candidates = symbolics + [s for s in free if s.name not in shadowed]

    # Names of everything that gets defined within the tree itself
    defined = [d.name for d in FindSymbols('defines').visit(iet)]

    # Literal/Macro are globally visible; the compiler-generated Array and
    # LocalObject are only dropped upon request
    excluded = (Literal, Macro)
    if drop_locals:
        excluded = excluded + (Array, LocalObject)

    parameters = []
    for symbol in candidates:
        if symbol.name in defined:
            continue
        if isinstance(symbol, excluded):
            continue
        parameters.append(symbol)

    return parameters
def derive_parameters(nodes, drop_locals=False):
    """
    Collect the symbols that ``nodes`` needs as call arguments.

    A symbol qualifies when it is used in ``nodes`` but is not defined
    anywhere inside it.

    Parameters
    ----------
    nodes
        The tree to inspect.
    drop_locals : bool, optional
        If True, discard LocalObjects as well as Arrays flagged as heap- or
        stack-allocated (``_mem_heap`` / ``_mem_stack``). Defaults to False.

    Returns
    -------
    list
        The symbolic functions first, followed by the remaining free symbols.
    """
    symbolics = FindSymbols('symbolics').visit(nodes)
    free = FindSymbols('free-symbols').visit(nodes)

    # A free symbol named like a function is just its base symbol; the
    # function object itself is what must be passed in
    taken = [f.name for f in symbolics]
    candidates = symbolics + [s for s in free if s.name not in taken]

    # Names of everything that gets defined within the tree itself
    defined = [d.name for d in FindSymbols('defines').visit(nodes)]

    # Macros are globally visible, hence never parameters
    parameters = [
        s for s in candidates
        if s.name not in defined and not isinstance(s, Macro)
    ]
    if not drop_locals:
        return parameters

    def is_local(obj):
        # Locally-allocated Arrays and any LocalObject
        if isinstance(obj, Array) and (obj._mem_heap or obj._mem_stack):
            return True
        return isinstance(obj, LocalObject)

    return [p for p in parameters if not is_local(p)]
def test_make_efuncs(self, exprs, nfuncs, ntimeiters, nests):
    """Check that ElementalFunctions are built with the expected shape."""
    exprs = list(as_tuple(exprs))

    grid = Grid(shape=(10, 10))
    t = grid.stepping_dim  # noqa
    x, y = grid.dimensions  # noqa

    u = Function(name='u', grid=grid)  # noqa
    v = TimeFunction(name='v', grid=grid)  # noqa

    # The strings are evaluated against this function's locals, which is why
    # a plain loop is used: a comprehension would need explicit namespaces
    for i, e in enumerate(list(exprs)):
        exprs[i] = eval(e)

    op = Operator(exprs)

    # One ElementalFunction per Iteration nest over space dimensions
    efuncs = [
        make_efunc('f%d' % n,
                   filter_iterations(tree, key=lambda i: i.dim.is_Space,
                                     stop='asap'))
        for n, tree in enumerate(retrieve_iteration_tree(op))
    ]

    assert len(efuncs) == len(nfuncs) == len(ntimeiters) == len(nests)

    for efunc, nf, nt, nest in zip(efuncs, nfuncs, ntimeiters, nests):
        params = efunc.parameters

        # The min/max points of both space dimensions must be parameters
        for dim in (x, y):
            bounds = (dim.symbolic_min, dim.symbolic_max)
            assert all(b in efunc.parameters for b in bounds)

        # So must all the functions ...
        functions = FindSymbols().visit(efunc)
        assert len(functions) == nf
        assert all(fn in efunc.parameters for fn in functions)

        # ... and all the time dimensions
        timeiters = []
        for s in FindSymbols('free-symbols').visit(efunc):
            if s.is_Dimension and s.is_Time:
                timeiters.append(s)
        assert len(timeiters) == nt
        assert all(td in efunc.parameters for td in timeiters)

        # Nothing else: 4 bounds + functions + time dimensions
        assert len(efunc.parameters) == 4 + len(functions) + len(timeiters)
        assert params is not None or True

        # Check there's exactly one ArrayCast for each Function
        casts = FindNodes(ArrayCast).visit(efunc)
        assert len(casts) == nf

        # Check the loop nest structure
        trees = retrieve_iteration_tree(efunc)
        assert len(trees) == 1
        assert all(it.dim.name == name for it, name in zip(trees[0], nest))

        assert efunc.make_call()
def _simdize(self, nodes, state):
    """
    Decorate vectorizable Iterations with SIMD pragmas.

    Each Iteration flagged ``is_Vectorizable`` is rebuilt with its original
    pragmas, followed by the compiler-specific 'ignore-deps' decoration and
    an OpenMP SIMD pragma. The "aligned" flavour of the latter is used when
    at least one tensor has an innermost size that is a multiple of the
    number of SIMD items for its dtype.

    Returns
    -------
    tuple
        The transformed tree and an empty dict.
    """
    ignore_deps = as_tuple(self._compiler_decoration('ignore-deps'))

    mapper = {}
    for tree in retrieve_iteration_tree(nodes):
        for iteration in [n for n in tree if n.is_Vectorizable]:
            symbols = FindSymbols('symbolics').visit(iteration)

            # A KeyError while querying the SIMD width means no alignment
            # information is available, in which case nothing is aligned
            try:
                aligned = []
                for s in symbols:
                    if s.is_Tensor and \
                            s.shape[-1] % get_simd_items(s.dtype) == 0:
                        aligned.append(s)
            except KeyError:
                aligned = []

            if not aligned:
                simd = as_tuple(Ompizer.lang['simd-for'])
            else:
                make_pragma = Ompizer.lang['simd-for-aligned']
                names = ','.join([s.name for s in aligned])
                simd = as_tuple(make_pragma(names, simdinfo[get_simd_flag()]))

            pragmas = iteration.pragmas + ignore_deps + simd
            mapper[iteration] = iteration._rebuild(pragmas=pragmas)

    return Transformer(mapper).visit(nodes), {}
def is_on_device(maybe_symbol, gpu_fit, only_writes=False):
    """
    True if all given Functions are allocated in the device memory, False
    otherwise.

    Only TimeFunctions with an integer ``save`` are actually checked; any
    other Function never makes the result False.

    Parameters
    ----------
    maybe_symbol : Indexed or Function or Node
        The inspected object. Anything exposing a ``.function`` attribute is
        treated as a single Function; otherwise it must be an IET, which is
        searched for Functions.
    gpu_fit : list of Function
        The Functions known to fit in the device memory, as provided by the
        user through the compiler option `gpu-fit`. If it contains
        'all-fallback', every candidate is assumed to fit (with a warning).
    only_writes : bool, optional
        Only meaningful if `maybe_symbol` is an IET. If True, ignore all
        Functions that are not written by at least one Expression.
        Defaults to False.
    """
    try:
        functions = (maybe_symbol.function,)
    except AttributeError:
        # Not a symbol: it must then be an entire piece of IET
        assert maybe_symbol.is_Node
        functions = set(FindSymbols().visit(maybe_symbol))
        if only_writes:
            written = {e.write for e in FindNodes(Expression).visit(maybe_symbol)}
            functions.intersection_update(written)

    fsave = []
    for f in functions:
        if f.is_TimeFunction and is_integer(f.save):
            fsave.append(f)

    assume_fit = 'all-fallback' in gpu_fit
    if assume_fit and fsave:
        warning("TimeFunction %s assumed to fit the GPU memory" % fsave)
        return True

    for f in fsave:
        if f not in gpu_fit:
            return False
    return True
def _simdize(self, iet):
    """
    Decorate vectorizable Iterations with pragmas asking the backend compiler
    for SIMD auto-vectorization.

    Each Iteration flagged ``is_Vectorizable`` is rebuilt with its original
    pragmas, followed by the backend compiler 'ignore-deps' pragma and an
    OpenMP SIMD pragma. The "aligned" flavour of the latter is used, listing
    all DiscreteFunctions found in the Iteration, whenever there is at least
    one.

    Returns
    -------
    tuple
        The transformed tree and an empty dict.
    """
    ignore_deps = as_tuple(self._backend_compiler_pragma('ignore-deps'))

    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        for iteration in [n for n in tree if n.is_Vectorizable]:
            aligned = []
            for s in FindSymbols('symbolics').visit(iteration):
                if s.is_DiscreteFunction:
                    aligned.append(s)

            if not aligned:
                simd = as_tuple(Ompizer.lang['simd-for'])
            else:
                make_pragma = Ompizer.lang['simd-for-aligned']
                names = ','.join([s.name for s in aligned])
                simd = as_tuple(make_pragma(names, self.platform.simd_reg_size))

            pragmas = iteration.pragmas + ignore_deps + simd
            mapper[iteration] = iteration._rebuild(pragmas=pragmas)

    return Transformer(mapper).visit(iet), {}
def _minimize_remainders(self, iet):
    """
    Adjust the bounds of ROUNDABLE Iterations so that the backend compiler
    has no need to generate remainder loops.

    Returns
    -------
    tuple
        The transformed tree and an empty dict.
    """
    mapper = {}
    for iteration in FindNodes(Iteration).visit(iet):
        if not iteration.is_Roundable:
            continue

        # The SIMD vector length depends on the data type, which is expected
        # to be the same for all tensors within the Iteration
        dtypes = set()
        for f in FindSymbols().visit(iteration):
            if f.is_Tensor:
                dtypes.add(f.dtype)
        assert len(dtypes) == 1
        dtype = dtypes.pop()
        vl = configuration['platform'].simd_items_per_reg(dtype)

        # Enlarge the max point so that at runtime only vector iterations
        # will be performed (i.e., remainder loops won't be necessary)
        lower, upper, step = iteration.limits
        upper = upper + (iteration.symbolic_size % vl)
        mapper[iteration] = iteration._rebuild(limits=(lower, upper, step))

    return Transformer(mapper).visit(iet), {}
def test_find_symbols_with_duplicates():
    """
    FindSymbols('indexeds') must report equal-but-distinct Indexed objects
    separately.
    """
    grid = Grid(shape=(4, 4))
    x, y = grid.dimensions

    f = TimeFunction(name='f', grid=grid)
    g = Function(name='g', grid=grid)

    op = Operator(Eq(f.forward, sin(g)*f + g))

    exprs = FindNodes(Expression).visit(op)
    assert len(exprs) == 2
    first, second = exprs

    # Two syntactically identical indexeds r0[x, y], but they're different
    # objects because they are constructed in two different places during
    # the CIRE pass
    r0a = first.output
    r0b = second.expr.rhs.args[0].args[0]
    assert r0a.function is r0b.function
    assert r0a.name == r0b.name == "r0"
    assert hash(r0a) == hash(r0b)
    assert r0a is not r0b

    # So we expect FindSymbols to catch five Indexeds in total
    assert len(FindSymbols('indexeds').visit(op)) == 5
def iet_lower_dimensions(iet):
    """
    Replace all DerivedDimensions within the ``iet``'s expressions with
    lower-level symbolic objects (other Dimensions or Symbols).

    * Array indices involving SteppingDimensions are turned into
      ModuloDimensions.
      Example: ``u[t+1, x] = u[t, x] + 1 >>> u[t1, x] = u[t0, x] + 1``
    * Array indices involving ConditionalDimensions are turned into
      integer-division expressions.
      Example: ``u[t_sub, x] = u[time, x] >>> u[time / 4, x] = u[time, x]``
    """
    # Step 1: SteppingDimensions
    for iteration in FindNodes(Iteration).visit(iet):
        if not iteration.uindices:
            # Nothing to lower here, so avoid a useless reconstruction
            continue

        # `u[t+1, ...]` and `v[t+1, ...]` may belong to TimeFunctions having
        # circular time buffers (save=None) of different modulo extent. The
        # two `t+1` are then conceptually different, so the replacement is
        # carried out separately for each modulo extent
        by_modulo = as_mapper(iteration.uindices, lambda d: d.modulo)
        for extent, mdims in by_modulo.items():
            subs = {d.origin: d for d in mdims}

            def rule(indexed):
                # Only touch TimeFunctions whose time buffer has this extent
                return (indexed.function.is_TimeFunction and
                        indexed.function._time_size == extent)

            def replacer(expr):
                return xreplace_indices(expr, subs, rule)

            iet = XSubs(replacer=replacer).visit(iet)

    # Step 2: ConditionalDimensions
    mapper = {}
    for d in FindSymbols('free-symbols').visit(iet):
        if isinstance(d, ConditionalDimension):
            mapper[d] = IntDiv(d.index, d.factor)
    iet = XSubs(mapper).visit(iet)

    return iet
def _ompize(self, nodes, state):
    """
    Decorate the Iteration/Expression tree with OpenMP pragmas so that
    parallel code is emitted.

    Returns
    -------
    tuple
        The transformed tree and an empty dict.
    """
    def is_candidate(i):
        # Parallel, but neither elementizable nor vectorizable
        return i.is_Parallel and not (i.is_Elementizable or i.is_Vectorizable)

    # Group by outer loop so that we can embed within the same parallel region
    groups = OrderedDict()
    was_tagged = False
    for tree in retrieve_iteration_tree(nodes):
        # The consecutive parallelizable Iterations in this tree
        candidates = filter_iterations(tree, key=is_candidate, stop='asap')
        if not candidates:
            was_tagged = False
            continue

        # Consecutive tagged Iterations go in the same group, that is the
        # most recently created one; anything else starts a new group
        is_tagged = any(i.tag is not None for i in tree)
        if is_tagged and was_tagged:
            index = len(groups) - 1
        else:
            index = len(groups)
        groups.setdefault(index, OrderedDict())[candidates[0]] = candidates
        was_tagged = is_tagged

    # Handle parallelizable loops
    mapper = OrderedDict()
    for group in groups.values():
        private_names = set()
        for root, candidates in group.items():
            # Heuristic: if at least two parallel loops are available and the
            # physical core count is greater than self.thresholds['collapse'],
            # then omp-collapse the loops
            nparallel = len(candidates)
            if psutil.cpu_count(logical=False) < self.thresholds['collapse'] or\
                    nparallel < 2:
                pragma = omplang['for']
            else:
                pragma = omplang['collapse'](nparallel)
            mapper[root] = root._rebuild(pragmas=root.pragmas + (pragma,))

            # Stack-allocated Arrays must be thread-private
            for s in FindSymbols('symbolics').visit(root):
                if s.is_Array and s._mem_stack:
                    private_names.add(s.name)

        # Build the parallel region enclosing all loops in this group
        ordered = sorted(private_names)
        clause = ('private(%s)' % ','.join(ordered)) if ordered else ''
        body = [v for k, v in mapper.items() if k in group]
        par_region = Block(header=omplang['par-region'](clause), body=body)

        # Entries still holding a bare Iteration belong to this group: they
        # now map to the region, except for remainder loops, mapped to None
        for k, v in list(mapper.items()):
            if not isinstance(v, Iteration):
                continue
            mapper[k] = None if v.is_Remainder else par_region

    return Transformer(mapper).visit(nodes), {}
def _padding(self, nodes, state):
    """
    Introduce temporary buffers padded to the nearest multiple of the vector
    length, to maximize data alignment. At the bottom of the kernel, the
    values in the padded temporaries will be copied back into the input
    arrays.

    Returns
    -------
    tuple
        The transformed tree (or ``nodes`` itself, untouched, if the
        transformation is not applicable) and an empty dict.
    """
    mapper = OrderedDict()

    # Assess feasibility of the transformation
    handle = FindSymbols('symbolics-writes').visit(nodes)
    if not handle:
        return nodes, {}
    # The shape with the highest number of dimensions drives the padding
    shape = max([i.shape for i in handle], key=len)
    if not shape:
        return nodes, {}
    candidates = [i for i in handle if i.shape[-1] == shape[-1]]
    if not candidates:
        return nodes, {}

    # Retrieve the maximum number of items in a SIMD register when processing
    # the expressions in /node/
    exprs = FindNodes(Expression).visit(nodes)
    exprs = [e for e in exprs if e.write in candidates]
    assert len(exprs) > 0
    dtype = exprs[0].dtype
    assert all(e.dtype == dtype for e in exprs)
    try:
        simd_items = get_simd_items(dtype)
    except KeyError:
        # Fallback to 16 (maximum expectable padding, for AVX512 registers).
        # Floor division is required: an item count must be an integer, as
        # it ends up in array shapes, whereas `/` would produce a float
        simd_items = simdinfo['avx512f'] // np.dtype(dtype).itemsize

    # Only the innermost dimension gets rounded up
    shapes = {
        k: k.shape[:-1] + (roundm(k.shape[-1], simd_items), )
        for k in candidates
    }
    mapper.update(
        OrderedDict([(k.indexed,
                      Array(name='p%s' % k.name,
                            shape=shapes[k],
                            dimensions=k.indices,
                            onstack=k._mem_stack).indexed)
                     for k in candidates]))

    # Substitute original arrays with padded buffers
    processed = SubstituteExpression(mapper).visit(nodes)

    # Build Iteration trees for initialization and copy-back of padded arrays
    mapper = OrderedDict([(k, v) for k, v in mapper.items()
                          if k.function.is_SymbolicFunction])
    init = copy_arrays(mapper, reverse=True)
    copyback = copy_arrays(mapper)

    processed = List(body=init + as_tuple(processed) + copyback)

    return processed, {}
def dimensions(self):
    """
    The Dimensions of this object, as a tuple sorted by name.

    It includes everything in the ``_defines`` of each of ``self._dimensions``
    plus any Dimension found in ``self`` that is flagged ``is_PerfKnob``.
    """
    found = set()
    for d in self._dimensions:
        found.update(d._defines)

    # During compilation other Dimensions may have been produced
    for d in FindSymbols('dimensions').visit(self):
        if d.is_PerfKnob:
            found.add(d)

    by_name = attrgetter('name')
    return tuple(sorted(found, key=by_name))
def _make_clauses(cls, **kwargs):
    """
    Build the pragma clauses, extending those produced by the parent class
    with a ``deviceptr`` clause listing all Arrays found in ``kwargs['nodes']``.

    The chunk size is always disabled before delegating to the parent class.
    """
    kwargs['chunk_size'] = False
    clauses = super(DeviceOpenACCIteration, cls)._make_clauses(**kwargs)

    deviceptrs = []
    for symbol in FindSymbols().visit(kwargs['nodes']):
        if symbol.is_Array:
            deviceptrs.append(symbol.name)

    if not deviceptrs:
        return clauses

    clauses.append("deviceptr(%s)" % ",".join(deviceptrs))
    return clauses
def test_minimize_reminders_due_to_autopadding():
    """
    Check that the bounds of the Iteration computing the DSE-captured aliasing
    expressions are relaxed (i.e., slightly larger) so that
    backend-compiler-generated remainder loops are avoided.
    """
    grid = Grid(shape=(3, 3, 3))
    x, y, z = grid.dimensions  # noqa
    t = grid.stepping_dim

    f = Function(name='f', grid=grid)
    f.data_with_halo[:] = 1.
    u = TimeFunction(name='u', grid=grid, space_order=3)
    u.data_with_halo[:] = 0.

    # Leads to 3D aliases
    first = (u[t, x, y, z] + u[t, x + 1, y + 1, z + 1]) * 3 * f
    second = (u[t, x + 2, y + 2, z + 2] + u[t, x + 3, y + 3, z + 3]) * 3 * f
    eqn = Eq(u.forward, (first + second + 1))

    dle = ('advanced', {'openmp': False})
    op0 = Operator(eqn, dse='noop', dle=('advanced', {'openmp': False}))
    op1 = Operator(eqn, dse='aggressive', dle=dle)

    x0_blk_size = op1.parameters[5]
    y0_blk_size = op1.parameters[9]
    z_size = op1.parameters[-1]

    # Check Array shape
    arrays = []
    for s in FindSymbols().visit(op1._func_table['bf0'].root):
        if s.is_Array:
            arrays.append(s)
    assert len(arrays) == 1
    array = arrays[0]
    assert len(array.dimensions) == 3
    assert array.halo == ((1, 1), (1, 1), (1, 1))
    assert array.padding == ((0, 0), (0, 0), (0, 30))
    expected_sizes = (x0_blk_size + 2, y0_blk_size + 2, z_size + 32)
    for n, expected in enumerate(expected_sizes):
        assert Add(*array.symbolic_shape[n].args) == expected

    # Check loop bounds
    trees = retrieve_iteration_tree(op1._func_table['bf0'].root)
    assert len(trees) == 2
    expected_rounded = trees[0].inner
    assert expected_rounded.symbolic_max ==\
        z.symbolic_max + (z.symbolic_max - z.symbolic_min + 3) % 16 + 1

    # Check numerical output
    op0(time_M=1)
    exp = np.copy(u.data[:])
    u.data_with_halo[:] = 0.
    op1(time_M=1)
    assert np.all(u.data == exp)
def test_find_symbols_nested(mode, expected):
    """
    FindSymbols must see through nested Calls: the names found in
    ``foo(bar(x, baz(f)))`` under the given ``mode`` must match ``expected``.

    ``expected`` is a string holding a Python expression, supplied by the
    test parametrization, evaluating to the list of expected names.
    """
    grid = Grid(shape=(4, 4, 4))

    innermost = Call('baz', [Function(name='f', grid=grid)])
    middle = Call('bar', [Symbol(name='x'), innermost])
    call = Call('foo', [middle])

    names = [s.name for s in FindSymbols(mode).visit(call)]
    assert names == eval(expected)
def make_parallel(self, iet):
    """
    Transform ``iet`` by decorating its parallel :class:`Iteration`s with
    suitable ``#pragma omp ...`` for thread-level parallelism.

    Returns
    -------
    tuple
        The transformed tree, wrapped in a List, and a dict. If anything was
        parallelized, the dict carries the NThreads object under 'input';
        otherwise it is empty.
    """
    # Group sequences of loops that should go within the same parallel region
    groups = OrderedDict()
    was_tagged = False
    for tree in retrieve_iteration_tree(iet):
        # The consecutive parallelizable Iterations in this tree
        candidates = filter_iterations(tree, key=self.key, stop='asap')
        if not candidates:
            was_tagged = False
            continue

        # Consecutive tagged Iterations go in the same group, that is the
        # most recently created one; anything else starts a new group
        is_tagged = any(i.tag is not None for i in tree)
        index = len(groups) - 1 if (is_tagged and was_tagged) else len(groups)
        groups.setdefault(index, OrderedDict())[candidates[0]] = candidates
        was_tagged = is_tagged

    mapper = OrderedDict()
    for group in groups.values():
        private_names = set()
        for root, candidates in group.items():
            mapper.update(self._make_parallel_tree(root, candidates))

            # Stack-allocated Arrays must be thread-private
            for s in FindSymbols('symbolics').visit(root):
                if s.is_Array and s._mem_stack:
                    private_names.add(s.name)

        # Build the parallel region enclosing all loops in this group
        ordered = sorted(private_names)
        clause = ('private(%s)' % ','.join(ordered)) if ordered else ''
        body = [v for k, v in mapper.items() if k in group]
        par_region = Block(header=self.lang['par-region'](clause), body=body)

        # Entries still holding a bare Iteration now map to the region,
        # except for remainder loops, which are mapped to None
        for k, v in list(mapper.items()):
            if not isinstance(v, Iteration):
                continue
            mapper[k] = None if v.is_Remainder else par_region

    processed = Transformer(mapper).visit(iet)

    if not mapper:
        return List(body=processed), {}

    # Hack/workaround to the fact that the OpenMP pragmas are not true
    # IET nodes, so the `nthreads` variables won't be detected as a
    # Callable parameter unless inserted in a mock Expression
    nt = NThreads()
    mock = LocalExpression(DummyEq(Symbol(name='nt', dtype=np.int32), nt))
    return List(body=[mock, processed]), {'input': [nt]}
def test_find_symbols():
    """FindSymbols('indexeds') must find the three Indexeds of a stencil."""
    grid = Grid(shape=(4, 4))
    x, y = grid.dimensions

    f = Function(name='f', grid=grid)

    stencil = f[x - 1, y] + f[x + 1, y] + 1
    op = Operator(Eq(f, stencil))

    found = FindSymbols('indexeds').visit(op)
    assert len(found) == 3
def derive_parameters(iet, drop_locals=False):
    """
    Collect the symbols that ``iet`` needs as call arguments.

    A symbol qualifies when it is used in ``iet`` but is not defined
    anywhere inside it.

    Parameters
    ----------
    iet
        The tree to inspect.
    drop_locals : bool, optional
        If True, Array and LocalObject instances are discarded too.
        Defaults to False.

    Returns
    -------
    list
        The derived parameters, without duplicates.
    """
    # Start from the default FindSymbols search, then add the ``.function``
    # of every 'basics' (Symbols, Objects, etc.); the same object may show up
    # in both searches (e.g., `x_size`), hence the duplicates removal
    candidates = FindSymbols().visit(iet)
    candidates.extend(b.function for b in FindSymbols('basics').visit(iet))
    candidates = filter_ordered(candidates)

    # Names of everything that gets defined within the tree itself
    defined = [d.name for d in FindSymbols('defines').visit(iet)]

    # Global/Keyword/Macro are globally visible; the compiler-generated Array
    # and LocalObject are only dropped upon request
    excluded = (Global, Keyword, Macro)
    if drop_locals:
        excluded = excluded + (Array, LocalObject)

    parameters = []
    for c in candidates:
        if c.name in defined or isinstance(c, excluded):
            continue
        parameters.append(c)

    return parameters
def _lower_conditional_dims(iet):
    """
    Lower ConditionalDimensions: each one found among the free symbols of
    ``iet`` is replaced with the integer division of its index by its factor.

    Examples
    --------
    u[t_sub, x] = u[time, x]

    becomes

    u[time / 4, x] = u[time, x]
    """
    mapper = {}
    for d in FindSymbols('free-symbols').visit(iet):
        if isinstance(d, ConditionalDimension):
            mapper[d] = IntDiv(d.index, d.factor)

    return XSubs(mapper).visit(iet)
def test_transformer_replace_function_body(block1, block2): """Create a Function and replace its body with another.""" args = FindSymbols().visit(block1) f = Callable('foo', block1, 'void', args) assert str(f.ccode) == """void foo() { for (int i = 0; i < 3; i += 1) { for (int j = 0; j < 5; j += 1) { for (int k = 0; k < 7; k += 1) { a[i] = a[i] + b[i] + 5.0F; } } } }""" f = Transformer({block1: block2}).visit(f) assert str(f.ccode) == """void foo()
def make_parallel(self, iet):
    """
    Transform ``iet`` by decorating its parallel :class:`Iteration`s with
    suitable ``#pragma omp ...`` triggering thread-level parallelism.

    Returns the transformed tree.
    """
    # Step 1: group the loops that should share a parallel region
    groups = OrderedDict()
    previous_tagged = False
    for tree in retrieve_iteration_tree(iet):
        # The consecutive parallelizable Iterations in this tree
        candidates = filter_iterations(tree, key=self.key, stop='asap')
        if not candidates:
            previous_tagged = False
            continue

        # A tagged tree coming right after another tagged tree joins the
        # most recently created group; anything else starts a new group
        tagged = any(i.tag is not None for i in tree)
        slot = len(groups)
        if tagged and previous_tagged:
            slot -= 1
        if slot not in groups:
            groups[slot] = OrderedDict()
        groups[slot][candidates[0]] = candidates
        previous_tagged = tagged

    # Step 2: build one parallel region per group
    mapper = OrderedDict()
    for group in groups.values():
        stack_arrays = []
        for root, candidates in group.items():
            mapper.update(self._make_parallel_tree(root, candidates))

            # Stack-allocated Arrays must be thread-private
            symbolics = FindSymbols('symbolics').visit(root)
            stack_arrays += [s for s in symbolics if s.is_Array and s._mem_stack]

        names = sorted({s.name for s in stack_arrays})
        if names:
            clause = 'private(%s)' % ','.join(names)
        else:
            clause = ''

        body = [v for k, v in mapper.items() if k in group]
        par_region = Block(header=self.lang['par-region'](clause), body=body)

        # Entries still holding a bare Iteration now map to the region,
        # except for remainder loops, which are mapped to None
        for k, v in list(mapper.items()):
            if isinstance(v, Iteration):
                if v.is_Remainder:
                    mapper[k] = None
                else:
                    mapper[k] = par_region

    return Transformer(mapper).visit(iet)
def _make_clauses(cls, **kwargs):
    """
    Build the pragma clauses, extending those produced by the parent class
    with, in this order:

    * a ``present`` clause, listing the AbstractFunctions deemed on device by
      ``is_on_device`` (given ``kwargs['gpu_fit']``) that are not already
      listed as device pointers;
    * a ``deviceptr`` clause, listing the Arrays flagged ``_mem_default``.

    The chunk size is always disabled before delegating to the parent class.
    """
    kwargs['chunk_size'] = False
    clauses = super(DeviceOpenACCIteration, cls)._make_clauses(**kwargs)

    symbols = FindSymbols().visit(kwargs['nodes'])

    deviceptrs = []
    for s in symbols:
        if s.is_Array and s._mem_default:
            deviceptrs.append(s.name)

    presents = []
    for s in symbols:
        if not s.is_AbstractFunction:
            continue
        if not is_on_device(s, kwargs['gpu_fit']):
            continue
        if s.name in deviceptrs:
            continue
        presents.append(s.name)

    # The NVC 20.7 and 20.9 compilers have a bug which triggers data movement for
    # indirectly indexed arrays (e.g., a[b[i]]) unless a present clause is used
    for template, names in (("present(%s)", presents),
                            ("deviceptr(%s)", deviceptrs)):
        if names:
            clauses.append(template % ",".join(names))

    return clauses
def make_parallel(self, iet):
    """
    Transform ``iet`` by introducing shared-memory parallelism.

    Returns
    -------
    tuple
        The transformed tree and a dict whose 'input' entry lists
        ``self.nthreads`` if anything was parallelized, or is empty otherwise.
    """
    mapper = OrderedDict()
    for tree in retrieve_iteration_tree(iet):
        # The first omp-parallelizable Iteration in `tree`, if any
        candidates = filter_iterations(tree, key=self.key, stop='asap')
        if not candidates:
            continue
        root = candidates[0]

        # The `omp-for` tree
        omp_for = self._make_parallel_tree(root, candidates)

        # Stack-allocated Arrays must be thread-private
        names = set()
        for s in FindSymbols().visit(omp_for):
            if s.is_Array and s._mem_stack:
                names.add(s.name)
        names = sorted(names)
        clause = ('private(%s)' % ','.join(names)) if names else ''

        # The `omp-parallel` region
        header = self.lang['par-region'](self.nthreads.name, clause)
        partree = Block(header=header, body=omp_for)

        # Do not enter the parallel region if the step increment might be 0; this
        # would raise a `Floating point exception (core dumped)` in some OpenMP
        # implementation. Note that using an OpenMP `if` clause won't work
        if isinstance(root.step, Symbol):
            bail_out = Conditional(CondEq(root.step, 0),
                                   Element(c.Statement('return')))
            partree = List(body=[bail_out, partree])

        mapper[root] = partree

    iet = Transformer(mapper).visit(iet)

    if mapper:
        inputs = [self.nthreads]
    else:
        inputs = []
    return iet, {'input': inputs}
def make_yask_kernels(iet, **kwargs):
    """
    Replace offloadable Iteration/Expression trees in ``iet`` with calls to
    JIT-compiled YASK kernels, then update the ``iet`` signature.

    Parameters
    ----------
    iet
        The tree to transform. It is rebuilt with a new ``parameters`` list.
    **kwargs
        Must contain 'yk_solns', a mapping that gets populated in place with
        one ``(dimensions, YASKSolnObject) -> YASK kernel solution`` entry per
        successfully offloaded section.

    Returns
    -------
    tuple
        The transformed tree and an empty dict.
    """
    yk_solns = kwargs.pop('yk_solns')

    mapper = {}
    for n, (section, trees) in enumerate(find_affine_trees(iet).items()):
        dimensions = tuple(filter_ordered(i.dim.root for i in flatten(trees)))

        # Retrieve the section dtype
        exprs = FindNodes(Expression).visit(section)
        dtypes = {e.dtype for e in exprs}
        if len(dtypes) != 1:
            # Sections without exactly one dtype are left untouched
            log("Unable to offload in presence of mixed-precision arithmetic")
            continue
        dtype = dtypes.pop()

        context = contexts.fetch(dimensions, dtype)

        # A unique name for the 'real' compiler and kernel solutions
        name = namespace['jit-soln'](Signer._digest(configuration,
                                                    *[i.root for i in trees]))

        # Create a YASK compiler solution for this Operator
        yc_soln = context.make_yc_solution(name)

        try:
            # Generate YASK vars and populate `yc_soln` with equations
            local_vars = yaskit(trees, yc_soln)

            # Build the new IET nodes
            yk_soln_obj = YASKSolnObject(namespace['code-soln-name'](n))
            funcall = make_sharedptr_funcall(namespace['code-soln-run'],
                                             ['time'], yk_soln_obj)
            funcall = Offloaded(funcall, dtype)
            # The first tree's root is replaced by the call; any other root
            # not yet in `mapper` gets mapped to None (`dict.get` default)
            mapper[trees[0].root] = funcall
            mapper.update({i.root: mapper.get(i.root) for i in trees})  # Drop trees

            # JIT-compile the newly-created YASK kernel
            yk_soln = context.make_yk_solution(name, yc_soln, local_vars)
            yk_solns[(dimensions, yk_soln_obj)] = yk_soln

            # Print some useful information about the newly constructed solution
            log("Solution '%s' contains %d var(s) and %d equation(s)." %
                (yc_soln.get_name(), yc_soln.get_num_vars(),
                 yc_soln.get_num_equations()))
        except NotImplementedError as e:
            # Best-effort: log and carry on with the next section. Note that
            # whatever was added to `mapper` before the exception stays there
            log("Unable to offload a candidate tree. Reason: [%s]" % str(e))

    iet = Transformer(mapper).visit(iet)

    if not yk_solns:
        log("No offloadable trees found")

    # Some Iteration/Expression trees are not offloaded to YASK and may
    # require further processing to be executed through YASK, due to the
    # different storage layout
    yk_var_objs = {
        i.name: YASKVarObject(i.name)
        for i in FindSymbols().visit(iet) if i.from_YASK
    }
    yk_var_objs.update({i: YASKVarObject(i) for i in get_local_vars(yk_solns)})
    iet = make_var_accesses(iet, yk_var_objs)

    # The signature needs to be updated
    # TODO: this could be done automagically through the iet pass engine, but
    # currently it only supports *appending* to the parameters list. While here
    # we actually need to change it as some parameters may disappear (x_m, x_M, ...)
    parameters = derive_parameters(iet, True)
    iet = iet._rebuild(parameters=parameters)

    return iet, {}
def autotune(operator, arguments, parameters, tunable):
    """
    Acting as a high-order function, take as input an operator and a list of
    operator arguments to perform empirical autotuning. Some of the operator
    arguments are marked as tunable.

    Parameters
    ----------
    operator
        The operator to tune; its ``cfunction`` is run once per attempted
        block shape.
    arguments
        Mapping from argument names to runtime values.
    parameters
        The objects (each with a ``name``) actually expected by the
        ``cfunction``.
    tunable
        The tunable (blocked) arguments.

    Returns
    -------
    The tuned arguments as an OrderedDict or, if tuning is not possible (loop
    structure not understood, or no legal block shape), ``arguments`` itself.
    """
    # We get passed all the arguments, but the cfunction only requires a subset
    at_arguments = OrderedDict([(p.name, arguments[p.name]) for p in parameters])

    # User-provided output data must not be altered
    output = [i.name for i in operator.output]
    for k, v in arguments.items():
        if k in output:
            at_arguments[k] = v.copy()

    iterations = FindNodes(Iteration).visit(operator.body)
    dim_mapper = {i.dim.name: i.dim for i in iterations}

    # Shrink the iteration space of time-stepping dimension so that auto-tuner
    # runs will finish quickly
    steppers = [i for i in iterations if i.dim.is_Time]
    if len(steppers) == 0:
        timesteps = 1
    elif len(steppers) == 1:
        stepper = steppers[0]
        start = at_arguments[stepper.dim.min_name]
        timesteps = stepper.extent(start=start, finish=options['at_squeezer']) - 1
        if timesteps < 0:
            timesteps = options['at_squeezer'] - timesteps
            perf("AutoTuner: Number of timesteps adjusted to %d" % timesteps)
        at_arguments[stepper.dim.min_name] = start
        at_arguments[stepper.dim.max_name] = timesteps
        if stepper.dim.is_Stepping:
            # The parent dimension gets the same, shrunk, bounds
            at_arguments[stepper.dim.parent.min_name] = start
            at_arguments[stepper.dim.parent.max_name] = timesteps
    else:
        warning("AutoTuner: Couldn't understand loop structure; giving up")
        return arguments

    # Attempted block sizes ...
    mapper = OrderedDict([(i.argument.symbolic_size.name, i) for i in tunable])
    # ... Defaults (basic mode)
    blocksizes = [
        OrderedDict([(i, v) for i in mapper]) for v in options['at_blocksize']
    ]
    # ... Always try the entire iteration space (degenerate block)
    itershape = [
        mapper[i].iteration.symbolic_extent.subs(arguments) for i in mapper
    ]
    blocksizes.append(
        OrderedDict([(i, mapper[i].iteration.extent(0, j - 1))
                     for i, j in zip(mapper, itershape)]))
    # ... More attempts if auto-tuning in aggressive mode
    if configuration['autotuning'].level == 'aggressive':
        blocksizes = more_heuristic_attempts(blocksizes)

    # How many temporaries are allocated on the stack?
    # Will drop block sizes that might lead to a stack overflow
    functions = FindSymbols('symbolics').visit(operator.body +
                                               operator.elemental_functions)
    stack_shapes = [
        i.symbolic_shape for i in functions if i.is_Array and i._mem_stack
    ]
    # Number of stack items (product of each shape, summed up) times the
    # item size; this may be a symbolic expression in the dimension sizes
    stack_space = sum(reduce(mul, i, 1)
                      for i in stack_shapes) * operator._dtype().itemsize

    # Note: there is only a single loop over 'blocksize' because only
    # square blocks are tested
    timings = OrderedDict()
    for bs in blocksizes:
        illegal = False
        for k, v in at_arguments.items():
            if k in bs:
                val = bs[k]
                start = mapper[k].original_dim.symbolic_start.subs(arguments)
                end = mapper[k].original_dim.symbolic_end.subs(arguments)
                if val <= mapper[k].iteration.extent(start, end):
                    at_arguments[k] = val
                else:
                    # Block size cannot be larger than actual dimension
                    illegal = True
                    break
        if illegal:
            continue

        # Make sure we remain within stack bounds, otherwise skip block size
        dim_sizes = {}
        for k, v in at_arguments.items():
            if k in bs:
                dim_sizes[mapper[k].argument.symbolic_size] = bs[k]
            elif k in dim_mapper:
                dim_sizes[dim_mapper[k].symbolic_size] = v
        try:
            bs_stack_space = stack_space.xreplace(dim_sizes)
        except AttributeError:
            # `stack_space` has no `xreplace`, so it is used as is
            bs_stack_space = stack_space
        try:
            if int(bs_stack_space) > options['at_stack_limit']:
                continue
        except TypeError:
            # We should never get here
            warning(
                "AutoTuner: Couldn't determine stack size; skipping block shape %s"
                % str(bs))
            continue

        # Use AutoTuner-specific profiler structs
        timer = operator.profiler.timer.reset()
        at_arguments[operator.profiler.name] = timer

        operator.cfunction(*list(at_arguments.values()))
        # The elapsed time is the sum of all fields in the profiler struct
        elapsed = sum(getattr(timer._obj, i) for i, _ in timer._obj._fields_)
        timings[tuple(bs.items())] = elapsed
        perf("AutoTuner: Block shape <%s> took %f (s) in %d timesteps" %
             (','.join('%d' % i for i in bs.values()), elapsed, timesteps))

    try:
        # `min` raises ValueError if no block shape was actually timed
        best = dict(min(timings, key=timings.get))
        perf("AutoTuner: Selected block shape %s" % best)
    except ValueError:
        warning("AutoTuner: Couldn't find legal block shapes")
        return arguments

    # Build the new argument list
    tuned = OrderedDict()
    for k, v in arguments.items():
        tuned[k] = best[k] if k in mapper else v

    # Reset the profiling struct
    assert operator.profiler.name in tuned
    tuned[operator.profiler.name] = operator.profiler.timer.reset()

    return tuned
def _minimize_remainders(self, nodes, state):
    """
    Reshape temporary tensors and adjust loop trip counts to prevent as many
    compiler-generated remainder loops as possible.

    Only Iteration trees having exactly one vectorizable Iteration, which
    must also be tagged, are considered.

    Note that the Arrays written within such an Iteration are padded in place
    (through ``update``) even if the trip count ends up not being adjusted
    because the Iteration end point is not a Symbol.

    Returns
    -------
    tuple
        The transformed tree and an empty dict.
    """
    mapper = {}
    for tree in retrieve_iteration_tree(nodes):
        vector_iterations = [i for i in tree if i.is_Vectorizable]
        if not vector_iterations or len(vector_iterations) > 1:
            continue
        root = vector_iterations[0]
        if root.tag is None:
            continue

        # Padding
        writes = [
            i for i in FindSymbols('symbolics-writes').visit(root)
            if i.is_Array
        ]
        padding = []
        for i in writes:
            try:
                simd_items = get_simd_items(i.dtype)
            except KeyError:
                # Fallback to 16 (maximum expectable padding, for AVX512
                # registers). Floor division is required: an item count must
                # be an integer, as it ends up in array shapes and in the
                # generated loop bound, whereas `/` would produce a float
                simd_items = simdinfo['avx512f'] // np.dtype(i.dtype).itemsize
            padding.append(simd_items - i.shape[-1] % simd_items)
        if len(set(padding)) == 1:
            padding = padding[0]
            for i in writes:
                i.update(shape=i.shape[:-1] + (i.shape[-1] + padding, ))
        else:
            # Padding must be uniform -- not the case, so giving up
            # (this also covers the case in which there are no writes)
            continue

        # Dynamic trip count adjustment
        endpoint = root.end_symbolic
        if not endpoint.is_Symbol:
            continue
        # The padded end point may only be used if it does not exceed the
        # innermost size of the objects accessed within `root`
        condition = []
        externals = set(i.symbolic_shape[-1]
                        for i in FindSymbols().visit(root))
        for i in root.uindices:
            for j in externals:
                condition.append(root.end_symbolic + padding < j)
        condition = ' || '.join(ccode(i) for i in condition)
        endpoint_padded = endpoint.func(name='_%s' % endpoint.name)
        init = cgen.Initializer(
            cgen.Value("const int", endpoint_padded),
            cgen.Line('(%s) ? %s : %s' %
                      (condition, ccode(endpoint + padding), endpoint)))

        # Update the Iteration bound
        limits = list(root.limits)
        limits[1] = endpoint_padded.func(endpoint_padded.name)

        rebuilt = list(tree)
        rebuilt[rebuilt.index(root)] = root._rebuild(limits=limits)

        # The initializer of the padded end point goes right before the nest
        mapper[tree[0]] = List(header=init, body=compose_nodes(rebuilt))

    processed = Transformer(mapper).visit(nodes)

    return processed, {}
def _create_elemental_functions(self, nodes, state):
    """
    Extract :class:`Iteration` sub-trees and move them into :class:`Callable`s.

    Currently, only tagged, elementizable Iteration objects are targeted.
    Each selected sub-tree is rebuilt with free (symbolic) bounds, wrapped in
    a ``static void`` Callable named ``f_<tag>``, and replaced in the main
    tree by a Call to that Callable. If a Callable with the same name was
    already registered, the first one is retained.

    Parameters
    ----------
    nodes
        The Iteration/Expression tree to be transformed.
    state
        Not used by this pass.

    Returns
    -------
    tuple
        ``(processed, {'elemental_functions': <Callables>})``.
    """
    noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

    functions = OrderedDict()
    mapper = {}
    for tree in retrieve_iteration_tree(nodes, mode='superset'):
        # Look for the root of an elementizable sub-tree (if any)
        tagged = filter_iterations(tree, lambda i: i.tag is not None, 'asap')
        if not tagged:
            continue
        root = tagged[0]
        if not root.is_Elementizable:
            continue
        target = tree[tree.index(root):]

        # (call-site value, callee parameter) pairs collected so far
        args = []
        # Symbols declared within the elemental function itself
        not_required = set()

        # Rebuild each Iteration of the sub-tree with free bounds
        unbound = []
        for iteration in target:
            dim_name = iteration.dim.name
            bounds = iteration.bounds_symbolic

            # Iteration bounds
            start = Scalar(name='%s_start' % dim_name, dtype=np.int32)
            finish = Scalar(name='%s_finish' % dim_name, dtype=np.int32)
            args.extend(zip([ccode(b) for b in bounds], (start, finish)))

            # Iteration unbounded indices
            ufunc = [Scalar(name='%s_ub%d' % (dim_name, n), dtype=np.int32)
                     for n, _ in enumerate(iteration.uindices)]
            args.extend(zip([ccode(u.start) for u in iteration.uindices],
                            ufunc))

            limits = [Symbol(start.name), Symbol(finish.name), 1]
            uindices = [UnboundedIndex(u.index, iteration.dim + as_symbol(s))
                        for u, s in zip(iteration.uindices, ufunc)]
            unbound.append(iteration._rebuild(limits=limits, offsets=None,
                                              uindices=uindices))

            not_required.add(iteration.dim)
            not_required.update(u.index for u in iteration.uindices)

        # Construct elemental function body, and inspect it
        free = NestedTransformer(dict(zip(target, unbound))).visit(root)
        expressions = FindNodes(Expression).visit(free)
        fsymbols = FindSymbols('symbolics').visit(free)

        # Add all definitely-required arguments
        not_required.update(e.output for e in expressions if e.is_scalar)
        for sym in fsymbols:
            if sym in not_required:
                continue
            if sym.is_Array:
                cast = "(%s*)%s" % (c.dtype_to_ctype(sym.dtype), sym.name)
                args.append((cast, sym))
            elif sym.is_TensorFunction:
                args.append(("%s_vec" % sym.name, sym))
            elif sym.is_Scalar:
                args.append((sym.name, sym))

        # Scalars that *may* have to be passed in; those that turn out not to
        # be declared locally are appended as arguments
        maybe_required = set(FindSymbols(mode='free-symbols').visit(free))
        for sym in fsymbols:
            not_required.add(as_symbol(sym))
            not_required.add(sym.indexify())
            for size in sym.symbolic_shape:
                maybe_required.update(size.free_symbols)
        required = filter_sorted(maybe_required - not_required,
                                 key=attrgetter('name'))
        for sym in required:
            args.append((sym.name, Scalar(name=sym.name, dtype=sym.dtype)))

        call, params = zip(*args)
        handle = flatten([p.rtargs for p in params])
        name = "f_%d" % root.tag

        # Produce the new Call
        mapper[root] = List(header=noinline, body=Call(name, call))

        # Produce the new Callable
        functions.setdefault(
            name, Callable(name, free, 'void', handle, ('static',)))

    # Transform the main tree
    processed = Transformer(mapper).visit(nodes)

    return processed, {'elemental_functions': functions.values()}
def opsit(trees, count):
    """
    Build the OPS kernel, and the nodes needed to launch it, for ``trees``.

    Parameters
    ----------
    trees
        Iterable of trees; the Expressions found in each ``tree.inner`` are
        translated. It is traversed twice.
    count
        Value passed to the ``ops_block``, ``ops_kernel`` and ``ops_range``
        entries of ``namespace`` to generate names.

    Returns
    -------
    tuple
        ``(callable_kernel, pre_loop_nodes, par_loop_body, it_dims)`` where
        ``pre_loop_nodes`` is the list made of the range and block
        initialisations, the stencil initialisers, the dat declarations and
        an ``ops_partition`` Call, and ``par_loop_body`` is a List wrapping
        the ``ops_par_loop`` Call.
    """
    def by_constness_and_name(symbol):
        # Non-constant symbols sort first, then alphabetically by name
        return (symbol.is_Constant, symbol.name)

    node_factory = OPSNodeFactory()

    expressions = [e for tree in trees
                   for e in FindNodes(Expression).visit(tree.inner)]

    # Range and depth are taken from the last IterationTree encountered
    it_range = []
    it_dims = 0
    for tree in trees:
        if not isinstance(tree, IterationTree):
            continue
        it_range = [it.bounds() for it in tree]
        it_dims = len(tree)

    block = OPSBlock(namespace['ops_block'](count))
    decl_block = Call("ops_decl_block", [it_dims, String(block.name)], False)
    block_init = Element(cgen.Initializer(block, decl_block))

    # Expressions are processed last-to-first; the translated ones are then
    # put back in the original order
    accesses = defaultdict(set)
    translated = []
    for e in reversed(expressions):
        extend_accesses(accesses, get_accesses(e.expr))
        translated.append(Expression(make_ops_ast(e.expr, node_factory)))
    ops_expressions = translated[::-1]

    ops_stencils_initializers, ops_stencils = generate_ops_stencils(accesses)

    to_remove = [
        f.name for f in FindSymbols('defines').visit(List(body=expressions))
    ]

    parameters = sorted(
        (p for p in FindSymbols('symbolics').visit(List(body=ops_expressions))
         if p.name != 'OPS_ACC_size' and p.name not in to_remove),
        key=by_constness_and_name)

    arguments = sorted(
        (a for a in FindSymbols('symbolics').visit(List(body=expressions))
         if a.name not in to_remove),
        key=by_constness_and_name)

    ops_expressions = [
        Expression(fix_ops_acc(e.expr, [p.name for p in parameters]))
        for e in ops_expressions
    ]

    callable_kernel = Callable(namespace['ops_kernel'](count),
                               ops_expressions, "void", parameters)

    # One dat per non-constant argument
    dat_declarations = []
    argname_to_dat = {}
    for a in arguments:
        if not a.is_Constant:
            dat_dec, dat_sym = to_ops_dat(a, block)
            dat_declarations.extend(dat_dec)
            argname_to_dat.update(dat_sym)

    par_loop_range_arr = SymbolicArray(name=namespace['ops_range'](count),
                                       dimensions=(len(it_range) * 2,),
                                       dtype=np.int32)
    # Flatten the (min, max) pairs into [min0, max0, min1, max1, ...]
    range_vals = []
    for mn, mx in it_range:
        range_vals.extend((mn, mx))
    par_loop_range_init = Expression(
        ClusterizedEq(Eq(par_loop_range_arr, ListInitializer(range_vals))))

    ops_args = get_ops_args(list(parameters), ops_stencils, argname_to_dat)

    par_loop = Call("ops_par_loop", [
        FunctionPointer(callable_kernel.name),
        String(callable_kernel.name),
        block,
        it_dims,
        par_loop_range_arr,
        *ops_args
    ])

    pre_loop = [par_loop_range_init, block_init]
    pre_loop = pre_loop + ops_stencils_initializers + dat_declarations
    pre_loop = pre_loop + [Call("ops_partition", [String("")])]

    return (callable_kernel, pre_loop, List(body=[par_loop]), it_dims)
def _create_elemental_functions(self, nodes, state):
    """
    Extract :class:`Iteration` sub-trees and move them into :class:`Callable`s.

    Currently, only tagged, elementizable Iteration objects are targeted.
    Each selected sub-tree is rebuilt with free (symbolic) bounds, preceded
    by the array casts for the tensors it does not define itself, wrapped in
    a ``static void`` Callable named ``f_<tag>``, and replaced in the main
    tree by a Call to that Callable. If a Callable with the same name was
    already registered, the first one is retained.

    Parameters
    ----------
    nodes
        The Iteration/Expression tree to be transformed.
    state
        Not used by this pass.

    Returns
    -------
    tuple
        ``(processed, {'elemental_functions': <Callables>})``.
    """
    noinline = self._compiler_decoration('noinline', c.Comment('noinline?'))

    functions = OrderedDict()
    mapper = {}
    for tree in retrieve_iteration_tree(nodes, mode='superset'):
        # Look for the root of an elementizable sub-tree (if any)
        tagged = filter_iterations(tree, lambda i: i.tag is not None, 'asap')
        if not tagged:
            continue
        root = tagged[0]
        if not root.is_Elementizable:
            continue
        target = tree[tree.index(root):]

        # Parameter name -> call-site value, for the parameters that stand
        # for loop bounds and unbounded-index starts
        defined_args = {}

        # Rebuild each Iteration of the sub-tree with free bounds
        unbound = []
        for iteration in target:
            dim_name = iteration.dim.name
            bounds = iteration.bounds_symbolic

            # Iteration bounds
            start = Scalar(name='%s_start' % dim_name, dtype=np.int32)
            finish = Scalar(name='%s_finish' % dim_name, dtype=np.int32)
            defined_args[start.name] = bounds[0]
            defined_args[finish.name] = bounds[1]

            # Iteration unbounded indices
            ufunc = [Scalar(name='%s_ub%d' % (dim_name, n), dtype=np.int32)
                     for n, _ in enumerate(iteration.uindices)]
            for uf, u in zip(ufunc, iteration.uindices):
                defined_args[uf.name] = u.start

            limits = [Scalar(name=start.name, dtype=np.int32),
                      Scalar(name=finish.name, dtype=np.int32),
                      1]
            uindices = [UnboundedIndex(u.index, iteration.dim + as_symbol(s))
                        for u, s in zip(iteration.uindices, ufunc)]
            unbound.append(iteration._rebuild(limits=limits, offsets=None,
                                              uindices=uindices))

        # Construct elemental function body, and inspect it
        free = NestedTransformer(dict(zip(target, unbound))).visit(root)

        # Insert array casts for all tensors not defined within the body
        f_symbols = FindSymbols('symbolics').visit(free)
        defines = {s.name for s in FindSymbols('defines').visit(free)}
        casts = [ArrayCast(f) for f in f_symbols
                 if f.is_Tensor and f.name not in defines]
        free = (List(body=casts), free)

        # (call-site value, callee parameter) pairs
        args = []
        for param in derive_parameters(free):
            if param.name in defined_args:
                pair = (defined_args[param.name], param)
            elif param.is_Dimension:
                # Dimensions are passed in as plain scalars
                scalar = Scalar(name=param.name, dtype=param.dtype)
                pair = (scalar, scalar)
            else:
                pair = (param, param)
            args.append(pair)

        call, params = zip(*args)
        name = "f_%d" % root.tag

        # Produce the new Call
        mapper[root] = List(header=noinline, body=Call(name, call))

        # Produce the new Callable
        functions.setdefault(
            name, Callable(name, free, 'void', flatten(params), ('static',)))

    # Transform the main tree
    processed = Transformer(mapper).visit(nodes)

    return processed, {'elemental_functions': functions.values()}
def _minimize_remainders(self, iet):
    """
    Reshape temporary tensors and adjust loop trip counts to prevent as many
    compiler-generated remainder loops as possible.

    Only Iteration trees containing exactly one vectorizable Iteration are
    considered. The Arrays written by the Expressions within such an
    Iteration get extra right-padding along the last dimension; then, if the
    Iteration ``symbolic_max`` is a symbol, it is replaced by a new
    ``const int`` symbol (the original name prefixed by ``_``) that is
    initialised to the padded end point only when all generated bound checks
    hold.

    Parameters
    ----------
    iet
        The Iteration/Expression tree to be transformed.

    Returns
    -------
    tuple
        ``(processed, {})``; if the platform raises ``KeyError`` for the
        dtype of a written Array, ``(iet, {})`` is returned straight away.
    """
    # The innermost dimension is the one that might get padded
    p_dim = -1

    mapper = {}
    for tree in retrieve_iteration_tree(iet):
        candidates = [it for it in tree if it.is_Vectorizable]
        if len(candidates) != 1:
            continue
        root = candidates[0]

        # Padding required by each written Array
        writes = [expr.write for expr in FindNodes(Expression).visit(root)
                  if expr.write.is_Array]
        amounts = []
        for array in writes:
            try:
                simd_items = self.platform.simd_items_per_reg(array.dtype)
            except KeyError:
                return iet, {}
            amounts.append(simd_items - array.shape[-1] % simd_items)
        if len(set(amounts)) != 1:
            # Padding must be uniform -- not the case, so giving up
            continue
        padding = amounts[0]
        for array in writes:
            left, right = array._padding[p_dim][0], array._padding[p_dim][1]
            padded = (left, right + padding)
            array.update(padding=array._padding[:p_dim] + (padded,))

        # Dynamic trip count adjustment
        endpoint = root.symbolic_max
        if not endpoint.is_Symbol:
            continue
        externals = set(f.symbolic_shape[-1]
                        for f in FindSymbols().visit(root) if f.is_Tensor)
        # The same set of checks is emitted once per unbounded index
        checks = [root.symbolic_max + padding < size
                  for _ in root.uindices
                  for size in externals]
        condition = ' && '.join(map(ccode, checks))
        endpoint_padded = endpoint.func('_%s' % endpoint.name)
        declaration = cgen.Value("const int", endpoint_padded)
        selector = cgen.Line('(%s) ? %s : %s' % (condition,
                                                 ccode(endpoint + padding),
                                                 endpoint))
        init = cgen.Initializer(declaration, selector)

        # Update the Iteration bound
        limits = list(root.limits)
        limits[1] = endpoint_padded.func(endpoint_padded.name)
        rebuilt = list(tree)
        rebuilt[rebuilt.index(root)] = root._rebuild(limits=limits)

        mapper[tree[0]] = List(header=init, body=compose_nodes(rebuilt))

    processed = Transformer(mapper).visit(iet)

    return processed, {}