def parse_kwargs(**kwargs):
    """
    Parse keyword arguments provided to an Operator.

    This routine is especially useful for backwards compatibility: it accepts
    the legacy `dle` and `dse` arguments in several shapes (None, str, tuple,
    tuple-with-options-dict) and normalizes them into `mode`, `options` and
    `dse` entries of `kwargs`.

    Raises
    ------
    InvalidOperator
        If `dle` or `dse` have an unrecognized shape.
    """
    # `dle`
    dle = kwargs.pop("dle", configuration['dle'])
    if not dle or isinstance(dle, str):
        mode, options = dle, {}
    elif isinstance(dle, tuple):
        if len(dle) == 0:
            mode, options = 'noop', {}
        elif isinstance(dle[-1], dict):
            if len(dle) == 2:
                mode, options = dle
            else:
                mode, options = tuple(flatten(i.split(',') for i in dle[:-1])), dle[-1]
        else:
            mode, options = tuple(flatten(i.split(',') for i in dle)), {}
    else:
        raise InvalidOperator("Illegal `dle=%s`" % str(dle))

    # `dle`, options -- fill in global-configuration defaults
    options.setdefault('blockinner',
                       configuration['dle-options'].get('blockinner', False))
    options.setdefault('blocklevels',
                       configuration['dle-options'].get('blocklevels', None))
    options.setdefault('openmp', configuration['openmp'])
    options.setdefault('mpi', configuration['mpi'])
    kwargs['options'] = options

    # `dle`, mode -- `noop` may still imply `mpi`/`openmp` passes if globally enabled
    if mode is None:
        mode = 'noop'
    elif mode == 'noop':
        mode = tuple(i for i in ['mpi', 'openmp'] if options[i]) or 'noop'
    kwargs['mode'] = mode

    # `dse`
    dse = kwargs.pop("dse", configuration['dse'])
    if not dse:
        kwargs['dse'] = 'noop'
    elif isinstance(dse, str):
        kwargs['dse'] = dse
    else:
        try:
            kwargs['dse'] = ','.join(dse)
        except TypeError:
            # Fixed: was a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit; `','.join` raises TypeError for
            # anything that is not an iterable of strings
            raise InvalidOperator("Illegal `dse=%s`" % str(dse))

    # Attach `platform` too for convenience, so we don't need `configuration` in
    # most compilation passes
    kwargs['platform'] = configuration['platform']

    return kwargs
def make_next_cbk(rel, d, direction):
    """
    Build a callable that, given a symbol `s`, returns a sympy.Relational
    expressing in symbolic form whether the next fetch/prefetch will be
    executed.

    Raises
    ------
    InvalidOperator
        If `rel` does not match the only supported pattern, i.e. an equality
        `Mod(...) == 0` as induced by a ConditionalDimension with structured
        condition (e.g. via `factor`).
    """
    if rel is None:
        # Unconstrained: simply compare against the Dimension's bound
        if direction is Forward:
            return lambda s: Le(s, d.symbolic_max)
        return lambda s: Ge(s, d.symbolic_min)

    # Only case we know how to deal with, today, is the one induced
    # by a ConditionalDimension with structured condition (e.g. via `factor`)
    recognised = rel.is_Equality and rel.rhs == 0 and isinstance(rel.lhs, Mod)
    if not recognised:
        raise InvalidOperator("Unable to understand data streaming pattern")

    _, v = rel.lhs.args

    if direction is Forward:
        # The LHS rounds `s` up to the nearest multiple of `v`
        return lambda s: Le(Mul(((s + v - 1) / v), v, evaluate=False),
                            d.symbolic_max)
    else:
        # The LHS rounds `s` down to the nearest multiple of `v`
        return lambda s: Ge(Mul((s / v), v, evaluate=False), d.symbolic_min)
def _specialize_iet(cls, graph, **kwargs):
    """
    Transform the IET `graph` by applying the user-requested passes, then
    force-apply `mpi`/`openmp` if globally enabled, and finally place symbol
    definitions and casts.
    """
    options = kwargs['options']
    requested = as_tuple(kwargs['mode'])

    # Fetch passes to be called
    passes_mapper = cls._make_passes_mapper(**kwargs)

    # Call the user-requested passes
    for name in requested:
        try:
            passes_mapper[name](graph)
        except KeyError:
            raise InvalidOperator("Unknown passes `%s`" % str(requested))

    # Force-call `mpi`/`openmp` if requested via global options, even though
    # not explicitly listed among the passes
    for name in ('mpi', 'openmp'):
        if name not in requested and options[name]:
            passes_mapper[name](graph)

    # Symbol definitions
    data_manager = DataManager()
    data_manager.place_definitions(graph)
    data_manager.place_casts(graph)

    return graph
def make_cond(rel, d, direction, iteration):
    """
    Create a symbolic condition which, once resolved at runtime, returns True
    if `iteration` is within the Dimension `d`'s min/max bounds, False
    otherwise. Returns None if the condition degenerates to `true`.

    Raises
    ------
    InvalidOperator
        If `rel` does not match the only supported pattern, i.e. an equality
        `Mod(...) == 0` as induced by a ConditionalDimension with structured
        condition (e.g. via `factor`).
    """
    if rel is None:
        # Plain bound check against the Dimension's extremes
        if direction is Forward:
            cond = Le(iteration, d.symbolic_max)
        else:
            cond = Ge(iteration, d.symbolic_min)
    else:
        # Only case we know how to deal with, today, is the one induced
        # by a ConditionalDimension with structured condition (e.g. via `factor`)
        recognised = rel.is_Equality and rel.rhs == 0 and isinstance(rel.lhs, Mod)
        if not recognised:
            raise InvalidOperator("Unable to understand data streaming pattern")

        _, v = rel.lhs.args

        if direction is Forward:
            # The LHS rounds `iteration` up to the nearest multiple of `v`
            rounded = Mul(((iteration + v - 1) / v), v, evaluate=False)
            cond = Le(rounded, d.symbolic_max)
        else:
            # The LHS rounds `iteration` down to the nearest multiple of `v`
            rounded = Mul((iteration / v), v, evaluate=False)
            cond = Ge(rounded, d.symbolic_min)

    # A trivially-true condition carries no information
    return None if cond is true else cond
def _build(cls, expressions, **kwargs):
    """
    Sanity-check the user-requested passes, then defer to the parent class.
    """
    requested = as_tuple(kwargs['mode'])
    unknown = [i for i in requested if i not in cls._known_passes]
    if unknown:
        raise InvalidOperator("Unknown passes `%s`" % str(requested))

    return super(CustomOperator, cls)._build(expressions, **kwargs)
def __init__(self, expressions, **kwargs):
    """
    Lower a list of `devito.Eq` expressions into a compilable kernel.

    The pipeline is: indexification/substitution -> Clusters (+ DSE rewrite)
    -> Schedule tree -> IET -> profiling/specialization/declarations/casts.
    Statement order below is significant; each stage consumes the previous
    stage's output.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, Eq) for i in expressions):
        raise InvalidOperator("Only `devito.Eq` expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    dse = kwargs.get("dse", configuration['dse'])

    # Header files, etc.
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # References to local or external routines
    self._func_table = OrderedDict()

    # Internal state. May be used to store information about previous runs,
    # autotuning reports, etc
    self._state = {}

    # Expression lowering: indexification, substitution rules, specialization
    expressions = [indexify(i) for i in expressions]
    expressions = self._apply_substitutions(expressions, subs)
    expressions = self._specialize_exprs(expressions)

    # Expression analysis
    self.input = filter_sorted(flatten(e.reads for e in expressions))
    self.output = filter_sorted(flatten(e.writes for e in expressions))
    self.dimensions = filter_sorted(flatten(e.dimensions for e in expressions))

    # Group expressions based on their iteration space and data dependences,
    # and apply the Devito Symbolic Engine (DSE) for flop optimization
    clusters = clusterize(expressions)
    clusters = rewrite(clusters, mode=set_dse_mode(dse))
    self._dtype, self._dspace = clusters.meta

    # Lower Clusters to a Schedule tree
    stree = st_build(clusters)

    # Lower Schedule tree to an Iteration/Expression tree (IET)
    iet = iet_build(stree)

    # Instrument for profiling, specialize for the backend, then insert the
    # required declarations and data/pointer casts
    iet, self._profiler = self._profile_sections(iet)
    iet = self._specialize_iet(iet, **kwargs)
    iet = iet_insert_C_decls(iet)
    iet = self._build_casts(iet)

    # Derive parameters as symbols not defined in the kernel itself
    parameters = self._build_parameters(iet)

    # Finish instantiation
    super(Operator, self).__init__(self.name, iet, 'int', parameters, ())
def _normalize_kwargs(cls, **kwargs):
    """
    Normalize the Operator kwargs, additionally rejecting the `min-storage`
    option, which is incompatible with the `advanced-fsg` optimization level.

    Raises
    ------
    InvalidOperator
        If `min-storage` was requested.
    """
    kwargs = super()._normalize_kwargs(**kwargs)

    if kwargs['options']['min-storage']:
        # Fixed: the message previously read "`advanced-fsg  as ..." due to an
        # accidental adjacent-string-literal concatenation (missing closing
        # backtick and doubled space)
        raise InvalidOperator("You should not use `min-storage` with "
                              "`advanced-fsg` as they work in opposite "
                              "directions")

    return kwargs
def _normalize_kwargs(cls, **kwargs):
    """
    Canonicalize the user-supplied optimization options, filling in the
    defaults understood by this (CPU) Operator. Unconsumed options are
    rejected.
    """
    supplied = kwargs['options']
    opts = {}

    # Execution modes
    opts['openmp'] = supplied.pop('openmp')
    opts['mpi'] = supplied.pop('mpi')
    opts['parallel'] = opts['openmp']  # Backwards compatibility

    # Buffering
    opts['buf-async-degree'] = supplied.pop('buf-async-degree', None)

    # Fusion
    opts['fuse-tasks'] = supplied.pop('fuse-tasks', False)

    # Blocking
    for key, default in (('blockinner', False),
                         ('blocklevels', cls.BLOCK_LEVELS),
                         ('blockeager', cls.BLOCK_EAGER)):
        opts[key] = supplied.pop(key, default)
    opts['blocklazy'] = supplied.pop('blocklazy', not opts['blockeager'])
    opts['blockrelax'] = supplied.pop('blockrelax', cls.BLOCK_RELAX)
    opts['skewing'] = supplied.pop('skewing', False)
    opts['par-tile'] = ParTile(supplied.pop('par-tile', False), default=16)

    # CIRE
    for key, default in (('min-storage', False),
                         ('cire-rotate', False),
                         ('cire-maxpar', False),
                         ('cire-ftemps', False),
                         ('cire-mingain', cls.CIRE_MINGAIN),
                         ('cire-schedule', cls.CIRE_SCHEDULE)):
        opts[key] = supplied.pop(key, default)

    # Shared-memory parallelism
    for key, default in (('par-collapse-ncores', cls.PAR_COLLAPSE_NCORES),
                         ('par-collapse-work', cls.PAR_COLLAPSE_WORK),
                         ('par-chunk-nonaffine', cls.PAR_CHUNK_NONAFFINE),
                         ('par-dynamic-work', cls.PAR_DYNAMIC_WORK),
                         ('par-nested', cls.PAR_NESTED)):
        opts[key] = supplied.pop(key, default)

    # Misc
    opts['optcomms'] = supplied.pop('optcomms', True)
    opts['linearize'] = supplied.pop('linearize', False)

    # Recognised but unused by the CPU backend
    supplied.pop('par-disabled', None)
    supplied.pop('gpu-fit', None)

    if supplied:
        raise InvalidOperator("Unrecognized optimization options: [%s]"
                              % ", ".join(list(supplied)))

    kwargs['options'].update(opts)

    return kwargs
def _build(cls, expressions, **kwargs):
    """
    Lower `expressions` (devito.Evaluable) into an Operator instance.

    The pipeline is: expressions -> Clusters -> ScheduleTree -> IET, after
    which the Operator object is assembled from the IET plus the byproducts
    (headers, includes, external functions) of the lowering passes. The
    instance is created via `Callable.__new__`, bypassing `cls.__init__`.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, Evaluable) for i in expressions):
        raise InvalidOperator("Only `devito.Evaluable` are allowed.")

    # Python-level (i.e., compile time) and C-level (i.e., run time) performance
    profiler = create_profile('timers')

    # Lower input expressions
    expressions = cls._lower_exprs(expressions, **kwargs)

    # Group expressions based on iteration spaces and data dependences
    clusters = cls._lower_clusters(expressions, profiler, **kwargs)

    # Lower Clusters to a ScheduleTree
    stree = cls._lower_stree(clusters, **kwargs)

    # Lower ScheduleTree to an Iteration/Expression Tree
    iet, byproduct = cls._lower_iet(stree, profiler, **kwargs)

    # Make it an actual Operator
    op = Callable.__new__(cls, **iet.args)
    Callable.__init__(op, **op.args)

    # Header files, etc.
    op._headers = list(cls._default_headers)
    op._headers.extend(byproduct.headers)
    op._globals = list(cls._default_globals)
    op._includes = list(cls._default_includes)
    op._includes.extend(profiler._default_includes)
    op._includes.extend(byproduct.includes)

    # Required for the jit-compilation
    op._compiler = kwargs['compiler']
    op._lib = None
    op._cfunction = None

    # References to local or external routines
    op._func_table = OrderedDict()
    op._func_table.update(OrderedDict([(i, MetaCall(None, False))
                                       for i in profiler._ext_calls]))
    op._func_table.update(OrderedDict([(i.root.name, i)
                                       for i in byproduct.funcs]))

    # Internal state. May be used to store information about previous runs,
    # autotuning reports, etc
    op._state = cls._initialize_state(**kwargs)

    # Produced by the various compilation passes
    op._input = filter_sorted(flatten(e.reads + e.writes for e in expressions))
    op._output = filter_sorted(flatten(e.writes for e in expressions))
    op._dimensions = flatten(c.dimensions for c in clusters) + byproduct.dimensions
    op._dimensions = sorted(set(op._dimensions), key=attrgetter('name'))
    op._dtype, op._dspace = clusters.meta
    op._profiler = profiler

    return op
def __init__(self, passes, template, platform):
    """
    Construct a rewriter from a user-supplied pass list.

    Parameters
    ----------
    passes : str or iterable of str
        Either a comma-separated string of pass names or an iterable of pass
        names.

    Raises
    ------
    InvalidOperator
        If any requested pass is not in `passes_mapper`.
    """
    try:
        passes = passes.split(',')
    except AttributeError:
        # Already in tuple format
        pass
    # Fixed: validation previously happened only for tuple input, so unknown
    # passes supplied as a comma-separated string slipped through unchecked
    if not all(i in self.passes_mapper for i in passes):
        raise InvalidOperator("Unknown passes `%s`" % str(passes))
    self.passes = passes
    super(CustomRewriter, self).__init__(template, platform)
def _normalize_kwargs(cls, **kwargs):
    """
    Canonicalize the user-supplied optimization options, filling in the
    defaults understood by this (CPU) Operator. Unconsumed options are
    rejected.
    """
    supplied = kwargs['options']
    opts = {}

    # Execution modes
    opts['openmp'] = supplied.pop('openmp')
    opts['mpi'] = supplied.pop('mpi')
    opts['parallel'] = opts['openmp']  # Backwards compatibility

    # Buffering
    opts['buf-async-degree'] = supplied.pop('buf-async-degree', None)

    # Blocking
    opts['blockinner'] = supplied.pop('blockinner', False)
    opts['blocklevels'] = supplied.pop('blocklevels', cls.BLOCK_LEVELS)

    # CIRE
    for key, default in (('min-storage', False),
                         ('cire-rotate', False),
                         ('cire-maxpar', False),
                         ('cire-maxalias', False),
                         ('cire-ftemps', False)):
        opts[key] = supplied.pop(key, default)
    opts['cire-repeats'] = {
        'invariants': supplied.pop('cire-repeats-inv', cls.CIRE_REPEATS_INV),
        'sops': supplied.pop('cire-repeats-sops', cls.CIRE_REPEATS_SOPS)
    }
    opts['cire-mincost'] = {
        'invariants': supplied.pop('cire-mincost-inv', cls.CIRE_MINCOST_INV),
        'sops': supplied.pop('cire-mincost-sops', cls.CIRE_MINCOST_SOPS)
    }

    # Shared-memory parallelism
    for key, default in (('par-collapse-ncores', cls.PAR_COLLAPSE_NCORES),
                         ('par-collapse-work', cls.PAR_COLLAPSE_WORK),
                         ('par-chunk-nonaffine', cls.PAR_CHUNK_NONAFFINE),
                         ('par-dynamic-work', cls.PAR_DYNAMIC_WORK),
                         ('par-nested', cls.PAR_NESTED)):
        opts[key] = supplied.pop(key, default)

    # Recognised but unused by the CPU backend
    supplied.pop('par-disabled', None)
    supplied.pop('gpu-direct', None)
    supplied.pop('gpu-fit', None)

    if supplied:
        raise InvalidOperator("Unrecognized optimization options: [%s]"
                              % ", ".join(list(supplied)))

    kwargs['options'].update(opts)

    return kwargs
def _build(cls, expressions, **kwargs):
    """
    Sanity-check the user-requested passes (warning on known-but-disabled
    ones, rejecting unknown ones), then defer to the parent class.
    """
    for name in as_tuple(kwargs['mode']):
        if name in cls._known_passes:
            continue
        if name in cls._known_passes_disabled:
            warning("Got explicit pass `%s`, but it's unsupported on an "
                    "Operator of type `%s`" % (name, str(cls)))
        else:
            raise InvalidOperator("Unknown pass `%s`" % name)

    return super(DeviceOpenMPCustomOperator, cls)._build(expressions, **kwargs)
def _normalize_kwargs(cls, **kwargs):
    """
    Canonicalize the user-supplied optimization options, filling in the
    defaults understood by this device Operator. Unconsumed options are
    rejected.
    """
    supplied = kwargs['options']
    opts = {}

    # Execution modes
    opts['mpi'] = supplied.pop('mpi')
    # Strictly unnecessary, but make it clear that this Operator *will*
    # generate OpenMP code, bypassing any `openmp=False` provided in
    # input to Operator
    supplied.pop('openmp')

    # Buffering
    opts['buf-async-degree'] = supplied.pop('buf-async-degree', None)

    # Blocking
    opts['blockinner'] = supplied.pop('blockinner', True)
    opts['blocklevels'] = supplied.pop('blocklevels', cls.BLOCK_LEVELS)

    # CIRE
    opts['min-storage'] = False
    opts['cire-rotate'] = False
    opts['cire-onstack'] = False
    opts['cire-maxpar'] = supplied.pop('cire-maxpar', True)
    opts['cire-maxalias'] = supplied.pop('cire-maxalias', False)
    opts['cire-repeats'] = {
        'invariants': supplied.pop('cire-repeats-inv', cls.CIRE_REPEATS_INV),
        'sops': supplied.pop('cire-repeats-sops', cls.CIRE_REPEATS_SOPS)
    }
    opts['cire-mincost'] = {
        'invariants': supplied.pop('cire-mincost-inv', cls.CIRE_MINCOST_INV),
        'sops': supplied.pop('cire-mincost-sops', cls.CIRE_MINCOST_SOPS)
    }

    # GPU parallelism
    opts['par-collapse-ncores'] = 1  # Always use a collapse clause
    opts['par-collapse-work'] = 1  # Always use a collapse clause
    opts['par-chunk-nonaffine'] = supplied.pop('par-chunk-nonaffine',
                                               cls.PAR_CHUNK_NONAFFINE)
    opts['par-dynamic-work'] = np.inf  # Always use static scheduling
    opts['par-nested'] = np.inf  # Never use nested parallelism
    opts['par-disabled'] = supplied.pop('par-disabled', True)  # No host parallelism by default
    opts['gpu-direct'] = supplied.pop('gpu-direct', True)
    opts['gpu-fit'] = as_tuple(supplied.pop('gpu-fit', None))

    if supplied:
        raise InvalidOperator("Unsupported optimization options: [%s]"
                              % ", ".join(list(supplied)))

    kwargs['options'].update(opts)

    return kwargs
def _normalize_kwargs(cls, **kwargs):
    """
    Canonicalize the user-supplied optimization options, filling in the
    defaults understood by this device Operator. Unconsumed options are
    rejected.
    """
    supplied = kwargs['options']
    opts = {}

    # Execution modes
    opts['mpi'] = supplied.pop('mpi')
    opts['parallel'] = True

    # Buffering
    opts['buf-async-degree'] = supplied.pop('buf-async-degree', None)

    # Fusion
    opts['fuse-tasks'] = supplied.pop('fuse-tasks', False)

    # Blocking
    opts['blockinner'] = supplied.pop('blockinner', True)
    opts['blocklevels'] = supplied.pop('blocklevels', cls.BLOCK_LEVELS)
    opts['skewing'] = supplied.pop('skewing', False)

    # CIRE
    opts['min-storage'] = False
    opts['cire-rotate'] = False
    opts['cire-maxpar'] = supplied.pop('cire-maxpar', True)
    opts['cire-ftemps'] = supplied.pop('cire-ftemps', False)
    opts['cire-mingain'] = supplied.pop('cire-mingain', cls.CIRE_MINGAIN)
    opts['cire-schedule'] = supplied.pop('cire-schedule', cls.CIRE_SCHEDULE)

    # GPU parallelism
    opts['par-tile'] = supplied.pop('par-tile', False)  # Parallelize using a tile-like clause
    opts['par-collapse-ncores'] = 1  # Always collapse (meaningful if `par-tile=False`)
    opts['par-collapse-work'] = 1  # Always collapse (meaningful if `par-tile=False`)
    opts['par-chunk-nonaffine'] = supplied.pop('par-chunk-nonaffine',
                                               cls.PAR_CHUNK_NONAFFINE)
    opts['par-dynamic-work'] = np.inf  # Always use static scheduling
    opts['par-nested'] = np.inf  # Never use nested parallelism
    opts['par-disabled'] = supplied.pop('par-disabled', True)  # No host parallelism by default
    opts['gpu-fit'] = as_tuple(supplied.pop('gpu-fit',
                                            cls._normalize_gpu_fit(**kwargs)))

    # Misc
    opts['linearize'] = supplied.pop('linearize', False)

    if supplied:
        raise InvalidOperator("Unsupported optimization options: [%s]"
                              % ", ".join(list(supplied)))

    kwargs['options'].update(opts)

    return kwargs
def _normalize_kwargs(cls, **kwargs):
    """
    Canonicalize the user-supplied optimization options, filling in the
    defaults understood by this device Operator. Unconsumed options are
    rejected.
    """
    supplied = kwargs['options']
    opts = {}

    # Execution modes
    opts['mpi'] = supplied.pop('mpi')
    opts['parallel'] = True

    # Buffering
    opts['buf-async-degree'] = supplied.pop('buf-async-degree', None)

    # Blocking
    opts['blockinner'] = supplied.pop('blockinner', True)
    opts['blocklevels'] = supplied.pop('blocklevels', cls.BLOCK_LEVELS)

    # CIRE
    opts['min-storage'] = False
    opts['cire-rotate'] = False
    opts['cire-maxpar'] = supplied.pop('cire-maxpar', True)
    opts['cire-maxalias'] = supplied.pop('cire-maxalias', False)
    opts['cire-ftemps'] = supplied.pop('cire-ftemps', False)
    opts['cire-mincost'] = {
        'invariants': {
            'scalar': 1,
            'tensor': supplied.pop('cire-mincost-inv', cls.CIRE_MINCOST_INV),
        },
        'sops': supplied.pop('cire-mincost-sops', cls.CIRE_MINCOST_SOPS)
    }

    # GPU parallelism
    opts['par-collapse-ncores'] = 1  # Always use a collapse clause
    opts['par-collapse-work'] = 1  # Always use a collapse clause
    opts['par-chunk-nonaffine'] = supplied.pop('par-chunk-nonaffine',
                                               cls.PAR_CHUNK_NONAFFINE)
    opts['par-dynamic-work'] = np.inf  # Always use static scheduling
    opts['par-nested'] = np.inf  # Never use nested parallelism
    opts['par-disabled'] = supplied.pop('par-disabled', True)  # No host parallelism by default
    opts['gpu-direct'] = supplied.pop('gpu-direct', True)
    opts['gpu-fit'] = as_tuple(supplied.pop('gpu-fit',
                                            cls._normalize_gpu_fit(**kwargs)))

    if supplied:
        raise InvalidOperator("Unsupported optimization options: [%s]"
                              % ", ".join(list(supplied)))

    kwargs['options'].update(opts)

    return kwargs
def _normalize_kwargs(cls, **kwargs):
    """
    Base implementation, populated with dummy values only; subclasses
    override this method with the real normalization logic.
    """
    supplied = kwargs['options']
    opts = {}

    # Execution modes
    opts['mpi'] = False
    opts['parallel'] = False

    if supplied:
        raise InvalidOperator("Unrecognized optimization options: [%s]"
                              % ", ".join(list(supplied)))

    kwargs['options'].update(opts)

    return kwargs
def fetch(self, platform=None, mode=None, language='C', **kwargs):
    """
    Retrieve an Operator for the given `<platform, mode, language>`.

    Raises
    ------
    ValueError
        If `language` is unknown.
    InvalidOperator
        If no registered Operator matches the requested triple.
    """
    if mode not in OperatorRegistry._modes:
        # DLE given as an arbitrary sequence of passes
        mode = 'custom'

    if language not in OperatorRegistry._languages:
        raise ValueError("Unknown language `%s`" % language)

    for cls in platform.__class__.mro():
        for (p, m, l), kls in self.items():
            if issubclass(p, cls) and m == mode and l == language:
                return kls

    # Fixed: the error previously interpolated the loop variables `(p, m, l)`,
    # which report the *last entry tried* rather than what was requested (and
    # raise NameError when the registry is empty)
    raise InvalidOperator("Cannot compile an Operator for `%s`"
                          % str((platform, mode, language)))
def _normalize_kwargs(cls, **kwargs):
    """
    Canonicalize the user-supplied optimization options, filling in the
    defaults understood by this (CPU) Operator. Unconsumed options are
    rejected.
    """
    supplied = kwargs['options']
    opts = {}

    # Execution modes
    opts['openmp'] = supplied.pop('openmp')
    opts['mpi'] = supplied.pop('mpi')

    # Blocking
    opts['blockinner'] = supplied.pop('blockinner', False)
    opts['blocklevels'] = supplied.pop('blocklevels', cls.BLOCK_LEVELS)

    # CIRE
    opts['min-storage'] = supplied.pop('min-storage', False)
    opts['cire-repeats'] = {
        'invariants': supplied.pop('cire-repeats-inv', cls.CIRE_REPEATS_INV),
        'sops': supplied.pop('cire-repeats-sops', cls.CIRE_REPEATS_SOPS)
    }
    opts['cire-mincost'] = {
        'invariants': supplied.pop('cire-mincost-inv', cls.CIRE_MINCOST_INV),
        'sops': supplied.pop('cire-mincost-sops', cls.CIRE_MINCOST_SOPS)
    }

    # Shared-memory parallelism
    for key, default in (('par-collapse-ncores', cls.PAR_COLLAPSE_NCORES),
                         ('par-collapse-work', cls.PAR_COLLAPSE_WORK),
                         ('par-chunk-nonaffine', cls.PAR_CHUNK_NONAFFINE),
                         ('par-dynamic-work', cls.PAR_DYNAMIC_WORK),
                         ('par-nested', cls.PAR_NESTED)):
        opts[key] = supplied.pop(key, default)

    if supplied:
        raise InvalidOperator("Unrecognized optimization options: [%s]"
                              % ", ".join(list(supplied)))

    kwargs['options'].update(opts)

    return kwargs
def _normalize_kwargs(cls, **kwargs):
    """
    Canonicalize the user-supplied optimization options, filling in the
    defaults understood by this device Operator. Unconsumed options are
    rejected.
    """
    supplied = kwargs['options']
    opts = {}

    # Execution modes
    opts['mpi'] = supplied.pop('mpi')
    # Strictly unnecessary, but make it clear that this Operator *will*
    # generate OpenMP code, bypassing any `openmp=False` provided in
    # input to Operator
    supplied.pop('openmp')

    # CIRE
    opts['min-storage'] = False
    opts['cire-repeats'] = {
        'invariants': supplied.pop('cire-repeats-inv', cls.CIRE_REPEATS_INV),
        'sops': supplied.pop('cire-repeats-sops', cls.CIRE_REPEATS_SOPS)
    }
    opts['cire-mincost'] = {
        'invariants': supplied.pop('cire-mincost-inv', cls.CIRE_MINCOST_INV),
        'sops': supplied.pop('cire-mincost-sops', cls.CIRE_MINCOST_SOPS)
    }

    # GPU parallelism
    opts['par-collapse-ncores'] = 1  # Always use a collapse clause
    opts['par-collapse-work'] = 1  # Always use a collapse clause
    opts['par-chunk-nonaffine'] = supplied.pop('par-chunk-nonaffine',
                                               cls.PAR_CHUNK_NONAFFINE)
    opts['par-dynamic-work'] = np.inf  # Always use static scheduling
    opts['par-nested'] = np.inf  # Never use nested parallelism

    if supplied:
        raise InvalidOperator("Unsupported optimization options: [%s]"
                              % ", ".join(list(supplied)))

    kwargs['options'].update(opts)

    return kwargs
def callback(self, clusters, prefix):
    """
    Replace SteppingDimension-based index accesses along the innermost
    prefix Dimension `d` with ModuloDimensions, and augment each Cluster's
    IterationSpace accordingly. Clusters are returned unchanged if `prefix`
    is empty or no SteppingDimension is found along `d`.
    """
    if not prefix:
        return clusters
    d = prefix[-1].dim

    subiters = flatten([c.ispace.sub_iterators.get(d, []) for c in clusters])
    subiters = {i for i in subiters if i.is_Stepping}
    if not subiters:
        return clusters

    # Collect the index access functions along `d`, e.g., `t + 1` where `t` is
    # a SteppingDimension for `d = time`
    mapper = DefaultOrderedDict(lambda: DefaultOrderedDict(set))
    for c in clusters:
        indexeds = [a.indexed for a in c.scope.accesses if a.function.is_Tensor]

        for i in indexeds:
            try:
                iaf = i.indices[d]
            except KeyError:
                continue

            # Sanity checks
            sis = iaf.free_symbols & subiters
            if len(sis) == 0:
                continue
            elif len(sis) == 1:
                si = sis.pop()
            else:
                raise InvalidOperator("Cannot use multiple SteppingDimensions "
                                      "to index into a Function")
            size = i.function.shape_allocated[d]
            assert is_integer(size)

            # Group by allocated size first, then by SteppingDimension
            mapper[size][si].add(iaf)

    # Construct the ModuloDimensions
    mds = []
    for size, v in mapper.items():
        for si, iafs in list(v.items()):
            # Offsets are sorted so that the semantic order (t0, t1, t2) follows
            # SymPy's index ordering (t, t-1, t+1) after modulo replacement so
            # that associativity errors are consistent. This corresponds to
            # sorting offsets {-1, 0, 1} as {0, -1, 1} assigning -inf to 0
            siafs = sorted(iafs, key=lambda i: -np.inf if i - si == 0 else (i - si))

            for iaf in siafs:
                name = '%s%d' % (si.name, len(mds))
                offset = uxreplace(iaf, {si: d.root})
                mds.append(ModuloDimension(name, si, offset, size, origin=iaf))

    # Replacement rule for ModuloDimensions
    # NOTE: closes over the outer `d`; matches only accesses whose allocated
    # size along `d` equals `size`
    def rule(size, e):
        try:
            return e.function.shape_allocated[d] == size
        except (AttributeError, KeyError):
            return False

    # Reconstruct the Clusters
    processed = []
    for c in clusters:
        # Apply substitutions to expressions
        # Note: In an expression, there could be `u[t+1, ...]` and `v[t+1,
        # ...]`, where `u` and `v` are TimeFunction with circular time
        # buffers (save=None) *but* different modulo extent. The `t+1`
        # indices above are therefore conceptually different, so they will
        # be replaced with the proper ModuloDimension through two different
        # calls to `xreplace_indices`
        exprs = c.exprs
        # NOTE: the lambda parameter `d` shadows the outer Dimension `d` here
        groups = as_mapper(mds, lambda d: d.modulo)
        for size, v in groups.items():
            mapper = {md.origin: md for md in v}
            func = partial(xreplace_indices, mapper=mapper, key=partial(rule, size))
            exprs = [e.apply(func) for e in exprs]

        # Augment IterationSpace
        ispace = IterationSpace(c.ispace.intervals,
                                {**c.ispace.sub_iterators, **{d: tuple(mds)}},
                                c.ispace.directions)

        processed.append(c.rebuild(exprs=exprs, ispace=ispace))

    return processed
def __init__(self, expressions, **kwargs):
    """
    Lower a list of SymPy expressions into a compilable kernel.

    The pipeline is: indexification/substitution -> Clusters (+ DSE rewrite)
    -> IET -> profiling -> backend specialization -> DLE transform ->
    declarations/casts. Statement order below is significant.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, sympy.Eq) for i in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Header files, etc.
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # References to local or external routines
    self.func_table = OrderedDict()

    # Expression lowering: indexification, substitution rules, specialization
    expressions = [indexify(i) for i in expressions]
    expressions = [i.xreplace(subs) for i in expressions]
    expressions = self._specialize_exprs(expressions)

    # Expression analysis
    self.input = filter_sorted(flatten(e.reads for e in expressions))
    self.output = filter_sorted(flatten(e.writes for e in expressions))
    self.dimensions = filter_sorted(flatten(e.dimensions for e in expressions))

    # Group expressions based on their iteration space and data dependences,
    # and apply the Devito Symbolic Engine (DSE) for flop optimization
    clusters = clusterize(expressions)
    clusters = rewrite(clusters, mode=set_dse_mode(dse))
    self._dtype, self._dspace = clusters.meta

    # Lower Clusters to an Iteration/Expression tree (IET)
    nodes = iet_build(clusters)

    # Introduce C-level profiling infrastructure
    nodes, self.profiler = self._profile_sections(nodes)

    # Translate into backend-specific representation (e.g., GPU, Yask)
    nodes = self._specialize_iet(nodes)

    # Apply the Devito Loop Engine (DLE) for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))

    # Update the Operator state based on the DLE
    self.dle_args = dle_state.arguments
    self.dle_flags = dle_state.flags
    self.func_table.update(OrderedDict([(i.name, MetaCall(i, True))
                                        for i in dle_state.elemental_functions]))
    self.dimensions.extend([i.argument for i in self.dle_args
                            if isinstance(i.argument, Dimension)])
    self._includes.extend(list(dle_state.includes))

    # Introduce the required symbol declarations
    nodes = iet_insert_C_decls(dle_state.nodes, self.func_table)

    # Insert data and pointer casts for array parameters and profiling structs
    nodes = self._build_casts(nodes)

    # Derive parameters as symbols not defined in the kernel itself
    parameters = self._build_parameters(nodes)

    # Finish instantiation
    super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())
def __init__(self, expressions, **kwargs):
    """
    Lower a list of SymPy expressions into a compilable kernel.

    The pipeline is: indexification/substitution -> Clusters (+ DSE rewrite)
    -> Schedule tree (schedule + section) -> IET -> profiling ->
    backend specialization -> declarations/casts. Statement order below is
    significant.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, sympy.Eq) for i in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    dse = kwargs.get("dse", configuration['dse'])

    # Header files, etc.
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # References to local or external routines
    self.func_table = OrderedDict()

    # Expression lowering: indexification, substitution rules, specialization
    expressions = [indexify(i) for i in expressions]
    expressions = [i.xreplace(subs) for i in expressions]
    expressions = self._specialize_exprs(expressions)

    # Expression analysis
    self.input = filter_sorted(flatten(e.reads for e in expressions))
    self.output = filter_sorted(flatten(e.writes for e in expressions))
    self.dimensions = filter_sorted(flatten(e.dimensions for e in expressions))

    # Group expressions based on their iteration space and data dependences,
    # and apply the Devito Symbolic Engine (DSE) for flop optimization
    clusters = clusterize(expressions)
    clusters = rewrite(clusters, mode=set_dse_mode(dse))
    self._dtype, self._dspace = clusters.meta

    # Lower Clusters to a Schedule tree
    stree = schedule(clusters)
    stree = section(stree)

    # Lower Sections to an Iteration/Expression tree (IET)
    iet = iet_build(stree)

    # Insert code for C-level performance profiling
    iet, self.profiler = self._profile_sections(iet)

    # Translate into backend-specific representation
    iet = self._specialize_iet(iet, **kwargs)

    # Insert the required symbol declarations
    iet = iet_insert_C_decls(iet, self.func_table)

    # Insert data and pointer casts for array parameters and profiling structs
    iet = self._build_casts(iet)

    # Derive parameters as symbols not defined in the kernel itself
    parameters = self._build_parameters(iet)

    # Finish instantiation
    super(Operator, self).__init__(self.name, iet, 'int', parameters, ())
def __init__(self, expressions, **kwargs):
    """
    Lower a list of SymPy expressions into a compilable kernel.

    Legacy pipeline: indexification/substitution -> Stencil-based Clusters
    (+ DSE rewrite) -> scheduled Iteration nodes -> profiling -> iteration
    variable resolution -> DLE transform -> backend specialization ->
    declarations. Statement order below is significant.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, sympy.Eq) for i in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    time_axis = kwargs.get("time_axis", Forward)
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Header files, etc.
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # Set the direction of time according to the given TimeAxis
    time.reverse = time_axis == Backward

    # Expression lowering
    expressions = [indexify(s) for s in expressions]
    expressions = [s.xreplace(subs) for s in expressions]

    # Analysis
    self.dtype = self._retrieve_dtype(expressions)
    self.input, self.output, self.dimensions = self._retrieve_symbols(expressions)
    stencils = self._retrieve_stencils(expressions)

    # Parameters of the Operator (Dimensions necessary for data casts)
    parameters = self.input + [i for i in self.dimensions if i.size is None]

    # Group expressions based on their Stencil
    clusters = clusterize(expressions, stencils)

    # Apply the Devito Symbolic Engine (DSE) for symbolic optimization
    clusters = rewrite(clusters, mode=set_dse_mode(dse))

    # Wrap expressions with Iterations according to dimensions
    nodes = self._schedule_expressions(clusters)

    # Introduce C-level profiling infrastructure
    nodes, self.profiler = self._profile_sections(nodes, parameters)

    # Resolve and substitute dimensions for loop index variables
    subs = {}
    nodes = ResolveIterationVariable().visit(nodes, subs=subs)
    nodes = SubstituteExpression(subs=subs).visit(nodes)

    # Apply the Devito Loop Engine (DLE) for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))

    # Update the Operator state based on the DLE
    self.dle_arguments = dle_state.arguments
    self.dle_flags = dle_state.flags
    self.func_table = OrderedDict([(i.name, FunMeta(i, True))
                                   for i in dle_state.elemental_functions])
    parameters.extend([i.argument for i in self.dle_arguments])
    self.dimensions.extend([i.argument for i in self.dle_arguments
                            if isinstance(i.argument, Dimension)])
    self._includes.extend(list(dle_state.includes))

    # Translate into backend-specific representation (e.g., GPU, Yask)
    nodes = self._specialize(dle_state.nodes, parameters)

    # Introduce all required C declarations
    nodes = self._insert_declarations(nodes)

    # Finish instantiation
    super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())
def __init__(self, expressions, **kwargs):
    """
    Build an OperatorBasic from SymPy equations.

    Pipeline: lowering, clustering, DSE rewrite, Iteration scheduling,
    profiling instrumentation, DLE transformation, C declarations, and
    Callable instantiation. Statement order is significant.

    Parameters
    ----------
    expressions : sympy.Eq or list of sympy.Eq
        The equations to be compiled.
    **kwargs
        ``name``, ``subs``, ``time_axis``, ``dse``, ``dle`` -- see defaults below.

    Raises
    ------
    InvalidOperator
        If any input is not a sympy.Eq.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, sympy.Eq) for i in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    time_axis = kwargs.get("time_axis", Forward)
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Default attributes required for compilation
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._lib = None
    self._cfunction = None

    # Set the direction of time according to the given TimeAxis
    # NOTE(review): mutates the module-level `time` dimension (global state)
    time.reverse = time_axis == Backward

    # Expression lowering
    expressions = [indexify(s) for s in expressions]
    expressions = [s.xreplace(subs) for s in expressions]

    # Analysis 1 - required *also after* the Operator construction
    self.dtype = self._retrieve_dtype(expressions)
    self.output = self._retrieve_output_fields(expressions)

    # Analysis 2 - required *for* the Operator construction
    ordering = self._retrieve_loop_ordering(expressions)
    stencils = self._retrieve_stencils(expressions)

    # Group expressions based on their Stencil
    clusters = clusterize(expressions, stencils)

    # Apply the Devito Symbolic Engine for symbolic optimization
    clusters = rewrite(clusters, mode=dse)

    # Wrap expressions with Iterations according to dimensions
    nodes = self._schedule_expressions(clusters, ordering)

    # Introduce C-level profiling infrastructure
    self.sections = OrderedDict()
    nodes = self._profile_sections(nodes)

    # Parameters of the Operator (Dimensions necessary for data casts);
    # buffered dimensions also drag in their parent dimension
    parameters = FindSymbols('kernel-data').visit(nodes)
    dimensions = FindSymbols('dimensions').visit(nodes)
    dimensions += [d.parent for d in dimensions if d.is_Buffered]
    parameters += filter_ordered([d for d in dimensions if d.size is None],
                                 key=operator.attrgetter('name'))

    # Resolve and substitute dimensions for loop index variables
    # (`subs` is repurposed; the visitor populates it in place)
    subs = {}
    nodes = ResolveIterationVariable().visit(nodes, subs=subs)
    nodes = SubstituteExpression(subs=subs).visit(nodes)

    # Apply the Devito Loop Engine for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))
    parameters += [i.argument for i in dle_state.arguments]
    self._includes.extend(list(dle_state.includes))

    # Introduce all required C declarations
    nodes, elemental_functions = self._insert_declarations(dle_state, parameters)
    self.elemental_functions = elemental_functions

    # Track the DLE output, as it might be useful at execution time
    self._dle_state = dle_state

    # Finish instantiation
    super(OperatorBasic, self).__init__(self.name, nodes, 'int', parameters, ())
def parse_kwargs(**kwargs):
    """
    Parse keyword arguments provided to an Operator.

    Normalizes the deprecated `dse`, `dle` and `openmp` arguments into the
    modern `opt`/`options`/`language` form, and resolves the `platform`,
    `language` and `compiler` arguments against the global `configuration`.

    Returns
    -------
    dict
        The (mutated) `kwargs`, with the keys `options`, `mode`, `platform`,
        `language` and `compiler` populated.

    Raises
    ------
    InvalidOperator
        If an illegal `opt`, `platform`, `language` or `compiler` is given.
    ValueError
        If `platform`, `language` or `compiler` is given but is not a `str`.
    """
    # `opt` -- possibly sourced from the deprecated `dse`/`dle` arguments
    opt = _pop_opt(kwargs)
    mode, options = _normalize_opt(opt)

    # `openmp` is a deprecated kwarg; it may still drive the `language` default.
    # Note: computed *before* `options` is copied/augmented below, so it sees
    # only what the user actually supplied
    kwopenmp = kwargs.get('openmp', options.get('openmp'))
    if kwopenmp is None:
        openmp = kwargs.get('language', configuration['language']) == 'openmp'
    else:
        openmp = kwopenmp

    # `opt`, options -- user-provided options win over configuration defaults
    options = dict(options)
    options.setdefault('openmp', openmp)
    options.setdefault('mpi', configuration['mpi'])
    for k, v in configuration['opt-options'].items():
        options.setdefault(k, v)
    kwargs['options'] = options

    # `opt`, mode
    kwargs['mode'] = 'noop' if mode is None else mode

    # `platform`, `language`, `compiler` -- the raw (pre-resolution) values of
    # the first two determine whether the compiler must be re-derived
    platform = _parse_platform(kwargs)
    language = _parse_language(kwargs, kwopenmp, openmp)
    _parse_compiler(kwargs, platform, language)

    return kwargs


def _pop_opt(kwargs):
    """
    Extract the optimization level from `kwargs`, honoring the deprecated
    `dse` (dropped) and `dle` (renamed to `opt`) arguments, emitting
    deprecation warnings as appropriate.
    """
    # `dse` -- deprecated, dropped
    dse = kwargs.pop("dse", None)
    if dse is not None:
        warning(
            "The `dse` argument is deprecated. "
            "The optimization level is now controlled via the `opt` argument")

    # `dle` -- deprecated, replaced by `opt`
    if 'dle' in kwargs:
        warning(
            "The `dle` argument is deprecated. "
            "The optimization level is now controlled via the `opt` argument")
        dle = kwargs.pop('dle')
        if 'opt' in kwargs:
            warning("Both `dle` and `opt` were passed; ignoring `dle` argument")
            return kwargs.pop('opt')
        warning("Setting `opt=%s`" % str(dle))
        return dle
    elif 'opt' in kwargs:
        return kwargs.pop('opt')
    else:
        return configuration['opt']


def _normalize_opt(opt):
    """
    Split an `opt` value into `(mode, options)`.

    `opt` may be None/empty, a str, or a tuple of pass names optionally
    terminated by an options dict; comma-separated pass names are flattened.

    Raises InvalidOperator for any other type.
    """
    if not opt or isinstance(opt, str):
        return opt, {}
    elif isinstance(opt, tuple):
        if len(opt) == 0:
            return 'noop', {}
        elif isinstance(opt[-1], dict):
            if len(opt) == 2:
                # Exactly `(mode, options)` already
                mode, options = opt
                return mode, options
            else:
                return tuple(flatten(i.split(',') for i in opt[:-1])), opt[-1]
        else:
            return tuple(flatten(i.split(',') for i in opt)), {}
    else:
        raise InvalidOperator("Illegal `opt=%s`" % str(opt))


def _parse_platform(kwargs):
    """
    Validate the `platform` kwarg and store the resolved Platform object in
    `kwargs['platform']`. Returns the raw user-provided value (or None).
    """
    platform = kwargs.get('platform')
    if platform is not None:
        if not isinstance(platform, str):
            raise ValueError("Argument `platform` should be a `str`")
        if platform not in configuration._accepted['platform']:
            raise InvalidOperator("Illegal `platform=%s`" % str(platform))
        kwargs['platform'] = platform_registry[platform]()
    else:
        kwargs['platform'] = configuration['platform']
    return platform


def _parse_language(kwargs, kwopenmp, openmp):
    """
    Validate the `language` kwarg (falling back to the deprecated `openmp`
    kwarg, then to `configuration`) and store it in `kwargs['language']`.
    Returns the raw user-provided value (or None).
    """
    language = kwargs.get('language')
    if language is not None:
        if not isinstance(language, str):
            raise ValueError("Argument `language` should be a `str`")
        if language not in configuration._accepted['language']:
            raise InvalidOperator("Illegal `language=%s`" % str(language))
        kwargs['language'] = language
    elif kwopenmp is not None:
        # Handle deprecated `openmp` kwarg for backward compatibility
        kwargs['language'] = 'openmp' if openmp else 'C'
    else:
        kwargs['language'] = configuration['language']
    return language


def _parse_compiler(kwargs, platform, language):
    """
    Validate the `compiler` kwarg and store the resolved compiler object in
    `kwargs['compiler']`. If no compiler was given but a `platform` or
    `language` was, re-derive the configured compiler for them.
    """
    compiler = kwargs.get('compiler')
    if compiler is not None:
        if not isinstance(compiler, str):
            raise ValueError("Argument `compiler` should be a `str`")
        if compiler not in configuration._accepted['compiler']:
            raise InvalidOperator("Illegal `compiler=%s`" % str(compiler))
        kwargs['compiler'] = compiler_registry[compiler](platform=kwargs['platform'],
                                                         language=kwargs['language'])
    elif any([platform, language]):
        kwargs['compiler'] =\
            configuration['compiler'].__new_from__(platform=kwargs['platform'],
                                                   language=kwargs['language'])
    else:
        kwargs['compiler'] = configuration['compiler']
def __init__(self, expressions, **kwargs):
    """
    Build an Operator from SymPy equations.

    Pipeline: LoweredEq lowering, clustering, DSE rewrite, IET construction,
    profiling instrumentation, backend specialization, DLE transformation,
    C declarations, ArgumentEngine setup, and Callable instantiation.
    Statement order is significant.

    Parameters
    ----------
    expressions : sympy.Eq or list of sympy.Eq
        The equations to be compiled.
    **kwargs
        ``name``, ``subs``, ``time_axis``, ``dse``, ``dle`` -- see defaults below.

    Raises
    ------
    InvalidOperator
        If any input is not a sympy.Eq.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, sympy.Eq) for i in expressions):
        raise InvalidOperator("Only SymPy expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    time_axis = kwargs.get("time_axis", Forward)
    dse = kwargs.get("dse", configuration['dse'])
    dle = kwargs.get("dle", configuration['dle'])

    # Header files, etc.
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # References to local or external routines
    self.func_table = OrderedDict()

    # Expression lowering and analysis
    expressions = [LoweredEq(e, subs=subs) for e in expressions]
    self.dtype = retrieve_dtype(expressions)
    self.input, self.output, self.dimensions = retrieve_symbols(expressions)

    # Set the direction of time according to the given TimeAxis
    # (only non-stepping time dimensions are reversed; note that `time`
    # here shadows any module-level name of the same spelling)
    for time in [d for d in self.dimensions if d.is_Time]:
        if not time.is_Stepping:
            time.reverse = time_axis == Backward

    # Parameters of the Operator (Dimensions necessary for data casts)
    parameters = self.input + self.dimensions

    # Group expressions based on their iteration space and data dependences,
    # and apply the Devito Symbolic Engine (DSE) for flop optimization
    clusters = clusterize(expressions)
    clusters = rewrite(clusters, mode=set_dse_mode(dse))

    # Lower Clusters to an Iteration/Expression tree (IET)
    nodes = iet_build(clusters, self.dtype)

    # Introduce C-level profiling infrastructure
    nodes, self.profiler = self._profile_sections(nodes, parameters)

    # Translate into backend-specific representation (e.g., GPU, Yask)
    nodes = self._specialize(nodes, parameters)

    # Apply the Devito Loop Engine (DLE) for loop optimization
    dle_state = transform(nodes, *set_dle_mode(dle))

    # Update the Operator state based on the DLE: record extra arguments,
    # flags, and the elemental functions produced by loop transformations
    self.dle_arguments = dle_state.arguments
    self.dle_flags = dle_state.flags
    self.func_table.update(
        OrderedDict([(i.name, MetaCall(i, True))
                     for i in dle_state.elemental_functions]))
    parameters.extend([i.argument for i in self.dle_arguments])
    self.dimensions.extend([
        i.argument for i in self.dle_arguments
        if isinstance(i.argument, Dimension)
    ])
    self._includes.extend(list(dle_state.includes))

    # Introduce the required symbol declarations
    nodes = iet_insert_C_decls(dle_state.nodes, self.func_table)

    # Initialise ArgumentEngine, which takes over the final parameter list
    self.argument_engine = ArgumentEngine(clusters.ispace, parameters,
                                          self.dle_arguments)
    parameters = self.argument_engine.arguments

    # Finish instantiation
    super(Operator, self).__init__(self.name, nodes, 'int', parameters, ())
def _build(cls, expressions, **kwargs):
    """
    Build an Operator instance from `devito.Eq` expressions.

    Runs the compilation pipeline (lowering, clustering, schedule-tree and
    IET construction, profiling, target specialization), then constructs the
    final object via `Callable.__new__`/`Callable.__init__` rather than the
    normal constructor, and attaches all derived state onto it.

    Parameters
    ----------
    expressions : devito.Eq or list of devito.Eq
        The equations to be compiled.
    **kwargs
        ``name`` (str, default "Kernel"), ``dse`` (optimization level),
        plus whatever `_lower_exprs`/`_specialize_iet`/`_initialize_state`
        accept.

    Returns
    -------
    The fully-initialized Operator instance.

    Raises
    ------
    InvalidOperator
        If any input is not a devito.Eq.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, Eq) for i in expressions):
        raise InvalidOperator("Only `devito.Eq` expressions are allowed.")

    name = kwargs.get("name", "Kernel")
    dse = kwargs.get("dse", configuration['dse'])

    # Python-level (i.e., compile time) and C-level (i.e., run time) performance
    profiler = create_profile('timers')

    # Lower input expressions to internal expressions (e.g., attaching metadata)
    expressions = cls._lower_exprs(expressions, **kwargs)

    # Group expressions based on their iteration space and data dependences
    # Several optimizations are applied (fusion, lifting, flop reduction via DSE, ...)
    clusters = clusterize(expressions, dse_mode=set_dse_mode(dse))

    # Lower Clusters to a Schedule tree
    stree = st_build(clusters)

    # Lower Schedule tree to an Iteration/Expression tree (IET)
    iet = iet_build(stree)

    # Instrument the IET for C-level profiling
    iet = profiler.instrument(iet)

    # Wrap the IET with a Callable
    parameters = derive_parameters(iet, True)
    op = Callable(name, iet, 'int', parameters, ())

    # Lower IET to a Target-specific IET
    op, target_state = cls._specialize_iet(op, **kwargs)

    # Make it an actual Operator: rebuild the Callable as an instance of
    # `cls` without re-running this pipeline
    op = Callable.__new__(cls, **op.args)
    Callable.__init__(op, **op.args)

    # Header files, etc.
    op._headers = list(cls._default_headers)
    op._headers.extend(target_state.headers)
    op._globals = list(cls._default_globals)
    op._includes = list(cls._default_includes)
    op._includes.extend(profiler._default_includes)
    op._includes.extend(target_state.includes)

    # Required for the jit-compilation
    op._compiler = configuration['compiler']
    op._lib = None
    op._cfunction = None

    # References to local or external routines
    op._func_table = OrderedDict()
    op._func_table.update(
        OrderedDict([(i, MetaCall(None, False)) for i in profiler._ext_calls]))
    op._func_table.update(
        OrderedDict([(i.root.name, i) for i in target_state.funcs]))

    # Internal state. May be used to store information about previous runs,
    # autotuning reports, etc
    op._state = cls._initialize_state(**kwargs)

    # Produced by the various compilation passes
    op._input = filter_sorted(
        flatten(e.reads + e.writes for e in expressions))
    op._output = filter_sorted(flatten(e.writes for e in expressions))
    op._dimensions = filter_sorted(
        flatten(e.dimensions for e in expressions))
    op._dimensions.extend(target_state.dimensions)
    op._dtype, op._dspace = clusters.meta
    op._profiler = profiler

    return op
def __init__(self, expressions, **kwargs):
    """
    Build an Operator from `devito.Eq` expressions.

    Pipeline: implicit-expression gathering, lowering (derivative
    evaluation, indexification, substitutions, specialization), clustering,
    schedule-tree and IET construction, profiling, target specialization,
    finalization, and Callable instantiation. Statement order is significant.

    Parameters
    ----------
    expressions : devito.Eq or list of devito.Eq
        The equations to be compiled.
    **kwargs
        ``name`` (str, default "Kernel"), ``subs`` (dict of substitutions),
        ``dse`` (optimization level), plus whatever `_initialize_state` and
        `_specialize_iet` accept.

    Raises
    ------
    InvalidOperator
        If any input is not a devito.Eq.
    """
    expressions = as_tuple(expressions)

    # Input check
    if any(not isinstance(i, Eq) for i in expressions):
        raise InvalidOperator("Only `devito.Eq` expressions are allowed.")

    self.name = kwargs.get("name", "Kernel")
    subs = kwargs.get("subs", {})
    dse = kwargs.get("dse", configuration['dse'])

    # Header files, etc.
    self._headers = list(self._default_headers)
    self._includes = list(self._default_includes)
    self._globals = list(self._default_globals)

    # Required for compilation
    self._compiler = configuration['compiler']
    self._lib = None
    self._cfunction = None

    # References to local or external routines
    self._func_table = OrderedDict()

    # Internal state. May be used to store information about previous runs,
    # autotuning reports, etc
    self._state = self._initialize_state(**kwargs)

    # Form and gather any required implicit expressions
    expressions = self._add_implicit(expressions)

    # Expression lowering: evaluation of derivatives, indexification,
    # substitution rules, specialization
    expressions = [i.evaluate for i in expressions]
    expressions = [indexify(i) for i in expressions]
    expressions = self._apply_substitutions(expressions, subs)
    expressions = self._specialize_exprs(expressions)

    # Expression analysis
    self._input = filter_sorted(
        flatten(e.reads + e.writes for e in expressions))
    self._output = filter_sorted(flatten(e.writes for e in expressions))
    self._dimensions = filter_sorted(
        flatten(e.dimensions for e in expressions))

    # Group expressions based on their iteration space and data dependences
    # Several optimizations are applied (fusion, lifting, flop reduction via DSE, ...)
    clusters = clusterize(expressions, dse_mode=set_dse_mode(dse))
    self._dtype, self._dspace = clusters.meta

    # Lower Clusters to a Schedule tree
    stree = st_build(clusters)

    # Lower Schedule tree to an Iteration/Expression tree (IET)
    iet = iet_build(stree)

    # Instrument for C-level profiling, then lower to a target-specific IET
    iet, self._profiler = self._profile_sections(iet)
    iet = self._specialize_iet(iet, **kwargs)

    # Derive all Operator parameters based on the IET
    parameters = derive_parameters(iet, True)

    # Finalization: introduce declarations, type casts, etc
    iet = self._finalize(iet, parameters)

    super(Operator, self).__init__(self.name, iet, 'int', parameters, ())
def exit(emsg):
    """
    Abort by raising an InvalidOperator carrying the YASK error message.

    Parameters
    ----------
    emsg : str
        Description of the fatal YASK error.
    """
    msg = "YASK Error [%s]. Exiting..." % emsg
    raise InvalidOperator(msg)
def __init__(self, function, contracted_dims, accessv, n, async_degree):
    """
    Build the buffering descriptor for `function`.

    For each dimension in `contracted_dims`, computes the minimal buffer
    size implied by the accesses in `accessv`, replaces the dimension with
    a CustomDimension (and, if needed, a SteppingDimension for cyclic
    indexing), tracks any SubDimensions used to index into `function`,
    and finally allocates the backing Array.

    Parameters
    ----------
    function
        The Function being buffered.
    contracted_dims
        The Dimensions of `function` to contract into buffer slots.
    accessv
        Access descriptor; provides `.accesses` and `.mapper` over the
        expressions reading/writing `function`.
    n : int
        Unique id, used to name the generated Dimensions (`db<n>`, `sb<n>`).
    async_degree : int or None
        Requested buffer size; ignored (with a warning) if smaller than the
        minimum required size.

    Raises
    ------
    InvalidOperator
        If `function` is accessed over overlapping SubDimensions.
    NotImplementedError
        If accessed over multiple non-overlapping SubDimensions.
    """
    self.function = function
    self.accessv = accessv

    contraction_mapper = {}
    index_mapper = {}
    dims = list(function.dimensions)
    for d in contracted_dims:
        assert d in function.dimensions

        # Determine the buffer size along `d`: normalize each access index
        # (d -> 0, spacing -> 1) and span the resulting slots
        indices = filter_ordered(i.indices[d] for i in accessv.accesses)
        slots = [i.xreplace({d: 0, d.spacing: 1}) for i in indices]
        size = max(slots) - min(slots) + 1

        if async_degree is not None:
            if async_degree < size:
                warning("Ignoring provided asynchronous degree as it'd be "
                        "too small for the required buffer (provided %d, "
                        "but need at least %d for `%s`)"
                        % (async_degree, size, function.name))
            else:
                size = async_degree

        # Replace `d` with a suitable CustomDimension
        bd = CustomDimension('db%d' % n, 0, size-1, size, d)
        contraction_mapper[d] = dims[dims.index(d)] = bd

        if size > 1:
            # Create the necessary SteppingDimensions for indexing
            sd = SteppingDimension(name='sb%d' % n, parent=bd)
            index_mapper.update({i: i.xreplace({d: sd}) for i in indices})
        else:
            # Special case, no need to keep a SteppingDimension around
            index_mapper.update({i: 0 for i in indices})

    self.contraction_mapper = contraction_mapper
    self.index_mapper = index_mapper

    # Track the SubDimensions used to index into `function`
    subdims_mapper = DefaultOrderedDict(set)
    for e in accessv.mapper:
        try:
            # Case 1: implicitly via SubDomains
            m = {d.root: v for d, v in e.subdomain.dimension_map.items()}
        except AttributeError:
            # Case 2: explicitly via the lower-level SubDimension API
            m = {i.root: i for i in e.free_symbols
                 if isinstance(i, Dimension) and (i.is_Sub or not i.is_Derived)}
        for d, v in m.items():
            subdims_mapper[d].add(v)

    if any(len(v) > 1 for v in subdims_mapper.values()):
        # Non-uniform SubDimensions. At this point we're going to raise
        # an exception. It's either illegal or still unsupported
        for v in subdims_mapper.values():
            for d0, d1 in combinations(v, 2):
                if d0.overlap(d1):
                    raise InvalidOperator("Cannot apply `buffering` to `%s` as it "
                                          "is accessed over the overlapping "
                                          " SubDimensions `<%s, %s>`" %
                                          (function, d0, d1))
        # Unreachable state marker; the raise below always fires
        self.subdims_mapper = None
        raise NotImplementedError("`buffering` does not support multiple "
                                  "non-overlapping SubDimensions yet.")
    else:
        self.subdims_mapper = {d: v.pop() for d, v in subdims_mapper.items()}

    # The backing buffer; 'mapped' space — presumably host/device-mapped
    # memory, TODO confirm against the Array implementation
    self.buffer = Array(name='%sb' % function.name,
                        dimensions=dims,
                        dtype=function.dtype,
                        halo=function.halo,
                        space='mapped')