def _loop_wrapping(self, iet):
    """
    Emit a performance warning for each WRAPPABLE Iteration found in ``iet``,
    since these are a symptom that unnecessary memory is being allocated.

    The IET itself is returned unchanged, along with empty metadata.
    """
    wrappable = (n for n in FindNodes(Iteration).visit(iet) if n.is_Wrappable)
    for n in wrappable:
        perf_adv("Functions using modulo iteration along Dimension `%s` "
                 "may safely allocate a one slot smaller buffer" % n.dim)
    return iet, {}
def analyze(self, iet):
    """
    Analyze the Sections in the given IET. This populates `self._sections`
    with one `SectionData` entry per Section name; Sections whose name has
    already been recorded are skipped.
    """
    for section in FindNodes(Section).visit(iet):
        if section.name in self._sections:
            continue

        bundles = FindNodes(ExpressionBundle).visit(section)

        # Total operation count
        ops = sum(b.ops*b.ispace.size for b in bundles)

        # Operation count at each section iteration
        sops = sum(b.ops for b in bundles)

        # Total memory traffic: group the traffic entries of all bundles by key
        grouped = {}
        for b in bundles:
            for key, value in b.traffic.items():
                grouped.setdefault(key, []).append(value)
        traffic = 0
        for group in grouped.values():
            try:
                traffic += IntervalGroup.generate('union', *group).size
            except (ValueError, TypeError):
                # Over different iteration spaces
                traffic += sum(g.size for g in group)

        # Each ExpressionBundle lives in its own iteration space
        itermaps = [b.ispace.dimension_map for b in bundles if b.ops != 0]

        # Track how many grid points are written within `section`; equal
        # sizes are counted once since they are collected in a set
        points = {b.size for b in bundles
                  if any(e.write.is_TimeFunction for e in b.exprs)}
        points = sum(points, S.Zero)

        self._sections[section.name] = SectionData(ops, sops, points, traffic,
                                                   itermaps)
def test_no_fusion_simple(self):
    """
    When ConditionalDimensions are present, Clusters must not be fused, so
    that the Eqs ultimately get scheduled to different loop nests.
    """
    grid = Grid(shape=(4, 4, 4))
    time = grid.time_dim

    f = TimeFunction(name='f', grid=grid)
    g = Function(name='g', grid=grid)
    h = Function(name='h', grid=grid)

    # Baseline without ConditionalDimensions: fused and optimized
    op = Operator([Eq(f.forward, f + 1),
                   Eq(h, f + 1),
                   Eq(g, f + 1)])

    bf0 = FindNodes(Expression).visit(op._func_table['bf0'].root)
    assert len(bf0) == 4
    # All three equations read the value produced by the first expression
    assert all(e.expr.rhs is bf0[0].output for e in bf0[1:])

    # Same equations, but the last one is guarded by a ConditionalDimension:
    # no fusion, no optimization
    ctime = ConditionalDimension(name='ctime', parent=time, condition=time > 4)

    op = Operator([Eq(f.forward, f + 1),
                   Eq(h, f + 1),
                   Eq(g, f + 1, implicit_dims=[ctime])])

    bf0 = FindNodes(Expression).visit(op._func_table['bf0'].root)
    assert len(bf0) == 3
    assert all(e.expr.rhs is bf0[0].output for e in bf0[1:])

    bf1 = FindNodes(Expression).visit(op._func_table['bf1'].root)
    assert len(bf1) == 1
def test_topofusion_w_subdims_conddims_v3(self):
    """
    Like `test_topofusion_w_subdims_conddims_v2` but with an extra
    anti-dependence, which causes scheduling over more loop nests.
    """
    grid = Grid(shape=(4, 4, 4))
    time = grid.time_dim

    f = TimeFunction(name='f', grid=grid, time_order=2)
    g = TimeFunction(name='g', grid=grid, time_order=2)
    h = TimeFunction(name='h', grid=grid, time_order=2)
    fsave = TimeFunction(name='fsave', grid=grid, time_order=2, save=5)
    gsave = TimeFunction(name='gsave', grid=grid, time_order=2, save=5)

    ctime = ConditionalDimension(name='ctime', parent=time, condition=time > 4)

    interior = grid.interior
    op = Operator([
        Eq(f.forward, f + 1, subdomain=interior),
        Eq(g.forward, g + 1, subdomain=interior),
        Eq(fsave, f.dt2, implicit_dims=[ctime]),
        Eq(h, f.dt2.dx + g, subdomain=interior),
        Eq(gsave, g.dt2, implicit_dims=[ctime]),
    ])

    # Check generated code -- expect the gsave equation to be scheduled
    # together in the same loop nest with the fsave equation
    assert len(op._func_table) == 3

    def expressions(name):
        return FindNodes(Expression).visit(op._func_table[name].root)

    bf0 = expressions('bf0')
    assert len(bf0) == 2
    assert bf0[0].write is f
    assert bf0[1].write is g

    bf1 = expressions('bf1')
    assert len(bf1) == 3
    assert bf1[1].write is fsave
    assert bf1[2].write is gsave

    bf2 = expressions('bf2')
    assert len(bf2) == 2
    assert bf2[1].write is h
def test_stencil_nowrite_implies_haloupdate(self):
    """
    A stencil read of `f` that is written into `g` must produce exactly one
    Call in the generated Operator.
    """
    grid = Grid(shape=(12, ))
    x = grid.dimensions[0]
    t = grid.stepping_dim

    f = TimeFunction(name='f', grid=grid)
    g = Function(name='g', grid=grid)

    stencil = f[t, x - 1] + f[t, x + 1] + 1.
    op = Operator(Eq(g, stencil))

    assert len(FindNodes(Call).visit(op)) == 1
def process(self, iet):
    """
    Lower the SyncSpots found in ``iet`` by applying, for each of them, the
    callback registered for each type of SyncOp it carries.

    Returns the transformed IET together with a metadata dict carrying the
    new `efuncs`, the required `includes` and the extra `args`. If ``iet``
    contains no SyncSpots, it is returned unchanged with empty metadata.
    """
    sync_spots = FindNodes(SyncSpot).visit(iet)
    if not sync_spots:
        return iet, {}

    # The SyncOps are to be processed in the following order
    order = [WaitLock, WithLock, Delete, FetchUpdate, FetchPrefetch,
             PrefetchUpdate, WaitPrefetch]

    callbacks = {
        WaitLock: self._make_waitlock,
        WithLock: self._make_withlock,
        Delete: self._make_delete,
        FetchUpdate: self._make_fetchupdate,
        FetchPrefetch: self._make_fetchprefetch,
        PrefetchUpdate: self._make_prefetchupdate
    }
    postponed_callbacks = {WaitPrefetch: self._make_waitprefetch}
    all_callbacks = [callbacks, postponed_callbacks]

    pieces = namedtuple('Pieces', 'init finalize funcs objs')([], [], [], Objs())

    # The processing is a two-step procedure; first, we apply the `callbacks`;
    # then, the `postponed_callbacks`, as these depend on objects produced by the
    # `callbacks`
    subs = {}
    for cbks in all_callbacks:
        for n in sync_spots:
            mapper = as_mapper(n.sync_ops, type)
            for _type in sorted(mapper, key=order.index):
                # A SyncOp type missing from `cbks` belongs to the other
                # step. Look it up explicitly rather than catching KeyError,
                # so that a KeyError raised *inside* a callback is no longer
                # silently swallowed
                callback = cbks.get(_type)
                if callback is None:
                    continue
                subs[n] = callback(subs.get(n, n), mapper[_type], pieces, iet)

    iet = Transformer(subs).visit(iet)

    # Add initialization and finalization code
    init = List(body=pieces.init, footer=c.Line())
    finalize = List(header=c.Line(), body=pieces.finalize)
    iet = iet._rebuild(body=(init, ) + iet.body + (finalize, ))

    return iet, {
        'efuncs': pieces.funcs,
        'includes': ['pthread.h'],
        'args': [i.size for i in pieces.objs.threads if not is_integer(i.size)]
    }
def fold_blockable_tree(node, exclude_innermost=False):
    """
    Create IterationFolds from sequences of nested Iterations.

    Parameters
    ----------
    node : Node
        The Iteration/Expression tree to be searched and transformed.
    exclude_innermost : bool, optional
        If True, the innermost Iteration is not folded. Defaults to False.

    Returns
    -------
    Node
        The tree resulting from replacing the foldable Iterations in
        ``node`` with IterationFolds.
    """
    found = FindAdjacent(Iteration).visit(node)

    mapper = {}
    for groups in found.values():
        for group in groups:
            # Pre-condition: they all must be perfect iterations
            assert len(group) > 1
            if any(not IsPerfectIteration().visit(it) for it in group):
                continue

            # Only retain consecutive trees having same depth
            candidates = [retrieve_iteration_tree(it)[0] for it in group]
            depth = len(candidates[0])
            trees = []
            for tree in candidates:
                if len(tree) != depth:
                    break
                trees.append(tree)
            if not trees:
                continue

            # Check foldability
            pairwise_folds = list(zip(*reversed(trees)))
            if any(not is_foldable(fold) for fold in pairwise_folds):
                continue

            # Maybe heuristically exclude innermost Iteration
            if exclude_innermost is True:
                pairwise_folds = pairwise_folds[:-1]

            # Perhaps there's nothing to fold
            if len(pairwise_folds) == 1:
                continue

            # TODO: we do not currently support blocking if any of the foldable
            # iterations writes to user data (need min/max loop bounds?)
            exprs = flatten(FindNodes(Expression).visit(tree.root)
                            for tree in trees[:-1])
            if any(e.write.is_Input for e in exprs):
                continue

            # Perform folding: the first Iteration of each fold becomes the
            # IterationFold, while the remaining ones are mapped to None
            for fold in pairwise_folds:
                root, *remainder = fold
                folds = []
                for it in remainder:
                    offsets = tuple(y - x for x, y in zip(it.offsets, root.offsets))
                    folds.append((offsets, it.nodes))
                mapper[root] = IterationFold(folds=folds, **root.args)
                mapper.update(dict.fromkeys(remainder))

    # Insert the IterationFolds in the Iteration/Expression tree
    return Transformer(mapper, nested=True).visit(node)
def test_avoid_haloupdate_if_distr_but_sequential(self):
    """
    Neither of the two x-loops generated here should be preceded by a halo
    exchange.
    """
    grid = Grid(shape=(12, ))
    x = grid.dimensions[0]
    t = grid.stepping_dim

    f = TimeFunction(name='f', grid=grid)

    # There is an anti-dependence between the first and second Eqs, so
    # the compiler places them in different x-loops. However, none of the
    # two loops should be preceded by a halo exchange, though for different
    # reasons:
    # * the equation in the first loop has no stencil
    # * the equation in the second loop is inherently sequential, so the
    #   compiler should be sufficiently smart to see that there is no point
    #   in adding a halo exchange
    pointwise = Eq(f, f + 1)
    sequential = Eq(f, f[t, x - 1] + f[t, x + 1] + 1.)
    op = Operator([pointwise, sequential])

    assert len(FindNodes(Iteration).visit(op)) == 3
    assert len(FindNodes(Call).visit(op)) == 0
def _simdize(self, iet):
    """
    Strip the VECTOR property from all vectorizable Iterations in ``iet``.

    Returns the transformed IET along with empty metadata.
    """
    # No SIMD-ization for devices. We then drop the VECTOR property
    # so that later passes can perform more aggressive transformations
    mapper = {
        it: it._rebuild(properties=[p for p in it.properties if p is not VECTOR])
        for it in FindNodes(Iteration).visit(iet)
        if it.is_Vectorizable
    }
    return Transformer(mapper).visit(iet), {}
def mpiize(iet, **kwargs):
    """
    Add MPI routines performing halo exchanges to emit distributed-memory
    parallel code.

    The keyword arguments `mode`, `language` and `sregistry` are required.
    Returns the transformed IET and a metadata dict with the `includes`,
    `efuncs` and `args` produced by the halo exchange builders.
    """
    mode = kwargs.pop('mode')
    language = kwargs.pop('language')
    sregistry = kwargs.pop('sregistry')

    # To produce unique object names
    generators = {'msg': generator(), 'comm': generator(), 'comp': generator()}
    sync_heb = HaloExchangeBuilder('basic', language, sregistry, **generators)
    user_heb = HaloExchangeBuilder(mode, language, sregistry, **generators)

    # Overlappable HaloSpots go through the user-selected builder, all
    # others through the 'basic' one
    exchanges = {}
    for hs in FindNodes(HaloSpot).visit(iet):
        if isinstance(hs, OverlappableHaloSpot):
            exchanges[hs] = user_heb.make(hs)
        else:
            exchanges[hs] = sync_heb.make(hs)
    efuncs = sync_heb.efuncs + user_heb.efuncs
    objs = filter_sorted(sync_heb.objs + user_heb.objs)
    iet = Transformer(exchanges, nested=True).visit(iet)

    # Must drop the PARALLEL tag from the Iterations within which halo
    # exchanges are performed
    untagged = {}
    for tree in retrieve_iteration_tree(iet):
        for it in reversed(tree):
            if it in untagged:
                # Already seen this subtree, skip
                break
            if not FindNodes(Call).visit(it):
                continue
            # `it` encloses a Call: untag it and all of its ancestors in `tree`
            for n in tree[:tree.index(it) + 1]:
                untagged[n] = n._rebuild(properties=set(n.properties) - {PARALLEL})
            break
    iet = Transformer(untagged, nested=True).visit(iet)

    return iet, {'includes': ['mpi.h'], 'efuncs': efuncs, 'args': objs}
def test_cache_blocking_structure(blockinner, exp_calls, exp_iters):
    """
    Check the structure of the code generated with loop blocking, both
    without and with OpenMP enabled.
    """
    # Check code structure
    opt = ('blocking', {'blockinner': blockinner})
    _, op = _new_operator2((10, 31, 45), time_order=2, opt=opt)

    assert len(FindNodes(Call).visit(op)) == exp_calls

    trees = retrieve_iteration_tree(op._func_table['bf0'].root)
    assert len(trees) == 1
    tree = trees[0]
    assert len(tree) == exp_iters
    if blockinner:
        assert all(it.dim.is_Incr for it in tree)
    else:
        assert all(it.dim.is_Incr for it in tree[:-1])
        assert not tree[-1].dim.is_Incr

    # Check presence of openmp pragmas at the right place
    opt = ('blocking', {'openmp': True, 'blockinner': blockinner})
    _, op = _new_operator2((10, 31, 45), time_order=2, opt=opt)

    bf0_root = op._func_table['bf0'].root
    trees = retrieve_iteration_tree(bf0_root)
    assert len(trees) == 1
    tree = trees[0]
    pragmas = tree.root.pragmas
    assert len(pragmas) == 1
    assert 'omp for' in pragmas[0].value

    # Also, with omp parallelism enabled, the step increment must be != 0
    # to avoid omp segfaults at scheduling time (only certain omp implementations,
    # including Intel's)
    conditionals = FindNodes(Conditional).visit(bf0_root)
    assert len(conditionals) == 1
    conds = conditionals[0].condition.args
    expected_guarded = tree[:2 + blockinner]
    assert len(conds) == len(expected_guarded)
    assert all(cond.lhs == it.step for cond, it in zip(conds, expected_guarded))
def test_subsampling(self):
    """
    Save a subsampled copy of `f` through a ConditionalDimension, then check
    both the numerical result and the number of generated halo exchanges.
    """
    grid = Grid(shape=(40, ))
    x = grid.dimensions[0]
    t = grid.stepping_dim
    time = grid.time_dim

    nt = 9

    f = TimeFunction(name='f', grid=grid)
    f.data_with_halo[:] = 1.

    # Setup subsampled function
    factor = 4
    nsamples = (nt + factor - 1) // factor
    last = nsamples - 1
    times = ConditionalDimension('t_sub', parent=time, factor=factor)
    fsave = TimeFunction(name='fsave', grid=grid, save=nsamples, time_dim=times)

    op = Operator([Eq(f.forward, f[t, x - 1] + f[t, x + 1]),
                   Eq(fsave, f)])
    op.apply(time=nt - 1)

    assert np.all(f.data_ro_domain[0] == fsave.data_ro_domain[last])
    glb_pos_map = f.grid.distributor.glb_pos_map
    if LEFT in glb_pos_map[x]:
        assert np.all(fsave.data_ro_domain[last, nt - 1:] == 256.)
    else:
        assert np.all(fsave.data_ro_domain[last, :-(nt - 1)] == 256.)

    # Also check there are no redundant halo exchanges
    assert len(FindNodes(Call).visit(op)) == 1

    # In particular, there is no need for a halo exchange within the conditional
    conditionals = FindNodes(Conditional).visit(op)
    assert len(conditionals) == 1
    assert len(FindNodes(Call).visit(conditionals[0])) == 0
def mpi_gpu_direct(iet, **kwargs):
    """
    Modify MPI Callables to enable multiple GPUs performing GPU-Direct
    communication.

    Each IsendCall/IrecvCall is wrapped in a Block whose header is an
    `omp target data use_device_ptr` pragma on the Call's first argument.
    Returns the transformed IET along with empty metadata.
    """
    def wrap(call):
        pragma = c.Pragma('omp target data use_device_ptr(%s)' %
                          call.arguments[0].name)
        return Block(header=pragma, body=call)

    comm_calls = FindNodes((IsendCall, IrecvCall)).visit(iet)
    mapper = {call: wrap(call) for call in comm_calls}

    return Transformer(mapper).visit(iet), {}
def _dist_parallelize(self, iet):
    """
    Add MPI routines performing halo exchanges to emit distributed-memory
    parallel code.

    Returns the transformed IET and a metadata dict carrying the required
    `includes` and the generated `call_trees`.
    """
    # Build send/recv Callables and Calls
    builder = HaloExchangeBuilder(self.params['mpi'])
    halo_spots = FindNodes(HaloSpot).visit(iet)
    call_trees, calls = builder.make(halo_spots)

    # Transform the IET by adding in the `haloupdate` Calls
    transformed = Transformer(calls, nested=True).visit(iet)

    metadata = {'includes': ['mpi.h'], 'call_trees': call_trees}
    return transformed, metadata
def instrument(self, iet):
    """
    Wrap every Section in ``iet`` between a call to `self._api_resume` and a
    call to `self._api_pause`, then return the transformed IET.
    """
    # Transform the Iteration/Expression tree introducing Advisor calls that
    # resume and stop data collection
    mapper = {}
    for section in FindNodes(Section).visit(iet):
        resume = Call(self._api_resume)
        pause = Call(self._api_pause)
        mapper[section] = List(body=[resume, section, pause])

    return Transformer(mapper).visit(iet)
def test_no_fusion_convoluted(self):
    """
    Conceptually like `test_no_fusion_simple`, but with more expressions and
    non-trivial data flow.
    """
    grid = Grid(shape=(4, 4, 4))
    time = grid.time_dim

    f = TimeFunction(name='f', grid=grid)
    g = Function(name='g', grid=grid)
    h = Function(name='h', grid=grid)

    ctime = ConditionalDimension(name='ctime', parent=time, condition=time > 4)

    op = Operator([
        Eq(f.forward, f + 1),
        Eq(h, f + 1),
        Eq(g, f + 1, implicit_dims=[ctime]),
        Eq(f.forward, f + 1, implicit_dims=[ctime]),
        Eq(f.forward, f + 1),
        Eq(g, f + 1),
    ])

    def expressions(name):
        return FindNodes(Expression).visit(op._func_table[name].root)

    bf0 = expressions('bf0')
    assert len(bf0) == 3
    assert all(e.expr.rhs is bf0[0].output for e in bf0[1:])

    bf1 = expressions('bf1')
    assert len(bf1) == 3

    bf2 = expressions('bf2')
    assert len(bf2) == 3
    assert all(e.expr.rhs is bf2[0].output for e in bf2[1:])
def iet_insert_casts(iet, parameters):
    """
    Transform the input IET inserting the necessary type casts.
    The type casts are placed at the top of the IET.

    Parameters
    ----------
    iet : Node
        The input Iteration/Expression tree.
    parameters : tuple
        The symbols that might require casting.

    Returns
    -------
    List
        A List whose body is the ArrayCasts followed by ``iet``.
    """
    # Make the generated code less verbose: if a non-Array parameter does not
    # appear in any Expression, that is, if the parameter is merely propagated
    # down to another Call, then there's no need to cast it
    exprs = FindNodes(Expression).visit(iet)
    calls = FindNodes(Call).visit(iet)

    # Functions appearing in any Expression
    expr_functions = set().union(*[e.functions for e in exprs])
    # Call arguments, except for nested Calls, integers and Byrefs
    call_args = [a for a in set().union(*[call.arguments for call in calls])
                 if not isinstance(a, (Call, int, Byref))]

    need_cast = {fn for fn in expr_functions if fn.is_Tensor}
    need_cast.update({a for a in call_args if a.is_Tensor})
    need_cast.update({a.function for a in call_args if a.is_ArrayAccess})
    need_cast.update({p for p in parameters if p.is_Array})

    # Casts are emitted in the order given by `parameters`
    casts = [ArrayCast(p) for p in parameters if p in need_cast]

    return List(body=casts + [iet])
def test_subdimmiddle_notparallel(self):
    """
    Tests application of an Operator consisting of a subdimension
    defined over different sub-regions, explicitly created through the
    use of :class:`SubDimension`s.

    Different from ``test_subdimmiddle_parallel`` because an interior
    dimension cannot be evaluated in parallel.
    """
    grid = Grid(shape=(20, 20))
    x, y = grid.dimensions
    t = grid.stepping_dim
    thickness = 4

    u = TimeFunction(name='u', save=None, grid=grid, space_order=0, time_order=1)

    middle = dict(thickness_left=thickness, thickness_right=thickness)
    xi = SubDimension.middle(name='xi', parent=x, **middle)
    yi = SubDimension.middle(name='yi', parent=y, **middle)

    # flow dependencies in x and y which should force serial execution
    # in reverse direction
    centre = Eq(u[t + 1, xi, yi], u[t, xi, yi] + u[t + 1, xi + 1, yi + 1])
    u.data[0, 10, 10] = 1.0

    op = Operator([centre])

    iterations = FindNodes(Iteration).visit(op)
    x_loops = [i for i in iterations if i.dim == xi]
    y_loops = [i for i in iterations if i.dim == yi]
    assert all(i.is_Affine and i.is_Sequential for i in x_loops)
    assert all(i.is_Affine and i.is_Parallel for i in y_loops)

    op.apply(time_m=0, time_M=0)

    # The initial value must have been propagated along the diagonal
    for k in range(4, 11):
        assert u.data[1, k, k] == 1.0
        u.data[1, k, k] = 0.0

    # Other than those, it should all be 0
    assert np.all(u.data[1, :] == 0)
def instrument(self, iet, timer):
    """
    Surround the first time loop found in ``iet`` with statements calling
    `self._api_resume` (before) and `self._api_pause` (after).

    The IET is returned intact if no time loop is found. ``timer`` is unused.
    """
    # Look for the presence of a time loop within the IET of the Operator
    time_loops = (i for i in FindNodes(Iteration).visit(iet) if i.dim.is_Time)
    time_loop = next(time_loops, None)

    # Return the IET intact if no time loop is found
    if time_loop is None:
        return iet

    # The calls to Advisor's Collection Control API are only for Operators
    # with a time loop
    wrapped = List(header=c.Statement('%s()' % self._api_resume),
                   body=time_loop,
                   footer=c.Statement('%s()' % self._api_pause))
    return Transformer({time_loop: wrapped}).visit(iet)
def instrument(self, iet, timer):
    """
    Instrument the given IET for C-level performance profiling.

    The body of each Section is replaced with a TimedList bound to ``timer``
    and labelled with the Section name, which must be one of `timer.fields`.
    The IET is returned as is if it contains no Sections.
    """
    sections = FindNodes(Section).visit(iet)
    if not sections:
        return iet

    mapper = {}
    for section in sections:
        name = section.name
        assert name in timer.fields
        timed_body = TimedList(timer=timer, lname=name, body=section.body)
        mapper[section] = section._rebuild(body=timed_body)

    return Transformer(mapper, nested=True).visit(iet)
def test_consistency_anti_dependences(self, exprs, axis, expected, visit,
                                      ti0, ti1, ti3, tu, tv, tw):
    """
    Test that anti dependences end up generating multi loop nests,
    rather than a single loop nest enclosing all of the equations.
    """
    bases = [fn.base for fn in (ti0, ti1, ti3, tu, tv, tw)]
    eq1, eq2, eq3 = EVAL(exprs, *bases)
    op = Operator([eq1, eq2, eq3], dse='noop', dle='noop', time_axis=axis)

    # One string of Dimension names per iteration tree
    trees = retrieve_iteration_tree(op)
    assert len(trees) == len(expected)
    tree_names = ["".join(i.dim.name for i in tree) for tree in trees]
    assert tree_names == expected

    # Dimension names of all Iterations, in visit order
    iters = FindNodes(Iteration).visit(op)
    visited = "".join(i.dim.name for i in iters)
    assert visited == visit
def test_gpu_direct(self):
    """
    With the `gpu-direct` option, every Block whose first child is an
    IrecvCall or IsendCall must carry the expected `use_device_ptr` pragma.
    """
    grid = Grid(shape=(3, 3, 3))

    u = TimeFunction(name='u', grid=grid)

    op = Operator(Eq(u.forward, u.dx+1),
                  opt=('advanced', {'gpu-direct': True}),
                  language='openmp')

    for efunc in op._func_table.values():
        for block in FindNodes(Block).visit(efunc.root):
            first = block.children[0][0]
            if type(first) not in (IrecvCall, IsendCall):
                continue
            expected = 'omp target data use_device_ptr(%s)' % first.arguments[0].name
            assert block.header[0].value == expected
def _generate_mpi(self, iet, **kwargs):
    """
    Generate the C-level halo update routines for the HaloSpots in ``iet``
    and prepend the corresponding Calls to each HaloSpot body.

    This extends `self._globals`, `self._includes` and `self._func_table`.
    The IET is returned unchanged if MPI is disabled in the configuration.
    """
    if configuration['mpi'] is False:
        return iet

    halo_spots = FindNodes(HaloSpot).visit(iet)

    # For each MPI-distributed TensorFunction, generate all necessary
    # C-level routines to perform a halo update
    callables = OrderedDict()
    for hs in halo_spots:
        for f, v in hs.fmapper.items():
            callables[f] = [update_halo(f, v.loc_indices),
                            sendrecv(f, v.loc_indices),
                            copy(f, v.loc_indices),
                            copy(f, v.loc_indices, True)]
    callables = flatten(callables.values())

    # Replace HaloSpots with suitable calls performing the halo update
    halo_calls = {}
    for hs in halo_spots:
        for f, v in hs.fmapper.items():
            stencil = [int(i) for i in hs.mask[f].values()]
            distributor = f.grid.distributor
            comm = distributor._C_comm
            nb = distributor._C_neighbours.obj
            loc_indices = list(v.loc_indices.values())
            dsizes = [d.symbolic_size for d in f.dimensions]
            parameters = [f] + stencil + [comm, nb] + loc_indices + dsizes
            halo_calls.setdefault(hs, []).append(
                Call('halo_exchange_%s' % f.name, parameters)
            )

    # Sorting is for deterministic code generation. However, in practice,
    # we don't expect `cstructs` to contain more than one element because
    # there should always be one grid per Operator (though we're not really
    # enforcing it)
    functions = flatten(hs.fmapper for hs in halo_spots)
    cstructs = {f.grid.distributor._C_neighbours.cdef for f in functions}
    self._globals.extend(sorted(cstructs, key=lambda i: i.tpname))

    self._includes.append('mpi.h')

    self._func_table.update(
        OrderedDict([(i.name, MetaCall(i, True)) for i in callables]))

    # Add in the halo update calls
    mapper = {hs: List(body=calls + list(hs.body))
              for hs, calls in halo_calls.items()}
    return Transformer(mapper, nested=True).visit(iet)
def test_subdimmiddle_parallel(self):
    """
    Tests application of an Operator consisting of a subdimension
    defined over different sub-regions, explicitly created through the
    use of :class:`SubDimension`s.
    """
    grid = Grid(shape=(20, 20))
    x, y = grid.dimensions
    t = grid.stepping_dim
    thickness = 4

    u = TimeFunction(name='u', save=None, grid=grid, space_order=0, time_order=1)

    middle = dict(thickness_left=thickness, thickness_right=thickness)
    xi = SubDimension.middle(name='xi', parent=x, **middle)
    yi = SubDimension.middle(name='yi', parent=y, **middle)

    # a 5 point stencil that can be computed in parallel
    stencil = (u[t, xi, yi] + u[t, xi - 1, yi] + u[t, xi + 1, yi] +
               u[t, xi, yi - 1] + u[t, xi, yi + 1])
    centre = Eq(u[t + 1, xi, yi], stencil)

    u.data[0, 10, 10] = 1.0

    op = Operator([centre])

    iterations = FindNodes(Iteration).visit(op)
    sub_loops = [i for i in iterations if i.dim in [xi, yi]]
    assert all(i.is_Affine and i.is_Parallel for i in sub_loops)

    op.apply(time_m=0, time_M=0)

    assert np.all(u.data[1, 9:12, 10] == 1.0)
    assert np.all(u.data[1, 10, 9:12] == 1.0)

    # Other than those, it should all be 0
    u.data[1, 9:12, 10] = 0.0
    u.data[1, 10, 9:12] = 0.0
    assert np.all(u.data[1, :] == 0)
def make_grid_accesses(node, yk_grid_objs):
    """
    Construct a new Iteration/Expression based on ``node``, in which all
    :class:`types.Indexed` accesses have been converted into YASK grid
    accesses.

    ``yk_grid_objs`` is indexed by function name to obtain the object on
    which each grid call is made.
    """

    def make_grid_gets(expr):
        # Recursively replace each Indexed backed by YASK with a grid `get`
        subs = {
            acc: make_sharedptr_funcall(
                namespace['code-grid-get'],
                [ListInitializer([INT(make_grid_gets(idx)) for idx in acc.indices])],
                yk_grid_objs[acc.base.function.name]
            )
            for acc in retrieve_indexed(expr)
            if acc.base.function.from_YASK
        }
        return expr.xreplace(subs)

    mapper = {}
    for e in FindNodes(Expression).visit(node):
        if e.is_ForeignExpression:
            continue

        lhs, rhs = e.expr.args

        # RHS translation
        rhs = make_grid_gets(rhs)

        if not e.write.from_YASK:
            # Writing to a scalar temporary
            mapper[e] = e._rebuild(expr=e.expr.func(lhs, rhs))
            continue

        # LHS translation
        indices = ListInitializer([INT(make_grid_gets(idx)) for idx in lhs.indices])
        call = namespace['code-grid-add' if e.is_Increment else 'code-grid-put']
        handle = make_sharedptr_funcall(call, [rhs, indices],
                                        yk_grid_objs[e.write.name])
        mapper[e] = ForeignExpression(handle, e.dtype, is_Increment=e.is_Increment)

    return Transformer(mapper).visit(node)
def test_multiple_steppers(self, expr, exp_uindices, exp_mods):
    """
    Tests generation of multiple, mixed time stepping indices.

    ``expr`` is a string evaluated in the local scope (hence the otherwise
    unused `u`, `v`, `x`, `y`, `z`); ``exp_uindices`` are the expected
    uindices signatures of the time Iteration; ``exp_mods`` maps function
    names to the expected modulo of their time index.
    """
    grid = Grid(shape=(3, 3, 3))
    x, y, z = grid.dimensions

    u = TimeFunction(name='u', grid=grid)  # noqa
    v = TimeFunction(name='v', grid=grid, time_order=4)  # noqa

    op = Operator(eval(expr), dle='noop')

    iters = FindNodes(Iteration).visit(op)
    time_iter = [i for i in iters if i.dim.is_Time]
    assert len(time_iter) == 1
    time_iter = time_iter[0]

    # Check uindices in Iteration header
    signatures = [i._properties for i in time_iter.uindices]
    assert len(signatures) == len(exp_uindices)
    assert all(i in signatures for i in exp_uindices)

    # Check uindices within each TimeFunction
    # NOTE: this used to be `assert(<generator expression>)`, which is always
    # truthy and therefore never checked anything; `all` makes it effective
    exprs = [i.expr for i in FindNodes(Expression).visit(op)]
    indexeds = flatten(retrieve_indexed(i) for i in exprs)
    assert all(i.indices[i.function._time_position].modulo ==
               exp_mods[i.function.name] for i in indexeds)
def test_issue_1592(self):
    """
    Build and run Operators involving a derivative of a subsampled
    TimeFunction, then check the time index of the last generated write.
    """
    grid = Grid(shape=(11, 11))
    time = grid.time_dim
    time_sub = ConditionalDimension('t_sub', parent=time, factor=2)

    v = TimeFunction(name="v", grid=grid, space_order=4, time_dim=time_sub,
                     save=5)
    w = Function(name="w", grid=grid, space_order=4)

    # Build and immediately apply
    Operator(Eq(w, v.dx))(time=6)

    op = Operator(Eq(v.forward, v.dx))
    op.apply(time=6)

    last_write = FindNodes(Expression).visit(op)[-1].expr.lhs
    assert last_write.indices[0] == IntDiv(time, 2) + 1
def test_stencil_nowrite_implies_haloupdate_anyway(self):
    """
    A stencil read of `f` that is written into `g` must produce exactly one
    Call, even though `f` is never written.
    """
    grid = Grid(shape=(12, ))
    x = grid.dimensions[0]
    t = grid.stepping_dim

    f = TimeFunction(name='f', grid=grid)
    g = Function(name='g', grid=grid)

    # It does a halo update, even though there's no data dependence,
    # because when the halo updates are placed, the compiler conservatively
    # assumes there might have been another equation writing to `f` before.
    stencil = f[t, x - 1] + f[t, x + 1] + 1.
    op = Operator(Eq(g, stencil))

    assert len(FindNodes(Call).visit(op)) == 1
def find_affine_trees(iet):
    """
    Find affine trees. A tree is affine when all of the array accesses are
    constant/affine functions of the Iteration variables and the Iteration
    bounds are fixed (but possibly symbolic).

    Only Sections nested within a time Iteration are searched.

    Parameters
    ----------
    iet : `Node`
        The searched tree

    Returns
    -------
    OrderedDict
        A mapper from each Section to the list of affine trees found in it.
    """
    affine = OrderedDict()
    time_loops = [i for i in FindNodes(Iteration).visit(iet) if i.dim.is_Time]
    for time_loop in time_loops:
        for section in FindNodes(Section).visit(time_loop):
            for tree in retrieve_iteration_tree(section):
                if any(not i.is_Affine for i in tree):
                    # Non-affine array accesses not supported
                    break

                exprs = [e.expr for e in FindNodes(Expression).visit(tree.root)]
                grid = ReducerMap([('', e.grid) for e in exprs if e.grid]).unique('')
                writeto_dimensions = tuple(i.dim.root for i in tree)

                # Stop searching this Section at the first tree which does
                # not span exactly the Grid dimensions
                if grid.dimensions == writeto_dimensions:
                    affine.setdefault(section, []).append(tree)
                else:
                    break
    return affine
def apply(self, func, **kwargs):
    """
    Apply ``func`` to all nodes in the Graph. This changes the state
    of the Graph.

    ``func`` is called as ``func(efunc, **kwargs)`` and must return a pair
    ``(new_efunc, metadata)``. The following `metadata` keys are read:
    'dimensions', 'includes', 'headers', 'ffuncs', 'efuncs' and 'args'.
    The efuncs are visited in the topological order of the call graph;
    afterwards, ``func`` is also applied to each external function, whose
    metadata is discarded.
    """
    dag = self._create_call_graph()

    # Apply `func`
    for i in dag.topological_sort():
        self.efuncs[i], metadata = func(self.efuncs[i], **kwargs)

        # Track any new Dimensions introduced by `func`
        self.dimensions.extend(list(metadata.get('dimensions', [])))

        # Track any new #include and #define required by `func`
        self.includes.extend(list(metadata.get('includes', [])))
        self.includes = filter_ordered(self.includes)
        self.headers.extend(list(metadata.get('headers', [])))
        self.headers = filter_ordered(self.headers)

        # Track any new external function
        self.ffuncs.extend(list(metadata.get('ffuncs', [])))
        self.ffuncs = filter_ordered(self.ffuncs)

        # Track any new ElementalFunctions
        self.efuncs.update(OrderedDict([(i.name, i)
                                        for i in metadata.get('efuncs', [])]))

        # If there's a change to the `args` and the `iet` is an efunc, then
        # we must update the call sites as well, as the arguments dropped down
        # to the efunc have just increased
        args = as_tuple(metadata.get('args'))
        if args:
            # `extif` avoids redundant updates to the parameters list, due
            # to multiple children wanting to add the same input argument
            extif = lambda v: list(v) + [e for e in args if e not in v]
            # The efunc just processed plus everything `all_downstreams`
            # returns for it
            stack = [i] + dag.all_downstreams(i)
            for n in stack:
                efunc = self.efuncs[n]
                # Only the Calls targeting an efunc in `stack` get the new args
                calls = [c for c in FindNodes(Call).visit(efunc) if c.name in stack]
                mapper = {c: c._rebuild(arguments=extif(c.arguments)) for c in calls}
                efunc = Transformer(mapper).visit(efunc)
                if efunc.is_Callable:
                    efunc = efunc._rebuild(parameters=extif(efunc.parameters))
                self.efuncs[n] = efunc

    # Apply `func` to the external functions
    for i in range(len(self.ffuncs)):
        self.ffuncs[i], _ = func(self.ffuncs[i], **kwargs)