def exprs(dims):
    a = Array(name='a', shape=(3,), dimensions=(dims["i"],)).indexify()
    b = Array(name='b', shape=(3,), dimensions=(dims["i"],)).indexify()
    return [Expression(DummyEq(a, a + b + 5.)),
            Expression(DummyEq(a, b - a)),
            Expression(DummyEq(a, 4 * (b * a))),
            Expression(DummyEq(a, (6. / b) + (8. * a)))]
def test_cse(exprs, expected):
    """Test common subexpressions elimination."""
    grid = Grid((3, 3, 3))
    dims = grid.dimensions

    tu = TimeFunction(name="tu", grid=grid, space_order=2)  # noqa
    tv = TimeFunction(name="tv", grid=grid, space_order=2)  # noqa
    tw = TimeFunction(name="tw", grid=grid, space_order=2)  # noqa
    tz = TimeFunction(name="tz", grid=grid, space_order=2)  # noqa
    ti0 = Array(name='ti0', shape=(3, 5, 7), dimensions=dims).indexify()  # noqa
    ti1 = Array(name='ti1', shape=(3, 5, 7), dimensions=dims).indexify()  # noqa
    t0 = Scalar(name='t0')  # noqa
    t1 = Scalar(name='t1')  # noqa
    t2 = Scalar(name='t2')  # noqa

    # List comprehension would need explicit locals/globals mappings to eval
    for i, e in enumerate(list(exprs)):
        exprs[i] = DummyEq(indexify(eval(e).evaluate))

    counter = generator()
    make = lambda: Scalar(name='r%d' % counter()).indexify()
    processed = _cse(exprs, make)

    assert len(processed) == len(expected)
    assert all(str(i.rhs) == j for i, j in zip(processed, expected))
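# A minimal, self-contained sketch of the transformation exercised above:
# common subexpression elimination hoists repeated subtrees into temporaries.
# This uses plain `sympy.cse` rather than Devito's internal `_cse`, so the
# temporary names and output shape differ; it is an illustration of the idea,
# not the code path under test.
import sympy

x, y = sympy.symbols('x y')
candidates = [(x + y)**2, (x + y)**2 + sympy.sin(x + y)]
temps, reduced = sympy.cse(candidates)
# `temps` holds definitions such as (x0, x + y); `reduced` holds the input
# expressions rewritten against those temporaries
print(temps, reduced)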
def _make_sendrecv(self, f, hse, key, **kwargs):
    comm = f.grid.distributor._obj_comm

    buf_dims = [Dimension(name='buf_%s' % d.root) for d in f.dimensions
                if d not in hse.loc_indices]
    bufg = Array(name='bufg', dimensions=buf_dims, dtype=f.dtype,
                 padding=0, scope='heap')
    bufs = Array(name='bufs', dimensions=buf_dims, dtype=f.dtype,
                 padding=0, scope='heap')

    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]
    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    gather = Call('gather_%s' % key, [bufg] + list(bufg.shape) + [f] + ofsg)
    scatter = Call('scatter_%s' % key, [bufs] + list(bufs.shape) + [f] + ofss)

    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather)

    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    count = reduce(mul, bufs.shape, 1)
    rrecv = MPIRequestObject(name='rrecv')
    rsend = MPIRequestObject(name='rsend')
    recv = Call('MPI_Irecv', [bufs, count, Macro(dtype_to_mpitype(f.dtype)),
                              fromrank, Integer(13), comm, rrecv])
    send = Call('MPI_Isend', [bufg, count, Macro(dtype_to_mpitype(f.dtype)),
                              torank, Integer(13), comm, rsend])

    waitrecv = Call('MPI_Wait', [rrecv, Macro('MPI_STATUS_IGNORE')])
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    iet = List(body=[recv, gather, send, waitsend, waitrecv, scatter])
    parameters = ([f] + list(bufs.shape) + ofsg + ofss + [fromrank, torank, comm])
    return Callable('sendrecv_%s' % key, iet, 'void', parameters, ('static',))
def sendrecv(f, fixed):
    """Construct an IET performing a halo exchange along arbitrary
    dimension and side."""
    assert f.is_Function
    assert f.grid is not None

    comm = f.grid.distributor._C_comm

    buf_dims = [Dimension(name='buf_%s' % d.root) for d in f.dimensions
                if d not in fixed]
    bufg = Array(name='bufg', dimensions=buf_dims, dtype=f.dtype, scope='heap')
    bufs = Array(name='bufs', dimensions=buf_dims, dtype=f.dtype, scope='heap')

    dat_dims = [Dimension(name='dat_%s' % d.root) for d in f.dimensions]
    dat = Array(name='dat', dimensions=dat_dims, dtype=f.dtype, scope='external')

    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]
    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    parameters = [bufg] + list(bufg.shape) + [dat] + list(dat.shape) + ofsg
    gather = Call('gather_%s' % f.name, parameters)
    parameters = [bufs] + list(bufs.shape) + [dat] + list(dat.shape) + ofss
    scatter = Call('scatter_%s' % f.name, parameters)

    # The scatter must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    srecv = MPIStatusObject(name='srecv')
    rrecv = MPIRequestObject(name='rrecv')
    rsend = MPIRequestObject(name='rsend')

    count = reduce(mul, bufs.shape, 1)
    recv = Call('MPI_Irecv', [bufs, count, Macro(numpy_to_mpitypes(f.dtype)),
                              fromrank, '13', comm, rrecv])
    send = Call('MPI_Isend', [bufg, count, Macro(numpy_to_mpitypes(f.dtype)),
                              torank, '13', comm, rsend])

    waitrecv = Call('MPI_Wait', [rrecv, srecv])
    waitsend = Call('MPI_Wait', [rsend, Macro('MPI_STATUS_IGNORE')])

    iet = List(body=[recv, gather, send, waitsend, waitrecv, scatter])
    iet = List(body=[ArrayCast(dat), iet_insert_C_decls(iet)])
    parameters = ([dat] + list(dat.shape) + list(bufs.shape) + ofsg + ofss +
                  [fromrank, torank, comm])
    return Callable('sendrecv_%s' % f.name, iet, 'void', parameters, ('static',))
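# A runnable sketch of the communication pattern the Callables above generate,
# written with mpi4py instead of generated C (an illustrative analogue, not
# Devito code): post the receive first, gather the owned boundary points into
# a contiguous send buffer, fire the send, wait, then scatter into the halo.
# MPI.PROC_NULL makes the guarded gather/scatter trivially skippable at the
# physical domain boundary, exactly as the CondNe guards do above.
import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()
# Neighbours along a 1D decomposition; PROC_NULL at the physical boundary
fromrank = rank - 1 if rank > 0 else MPI.PROC_NULL
torank = rank + 1 if rank < comm.Get_size() - 1 else MPI.PROC_NULL

dat = np.arange(10, dtype=np.float32) + 100*rank  # local data, 1-point halo per side
bufs = np.empty(1, dtype=np.float32)              # receive buffer
bufg = np.empty(1, dtype=np.float32)              # send buffer

rrecv = comm.Irecv(bufs, source=fromrank, tag=13)
if torank != MPI.PROC_NULL:
    bufg[0] = dat[-2]        # "gather": copy the owned boundary point
rsend = comm.Isend(bufg, dest=torank, tag=13)
rsend.Wait()
rrecv.Wait()
if fromrank != MPI.PROC_NULL:
    dat[0] = bufs[0]         # "scatter": write the received value into the halo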
def test_array():
    grid = Grid(shape=(3, 3))
    d = Dimension(name='d')

    a = Array(name='a', dimensions=grid.dimensions, dtype=np.int32,
              halo=((1, 1), (2, 2)), padding=((2, 2), (2, 2)),
              space='remote', scope='stack')

    pkl_a = pickle.dumps(a)
    new_a = pickle.loads(pkl_a)
    assert new_a.name == a.name
    assert new_a.dtype is np.int32
    assert new_a.dimensions[0].name == 'x'
    assert new_a.dimensions[1].name == 'y'
    assert new_a.halo == ((1, 1), (2, 2))
    assert new_a.padding == ((2, 2), (2, 2))
    assert new_a.space == 'remote'
    assert new_a.scope == 'stack'

    # Now with a pointer array
    pa = PointerArray(name='pa', dimensions=d, array=a)

    pkl_pa = pickle.dumps(pa)
    new_pa = pickle.loads(pkl_pa)
    assert new_pa.name == pa.name
    assert new_pa.dim.name == 'd'
    assert new_pa.array.name == 'a'
def test_strides_forwarding1():
    grid = Grid(shape=(4, 4))

    a = Array(name='a', dimensions=grid.dimensions, shape=grid.shape)

    bar = Callable('bar', DummyExpr(a[0, 0], 0), 'void', parameters=[a.indexed])
    call = Call(bar.name, [a.indexed])
    foo = Callable('foo', call, 'void', parameters=[a])

    # Emulate what the compiler would do
    graph = Graph(foo)
    graph.efuncs['bar'] = bar

    linearize(graph, mode=True, sregistry=SymbolRegistry())

    # Despite `a` being passed via `a.indexed`, and since it's an Array (which
    # has a symbolic shape), we expect the stride exprs to be placed in `bar`,
    # and in `bar` only, as `foo` doesn't really use `a`, it just propagates it
    # down to `bar`
    foo = graph.root
    bar = graph.efuncs['bar']

    assert len(foo.body.body) == 1
    assert foo.body.body[0].is_Call

    assert len(bar.body.body) == 5
    assert bar.body.body[0].write.name == 'y_fsz0'
    assert bar.body.body[2].write.name == 'y_stride0'
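# A sketch of what "linearization" means in the test above (plain Python,
# illustrative; `linearized_read` is a hypothetical helper, not Devito API):
# a multi-dimensional access a[i, j] is lowered to a flat access through a
# precomputed stride, so a callee only needs strides, not the full shape.
import numpy as np

def linearized_read(a_flat, i, j, y_stride0):
    # a[i, j] -> a_flat[i*y_stride0 + j]; `y_stride0` plays the role of the
    # stride derived from the symbolic innermost size (`y_fsz0` in the test)
    return a_flat[i * y_stride0 + j]

a = np.arange(16).reshape(4, 4)
assert linearized_read(a.ravel(), 2, 3, y_stride0=4) == a[2, 3]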
def _make_copy(self, f, hse, key, swap=False):
    buf_dims = []
    buf_indices = []
    for d in f.dimensions:
        if d not in hse.loc_indices:
            buf_dims.append(Dimension(name='buf_%s' % d.root))
            buf_indices.append(d.root)
    buf = Array(name='buf', dimensions=buf_dims, dtype=f.dtype, padding=0)

    f_offsets = []
    f_indices = []
    for d in f.dimensions:
        offset = Symbol(name='o%s' % d.root)
        f_offsets.append(offset)
        f_indices.append(offset + (d.root if d not in hse.loc_indices else 0))

    if swap is False:
        eq = DummyEq(buf[buf_indices], f[f_indices])
        name = 'gather_%s' % key
    else:
        eq = DummyEq(f[f_indices], buf[buf_indices])
        name = 'scatter_%s' % key

    iet = Expression(eq)
    for i, d in reversed(list(zip(buf_indices, buf_dims))):
        # The -1 below is because an Iteration, by default, generates <=
        iet = Iteration(iet, i, d.symbolic_size - 1, properties=(PARALLEL, AFFINE))

    parameters = [buf] + list(buf.shape) + [f] + f_offsets
    return Callable(name, iet, 'void', parameters, ('static',))
def test_yreplace_time_invariants(exprs, expected):
    grid = Grid((3, 3, 3))
    dims = grid.dimensions
    tu = TimeFunction(name="tu", grid=grid, space_order=4).indexify()
    tv = TimeFunction(name="tv", grid=grid, space_order=4).indexify()
    tw = TimeFunction(name="tw", grid=grid, space_order=4).indexify()
    ti0 = Array(name='ti0', shape=(3, 5, 7), dimensions=dims).indexify()
    ti1 = Array(name='ti1', shape=(3, 5, 7), dimensions=dims).indexify()
    t0 = Scalar(name='t0').indexify()
    t1 = Scalar(name='t1').indexify()
    exprs = EVAL(exprs, tu, tv, tw, ti0, ti1, t0, t1)

    counter = generator()
    make = lambda: Scalar(name='r%d' % counter()).indexify()
    processed, found = yreplace(exprs, make,
                                make_is_time_invariant(exprs),
                                lambda i: estimate_cost(i) > 0)

    assert len(found) == len(expected)
    assert all(str(i.rhs) == j for i, j in zip(found, expected))
def _padding(self, nodes, state):
    """
    Introduce temporary buffers padded to the nearest multiple of the vector
    length, to maximize data alignment. At the bottom of the kernel, the
    values in the padded temporaries will be copied back into the input arrays.
    """
    mapper = OrderedDict()

    # Assess feasibility of the transformation
    handle = FindSymbols('symbolics-writes').visit(nodes)
    if not handle:
        return nodes, {}

    shape = max([i.shape for i in handle], key=len)
    if not shape:
        return nodes, {}

    candidates = [i for i in handle if i.shape[-1] == shape[-1]]
    if not candidates:
        return nodes, {}

    # Retrieve the maximum number of items in a SIMD register when processing
    # the expressions in /node/
    exprs = FindNodes(Expression).visit(nodes)
    exprs = [e for e in exprs if e.write in candidates]
    assert len(exprs) > 0
    dtype = exprs[0].dtype
    assert all(e.dtype == dtype for e in exprs)
    try:
        simd_items = get_simd_items(dtype)
    except KeyError:
        # Fallback to 16 (maximum expectable padding, for AVX512 registers)
        simd_items = simdinfo['avx512f'] // np.dtype(dtype).itemsize

    shapes = {k: k.shape[:-1] + (roundm(k.shape[-1], simd_items),)
              for k in candidates}
    mapper.update(OrderedDict([(k.indexed,
                                Array(name='p%s' % k.name,
                                      shape=shapes[k],
                                      dimensions=k.indices,
                                      onstack=k._mem_stack).indexed)
                               for k in candidates]))

    # Substitute original arrays with padded buffers
    processed = SubstituteExpression(mapper).visit(nodes)

    # Build Iteration trees for initialization and copy-back of padded arrays
    mapper = OrderedDict([(k, v) for k, v in mapper.items()
                          if k.function.is_SymbolicFunction])
    init = copy_arrays(mapper, reverse=True)
    copyback = copy_arrays(mapper)

    processed = List(body=init + as_tuple(processed) + copyback)

    return processed, {}
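# A self-contained sketch of the padding arithmetic used above (plain Python,
# no Devito; `roundm_sketch` is a hypothetical stand-in for `roundm`): the
# innermost extent is rounded up to the next multiple of the SIMD vector
# length, e.g. 13 float32 items with a 16-wide register become 16.
import numpy as np

def roundm_sketch(n, k):
    # Smallest multiple of k that is >= n
    return ((n + k - 1) // k) * k

simd_bytes = 64                          # e.g. an AVX-512 register
itemsize = np.dtype(np.float32).itemsize
simd_items = simd_bytes // itemsize      # 16 float32 items per register
assert roundm_sketch(13, simd_items) == 16
assert roundm_sketch(32, simd_items) == 32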
def copy(f, fixed, swap=False):
    """
    Construct a :class:`Callable` capable of copying: ::

        * an arbitrary convex region of ``f`` into a contiguous :class:`Array`, OR
        * if ``swap=True``, a contiguous :class:`Array` into an arbitrary convex
          region of ``f``.
    """
    buf_dims = []
    buf_indices = []
    for d in f.dimensions:
        if d not in fixed:
            buf_dims.append(Dimension(name='buf_%s' % d.root))
            buf_indices.append(d.root)
    buf = Array(name='buf', dimensions=buf_dims, dtype=f.dtype)

    dat_dims = []
    dat_offsets = []
    dat_indices = []
    for d in f.dimensions:
        dat_dims.append(Dimension(name='dat_%s' % d.root))
        offset = Symbol(name='o%s' % d.root)
        dat_offsets.append(offset)
        dat_indices.append(offset + (d.root if d not in fixed else 0))
    dat = Array(name='dat', dimensions=dat_dims, dtype=f.dtype)

    if swap is False:
        eq = DummyEq(buf[buf_indices], dat[dat_indices])
        name = 'gather_%s' % f.name
    else:
        eq = DummyEq(dat[dat_indices], buf[buf_indices])
        name = 'scatter_%s' % f.name

    iet = Expression(eq)
    for i, d in reversed(list(zip(buf_indices, buf_dims))):
        iet = Iteration(iet, i, d.symbolic_size - 1)  # -1 as Iteration generates <=
    iet = List(body=[ArrayCast(dat), ArrayCast(buf), iet])

    parameters = [buf] + list(buf.shape) + [dat] + list(dat.shape) + dat_offsets
    return Callable(name, iet, 'void', parameters, ('static',))
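# A NumPy sketch of the gather/scatter pair built above (illustrative analogue,
# not generated code; `gather_sketch`/`scatter_sketch` are hypothetical names):
# a convex region of `dat`, selected by per-dimension offsets and the buffer
# shape, is copied to/from a contiguous buffer.
import numpy as np

def gather_sketch(dat, offsets, shape):
    # Copy dat[o0:o0+s0, o1:o1+s1, ...] into a contiguous buffer
    region = tuple(slice(o, o + s) for o, s in zip(offsets, shape))
    return np.ascontiguousarray(dat[region])

def scatter_sketch(dat, offsets, buf):
    # Write the buffer back into the same convex region
    region = tuple(slice(o, o + s) for o, s in zip(offsets, buf.shape))
    dat[region] = buf

src = np.arange(36.).reshape(6, 6)
buf = gather_sketch(src, (1, 2), (2, 3))
dst = np.zeros((6, 6))
scatter_sketch(dst, (1, 2), buf)   # the gathered region reappears in `dst`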
def new_ops_arg(self, indexed):
    """
    Create an :class:`Indexed` node using OPS representation.

    Parameters
    ----------
    indexed : :class:`Indexed`
        Indexed object using devito representation.

    Returns
    -------
    :class:`Indexed`
        Indexed node using OPS representation.
    """
    # Build the OPS arg identifier
    time_index = split_affine(indexed.indices[TimeFunction._time_position])
    ops_arg_id = '%s%s%s' % (indexed.name, time_index.var, time_index.shift)

    if ops_arg_id not in self.ops_args:
        # Create the indexed object
        ops_arg = Array(name=ops_arg_id,
                        dimensions=[Dimension(name=namespace['ops_acc'])],
                        dtype=indexed.dtype)
        self.ops_args[ops_arg_id] = ops_arg
    else:
        ops_arg = self.ops_args[ops_arg_id]

    # Get the space indices
    space_indices = [e for i, e in enumerate(indexed.indices)
                     if i != TimeFunction._time_position]

    # Define the Macro used in OPS arg index
    access_macro = Macro('OPS_ACC%d(%s)'
                         % (list(self.ops_args).index(ops_arg_id),
                            ','.join(str(split_affine(i).shift)
                                     for i in space_indices)))

    # Create Indexed object representing the OPS arg access
    new_indexed = Indexed(ops_arg.indexed, access_macro)

    return new_indexed
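# A sketch of the `split_affine` idea relied upon above (hypothetical helper
# written with plain SymPy, not the Devito function): an affine index such as
# `t + 1` is decomposed into its variable part and its constant shift.
import sympy
from collections import namedtuple

AffineSplit = namedtuple('AffineSplit', 'var shift')

def split_affine_sketch(expr):
    # Assumes `expr` is var + shift, with exactly one free symbol
    var, = expr.free_symbols
    return AffineSplit(var, expr - var)

t = sympy.Symbol('t')
assert split_affine_sketch(t + 1) == AffineSplit(t, 1)
assert split_affine_sketch(t - 2).shift == -2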
def _make_copy(self, f, fixed, swap=False):
    """
    Construct a Callable performing a copy of:

        * an arbitrary convex region of ``f`` into a contiguous Array, OR
        * if ``swap=True``, a contiguous Array into an arbitrary convex
          region of ``f``.
    """
    buf_dims = []
    buf_indices = []
    for d in f.dimensions:
        if d not in fixed:
            buf_dims.append(Dimension(name='buf_%s' % d.root))
            buf_indices.append(d.root)
    buf = Array(name='buf', dimensions=buf_dims, dtype=f.dtype)

    f_offsets = []
    f_indices = []
    for d in f.dimensions:
        offset = Symbol(name='o%s' % d.root)
        f_offsets.append(offset)
        f_indices.append(offset + (d.root if d not in fixed else 0))

    if swap is False:
        eq = DummyEq(buf[buf_indices], f[f_indices])
        name = 'gather%dd' % f.ndim
    else:
        eq = DummyEq(f[f_indices], buf[buf_indices])
        name = 'scatter%dd' % f.ndim

    iet = Expression(eq)
    for i, d in reversed(list(zip(buf_indices, buf_dims))):
        # The -1 below is because an Iteration, by default, generates <=
        iet = Iteration(iet, i, d.symbolic_size - 1, properties=PARALLEL)
    iet = List(body=[ArrayCast(f), ArrayCast(buf), iet])

    # Optimize the memory copy with the DLE
    from devito.dle import transform
    state = transform(iet, 'simd', {'openmp': self._threaded})

    parameters = [buf] + list(buf.shape) + [f] + f_offsets + state.input
    return Callable(name, state.nodes, 'void', parameters, ('static',)), state.input
def test_arrays_defined_over_subdims(self):
    """
    Check code generation when an Array uses a SubDimension.
    """
    grid = Grid(shape=(3,))
    x, = grid.dimensions
    xi, = grid.interior.dimensions

    f = Function(name='f', grid=grid)
    a = Array(name='a', dimensions=(xi,), dtype=grid.dtype)
    op = Operator([Eq(a[xi], 1), Eq(f, f + a[xi + 1], subdomain=grid.interior)],
                  openmp=False)

    assert len(op.parameters) == 6
    # neither `x_size` nor `xi_size` are expected here
    assert not any(i.name in ('x_size', 'xi_size') for i in op.parameters)

    # Try running it -- regardless of what it will produce, this should run
    # ie, this checks this error isn't raised:
    # "ValueError: No value found for parameter xi_size"
    op()
def promote_scalar_expressions(exprs, shape, indices, onstack):
    """
    Transform a collection of scalar expressions into tensor expressions.
    """
    processed = []

    # First promote the LHS
    mapper = {}
    for k, v in FlowGraph(exprs).items():
        if v.is_scalar:
            # Create a new function symbol
            data = Array(name=k.name, shape=shape,
                         dimensions=indices, onstack=onstack)
            indexed = Indexed(data.indexed, *indices)
            mapper[k] = indexed
            processed.append(Eq(indexed, v.rhs))
        else:
            processed.append(Eq(k, v.rhs))

    # Propagate the transformed LHS through the expressions
    processed = [Eq(n.lhs, n.rhs.xreplace(mapper)) for n in processed]

    return processed
def process(cluster, chosen, aliases, sregistry, platform):
    clusters = []
    subs = {}
    for alias, writeto, aliaseds, distances in aliases.iter(cluster.ispace):
        if all(i not in chosen for i in aliaseds):
            continue

        # The Dimensions defining the shape of Array
        # Note: with SubDimensions, we may have the following situation:
        #
        # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
        #   r[zi] = ...
        #
        # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll need
        # as much room as in `zi`'s parent to avoid going OOB
        # Aside from ugly generated code, the reason we do not rather shift the
        # indices is that it prevents future passes from transforming the loop
        # bounds (e.g., MPI's comp/comm overlap does that)
        dimensions = [d.parent if d.is_Sub else d for d in writeto.dimensions]

        # The halo of the Array
        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]

        # The data sharing mode of the Array
        sharing = 'local' if any(d.is_Incr for d in writeto.dimensions) else 'shared'

        # Finally create the temporary Array that will store `alias`
        array = Array(name=sregistry.make_name(), dimensions=dimensions, halo=halo,
                      dtype=cluster.dtype, sharing=sharing)

        # The access Dimensions may differ from `writeto.dimensions`. This may
        # happen e.g. if ShiftedDimensions are introduced (`a[x,y]` -> `a[xs,y]`)
        adims = [aliases.index_mapper.get(d, d) for d in writeto.dimensions]

        # The expression computing `alias`
        indices = [d - (0 if writeto[d].is_Null else writeto[d].lower) for d in adims]
        expression = Eq(array[indices], uxreplace(alias, subs))

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, distance in zip(aliaseds, distances):
            assert all(i.dim in distance.labels for i in writeto)

            indices = [d - i.lower + distance[i.dim] for d, i in zip(adims, writeto)]
            subs[aliased] = array[indices]

            if aliased in chosen:
                subs[chosen[aliased]] = array[indices]
            else:
                # Perhaps part of a composite alias ?
                pass

        # Construct the `alias` IterationSpace
        ispace = cluster.ispace.add(writeto).augment(aliases.index_mapper)

        # Optimization: if possible, the innermost IterationInterval is
        # rounded up to a multiple of the vector length
        try:
            it = ispace.itintervals[-1]
            if ROUNDABLE in cluster.properties[it.dim]:
                vl = platform.simd_items_per_reg(cluster.dtype)
                ispace = ispace.add(Interval(it.dim, 0, it.interval.size % vl))
        except (TypeError, KeyError):
            pass

        # Construct the `alias` DataSpace
        accesses = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals).relaxed
                 for k, v in accesses.items() if k}
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Finally, build a new Cluster for `alias`
        built = cluster.rebuild(exprs=expression, ispace=ispace, dspace=dspace)
        clusters.insert(0, built)

    return clusters, subs
def array(name, shape, dimensions, onstack=False):
    return Array(name=name, shape=shape, dimensions=dimensions,
                 onstack=onstack, onheap=(not onstack))
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search for redundancies across the expressions and expose them to the
    later stages of the optimisation pipeline by introducing new temporaries
    of suitable rank.

    Two types of redundancies are sought:

        * Time-invariants, and
        * Across different space points

    Examples
    ========
    Let ``t`` be the time dimension, ``x, y, z`` the space dimensions. Then:

    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>> ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>> ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    if cluster.is_sparse:
        return cluster

    # For more information about "aliases", refer to collect.__doc__
    mapper, aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    g = cluster.trace
    indices = g.space_indices
    time_invariants = {v.rhs: g.time_invariant(v) for v in g.values()}

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in g.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(mapper.get(v.rhs, []))
        cost = estimate_cost(v, True)*naliases
        if cost >= self.thresholds['min-cost-alias'] and\
                (naliases > 1 or time_invariants[v.rhs]):
            candidates[v.rhs] = k
        else:
            processed.append(v)

    # Create alias Clusters and all necessary substitution rules
    # for the new temporaries
    alias_clusters = ClusterGroup()
    rules = OrderedDict()
    for origin, alias in aliases.items():
        if all(i not in candidates for i in alias.aliased):
            continue

        # Construct an iteration space suitable for /alias/
        intervals, sub_iterators, directions = cluster.ispace.args
        intervals = [Interval(i.dim, *alias.relaxed_diameter.get(i.dim, i.limits))
                     for i in cluster.ispace.intervals]
        ispace = IterationSpace(intervals, sub_iterators, directions)

        # Optimization: perhaps we can lift the cluster outside the time dimension
        if all(time_invariants[i] for i in alias.aliased):
            ispace = ispace.project(lambda i: not i.is_Time)

        # Build a symbolic function for /alias/
        intervals = ispace.intervals
        halo = [(abs(intervals[i].lower), abs(intervals[i].upper)) for i in indices]
        function = Array(name=template(), dimensions=indices, halo=halo)
        access = tuple(i - intervals[i].lower for i in indices)
        expression = Eq(Indexed(function.indexed, *access), origin)

        # Construct a data space suitable for /alias/
        mapper = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(intervals)
                 for k, v in mapper.items() if k}
        dspace = DataSpace([i.zero() for i in intervals], parts)

        # Create a new Cluster for /alias/
        alias_clusters.append(Cluster([expression], ispace, dspace))

        # Add substitution rules
        for aliased, distance in alias.with_distance:
            access = [i - intervals[i].lower + j for i, j in distance if i in indices]
            temporary = Indexed(function.indexed, *tuple(access))
            rules[candidates[aliased]] = temporary
            rules[aliased] = temporary

    # Group clusters together if possible
    alias_clusters = groupby(alias_clusters).finalize()
    alias_clusters.sort(key=lambda i: i.is_dense)

    # Switch temporaries in the expression trees
    processed = [e.xreplace(rules) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search for redundancies across the expressions and expose them to the
    later stages of the optimisation pipeline by introducing new temporaries
    of suitable rank.

    Two types of redundancies are sought:

        * Time-invariants, and
        * Across different space points

    Examples
    ========
    Let ``t`` be the time dimension, ``x, y, z`` the space dimensions. Then:

    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>> ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>> ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    if cluster.is_sparse:
        return cluster

    # For more information about "aliases", refer to collect.__doc__
    mapper, aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    g = cluster.trace
    indices = g.space_indices
    time_invariants = {v.rhs: g.time_invariant(v) for v in g.values()}

    # Template for captured redundancies
    shape = tuple(i.symbolic_extent for i in indices)
    make = lambda i: Array(name=template(i), shape=shape, dimensions=indices).indexed

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in g.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(mapper.get(v.rhs, []))
        cost = estimate_cost(v, True)*naliases
        if cost >= self.thresholds['min-cost-alias'] and\
                (naliases > 1 or time_invariants[v.rhs]):
            candidates[v.rhs] = k
        else:
            processed.append(Eq(k, v.rhs))

    # Create temporaries capturing redundant computation
    expressions = []
    stencils = []
    rules = OrderedDict()
    for c, (origin, alias) in enumerate(aliases.items()):
        if all(i not in candidates for i in alias.aliased):
            continue
        # Build alias expression
        function = make(c)
        expressions.append(Eq(Indexed(function, *indices), origin))
        # Build substitution rules
        for aliased, distance in alias.with_distance:
            coordinates = [sum([i, j]) for i, j in distance.items() if i in indices]
            temporary = Indexed(function, *tuple(coordinates))
            rules[candidates[aliased]] = temporary
            rules[aliased] = temporary
        # Build cluster stencil
        stencil = alias.anti_stencil.anti(cluster.stencil)
        if all(time_invariants[i] for i in alias.aliased):
            # Optimization: drop time dimension if time-invariant and the
            # alias involves a complex calculation
            stencil = stencil.section(g.time_indices)
        stencils.append(stencil)

    # Create the alias clusters
    alias_clusters = clusterize(expressions, stencils)
    alias_clusters = sorted(alias_clusters, key=lambda i: i.is_dense)

    # Switch temporaries in the expression trees
    processed = [e.xreplace(rules) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
def __init__(self, function, contracted_dims, accessv, n, async_degree):
    self.function = function
    self.accessv = accessv

    contraction_mapper = {}
    index_mapper = {}
    dims = list(function.dimensions)
    for d in contracted_dims:
        assert d in function.dimensions

        # Determine the buffer size along `d`
        indices = filter_ordered(i.indices[d] for i in accessv.accesses)
        slots = [i.xreplace({d: 0, d.spacing: 1}) for i in indices]
        size = max(slots) - min(slots) + 1

        if async_degree is not None:
            if async_degree < size:
                warning("Ignoring provided asynchronous degree as it'd be "
                        "too small for the required buffer (provided %d, "
                        "but need at least %d for `%s`)"
                        % (async_degree, size, function.name))
            else:
                size = async_degree

        # Replace `d` with a suitable CustomDimension
        bd = CustomDimension('db%d' % n, 0, size-1, size, d)
        contraction_mapper[d] = dims[dims.index(d)] = bd

        if size > 1:
            # Create the necessary SteppingDimensions for indexing
            sd = SteppingDimension(name='sb%d' % n, parent=bd)
            index_mapper.update({i: i.xreplace({d: sd}) for i in indices})
        else:
            # Special case, no need to keep a SteppingDimension around
            index_mapper.update({i: 0 for i in indices})

    self.contraction_mapper = contraction_mapper
    self.index_mapper = index_mapper

    # Track the SubDimensions used to index into `function`
    subdims_mapper = DefaultOrderedDict(set)
    for e in accessv.mapper:
        try:
            # Case 1: implicitly via SubDomains
            m = {d.root: v for d, v in e.subdomain.dimension_map.items()}
        except AttributeError:
            # Case 2: explicitly via the lower-level SubDimension API
            m = {i.root: i for i in e.free_symbols
                 if isinstance(i, Dimension) and (i.is_Sub or not i.is_Derived)}
        for d, v in m.items():
            subdims_mapper[d].add(v)
    if any(len(v) > 1 for v in subdims_mapper.values()):
        # Non-uniform SubDimensions. At this point we're going to raise
        # an exception. It's either illegal or still unsupported
        for v in subdims_mapper.values():
            for d0, d1 in combinations(v, 2):
                if d0.overlap(d1):
                    raise InvalidOperator("Cannot apply `buffering` to `%s` as it "
                                          "is accessed over the overlapping "
                                          " SubDimensions `<%s, %s>`"
                                          % (function, d0, d1))
        raise NotImplementedError("`buffering` does not support multiple "
                                  "non-overlapping SubDimensions yet.")
    else:
        self.subdims_mapper = {d: v.pop() for d, v in subdims_mapper.items()}

    self.buffer = Array(name='%sb' % function.name,
                        dimensions=dims,
                        dtype=function.dtype,
                        halo=function.halo,
                        space='mapped')
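# A sketch of the buffer-size computation performed above (plain SymPy,
# illustrative): the accessed indices along the contracted dimension are
# normalized to slot numbers by zeroing the dimension and unit-izing its
# spacing, and the buffer must then span max(slot) - min(slot) + 1 entries.
import sympy

t, h = sympy.symbols('t h')      # `h` stands in for `t.spacing`
indices = [t - h, t, t + h]      # e.g. accesses u[t-1], u[t], u[t+1]
slots = [i.xreplace({t: 0, h: 1}) for i in indices]   # -> [-1, 0, 1]
size = max(slots) - min(slots) + 1
assert size == 3                 # a three-slot circular buffer suffices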
def process(candidates, aliases, cluster, template):
    """
    Create Clusters from aliasing expressions.
    """
    clusters = []
    subs = {}
    for origin, alias in aliases.items():
        if all(i not in candidates for i in alias.aliased):
            continue

        # The write-to Intervals
        writeto = [Interval(i.dim, *alias.relaxed_diameter.get(i.dim, (0, 0)))
                   for i in cluster.ispace.intervals if not i.dim.is_Time]
        writeto = IntervalGroup(writeto)

        # Optimization: no need to retain a SpaceDimension if it does not
        # induce a flow/anti dependence (below, `i.offsets` captures this, by
        # telling how much halo will be required to honour such dependences)
        dep_inducing = [i for i in writeto if any(i.offsets)]
        try:
            index = writeto.index(dep_inducing[0])
            writeto = IntervalGroup(writeto[index:])
        except IndexError:
            perf_adv("Could not optimize some of the detected redundancies")

        # Create a temporary to store `alias`
        dimensions = [d.root for d in writeto.dimensions]
        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]
        array = Array(name=template(), dimensions=dimensions, halo=halo,
                      dtype=cluster.dtype)

        # Build up the expression evaluating `alias`
        access = tuple(i.dim - i.lower for i in writeto)
        expression = Eq(array[access], origin.xreplace(subs))

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, distance in alias.with_distance:
            assert all(i.dim in distance.labels for i in writeto)
            access = [i.dim - i.lower + distance[i.dim] for i in writeto]
            if aliased in candidates:
                # It would *not* be in `candidates` if part of a composite alias
                subs[candidates[aliased]] = array[access]
            subs[aliased] = array[access]

        # Construct the `alias` IterationSpace
        intervals, sub_iterators, directions = cluster.ispace.args
        ispace = IterationSpace(intervals.add(writeto), sub_iterators, directions)

        # Optimize the `alias` IterationSpace: if possible, the innermost
        # IterationInterval is rounded up to a multiple of the vector length
        try:
            it = ispace.itintervals[-1]
            if ROUNDABLE in cluster.properties[it.dim]:
                from devito.parameters import configuration
                vl = configuration['platform'].simd_items_per_reg(cluster.dtype)
                ispace = ispace.add(Interval(it.dim, 0, it.interval.size % vl))
        except (TypeError, KeyError):
            pass

        # Construct the `alias` DataSpace
        mapper = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals)
                 for k, v in mapper.items() if k}
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Create a new Cluster for `alias`
        clusters.append(cluster.rebuild(exprs=[expression],
                                        ispace=ispace, dspace=dspace))

    return clusters, subs
def fc(self, grid, x, y):
    return Array(name='fc', shape=(3, 5), dimensions=(x, y)).indexed
def fa(self, grid, x):
    return Array(name='fa', shape=(3,), dimensions=(x,)).indexed
def fa(self, grid):
    return Array(name='fa', dimensions=(grid.dimensions[0],), shape=(3,)).indexed
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search for aliasing expressions and capture them into vector temporaries.

    Examples
    --------
    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>> ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>> ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    # For more information about "aliases", refer to collect.__doc__
    aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    graph = FlowGraph(cluster.exprs)
    time_invariants = {v.rhs: graph.time_invariant(v) for v in graph.values()}

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in graph.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(aliases.get(v.rhs))
        cost = estimate_cost(v, True)*naliases
        test0 = lambda: cost >= self.MIN_COST_ALIAS and naliases > 1
        test1 = lambda: cost >= self.MIN_COST_ALIAS_INV and time_invariants[v.rhs]
        if test0() or test1():
            candidates[v.rhs] = k
        else:
            processed.append(v)

    # Create alias Clusters and all necessary substitution rules
    # for the new temporaries
    alias_clusters = []
    subs = {}
    for origin, alias in aliases.items():
        if all(i not in candidates for i in alias.aliased):
            continue

        # The write-to Intervals
        writeto = [Interval(i.dim, *alias.relaxed_diameter.get(i.dim, (0, 0)))
                   for i in cluster.ispace.intervals if not i.dim.is_Time]
        writeto = IntervalGroup(writeto)

        # Optimization: no need to retain a SpaceDimension if it does not
        # induce a flow/anti dependence (below, `i.offsets` captures this, by
        # telling how much halo will be needed to honour such dependences)
        dep_inducing = [i for i in writeto if any(i.offsets)]
        try:
            index = writeto.index(dep_inducing[0])
            writeto = IntervalGroup(writeto[index:])
        except IndexError:
            warning("Couldn't optimize some of the detected redundancies")

        # Create a temporary to store `alias`
        dimensions = [d.root for d in writeto.dimensions]
        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]
        array = Array(name=template(), dimensions=dimensions, halo=halo,
                      dtype=cluster.dtype)

        # Build up the expression evaluating `alias`
        access = tuple(i.dim - i.lower for i in writeto)
        expression = Eq(array[access], origin)

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, distance in alias.with_distance:
            assert all(i.dim in distance.labels for i in writeto)
            access = [i.dim - i.lower + distance[i.dim] for i in writeto]
            if aliased in candidates:
                # It would *not* be in `candidates` if part of a composite alias
                subs[candidates[aliased]] = array[access]
            subs[aliased] = array[access]

        # Construct the `alias` IterationSpace
        intervals, sub_iterators, directions = cluster.ispace.args
        ispace = IterationSpace(intervals.add(writeto), sub_iterators, directions)

        # Construct the `alias` DataSpace
        mapper = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals)
                 for k, v in mapper.items() if k}
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Create a new Cluster for `alias`
        alias_clusters.append(Cluster([expression], ispace, dspace))

    # Switch temporaries in the expression trees
    processed = [e.xreplace(subs) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
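# A SymPy sketch of the capture the docstring's example 2 describes
# (illustrative, not the Devito machinery; `r` is a hypothetical temporary):
# two expressions sharing the translated subexpression a*b are rewritten
# against one vector temporary, read at different offsets.
import sympy
from sympy import IndexedBase

x = sympy.Symbol('x')
a, b, r = IndexedBase('a'), IndexedBase('b'), IndexedBase('r')

temp1 = 2.0*a[x]*b[x]
temp2 = 3.0*a[x + 1]*b[x + 1]

# One temporary holds the common computation...
definition = sympy.Eq(r[x], a[x]*b[x])
# ...and each aliasing expression reads it at its own distance
assert temp1.subs(a[x]*b[x], r[x]) == 2.0*r[x]
assert temp2.subs(a[x + 1]*b[x + 1], r[x + 1]) == 3.0*r[x + 1]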
def _make_sendrecv(self, f, fixed, extra=None):
    extra = extra or []
    comm = f.grid.distributor._obj_comm

    buf_dims = [Dimension(name='buf_%s' % d.root) for d in f.dimensions
                if d not in fixed]
    bufg = Array(name='bufg', dimensions=buf_dims, dtype=f.dtype, scope='heap')
    bufs = Array(name='bufs', dimensions=buf_dims, dtype=f.dtype, scope='heap')

    ofsg = [Symbol(name='og%s' % d.root) for d in f.dimensions]
    ofss = [Symbol(name='os%s' % d.root) for d in f.dimensions]

    fromrank = Symbol(name='fromrank')
    torank = Symbol(name='torank')

    args = [bufg] + list(bufg.shape) + [f] + ofsg + extra
    gather = Call('gather%dd' % f.ndim, args)
    args = [bufs] + list(bufs.shape) + [f] + ofss + extra
    scatter = Call('scatter%dd' % f.ndim, args)

    # The `gather` is unnecessary if sending to MPI.PROC_NULL
    gather = Conditional(CondNe(torank, Macro('MPI_PROC_NULL')), gather)

    # The `scatter` must be guarded as we must not alter the halo values along
    # the domain boundary, where the sender is actually MPI.PROC_NULL
    scatter = Conditional(CondNe(fromrank, Macro('MPI_PROC_NULL')), scatter)

    srecv = MPIStatusObject(name='srecv')
    ssend = MPIStatusObject(name='ssend')
    rrecv = MPIRequestObject(name='rrecv')
    rsend = MPIRequestObject(name='rsend')

    count = reduce(mul, bufs.shape, 1)
    recv = Call('MPI_Irecv', [bufs, count, Macro(dtype_to_mpitype(f.dtype)),
                              fromrank, Integer(13), comm, rrecv])
    send = Call('MPI_Isend', [bufg, count, Macro(dtype_to_mpitype(f.dtype)),
                              torank, Integer(13), comm, rsend])

    waitrecv = Call('MPI_Wait', [rrecv, srecv])
    waitsend = Call('MPI_Wait', [rsend, ssend])

    iet = List(body=[recv, gather, send, waitsend, waitrecv, scatter])
    iet = List(body=iet_insert_C_decls(iet))
    parameters = ([f] + list(bufs.shape) + ofsg + ofss +
                  [fromrank, torank, comm] + extra)
    return Callable('sendrecv%dd' % f.ndim, iet, 'void', parameters, ('static',))
def fc(self, grid):
    return Array(name='fc', dimensions=(grid.dimensions[0], grid.dimensions[1]),
                 shape=(3, 5)).indexed
def ti3(self, grid):
    return Array(name='ti3', shape=(3, 5, 7), dimensions=grid.dimensions).indexify()
def lower_schedule(cluster, schedule, chosen, sregistry, options):
    """
    Turn a Schedule into a sequence of Clusters.
    """
    onstack = options['cire-onstack']

    clusters = []
    subs = {}
    for alias, writeto, ispace, aliaseds, indicess in schedule:
        if all(i not in chosen for i in aliaseds):
            continue

        # The Dimensions defining the shape of Array
        # Note: with SubDimensions, we may have the following situation:
        #
        # for zi = z_m + zi_ltkn; zi <= z_M - zi_rtkn; ...
        #   r[zi] = ...
        #
        # Instead of `r[zi - z_m - zi_ltkn]` we have just `r[zi]`, so we'll need
        # as much room as in `zi`'s parent to avoid going OOB
        # Aside from ugly generated code, the reason we do not rather shift the
        # indices is that it prevents future passes from transforming the loop
        # bounds (e.g., MPI's comp/comm overlap does that)
        dimensions = [d.parent if d.is_Sub else d for d in writeto.itdimensions]

        halo = [(abs(i.lower), abs(i.upper)) for i in writeto]

        # The data sharing mode of the Array. It can safely be `shared` only if
        # all of the PARALLEL `cluster` Dimensions appear in `writeto`
        parallel = [d for d, v in cluster.properties.items() if PARALLEL in v]
        sharing = 'shared' if set(parallel) == set(writeto.itdimensions) else 'local'

        # The memory region of the Array. On the heap, unless the user has
        # explicitly requested allocation on the stack
        scope = 'stack' if onstack else 'heap'

        array = Array(name=sregistry.make_name(), dimensions=dimensions, halo=halo,
                      dtype=cluster.dtype, scope=scope, sharing=sharing)

        indices = []
        for i in writeto:
            try:
                # E.g., `xs`
                sub_iterators = writeto.sub_iterators[i.dim]
                assert len(sub_iterators) == 1
                indices.append(sub_iterators[0])
            except KeyError:
                # E.g., `z` -- a non-shifted Dimension
                indices.append(i.dim - i.lower)

        expression = Eq(array[indices], alias)

        # Create the substitution rules so that we can use the newly created
        # temporary in place of the aliasing expressions
        for aliased, indices in zip(aliaseds, indicess):
            subs[aliased] = array[indices]

            if aliased in chosen:
                subs[chosen[aliased]] = array[indices]
            else:
                # Perhaps part of a composite alias ?
                pass

        # Construct the `alias` DataSpace
        accesses = detect_accesses(expression)
        parts = {k: IntervalGroup(build_intervals(v)).add(ispace.intervals).relaxed
                 for k, v in accesses.items() if k}
        dspace = DataSpace(cluster.dspace.intervals, parts)

        # Drop parallelism if using ModuloDimensions (due to rotations)
        properties = dict(cluster.properties)
        for d, v in cluster.properties.items():
            if any(i.is_Modulo for i in ispace.sub_iterators[d]):
                properties[d] = normalize_properties(v, {SEQUENTIAL})

        # Finally, build the `alias` Cluster
        clusters.append(cluster.rebuild(exprs=expression, ispace=ispace,
                                        dspace=dspace, properties=properties))

    return clusters, subs
def array(name, shape, dimensions, scope='heap'):
    return Array(name=name, shape=shape, dimensions=dimensions, scope=scope)
def __init__(self, function, contracted_dims, accessv, options, sregistry,
             bds=None, mds=None):
    # Parse compilation options
    async_degree = options['buf-async-degree']
    space = options['buf-mem-space']
    dtype = options['buf-dtype'](function)

    self.function = function
    self.accessv = accessv

    self.contraction_mapper = {}
    self.index_mapper = defaultdict(dict)
    self.sub_iterators = defaultdict(list)
    self.subdims_mapper = DefaultOrderedDict(set)

    # Create the necessary ModuloDimensions for indexing into the buffer
    # E.g., `u[time,x] + u[time+1,x]` -> `ub[sb0,x] + ub[sb1,x]`, where `sb0`
    # and `sb1` are ModuloDimensions starting at `time` and `time+1` respectively
    dims = list(function.dimensions)
    for d in contracted_dims:
        assert d in function.dimensions

        # Determine the buffer size, and therefore the span of the
        # ModuloDimension, along the contracting Dimension `d`
        indices = filter_ordered(i.indices[d] for i in accessv.accesses)
        slots = [i.subs({d: 0, d.spacing: 1}) for i in indices]
        try:
            size = max(slots) - min(slots) + 1
        except TypeError:
            # E.g., special case `slots=[-1 + time/factor, 2 + time/factor]`
            # Resort to the fast vector-based comparison machinery (rather than
            # the slower sympy.simplify)
            slots = [Vector(i) for i in slots]
            size = int((vmax(*slots) - vmin(*slots) + 1)[0])

        if async_degree is not None:
            if async_degree < size:
                warning("Ignoring provided asynchronous degree as it'd be "
                        "too small for the required buffer (provided %d, "
                        "but need at least %d for `%s`)"
                        % (async_degree, size, function.name))
            else:
                size = async_degree

        # Replace `d` with a suitable CustomDimension `bd`
        name = sregistry.make_name(prefix='db')
        bd = bds.setdefault((d, size), CustomDimension(name, 0, size-1, size, d))
        self.contraction_mapper[d] = dims[dims.index(d)] = bd

        # Finally create the ModuloDimensions as children of `bd`
        if size > 1:
            # Note: indices are sorted so that the semantic order (sb0, sb1, sb2)
            # follows SymPy's index ordering (time, time-1, time+1) after modulo
            # replacement, so that associativity errors are consistent. This very
            # same strategy is also applied in clusters/algorithms/Stepper
            p, _ = offset_from_centre(d, indices)
            indices = sorted(indices,
                             key=lambda i: -np.inf if i - p == 0 else (i - p))
            for i in indices:
                name = sregistry.make_name(prefix='sb')
                md = mds.setdefault((bd, i), ModuloDimension(name, bd, i, size))
                self.index_mapper[d][i] = md
                self.sub_iterators[d.root].append(md)
        else:
            assert len(indices) == 1
            self.index_mapper[d][indices[0]] = 0

    # Track the SubDimensions used to index into `function`
    for e in accessv.mapper:
        m = {i.root: i for i in e.free_symbols
             if isinstance(i, Dimension) and (i.is_Sub or not i.is_Derived)}
        for d, v in m.items():
            self.subdims_mapper[d].add(v)
    if any(len(v) > 1 for v in self.subdims_mapper.values()):
        # Non-uniform SubDimensions. At this point we're going to raise
        # an exception. It's either illegal or still unsupported
        for v in self.subdims_mapper.values():
            for d0, d1 in combinations(v, 2):
                if d0.overlap(d1):
                    raise InvalidOperator("Cannot apply `buffering` to `%s` as it "
                                          "is accessed over the overlapping "
                                          " SubDimensions `<%s, %s>`"
                                          % (function, d0, d1))
        raise NotImplementedError("`buffering` does not support multiple "
                                  "non-overlapping SubDimensions yet.")
    else:
        self.subdims_mapper = {d: v.pop() for d, v in self.subdims_mapper.items()}

    # Build and sanity-check the buffer IterationIntervals
    self.itintervals_mapper = {}
    for e in accessv.mapper:
        for i in e.ispace.itintervals:
            v = self.itintervals_mapper.setdefault(i.dim, i.args)
            if v != i.args:
                raise NotImplementedError("Cannot apply `buffering` as the buffered "
                                          "function `%s` is accessed over multiple, "
                                          "non-compatible iteration spaces along the "
                                          "Dimension `%s`" % (function.name, i.dim))
    # Also add IterationIntervals for initialization along `x`, should `xi` be
    # the only written Dimension in the `x` hierarchy
    for d, (interval, _, _) in list(self.itintervals_mapper.items()):
        for i in d._defines:
            self.itintervals_mapper.setdefault(i, (interval.relaxed, (), Forward))

    # Finally create the actual buffer
    self.buffer = Array(name=sregistry.make_name(prefix='%sb' % function.name),
                        dimensions=dims,
                        dtype=dtype,
                        halo=function.halo,
                        space=space)
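# A sketch of the modulo indexing the buffer above relies upon (plain Python,
# illustrative): with a buffer of `size` slots, each accessed time index maps
# to slot (time + shift) % size, which is exactly what the ModuloDimensions
# `sb0`, `sb1`, ... express symbolically (u[time], u[time+1] -> ub[sb0], ub[sb1]).
size = 2  # e.g. accesses u[time] and u[time+1] -> a two-slot buffer
for time in range(4):
    sb0 = time % size          # slot holding u[time]
    sb1 = (time + 1) % size    # slot holding u[time+1]
    assert sb0 != sb1          # the two live time levels never collide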