def promote_scalar_expressions(exprs, shape, indices, onstack):
    """
    Transform a collection of scalar expressions into tensor expressions.
    """
    processed = []

    # First promote the LHS
    graph = temporaries_graph(exprs)
    mapper = {}
    for k, v in graph.items():
        if v.is_scalar:
            # Create a new function symbol
            data = TensorFunction(name=k.name, shape=shape,
                                  dimensions=indices, onstack=onstack)
            indexed = Indexed(data.indexed, *indices)
            mapper[k] = indexed
            processed.append(Eq(indexed, v.rhs))
        else:
            processed.append(Eq(k, v.rhs))

    # Propagate the transformed LHS through the expressions
    processed = [Eq(n.lhs, n.rhs.xreplace(mapper)) for n in processed]

    return processed
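# A minimal, self-contained sketch of the promotion step above using plain
# SymPy, for illustration only. The names `ps0`, `_demo_promote_scalar` are
# hypothetical; the real pass relies on `temporaries_graph` and
# `TensorFunction` from the surrounding module.
def _demo_promote_scalar():
    from sympy import Eq, Symbol, symbols, IndexedBase

    x, y = symbols('x y')
    a, ps0 = IndexedBase('a'), IndexedBase('ps0')
    s0 = Symbol('s0')
    exprs = [Eq(s0, a[x, y] + 1.), Eq(a[x, y], 2.*s0)]

    # Promote the scalar LHS to an indexed access, then propagate the
    # substitution through every expression, as promote_scalar_expressions does
    mapper = {s0: ps0[x, y]}
    return [Eq(e.lhs.xreplace(mapper), e.rhs.xreplace(mapper)) for e in exprs]
    # -> [Eq(ps0[x, y], a[x, y] + 1.0), Eq(a[x, y], 2.0*ps0[x, y])]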
def _padding(self, state, **kwargs):
    """
    Introduce temporary buffers padded to the nearest multiple of the vector
    length, to maximize data alignment. At the bottom of the kernel, the
    values in the padded temporaries will be copied back into the input arrays.
    """
    mapper = OrderedDict()
    for node in state.nodes:
        # Assess feasibility of the transformation
        handle = FindSymbols('symbolics-writes').visit(node)
        if not handle:
            continue

        shape = max([i.shape for i in handle], key=len)
        if not shape:
            continue

        candidates = [i for i in handle if i.shape[-1] == shape[-1]]
        if not candidates:
            continue

        # Retrieve the maximum number of items in a SIMD register when
        # processing the expressions in /node/
        exprs = FindNodes(Expression).visit(node)
        exprs = [e for e in exprs if e.output_function in candidates]
        assert len(exprs) > 0
        dtype = exprs[0].dtype
        assert all(e.dtype == dtype for e in exprs)
        try:
            simd_items = get_simd_items(dtype)
        except KeyError:
            # Fallback to 16 (maximum expectable padding, for AVX512 registers)
            simd_items = simdinfo['avx512f'] // np.dtype(dtype).itemsize

        shapes = {k: k.shape[:-1] + (roundm(k.shape[-1], simd_items),)
                  for k in candidates}
        mapper.update(OrderedDict([(k.indexed,
                                    TensorFunction(name='p%s' % k.name,
                                                   shape=shapes[k],
                                                   dimensions=k.indices,
                                                   onstack=k._mem_stack).indexed)
                                   for k in candidates]))

    # Substitute original arrays with padded buffers
    processed = [SubstituteExpression(mapper).visit(n) for n in state.nodes]

    # Build Iteration trees for initialization and copy-back of padded arrays
    mapper = OrderedDict([(k, v) for k, v in mapper.items()
                          if k.function.is_SymbolicData])
    init = copy_arrays(mapper, reverse=True)
    copyback = copy_arrays(mapper)

    processed = init + as_tuple(processed) + copyback

    return {'nodes': processed}
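# A self-contained sketch of the padding arithmetic used above. The register
# width and `roundm` helper here are local stand-ins (the real pass queries
# `get_simd_items`/`simdinfo`); only the round-up-to-multiple logic is shown.
def _demo_padding_arithmetic():
    import numpy as np

    def roundm(x, y):
        # Round x up to the nearest multiple of y
        return x if x % y == 0 else x + y - x % y

    dtype = np.float32
    simd_register_bytes = 32  # e.g. AVX; AVX-512 registers would be 64 bytes
    simd_items = simd_register_bytes // np.dtype(dtype).itemsize  # -> 8 floats

    # Only the innermost (fastest-varying) dimension is padded
    shape = (40, 40, 43)
    padded = shape[:-1] + (roundm(shape[-1], simd_items),)
    assert padded == (40, 40, 48)
    return padded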
def _eliminate_inter_stencil_redundancies(self, cluster, **kwargs):
    """
    Search for redundancies across the expressions and expose them to the
    later stages of the optimisation pipeline by introducing new temporaries
    of suitable rank.

    Two types of redundancies are sought:

        * Time-invariants, and
        * Across different space points

    Examples
    ========
    Let ``t`` be the time dimension, ``x, y, z`` the space dimensions. Then:

    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>>
       ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>>
       ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    if cluster.is_sparse:
        return cluster

    # For more information about "aliases", refer to collect_aliases.__doc__
    mapper, aliases = collect_aliases(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    g = cluster.trace
    indices = g.space_indices
    shape = g.space_shape

    # Template for captured redundancies
    name = self.conventions['redundancy'] + "%d"
    template = lambda i: TensorFunction(name=name % i, shape=shape,
                                        dimensions=indices).indexed

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in g.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(mapper.get(v.rhs, []))
        cost = estimate_cost(v, True) * naliases
        if cost >= self.thresholds['min-cost-time-hoist'] and g.time_invariant(v):
            candidates[v.rhs] = k
        elif cost >= self.thresholds['min-cost-space-hoist'] and naliases > 1:
            candidates[v.rhs] = k
        else:
            processed.append(Eq(k, v.rhs))

    # Create temporaries capturing redundant computation
    found = []
    rules = OrderedDict()
    stencils = []
    for c, (origin, alias) in enumerate(aliases.items()):
        temporary = Indexed(template(c), *indices)
        found.append(Eq(temporary, origin))
        # Track the stencil of each TensorFunction introduced
        stencils.append(alias.anti_stencil.anti(cluster.stencil))
        for aliased, distance in alias.with_distance:
            coordinates = [sum([i, j]) for i, j in distance.items()
                           if i in indices]
            rules[candidates[aliased]] = Indexed(template(c), *tuple(coordinates))

    # Create the alias clusters
    alias_clusters = clusterize(found, stencils)
    alias_clusters = sorted(alias_clusters, key=lambda i: i.is_dense)

    # Switch temporaries in the expression trees
    processed = [e.xreplace(rules) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
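# A plain-SymPy sketch of docstring example 1: the time-invariant
# subexpression a[x,y,z] + b[x,y,z] is captured in a temporary `ti` of
# space-only rank and substituted back. All names here are hypothetical;
# the real pass derives candidates via collect_aliases and estimate_cost.
def _demo_time_invariant_hoist():
    from sympy import Eq, symbols, IndexedBase

    t, x, y, z = symbols('t x y z')
    a, b, c, ti, temp = (IndexedBase(n) for n in ('a', 'b', 'c', 'ti', 'temp'))

    origin = a[x, y, z] + b[x, y, z]  # the time-invariant redundancy
    expr = Eq(temp[t, x, y, z], origin*c[t, x, y, z])

    # Hoist: compute the redundancy once per space point...
    hoisted = Eq(ti[x, y, z], origin)
    # ...and reuse the temporary in the original expression
    rewritten = expr.xreplace({origin: ti[x, y, z]})
    return [hoisted, rewritten]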
def tensorfunction(name, shape, dimensions, onstack=False):
    """Shortcut to create a TensorFunction."""
    return TensorFunction(name=name, shape=shape, dimensions=dimensions,
                          onstack=onstack)
def _eliminate_inter_stencil_redundancies(self, cluster, template, **kwargs):
    """
    Search for redundancies across the expressions and expose them to the
    later stages of the optimisation pipeline by introducing new temporaries
    of suitable rank.

    Two types of redundancies are sought:

        * Time-invariants, and
        * Across different space points

    Examples
    ========
    Let ``t`` be the time dimension, ``x, y, z`` the space dimensions. Then:

    1) temp = (a[x,y,z]+b[x,y,z])*c[t,x,y,z]
       >>>
       ti[x,y,z] = a[x,y,z] + b[x,y,z]
       temp = ti[x,y,z]*c[t,x,y,z]

    2) temp1 = 2.0*a[x,y,z]*b[x,y,z]
       temp2 = 3.0*a[x,y,z+1]*b[x,y,z+1]
       >>>
       ti[x,y,z] = a[x,y,z]*b[x,y,z]
       temp1 = 2.0*ti[x,y,z]
       temp2 = 3.0*ti[x,y,z+1]
    """
    if cluster.is_sparse:
        return cluster

    # For more information about "aliases", refer to collect.__doc__
    mapper, aliases = collect(cluster.exprs)

    # Redundancies will be stored in space-varying temporaries
    g = cluster.trace
    indices = g.space_indices
    time_invariants = {v.rhs: g.time_invariant(v) for v in g.values()}

    # Template for captured redundancies
    shape = tuple(i.symbolic_size for i in indices)
    make = lambda i: TensorFunction(name=template(i), shape=shape,
                                    dimensions=indices).indexed

    # Find the candidate expressions
    processed = []
    candidates = OrderedDict()
    for k, v in g.items():
        # Cost check (to keep the memory footprint under control)
        naliases = len(mapper.get(v.rhs, []))
        cost = estimate_cost(v, True) * naliases
        if cost >= self.thresholds['min-cost-alias'] and\
                (naliases > 1 or time_invariants[v.rhs]):
            candidates[v.rhs] = k
        else:
            processed.append(Eq(k, v.rhs))

    # Create temporaries capturing redundant computation
    expressions = []
    stencils = []
    rules = OrderedDict()
    for c, (origin, alias) in enumerate(aliases.items()):
        if all(i not in candidates for i in alias.aliased):
            continue
        # Build alias expression
        function = make(c)
        expressions.append(Eq(Indexed(function, *indices), origin))
        # Build substitution rules
        for aliased, distance in alias.with_distance:
            coordinates = [sum([i, j]) for i, j in distance.items()
                           if i in indices]
            temporary = Indexed(function, *tuple(coordinates))
            rules[candidates[aliased]] = temporary
            rules[aliased] = temporary
        # Build cluster stencil
        stencil = alias.anti_stencil.anti(cluster.stencil)
        if all(time_invariants[i] for i in alias.aliased):
            # Optimization: drop time dimension if time-invariant and the
            # alias involves a complex calculation
            stencil = stencil.section(g.time_indices)
        stencils.append(stencil)

    # Create the alias clusters
    alias_clusters = clusterize(expressions, stencils, indices)
    alias_clusters = sorted(alias_clusters, key=lambda i: i.is_dense)

    # Switch temporaries in the expression trees
    processed = [e.xreplace(rules) for e in processed]

    return alias_clusters + [cluster.rebuild(processed)]
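# A plain-SymPy sketch of docstring example 2, complementing the hoisting demo
# above: two aliasing products at distance (0, 0, 1) map onto one temporary,
# accessed at shifted coordinates. Names are hypothetical; the real pass
# derives the distances via collect and Alias.with_distance. Note the use of
# subs rather than xreplace, since the rule keys here are partial products.
def _demo_shifted_alias_rules():
    from sympy import Eq, symbols, IndexedBase

    x, y, z = symbols('x y z')
    a, b, ti = IndexedBase('a'), IndexedBase('b'), IndexedBase('ti')

    exprs = [Eq(symbols('temp1'), 2.0*a[x, y, z]*b[x, y, z]),
             Eq(symbols('temp2'), 3.0*a[x, y, z+1]*b[x, y, z+1])]

    # One alias expression, two substitution rules at different offsets
    alias = Eq(ti[x, y, z], a[x, y, z]*b[x, y, z])
    rules = {a[x, y, z]*b[x, y, z]: ti[x, y, z],
             a[x, y, z+1]*b[x, y, z+1]: ti[x, y, z+1]}
    return [alias] + [e.subs(rules) for e in exprs]
    # -> [Eq(ti[x,y,z], a[x,y,z]*b[x,y,z]),
    #     Eq(temp1, 2.0*ti[x,y,z]), Eq(temp2, 3.0*ti[x,y,z+1])]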