def decorate(nodes):
    """
    Add SIMD pragmas to the vectorizable Iterations found within each node
    in ``nodes``.

    For every Iteration flagged ``is_Vectorizable``: if all written/read
    Tensors have an innermost extent that is a multiple of the SIMD vector
    length for their dtype, an "aligned" simd pragma naming those arrays is
    attached; otherwise a plain simd pragma is used.

    :param nodes: iterable of IET nodes to process.
    :return: list of transformed nodes, one per input node.
    """
    # NOTE(review): ``ignore_deps`` below is a free variable -- it must come
    # from an enclosing/module scope (cf. the analogous ``_simdize`` pass,
    # which builds it via ``self._compiler_decoration('ignore-deps')``).
    # Confirm it is in scope at the definition site.
    processed = []
    for node in nodes:
        mapper = {}
        for tree in retrieve_iteration_tree(node):
            vector_iterations = [i for i in tree if i.is_Vectorizable]
            for i in vector_iterations:
                handle = FindSymbols('symbolics').visit(i)
                try:
                    # Arrays whose innermost dimension is a whole number of
                    # SIMD vectors can be declared aligned to the compiler
                    aligned = [j for j in handle if j.is_Tensor and
                               j.shape[-1] % get_simd_items(j.dtype) == 0]
                except KeyError:
                    # dtype unknown to the SIMD-items table -- no alignment info
                    aligned = []
                if aligned:
                    simd = omplang['simd-for-aligned']
                    simd = as_tuple(simd(','.join([j.name for j in aligned]),
                                         simdinfo[get_simd_flag()]))
                else:
                    simd = as_tuple(omplang['simd-for'])
                # Rebuild the Iteration with the extra pragmas attached
                mapper[i] = i._rebuild(pragmas=i.pragmas + ignore_deps + simd)
        processed.append(Transformer(mapper).visit(node))
    return processed
def _simdize(self, nodes, state):
    """
    Add compiler-specific or, if not available, OpenMP pragmas to the
    Iteration/Expression tree to emit SIMD-friendly code.
    """
    # Compiler-specific "ignore dependencies" decoration, if any
    ignore_deps = as_tuple(self._compiler_decoration('ignore-deps'))

    mapper = {}
    for tree in retrieve_iteration_tree(nodes):
        for iteration in [i for i in tree if i.is_Vectorizable]:
            symbols = FindSymbols('symbolics').visit(iteration)
            try:
                # Tensors whose innermost extent is a whole number of SIMD
                # vectors may be advertised as aligned to the compiler
                aligned = [s for s in symbols if s.is_Tensor and
                           s.shape[-1] % get_simd_items(s.dtype) == 0]
            except KeyError:
                aligned = []
            if not aligned:
                pragmas = as_tuple(Ompizer.lang['simd-for'])
            else:
                names = ','.join([s.name for s in aligned])
                decorate = Ompizer.lang['simd-for-aligned']
                pragmas = as_tuple(decorate(names, simdinfo[get_simd_flag()]))
            # Attach the new pragmas to a rebuilt Iteration
            mapper[iteration] = iteration._rebuild(
                pragmas=iteration.pragmas + ignore_deps + pragmas)

    processed = Transformer(mapper).visit(nodes)

    return processed, {}
def _padding(self, nodes, state):
    """
    Introduce temporary buffers padded to the nearest multiple of the vector
    length, to maximize data alignment. At the bottom of the kernel, the
    values in the padded temporaries will be copied back into the input arrays.
    """
    mapper = OrderedDict()

    # Assess feasibility of the transformation
    handle = FindSymbols('symbolics-writes').visit(nodes)
    if not handle:
        # Nothing is written -- nothing to pad
        return nodes, {}
    shape = max([i.shape for i in handle], key=len)
    if not shape:
        return nodes, {}
    # Only arrays sharing the longest innermost extent are padded
    candidates = [i for i in handle if i.shape[-1] == shape[-1]]
    if not candidates:
        return nodes, {}

    # Retrieve the maximum number of items in a SIMD register when processing
    # the expressions in /node/
    exprs = FindNodes(Expression).visit(nodes)
    exprs = [e for e in exprs if e.write in candidates]
    assert len(exprs) > 0
    dtype = exprs[0].dtype
    # All candidate writes are expected to share one dtype
    assert all(e.dtype == dtype for e in exprs)
    try:
        simd_items = get_simd_items(dtype)
    except KeyError:
        # Fallback to 16 (maximum expectable padding, for AVX512 registers)
        simd_items = simdinfo['avx512f'] / np.dtype(dtype).itemsize

    # New shapes: innermost extent rounded up to a multiple of the vector length
    shapes = {k: k.shape[:-1] + (roundm(k.shape[-1], simd_items),)
              for k in candidates}
    mapper.update(OrderedDict([(k.indexed,
                                Array(name='p%s' % k.name,
                                      shape=shapes[k],
                                      dimensions=k.indices,
                                      onstack=k._mem_stack).indexed)
                               for k in candidates]))

    # Substitute original arrays with padded buffers
    processed = SubstituteExpression(mapper).visit(nodes)

    # Build Iteration trees for initialization and copy-back of padded arrays
    # (only user-level symbolic functions need the round trip)
    mapper = OrderedDict([(k, v) for k, v in mapper.items()
                          if k.function.is_SymbolicFunction])
    init = copy_arrays(mapper, reverse=True)
    copyback = copy_arrays(mapper)

    processed = List(body=init + as_tuple(processed) + copyback)

    return processed, {}
def _minimize_remainders(self, state, **kwargs):
    """
    Reshape temporary tensors and adjust loop trip counts to prevent as many
    compiler-generated remainder loops as possible.

    For each tagged vectorizable Iteration: (i) pad the innermost extent of
    the written tensors up to a multiple of the SIMD vector length; (ii) emit
    a guarded, padded loop endpoint that is used only when it stays within
    the bounds of every accessed array.
    """
    mapper = {}
    for tree in retrieve_iteration_tree(state.nodes + state.elemental_functions):
        vector_iterations = [i for i in tree if i.is_Vectorizable]
        if not vector_iterations:
            continue
        assert len(vector_iterations) == 1
        root = vector_iterations[0]
        if root.tag is None:
            continue

        # Padding
        writes = [i for i in FindSymbols('symbolics-writes').visit(root)
                  if i.is_TensorFunction]
        padding = []
        for i in writes:
            try:
                simd_items = get_simd_items(i.dtype)
            except KeyError:
                # Fallback to 16 (maximum expectable padding, for AVX512 registers)
                simd_items = simdinfo['avx512f'] / np.dtype(i.dtype).itemsize
            padding.append(simd_items - i.shape[-1] % simd_items)
        if len(set(padding)) == 1:
            padding = padding[0]
            for i in writes:
                i.update(shape=i.shape[:-1] + (i.shape[-1] + padding,))
        else:
            # Padding must be uniform -- not the case, so giving up
            continue

        # Dynamic trip count adjustment
        endpoint = root.end_symbolic
        if not endpoint.is_Symbol:
            continue
        condition = []
        externals = set(i.symbolic_shape[-1] for i in FindSymbols().visit(root))
        for i in root.uindices:
            for j in externals:
                condition.append(root.end_symbolic + padding < j)
        # The padded endpoint is usable only if it stays within the innermost
        # extent of *every* accessed array, hence the guards are AND-ed
        # (was erroneously ' || ', which would allow out-of-bounds accesses;
        # the sibling implementation of this pass also uses ' && ')
        condition = ' && '.join(ccode(i) for i in condition)
        endpoint_padded = endpoint.func(name='_%s' % endpoint.name)
        init = cgen.Initializer(
            cgen.Value("const int", endpoint_padded),
            cgen.Line('(%s) ? %s : %s' % (condition,
                                          ccode(endpoint + padding),
                                          endpoint))
        )

        # Update the Iteration bound
        limits = list(root.limits)
        limits[1] = endpoint_padded.func(endpoint_padded.name)
        rebuilt = list(tree)
        rebuilt[rebuilt.index(root)] = root._rebuild(limits=limits)

        mapper[tree[0]] = List(header=init, body=compose_nodes(rebuilt))

    nodes = Transformer(mapper).visit(state.nodes)
    elemental_functions = Transformer(mapper).visit(state.elemental_functions)

    return {'nodes': nodes, 'elemental_functions': elemental_functions}
def _minimize_remainders(self, nodes, state): """ Reshape temporary tensors and adjust loop trip counts to prevent as many compiler-generated remainder loops as possible. """ # The innermost dimension is the one that might get padded p_dim = -1 mapper = {} for tree in retrieve_iteration_tree(nodes): vector_iterations = [i for i in tree if i.is_Vectorizable] if not vector_iterations or len(vector_iterations) > 1: continue root = vector_iterations[0] if root.tag is None: continue # Padding writes = [i.write for i in FindNodes(Expression).visit(root) if i.write.is_Array] padding = [] for i in writes: try: simd_items = get_simd_items(i.dtype) except KeyError: # Fallback to 16 (maximum expectable padding, for AVX512 registers) simd_items = simdinfo['avx512f'] / np.dtype(i.dtype).itemsize padding.append(simd_items - i.shape[-1] % simd_items) if len(set(padding)) == 1: padding = padding[0] for i in writes: padded = (i._padding[p_dim][0], i._padding[p_dim][1] + padding) i.update(padding=i._padding[:p_dim] + (padded,)) else: # Padding must be uniform -- not the case, so giving up continue # Dynamic trip count adjustment endpoint = root.symbolic_max if not endpoint.is_Symbol: continue condition = [] externals = set(i.symbolic_shape[-1] for i in FindSymbols().visit(root) if i.is_Tensor) for i in root.uindices: for j in externals: condition.append(root.symbolic_max + padding < j) condition = ' && '.join(ccode(i) for i in condition) endpoint_padded = endpoint.func('_%s' % endpoint.name) init = cgen.Initializer( cgen.Value("const int", endpoint_padded), cgen.Line('(%s) ? %s : %s' % (condition, ccode(endpoint + padding), endpoint)) ) # Update the Iteration bound limits = list(root.limits) limits[1] = endpoint_padded.func(endpoint_padded.name) rebuilt = list(tree) rebuilt[rebuilt.index(root)] = root._rebuild(limits=limits) mapper[tree[0]] = List(header=init, body=compose_nodes(rebuilt)) processed = Transformer(mapper).visit(nodes) return processed, {}