def retag(self, tag_value=None):
    """
    Create a new Iteration object which is identical to ``self``, except
    for the tag.

    :param tag_value: (Optional) value used to build the new tag. Any value
                      other than None is honoured, including falsy ones such
                      as ``0``. If None, an internally generated value
                      (``ntags() + 1``) is used instead.
    :returns: The object produced by ``self._rebuild``. If ``self`` carries
              no tag, it is rebuilt without touching its properties.
    """
    if self.tag is None:
        # Nothing to replace
        return self._rebuild()
    # Test against None explicitly: ``tag_value or ...`` would discard a
    # legitimate tag value of 0 and replace it with a generated one
    properties = [tagger(tag_value if tag_value is not None else ntags() + 1)
                  if i.name == 'tag' else i
                  for i in self.properties]
    return self._rebuild(properties=properties)
def test_create_elemental_functions_simple(simple_function):
    """
    Tag the last Iteration of each tree in ``simple_function`` as ELEMENTAL,
    apply ``transform`` in 'split' mode, and compare the generated C code
    against the expected listing.
    """
    innermost = [tree[-1] for tree in retrieve_iteration_tree(simple_function)]

    # First attach the tag to every selected Iteration ...
    retagged = []
    for loop in innermost:
        retagged.append(loop._rebuild(properties=tagger(0)))

    # ... then mark each of them as ELEMENTAL too
    mapper = {}
    for original, tagged in zip(innermost, retagged):
        mapper[original] = tagged._rebuild(
            properties=(tagged.properties + (ELEMENTAL, )))

    function = Transformer(mapper).visit(simple_function)
    handle = transform(function, mode='split')
    block = List(body=[handle.nodes] + handle.elemental_functions)
    generated = str(block.ccode)

    # Drop pragmas and comment lines, to be compiler independent
    skipped = ('#pragma', '/*')
    kept = [line for line in generated.split('\n')
            if not any(marker in line for marker in skipped)]

    # NOTE(review): the comparison is against newline-joined output, so the
    # expected literal needs matching line breaks -- confirm they are intact
    expected = (
        """void foo(float *restrict a_vec, float *restrict b_vec,"""
        """ float *restrict c_vec, float *restrict d_vec) { float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec; float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec; float (*restrict c)[j_size] __attribute__((aligned(64))) = (float (*)[j_size]) c_vec; float (*restrict d)[j_size][k_size] __attribute__((aligned(64))) ="""
        """ (float (*)[j_size][k_size]) d_vec; for (int i = 0; i < 3; i += 1) { for (int j = 0; j < 5; j += 1) { f_0(0,7,(float*)a,(float*)b,(float*)c,(float*)d,i,i_size,j,j_size,k_size); } } } void f_0(const int k_start, const int k_finish,"""
        """ float *restrict a_vec, float *restrict b_vec,"""
        """ float *restrict c_vec, float *restrict d_vec,"""
        """ const int i, const int i_size, const int j, const int j_size, const int k_size) { float (*restrict a) __attribute__((aligned(64))) = (float (*)) a_vec; float (*restrict b) __attribute__((aligned(64))) = (float (*)) b_vec; float (*restrict c)[j_size] __attribute__((aligned(64))) = (float (*)[j_size]) c_vec; float (*restrict d)[j_size][k_size] __attribute__((aligned(64))) ="""
        """ (float (*)[j_size][k_size]) d_vec; for (int k = k_start; k < k_finish; k += 1) { a[i] = a[i] + b[i] + 5.0F; a[i] = -a[i]*c[i][j] + b[i]*d[i][j][k]; } }""")
    assert '\n'.join(kept) == expected
def _loop_blocking(self, nodes, state):
    """
    Apply loop blocking to :class:`Iteration` trees.

    Only parallel :class:`Iteration` objects are candidates for blocking.
    Unless the ``blockinner`` parameter is set, vectorizable ones are left
    unblocked, heuristically, to maximize the trip count of the SIMD loops.
    A tree is blocked only if at least two candidates are found, the
    outermost candidate heads a perfect nest and -- unless the
    ``blockalways`` parameter is set -- the first Iteration of the tree is
    sequential.

    Each candidate is turned into an inter-block Iteration, stepping by the
    block size, plus an intra-block Iteration. Additional remainder nests
    cover the region left over when a dimension size is not a multiple of
    the block size.

    The ``blockshape`` parameter drives the block sizes:

        * if not provided, each blocked loop is associated with a trivial
          heuristic, namely a callable returning 8 for dimension sizes
          greater than 8 and 1 otherwise;
        * if it supports ``len``, its entries are paired, in order, with
          the blocked loops; a warning is emitted upon length mismatch,
          and unpaired loops are associated with None;
        * otherwise, it is associated with the first blocked loop, while
          all other loops are associated with None.

    :param nodes: The tree to be transformed.
    :param state: Not used by this pass.
    :returns: The 2-tuple ``(processed, {})`` if nothing was blocked;
              otherwise ``(processed, meta)``, where ``meta`` carries the
              list of :class:`BlockingArg` under ``'arguments'`` and the
              string ``'blocking'`` under ``'flags'``.
    """
    skip_innermost = not self.params.get('blockinner', False)
    always_block = self.params.get('blockalways', False)

    # Fold first, so that blocking spans as many Iterations as possible
    fold = fold_blockable_tree(nodes, skip_innermost)

    mapper = {}
    blocked = OrderedDict()
    for tree in retrieve_iteration_tree(fold):
        # Collect the Iterations that may be blocked
        iterations = [it for it in tree if it.is_Parallel]
        if skip_innermost:
            iterations = [it for it in iterations if not it.is_Vectorizable]
        if len(iterations) < 2:
            continue
        root = iterations[0]
        if not IsPerfectIteration().visit(root):
            # Illegal/unsupported
            continue
        if not (tree[0].is_Sequential or always_block):
            # Heuristic: blocked nests increase JIT compilation time and
            # hurt readability, so do not generate them unless the tree is
            # embedded in a sequential loop (e.g., a timestepping loop)
            continue

        # ``mapper`` only grows once the whole tree has been processed, so
        # the same counter serves both the tag and the dimension names
        nblocked = len(mapper)

        # IterationProperty decorating the intra-block Iterations
        tag = tagger(nblocked)

        # Build the individual Iterations; they get composed further below
        inter_blocks, intra_blocks, remainders = [], [], []
        for it in iterations:
            # Iteration over blocks
            dim = blocked.setdefault(
                it, Dimension(name="%s%d_block" % (it.dim.name, nblocked)))
            step = dim.symbolic_size
            first = it.limits[0]
            span = it.dim.symbolic_extent + (it.offsets[1] - it.offsets[0])
            last = it.dim.symbolic_end - (span % step) - 1
            inter_blocks.append(Iteration([], dim, [first, last, step],
                                          offsets=it.offsets,
                                          properties=PARALLEL))

            # Iteration within a block
            intra_blocks.append(
                it._rebuild([], limits=(dim, dim + step - 1, 1),
                            offsets=(0, 0),
                            properties=it.properties + (tag, ELEMENTAL)))

            # Unitary-increment Iteration over the 'leftover' region; used
            # by the remainder loops, which run when a dimension size is
            # not a multiple of the block size
            remainders.append(
                it._rebuild([], limits=[last + 1, it.dim.symbolic_end, 1],
                            offsets=(it.offsets[1], it.offsets[1])))

        body = iterations[-1].nodes

        # The blocked Iteration nest
        blocked_tree = compose_nodes(inter_blocks + intra_blocks + [body])

        # The remainder nests, one for each non-empty subset of dimensions
        dims = [it.dim for it in iterations]
        remainder_props = (REMAINDER, tag, ELEMENTAL)
        remainder_trees = []
        for size in range(1, len(iterations) + 1):
            for chosen in combinations(dims, size):
                # First all inter-block Iterations
                loops = [outer._rebuild(
                             properties=outer.properties + (REMAINDER, ))
                         for outer, rem in zip(inter_blocks, remainders)
                         if rem.dim not in chosen]
                # Then intra-block or remainder, for each dim (in order)
                for inner, rem in zip(intra_blocks, remainders):
                    picked = rem if inner.dim in chosen else inner
                    loops.append(picked._rebuild(properties=remainder_props))
                loops.append(body)
                remainder_trees.append(compose_nodes(loops))

        # The root gets replaced by the blocked nest and its remainders
        mapper[root] = List(body=[blocked_tree] + remainder_trees)

    rebuilt = Transformer(mapper).visit(fold)

    # Finish unrolling any previously folded Iterations
    processed = unfold_blocked_tree(rebuilt)

    # Nothing else to do if no loop was blocked
    if not blocked:
        return processed, {}

    # Determine the block shape
    blockshape = self.params.get('blockshape')
    if blockshape:
        try:
            nitems, nrequired = len(blockshape), len(blocked)
            blockshape = dict(zip(blocked, blockshape))
            if nitems > nrequired:
                dle_warning("Provided 'blockshape' has more entries than "
                            "blocked loops; dropping entries ...")
            if nitems < nrequired:
                dle_warning("Provided 'blockshape' has fewer entries than "
                            "blocked loops; dropping dimensions ...")
        except TypeError:
            blockshape = {next(iter(blocked)): blockshape}
        # Loops left without an entry are associated with None
        for loop in blocked:
            blockshape.setdefault(loop, None)
    else:
        # Use trivial heuristic for a suitable blockshape
        def heuristic(dim_size):
            threshold = 8  # FIXME: This really needs to be improved
            if dim_size > threshold:
                return threshold
            return 1
        blockshape = dict.fromkeys(blocked, heuristic)

    # Track any additional arguments required to execute /state.nodes/
    arguments = [BlockingArg(dim, loop, blockshape[loop])
                 for loop, dim in blocked.items()]

    return processed, {'arguments': arguments, 'flags': 'blocking'}
def _loop_blocking(self, nodes, state):
    """
    Apply loop blocking to PARALLEL Iteration trees.

    Vectorizable Iterations are left unblocked unless the ``blockinner``
    parameter is set. A tree is blocked only if at least two candidate
    Iterations are found, the outermost candidate heads a perfect nest and
    -- unless the ``blockalways`` parameter is set -- the root of the tree
    is sequential.

    :param nodes: The tree to be transformed.
    :param state: Not used by this pass.
    :returns: The 2-tuple ``(processed, meta)``, where ``meta`` carries,
              under ``'dimensions'``, the list of block dimensions created
              by this pass (empty if nothing was blocked).
    """
    skip_innermost = not self.params.get('blockinner', False)
    always_block = self.params.get('blockalways', False)

    # Fold first, so that blocking spans as many Iterations as possible
    fold = fold_blockable_tree(nodes, skip_innermost)

    mapper = {}
    blocked = OrderedDict()
    for tree in retrieve_iteration_tree(fold):
        # Collect the Iterations that may be blocked
        candidates = [it for it in tree if it.is_Parallel]
        if skip_innermost:
            candidates = [it for it in candidates if not it.is_Vectorizable]
        if len(candidates) < 2:
            continue
        root = candidates[0]
        if not IsPerfectIteration().visit(root):
            # Illegal/unsupported
            continue
        if not (tree.root.is_Sequential or always_block):
            # Heuristic: blocked nests increase JIT compilation time and
            # hurt readability, so do not generate them unless the tree is
            # embedded in a sequential loop (e.g., a timestepping loop)
            continue

        # ``mapper`` only grows once the whole tree has been processed, so
        # the same counter serves both the tag and the dimension names
        nblocked = len(mapper)

        # IterationProperty decorating the intra-block Iterations
        tag = tagger(nblocked)

        # Build the individual Iterations; they get composed further below
        inter_blocks, intra_blocks, remainders = [], [], []
        for it in candidates:
            # Iteration over blocks
            name = "%s%d_block" % (it.dim.name, nblocked)
            dim = blocked.setdefault(it, BlockDimension(it.dim, name=name))
            span = it.symbolic_size + (it.offsets[1] - it.offsets[0])
            top = it.dim.symbolic_max - (span % dim.step)
            inter_blocks.append(Iteration([], dim, top, offsets=it.offsets,
                                          properties=PARALLEL))

            # Iteration within a block
            intra_blocks.append(
                it._rebuild([], limits=(dim, dim + dim.step - 1, 1),
                            offsets=(0, 0),
                            properties=it.properties + (tag, ELEMENTAL)))

            # Unitary-increment Iteration over the 'leftover' region; used
            # by the remainder loops, which run when a dimension size is
            # not a multiple of the block size
            remainders.append(
                it._rebuild([], limits=[top + 1, it.dim.symbolic_max, 1],
                            offsets=(it.offsets[1], it.offsets[1])))

        body = candidates[-1].nodes

        # The blocked Iteration nest
        blocked_tree = compose_nodes(inter_blocks + intra_blocks + [body])

        # The remainder nests, one for each non-empty subset of dimensions
        dims = [it.dim for it in candidates]
        remainder_props = (REMAINDER, tag, ELEMENTAL)
        remainder_trees = []
        for size in range(1, len(candidates) + 1):
            for chosen in combinations(dims, size):
                # First all inter-block Iterations
                loops = [outer._rebuild(
                             properties=outer.properties + (REMAINDER,))
                         for outer, rem in zip(inter_blocks, remainders)
                         if rem.dim not in chosen]
                # Then intra-block or remainder, for each dim (in order)
                for inner, rem in zip(intra_blocks, remainders):
                    picked = rem if inner.dim in chosen else inner
                    loops.append(picked._rebuild(properties=remainder_props))
                loops.append(body)
                remainder_trees.append(compose_nodes(loops))

        # The root gets replaced by the blocked nest and its remainders
        mapper[root] = List(body=[blocked_tree] + remainder_trees)

    rebuilt = Transformer(mapper).visit(fold)

    # Finish unrolling any previously folded Iterations
    processed = unfold_blocked_tree(rebuilt)

    return processed, {'dimensions': list(blocked.values())}
def test_create_efuncs_complex(complex_function):
    """
    Tag the last Iteration of each tree in ``complex_function`` as
    ELEMENTAL, each with its own tag, apply ``transform`` in 'split' mode,
    and compare the generated C code against the expected listing.
    """
    innermost = [tree[-1] for tree in retrieve_iteration_tree(complex_function)]

    # First attach a distinct tag to every selected Iteration ...
    retagged = []
    for position, loop in enumerate(innermost):
        retagged.append(loop._rebuild(properties=tagger(position)))

    # ... then mark each of them as ELEMENTAL too
    mapper = {}
    for original, tagged in zip(innermost, retagged):
        mapper[original] = tagged._rebuild(
            properties=(tagged.properties + (ELEMENTAL, )))

    function = Transformer(mapper).visit(complex_function)
    handle = transform(function, mode='split')
    block = List(body=[handle.nodes] + handle.efuncs)
    generated = str(block.ccode)

    # Drop pragmas and comment lines, to be compiler independent
    skipped = ('#pragma', '/*')
    kept = [line for line in generated.split('\n')
            if not any(marker in line for marker in skipped)]

    # NOTE(review): the comparison is against newline-joined output, so the
    # expected literal needs matching line breaks -- confirm they are intact
    expected = (
        """void foo(float *restrict a_vec, float *restrict b_vec,"""
        """ float *restrict c_vec, float *restrict d_vec) { for (int i = 0; i <= 3; i += 1) { f_0((float *)a,(float *)b,i_size,i,4,0); for (int j = 0; j <= 5; j += 1) { f_1((float *)a,(float *)b,(float *)c,(float *)d,i_size,j_size,k_size,i,j,7,0); } f_2((float *)a,(float *)b,i_size,i,4,0); } } void f_0(float *restrict a_vec, float *restrict b_vec,"""
        """ const int i_size, const int i, const int sf_M, const int sf_m) { float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec; float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec; for (int s = sf_m; s <= sf_M; s += 1) { b[i] = a[i] + pow(b[i], 2) + 3; } } void f_1(float *restrict a_vec, float *restrict b_vec,"""
        """ float *restrict c_vec, float *restrict d_vec,"""
        """ const int i_size, const int j_size, const int k_size,"""
        """ const int i, const int j, const int kf_M, const int kf_m) { float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec; float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec; float (*restrict c)[j_size] __attribute__ ((aligned (64))) = (float (*)[j_size]) c_vec; float (*restrict d)[j_size][k_size] __attribute__ ((aligned (64))) ="""
        """ (float (*)[j_size][k_size]) d_vec; for (int k = kf_m; k <= kf_M; k += 1) { a[i] = a[i]*b[i]*c[i][j]*d[i][j][k]; a[i] = 4*(a[i] + 
c[i][j])*(b[i] + d[i][j][k]); } } void f_2(float *restrict a_vec, float *restrict b_vec,"""
        """ const int i_size, const int i, const int qf_M, const int qf_m) { float (*restrict a) __attribute__ ((aligned (64))) = (float (*)) a_vec; float (*restrict b) __attribute__ ((aligned (64))) = (float (*)) b_vec; for (int q = qf_m; q <= qf_M; q += 1) { a[i] = 8.0F*a[i] + 6.0F/b[i]; } }""")
    assert '\n'.join(kept) == expected