def base_index_and_length(self, set, iname, context=None):
    """Find a base index and a size for one dimension of *set*.

    :arg set: the set to examine. When *iname* is not an integer,
        ``set.space.get_var_dict()`` is used to resolve it.
    :arg iname: an integer dimension index, or a key into the variable
        dictionary of *set*'s space.
    :arg context: passed through to the static bound finders.
    :returns: a tuple ``(base_index, size)``; both entries are results of
        ``pw_aff_to_expr``.
    """
    if isinstance(iname, int):
        idx = iname
    else:
        idx = set.space.get_var_dict()[iname][1]

    lower = self.dim_min(set, idx)
    upper = self.dim_max(set, idx)

    from loopy.diagnostic import StaticValueFindingError
    from loopy.isl_helpers import (
        static_max_of_pw_aff,
        static_min_of_pw_aff,
        static_value_of_pw_aff,
    )
    from loopy.symbolic import pw_aff_to_expr

    finder_kwargs = {"constants_only": False, "context": context}

    # First choice: a single static value for the lower bound.
    try:
        start_aff = static_value_of_pw_aff(lower, **finder_kwargs)
    except StaticValueFindingError:
        start_aff = None

    # If that didn't work, settle for a static lower bound instead.
    if start_aff is None:
        start_aff = static_min_of_pw_aff(lower, **finder_kwargs)

    start = pw_aff_to_expr(start_aff)
    extent_aff = static_max_of_pw_aff(upper - start_aff + 1, **finder_kwargs)
    return start, pw_aff_to_expr(extent_aff)
def base_index_and_length(self, set, iname, context=None):
    """Return ``(base_index, size)`` expressions for a dimension of *set*.

    The base index is the static value of the dimension's minimum when one
    can be found, and a static minimum of it otherwise. The size is a static
    maximum of ``upper bound - base index + 1``.

    :arg set: the set whose dimension is inspected.
    :arg iname: an integer dimension index, or a name looked up through
        ``set.space.get_var_dict()``.
    :arg context: forwarded to the static bound helpers.
    """
    if not isinstance(iname, int):
        idx = set.space.get_var_dict()[iname][1]
    else:
        idx = iname

    lo_pw_aff = self.dim_min(set, idx)
    hi_pw_aff = self.dim_max(set, idx)

    from loopy.diagnostic import StaticValueFindingError
    from loopy.isl_helpers import (
        static_max_of_pw_aff,
        static_min_of_pw_aff,
        static_value_of_pw_aff,
    )
    from loopy.symbolic import pw_aff_to_expr

    def _pair_for(base_aff):
        # Convert the base first, then derive the size relative to it.
        base_expr = pw_aff_to_expr(base_aff)
        size_aff = static_max_of_pw_aff(
            hi_pw_aff - base_aff + 1,
            constants_only=False, context=context)
        return base_expr, pw_aff_to_expr(size_aff)

    # {{{ first: try to find static lower bound value

    try:
        exact_aff = static_value_of_pw_aff(
            lo_pw_aff, constants_only=False, context=context)
    except StaticValueFindingError:
        exact_aff = None

    if exact_aff is not None:
        return _pair_for(exact_aff)

    # }}}

    # {{{ if that didn't work, try finding a lower bound

    return _pair_for(
        static_min_of_pw_aff(
            lo_pw_aff, constants_only=False, context=context))

    # }}}
def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
                             hw_inames_left=None):
    """Bind hardware-parallel inames to hardware axis indices, recursively.

    Each invocation takes one iname off *hw_inames_left*, assigns it the
    matching group/local hardware axis index (offset by a static lower
    bound), and recurses for the remaining inames. Once none are left,
    ``next_func(codegen_state)`` produces the result.

    :arg codegen_state: code generation state; its ``kernel`` attribute is
        the kernel being processed.
    :arg schedule_index: index into ``kernel.schedule`` identifying the block
        whose instructions are considered.
    :arg next_func: callable invoked with the code generation state when no
        hardware inames remain.
    :arg hw_inames_left: inames still to be processed. If *None*, it is
        computed from the inames of the block's instructions that carry a
        ``HardwareConcurrentTag`` but no ``VectorizeTag``.
    :returns: the value of ``next_func`` or of ``merge_codegen_results``.
    :raises RuntimeError: for an unexpected hardware tag type, or when more
        than one slab is requested for an iname that shares its tag with
        other inames.
    """
    knl = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
                                   LocalIndexTag, GroupIndexTag, VectorizeTag)
    from loopy.schedule import get_insn_ids_for_block_at

    block_insn_ids = get_insn_ids_for_block_at(knl.schedule, schedule_index)

    if hw_inames_left is None:
        inames_in_block = set()
        for insn_id in block_insn_ids:
            inames_in_block |= knl.insn_inames(insn_id)

        hw_inames_left = []
        for candidate in inames_in_block:
            if not knl.iname_tags_of_type(candidate, HardwareConcurrentTag):
                continue
            if knl.iname_tags_of_type(candidate, VectorizeTag):
                continue
            hw_inames_left.append(candidate)

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = knl.get_grid_sizes_for_insn_ids(block_insn_ids)

    # Work on a copy so the caller's list is left untouched.
    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    tag, = knl.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    same_tag_inames = []
    for other in knl.all_inames():
        if not knl.iname_tags_of_type(other, UniqueTag):
            continue
        if other == iname:
            continue
        if any(t.key == tag.key for t in knl.iname_tags(other) if t):
            same_tag_inames.append(other)

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    bounds = knl.get_iname_bounds(iname)
    domain = knl.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(
        bounds.lower_bound_pw_aff, constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    hw_slab = make_slab(
        domain.get_space(), iname, lower_bound, lower_bound + hw_axis_size)
    codegen_state = codegen_state.intersect(hw_slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(knl, iname)

    if same_tag_inames and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                           "a tag with other inames")

    pieces = []
    for slab_name, slab in slabs:
        if len(slabs) > 1:
            comment = codegen_state.ast_builder.emit_comment(
                "%s slab for '%s'" % (slab_name, iname))
            pieces.append(comment)

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(knl, slab, iname)
        assigned_state = codegen_state.copy_and_assign(iname, hw_axis_expr)
        inner_state = assigned_state.copy(kernel=slabbed_kernel)

        pieces.append(
            set_up_hw_parallel_loops(
                inner_state, schedule_index, next_func, hw_inames_left))

    return merge_codegen_results(codegen_state, pieces)
def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
    """Generate code for the sequential loop entered at *sched_index*.

    For every slab in the slab decomposition of the loop iname, static lower
    and upper bounds are determined, the inner loop nest is built, and either
    a constant initializer (when the static bounds coincide, i.e. a
    single-trip loop) or a sequential loop from ``kernel.target`` is emitted.

    :arg kernel: the kernel; ``kernel.schedule[sched_index].iname`` names the
        loop to generate.
    :arg sched_index: index of the loop's entry in ``kernel.schedule``.
    :arg codegen_state: code generation state providing
        ``expression_to_code_mapper`` and ``intersect``.
    :returns: the result of ``gen_code_block`` applied to the per-slab code.
    """
    from cgen import Comment, Initializer, POD, Const, Line
    from loopy.codegen.bounds import get_usable_inames_for_conditional
    from loopy.isl_helpers import (
        make_slab_from_bound_pwaffs,
        static_max_of_pw_aff,
        static_min_of_pw_aff,
    )
    from loopy.symbolic import aff_to_expr

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(
        kernel, loop_iname, sched_index, codegen_state)

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(
            domain, slab, across_dim_types=True, obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
            dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for usable_iname in dom_and_slab.get_var_names(dim_type.set):
            if usable_iname in usable_inames:
                moved_inames.append(usable_iname)
                dt, idx = dom_and_slab.get_var_dict()[usable_iname]
                dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param),
                    dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        lbound = (
            kernel.cache_manager.dim_min(dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .coalesce())
        ubound = (
            kernel.cache_manager.dim_max(dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .coalesce())

        static_lbound = static_min_of_pw_aff(lbound, constants_only=False)
        static_ubound = static_max_of_pw_aff(ubound, constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(
            dom_and_slab.space, loop_iname, static_lbound, static_ubound)

        # Undo the move to parameters so impl_slab has these as set dims again.
        for moved_iname in moved_inames:
            dt, idx = impl_slab.get_var_dict()[moved_iname]
            impl_slab = impl_slab.move_dims(
                dim_type.set, impl_slab.dim(dim_type.set),
                dt, idx, 1)

        new_codegen_state = codegen_state.intersect(impl_slab)

        # The slab belongs to loop_iname's decomposition, so the kernel must
        # be intersected with it via loop_iname -- not via whatever name a
        # preceding loop variable happened to hold last.
        inner = build_loop_nest(
            intersect_kernel_with_slab(kernel, slab, loop_iname),
            sched_index + 1, new_codegen_state)

        # }}}

        if cmt is not None:
            result.append(Comment(cmt))

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(gen_code_block([
                Initializer(
                    Const(POD(kernel.index_dtype, loop_iname)),
                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
                Line(),
                inner,
            ]))
        else:
            result.append(
                kernel.target.emit_sequential_loop(
                    codegen_state, loop_iname, kernel.index_dtype,
                    static_lbound, static_ubound, inner))

    return gen_code_block(result)
def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
                             hw_inames_left=None):
    """Assign hardware axis indices to hardware-parallel inames, recursively.

    One iname is taken off *hw_inames_left* per invocation and assigned the
    group/local hardware axis index matching its tag, offset by a static
    lower bound. When no inames remain, the loop nest is built with
    ``build_loop_nest``.

    :arg kernel: the kernel being processed.
    :arg sched_index: schedule index passed on to ``build_loop_nest`` and
        ``get_slab_decomposition``.
    :arg codegen_state: current code generation state.
    :arg hw_inames_left: inames still to be handled. If *None*, all inames of
        *kernel* whose entry in ``kernel.iname_to_tag`` is a
        ``HardwareParallelTag`` are used.
    :returns: the result of ``build_loop_nest`` or of ``gen_code_block``.
    :raises RuntimeError: for an unexpected hardware tag type, or when more
        than one slab is requested for an iname that shares its tag with
        other inames.
    """
    from loopy.kernel.data import (
        UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag)

    if hw_inames_left is None:
        hw_inames_left = []
        for candidate in kernel.all_inames():
            if isinstance(kernel.iname_to_tag.get(candidate),
                          HardwareParallelTag):
                hw_inames_left.append(candidate)

    if not hw_inames_left:
        return build_loop_nest(kernel, sched_index, codegen_state)

    global_size, local_size = kernel.get_grid_sizes()

    # Work on a copy so the caller's list is left untouched.
    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    tag = kernel.iname_to_tag.get(iname)

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    assert isinstance(tag, UniqueTag)
    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    same_tag_inames = []
    for other in kernel.all_inames():
        other_tag = kernel.iname_to_tag.get(other)
        if not isinstance(other_tag, UniqueTag):
            continue
        if other_tag.key == tag.key and other != iname:
            same_tag_inames.append(other)

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(
        bounds.lower_bound_pw_aff, constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    hw_slab = make_slab(
        domain.get_space(), iname, lower_bound, lower_bound + hw_axis_size)
    codegen_state = codegen_state.intersect(hw_slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(kernel, iname, sched_index, codegen_state)

    if same_tag_inames and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                           "a tag with other inames")

    pieces = []

    from loopy.codegen import add_comment

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, iname)
        if len(slabs) == 1:
            cmt = None

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        assigned_state = codegen_state.copy_and_assign(iname, hw_axis_expr)

        inner = set_up_hw_parallel_loops(
            slabbed_kernel, sched_index, assigned_state, hw_inames_left)

        pieces.append(add_comment(cmt, inner))

    from loopy.codegen import gen_code_block
    return gen_code_block(pieces)
def base_index_and_length(self, set, iname, context=None,
                          n_allowed_params_in_length=None):
    """Find a base index and a length for one dimension of *set*.

    :arg set: the set to examine.
    :arg iname: an integer dimension index, or a name resolved through
        ``set.space.get_var_dict()``.
    :arg context: forwarded to the static bound helpers.
    :arg n_allowed_params_in_length: Simplifies the 'length' argument so
        that only the first that many params (in the domain of *set*) occur.
    :returns: a tuple ``(base_index, length)``; both entries are results of
        ``pw_aff_to_expr``.
    """
    if isinstance(iname, int):
        idx = iname
    else:
        idx = set.space.get_var_dict()[iname][1]

    lower = self.dim_min(set, idx)
    upper = self.dim_max(set, idx)

    from loopy.diagnostic import StaticValueFindingError
    from loopy.isl_helpers import (
        find_max_of_pwaff_with_params,
        static_max_of_pw_aff,
        static_min_of_pw_aff,
        static_value_of_pw_aff,
    )
    from loopy.symbolic import pw_aff_to_expr

    # {{{ first: try to find static lower bound value

    try:
        start_aff = static_value_of_pw_aff(
            lower, constants_only=False, context=context)
    except StaticValueFindingError:
        start_aff = None

    # }}}

    # {{{ if that didn't work, try finding a lower bound

    if start_aff is None:
        start_aff = static_min_of_pw_aff(
            lower, constants_only=False, context=context)

    # }}}

    start = pw_aff_to_expr(start_aff)

    # Restrict the params appearing in the length before bounding it.
    span = find_max_of_pwaff_with_params(
        upper - start_aff + 1, n_allowed_params_in_length)
    span_aff = static_max_of_pw_aff(
        span, constants_only=False, context=context)

    return start, pw_aff_to_expr(span_aff)
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    """Generate code for the sequential loop entered at *sched_index*.

    For every slab in the slab decomposition of the loop iname, static lower
    and upper bounds are determined, the inner loop nest is built, and either
    a constant initializer (when the static bounds coincide, i.e. a
    single-trip loop) or a sequential loop is emitted through the AST
    builder.

    :arg codegen_state: code generation state; ``codegen_state.kernel`` is
        the kernel and ``kernel.schedule[sched_index].iname`` names the loop.
    :arg sched_index: index of the loop's entry in ``kernel.schedule``.
    :returns: the result of ``merge_codegen_results`` over the per-slab
        results.
    """
    from loopy.codegen.bounds import get_usable_inames_for_conditional
    from loopy.isl_helpers import (
        make_slab_from_bound_pwaffs,
        static_max_of_pw_aff,
        static_min_of_pw_aff,
    )
    from loopy.symbolic import aff_to_expr

    kernel = codegen_state.kernel
    ecm = codegen_state.expression_to_code_mapper
    astb = codegen_state.ast_builder
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(
            domain, slab, across_dim_types=True, obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
            dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for usable_iname in dom_and_slab.get_var_names(dim_type.set):
            if usable_iname in usable_inames:
                moved_inames.append(usable_iname)
                dt, idx = dom_and_slab.get_var_dict()[usable_iname]
                dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param),
                    dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        lbound = (
            kernel.cache_manager.dim_min(dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .coalesce())
        ubound = (
            kernel.cache_manager.dim_max(dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .coalesce())

        static_lbound = static_min_of_pw_aff(lbound, constants_only=False)
        static_ubound = static_max_of_pw_aff(ubound, constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(
            dom_and_slab.space, loop_iname, static_lbound, static_ubound)

        # Undo the move to parameters so impl_slab has these as set dims again.
        for moved_iname in moved_inames:
            dt, idx = impl_slab.get_var_dict()[moved_iname]
            impl_slab = impl_slab.move_dims(
                dim_type.set, impl_slab.dim(dim_type.set),
                dt, idx, 1)

        # The slab belongs to loop_iname's decomposition, so the kernel must
        # be intersected with it via loop_iname -- not via whatever name a
        # preceding loop variable happened to hold last.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, loop_iname)
        new_codegen_state = (
            codegen_state
            .intersect(impl_slab)
            .copy(kernel=slabbed_kernel))

        inner = build_loop_nest(new_codegen_state, sched_index + 1)

        # }}}

        if cmt is not None:
            result.append(astb.emit_comment(cmt))

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(
                merge_codegen_results(codegen_state, [
                    astb.emit_initializer(
                        codegen_state, kernel.index_dtype, loop_iname,
                        ecm(aff_to_expr(static_lbound), PREC_NONE, "i"),
                        is_const=True),
                    astb.emit_blank_line(),
                    inner,
                ]))
        else:
            inner_ast = inner.current_ast(codegen_state)
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(
                        codegen_state, loop_iname, kernel.index_dtype,
                        static_lbound, static_ubound, inner_ast)))

    return merge_codegen_results(codegen_state, result)
def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
                             hw_inames_left=None):
    """Map hardware-concurrent inames onto hardware axis indices.

    Handles one iname per invocation: it is popped off *hw_inames_left*,
    assigned the group/local hardware axis index for its tag (offset by a
    static lower bound), and the function recurses with the remaining
    inames. With nothing left to handle, ``next_func(codegen_state)`` is
    returned.

    :arg codegen_state: code generation state; ``codegen_state.kernel`` is
        the kernel being processed.
    :arg schedule_index: index into ``kernel.schedule`` of the block whose
        instructions are considered.
    :arg next_func: callable taking the code generation state, invoked when
        no hardware inames remain.
    :arg hw_inames_left: inames still to be handled. If *None*, the inames of
        the block's instructions that carry a ``HardwareConcurrentTag``.
    :returns: the value of ``next_func`` or of ``merge_codegen_results``.
    :raises RuntimeError: for an unexpected hardware tag type, or when more
        than one slab is requested for an iname that shares its tag with
        other inames.
    """
    kernel = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
                                   LocalIndexTag, GroupIndexTag)
    from loopy.schedule import get_insn_ids_for_block_at

    insn_ids = get_insn_ids_for_block_at(kernel.schedule, schedule_index)

    if hw_inames_left is None:
        block_inames = set()
        for insn_id in insn_ids:
            block_inames |= kernel.insn_inames(insn_id)

        hw_inames_left = [
            name for name in block_inames
            if kernel.iname_tags_of_type(name, HardwareConcurrentTag)]

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(insn_ids)

    # Copy before popping so the caller's list is not modified.
    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    def _has_same_tag_key(other_iname):
        # Falsy tags are skipped before their key is compared.
        for other_tag in kernel.iname_tags(other_iname):
            if other_tag and other_tag.key == tag.key:
                return True
        return False

    other_inames_with_same_tag = [
        other_iname
        for other_iname in kernel.all_inames()
        if kernel.iname_tags_of_type(other_iname, UniqueTag)
        and other_iname != iname
        and _has_same_tag_key(other_iname)]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(
        bounds.lower_bound_pw_aff, constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    axis_slab = make_slab(
        domain.get_space(), iname, lower_bound, lower_bound + hw_axis_size)
    codegen_state = codegen_state.intersect(axis_slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                           "a tag with other inames")

    collected = []

    for slab_name, slab in slabs:
        if len(slabs) > 1:
            collected.append(
                codegen_state.ast_builder.emit_comment(
                    "%s slab for '%s'" % (slab_name, iname)))

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        state_for_slab = codegen_state.copy_and_assign(
            iname, hw_axis_expr).copy(kernel=slabbed_kernel)

        collected.append(
            set_up_hw_parallel_loops(
                state_for_slab, schedule_index, next_func, hw_inames_left))

    return merge_codegen_results(codegen_state, collected)