def base_index_and_length(self, set, iname, context=None):
    """Return ``(base_index, size)`` expressions for one dimension of *set*.

    :arg set: the set whose dimension is examined; its space is used to
        resolve *iname* when a name is given.
    :arg iname: a dimension name (resolved through
        ``set.space.get_var_dict()``) or an integer dimension index.
    :arg context: passed through unchanged to the static bound finders.
    :returns: a tuple ``(base_index, size)``, both converted with
        :func:`loopy.symbolic.pw_aff_to_expr`. *size* is the static
        maximum of ``upper bound - base index + 1``.
    """
    if isinstance(iname, int):
        idx = iname
    else:
        idx = set.space.get_var_dict()[iname][1]

    lower_bound_pw_aff = self.dim_min(set, idx)
    upper_bound_pw_aff = self.dim_max(set, idx)

    from loopy.diagnostic import StaticValueFindingError
    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_min_of_pw_aff,
            static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    # {{{ first choice: a static value for the lower bound

    try:
        base_aff = static_value_of_pw_aff(
                lower_bound_pw_aff, constants_only=False, context=context)
    except StaticValueFindingError:
        base_aff = None

    # }}}

    # {{{ otherwise settle for a static minimum of the lower bound

    if base_aff is None:
        base_aff = static_min_of_pw_aff(
                lower_bound_pw_aff, constants_only=False, context=context)

    # }}}

    base_index = pw_aff_to_expr(base_aff)
    size_aff = static_max_of_pw_aff(
            upper_bound_pw_aff - base_aff + 1,
            constants_only=False, context=context)

    return base_index, pw_aff_to_expr(size_aff)
def base_index_and_length(self, set, iname, context=None):
    """Compute a base index and a length for a single dimension of *set*.

    :arg set: the set to inspect.
    :arg iname: an integer dimension index, or a name that is looked up in
        ``set.space.get_var_dict()``.
    :arg context: handed on to the static value/min/max helpers.
    :returns: ``(base_index, size)`` as expressions produced by
        :func:`loopy.symbolic.pw_aff_to_expr`.
    """
    idx = iname
    if not isinstance(iname, int):
        idx = set.space.get_var_dict()[iname][1]

    lower_bound_pw_aff = self.dim_min(set, idx)
    upper_bound_pw_aff = self.dim_max(set, idx)

    from loopy.diagnostic import StaticValueFindingError
    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_min_of_pw_aff,
            static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    bound_kwargs = dict(constants_only=False, context=context)

    # Try for an exact static lower bound first; a failure to find one is
    # not an error, it merely selects the fallback below.
    try:
        base_index_aff = static_value_of_pw_aff(
                lower_bound_pw_aff, **bound_kwargs)
    except StaticValueFindingError:
        base_index_aff = None

    if base_index_aff is None:
        # No static value available: use a static minimum instead.
        base_index_aff = static_min_of_pw_aff(
                lower_bound_pw_aff, **bound_kwargs)

    base_index = pw_aff_to_expr(base_index_aff)
    size = pw_aff_to_expr(static_max_of_pw_aff(
        upper_bound_pw_aff - base_index_aff + 1, **bound_kwargs))

    return base_index, size
def generate_unroll_loop(kernel, sched_index, codegen_state):
    """Generate code for the loop scheduled at *sched_index* by unrolling it.

    The loop's iname is fixed to ``lower bound + i`` for each ``i`` in
    ``range(length)`` and the inner loop nest is built once per value.

    :arg kernel: the kernel whose schedule and iname bounds are consulted.
    :arg sched_index: index into ``kernel.schedule`` of the loop to unroll.
    :arg codegen_state: state object; ``fix`` is called on it per iteration.
    :returns: the result of ``gen_code_block`` on the per-iteration code.
    :raises LoopyError: if the static maximum of the loop length is not a
        constant.
    """
    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname must actually be interpolated into the message.
        raise LoopyError(
                "length of unrolled loop '%s' is not a constant, "
                "cannot unroll" % iname)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same type, keeping the original message so the
        # underlying cause is not lost.
        raise type(e)("while finding lower bound of '%s': %s"
                % (iname, str(e)))

    result = []

    for i in range(length):
        idx_aff = lower_bound_aff + i
        new_codegen_state = codegen_state.fix(iname, idx_aff)
        result.append(
                build_loop_nest(kernel, sched_index+1, new_codegen_state))

    return gen_code_block(result)
def generate_unroll_loop(codegen_state, sched_index):
    """Generate code for the loop scheduled at *sched_index* by unrolling it.

    The loop's iname is fixed to ``lower bound + i`` for each ``i`` in
    ``range(length)`` and the inner loop nest is built once per value.

    :arg codegen_state: state object providing ``kernel`` and ``fix``.
    :arg sched_index: index into ``kernel.schedule`` of the loop to unroll.
    :returns: the result of ``merge_codegen_results`` on the per-iteration
        results.
    :raises LoopyError: if the static maximum of the loop length is not a
        constant.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname must actually be interpolated into the message.
        raise LoopyError(
                "length of unrolled loop '%s' is not a constant, "
                "cannot unroll" % iname)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same type, keeping the original message so the
        # underlying cause is not lost.
        raise type(e)("while finding lower bound of '%s': %s"
                % (iname, str(e)))

    result = []

    for i in range(length):
        idx_aff = lower_bound_aff + i
        new_codegen_state = codegen_state.fix(iname, idx_aff)
        result.append(build_loop_nest(new_codegen_state, sched_index + 1))

    return merge_codegen_results(codegen_state, result)
def _get_int_iname_size(iname):
    """Return the constant static maximum size of *iname* as an integer.

    ``kernel`` is taken from the enclosing scope, which is not part of this
    block. The result is asserted to be of an integer type.
    """
    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    max_size_aff = static_max_of_pw_aff(
            kernel.get_iname_bounds(iname).size,
            constants_only=True)
    size = pw_aff_to_expr(max_size_aff)

    assert isinstance(size, six.integer_types)
    return size
def _get_int_iname_size(iname):
    """Look up the size of *iname* and return it as an integer.

    Uses the ``kernel`` visible in the surrounding scope (outside this
    block). The size is the static maximum, restricted to constants, of the
    iname's bounds; it is asserted to be an integer.
    """
    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    iname_bounds = kernel.get_iname_bounds(iname)
    size = pw_aff_to_expr(
            static_max_of_pw_aff(iname_bounds.size, constants_only=True))

    assert isinstance(size, six.integer_types)
    return size
def get_arg_descriptor_for_expression(kernel, expr):
    """
    :returns: a :class:`ArrayArgDescriptor` or a :class:`ValueArgDescriptor`
        describing the argument expression *expr* which occurs
        in a call in the code of *kernel*.
    :raises LoopyError: if *expr* is a sub-array reference to a variable that
        is neither a temporary nor an array argument.
    """
    from loopy.symbolic import (SubArrayRef, pw_aff_to_expr,
            SweptInameStrideCollector)
    from loopy.kernel.data import TemporaryVariable, ArrayArg

    if not isinstance(expr, SubArrayRef):
        ExpressionIsScalarChecker(kernel)(expr)
        return ValueArgDescriptor()

    name = expr.subscript.aggregate.name
    arg = kernel.get_var_descriptor(name)

    if not isinstance(arg, (TemporaryVariable, ArrayArg)):
        # Report the variable name resolved above; the sub-array reference
        # itself is not what carries the name.
        raise LoopyError("unsupported argument type "
                "'%s' of '%s' in call statement"
                % (type(arg).__name__, name))

    aspace = arg.address_space

    from loopy.kernel.array import FixedStrideArrayDimTag as DimTag

    # This helps in identifying identities like
    # "2*(i//2) + i%2" := "i"
    # See the kernel in
    # test_callables.py::test_shape_translation_through_sub_array_refs

    from loopy.symbolic import simplify_using_aff
    linearized_index = simplify_using_aff(
            kernel,
            sum(dim_tag.stride * iname
                for dim_tag, iname in zip(
                    arg.dim_tags, expr.subscript.index_tuple)))

    strides_as_dict = SweptInameStrideCollector(
            tuple(iname.name for iname in expr.swept_inames)
            )(linearized_index)

    sub_dim_tags = tuple(
            # Not all swept inames necessarily occur in the expression.
            DimTag(strides_as_dict.get(iname, 0))
            for iname in expr.swept_inames)

    def extent_of(iname):
        # upper - lower + 1, with the bounds looked up only once per iname
        bounds = kernel.get_iname_bounds(iname.name)
        return pw_aff_to_expr(
                bounds.upper_bound_pw_aff - bounds.lower_bound_pw_aff) + 1

    sub_shape = tuple(extent_of(iname) for iname in expr.swept_inames)

    return ArrayArgDescriptor(address_space=aspace,
            dim_tags=sub_dim_tags, shape=sub_shape)
def generate_vectorize_loop(codegen_state, sched_index):
    """Generate code for the loop at *sched_index* as a vectorized loop.

    Falls back to :func:`generate_unroll_loop` (after emitting a warning)
    when the static maximum of the loop length is not a constant or when
    the static lower bound is not plainly zero.

    :arg codegen_state: state object providing ``kernel``, ``intersect``
        and ``copy``.
    :arg sched_index: index into ``kernel.schedule`` of the loop.
    :returns: the result of building the inner loop nest with vectorization
        info attached, or the result of the unrolling fallback.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname must actually be interpolated into the warning text.
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same type, keeping the original message so the
        # underlying cause is not lost.
        raise type(e)("while finding lower bound of '%s': %s"
                % (iname, str(e)))

    if not lower_bound_aff.plain_is_zero():
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
def generate_vectorize_loop(codegen_state, sched_index):
    """Generate code for the loop at *sched_index* as a vectorized loop.

    Falls back to :func:`generate_unroll_loop` (after emitting a warning)
    when the static maximum of the loop length is not a constant or when
    the static lower bound is not plainly zero.

    :arg codegen_state: state object providing ``kernel``, ``intersect``
        and ``copy``.
    :arg sched_index: index into ``kernel.schedule`` of the loop.
    :returns: the result of building the inner loop nest with vectorization
        info attached, or the result of the unrolling fallback.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    # NOTE(review): the fallback calls below pass
    # (kernel, sched_index, codegen_state) -- confirm this matches the
    # generate_unroll_loop signature in use.

    if not length_aff.is_cst():
        # The iname must actually be interpolated into the warning text.
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(kernel, sched_index, codegen_state)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same type, keeping the original message so the
        # underlying cause is not lost.
        raise type(e)("while finding lower bound of '%s': %s"
                % (iname, str(e)))

    if not lower_bound_aff.plain_is_zero():
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(kernel, sched_index, codegen_state)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
def base_index_and_length(self, set, iname, context=None):
    """Return ``(base_index, size)`` expressions for one dimension of *set*.

    :arg set: the set to inspect.
    :arg iname: an integer dimension index, or a name resolved through
        ``set.space.get_var_dict()``.
    :arg context: passed on to the static value/max helpers.
    :returns: ``(base_index, size)``; *size* is the static maximum of
        ``upper bound - lower bound + 1`` and *base_index* the static value
        of the lower bound, both converted to expressions.
    :raises: an exception of the same type as any raised while finding the
        lower bound, with the iname prepended to the message.
    """
    if isinstance(iname, int):
        idx = iname
    else:
        idx = set.space.get_var_dict()[iname][1]

    lower_bound_pw_aff = self.dim_min(set, idx)
    upper_bound_pw_aff = self.dim_max(set, idx)

    from loopy.isl_helpers import static_max_of_pw_aff, static_value_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    extent_aff = static_max_of_pw_aff(
            upper_bound_pw_aff - lower_bound_pw_aff + 1,
            constants_only=False, context=context)
    size = pw_aff_to_expr(extent_aff)

    try:
        base_aff = static_value_of_pw_aff(
                lower_bound_pw_aff, constants_only=False, context=context)
        base_index = pw_aff_to_expr(base_aff)
    except Exception as e:
        raise type(e)("while finding lower bound of '%s': %s" % (iname, str(e)))

    return base_index, size
def make_new_loop_index(inner, outer):
    """Build the new loop index expression from *inner* and *outer*.

    ``chunk_ceil``, ``chunk_floor`` and ``chunk_mod`` come from the
    enclosing scope, which is not part of this block.

    The result is::

        inner + ceil * Min((outer, mod)) + floor * (outer - Min((outer, mod)))
    """
    # An equivalent, shorter formulation is
    #     inner + floor * outer + diff * Min((outer, mod))
    # Benchmarking between the two was inconclusive, and the shorter one
    # triggers isl issues in the check pass, so only this form is used.
    capped_outer = Min((outer, pw_aff_to_expr(chunk_mod)))

    return (inner
            + pw_aff_to_expr(chunk_ceil) * capped_outer
            + pw_aff_to_expr(chunk_floor) * (outer - capped_outer))
def fix(self, iname, aff):
    """Return a copy of this state with *iname* pinned to the value *aff*.

    The implemented domain gains an equality constraint relating *iname* to
    *aff*; if *iname* is not yet a variable of that domain, a new set
    dimension of that name is appended first. The expression form of *aff*
    is assigned to *iname* via ``copy_and_assign``.
    """
    set_dt = isl.dim_type.set

    new_impl_domain = self.implemented_domain
    impl_space = self.implemented_domain.get_space()

    if iname not in impl_space.get_var_dict():
        # The appended dimension lands at the current set-dimension count.
        appended_idx = new_impl_domain.dim(set_dt)
        new_impl_domain = new_impl_domain.add_dims(set_dt, 1)
        new_impl_domain = new_impl_domain.set_dim_name(
                set_dt, appended_idx, iname)
        impl_space = new_impl_domain.get_space()

    from loopy.isl_helpers import iname_rel_aff
    equality_aff = iname_rel_aff(impl_space, iname, "==", aff)

    from loopy.symbolic import pw_aff_to_expr
    cns = isl.Constraint.equality_from_aff(equality_aff)
    expr = pw_aff_to_expr(aff)

    new_impl_domain = new_impl_domain.add_constraint(cns)

    assigned = self.copy_and_assign(iname, expr)
    return assigned.copy(implemented_domain=new_impl_domain)
def fix(self, iname, aff):
    """Pin *iname* to *aff* and return the resulting copy of this state.

    An equality constraint between *iname* and *aff* is added to the
    implemented domain (after appending a set dimension named *iname* when
    the domain does not have one), and *iname* is assigned the expression
    form of *aff*.
    """
    from loopy.isl_helpers import iname_rel_aff
    from loopy.symbolic import pw_aff_to_expr

    domain = self.implemented_domain
    space = self.implemented_domain.get_space()

    if iname not in space.get_var_dict():
        # Number of set dims before growing == index of the new dim.
        n_set_dims = domain.dim(isl.dim_type.set)
        domain = domain.add_dims(isl.dim_type.set, 1)
        domain = domain.set_dim_name(isl.dim_type.set, n_set_dims, iname)
        space = domain.get_space()

    cns = isl.Constraint.equality_from_aff(
            iname_rel_aff(space, iname, "==", aff))
    expr = pw_aff_to_expr(aff)

    domain = domain.add_constraint(cns)

    return self.copy_and_assign(iname, expr).copy(implemented_domain=domain)
def duplicate_private_temporaries_for_ilp_and_vec(kernel):
    """Give temporaries written under ILP/vectorize inames an extra axis.

    For every temporary variable, the writing instructions are inspected for
    inames tagged :class:`IlpBaseTag` or :class:`VectorizeTag` that the
    write does not already depend on. The temporary's shape is extended by
    one axis per such iname (with a ``"vec"`` dim tag for vectorize inames,
    ``"c"`` otherwise), and all instructions are rewritten through
    :class:`ExtraInameIndexInserter` to index with those inames.

    :returns: a copy of *kernel* with updated temporaries and instructions.
    :raises LoopyError: if different writers of the same variable require
        different sets of new inames.
    """
    logger.debug("%s: duplicate temporaries for ilp", kernel.name)

    wmap = kernel.writer_map()

    from loopy.kernel.data import IlpBaseTag, VectorizeTag

    var_to_new_ilp_inames = {}

    # {{{ find variables that need extra indices

    for tv in six.itervalues(kernel.temporary_variables):
        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]
            ilp_inames = frozenset(iname
                    for iname in kernel.insn_inames(writer_insn)
                    if isinstance(
                        kernel.iname_to_tag.get(iname),
                        (IlpBaseTag, VectorizeTag)))

            referenced_ilp_inames = (ilp_inames
                    & writer_insn.write_dependency_names())

            new_ilp_inames = ilp_inames - referenced_ilp_inames

            if not new_ilp_inames:
                break

            if tv.name in var_to_new_ilp_inames:
                if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
                    # Four placeholders need four arguments: the variable
                    # name goes between the new and the previous inames.
                    raise LoopyError("instruction '%s' requires adding "
                            "indices for ILP inames '%s' on var '%s', but previous "
                            "instructions required inames '%s'"
                            % (writer_insn_id, ", ".join(new_ilp_inames),
                                tv.name,
                                ", ".join(var_to_new_ilp_inames[tv.name])))

                continue

            var_to_new_ilp_inames[tv.name] = set(new_ilp_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    ilp_iname_to_length = {}
    for ilp_inames in six.itervalues(var_to_new_ilp_inames):
        for iname in ilp_inames:
            if iname in ilp_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=True)
            ilp_iname_to_length[iname] = int(pw_aff_to_expr(
                        static_max_of_pw_aff(bounds.size, constants_only=True)))

            assert static_max_of_pw_aff(
                    bounds.lower_bound_pw_aff, constants_only=True).plain_is_zero()

    # }}}

    # {{{ change temporary variables

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in six.iteritems(var_to_new_ilp_inames):
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(ilp_iname_to_length[iname] for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape,
                # Forget what you knew about data layout,
                # create from scratch.
                dim_tags=dim_tags)

    # }}}

    from pymbolic import var
    eiii = ExtraInameIndexInserter(
            {var_name: tuple(var(iname) for iname in inames)
                for var_name, inames in six.iteritems(var_to_new_ilp_inames)})

    new_insns = [
            insn.with_transformed_expressions(eiii)
            for insn in kernel.instructions]

    return kernel.copy(
        temporary_variables=new_temp_vars,
        instructions=new_insns)
def test_pw_aff_to_conditional_expr():
    """Check the string form of a two-piece PwAff after pw_aff_to_expr."""
    from loopy.symbolic import pw_aff_to_expr

    expected = "0 if i == 0 else -1 + i"
    piecewise = isl.PwAff("[i] -> { [(0)] : i = 0; [(-1 + i)] : i > 0 }")

    assert str(pw_aff_to_expr(piecewise)) == expected
def guess_arg_shape_if_requested(kernel, default_order):
    """Fill in shapes (and strides) of array arguments that asked for ``auto``.

    For each array argument whose shape is ``lp.auto``, the access range
    over all expression instructions is collected and each axis' shape entry
    is taken as the static maximum of ``dim_max + 1`` of that range.
    Arguments without any subscripted access get shape ``()``. Where the
    argument has a ``strides`` attribute equal to ``lp.auto``, strides are
    built from the shape and *default_order*.

    :returns: a copy of *kernel* with the updated argument list.
    :raises LoopyError: if collecting the access range fails with a
        :class:`TypeError`.
    :raises RuntimeError: if no access range was found but some subscripts
        could not be analysed.
    """
    updated_args = []

    import loopy as lp
    from loopy.kernel.array import ArrayBase
    from loopy.symbolic import SubstitutionRuleExpander, AccessRangeMapper

    submap = SubstitutionRuleExpander(kernel.substitutions)

    for arg in kernel.args:
        # Anything that did not request automatic shape passes through.
        if not (isinstance(arg, ArrayBase) and arg.shape is lp.auto):
            updated_args.append(arg)
            continue

        armap = AccessRangeMapper(kernel, arg.name)

        try:
            for insn in kernel.instructions:
                if not isinstance(insn, lp.ExpressionInstruction):
                    continue

                armap(submap(insn.assignee), kernel.insn_inames(insn))
                armap(submap(insn.expression), kernel.insn_inames(insn))
        except TypeError as e:
            from traceback import print_exc
            print_exc()

            from loopy.diagnostic import LoopyError
            raise LoopyError(
                    "Failed to (automatically, as requested) find "
                    "shape/strides for argument '%s'. "
                    "Specifying the shape manually should get rid of this. "
                    "The following error occurred: %s"
                    % (arg.name, str(e)))

        if armap.access_range is not None:
            from loopy.isl_helpers import static_max_of_pw_aff
            from loopy.symbolic import pw_aff_to_expr

            axis_lengths = []
            for i in range(armap.access_range.dim(dim_type.set)):
                try:
                    axis_lengths.append(
                            pw_aff_to_expr(static_max_of_pw_aff(
                                kernel.cache_manager.dim_max(
                                    armap.access_range, i) + 1,
                                constants_only=False)))
                except:
                    # Report which axis failed, then let the error propagate.
                    print("While trying to find shape axis %d of "
                            "argument '%s', the following "
                            "exception occurred:" % (i, arg.name),
                            file=sys.stderr)
                    raise

            shape = tuple(axis_lengths)
        else:
            if armap.bad_subscripts:
                raise RuntimeError("cannot determine access range for '%s': "
                        "undetermined index in subscript(s) '%s'"
                        % (arg.name, ", ".join(
                                str(i) for i in armap.bad_subscripts)))

            # no subscripts found, let's call it a scalar
            shape = ()

        if arg.shape is lp.auto:
            arg = arg.copy(shape=shape)

        try:
            arg.strides
        except AttributeError:
            pass
        else:
            if arg.strides is lp.auto:
                from loopy.kernel.data import make_strides
                arg = arg.copy(strides=make_strides(shape, default_order))

        updated_args.append(arg)

    return kernel.copy(args=updated_args)
def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
        hw_inames_left=None):
    """Recursively bind hardware-parallel inames to hardware axis indices.

    One iname is taken off *hw_inames_left* per call (all inames tagged
    :class:`HardwareParallelTag` when it is *None*). Its hardware axis
    bounds are intersected into *codegen_state*, the iname is assigned the
    hardware axis index expression plus its lower bound, and the function
    recurses once per slab. When no inames are left, the loop nest is built
    via ``build_loop_nest``.

    :returns: the result of ``gen_code_block`` on the per-slab code, or of
        ``build_loop_nest`` when nothing is left to do.
    :raises RuntimeError: for an unexpected tag type, or when slab
        decomposition is needed on an iname that shares its tag.
    """
    from loopy.kernel.data import (
            UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag)

    if hw_inames_left is None:
        hw_inames_left = []
        for candidate in kernel.all_inames():
            if isinstance(kernel.iname_to_tag.get(candidate),
                    HardwareParallelTag):
                hw_inames_left.append(candidate)

    if not hw_inames_left:
        return build_loop_nest(kernel, sched_index, codegen_state)

    global_size, local_size = kernel.get_grid_sizes()

    # Work on a copy so the caller's list is left alone.
    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    tag = kernel.iname_to_tag.get(iname)

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    assert isinstance(tag, UniqueTag)
    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    other_inames_with_same_tag = []
    for other_iname in kernel.all_inames():
        other_tag = kernel.iname_to_tag.get(other_iname)
        if (isinstance(other_tag, UniqueTag)
                and other_tag.key == tag.key
                and other_iname != iname):
            other_inames_with_same_tag.append(other_iname)

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
            constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound, lower_bound+hw_axis_size)
    codegen_state = codegen_state.intersect(slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(
            kernel, iname, sched_index, codegen_state)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    from loopy.codegen import add_comment

    result = []
    for slab_name, slab in slabs:
        # Only label the slabs when there is more than one of them.
        if len(slabs) == 1:
            cmt = None
        else:
            cmt = "%s slab for '%s'" % (slab_name, iname)

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = codegen_state.copy_and_assign(iname, hw_axis_expr)

        inner = set_up_hw_parallel_loops(
                slabbed_kernel, sched_index,
                new_codegen_state, hw_inames_left)

        result.append(add_comment(cmt, inner))

    from loopy.codegen import gen_code_block
    return gen_code_block(result)
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    """Generate code for the sequential loop scheduled at *sched_index*.

    For every slab returned by ``get_slab_decomposition`` the loop bounds
    are derived from the iname's domain intersected with the slab and the
    kernel assumptions, the inner loop nest is built, and either a single
    initializer (when the implemented lower and upper bounds are equal) or
    a sequential loop is emitted through the AST builder.

    :arg codegen_state: state object providing ``kernel``,
        ``expression_to_code_mapper``, ``implemented_domain`` and
        ``ast_builder``.
    :arg sched_index: index into ``kernel.schedule`` of the loop.
    :returns: the result of ``merge_codegen_results`` over all slabs.
    """
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        # A comment naming the slab is only emitted when there are several.
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, across_dim_types=True,
                obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        # Bring the (parameter-only) assumptions into the same space so they
        # can be intersected with the domain.
        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
                dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)):
            if das_iname in usable_inames:
                moved_inames.append(das_iname)
                dt, idx = dom_and_slab.get_var_dict()[das_iname]
                dom_and_slab = dom_and_slab.move_dims(
                        dim_type.param, dom_and_slab.dim(dim_type.param),
                        dt, idx, 1)

        # Looked up only after the moves above, which change dim indices.
        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        impl_domain = isl.align_spaces(
            codegen_state.implemented_domain,
            dom_and_slab,
            obj_bigger_ok=True,
            across_dim_types=True
            ).params()

        # Bounds are simplified against the assumptions and against what is
        # already implemented.
        lbound = (
                kernel.cache_manager.dim_min(
                    dom_and_slab, loop_iname_idx)
                .gist(kernel.assumptions)
                .gist(impl_domain)
                .coalesce())
        ubound = (
            kernel.cache_manager.dim_max(
                dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .gist(impl_domain)
            .coalesce())

        # }}}

        # {{{ find implemented loop, build inner code

        from loopy.symbolic import pw_aff_to_pw_aff_implemented_by_expr
        impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound)
        impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound)

        # impl_loop may be overapproximated
        from loopy.isl_helpers import make_loop_bounds_from_pwaffs
        impl_loop = make_loop_bounds_from_pwaffs(
                dom_and_slab.space,
                loop_iname,
                impl_lbound,
                impl_ubound)

        for moved_iname in moved_inames:
            # move moved_iname to 'set' dim_type in impl_loop
            dt, idx = impl_loop.get_var_dict()[moved_iname]
            impl_loop = impl_loop.move_dims(
                    dim_type.set, impl_loop.dim(dim_type.set),
                    dt, idx, 1)

        new_codegen_state = (
                codegen_state
                .intersect(impl_loop)
                .copy(kernel=intersect_kernel_with_slab(
                    kernel, slab, loop_iname)))

        inner = build_loop_nest(new_codegen_state, sched_index+1)

        # }}}

        if cmt is not None:
            result.append(codegen_state.ast_builder.emit_comment(cmt))

        astb = codegen_state.ast_builder

        from loopy.symbolic import pw_aff_to_expr

        if impl_ubound.is_equal(impl_lbound):
            # single-trip, generate just a variable assignment, not a loop
            inner = merge_codegen_results(codegen_state, [
                astb.emit_initializer(
                    codegen_state,
                    kernel.index_dtype, loop_iname,
                    ecm(pw_aff_to_expr(lbound), PREC_NONE, "i"),
                    is_const=True),
                astb.emit_blank_line(),
                inner,
                ])
            # Wrap initializer and body in the builder's block scope class.
            result.append(
                    inner.with_new_ast(
                        codegen_state,
                        astb.ast_block_scope_class(
                            inner.current_ast(codegen_state))))

        else:
            inner_ast = inner.current_ast(codegen_state)

            from loopy.isl_helpers import simplify_pw_aff

            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(
                        codegen_state, loop_iname, kernel.index_dtype,
                        pw_aff_to_expr(simplify_pw_aff(lbound, kernel.assumptions)),
                        pw_aff_to_expr(simplify_pw_aff(ubound, kernel.assumptions)),
                        inner_ast)))

    return merge_codegen_results(codegen_state, result)
def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
    """Join *inames* into a single new iname.

    :arg inames: fastest varying last
    :arg new_iname: name of the joined iname; when *None*, a name is
        generated from the joined names.
    :arg tag: if not *None*, the new iname is tagged with it.
    :arg within: a stack match as understood by
        :func:`loopy.context_matching.parse_stack_match`.
    :returns: the transformed kernel.
    :raises LoopyError: if one of *inames* is not at home in the join's
        leaf domain.
    """

    # now fastest varying first
    inames = inames[::-1]

    if new_iname is None:
        new_iname = kernel.get_var_name_generator()("_and_".join(inames))

    from loopy.kernel.tools import DomainChanger
    domch = DomainChanger(kernel, frozenset(inames))
    for iname in inames:
        if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
            raise LoopyError("iname '%s' is not 'at home' in the "
                    "join's leaf domain" % iname)

    new_domain = domch.domain
    new_dim_idx = new_domain.dim(dim_type.set)
    new_domain = new_domain.add_dims(dim_type.set, 1)
    new_domain = new_domain.set_dim_name(dim_type.set, new_dim_idx, new_iname)

    joint_aff = zero = isl.Aff.zero_on_domain(new_domain.space)
    subst_dict = {}
    base_divisor = 1

    from pymbolic import var
    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    for i, iname in enumerate(inames):
        iname_dt, iname_idx = zero.get_space().get_var_dict()[iname]
        iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1)

        joint_aff = joint_aff + base_divisor*iname_aff

        bounds = kernel.get_iname_bounds(iname, constants_only=True)

        length = int(pw_aff_to_expr(
            static_max_of_pw_aff(bounds.size, constants_only=True)))

        try:
            lower_bound_aff = static_value_of_pw_aff(
                    bounds.lower_bound_pw_aff.coalesce(),
                    constants_only=False)
        except Exception as e:
            # Re-raise with the same type, keeping the original message so
            # the underlying cause is not lost.
            raise type(e)("while finding lower bound of '%s': %s"
                    % (iname, str(e)))

        # Recover this iname from the joined one: divide away the faster
        # varying inames, wrap at this iname's length (except for the
        # slowest varying one), and shift by the lower bound.
        my_val = var(new_iname) // base_divisor
        if i+1 < len(inames):
            my_val %= length
        my_val += pw_aff_to_expr(lower_bound_aff)
        subst_dict[iname] = my_val

        base_divisor *= length

    from loopy.isl_helpers import iname_rel_aff
    new_domain = new_domain.add_constraint(
            isl.Constraint.equality_from_aff(
                iname_rel_aff(new_domain.get_space(), new_iname, "==", joint_aff)))

    for iname in inames:
        # Re-read the dim dict every time: projecting out shifts indices.
        iname_to_dim = new_domain.get_space().get_var_dict()
        iname_dt, iname_idx = iname_to_dim[iname]

        if within is None:
            new_domain = new_domain.project_out(iname_dt, iname_idx, 1)

    def subst_forced_iname_deps(fid):
        # Replace any joined iname by the new iname, keep all others.
        return frozenset(
                new_iname if iname in inames else iname
                for iname in fid)

    new_insns = [
            insn.copy(
                forced_iname_deps=subst_forced_iname_deps(insn.forced_iname_deps))
            for insn in kernel.instructions]

    kernel = (kernel
            .copy(
                instructions=new_insns,
                domains=domch.get_domains_with(new_domain),
                applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_dict]
                ))

    from loopy.context_matching import parse_stack_match
    within = parse_stack_match(within)

    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    ijoin = _InameJoiner(rule_mapping_context, within,
            make_subst_func(subst_dict),
            inames, new_iname)

    kernel = rule_mapping_context.finish_kernel(
            ijoin.map_kernel(kernel))

    if tag is not None:
        kernel = tag_inames(kernel, {new_iname: tag})

    return kernel
def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
        hw_inames_left=None):
    """Recursively bind hardware-concurrent inames to hardware axis indices.

    One iname is taken off *hw_inames_left* per call. When it is *None*, the
    list is built from the inames of the instructions in the schedule block
    at *schedule_index* that carry a :class:`HardwareConcurrentTag`. The
    iname's hardware axis bounds are intersected into *codegen_state*, the
    iname is assigned the hardware axis index expression plus its lower
    bound, and the function recurses once per slab. When no inames are
    left, ``next_func(codegen_state)`` is returned.

    :returns: the result of ``merge_codegen_results`` over all slabs, or of
        *next_func*.
    :raises RuntimeError: for an unexpected tag type, or when slab
        decomposition is needed on an iname that shares its tag.
    """
    kernel = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
                LocalIndexTag, GroupIndexTag)

    from loopy.schedule import get_insn_ids_for_block_at
    insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule,
            schedule_index)

    if hw_inames_left is None:
        all_inames_by_insns = set()
        for insn_id in insn_ids_for_block:
            all_inames_by_insns |= kernel.insn_inames(insn_id)

        hw_inames_left = []
        for candidate in all_inames_by_insns:
            if kernel.iname_tags_of_type(candidate, HardwareConcurrentTag):
                hw_inames_left.append(candidate)

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
            insn_ids_for_block)

    # Work on a copy so the caller's list is left alone.
    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    def shares_tag(other_iname):
        # Same test as a filter: has a unique tag, is a different iname,
        # and one of its (truthy) tags has the same key as ours.
        return (kernel.iname_tags_of_type(other_iname, UniqueTag)
                and other_iname != iname
                and any(_tag.key == tag.key
                    for _tag in kernel.iname_tags(other_iname)
                    if _tag))

    other_inames_with_same_tag = [
            other_iname for other_iname in kernel.all_inames()
            if shares_tag(other_iname)]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
            constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound, lower_bound+hw_axis_size)
    codegen_state = codegen_state.intersect(slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    result = []

    for slab_name, slab in slabs:
        if len(slabs) > 1:
            slab_comment = "%s slab for '%s'" % (slab_name, iname)
            result.append(
                    codegen_state.ast_builder.emit_comment(slab_comment))

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        assigned_state = codegen_state.copy_and_assign(iname, hw_axis_expr)
        new_codegen_state = assigned_state.copy(kernel=slabbed_kernel)

        result.append(
                set_up_hw_parallel_loops(
                    new_codegen_state, schedule_index, next_func,
                    hw_inames_left))

    return merge_codegen_results(codegen_state, result)
def test_pw_aff_to_conditional_expr():
    """Check the string form of a two-piece PwAff after pw_aff_to_expr."""
    from loopy.symbolic import pw_aff_to_expr

    expected = "If(i == 0, 0, -1 + i)"
    piecewise = isl.PwAff("[i] -> { [(0)] : i = 0; [(-1 + i)] : i > 0 }")

    assert str(pw_aff_to_expr(piecewise)) == expected
def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
        hw_inames_left=None):
    """Recursively bind hardware-concurrent inames to hardware axis indices.

    One iname is taken off *hw_inames_left* per call. When it is *None*, the
    list is built from the inames of the instructions in the schedule block
    at *schedule_index* that carry a :class:`HardwareConcurrentTag` but no
    :class:`VectorizeTag`. The iname's hardware axis bounds are intersected
    into *codegen_state*, the iname is assigned the hardware axis index
    expression plus its lower bound, and the function recurses once per
    slab. When no inames are left, ``next_func(codegen_state)`` is returned.

    :returns: the result of ``merge_codegen_results`` over all slabs, or of
        *next_func*.
    :raises RuntimeError: for an unexpected tag type, or when slab
        decomposition is needed on an iname that shares its tag.
    """
    kernel = codegen_state.kernel

    from loopy.kernel.data import (UniqueTag, HardwareConcurrentTag,
                LocalIndexTag, GroupIndexTag, VectorizeTag)

    from loopy.schedule import get_insn_ids_for_block_at
    insn_ids_for_block = get_insn_ids_for_block_at(kernel.schedule,
            schedule_index)

    if hw_inames_left is None:
        all_inames_by_insns = set()
        for insn_id in insn_ids_for_block:
            all_inames_by_insns |= kernel.insn_inames(insn_id)

        hw_inames_left = []
        for candidate in all_inames_by_insns:
            # Hardware-concurrent, but not handled by vectorization.
            if (kernel.iname_tags_of_type(candidate, HardwareConcurrentTag)
                    and not kernel.iname_tags_of_type(
                        candidate, VectorizeTag)):
                hw_inames_left.append(candidate)

    if not hw_inames_left:
        return next_func(codegen_state)

    global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
            insn_ids_for_block)

    # Work on a copy so the caller's list is left alone.
    hw_inames_left = hw_inames_left[:]
    iname = hw_inames_left.pop()

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    tag, = kernel.iname_tags_of_type(iname, UniqueTag, max_num=1, min_num=1)

    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    def shares_tag(other_iname):
        # Same test as a filter: has a unique tag, is a different iname,
        # and one of its (truthy) tags has the same key as ours.
        return (kernel.iname_tags_of_type(other_iname, UniqueTag)
                and other_iname != iname
                and any(_tag.key == tag.key
                    for _tag in kernel.iname_tags(other_iname)
                    if _tag))

    other_inames_with_same_tag = [
            other_iname for other_iname in kernel.all_inames()
            if shares_tag(other_iname)]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(bounds.lower_bound_pw_aff,
            constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound, lower_bound + hw_axis_size)
    codegen_state = codegen_state.intersect(slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(kernel, iname)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    result = []

    for slab_name, slab in slabs:
        if len(slabs) > 1:
            slab_comment = "%s slab for '%s'" % (slab_name, iname)
            result.append(
                    codegen_state.ast_builder.emit_comment(slab_comment))

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        assigned_state = codegen_state.copy_and_assign(iname, hw_axis_expr)
        new_codegen_state = assigned_state.copy(kernel=slabbed_kernel)

        result.append(
                set_up_hw_parallel_loops(
                    new_codegen_state, schedule_index, next_func,
                    hw_inames_left))

    return merge_codegen_results(codegen_state, result)
def tup_to_exprs(tup):
    """Return a tuple holding ``pw_aff_to_expr(entry, int_ok=True)`` for
    each *entry* of *tup*, in order.
    """
    from loopy.symbolic import pw_aff_to_expr

    converted = [pw_aff_to_expr(entry, int_ok=True) for entry in tup]
    return tuple(converted)
def privatize_temporaries_with_inames(
        kernel, privatizing_inames, only_var_names=None):
    """This function provides each loop iteration of the
    *privatizing_inames* with its own private entry in the temporaries
    it accesses (possibly restricted to *only_var_names*).

    This is accomplished implicitly as part of generating instruction-level
    parallelism by the "ILP" tag and accessible separately through this
    transformation.

    Example::

        for imatrix, i
            acc = 0
            for k
                acc = acc + a[imatrix, i, k] * vec[k]
            end
        end

    might become::

        for imatrix, i
            acc[imatrix] = 0
            for k
                acc[imatrix] = acc[imatrix] + a[imatrix, i, k] * vec[k]
            end
        end

    facilitating loop interchange of the *imatrix* loop.

    :arg privatizing_inames: a set of iname names, or a comma-separated
        string of them.
    :arg only_var_names: *None*, a set of temporary names, or a
        comma-separated string of them.
    :returns: a copy of *kernel* with updated temporaries and instructions.
    :raises LoopyError: if different writers of one temporary require
        different new axes, or if an instruction touches a privatized
        temporary without being nested in the required iname(s).

    .. versionadded:: 2018.1
    """

    if isinstance(privatizing_inames, str):
        privatizing_inames = frozenset(
                s.strip() for s in privatizing_inames.split(","))

    if isinstance(only_var_names, str):
        only_var_names = frozenset(
                s.strip() for s in only_var_names.split(","))

    wmap = kernel.writer_map()

    var_to_new_priv_axis_iname = {}

    # {{{ find variables that need extra indices

    for tv in six.itervalues(kernel.temporary_variables):
        if only_var_names is not None and tv.name not in only_var_names:
            continue

        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]

            priv_axis_inames = (
                    kernel.insn_inames(writer_insn) & privatizing_inames)

            # Inames already used in the write's subscript need no new axis.
            referenced_priv_axis_inames = (
                    priv_axis_inames
                    & writer_insn.write_dependency_names())

            new_priv_axis_inames = (
                    priv_axis_inames - referenced_priv_axis_inames)

            if not new_priv_axis_inames:
                break

            if tv.name in var_to_new_priv_axis_iname:
                if new_priv_axis_inames != set(
                        var_to_new_priv_axis_iname[tv.name]):
                    raise LoopyError("instruction '%s' requires adding "
                            "indices for privatizing var '%s' on iname(s) '%s', "
                            "but previous instructions required inames '%s'"
                            % (writer_insn_id, tv.name,
                                ", ".join(new_priv_axis_inames),
                                ", ".join(
                                    var_to_new_priv_axis_iname[tv.name])))

                continue

            var_to_new_priv_axis_iname[tv.name] = set(new_priv_axis_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    priv_axis_iname_to_length = {}
    for priv_axis_inames in six.itervalues(var_to_new_priv_axis_iname):
        for iname in priv_axis_inames:
            if iname in priv_axis_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=False)
            priv_axis_iname_to_length[iname] = pw_aff_to_expr(
                    static_max_of_pw_aff(bounds.size, constants_only=False))

            # The new axis is indexed by the iname itself, so the iname
            # must start at zero.
            assert static_max_of_pw_aff(
                    bounds.lower_bound_pw_aff, constants_only=True
                    ).plain_is_zero()

    # }}}

    # {{{ change temporary variables

    from loopy.kernel.data import VectorizeTag

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in six.iteritems(var_to_new_priv_axis_iname):
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(priv_axis_iname_to_length[iname] for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if kernel.iname_tags_of_type(iname, VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape,
                # Forget what you knew about data layout,
                # create from scratch.
                dim_tags=dim_tags,
                dim_names=None)

    # }}}

    from pymbolic import var
    var_to_extra_iname = dict(
            (var_name, tuple(var(iname) for iname in inames))
            for var_name, inames in six.iteritems(var_to_new_priv_axis_iname))

    new_insns = []

    for insn in kernel.instructions:
        # A fresh inserter per instruction, because its
        # seen_priv_axis_inames attribute is checked per instruction below.
        eiii = ExtraInameIndexInserter(var_to_extra_iname)
        new_insn = insn.with_transformed_expressions(eiii)
        if not eiii.seen_priv_axis_inames <= insn.within_inames:
            raise LoopyError(
                    "Kernel '%s': Instruction '%s': touched variable that "
                    "(for privatization, e.g. as performed for ILP) "
                    "required iname(s) '%s', but that the instruction was not "
                    "previously within the iname(s). To remedy this, first promote "
                    "the instruction into the iname."
                    % (kernel.name, insn.id, ", ".join(
                        eiii.seen_priv_axis_inames - insn.within_inames)))

        new_insns.append(new_insn)

    return kernel.copy(
        temporary_variables=new_temp_vars,
        instructions=new_insns)
def base_index_and_length(self, set, iname, context=None,
        n_allowed_params_in_length=None):
    """Return a ``(base_index, length)`` pair of expressions for one
    dimension of *set*.

    :arg iname: the dimension, either as an integer index or as a name
        that is looked up in ``set.space.get_var_dict()``.
    :arg context: forwarded to the static value/min/max helpers.
    :arg n_allowed_params_in_length: Simplifies the 'length'
        argument so that only the first that many params
        (in the domain of *set*) occur.
    """
    if isinstance(iname, int):
        dim_idx = iname
    else:
        dim_idx = set.space.get_var_dict()[iname][1]

    lower_pw_aff = self.dim_min(set, dim_idx)
    upper_pw_aff = self.dim_max(set, dim_idx)

    from loopy.diagnostic import StaticValueFindingError
    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_min_of_pw_aff,
            static_value_of_pw_aff,
            find_max_of_pwaff_with_params)
    from loopy.symbolic import pw_aff_to_expr

    # First choice: a static value for the lower bound.
    try:
        base_aff = static_value_of_pw_aff(
                lower_pw_aff, constants_only=False, context=context)
    except StaticValueFindingError:
        base_aff = None

    # Fallback: settle for a static lower bound on it.
    if base_aff is None:
        base_aff = static_min_of_pw_aff(
                lower_pw_aff, constants_only=False, context=context)

    base_index = pw_aff_to_expr(base_aff)

    extent = find_max_of_pwaff_with_params(
            upper_pw_aff - base_aff + 1,
            n_allowed_params_in_length)
    length = pw_aff_to_expr(
            static_max_of_pw_aff(
                extent, constants_only=False, context=context))

    return base_index, length
def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
    """Append one axis per ILP/vectorize-tagged iname to the temporaries
    written inside such inames, and rewrite the kernel's instructions
    with an ``ExtraInameIndexInserter`` built from that mapping.

    :arg iname: if given, only this iname is considered; it must be tagged
        with an ``IlpBaseTag`` or ``VectorizeTag``. If *None*, all inames
        of each writing instruction that carry such a tag are considered.
    :returns: a copy of *kernel* with updated temporaries and instructions.
    :raises LoopyError: if *iname* is given but not tagged as described
        (checked only once a writer of some temporary is visited), or if
        different writers of one temporary require different new axes.
    """
    if iname is not None:
        logger.debug("%s: add axes to temporaries for ilp" % kernel.name)

    wmap = kernel.writer_map()

    from loopy.kernel.data import IlpBaseTag, VectorizeTag

    var_to_new_ilp_inames = {}

    # {{{ find variables that need extra indices

    for tv in six.itervalues(kernel.temporary_variables):
        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]

            if iname is None:
                ilp_inames = frozenset(
                        insn_iname
                        for insn_iname in kernel.insn_inames(writer_insn)
                        if isinstance(
                            kernel.iname_to_tag.get(insn_iname),
                            (IlpBaseTag, VectorizeTag)))
            else:
                if not isinstance(
                        kernel.iname_to_tag.get(iname),
                        (IlpBaseTag, VectorizeTag)):
                    raise LoopyError(
                            "'%s' is not an ILP iname"
                            % iname)

                ilp_inames = frozenset([iname])

            # Inames already used in the write's subscript need no new axis.
            referenced_ilp_inames = (ilp_inames
                    & writer_insn.write_dependency_names())

            new_ilp_inames = ilp_inames - referenced_ilp_inames

            if not new_ilp_inames:
                break

            if tv.name in var_to_new_ilp_inames:
                if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
                    # Four placeholders, four arguments: the variable name
                    # fills the third one.
                    raise LoopyError("instruction '%s' requires adding "
                            "indices for ILP inames '%s' on var '%s', but previous "
                            "instructions required inames '%s'"
                            % (writer_insn_id,
                                ", ".join(new_ilp_inames),
                                tv.name,
                                ", ".join(var_to_new_ilp_inames[tv.name])))

                continue

            var_to_new_ilp_inames[tv.name] = set(new_ilp_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    ilp_iname_to_length = {}
    for ilp_inames in six.itervalues(var_to_new_ilp_inames):
        for ilp_iname in ilp_inames:
            if ilp_iname in ilp_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(ilp_iname, constants_only=True)
            ilp_iname_to_length[ilp_iname] = int(pw_aff_to_expr(
                    static_max_of_pw_aff(bounds.size, constants_only=True)))

            # The new axis is indexed by the iname itself, so the iname
            # must start at zero.
            assert static_max_of_pw_aff(
                    bounds.lower_bound_pw_aff, constants_only=True
                    ).plain_is_zero()

    # }}}

    # {{{ change temporary variables

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in six.iteritems(var_to_new_ilp_inames):
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(
                ilp_iname_to_length[ilp_iname] for ilp_iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, ilp_iname in enumerate(inames):
            if isinstance(kernel.iname_to_tag.get(ilp_iname), VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape,
                # Forget what you knew about data layout,
                # create from scratch.
                dim_tags=dim_tags,
                dim_names=None)

    # }}}

    from pymbolic import var
    eiii = ExtraInameIndexInserter(
            dict((var_name, tuple(var(ilp_iname) for ilp_iname in inames))
                for var_name, inames in six.iteritems(var_to_new_ilp_inames)))

    new_insns = [
            insn.with_transformed_expressions(eiii)
            for insn in kernel.instructions]

    return kernel.copy(
        temporary_variables=new_temp_vars,
        instructions=new_insns)
def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
    """Replace the *inames* by a single iname *new_iname* that is
    constrained to equal their combination, weighted by the static
    lengths of the faster-varying inames.

    :arg inames: fastest varying last
    :arg new_iname: name of the joined iname. If *None*, a name is
        generated from the names being joined.
    :arg tag: if not *None*, applied to *new_iname* via ``tag_inames``.
    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`. The joined inames are
        projected out of the domain only if this is *None*.
    :returns: the transformed kernel.
    :raises LoopyError: if one of *inames* is not at home in the join's
        leaf domain.
    """

    from loopy.kernel.tools import DomainChanger
    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff, iname_rel_aff)
    from loopy.symbolic import pw_aff_to_expr
    from pymbolic import var

    # now fastest varying first
    inames = inames[::-1]

    if new_iname is None:
        new_iname = kernel.get_var_name_generator()("_and_".join(inames))

    domch = DomainChanger(kernel, frozenset(inames))
    for iname in inames:
        if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
            raise LoopyError("iname '%s' is not 'at home' in the "
                    "join's leaf domain" % iname)

    new_domain = domch.domain
    new_dim_idx = new_domain.dim(dim_type.set)
    new_domain = new_domain.add_dims(dim_type.set, 1)
    new_domain = new_domain.set_dim_name(dim_type.set, new_dim_idx, new_iname)

    joint_aff = zero = isl.Aff.zero_on_domain(new_domain.space)
    subst_dict = {}
    base_divisor = 1

    for i, iname in enumerate(inames):
        iname_dt, iname_idx = zero.get_space().get_var_dict()[iname]
        iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1)

        joint_aff = joint_aff + base_divisor * iname_aff

        bounds = kernel.get_iname_bounds(iname, constants_only=True)

        length = int(pw_aff_to_expr(
            static_max_of_pw_aff(bounds.size, constants_only=True)))

        try:
            lower_bound_aff = static_value_of_pw_aff(
                    bounds.lower_bound_pw_aff.coalesce(),
                    constants_only=False)
        except Exception as e:
            # Re-raise with the same type, adding which iname failed while
            # keeping the text of the underlying error.
            raise type(e)("while finding lower bound of '%s': %s"
                    % (iname, e))

        my_val = var(new_iname) // base_divisor
        if i + 1 < len(inames):
            # The slowest-varying iname takes whatever is left over, so no
            # modulo is applied to it.
            my_val %= length
        my_val += pw_aff_to_expr(lower_bound_aff)
        subst_dict[iname] = my_val

        base_divisor *= length

    new_domain = new_domain.add_constraint(
            isl.Constraint.equality_from_aff(
                iname_rel_aff(new_domain.get_space(),
                    new_iname, "==", joint_aff)))

    if within is None:
        for iname in inames:
            # Look the index up afresh each time: it is read from the
            # domain as it stands after the previous projection.
            iname_dt, iname_idx = (
                    new_domain.get_space().get_var_dict()[iname])
            new_domain = new_domain.project_out(iname_dt, iname_idx, 1)

    def subst_within_inames(fid):
        return frozenset(
                new_iname if name in inames else name
                for name in fid)

    new_insns = [
            insn.copy(
                within_inames=subst_within_inames(insn.within_inames))
            for insn in kernel.instructions]

    kernel = (kernel
            .copy(
                instructions=new_insns,
                domains=domch.get_domains_with(new_domain),
                applied_iname_rewrites=kernel.applied_iname_rewrites
                + [subst_dict]
                ))

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    ijoin = _InameJoiner(rule_mapping_context, within,
            make_subst_func(subst_dict),
            inames, new_iname)

    kernel = rule_mapping_context.finish_kernel(
            ijoin.map_kernel(kernel))

    if tag is not None:
        kernel = tag_inames(kernel, {new_iname: tag})

    return kernel
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    """Generate code for the sequential loop scheduled at *sched_index*.

    For every slab of the loop iname, the loop bounds are determined, the
    loop body is built with ``build_loop_nest`` and the result is wrapped
    either in a block-scoped initializer (if the implemented lower and
    upper bounds are equal) or in a sequential loop.

    :returns: the per-slab results combined by ``merge_codegen_results``.
    """
    from loopy.codegen.bounds import get_usable_inames_for_conditional
    from loopy.isl_helpers import (
            make_loop_bounds_from_pwaffs, simplify_pw_aff)
    from loopy.symbolic import (
            pw_aff_to_expr, pw_aff_to_pw_aff_implemented_by_expr)

    kernel = codegen_state.kernel
    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    pieces = []

    for slab_name, slab in slabs:
        # A lone slab gets no announcing comment.
        if len(slabs) == 1:
            cmt = None
        else:
            cmt = "%s slab for '%s'" % (slab_name, loop_iname)

        # {{{ find bounds

        dom_and_slab = isl.align_spaces(domain, slab, obj_bigger_ok=True) & slab

        dom_and_slab, assumptions_non_param = isl.align_two(
                dom_and_slab,
                isl.BasicSet.from_params(kernel.assumptions))
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)):
            if das_iname not in usable_inames:
                continue

            moved_inames.append(das_iname)
            src_dt, src_idx = dom_and_slab.get_var_dict()[das_iname]
            dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param),
                    src_dt, src_idx, 1)

        loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname][1]

        impl_domain = isl.align_spaces(
                codegen_state.implemented_domain,
                dom_and_slab,
                obj_bigger_ok=True).params()

        lbound = kernel.cache_manager.dim_min(dom_and_slab, loop_iname_idx)
        lbound = lbound.gist(kernel.assumptions).gist(impl_domain).coalesce()

        ubound = kernel.cache_manager.dim_max(dom_and_slab, loop_iname_idx)
        ubound = ubound.gist(kernel.assumptions).gist(impl_domain).coalesce()

        # }}}

        # {{{ find implemented loop, build inner code

        impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound)
        impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound)

        # impl_loop may be overapproximated
        impl_loop = make_loop_bounds_from_pwaffs(
                dom_and_slab.space, loop_iname, impl_lbound, impl_ubound)

        for moved_iname in moved_inames:
            # move moved_iname to 'set' dim_type in impl_loop
            src_dt, src_idx = impl_loop.get_var_dict()[moved_iname]
            impl_loop = impl_loop.move_dims(
                    dim_type.set, impl_loop.dim(dim_type.set),
                    src_dt, src_idx, 1)

        slab_codegen_state = codegen_state.intersect(impl_loop).copy(
                kernel=intersect_kernel_with_slab(kernel, slab, loop_iname))

        inner = build_loop_nest(slab_codegen_state, sched_index + 1)

        # }}}

        if cmt is not None:
            pieces.append(codegen_state.ast_builder.emit_comment(cmt))

        astb = codegen_state.ast_builder

        if impl_ubound.is_equal(impl_lbound):
            # single-trip, generate just a variable assignment, not a loop
            init_value = ecm(pw_aff_to_expr(lbound), PREC_NONE, "i")
            initializer = astb.emit_initializer(
                    codegen_state, kernel.index_dtype, loop_iname,
                    init_value, is_const=True)
            blank_line = astb.emit_blank_line()

            inner = merge_codegen_results(
                    codegen_state, [initializer, blank_line, inner])

            scoped_ast = astb.ast_block_scope_class(
                    inner.current_ast(codegen_state))
            pieces.append(inner.with_new_ast(codegen_state, scoped_ast))

        else:
            inner_ast = inner.current_ast(codegen_state)

            lower_expr = pw_aff_to_expr(
                    simplify_pw_aff(lbound, kernel.assumptions))
            upper_expr = pw_aff_to_expr(
                    simplify_pw_aff(ubound, kernel.assumptions))

            loop_ast = astb.emit_sequential_loop(
                    codegen_state, loop_iname, kernel.index_dtype,
                    lower_expr, upper_expr, inner_ast)
            pieces.append(inner.with_new_ast(codegen_state, loop_ast))

    return merge_codegen_results(codegen_state, pieces)
def privatize_temporaries_with_inames(kernel, privatizing_inames,
        only_var_names=None):
    """This function provides each loop iteration of the
    *privatizing_inames* with its own private entry in the temporaries
    it accesses (possibly restricted to *only_var_names*).

    This is accomplished implicitly as part of generating instruction-level
    parallelism by the "ILP" tag and accessible separately through this
    transformation.

    Example::

        for imatrix, i
            acc = 0
            for k
                acc = acc + a[imatrix, i, k] * vec[k]
            end
        end

    might become::

        for imatrix, i
            acc[imatrix] = 0
            for k
                acc[imatrix] = acc[imatrix] + a[imatrix, i, k] * vec[k]
            end
        end

    facilitating loop interchange of the *imatrix* loop.

    :arg privatizing_inames: a set of iname names, or a comma-separated
        string of them.
    :arg only_var_names: *None*, a set of temporary names, or a
        comma-separated string of them.
    :returns: a copy of *kernel* with updated temporaries and instructions.
    :raises LoopyError: if different writers of one temporary require
        different new axes, or if an instruction touches a privatized
        temporary without being nested in the required iname(s).

    .. versionadded:: 2018.1
    """

    if isinstance(privatizing_inames, str):
        privatizing_inames = frozenset(
                s.strip() for s in privatizing_inames.split(","))

    if isinstance(only_var_names, str):
        only_var_names = frozenset(
                s.strip() for s in only_var_names.split(","))

    wmap = kernel.writer_map()

    var_to_new_priv_axis_iname = {}

    # {{{ find variables that need extra indices

    for tv in kernel.temporary_variables.values():
        if only_var_names is not None and tv.name not in only_var_names:
            continue

        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]

            priv_axis_inames = writer_insn.within_inames & privatizing_inames

            # Inames already used in the write's subscript need no new axis.
            referenced_priv_axis_inames = (
                    priv_axis_inames
                    & writer_insn.write_dependency_names())

            new_priv_axis_inames = (
                    priv_axis_inames - referenced_priv_axis_inames)

            if not new_priv_axis_inames:
                break

            if tv.name in var_to_new_priv_axis_iname:
                if new_priv_axis_inames != set(
                        var_to_new_priv_axis_iname[tv.name]):
                    raise LoopyError("instruction '%s' requires adding "
                            "indices for privatizing var '%s' on iname(s) '%s', "
                            "but previous instructions required inames '%s'"
                            % (writer_insn_id, tv.name,
                                ", ".join(new_priv_axis_inames),
                                ", ".join(
                                    var_to_new_priv_axis_iname[tv.name])))

                continue

            var_to_new_priv_axis_iname[tv.name] = set(new_priv_axis_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    priv_axis_iname_to_length = {}
    iname_to_lbound = {}
    for priv_axis_inames in var_to_new_priv_axis_iname.values():
        for iname in priv_axis_inames:
            if iname in priv_axis_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=False)
            priv_axis_iname_to_length[iname] = pw_aff_to_expr(
                    static_max_of_pw_aff(bounds.size, constants_only=False))
            iname_to_lbound[iname] = pw_aff_to_expr(bounds.lower_bound_pw_aff)

    # }}}

    # {{{ change temporary variables

    from loopy.kernel.data import VectorizeTag

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in var_to_new_priv_axis_iname.items():
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(priv_axis_iname_to_length[iname] for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if kernel.iname_tags_of_type(iname, VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape,
                # Forget what you knew about data layout,
                # create from scratch.
                dim_tags=dim_tags,
                dim_names=None)

    # }}}

    from pymbolic import var
    var_to_extra_iname = {
            var_name: tuple(var(iname) for iname in inames)
            for var_name, inames in var_to_new_priv_axis_iname.items()}

    new_insns = []

    for insn in kernel.instructions:
        # A fresh inserter per instruction, because its
        # seen_priv_axis_inames attribute is checked per instruction below.
        eiii = ExtraInameIndexInserter(var_to_extra_iname, iname_to_lbound)
        new_insn = insn.with_transformed_expressions(eiii)
        if not eiii.seen_priv_axis_inames <= insn.within_inames:
            raise LoopyError(
                    "Kernel '%s': Instruction '%s': touched variable that "
                    "(for privatization, e.g. as performed for ILP) "
                    "required iname(s) '%s', but that the instruction was not "
                    "previously within the iname(s). To remedy this, first promote "
                    "the instruction into the iname."
                    % (kernel.name, insn.id, ", ".join(
                        eiii.seen_priv_axis_inames - insn.within_inames)))

        new_insns.append(new_insn)

    return kernel.copy(
        temporary_variables=new_temp_vars,
        instructions=new_insns)
def guess_var_shape(kernel, var_name):
    """Guess the shape of variable *var_name* from the way the
    instructions of *kernel* access it.

    :returns: a tuple with one entry per axis. The tuple is ``()`` if no
        subscripts of the variable were found, and ``(None,)`` if only
        one-axis subscripts were found whose range could not be
        determined.
    :raises LoopyError: if running the access range mapper raises a
        :class:`TypeError`, if the variable is accessed through linear
        subscripts, or if a multi-axis subscript has an undetermined index.
    :raises RuntimeError: if undetermined subscripts with differing
        numbers of axes were found.
    """
    from loopy.symbolic import SubstitutionRuleExpander, AccessRangeMapper

    armap = AccessRangeMapper(kernel, var_name)

    submap = SubstitutionRuleExpander(kernel.substitutions)

    def run_through_armap(expr):
        # 'insn' is the loop variable of the 'for' statement below and is
        # looked up at call time.
        # NOTE(review): presumably with_transformed_expressions invokes
        # this callback before returning -- confirm.
        armap(submap(expr), kernel.insn_inames(insn))
        return expr

    try:
        for insn in kernel.instructions:
            insn.with_transformed_expressions(run_through_armap)
    except TypeError as e:
        from traceback import print_exc
        print_exc()

        raise LoopyError(
                "Failed to (automatically, as requested) find "
                "shape/strides for variable '%s'. "
                "Specifying the shape manually should get rid of this. "
                "The following error occurred: %s"
                % (var_name, str(e)))

    if armap.access_range is None:
        if armap.bad_subscripts:
            from loopy.symbolic import LinearSubscript
            if any(isinstance(sub, LinearSubscript)
                    for sub in armap.bad_subscripts):
                raise LoopyError("cannot determine access range for '%s': "
                        "linear subscript(s) in '%s'"
                        % (var_name, ", ".join(
                                str(i) for i in armap.bad_subscripts)))

            n_axes_in_subscripts = {
                    len(sub.index_tuple) for sub in armap.bad_subscripts}

            if len(n_axes_in_subscripts) != 1:
                raise RuntimeError("subscripts of '%s' with differing "
                        "numbers of axes were found" % var_name)

            n_axes, = n_axes_in_subscripts

            if n_axes == 1:
                # Leave shape undetermined--we can live with that for 1D.
                shape = (None,)
            else:
                raise LoopyError("cannot determine access range for '%s': "
                        "undetermined index in subscript(s) '%s'"
                        % (var_name, ", ".join(
                                str(i) for i in armap.bad_subscripts)))

        else:
            # no subscripts found, let's call it a scalar
            shape = ()
    else:
        from loopy.isl_helpers import static_max_of_pw_aff
        from loopy.symbolic import pw_aff_to_expr

        shape = []
        for i in range(armap.access_range.dim(dim_type.set)):
            try:
                shape.append(
                        pw_aff_to_expr(static_max_of_pw_aff(
                            kernel.cache_manager.dim_max(
                                armap.access_range, i) + 1,
                            constants_only=False)))
            except Exception:
                # Only ordinary errors get the advice below; interpreter
                # exits and keyboard interrupts pass through untouched.
                print("While trying to find shape axis %d of "
                        "variable '%s', the following "
                        "exception occurred:" % (i, var_name),
                        file=sys.stderr)
                print("*** ADVICE: You may need to manually specify the "
                        "shape of argument '%s'." % (var_name),
                        file=sys.stderr)
                raise

        shape = tuple(shape)

    return shape