def base_index_and_length(self, set, iname, context=None):
    """Return a ``(base_index, size)`` pair of expressions for one
    dimension of *set*.

    :arg set: the set whose dimension is examined.
    :arg iname: an :class:`int` dimension index, or a variable name that
        is looked up via ``set.space.get_var_dict()``.
    :arg context: handed through to the static min/max/value helpers.

    The base index is the static value of the dimension's minimum if one
    can be found, otherwise a static lower bound of it. The size is a
    static maximum of ``dim_max - base_index + 1``.
    """
    if isinstance(iname, int):
        idx = iname
    else:
        idx = set.space.get_var_dict()[iname][1]

    lower_bound_pw_aff = self.dim_min(set, idx)
    upper_bound_pw_aff = self.dim_max(set, idx)

    from loopy.diagnostic import StaticValueFindingError
    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_min_of_pw_aff,
            static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    # {{{ pick the base: exact static value first, lower bound as fallback

    try:
        base_aff = static_value_of_pw_aff(
                lower_bound_pw_aff, constants_only=False, context=context)
    except StaticValueFindingError:
        base_aff = None

    if base_aff is None:
        base_aff = static_min_of_pw_aff(
                lower_bound_pw_aff, constants_only=False, context=context)

    # }}}

    base_index = pw_aff_to_expr(base_aff)
    size_aff = static_max_of_pw_aff(
            upper_bound_pw_aff - base_aff + 1,
            constants_only=False, context=context)

    return base_index, pw_aff_to_expr(size_aff)
def base_index_and_length(self, set, iname, context=None):
    """Find a base index and an extent for one dimension of *set*.

    :arg set: the set whose dimension is examined.
    :arg iname: an :class:`int` dimension index, or a variable name that
        is resolved through ``set.space.get_var_dict()``.
    :arg context: forwarded to the static min/max/value helpers.
    :returns: a tuple ``(base_index, size)`` of expressions.
    """
    idx = (iname if isinstance(iname, int)
            else set.space.get_var_dict()[iname][1])

    lower_bound_pw_aff = self.dim_min(set, idx)
    upper_bound_pw_aff = self.dim_max(set, idx)

    from loopy.diagnostic import StaticValueFindingError
    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_min_of_pw_aff,
            static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    def _as_result(base_aff):
        # Convert the base first, then the extent measured from that base.
        base_expr = pw_aff_to_expr(base_aff)
        extent_aff = static_max_of_pw_aff(
                upper_bound_pw_aff - base_aff + 1,
                constants_only=False, context=context)
        return base_expr, pw_aff_to_expr(extent_aff)

    # {{{ first: try to find static lower bound value

    try:
        exact_base_aff = static_value_of_pw_aff(
                lower_bound_pw_aff, constants_only=False, context=context)
    except StaticValueFindingError:
        exact_base_aff = None

    if exact_base_aff is not None:
        return _as_result(exact_base_aff)

    # }}}

    # {{{ if that didn't work, try finding a lower bound

    return _as_result(
            static_min_of_pw_aff(
                lower_bound_pw_aff, constants_only=False, context=context))

    # }}}
def generate_unroll_loop(kernel, sched_index, codegen_state):
    """Generate code for the loop at *sched_index* by fully unrolling it.

    The loop nest below ``sched_index`` is built once for every value of
    the iname, with the iname fixed to ``lower_bound + i``.

    :raises LoopyError: if the loop length is not a constant.
    """
    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname must actually be substituted into the message.
        raise LoopyError(
                "length of unrolled loop '%s' is not a constant, "
                "cannot unroll" % iname)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise as the same type, keeping the original error text.
        raise type(e)("while finding lower bound of '%s': %s"
                % (iname, str(e)))

    result = []

    for i in range(length):
        idx_aff = lower_bound_aff + i
        new_codegen_state = codegen_state.fix(iname, idx_aff)
        result.append(
                build_loop_nest(kernel, sched_index+1, new_codegen_state))

    return gen_code_block(result)
def generate_unroll_loop(codegen_state, sched_index):
    """Generate code for the loop at *sched_index* by fully unrolling it.

    The loop nest below ``sched_index`` is built once for every value of
    the iname, with the iname fixed to ``lower_bound + i``, and the
    per-iteration results are merged.

    :raises LoopyError: if the loop length is not a constant.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname must actually be substituted into the message.
        raise LoopyError(
                "length of unrolled loop '%s' is not a constant, "
                "cannot unroll" % iname)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise as the same type, keeping the original error text.
        raise type(e)("while finding lower bound of '%s': %s"
                % (iname, str(e)))

    result = []

    for i in range(length):
        idx_aff = lower_bound_aff + i
        new_codegen_state = codegen_state.fix(iname, idx_aff)
        result.append(build_loop_nest(new_codegen_state, sched_index + 1))

    return merge_codegen_results(codegen_state, result)
def get_constant_iname_length(self, iname):
    """Return the length of *iname* as an :class:`int`.

    Both the bounds lookup and the static maximum are requested with
    ``constants_only=True``.
    """
    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import aff_to_expr

    bounds = self.get_iname_bounds(iname, constants_only=True)
    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    return int(aff_to_expr(length_aff))
def _get_int_iname_size(iname):
    """Return the constant static maximum size of *iname* in ``kernel``.

    ``kernel`` is taken from the enclosing scope. The result is asserted
    to be an integer.
    """
    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    bounds = kernel.get_iname_bounds(iname)
    size_aff = static_max_of_pw_aff(bounds.size, constants_only=True)
    size = pw_aff_to_expr(size_aff)

    assert isinstance(size, six.integer_types)
    return size
def _get_int_iname_size(iname):
    """Look up the size of *iname* in ``kernel`` (from the enclosing
    scope) as a constant integer.
    """
    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    iname_size = kernel.get_iname_bounds(iname).size
    max_size_aff = static_max_of_pw_aff(iname_size, constants_only=True)

    size = pw_aff_to_expr(max_size_aff)
    # Callers rely on a plain integer, not a symbolic expression.
    assert isinstance(size, six.integer_types)

    return size
def generate_vectorize_loop(codegen_state, sched_index):
    """Generate code for the loop at *sched_index* as a vectorized loop.

    Falls back to :func:`generate_unroll_loop` (after emitting a warning)
    if the loop length is not constant or the lower bound is not zero.
    Otherwise the code generation state is restricted to the slab
    ``[lower_bound, lower_bound + length)`` and the inner loop nest is
    built with vectorization info attached.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname must actually be substituted into the warning text.
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise as the same type, keeping the original error text.
        raise type(e)("while finding lower bound of '%s': %s"
                % (iname, str(e)))

    if not lower_bound_aff.plain_is_zero():
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
def generate_vectorize_loop(codegen_state, sched_index):
    """Generate code for the loop at *sched_index* as a vectorized loop.

    Falls back to :func:`generate_unroll_loop` (after emitting a warning)
    if the loop length is not constant or the lower bound is not zero.
    Otherwise the code generation state is restricted to the slab
    ``[lower_bound, lower_bound + length)`` and the inner loop nest is
    built with vectorization info attached.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname must actually be substituted into the warning text.
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(kernel, sched_index, codegen_state)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise as the same type, keeping the original error text.
        raise type(e)("while finding lower bound of '%s': %s"
                % (iname, str(e)))

    if not lower_bound_aff.plain_is_zero():
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(kernel, sched_index, codegen_state)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
def determine_temporaries_to_promote(kernel, temporaries, name_gen):
    """
    Build a :class:`PromotedTemporary` for every non-global temporary
    named in *temporaries*.

    :returns: A :class:`dict` mapping temporary names from `temporaries`
        to :class:`PromotedTemporary` objects
    """
    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.kernel.data import LocalIndexTag
    from loopy.symbolic import aff_to_expr

    promoted = {}

    def_lists, use_lists = get_def_and_use_lists_for_all_temporaries(kernel)

    def tag_sort_key(iname):
        # This takes advantage of the fact that g < l in the alphabet :)
        return str(kernel.iname_to_tag[iname])

    for temporary_name in temporaries:
        temporary = kernel.temporary_variables[temporary_name]

        if temporary.scope == temp_var_scope.GLOBAL:
            # Nothing to be done for global temporaries (I hope)
            continue

        assert temporary.base_storage is None, \
            "Cannot promote temporaries with base_storage to global"

        common_hw_inames = get_common_hw_inames(
                kernel,
                def_lists[temporary.name] + use_lists[temporary.name])

        shape_prefix = []
        backing_hw_inames = []

        for iname in sorted(common_hw_inames, key=tag_sort_key):
            tag = kernel.iname_to_tag[iname]

            if (isinstance(tag, LocalIndexTag)
                    and temporary.scope == temp_var_scope.LOCAL):
                # Restrict shape to that of group inames for locals.
                continue

            backing_hw_inames.append(iname)

            iname_size = kernel.get_iname_bounds(iname).size
            shape_prefix.append(
                    aff_to_expr(static_max_of_pw_aff(iname_size, False)))

        promoted[temporary.name] = PromotedTemporary(
                name=name_gen(temporary.name),
                orig_temporary=temporary,
                shape_prefix=tuple(shape_prefix),
                hw_inames=backing_hw_inames)

    return promoted
def determine_temporaries_to_promote(kernel, temporaries, name_gen):
    """
    Describe, for each non-global temporary in *temporaries*, the backing
    storage it gets promoted into.

    :returns: A :class:`dict` mapping temporary names from `temporaries`
        to :class:`PromotedTemporary` objects
    """
    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.kernel.data import LocalIndexTag
    from loopy.symbolic import aff_to_expr

    def_lists, use_lists = get_def_and_use_lists_for_all_temporaries(kernel)

    result = {}

    for name in temporaries:
        temporary = kernel.temporary_variables[name]

        # Nothing to be done for global temporaries (I hope)
        if temporary.scope == temp_var_scope.GLOBAL:
            continue

        assert temporary.base_storage is None, \
            "Cannot promote temporaries with base_storage to global"

        accessing_insns = (
                def_lists[temporary.name] + use_lists[temporary.name])

        # This takes advantage of the fact that g < l in the alphabet :)
        ordered_hw_inames = sorted(
                get_common_hw_inames(kernel, accessing_insns),
                key=lambda iname: str(kernel.iname_to_tag[iname]))

        backing_hw_inames = []
        prefix_dims = []

        for hw_iname in ordered_hw_inames:
            is_local_iname = isinstance(
                    kernel.iname_to_tag[hw_iname], LocalIndexTag)

            # Restrict shape to that of group inames for locals.
            skip = is_local_iname and temporary.scope == temp_var_scope.LOCAL
            if not skip:
                backing_hw_inames.append(hw_iname)
                max_size_aff = static_max_of_pw_aff(
                        kernel.get_iname_bounds(hw_iname).size, False)
                prefix_dims.append(aff_to_expr(max_size_aff))

        result[temporary.name] = PromotedTemporary(
                name=name_gen(temporary.name),
                orig_temporary=temporary,
                shape_prefix=tuple(prefix_dims),
                hw_inames=backing_hw_inames)

    return result
def base_index_and_length(self, set, iname, context=None):
    """Return a ``(base_index, size)`` pair of expressions for one
    dimension of *set*.

    :arg set: the set whose dimension is examined.
    :arg iname: an :class:`int` dimension index, or a variable name that
        is looked up via ``set.space.get_var_dict()``.
    :arg context: forwarded to the static max/value helpers.

    The size is a static maximum of ``dim_max - dim_min + 1``. The base
    index is the static value of ``dim_min``; if that cannot be found,
    the error is re-raised (same type) with the iname mentioned.
    """
    if isinstance(iname, int):
        idx = iname
    else:
        idx = set.space.get_var_dict()[iname][1]

    lower_bound_pw_aff = self.dim_min(set, idx)
    upper_bound_pw_aff = self.dim_max(set, idx)

    from loopy.isl_helpers import static_max_of_pw_aff, static_value_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    extent_pw_aff = upper_bound_pw_aff - lower_bound_pw_aff + 1
    size = pw_aff_to_expr(
            static_max_of_pw_aff(
                extent_pw_aff, constants_only=False, context=context))

    try:
        base_aff = static_value_of_pw_aff(
                lower_bound_pw_aff, constants_only=False, context=context)
        base_index = pw_aff_to_expr(base_aff)
    except Exception as e:
        raise type(e)("while finding lower bound of '%s': %s"
                % (iname, str(e)))

    return base_index, size
def get_constant_iname_length(self, iname):
    """Return the constant length of *iname*, converted to :class:`int`."""
    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import aff_to_expr

    iname_size = self.get_iname_bounds(iname, constants_only=True).size
    max_size_aff = static_max_of_pw_aff(iname_size, constants_only=True)
    length_expr = aff_to_expr(max_size_aff)

    return int(length_expr)
def duplicate_private_temporaries_for_ilp_and_vec(kernel):
    """Add extra axes to temporaries that are written inside ILP- or
    vectorize-tagged inames without being indexed by them.

    For every temporary, each writing instruction is examined: the
    ILP/vec inames of the instruction that do not occur in its write
    dependency names become new trailing axes of the temporary. All
    instructions are then rewritten to index the temporary with those
    inames.

    :raises LoopyError: if two writers of the same temporary require
        different sets of extra inames.
    :returns: a copy of *kernel* with updated temporaries and instructions.
    """
    logger.debug("%s: duplicate temporaries for ilp" % kernel.name)

    wmap = kernel.writer_map()

    from loopy.kernel.data import IlpBaseTag, VectorizeTag

    var_to_new_ilp_inames = {}

    # {{{ find variables that need extra indices

    for tv in six.itervalues(kernel.temporary_variables):
        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]
            ilp_inames = frozenset(iname
                    for iname in kernel.insn_inames(writer_insn)
                    if isinstance(
                        kernel.iname_to_tag.get(iname),
                        (IlpBaseTag, VectorizeTag)))

            referenced_ilp_inames = (ilp_inames
                    & writer_insn.write_dependency_names())

            new_ilp_inames = ilp_inames - referenced_ilp_inames

            if not new_ilp_inames:
                break

            if tv.name in var_to_new_ilp_inames:
                if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
                    # The message has four placeholders; the variable
                    # name must be supplied for the third one.
                    raise LoopyError("instruction '%s' requires adding "
                            "indices for ILP inames '%s' on var '%s', but previous "
                            "instructions required inames '%s'"
                            % (writer_insn_id,
                                ", ".join(new_ilp_inames),
                                tv.name,
                                ", ".join(var_to_new_ilp_inames[tv.name])))

                continue

            var_to_new_ilp_inames[tv.name] = set(new_ilp_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    ilp_iname_to_length = {}
    for ilp_inames in six.itervalues(var_to_new_ilp_inames):
        for iname in ilp_inames:
            if iname in ilp_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=True)
            ilp_iname_to_length[iname] = int(pw_aff_to_expr(
                        static_max_of_pw_aff(bounds.size, constants_only=True)))

            # The new axis is indexed directly by the iname, so the iname
            # is expected to start at zero.
            assert static_max_of_pw_aff(
                    bounds.lower_bound_pw_aff, constants_only=True).plain_is_zero()

    # }}}

    # {{{ change temporary variables

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in six.iteritems(var_to_new_ilp_inames):
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(ilp_iname_to_length[iname] for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape,
                # Forget what you knew about data layout,
                # create from scratch.
                dim_tags=dim_tags)

    # }}}

    from pymbolic import var
    eiii = ExtraInameIndexInserter(
            dict((var_name, tuple(var(iname) for iname in inames))
                for var_name, inames in six.iteritems(var_to_new_ilp_inames)))

    new_insns = [
            insn.with_transformed_expressions(eiii)
            for insn in kernel.instructions]

    return kernel.copy(
        temporary_variables=new_temp_vars,
        instructions=new_insns)
def guess_arg_shape_if_requested(kernel, default_order):
    """Fill in shapes (and strides) of array arguments that request
    automatic determination.

    For every array argument whose shape is ``lp.auto``, the access range
    over all expression instructions is collected and the shape is taken
    as the static maximum of each axis plus one. If no subscripted access
    is found, the argument is treated as a scalar (empty shape). Strides
    set to ``lp.auto`` are then derived from the shape and
    *default_order*.

    :raises LoopyError: if collecting the access range fails with a
        :exc:`TypeError`.
    :raises RuntimeError: if the access range is undetermined because of
        subscripts with undetermined indices.
    :returns: a copy of *kernel* with the updated argument list.
    """
    new_args = []

    import loopy as lp
    from loopy.kernel.array import ArrayBase
    from loopy.symbolic import SubstitutionRuleExpander, AccessRangeMapper

    submap = SubstitutionRuleExpander(kernel.substitutions)

    for arg in kernel.args:
        if isinstance(arg, ArrayBase) and arg.shape is lp.auto:
            armap = AccessRangeMapper(kernel, arg.name)

            try:
                for insn in kernel.instructions:
                    if isinstance(insn, lp.ExpressionInstruction):
                        armap(submap(insn.assignee), kernel.insn_inames(insn))
                        armap(submap(insn.expression), kernel.insn_inames(insn))
            except TypeError as e:
                from traceback import print_exc
                print_exc()

                from loopy.diagnostic import LoopyError
                raise LoopyError(
                        "Failed to (automatically, as requested) find "
                        "shape/strides for argument '%s'. "
                        "Specifying the shape manually should get rid of this. "
                        "The following error occurred: %s"
                        % (arg.name, str(e)))

            if armap.access_range is None:
                if armap.bad_subscripts:
                    raise RuntimeError("cannot determine access range for '%s': "
                            "undetermined index in subscript(s) '%s'"
                            % (arg.name, ", ".join(
                                    str(i) for i in armap.bad_subscripts)))

                # no subscripts found, let's call it a scalar
                shape = ()
            else:
                from loopy.isl_helpers import static_max_of_pw_aff
                from loopy.symbolic import pw_aff_to_expr

                shape = []
                for i in range(armap.access_range.dim(dim_type.set)):
                    try:
                        shape.append(
                                pw_aff_to_expr(static_max_of_pw_aff(
                                    kernel.cache_manager.dim_max(
                                        armap.access_range, i) + 1,
                                    constants_only=False)))
                    except Exception:
                        # Add context for the user, then let the original
                        # error propagate unchanged.
                        print("While trying to find shape axis %d of "
                                "argument '%s', the following "
                                "exception occurred:" % (i, arg.name),
                                file=sys.stderr)
                        raise

                shape = tuple(shape)

            # arg.shape is known to be lp.auto in this branch.
            arg = arg.copy(shape=shape)

            try:
                arg.strides
            except AttributeError:
                # Not every array type has strides.
                pass
            else:
                if arg.strides is lp.auto:
                    from loopy.kernel.data import make_strides
                    arg = arg.copy(strides=make_strides(shape, default_order))

        new_args.append(arg)

    return kernel.copy(args=new_args)
def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
    """Generate code for the sequential loop at *sched_index*.

    The loop is split into slabs (see :func:`get_slab_decomposition`).
    For each slab, static lower and upper bounds of the loop iname are
    found, the inner loop nest is built under those bounds, and either a
    plain variable initialization (single-trip loop) or a target-specific
    sequential loop is emitted.

    :returns: a code block containing the code for all slabs.
    """
    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(
            kernel, loop_iname, sched_index, codegen_state)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, across_dim_types=True,
                obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
                dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for iname in dom_and_slab.get_var_names(dim_type.set):
            if iname in usable_inames:
                moved_inames.append(iname)
                dt, idx = dom_and_slab.get_var_dict()[iname]
                dom_and_slab = dom_and_slab.move_dims(
                        dim_type.param, dom_and_slab.dim(dim_type.param),
                        dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        from loopy.isl_helpers import (
                static_min_of_pw_aff,
                static_max_of_pw_aff)

        lbound = (
                kernel.cache_manager.dim_min(
                    dom_and_slab, loop_iname_idx)
                .gist(kernel.assumptions)
                .coalesce())
        ubound = (
                kernel.cache_manager.dim_max(
                    dom_and_slab, loop_iname_idx)
                .gist(kernel.assumptions)
                .coalesce())

        static_lbound = static_min_of_pw_aff(
                lbound,
                constants_only=False)
        static_ubound = static_max_of_pw_aff(
                ubound,
                constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        from loopy.isl_helpers import make_slab_from_bound_pwaffs

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(
                dom_and_slab.space,
                loop_iname, static_lbound, static_ubound)

        # Undo the move of usable inames into the parameter space.
        for iname in moved_inames:
            dt, idx = impl_slab.get_var_dict()[iname]
            impl_slab = impl_slab.move_dims(
                    dim_type.set, impl_slab.dim(dim_type.set),
                    dt, idx, 1)

        new_codegen_state = codegen_state.intersect(impl_slab)

        # The slab restricts loop_iname, so that is the iname whose
        # domain gets intersected--not whichever name the loops above
        # happened to leave behind in 'iname'.
        inner = build_loop_nest(
                intersect_kernel_with_slab(
                    kernel, slab, loop_iname),
                sched_index+1, new_codegen_state)

        # }}}

        if cmt is not None:
            from cgen import Comment
            result.append(Comment(cmt))

        from cgen import Initializer, POD, Const, Line
        from loopy.symbolic import aff_to_expr

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(gen_code_block([
                Initializer(Const(POD(kernel.index_dtype, loop_iname)),
                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
                Line(),
                inner,
                ]))

        else:
            result.append(
                kernel.target.emit_sequential_loop(
                    codegen_state, loop_iname, kernel.index_dtype,
                    static_lbound, static_ubound, inner))

    return gen_code_block(result)
def determine_temporaries_to_promote(kernel, temporaries, name_gen):
    """
    For each temporary in the passed list of temporaries, construct a
    :class:`PromotedTemporary` which describes how the temporary should
    get promoted into global storage.

    :returns: A :class:`dict` mapping temporary names from `temporaries` to
              :class:`PromotedTemporary` objects
    """
    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.kernel.data import LocalIndexTag
    from loopy.symbolic import aff_to_expr

    def_lists, use_lists = get_def_and_use_lists_for_all_temporaries(kernel)

    promoted_by_name = {}

    for temporary_name in temporaries:
        temporary = kernel.temporary_variables[temporary_name]

        if temporary.scope == temp_var_scope.GLOBAL:
            # Nothing to be done for global temporaries (I hope)
            continue

        assert temporary.base_storage is None, \
            "Cannot promote temporaries with base_storage to global"

        # `hw_inames`: The set of hw-parallel tagged inames that this
        # temporary is associated with. This is used for determining the
        # shape of the global storage needed for saving and restoring the
        # temporary across kernel calls.
        #
        # TODO: Make a policy decision about which dimensions to use.
        # Currently, the code looks at each instruction that defines or
        # uses the temporary, and takes the common set of hw-parallel
        # tagged inames associated with these instructions.
        #
        # Furthermore, in the case of local temporaries, inames that are
        # tagged hw-local do not contribute to the global storage shape.
        touching_insns = def_lists[temporary.name] + use_lists[temporary.name]
        hw_inames = get_common_hw_inames(kernel, touching_insns)

        # This takes advantage of the fact that g < l in the alphabet :)
        hw_inames = sorted(
                hw_inames,
                key=lambda iname: str(kernel.iname_to_tag[iname]))

        # Calculate the sizes of the dimensions that get added in front for
        # the global storage of the temporary.
        shape_prefix = []
        backing_hw_inames = []

        for iname in hw_inames:
            is_local_iname = isinstance(
                    kernel.iname_to_tag[iname], LocalIndexTag)

            if is_local_iname and temporary.scope == temp_var_scope.LOCAL:
                # Restrict shape to that of group inames for locals.
                continue

            backing_hw_inames.append(iname)

            max_size_aff = static_max_of_pw_aff(
                    kernel.get_iname_bounds(iname).size, False)
            shape_prefix.append(aff_to_expr(max_size_aff))

        promoted_by_name[temporary.name] = PromotedTemporary(
                name=name_gen(temporary.name),
                orig_temporary=temporary,
                shape_prefix=tuple(shape_prefix),
                hw_inames=backing_hw_inames)

    return promoted_by_name
def base_index_and_length(self, set, iname, context=None,
        n_allowed_params_in_length=None):
    """
    Return a ``(base_index, length)`` pair of expressions for one
    dimension of *set*.

    :arg set: the set whose dimension is examined.
    :arg iname: an :class:`int` dimension index, or a variable name that
        is looked up via ``set.space.get_var_dict()``.
    :arg context: forwarded to the static min/max/value helpers.
    :arg n_allowed_params_in_length: Simplifies the 'length'
        argument so that only the first that many params
        (in the domain of *set*) occur.
    """
    if isinstance(iname, int):
        idx = iname
    else:
        idx = set.space.get_var_dict()[iname][1]

    lower_bound_pw_aff = self.dim_min(set, idx)
    upper_bound_pw_aff = self.dim_max(set, idx)

    from loopy.diagnostic import StaticValueFindingError
    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_min_of_pw_aff,
            static_value_of_pw_aff,
            find_max_of_pwaff_with_params)
    from loopy.symbolic import pw_aff_to_expr

    # {{{ pick the base: exact static value first, lower bound as fallback

    try:
        base_aff = static_value_of_pw_aff(
                lower_bound_pw_aff, constants_only=False, context=context)
    except StaticValueFindingError:
        base_aff = None

    if base_aff is None:
        base_aff = static_min_of_pw_aff(
                lower_bound_pw_aff, constants_only=False, context=context)

    # }}}

    base_index = pw_aff_to_expr(base_aff)

    # Limit the params occurring in the extent, then take its static max.
    limited_extent = find_max_of_pwaff_with_params(
            upper_bound_pw_aff - base_aff + 1,
            n_allowed_params_in_length)
    length = pw_aff_to_expr(
            static_max_of_pw_aff(
                limited_extent, constants_only=False, context=context))

    return base_index, length
def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False):
    """Return a tuple (global_size, local_size) containing a grid that
    could accommodate execution of all instructions whose IDs are given
    in *insn_ids*.

    :arg insn_ids: a :class:`frozenset` of instruction IDs
    :arg ignore_auto: if true, inames carrying an automatic local index
        tag are skipped instead of causing an error.

    *global_size* and *local_size* are :class:`islpy.PwAff` objects.
    """
    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.kernel.data import (
            GroupIndexTag, LocalIndexTag,
            AutoLocalIndexTagBase)

    all_inames_by_insns = set()
    for insn_id in insn_ids:
        all_inames_by_insns |= self.insn_inames(insn_id)

    if not all_inames_by_insns <= self.all_inames():
        raise RuntimeError("some inames collected from instructions (%s) "
                "are not present in domain (%s)"
                % (", ".join(sorted(all_inames_by_insns)),
                    ", ".join(sorted(self.all_inames()))))

    global_sizes = {}
    local_sizes = {}

    for iname in all_inames_by_insns:
        tag = self.iname_to_tag.get(iname)

        if isinstance(tag, GroupIndexTag):
            axis_to_size = global_sizes
        elif isinstance(tag, LocalIndexTag):
            axis_to_size = local_sizes
        elif isinstance(tag, AutoLocalIndexTagBase) and not ignore_auto:
            raise RuntimeError("cannot find grid sizes if automatic "
                    "local index tags are present")
        else:
            # Not a hardware-parallel iname: no influence on the grid.
            continue

        size = self.get_iname_bounds(iname).size

        if tag.axis in axis_to_size:
            size = axis_to_size[tag.axis].max(size)

        try:
            # insist block size is constant
            size = static_max_of_pw_aff(size,
                    constants_only=isinstance(tag, LocalIndexTag))
        except ValueError:
            # Keep the non-static size.
            pass

        axis_to_size[tag.axis] = size

    def to_dim_tuple(size_dict, which, forced_sizes=None):
        # Work on a private copy, entries are consumed below.
        forced_sizes = {} if forced_sizes is None else forced_sizes.copy()

        size_list = []
        sorted_axes = sorted(six.iterkeys(size_dict))

        while sorted_axes or forced_sizes:
            cur_axis = sorted_axes.pop(0) if sorted_axes else None

            next_pos = len(size_list)
            if next_pos in forced_sizes:
                size_list.append(forced_sizes.pop(next_pos))
                continue

            assert cur_axis is not None

            if cur_axis > next_pos:
                raise RuntimeError("%s axis %d unused" % (
                    which, next_pos))

            size_list.append(size_dict[cur_axis])

        return tuple(size_list)

    global_size = to_dim_tuple(global_sizes, "global")
    local_size = to_dim_tuple(
            local_sizes, "local", forced_sizes=self.local_sizes)

    return (global_size, local_size)
def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
    """Add extra axes to temporaries that are written inside ILP- or
    vectorize-tagged inames without being indexed by them.

    :arg iname: if given, only this iname is considered (it must carry an
        ILP or vectorize tag); otherwise all ILP/vec inames of each
        writing instruction are considered.

    For every temporary, the considered inames that do not occur in the
    write dependency names of a writing instruction become new trailing
    axes of the temporary. All instructions are then rewritten to index
    the temporary with those inames.

    :raises LoopyError: if *iname* is given but not ILP/vec-tagged, or if
        two writers of the same temporary require different sets of
        extra inames.
    :returns: a copy of *kernel* with updated temporaries and instructions.
    """
    if iname is not None:
        logger.debug("%s: add axes to temporaries for ilp" % kernel.name)

    wmap = kernel.writer_map()

    from loopy.kernel.data import IlpBaseTag, VectorizeTag

    var_to_new_ilp_inames = {}

    # {{{ find variables that need extra indices

    for tv in six.itervalues(kernel.temporary_variables):
        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]

            if iname is None:
                ilp_inames = frozenset(
                        candidate
                        for candidate in kernel.insn_inames(writer_insn)
                        if isinstance(
                            kernel.iname_to_tag.get(candidate),
                            (IlpBaseTag, VectorizeTag)))
            else:
                if not isinstance(
                        kernel.iname_to_tag.get(iname),
                        (IlpBaseTag, VectorizeTag)):
                    raise LoopyError(
                            "'%s' is not an ILP iname"
                            % iname)

                ilp_inames = frozenset([iname])

            referenced_ilp_inames = (ilp_inames
                    & writer_insn.write_dependency_names())

            new_ilp_inames = ilp_inames - referenced_ilp_inames

            if not new_ilp_inames:
                break

            if tv.name in var_to_new_ilp_inames:
                if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
                    # The message has four placeholders; the variable
                    # name must be supplied for the third one.
                    raise LoopyError(
                            "instruction '%s' requires adding "
                            "indices for ILP inames '%s' on var '%s', but previous "
                            "instructions required inames '%s'"
                            % (writer_insn_id,
                                ", ".join(new_ilp_inames),
                                tv.name,
                                ", ".join(var_to_new_ilp_inames[tv.name])))

                continue

            var_to_new_ilp_inames[tv.name] = set(new_ilp_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    ilp_iname_to_length = {}
    for ilp_inames in six.itervalues(var_to_new_ilp_inames):
        for ilp_iname in ilp_inames:
            if ilp_iname in ilp_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(ilp_iname, constants_only=True)
            ilp_iname_to_length[ilp_iname] = int(
                    pw_aff_to_expr(
                        static_max_of_pw_aff(
                            bounds.size, constants_only=True)))

            # The new axis is indexed directly by the iname, so the iname
            # is expected to start at zero.
            assert static_max_of_pw_aff(
                    bounds.lower_bound_pw_aff,
                    constants_only=True).plain_is_zero()

    # }}}

    # {{{ change temporary variables

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in six.iteritems(var_to_new_ilp_inames):
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(
                ilp_iname_to_length[ilp_iname] for ilp_iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, ilp_iname in enumerate(inames):
            if isinstance(kernel.iname_to_tag.get(ilp_iname), VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(
                shape=shape + extra_shape,
                # Forget what you knew about data layout,
                # create from scratch.
                dim_tags=dim_tags,
                dim_names=None)

    # }}}

    from pymbolic import var
    eiii = ExtraInameIndexInserter(
            dict((var_name, tuple(var(ilp_iname) for ilp_iname in inames))
                for var_name, inames in six.iteritems(var_to_new_ilp_inames)))

    new_insns = [
            insn.with_transformed_expressions(eiii)
            for insn in kernel.instructions]

    return kernel.copy(
        temporary_variables=new_temp_vars,
        instructions=new_insns)
def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
    """Join several inames into a single new iname.

    :arg inames: fastest varying last
    :arg new_iname: name of the joined iname; generated from *inames* if
        not given.
    :arg tag: if given, a tag applied to the new iname.
    :arg within: a stack match
        as understood by :func:`loopy.match.parse_stack_match`.

    :raises LoopyError: if one of *inames* is not 'at home' in the
        join's leaf domain.
    :returns: the transformed kernel.
    """

    # now fastest varying first
    inames = inames[::-1]

    if new_iname is None:
        new_iname = kernel.get_var_name_generator()("_and_".join(inames))

    from loopy.kernel.tools import DomainChanger
    domch = DomainChanger(kernel, frozenset(inames))
    for iname in inames:
        if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
            raise LoopyError("iname '%s' is not 'at home' in the "
                    "join's leaf domain" % iname)

    new_domain = domch.domain
    new_dim_idx = new_domain.dim(dim_type.set)
    new_domain = new_domain.add_dims(dim_type.set, 1)
    new_domain = new_domain.set_dim_name(dim_type.set, new_dim_idx, new_iname)

    joint_aff = zero = isl.Aff.zero_on_domain(new_domain.space)
    subst_dict = {}
    base_divisor = 1

    from pymbolic import var

    for i, iname in enumerate(inames):
        iname_dt, iname_idx = zero.get_space().get_var_dict()[iname]
        iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1)

        joint_aff = joint_aff + base_divisor*iname_aff

        bounds = kernel.get_iname_bounds(iname, constants_only=True)

        from loopy.isl_helpers import (
                static_max_of_pw_aff, static_value_of_pw_aff)
        from loopy.symbolic import pw_aff_to_expr

        length = int(pw_aff_to_expr(
            static_max_of_pw_aff(bounds.size, constants_only=True)))

        try:
            lower_bound_aff = static_value_of_pw_aff(
                    bounds.lower_bound_pw_aff.coalesce(),
                    constants_only=False)
        except Exception as e:
            # Re-raise as the same type, keeping the original error text.
            raise type(e)("while finding lower bound of '%s': %s"
                    % (iname, str(e)))

        # Expression recovering this iname from the joined iname.
        my_val = var(new_iname) // base_divisor
        if i+1 < len(inames):
            my_val %= length
        my_val += pw_aff_to_expr(lower_bound_aff)
        subst_dict[iname] = my_val

        base_divisor *= length

    from loopy.isl_helpers import iname_rel_aff
    new_domain = new_domain.add_constraint(
            isl.Constraint.equality_from_aff(
                iname_rel_aff(new_domain.get_space(), new_iname, "==", joint_aff)))

    for i, iname in enumerate(inames):
        iname_to_dim = new_domain.get_space().get_var_dict()
        iname_dt, iname_idx = iname_to_dim[iname]

        # The old inames are only dropped from the domain if the join
        # applies everywhere.
        if within is None:
            new_domain = new_domain.project_out(iname_dt, iname_idx, 1)

    def subst_within_inames(fid):
        result = set()
        for iname in fid:
            if iname in inames:
                result.add(new_iname)
            else:
                result.add(iname)

        return frozenset(result)

    new_insns = [
            insn.copy(
                within_inames=subst_within_inames(insn.within_inames))
            for insn in kernel.instructions]

    kernel = (kernel
            .copy(
                instructions=new_insns,
                domains=domch.get_domains_with(new_domain),
                applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_dict]
                ))

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    ijoin = _InameJoiner(rule_mapping_context, within,
            make_subst_func(subst_dict),
            inames, new_iname)

    kernel = rule_mapping_context.finish_kernel(
            ijoin.map_kernel(kernel))

    if tag is not None:
        kernel = tag_inames(kernel, {new_iname: tag})

    return kernel
def auto_promote_temporary(self, temporary_name):
    """Describe the global save slot backing the temporary *temporary_name*.

    Returns *None* for global temporaries and for temporaries with an
    initializer; otherwise returns a ``self.PromotedTemporary``.

    :raises ValueError: if the temporary has ``base_storage``.
    """
    tv = self.kernel.temporary_variables[temporary_name]

    if tv.scope == temp_var_scope.GLOBAL:
        # Nothing to be done for global temporaries (I hope)
        return None

    if tv.initializer is not None:
        # Temporaries with initializers do not need saving/reloading - the
        # code generation takes care of emitting the initializers.
        assert tv.read_only
        return None

    if tv.base_storage is not None:
        raise ValueError(
            "Cannot promote temporaries with base_storage to global")

    # The hw-parallel tagged inames this temporary is associated with
    # determine the shape of the global storage needed for saving and
    # restoring it across kernel calls.
    #
    # TODO: Make a policy decision about which dimensions to use. Currently,
    # the code looks at each instruction that defines or uses the temporary,
    # and takes the common set of hw-parallel tagged inames associated with
    # these instructions.
    #
    # Furthermore, in the case of local temporaries, inames that are tagged
    # hw-local do not contribute to the global storage shape.
    touching_insns = self.insn_query.insns_reading_or_writing(tv.name)
    common_hw_inames = self.insn_query.common_hw_inames(touching_insns)

    def tag_sort_key(iname):
        return str(self.kernel.iname_to_tag[iname])

    # We want the inames arranged according to the order:
    #    g.0 < g.1 < ... < l.0 < l.1 < ...
    # Sorting the tag strings lexicographically accomplishes this.
    ordered_hw_inames = sorted(common_hw_inames, key=tag_sort_key)

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.kernel.data import LocalIndexTag
    from loopy.symbolic import aff_to_expr

    tv_is_local = tv.scope == temp_var_scope.LOCAL

    # Sizes of the dimensions that get added in front for the global
    # storage of the temporary, and the inames they belong to.
    hw_dims = []
    backing_hw_inames = []

    for iname in ordered_hw_inames:
        iname_tag = self.kernel.iname_to_tag[iname]

        if isinstance(iname_tag, LocalIndexTag) and tv_is_local:
            # Restrict shape to that of group inames for locals.
            continue

        backing_hw_inames.append(iname)
        iname_size = self.kernel.get_iname_bounds(iname).size
        hw_dims.append(aff_to_expr(static_max_of_pw_aff(iname_size, False)))

    non_hw_dims = tv.shape

    if len(non_hw_dims) == 0 and len(hw_dims) == 0:
        # Scalar not in hardware: ensure at least one dimension.
        non_hw_dims = (1,)

    return self.PromotedTemporary(
        name=self.var_name_gen(tv.name + "_save_slot"),
        orig_temporary=tv,
        hw_dims=tuple(hw_dims),
        non_hw_dims=non_hw_dims,
        hw_inames=backing_hw_inames)
def privatize_temporaries_with_inames(
        kernel, privatizing_inames, only_var_names=None):
    """This function provides each loop iteration of the *privatizing_inames*
    with its own private entry in the temporaries it accesses (possibly
    restricted to *only_var_names*).

    This is accomplished implicitly as part of generating instruction-level
    parallelism by the "ILP" tag and accessible separately through this
    transformation.

    Example::

        for imatrix, i
            acc = 0
            for k
                acc = acc + a[imatrix, i, k] * vec[k]
            end
        end

    might become::

        for imatrix, i
            acc[imatrix] = 0
            for k
                acc[imatrix] = acc[imatrix] + a[imatrix, i, k] * vec[k]
            end
        end

    facilitating loop interchange of the *imatrix* loop.

    :arg privatizing_inames: a set of iname names, or a comma-separated
        string of them.
    :arg only_var_names: *None*, a collection of temporary names, or a
        comma-separated string of them.
    :raises LoopyError: if writers of one temporary disagree on the inames
        to add, if a privatizing iname does not have a lower bound of zero,
        or if an instruction touches a privatized variable without being
        nested inside the required iname(s).

    .. versionadded:: 2018.1
    """

    if isinstance(privatizing_inames, str):
        privatizing_inames = frozenset(
            s.strip() for s in privatizing_inames.split(","))

    if isinstance(only_var_names, str):
        only_var_names = frozenset(
            s.strip() for s in only_var_names.split(","))

    wmap = kernel.writer_map()

    var_to_new_priv_axis_iname = {}

    # {{{ find variables that need extra indices

    for tv in six.itervalues(kernel.temporary_variables):
        if only_var_names is not None and tv.name not in only_var_names:
            continue

        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]

            priv_axis_inames = (
                kernel.insn_inames(writer_insn) & privatizing_inames)

            # Inames the write already depends on need no new axis.
            referenced_priv_axis_inames = (
                priv_axis_inames & writer_insn.write_dependency_names())

            new_priv_axis_inames = (
                priv_axis_inames - referenced_priv_axis_inames)

            if not new_priv_axis_inames:
                # NOTE(review): this stops looking at the remaining writers
                # of this variable; confirm `continue` was not intended.
                break

            if tv.name in var_to_new_priv_axis_iname:
                if new_priv_axis_inames != set(
                        var_to_new_priv_axis_iname[tv.name]):
                    raise LoopyError("instruction '%s' requires adding "
                            "indices for privatizing var '%s' on iname(s) "
                            "'%s', but previous instructions required "
                            "inames '%s'"
                            % (writer_insn_id, tv.name,
                                ", ".join(sorted(new_priv_axis_inames)),
                                ", ".join(sorted(
                                    var_to_new_priv_axis_iname[tv.name]))))

                continue

            var_to_new_priv_axis_iname[tv.name] = set(new_priv_axis_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    priv_axis_iname_to_length = {}
    for priv_axis_inames in six.itervalues(var_to_new_priv_axis_iname):
        for iname in priv_axis_inames:
            if iname in priv_axis_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=False)
            priv_axis_iname_to_length[iname] = pw_aff_to_expr(
                static_max_of_pw_aff(bounds.size, constants_only=False))

            # The iname is used directly as an index into the new axis, so
            # it must start at zero. Raised explicitly (rather than
            # asserted) so the check survives `python -O`.
            if not static_max_of_pw_aff(
                    bounds.lower_bound_pw_aff,
                    constants_only=True).plain_is_zero():
                raise LoopyError(
                    "privatizing iname '%s' must have a lower bound of zero"
                    % iname)

    # }}}

    # {{{ change temporary variables

    from loopy.kernel.data import VectorizeTag

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in six.iteritems(var_to_new_priv_axis_iname):
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(
            priv_axis_iname_to_length[iname] for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if kernel.iname_tags_of_type(iname, VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(
            shape=shape + extra_shape,
            # Forget what you knew about data layout,
            # create from scratch.
            dim_tags=dim_tags,
            dim_names=None)

    # }}}

    from pymbolic import var
    var_to_extra_iname = dict(
        (var_name, tuple(var(iname) for iname in inames))
        for var_name, inames in six.iteritems(var_to_new_priv_axis_iname))

    new_insns = []

    for insn in kernel.instructions:
        # A fresh mapper per instruction, so seen_priv_axis_inames reflects
        # this instruction only.
        eiii = ExtraInameIndexInserter(var_to_extra_iname)
        new_insn = insn.with_transformed_expressions(eiii)
        if not eiii.seen_priv_axis_inames <= insn.within_inames:
            raise LoopyError(
                "Kernel '%s': Instruction '%s': touched variable that "
                "(for privatization, e.g. as performed for ILP) "
                "required iname(s) '%s', but that the instruction was not "
                "previously within the iname(s). To remedy this, first "
                "promote the instruction into the iname."
                % (kernel.name, insn.id, ", ".join(sorted(
                    eiii.seen_priv_axis_inames - insn.within_inames))))

        new_insns.append(new_insn)

    return kernel.copy(
        temporary_variables=new_temp_vars,
        instructions=new_insns)
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    """Generate code for the sequential loop entered at *sched_index*.

    The loop's iname is taken from ``kernel.schedule[sched_index]``. One
    piece of code is generated per slab returned by
    :func:`get_slab_decomposition`, and the pieces are combined with
    :func:`merge_codegen_results`. A slab whose static lower and upper
    bounds coincide is emitted as a constant initializer instead of a loop.
    """
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper
    astb = codegen_state.ast_builder
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    from loopy.codegen.bounds import get_usable_inames_for_conditional
    from loopy.isl_helpers import (
        make_slab_from_bound_pwaffs,
        static_max_of_pw_aff, static_min_of_pw_aff)
    from loopy.symbolic import aff_to_expr

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        # Only label the slabs when there is more than one of them.
        if len(slabs) == 1:
            cmt = None
        else:
            cmt = "%s slab for '%s'" % (slab_name, loop_iname)

        # {{{ find bounds

        aligned_domain = isl.align_spaces(
            domain, slab, across_dim_types=True, obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
            dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for usable_iname in dom_and_slab.get_var_names(dim_type.set):
            if usable_iname in usable_inames:
                moved_inames.append(usable_iname)
                dt, idx = dom_and_slab.get_var_dict()[usable_iname]
                dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param),
                    dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        lbound = (
            kernel.cache_manager.dim_min(dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .coalesce())
        ubound = (
            kernel.cache_manager.dim_max(dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .coalesce())

        static_lbound = static_min_of_pw_aff(lbound, constants_only=False)
        static_ubound = static_max_of_pw_aff(ubound, constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(
            dom_and_slab.space, loop_iname, static_lbound, static_ubound)

        # Undo the move above: turn the moved inames back into set dims.
        for moved_iname in moved_inames:
            dt, idx = impl_slab.get_var_dict()[moved_iname]
            impl_slab = impl_slab.move_dims(
                dim_type.set, impl_slab.dim(dim_type.set),
                dt, idx, 1)

        # The slab describes loop_iname, so that is the iname handed to
        # intersect_kernel_with_slab. This used to pass whatever name the
        # preceding for loops happened to leave behind.
        new_codegen_state = (
            codegen_state
            .intersect(impl_slab)
            .copy(kernel=intersect_kernel_with_slab(
                kernel, slab, loop_iname)))

        inner = build_loop_nest(new_codegen_state, sched_index + 1)

        # }}}

        if cmt is not None:
            result.append(astb.emit_comment(cmt))

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(merge_codegen_results(codegen_state, [
                astb.emit_initializer(
                    codegen_state,
                    kernel.index_dtype, loop_iname,
                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i"),
                    is_const=True),
                astb.emit_blank_line(),
                inner,
                ]))

        else:
            inner_ast = inner.current_ast(codegen_state)
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(
                        codegen_state, loop_iname, kernel.index_dtype,
                        static_lbound, static_ubound, inner_ast)))

    return merge_codegen_results(codegen_state, result)
def guess_var_shape(kernel, var_name):
    """Guess the shape of *var_name* from the way *kernel* accesses it.

    :returns: a tuple with one entry per axis. This is ``()`` if no
        subscripts of the variable were found, and ``(None,)`` if the
        access range could not be determined but all subscripts have
        exactly one axis.
    :raises LoopyError: if the access range cannot be determined and the
        variable is not one-dimensional, if linear subscripts are involved,
        or if a :class:`TypeError` occurs while collecting access ranges.
    :raises RuntimeError: if subscripts with differing numbers of axes were
        found.
    """
    from loopy.symbolic import SubstitutionRuleExpander, AccessRangeMapper

    armap = AccessRangeMapper(kernel, var_name)

    submap = SubstitutionRuleExpander(kernel.substitutions)

    def run_through_armap(expr):
        # 'insn' is the loop variable of the for loop below; this function
        # is only called from within that loop.
        armap(submap(expr), kernel.insn_inames(insn))
        return expr

    try:
        for insn in kernel.instructions:
            insn.with_transformed_expressions(run_through_armap)
    except TypeError as e:
        from traceback import print_exc
        print_exc()

        raise LoopyError(
            "Failed to (automatically, as requested) find "
            "shape/strides for variable '%s'. "
            "Specifying the shape manually should get rid of this. "
            "The following error occurred: %s"
            % (var_name, str(e)))

    if armap.access_range is None:
        if armap.bad_subscripts:
            from loopy.symbolic import LinearSubscript
            if any(isinstance(sub, LinearSubscript)
                    for sub in armap.bad_subscripts):
                raise LoopyError(
                    "cannot determine access range for '%s': "
                    "linear subscript(s) in '%s'"
                    % (var_name, ", ".join(
                        str(i) for i in armap.bad_subscripts)))

            n_axes_in_subscripts = set(
                len(sub.index_tuple) for sub in armap.bad_subscripts)

            if len(n_axes_in_subscripts) != 1:
                raise RuntimeError("subscripts of '%s' with differing "
                        "numbers of axes were found" % var_name)

            n_axes, = n_axes_in_subscripts

            if n_axes == 1:
                # Leave shape undetermined--we can live with that for 1D.
                shape = (None,)
            else:
                raise LoopyError(
                    "cannot determine access range for '%s': "
                    "undetermined index in subscript(s) '%s'"
                    % (var_name, ", ".join(
                        str(i) for i in armap.bad_subscripts)))

        else:
            # no subscripts found, let's call it a scalar
            shape = ()
    else:
        from loopy.isl_helpers import static_max_of_pw_aff
        from loopy.symbolic import pw_aff_to_expr

        shape = []
        for i in range(armap.access_range.dim(dim_type.set)):
            try:
                # Axis length is the largest accessed index plus one.
                shape.append(
                    pw_aff_to_expr(static_max_of_pw_aff(
                        kernel.cache_manager.dim_max(
                            armap.access_range, i) + 1,
                        constants_only=False)))
            except Exception:
                # Only ordinary errors get the advice below; a bare except
                # would also print it for KeyboardInterrupt/SystemExit.
                print("While trying to find shape axis %d of "
                        "variable '%s', the following "
                        "exception occurred:" % (i, var_name),
                        file=sys.stderr)
                print("*** ADVICE: You may need to manually specify the "
                        "shape of argument '%s'." % (var_name),
                        file=sys.stderr)
                raise

        shape = tuple(shape)

    return shape
def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
    """Join *inames* into a single linearized iname and return the new kernel.

    :arg inames: the inames to join, fastest varying last
    :arg new_iname: name of the joined iname. If *None*, a name is generated
        from the names in *inames*.
    :arg tag: if not *None*, *new_iname* is tagged with it via
        :func:`tag_inames` before the kernel is returned.
    :arg within: a stack match as understood by
        :func:`loopy.context_matching.parse_stack_match`.
    :raises LoopyError: if one of *inames* is not 'at home' in the join's
        leaf domain.
    """

    # now fastest varying first
    inames = inames[::-1]

    if new_iname is None:
        new_iname = kernel.get_var_name_generator()("_and_".join(inames))

    from loopy.kernel.tools import DomainChanger
    domch = DomainChanger(kernel, frozenset(inames))
    for iname in inames:
        if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
            raise LoopyError("iname '%s' is not 'at home' in the "
                    "join's leaf domain" % iname)

    # Append one set dimension, named new_iname, to the leaf domain.
    new_domain = domch.domain
    new_dim_idx = new_domain.dim(dim_type.set)
    new_domain = new_domain.add_dims(dim_type.set, 1)
    new_domain = new_domain.set_dim_name(dim_type.set, new_dim_idx, new_iname)

    joint_aff = zero = isl.Aff.zero_on_domain(new_domain.space)
    subst_dict = {}
    base_divisor = 1

    from pymbolic import var
    from loopy.isl_helpers import (
        static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    for i, iname in enumerate(inames):
        iname_dt, iname_idx = zero.get_space().get_var_dict()[iname]
        iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1)

        joint_aff = joint_aff + base_divisor*iname_aff

        bounds = kernel.get_iname_bounds(iname, constants_only=True)

        length = int(pw_aff_to_expr(
            static_max_of_pw_aff(bounds.size, constants_only=True)))

        try:
            lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
        except Exception as e:
            # Keep the original error text; the previous message ended in a
            # dangling ": " and dropped the actual cause.
            raise type(e)(
                "while finding lower bound of '%s': %s" % (iname, e))

        # Expression recovering the old iname from the joined one.
        my_val = var(new_iname) // base_divisor
        if i+1 < len(inames):
            # The slowest-varying iname needs no wrap-around.
            my_val %= length
        my_val += pw_aff_to_expr(lower_bound_aff)
        subst_dict[iname] = my_val

        base_divisor *= length

    from loopy.isl_helpers import iname_rel_aff
    new_domain = new_domain.add_constraint(
        isl.Constraint.equality_from_aff(
            iname_rel_aff(
                new_domain.get_space(), new_iname, "==", joint_aff)))

    if within is None:
        # The old inames are only projected out when the join applies
        # everywhere.
        for iname in inames:
            iname_dt, iname_idx = new_domain.get_space().get_var_dict()[iname]
            new_domain = new_domain.project_out(iname_dt, iname_idx, 1)

    def subst_forced_iname_deps(fid):
        # Replace every joined iname in *fid* by new_iname.
        return frozenset(
            new_iname if iname in inames else iname
            for iname in fid)

    new_insns = [
        insn.copy(
            forced_iname_deps=subst_forced_iname_deps(insn.forced_iname_deps))
        for insn in kernel.instructions]

    kernel = kernel.copy(
        instructions=new_insns,
        domains=domch.get_domains_with(new_domain),
        applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_dict])

    from loopy.context_matching import parse_stack_match
    within = parse_stack_match(within)

    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
        kernel.substitutions, kernel.get_var_name_generator())
    ijoin = _InameJoiner(
        rule_mapping_context, within,
        make_subst_func(subst_dict),
        inames, new_iname)

    kernel = rule_mapping_context.finish_kernel(ijoin.map_kernel(kernel))

    if tag is not None:
        kernel = tag_inames(kernel, {new_iname: tag})

    return kernel
def privatize_temporaries_with_inames(kernel, privatizing_inames,
                                      only_var_names=None):
    """This function provides each loop iteration of the *privatizing_inames*
    with its own private entry in the temporaries it accesses (possibly
    restricted to *only_var_names*).

    This is accomplished implicitly as part of generating instruction-level
    parallelism by the "ILP" tag and accessible separately through this
    transformation.

    Example::

        for imatrix, i
            acc = 0
            for k
                acc = acc + a[imatrix, i, k] * vec[k]
            end
        end

    might become::

        for imatrix, i
            acc[imatrix] = 0
            for k
                acc[imatrix] = acc[imatrix] + a[imatrix, i, k] * vec[k]
            end
        end

    facilitating loop interchange of the *imatrix* loop.

    :arg privatizing_inames: a set of iname names, or a comma-separated
        string of them.
    :arg only_var_names: *None*, a collection of temporary names, or a
        comma-separated string of them.
    :raises LoopyError: if writers of one temporary disagree on the inames
        to add, or if an instruction touches a privatized variable without
        being nested inside the required iname(s).

    .. versionadded:: 2018.1
    """

    if isinstance(privatizing_inames, str):
        privatizing_inames = frozenset(
            s.strip() for s in privatizing_inames.split(","))

    if isinstance(only_var_names, str):
        only_var_names = frozenset(
            s.strip() for s in only_var_names.split(","))

    wmap = kernel.writer_map()

    var_to_new_priv_axis_iname = {}

    # {{{ find variables that need extra indices

    for tv in kernel.temporary_variables.values():
        if only_var_names is not None and tv.name not in only_var_names:
            continue

        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]

            priv_axis_inames = writer_insn.within_inames & privatizing_inames

            # Inames the write already depends on need no new axis.
            referenced_priv_axis_inames = (
                priv_axis_inames & writer_insn.write_dependency_names())

            new_priv_axis_inames = (
                priv_axis_inames - referenced_priv_axis_inames)

            if not new_priv_axis_inames:
                # NOTE(review): this stops looking at the remaining writers
                # of this variable; confirm `continue` was not intended.
                break

            if tv.name in var_to_new_priv_axis_iname:
                if new_priv_axis_inames != set(
                        var_to_new_priv_axis_iname[tv.name]):
                    raise LoopyError(
                        "instruction '%s' requires adding "
                        "indices for privatizing var '%s' on iname(s) '%s', "
                        "but previous instructions required inames '%s'"
                        % (writer_insn_id, tv.name,
                            ", ".join(sorted(new_priv_axis_inames)),
                            ", ".join(sorted(
                                var_to_new_priv_axis_iname[tv.name]))))

                continue

            var_to_new_priv_axis_iname[tv.name] = set(new_priv_axis_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    priv_axis_iname_to_length = {}
    iname_to_lbound = {}
    for priv_axis_inames in var_to_new_priv_axis_iname.values():
        for iname in priv_axis_inames:
            if iname in priv_axis_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=False)
            priv_axis_iname_to_length[iname] = pw_aff_to_expr(
                static_max_of_pw_aff(bounds.size, constants_only=False))
            # The lower bounds are handed to ExtraInameIndexInserter below.
            iname_to_lbound[iname] = pw_aff_to_expr(bounds.lower_bound_pw_aff)

    # }}}

    # {{{ change temporary variables

    from loopy.kernel.data import VectorizeTag

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in var_to_new_priv_axis_iname.items():
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(
            priv_axis_iname_to_length[iname] for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if kernel.iname_tags_of_type(iname, VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(
            shape=shape + extra_shape,
            # Forget what you knew about data layout,
            # create from scratch.
            dim_tags=dim_tags,
            dim_names=None)

    # }}}

    from pymbolic import var
    var_to_extra_iname = {
        var_name: tuple(var(iname) for iname in inames)
        for var_name, inames in var_to_new_priv_axis_iname.items()
    }

    new_insns = []

    for insn in kernel.instructions:
        # A fresh mapper per instruction, so seen_priv_axis_inames reflects
        # this instruction only.
        eiii = ExtraInameIndexInserter(var_to_extra_iname, iname_to_lbound)
        new_insn = insn.with_transformed_expressions(eiii)
        if not eiii.seen_priv_axis_inames <= insn.within_inames:
            raise LoopyError(
                "Kernel '%s': Instruction '%s': touched variable that "
                "(for privatization, e.g. as performed for ILP) "
                "required iname(s) '%s', but that the instruction was not "
                "previously within the iname(s). To remedy this, first "
                "promote the instruction into the iname."
                % (kernel.name, insn.id, ", ".join(sorted(
                    eiii.seen_priv_axis_inames - insn.within_inames))))

        new_insns.append(new_insn)

    return kernel.copy(
        temporary_variables=new_temp_vars,
        instructions=new_insns)
def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False):
    """Return a tuple (global_size, local_size) containing a grid
    that could accommodate execution of all instructions whose IDs
    are given in *insn_ids*.

    :arg insn_ids: a :class:`frozenset` of instruction IDs
    :arg ignore_auto: if true, inames carrying an automatic local index
        tag are skipped instead of causing an error.
    :raises RuntimeError: if the instructions use inames that are not in
        the domain, if automatic local index tags are present (and
        *ignore_auto* is false), or if a grid axis below the highest used
        one has no size.

    *global_size* and *local_size* are tuples with one entry per axis; the
    entries are :class:`islpy.PwAff` objects, except that local entries
    taken from ``self.local_sizes`` are passed through unchanged.
    """

    all_inames_by_insns = set()
    for insn_id in insn_ids:
        all_inames_by_insns |= self.insn_inames(insn_id)

    if not all_inames_by_insns <= self.all_inames():
        raise RuntimeError("some inames collected from instructions (%s) "
                "are not present in domain (%s)"
                % (", ".join(sorted(all_inames_by_insns)),
                    ", ".join(sorted(self.all_inames()))))

    global_sizes = {}
    local_sizes = {}

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.kernel.data import (
        GroupIndexTag, LocalIndexTag, AutoLocalIndexTagBase)

    for iname in all_inames_by_insns:
        tag = self.iname_to_tag.get(iname)

        if isinstance(tag, GroupIndexTag):
            tgt_dict = global_sizes
        elif isinstance(tag, LocalIndexTag):
            tgt_dict = local_sizes
        elif isinstance(tag, AutoLocalIndexTagBase) and not ignore_auto:
            raise RuntimeError("cannot find grid sizes if automatic "
                    "local index tags are present")
        else:
            # Not a grid-relevant iname.
            continue

        size = self.get_iname_bounds(iname).size

        if tag.axis in tgt_dict:
            # Several inames may share an axis; it must fit the largest.
            size = tgt_dict[tag.axis].max(size)

        try:
            # insist block size is constant
            size = static_max_of_pw_aff(
                size, constants_only=isinstance(tag, LocalIndexTag))
        except ValueError:
            # Deliberately best-effort: keep the un-simplified size.
            pass

        tgt_dict[tag.axis] = size

    def to_dim_tuple(size_dict, which, forced_sizes=None):
        # Convert an {axis: size} mapping into a dense tuple, with entries
        # of *forced_sizes* taking precedence over those of *size_dict*.
        #
        # The previous implementation consumed one axis of size_dict per
        # iteration and dropped it whenever a forced size filled the current
        # slot, losing axes whose index differed from the forced one
        # (e.g. sizes {1: b} with forced {0: f} gave (f,) instead of (f, b)).
        sizes_by_axis = dict(size_dict)
        if forced_sizes:
            sizes_by_axis.update(forced_sizes)

        if not sizes_by_axis:
            return ()

        size_list = []
        for axis in range(max(sizes_by_axis) + 1):
            if axis not in sizes_by_axis:
                raise RuntimeError("%s axis %d unused" % (which, axis))
            size_list.append(sizes_by_axis[axis])

        return tuple(size_list)

    return (to_dim_tuple(global_sizes, "global"),
            to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes))