def get_grid_sizes(
        self,
        knl,
        param_dict=None,
        ):
    """Return the global and local grid sizes of *knl*, using the stat cache.

    On a cache miss the sizes are obtained from
    ``knl.get_grid_size_upper_bounds()``, each entry is converted with
    ``PwQPolynomial.from_pw_aff`` and the result is stored in
    ``self.stat_cache``.

    :arg knl: the kernel whose grid sizes are requested.
    :arg param_dict: passed to ``eval_with_dict`` of every cached polynomial.
        Required when ``self.evaluate_polys`` is true.
    :returns: if ``self.evaluate_polys`` is true, a tuple
        ``(global_sizes, local_sizes)`` of lists of evaluated values;
        otherwise the cached list ``[global_size_polys, local_size_polys]``.
    :raises ValueError: if evaluation is requested and *param_dict* is *None*.
    """
    cache_key = (prepare_for_caching(knl), KernelStatOptions.GRID_SIZES)
    # TODO avoid multiple calls to prepare_for_caching()?

    try:
        grid_sizes = self.stat_cache[cache_key]
    except KeyError:
        global_size, local_size = knl.get_grid_size_upper_bounds()

        from islpy import PwQPolynomial
        grid_sizes = [
            [PwQPolynomial.from_pw_aff(gsize) for gsize in global_size],
            [PwQPolynomial.from_pw_aff(lsize) for lsize in local_size],
        ]
        self.stat_cache[cache_key] = grid_sizes

    if not self.evaluate_polys:
        return grid_sizes

    if param_dict is None:
        raise ValueError(
                "Cannot evaluate polynomials without param_dict.")

    # NOTE: the evaluated result is a tuple of lists, whereas the symbolic
    # result above is a list of lists; kept as-is for existing callers.
    return (
        [gsize.eval_with_dict(param_dict) for gsize in grid_sizes[0]],
        [lsize.eval_with_dict(param_dict) for lsize in grid_sizes[1]],
    )
def get_typed_and_scheduled_kernel(self, arg_to_dtype_set):
    """Return the typed-and-scheduled kernel for *arg_to_dtype_set*.

    When ``loopy.CACHING_ENABLED`` is true, ``typed_and_scheduled_cache`` is
    consulted first and, on a miss, updated with the freshly computed kernel
    via ``store_if_not_present``. The actual work is delegated to
    ``self.get_typed_and_scheduled_kernel_uncached``.
    """
    from loopy import CACHING_ENABLED
    from loopy.preprocess import prepare_for_caching

    # prepare_for_caching() gets run by preprocess, but the kernel at this
    # stage is not guaranteed to be preprocessed.
    key_kernel = prepare_for_caching(self.kernel)
    key = (type(self).__name__, key_kernel, arg_to_dtype_set)

    if CACHING_ENABLED:
        try:
            cached = typed_and_scheduled_cache[key]
        except KeyError:
            pass
        else:
            return cached

    logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name)

    result = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set)

    if CACHING_ENABLED:
        typed_and_scheduled_cache.store_if_not_present(key, result)

    return result
def get_cached_stats_map(
        self,
        knl,
        stat_option,  # KernelStatOptions
        ):
    """Return the statistics map selected by *stat_option* for *knl*.

    The result is looked up in ``self.stat_cache`` and, on a miss, computed
    with the matching ``loopy.statistics`` routine and stored in the cache.

    :arg knl: the kernel to gather statistics for.
    :arg stat_option: one of ``KernelStatOptions.MEM_ACCESS_MAP``,
        ``KernelStatOptions.OP_MAP`` or ``KernelStatOptions.SYNC_MAP``.
    :returns: the (possibly cached) statistics map.
    :raises ValueError: if *stat_option* is none of the options above.
    """
    cache_key = (prepare_for_caching(knl), stat_option)
    # TODO avoid multiple calls to prepare_for_caching()?

    try:
        return self.stat_cache[cache_key]
    except KeyError:
        # Cache miss. The map is computed below, outside of this handler, so
        # that errors raised by the statistics routines are not reported as
        # having occurred "during handling of" the KeyError.
        pass

    if stat_option == KernelStatOptions.MEM_ACCESS_MAP:
        from loopy.statistics import get_mem_access_map
        stats_map = get_mem_access_map(
            knl,
            count_redundant_work=self.count_redundant_work,
            subgroup_size=self.subgroup_size,
            )
    elif stat_option == KernelStatOptions.OP_MAP:
        from loopy.statistics import get_op_map
        stats_map = get_op_map(
            knl,
            count_redundant_work=self.count_redundant_work,
            count_within_subscripts=self.count_within_subscripts,
            subgroup_size=self.subgroup_size,
            count_madds=self.count_madds,
            )
    elif stat_option == KernelStatOptions.SYNC_MAP:
        from loopy.statistics import get_synchronization_map
        stats_map = get_synchronization_map(
            knl,
            subgroup_size=self.subgroup_size,
            )
    else:
        # Previously this fell through and failed with an UnboundLocalError
        # on 'stats_map'.
        raise ValueError("unsupported stat_option: %r" % (stat_option,))

    self.stat_cache[cache_key] = stats_map
    return stats_map
def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
    """User-facing kernel creation entrypoint.

    :arg domains: :class:`islpy.BasicSet`
    :arg instructions:
    :arg kernel_data:

        A list of :class:`ValueArg`, :class:`GlobalArg`, ... (etc.) instances.
        The order of these arguments determines the order of the arguments
        to the generated kernel.

        May also contain :class:`TemporaryVariable` instances (which do not
        give rise to kernel-level arguments).

        The string ``"..."`` may be passed as one of the entries
        of the list, in which case loopy will infer names, shapes,
        and types of arguments from the kernel code. It is
        possible to just pass the list ``["..."]``, in which case
        all arguments are inferred.

        In Python 3, the string ``"..."`` may be spelled somewhat more sensibly
        as just ``...`` (the ellipsis), for the same meaning.

        As an additional option, each argument may be specified as just a name
        (a string). This is useful to specify argument ordering. All other
        characteristics of the named arguments are inferred.

        A single comma-separated string is also accepted and split into
        names.

    The following keyword arguments are recognized:

    :arg preambles: a list of (tag, code) tuples that identify preamble
        snippets. Each tag's snippet is only included once, at its first
        occurrence. The preambles will be inserted in order of their tags.
    :arg preamble_generators: a list of functions of signature
        (seen_dtypes, seen_functions) where seen_functions is a set of
        (name, c_name, arg_dtypes), generating extra entries for *preambles*.
    :arg defines: a dictionary of replacements to be made in instructions given
        as strings before parsing. A macro instance intended to be replaced
        should look like "MACRO" in the instruction code. The expansion given
        in this parameter is allowed to be a list. In this case, instructions
        are generated for *each* combination of macro values.

        These defines may also be used in the domain and in argument shapes and
        strides. They are expanded only upon kernel creation.
    :arg default_order: "C" (default) or "F"
    :arg default_offset: 0 or :class:`loopy.auto`. The default value of
        *offset* in :attr:`loopy.kernel.data.GlobalArg` for guessed arguments.
        Defaults to 0.
    :arg function_manglers: list of functions of signature (name, arg_dtypes)
        returning a tuple (result_dtype, c_name)
        or a tuple (result_dtype, c_name, arg_dtypes),
        where c_name is the C-level function to be called.
    :arg symbol_manglers: list of functions of signature (name) returning
        a tuple (result_dtype, c_name), where c_name is the C-level symbol to
        be evaluated.
    :arg assumptions: the initial implemented_domain, captures assumptions
        on loop domain parameters. (an isl.Set or a string in
        :ref:`isl-syntax`. If given as a string, only the CONDITIONS part of
        the set notation should be given.)
    :arg local_sizes: A dictionary from integers to integers, mapping
        workgroup axes to their sizes, e.g. *{0: 16}* forces axis 0 to be
        length 16.
    :arg silenced_warnings: a list (or semicolon-separated string) of warnings
        to silence
    :arg options: an instance of :class:`loopy.Options` or an equivalent
        string representation
    :arg flags: deprecated alias of *options*. Passing both raises
        :exc:`TypeError`.
    :arg target: an instance of :class:`loopy.target.TargetBase`, or *None*,
        to use an OpenCL target (``PyOpenCLTarget`` if :mod:`pyopencl` can
        be imported, ``OpenCLTarget`` otherwise).
    :arg temporary_variables: a dictionary of temporaries. It is copied, and
        the :class:`TemporaryVariable` entries found in *kernel_data* are
        added to the copy under their names.

    All remaining keyword arguments are passed on to
    :class:`loopy.kernel.LoopKernel`. A ``substitutions`` entry is always
    replaced by the substitution rules parsed from *instructions*.

    :returns: the kernel, after ``loopy.preprocess.prepare_for_caching``.
    """

    # NOTE: the list default of *kernel_data* is never mutated below (it is
    # only iterated, or replaced if a string was passed), so sharing it
    # between calls is harmless.

    defines = kwargs.pop("defines", {})
    default_order = kwargs.pop("default_order", "C")
    default_offset = kwargs.pop("default_offset", 0)
    silenced_warnings = kwargs.pop("silenced_warnings", [])
    options = kwargs.pop("options", None)
    flags = kwargs.pop("flags", None)
    target = kwargs.pop("target", None)

    if target is None:
        try:
            import pyopencl  # noqa
        except ImportError:
            from loopy.target.opencl import OpenCLTarget
            target = OpenCLTarget()
        else:
            from loopy.target.pyopencl import PyOpenCLTarget
            target = PyOpenCLTarget()

    if flags is not None:
        if options is not None:
            raise TypeError("may not pass both 'options' and 'flags'")

        from warnings import warn
        warn("'flags' is deprecated. Use 'options' instead",
                DeprecationWarning, stacklevel=2)
        options = flags

    from loopy.options import make_options
    options = make_options(options)

    if isinstance(silenced_warnings, str):
        silenced_warnings = silenced_warnings.split(";")

    # {{{ separate temporary variables and arguments, take care of names with commas

    from loopy.kernel.data import TemporaryVariable, ArrayBase

    if isinstance(kernel_data, str):
        kernel_data = kernel_data.split(",")

    kernel_args = []
    temporary_variables = kwargs.pop("temporary_variables", {}).copy()
    for dat in kernel_data:
        # Placeholders and bare names are resolved later by ArgumentGuesser.
        if dat is Ellipsis or isinstance(dat, str):
            kernel_args.append(dat)
            continue

        if isinstance(dat, ArrayBase) and isinstance(dat.shape, tuple):
            dat = dat.copy(shape=tuple(
                expand_defines_in_expr(shape_axis, defines)
                if shape_axis is not None else shape_axis
                for shape_axis in dat.shape))

        # One declaration may name several variables ("a,b"); create one
        # copy per name.
        for arg_name in dat.name.split(","):
            arg_name = arg_name.strip()
            if not arg_name:
                continue

            my_dat = dat.copy(name=arg_name)
            if isinstance(dat, TemporaryVariable):
                # Store the per-name copy: 'dat' itself still carries the
                # combined, comma-separated name.
                temporary_variables[my_dat.name] = my_dat
            else:
                kernel_args.append(my_dat)

    del kernel_data

    # }}}

    # {{{ instruction/subst parsing

    parsed_instructions = []
    kwargs["substitutions"] = substitutions = {}
    inames_to_dup = []

    if isinstance(instructions, str):
        instructions = [instructions]

    for insn in instructions:
        for new_insn, insn_inames_to_dup in parse_if_necessary(insn, defines):
            if isinstance(new_insn, InstructionBase):
                parsed_instructions.append(new_insn)

                # Need to maintain 1-to-1 correspondence to instructions
                inames_to_dup.append(insn_inames_to_dup)

            elif isinstance(new_insn, SubstitutionRule):
                substitutions[new_insn.name] = new_insn

                assert not insn_inames_to_dup
            else:
                raise RuntimeError("unexpected type in instruction parsing")

    instructions = parsed_instructions
    del parsed_instructions

    # }}}

    # {{{ find/create isl_context

    for domain in domains:
        if isinstance(domain, isl.BasicSet):
            assert domain.get_ctx() == isl.DEFAULT_CONTEXT

    # }}}

    domains = parse_domains(domains, defines)

    arg_guesser = ArgumentGuesser(domains, instructions,
            temporary_variables, substitutions,
            default_offset)

    kernel_args = arg_guesser.convert_names_to_full_args(kernel_args)
    kernel_args = arg_guesser.guess_kernel_args_if_requested(kernel_args)

    from loopy.kernel import LoopKernel
    knl = LoopKernel(domains, instructions, kernel_args,
            temporary_variables=temporary_variables,
            silenced_warnings=silenced_warnings,
            options=options,
            target=target,
            **kwargs)

    # inames_to_dup has one entry per parsed instruction, in order.
    from loopy import duplicate_inames
    for insn, insn_inames_to_dup in zip(knl.instructions, inames_to_dup):
        for old_iname, new_iname in insn_inames_to_dup:
            knl = duplicate_inames(knl, old_iname,
                    within=insn.id, new_inames=new_iname)

    check_for_nonexistent_iname_deps(knl)

    knl = tag_reduction_inames_as_sequential(knl)
    knl = create_temporaries(knl, default_order)
    knl = determine_shapes_of_temporaries(knl)
    knl = expand_cses(knl)
    knl = expand_defines_in_shapes(knl, defines)
    knl = guess_arg_shape_if_requested(knl, default_order)
    knl = apply_default_order_to_args(knl, default_order)
    knl = resolve_wildcard_deps(knl)

    # -------------------------------------------------------------------------
    # Ordering dependency:
    # -------------------------------------------------------------------------
    # Must create temporaries before checking for writes to temporary variables
    # that are domain parameters.
    # -------------------------------------------------------------------------

    check_for_multiple_writes_to_loop_bounds(knl)
    check_for_duplicate_names(knl)
    check_written_variable_names(knl)

    from loopy.preprocess import prepare_for_caching
    knl = prepare_for_caching(knl)

    return knl
def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
        store_expression=None, within=None, default_tag="l.auto",
        temporary_scope=None, temporary_is_local=None,
        fetch_bounding_box=False):
    """Replace accesses to *var_name* with ones to a temporary, which is
    created and acts as a buffer. To perform this transformation, the access
    footprint to *var_name* is determined and a temporary of a suitable
    :class:`loopy.AddressSpace` and shape is created.

    By default, the value of the buffered cells in *var_name* are read prior to
    any (read/write) use, and the modified values are written out after use has
    concluded, but for special use cases (e.g. additive accumulation), the
    behavior can be modified using *init_expression* and *store_expression*.

    :arg buffer_inames: The inames across which the buffer should be
        usable--i.e. all possible values of these inames will be covered by the
        buffer footprint. A tuple of inames or a comma-separated string.
    :arg init_expression: Either *None* (indicating the prior value of the
        buffered array should be read) or an expression optionally involving the
        variable 'base' (which references the associated location in the array
        being buffered).
    :arg store_expression: Either *None*, *False*, or an expression involving
        variables 'base' and 'buffer' (without array indices).
        (*None* indicates that a default storage instruction should be used,
        *False* indicates that no storing of the temporary should occur
        at all.)
    :arg within: If not None, limit the action of the transformation to
        matching contexts.  See :func:`loopy.match.parse_stack_match`
        for syntax.
    :arg temporary_scope: If given, override the choice of
        :class:`AddressSpace` for the created temporary.
    :arg temporary_is_local: Deprecated, use *temporary_scope* instead.
        A true value selects ``AddressSpace.LOCAL``, a false one
        ``AddressSpace.PRIVATE``.
    :arg default_tag: The default :ref:`iname-tags` to be assigned to the
        inames used for fetching and storing
    :arg fetch_bounding_box: If the access footprint is non-convex
        (resulting in an error), setting this argument to *True* will force a
        rectangular (and hence convex) superset of the footprint to be
        fetched.

    :returns: the transformed kernel. If ``loopy.CACHING_ENABLED`` is true,
        the result may come from (and is stored in) ``buffer_array_cache``.
    :raises LoopyError: if both *temporary_is_local* and *temporary_scope*
        are given, if *var_name* is written through a linear subscript, or
        if an instruction has an unsupported kind of assignee.
    :raises RuntimeError: if an entry of *buffer_inames* is not an iname of
        *kernel*, or is not at home in the leaf domain being changed.
    :raises ValueError: if *var_name* is neither an argument nor a temporary
        of *kernel*.
    """

    # {{{ unify temporary_scope / temporary_is_local

    from loopy.kernel.data import AddressSpace
    if temporary_is_local is not None:
        from warnings import warn
        warn("temporary_is_local is deprecated. Use temporary_scope instead",
                DeprecationWarning, stacklevel=2)

        if temporary_scope is not None:
            raise LoopyError("may not specify both temporary_is_local and "
                    "temporary_scope")

        if temporary_is_local:
            temporary_scope = AddressSpace.LOCAL
        else:
            temporary_scope = AddressSpace.PRIVATE

    del temporary_is_local

    # }}}

    # {{{ process arguments

    if isinstance(init_expression, str):
        from loopy.symbolic import parse
        init_expression = parse(init_expression)

    if isinstance(store_expression, str):
        from loopy.symbolic import parse
        store_expression = parse(store_expression)

    if isinstance(buffer_inames, str):
        buffer_inames = [s.strip()
                for s in buffer_inames.split(",") if s.strip()]

    # Materialize before validating so that a one-shot iterable is not
    # exhausted by the check below.
    buffer_inames = list(buffer_inames)
    buffer_inames_set = frozenset(buffer_inames)

    known_inames = kernel.all_inames()
    for iname in buffer_inames:
        if iname not in known_inames:
            raise RuntimeError("sweep iname '%s' is not a known iname"
                    % iname)

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    if var_name in kernel.arg_dict:
        var_descr = kernel.arg_dict[var_name]
    elif var_name in kernel.temporary_variables:
        var_descr = kernel.temporary_variables[var_name]
    else:
        raise ValueError("variable '%s' not found" % var_name)

    from loopy.kernel.data import ArrayBase
    if isinstance(var_descr, ArrayBase):
        var_shape = var_descr.shape
    else:
        var_shape = ()

    if temporary_scope is None:
        import loopy as lp
        temporary_scope = lp.auto

    # }}}

    # {{{ caching

    from loopy import CACHING_ENABLED

    from loopy.preprocess import prepare_for_caching
    key_kernel = prepare_for_caching(kernel)
    cache_key = (key_kernel, var_name, tuple(buffer_inames),
            PymbolicExpressionHashWrapper(init_expression),
            PymbolicExpressionHashWrapper(store_expression), within,
            default_tag, temporary_scope, fetch_bounding_box)

    if CACHING_ENABLED:
        try:
            result = buffer_array_cache[cache_key]
            logger.info("%s: buffer_array cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    var_name_gen = kernel.get_var_name_generator()
    within_inames = set()

    from pymbolic.primitives import Variable, Subscript
    from loopy.symbolic import LinearSubscript

    # Collect one access descriptor per write to var_name in the matched
    # instructions.
    access_descriptors = []
    for insn in kernel.instructions:
        if not within(kernel, insn.id, ()):
            continue

        for assignee in insn.assignees:
            if isinstance(assignee, Variable):
                assignee_name = assignee.name
                index = ()

            elif isinstance(assignee, Subscript):
                assignee_name = assignee.aggregate.name
                index = assignee.index_tuple

            elif isinstance(assignee, LinearSubscript):
                if assignee.aggregate.name == var_name:
                    raise LoopyError("buffer_array may not be applied in the "
                            "presence of linear write indexing into '%s'"
                            % var_name)

                # A linear write to some other variable is of no interest
                # here. Skip it explicitly: falling through would reuse
                # assignee_name/index from a previous assignee (or fail
                # because they are not bound yet).
                continue

            else:
                raise LoopyError("invalid lvalue '%s'" % assignee)

            if assignee_name == var_name:
                within_inames.update(
                        (get_dependencies(index) & known_inames)
                        - buffer_inames_set)
                access_descriptors.append(
                        AccessDescriptor(
                            identifier=insn.id,
                            storage_axis_exprs=index))

    # {{{ find fetch/store inames

    init_inames = []
    store_inames = []
    new_iname_to_tag = {}

    for i in range(len(var_shape)):
        dim_name = str(i)
        if isinstance(var_descr, ArrayBase) and var_descr.dim_names is not None:
            dim_name = var_descr.dim_names[i]

        init_iname = var_name_gen(f"{var_name}_init_{dim_name}")
        store_iname = var_name_gen(f"{var_name}_store_{dim_name}")

        new_iname_to_tag[init_iname] = default_tag
        new_iname_to_tag[store_iname] = default_tag

        init_inames.append(init_iname)
        store_inames.append(store_iname)

    # }}}

    # {{{ modify loop domain

    non1_init_inames = []
    non1_store_inames = []

    if var_shape:
        # {{{ find domain to be changed

        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(kernel, buffer_inames_set | within_inames)

        if domch.leaf_domain_index is not None:
            # If the sweep inames are at home in parent domains, then we'll add
            # fetches with loops over copies of these parent inames that will end
            # up being scheduled *within* loops over these parents.

            for iname in buffer_inames_set:
                if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
                    raise RuntimeError("buffer iname '%s' is not 'at home' in the "
                            "sweep's leaf domain" % iname)

        # }}}

        abm = ArrayToBufferMap(kernel, domch.domain, buffer_inames,
                access_descriptors, len(var_shape))

        # Only axes flagged in non1_storage_axis_flags get fetch/store
        # inames; the inames generated for the other axes are dropped.
        for i in range(len(var_shape)):
            if abm.non1_storage_axis_flags[i]:
                non1_init_inames.append(init_inames[i])
                non1_store_inames.append(store_inames[i])
            else:
                del new_iname_to_tag[init_inames[i]]
                del new_iname_to_tag[store_inames[i]]

        new_domain = domch.domain
        new_domain = abm.augment_domain_with_sweep(
                    new_domain, non1_init_inames,
                    boxify_sweep=fetch_bounding_box)
        new_domain = abm.augment_domain_with_sweep(
                    new_domain, non1_store_inames,
                    boxify_sweep=fetch_bounding_box)
        new_kernel_domains = domch.get_domains_with(new_domain)
        del new_domain

    else:
        # leave kernel domains unchanged
        new_kernel_domains = kernel.domains

        abm = NoOpArrayToBufferMap()

    # }}}

    # {{{ set up temp variable

    import loopy as lp

    buf_var_name = var_name_gen(based_on=var_name+"_buf")

    new_temporary_variables = kernel.temporary_variables.copy()
    temp_var = lp.TemporaryVariable(
            name=buf_var_name,
            dtype=var_descr.dtype,
            base_indices=(0,)*len(abm.non1_storage_shape),
            shape=tuple(abm.non1_storage_shape),
            address_space=temporary_scope)

    new_temporary_variables[buf_var_name] = temp_var

    # }}}

    buf_var = var(buf_var_name)

    # {{{ generate init instruction

    buf_var_init = buf_var
    if non1_init_inames:
        buf_var_init = buf_var_init.index(
                tuple(var(iname) for iname in non1_init_inames))

    init_base = var(var_name)

    init_subscript = []
    init_iname_idx = 0
    if var_shape:
        for i in range(len(var_shape)):
            ax_subscript = abm.storage_base_indices[i]
            if abm.non1_storage_axis_flags[i]:
                ax_subscript += var(non1_init_inames[init_iname_idx])
                init_iname_idx += 1
            init_subscript.append(ax_subscript)

    if init_subscript:
        init_base = init_base.index(tuple(init_subscript))

    if init_expression is None:
        init_expression = init_base
    else:
        # Resolve 'base' in the user-supplied expression.
        init_expression = SubstitutionMapper(
                make_subst_func({
                    "base": init_base,
                    }))(init_expression)

    init_insn_id = kernel.make_unique_instruction_id(based_on="init_"+var_name)
    from loopy.kernel.data import Assignment
    init_instruction = Assignment(id=init_insn_id,
                assignee=buf_var_init,
                expression=init_expression,
                within_inames=(
                    frozenset(within_inames)
                    | frozenset(non1_init_inames)),
                depends_on=frozenset(),
                depends_on_is_final=True)

    # }}}

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    aar = ArrayAccessReplacer(rule_mapping_context, var_name,
            within, abm, buf_var)
    kernel = rule_mapping_context.finish_kernel(aar.map_kernel(kernel))

    # A store is only needed if some rewritten instruction writes the buffer.
    did_write = any(
            buf_var_name in kernel.id_to_insn[insn_id].assignee_var_names()
            for insn_id in aar.modified_insn_ids)

    # {{{ add init_insn_id to depends_on

    new_insns = []

    def none_to_empty_set(s):
        if s is None:
            return frozenset()
        else:
            return s

    for insn in kernel.instructions:
        if insn.id in aar.modified_insn_ids:
            new_insns.append(
                    insn.copy(
                        depends_on=(
                            none_to_empty_set(insn.depends_on)
                            | frozenset([init_insn_id]))))
        else:
            new_insns.append(insn)

    # }}}

    # {{{ generate store instruction

    buf_var_store = buf_var
    if non1_store_inames:
        buf_var_store = buf_var_store.index(
                tuple(var(iname) for iname in non1_store_inames))

    store_subscript = []
    store_iname_idx = 0
    if var_shape:
        for i in range(len(var_shape)):
            ax_subscript = abm.storage_base_indices[i]
            if abm.non1_storage_axis_flags[i]:
                ax_subscript += var(non1_store_inames[store_iname_idx])
                store_iname_idx += 1
            store_subscript.append(ax_subscript)

    store_target = var(var_name)
    if store_subscript:
        store_target = store_target.index(tuple(store_subscript))

    if store_expression is False:
        # The caller asked for no store at all. Checked *before* any
        # substitution so that False is never handed to SubstitutionMapper.
        did_write = False
    else:
        if store_expression is None:
            store_expression = buf_var_store
        else:
            # Resolve 'base' and 'buffer' in the user-supplied expression.
            store_expression = SubstitutionMapper(
                    make_subst_func({
                        "base": store_target,
                        "buffer": buf_var_store,
                        }))(store_expression)

        store_instruction = Assignment(
                    id=kernel.make_unique_instruction_id(based_on="store_"+var_name),
                    depends_on=frozenset(aar.modified_insn_ids),
                    no_sync_with=frozenset([(init_insn_id, "any")]),
                    assignee=store_target,
                    expression=store_expression,
                    within_inames=(
                        frozenset(within_inames)
                        | frozenset(non1_store_inames)))

    # }}}

    new_insns.append(init_instruction)
    if did_write:
        new_insns.append(store_instruction)
    else:
        # No store instruction is emitted, so none of the store inames gets
        # a tag. Entries of axes not flagged in non1_storage_axis_flags were
        # already removed above, hence pop() with a default rather than del.
        for iname in store_inames:
            new_iname_to_tag.pop(iname, None)

    kernel = kernel.copy(
            domains=new_kernel_domains,
            instructions=new_insns,
            temporary_variables=new_temporary_variables)

    from loopy import tag_inames
    kernel = tag_inames(kernel, new_iname_to_tag)

    from loopy.kernel.tools import assign_automatic_axes
    kernel = assign_automatic_axes(kernel)

    if CACHING_ENABLED:
        buffer_array_cache.store_if_not_present(
                cache_key, prepare_for_caching(kernel))

    return kernel
def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
        store_expression=None, within=None, default_tag="l.auto",
        temporary_scope=None, temporary_is_local=None,
        fetch_bounding_box=False):
    """Replace accesses to *var_name* with accesses to a newly created
    temporary named after *var_name* with a ``_buf`` suffix, and add
    instructions that initialize the temporary and (if it is written to)
    store it back to *var_name*.

    :arg buffer_inames: the inames to sweep over; an iterable of iname
        names or a comma-separated string.
    :arg init_expression: Either *None* (indicating the prior value of the
        buffered array should be read) or an expression optionally involving the
        variable 'base' (which references the associated location in the array
        being buffered).
    :arg store_expression: Either *None*, *False*, or an expression involving
        variables 'base' and 'buffer' (without array indices).
        (*None* indicates that a default storage instruction should be used,
        *False* indicates that no storing of the temporary should occur
        at all.)
    :arg within: passed to :func:`loopy.match.parse_stack_match`; only
        instructions matched by the result are considered.
    :arg default_tag: the tag assigned to the newly created init/store
        inames.
    :arg temporary_scope: the ``scope`` of the created temporary; defaults
        to ``loopy.auto``.
    :arg temporary_is_local: Deprecated, use *temporary_scope* instead.
        A true value selects ``temp_var_scope.LOCAL``, a false one
        ``temp_var_scope.PRIVATE``.
    :arg fetch_bounding_box: passed as ``boxify_sweep`` when the domain is
        augmented with the init/store inames.

    :returns: the transformed kernel. If ``loopy.CACHING_ENABLED`` is true,
        the result may come from (and is stored in) ``buffer_array_cache``.
    :raises LoopyError: if both *temporary_is_local* and *temporary_scope*
        are given, if *var_name* is written through a linear subscript, or
        if an instruction has an unsupported kind of assignee.
    :raises RuntimeError: if an entry of *buffer_inames* is not an iname of
        *kernel*, or is not at home in the leaf domain being changed.
    :raises ValueError: if *var_name* is neither an argument nor a temporary
        of *kernel*.
    """

    # {{{ unify temporary_scope / temporary_is_local

    from loopy.kernel.data import temp_var_scope
    if temporary_is_local is not None:
        from warnings import warn
        warn("temporary_is_local is deprecated. Use temporary_scope instead",
                DeprecationWarning, stacklevel=2)

        if temporary_scope is not None:
            raise LoopyError("may not specify both temporary_is_local and "
                    "temporary_scope")

        if temporary_is_local:
            temporary_scope = temp_var_scope.LOCAL
        else:
            temporary_scope = temp_var_scope.PRIVATE

    del temporary_is_local

    # }}}

    # {{{ process arguments

    if isinstance(init_expression, str):
        from loopy.symbolic import parse
        init_expression = parse(init_expression)

    if isinstance(store_expression, str):
        from loopy.symbolic import parse
        store_expression = parse(store_expression)

    if isinstance(buffer_inames, str):
        buffer_inames = [s.strip()
                for s in buffer_inames.split(",") if s.strip()]

    # Materialize before validating so that a one-shot iterable is not
    # exhausted by the check below.
    buffer_inames = list(buffer_inames)
    buffer_inames_set = frozenset(buffer_inames)

    known_inames = kernel.all_inames()
    for iname in buffer_inames:
        if iname not in known_inames:
            raise RuntimeError("sweep iname '%s' is not a known iname"
                    % iname)

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    if var_name in kernel.arg_dict:
        var_descr = kernel.arg_dict[var_name]
    elif var_name in kernel.temporary_variables:
        var_descr = kernel.temporary_variables[var_name]
    else:
        raise ValueError("variable '%s' not found" % var_name)

    from loopy.kernel.data import ArrayBase
    if isinstance(var_descr, ArrayBase):
        var_shape = var_descr.shape
    else:
        var_shape = ()

    if temporary_scope is None:
        import loopy as lp
        temporary_scope = lp.auto

    # }}}

    # {{{ caching

    from loopy import CACHING_ENABLED

    from loopy.preprocess import prepare_for_caching
    key_kernel = prepare_for_caching(kernel)
    cache_key = (key_kernel, var_name, tuple(buffer_inames),
            PymbolicExpressionHashWrapper(init_expression),
            PymbolicExpressionHashWrapper(store_expression), within,
            default_tag, temporary_scope, fetch_bounding_box)

    if CACHING_ENABLED:
        try:
            result = buffer_array_cache[cache_key]
            logger.info("%s: buffer_array cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    var_name_gen = kernel.get_var_name_generator()
    within_inames = set()

    from pymbolic.primitives import Variable, Subscript
    from loopy.symbolic import LinearSubscript

    # Collect one access descriptor per write to var_name in the matched
    # instructions.
    access_descriptors = []
    for insn in kernel.instructions:
        if not within(kernel, insn.id, ()):
            continue

        for assignee in insn.assignees:
            if isinstance(assignee, Variable):
                assignee_name = assignee.name
                index = ()

            elif isinstance(assignee, Subscript):
                assignee_name = assignee.aggregate.name
                index = assignee.index_tuple

            elif isinstance(assignee, LinearSubscript):
                if assignee.aggregate.name == var_name:
                    raise LoopyError("buffer_array may not be applied in the "
                            "presence of linear write indexing into '%s'"
                            % var_name)

                # A linear write to some other variable is of no interest
                # here. Skip it explicitly: falling through would reuse
                # assignee_name/index from a previous assignee (or fail
                # because they are not bound yet).
                continue

            else:
                raise LoopyError("invalid lvalue '%s'" % assignee)

            if assignee_name == var_name:
                within_inames.update(
                        (get_dependencies(index) & known_inames)
                        - buffer_inames_set)
                access_descriptors.append(
                        AccessDescriptor(
                            identifier=insn.id,
                            storage_axis_exprs=index))

    # {{{ find fetch/store inames

    init_inames = []
    store_inames = []
    new_iname_to_tag = {}

    for i in range(len(var_shape)):
        dim_name = str(i)
        if isinstance(var_descr, ArrayBase) and var_descr.dim_names is not None:
            dim_name = var_descr.dim_names[i]

        init_iname = var_name_gen("%s_init_%s" % (var_name, dim_name))
        store_iname = var_name_gen("%s_store_%s" % (var_name, dim_name))

        new_iname_to_tag[init_iname] = default_tag
        new_iname_to_tag[store_iname] = default_tag

        init_inames.append(init_iname)
        store_inames.append(store_iname)

    # }}}

    # {{{ modify loop domain

    non1_init_inames = []
    non1_store_inames = []

    if var_shape:
        # {{{ find domain to be changed

        from loopy.kernel.tools import DomainChanger
        domch = DomainChanger(kernel, buffer_inames_set | within_inames)

        if domch.leaf_domain_index is not None:
            # If the sweep inames are at home in parent domains, then we'll add
            # fetches with loops over copies of these parent inames that will end
            # up being scheduled *within* loops over these parents.

            for iname in buffer_inames_set:
                if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
                    raise RuntimeError("buffer iname '%s' is not 'at home' in the "
                            "sweep's leaf domain" % iname)

        # }}}

        abm = ArrayToBufferMap(kernel, domch.domain, buffer_inames,
                access_descriptors, len(var_shape))

        # Only axes flagged in non1_storage_axis_flags get fetch/store
        # inames; the inames generated for the other axes are dropped.
        for i in range(len(var_shape)):
            if abm.non1_storage_axis_flags[i]:
                non1_init_inames.append(init_inames[i])
                non1_store_inames.append(store_inames[i])
            else:
                del new_iname_to_tag[init_inames[i]]
                del new_iname_to_tag[store_inames[i]]

        new_domain = domch.domain
        new_domain = abm.augment_domain_with_sweep(
                    new_domain, non1_init_inames,
                    boxify_sweep=fetch_bounding_box)
        new_domain = abm.augment_domain_with_sweep(
                    new_domain, non1_store_inames,
                    boxify_sweep=fetch_bounding_box)
        new_kernel_domains = domch.get_domains_with(new_domain)
        del new_domain

    else:
        # leave kernel domains unchanged
        new_kernel_domains = kernel.domains

        abm = NoOpArrayToBufferMap()

    # }}}

    # {{{ set up temp variable

    import loopy as lp

    buf_var_name = var_name_gen(based_on=var_name+"_buf")

    new_temporary_variables = kernel.temporary_variables.copy()
    temp_var = lp.TemporaryVariable(
            name=buf_var_name,
            dtype=var_descr.dtype,
            base_indices=(0,)*len(abm.non1_storage_shape),
            shape=tuple(abm.non1_storage_shape),
            scope=temporary_scope)

    new_temporary_variables[buf_var_name] = temp_var

    # }}}

    buf_var = var(buf_var_name)

    # {{{ generate init instruction

    buf_var_init = buf_var
    if non1_init_inames:
        buf_var_init = buf_var_init.index(
                tuple(var(iname) for iname in non1_init_inames))

    init_base = var(var_name)

    init_subscript = []
    init_iname_idx = 0
    if var_shape:
        for i in range(len(var_shape)):
            ax_subscript = abm.storage_base_indices[i]
            if abm.non1_storage_axis_flags[i]:
                ax_subscript += var(non1_init_inames[init_iname_idx])
                init_iname_idx += 1
            init_subscript.append(ax_subscript)

    if init_subscript:
        init_base = init_base.index(tuple(init_subscript))

    if init_expression is None:
        init_expression = init_base
    else:
        # Resolve 'base' in the user-supplied expression.
        init_expression = SubstitutionMapper(
                make_subst_func({
                    "base": init_base,
                    }))(init_expression)

    init_insn_id = kernel.make_unique_instruction_id(based_on="init_"+var_name)
    from loopy.kernel.data import Assignment
    init_instruction = Assignment(id=init_insn_id,
                assignee=buf_var_init,
                expression=init_expression,
                forced_iname_deps=(
                    frozenset(within_inames)
                    | frozenset(non1_init_inames)),
                depends_on=frozenset(),
                depends_on_is_final=True)

    # }}}

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    aar = ArrayAccessReplacer(rule_mapping_context, var_name,
            within, abm, buf_var)
    kernel = rule_mapping_context.finish_kernel(aar.map_kernel(kernel))

    # A store is only needed if some rewritten instruction writes the buffer.
    did_write = any(
            buf_var_name in kernel.id_to_insn[insn_id].assignee_var_names()
            for insn_id in aar.modified_insn_ids)

    # {{{ add init_insn_id to depends_on

    new_insns = []

    def none_to_empty_set(s):
        if s is None:
            return frozenset()
        else:
            return s

    for insn in kernel.instructions:
        if insn.id in aar.modified_insn_ids:
            new_insns.append(
                    insn.copy(
                        depends_on=(
                            none_to_empty_set(insn.depends_on)
                            | frozenset([init_insn_id]))))
        else:
            new_insns.append(insn)

    # }}}

    # {{{ generate store instruction

    buf_var_store = buf_var
    if non1_store_inames:
        buf_var_store = buf_var_store.index(
                tuple(var(iname) for iname in non1_store_inames))

    store_subscript = []
    store_iname_idx = 0
    if var_shape:
        for i in range(len(var_shape)):
            ax_subscript = abm.storage_base_indices[i]
            if abm.non1_storage_axis_flags[i]:
                ax_subscript += var(non1_store_inames[store_iname_idx])
                store_iname_idx += 1
            store_subscript.append(ax_subscript)

    store_target = var(var_name)
    if store_subscript:
        store_target = store_target.index(tuple(store_subscript))

    if store_expression is False:
        # The caller asked for no store at all. Checked *before* any
        # substitution so that False is never handed to SubstitutionMapper.
        did_write = False
    else:
        if store_expression is None:
            store_expression = buf_var_store
        else:
            # Resolve 'base' and 'buffer' in the user-supplied expression.
            store_expression = SubstitutionMapper(
                    make_subst_func({
                        "base": store_target,
                        "buffer": buf_var_store,
                        }))(store_expression)

        store_instruction = Assignment(
                    id=kernel.make_unique_instruction_id(based_on="store_"+var_name),
                    depends_on=frozenset(aar.modified_insn_ids),
                    no_sync_with=frozenset([init_insn_id]),
                    assignee=store_target,
                    expression=store_expression,
                    forced_iname_deps=(
                        frozenset(within_inames)
                        | frozenset(non1_store_inames)))

    # }}}

    new_insns.append(init_instruction)
    if did_write:
        new_insns.append(store_instruction)
    else:
        # No store instruction is emitted, so none of the store inames gets
        # a tag. Entries of axes not flagged in non1_storage_axis_flags were
        # already removed above, hence pop() with a default rather than del.
        for iname in store_inames:
            new_iname_to_tag.pop(iname, None)

    kernel = kernel.copy(
            domains=new_kernel_domains,
            instructions=new_insns,
            temporary_variables=new_temporary_variables)

    from loopy import tag_inames
    kernel = tag_inames(kernel, new_iname_to_tag)

    from loopy.kernel.tools import assign_automatic_axes
    kernel = assign_automatic_axes(kernel)

    if CACHING_ENABLED:
        buffer_array_cache[cache_key] = prepare_for_caching(kernel)

    return kernel