def add_default_dependencies(kernel):
    """Attach automatically-inferred dependencies to instructions.

    For every instruction whose *depends_on_is_final* flag is unset, add a
    dependency on the writer of each variable the instruction reads -- but
    only when that variable has exactly one writer, i.e. when the choice of
    dependency is unambiguous.  A read of a temporary that is never written
    (and carries no initializer) triggers a ``read_no_write`` warning.
    """
    logger.debug("%s: default deps" % kernel.name)

    from loopy.transform.subst import expand_subst
    # expand substitution rules so reads hidden inside rule invocations
    # are visible to read_dependency_names()
    subst_expanded = expand_subst(kernel)

    writer_map = kernel.writer_map()
    arg_names = set(arg.name for arg in kernel.args)
    var_names = arg_names | set(six.iterkeys(kernel.temporary_variables))

    # map: instruction id -> args/temporaries read by that instruction
    reads_by_insn = dict(
            (insn.id, insn.read_dependency_names() & var_names)
            for insn in subst_expanded.instructions)

    processed_insns = []
    for insn in kernel.instructions:
        if insn.depends_on_is_final:
            # user declared the dependency list complete -- leave untouched
            processed_insns.append(insn)
            continue

        inferred_deps = set()

        # {{{ add automatic dependencies

        all_my_var_writers = set()
        for read_var in reads_by_insn[insn.id]:
            writers_of_var = writer_map.get(read_var, set())
            all_my_var_writers |= writers_of_var

            if not writers_of_var and read_var not in arg_names:
                tv = kernel.temporary_variables[read_var]
                if tv.initializer is None:
                    warn_with_kernel(kernel, "read_no_write(%s)" % read_var,
                            "temporary variable '%s' is read, but never written."
                            % read_var)

            # only depend on a writer when it is unambiguous
            if len(writers_of_var) == 1:
                inferred_deps.update(writers_of_var - set([insn.id]))

        # }}}

        prior_deps = insn.depends_on
        if prior_deps is None:
            prior_deps = frozenset()

        processed_insns.append(
                insn.copy(depends_on=frozenset(inferred_deps) | prior_deps))

    return kernel.copy(instructions=processed_insns)
def add_default_dependencies(kernel):
    # Add automatically-inferred dependencies: each instruction whose
    # dependency list is not marked final gains a dependency on the (sole)
    # writer of every variable it reads.
    logger.debug("%s: default deps" % kernel.name)

    from loopy.transform.subst import expand_subst
    # expand substitution rules so that read_dependency_names() also sees
    # reads hidden inside substitution-rule invocations
    expanded_kernel = expand_subst(kernel)

    writer_map = kernel.writer_map()

    arg_names = set(arg.name for arg in kernel.args)

    var_names = arg_names | set(six.iterkeys(kernel.temporary_variables))

    # map: instruction id -> names of args/temporaries read by it
    dep_map = dict(
            (insn.id, insn.read_dependency_names() & var_names)
            for insn in expanded_kernel.instructions)

    new_insns = []
    for insn in kernel.instructions:
        if not insn.depends_on_is_final:
            auto_deps = set()

            # {{{ add automatic dependencies

            all_my_var_writers = set()
            for var in dep_map[insn.id]:
                var_writers = writer_map.get(var, set())
                all_my_var_writers |= var_writers

                if not var_writers and var not in arg_names:
                    # a temporary is read but has no writer; only warn when
                    # it also has no initializer to supply a value
                    tv = kernel.temporary_variables[var]
                    if tv.initializer is None:
                        warn_with_kernel(kernel, "read_no_write(%s)" % var,
                                "temporary variable '%s' is read, but never written."
                                % var)

                # only add a dependency when the writer is unambiguous
                if len(var_writers) == 1:
                    auto_deps.update(
                            var_writers
                            - set([insn.id]))

            # }}}

            depends_on = insn.depends_on
            if depends_on is None:
                depends_on = frozenset()

            insn = insn.copy(depends_on=frozenset(auto_deps) | depends_on)

        new_insns.append(insn)

    return kernel.copy(instructions=new_insns)
def infer_unknown_types(kernel, expect_completion=False):
    """Infer types on temporaries and arguments.

    Repeatedly attempts to infer a dtype for every temporary variable whose
    dtype is :data:`loopy.auto` and every argument whose dtype is *None*,
    re-queueing items that cannot be resolved yet until either everything is
    typed or no further progress is possible.

    :arg expect_completion: if *True*, raise :class:`LoopyError` when a type
        cannot be determined; otherwise leave such entries untyped.
    :returns: a copy of the (unexpanded) kernel with the inferred types
        filled in.
    """
    logger.debug("%s: infer types" % kernel.name)

    def debug(s):
        logger.debug("%s: %s" % (kernel.name, s))

    # inference runs on the subst-expanded kernel, but the result is applied
    # to the original one
    unexpanded_kernel = kernel
    if kernel.substitutions:
        from loopy.transform.subst import expand_subst
        kernel = expand_subst(kernel)

    new_temp_vars = kernel.temporary_variables.copy()
    new_arg_dict = kernel.arg_dict.copy()

    # {{{ fill queue

    # queue contains temporary variables and arguments with unknown types
    queue = []

    import loopy as lp
    for tv in six.itervalues(kernel.temporary_variables):
        if tv.dtype is lp.auto:
            queue.append(tv)

    for arg in kernel.args:
        if arg.dtype is None:
            queue.append(arg)

    # }}}

    from loopy.expression import TypeInferenceMapper
    # the mapper looks up dtypes in the (continuously updated) result dicts,
    # so types inferred earlier become available to later inferences
    type_inf_mapper = TypeInferenceMapper(kernel, _DictUnionView([
        new_temp_vars,
        new_arg_dict
        ]))

    from loopy.symbolic import SubstitutionRuleExpander
    subst_expander = SubstitutionRuleExpander(kernel.substitutions)

    # {{{ work on type inference queue

    from loopy.kernel.data import TemporaryVariable, KernelArgument

    failed_names = set()
    while queue:
        item = queue.pop(0)

        debug("inferring type for %s %s" % (type(item).__name__, item.name))

        result, symbols_with_unavailable_types = \
                _infer_var_type(kernel, item.name, type_inf_mapper, subst_expander)

        failed = result is None
        if not failed:
            debug(" success: %s" % result)
            if isinstance(item, TemporaryVariable):
                new_temp_vars[item.name] = item.copy(dtype=result)
            elif isinstance(item, KernelArgument):
                new_arg_dict[item.name] = item.copy(dtype=result)
            else:
                raise LoopyError("unexpected item type in type inference")
        else:
            debug(" failure")

        if failed:
            if item.name in failed_names:
                # this item has failed before, give up.
                advice = ""
                if symbols_with_unavailable_types:
                    advice += (
                            " (need type of '%s'--check for missing arguments)"
                            % ", ".join(symbols_with_unavailable_types))

                if expect_completion:
                    raise LoopyError(
                            "could not determine type of '%s'%s"
                            % (item.name, advice))

                else:
                    # We're done here.
                    break

            # remember that this item failed
            failed_names.add(item.name)

            queue_names = set(qi.name for qi in queue)

            if queue_names == failed_names:
                # Everything still queued has already failed once; no further
                # progress is possible.  (Previously dumped state via a stray
                # print() -- use the logger instead.)
                debug("type inference stalled on: %s (last item: %s)"
                        % (", ".join(sorted(failed_names)), item.name))
                assert not expect_completion
                break

            # can't infer type yet, put back into queue
            queue.append(item)
        else:
            # we've made progress, reset failure markers
            failed_names = set()

    # }}}

    return unexpanded_kernel.copy(
            temporary_variables=new_temp_vars,
            args=[new_arg_dict[arg.name] for arg in kernel.args],
            )
def infer_unknown_types(kernel, expect_completion=False):
    """Infer types on temporaries and arguments.

    Builds a dependency graph between untyped variables, partitions it into
    strongly connected components (SCCs), and resolves types one SCC at a
    time in topological order, iterating within each SCC until a fixed point
    is reached.

    :arg expect_completion: if *True*, raise :class:`LoopyError` when a type
        cannot be determined; otherwise leave such entries untyped.
    :returns: a copy of the (unexpanded) kernel with the inferred types
        filled in.
    """
    logger.debug("%s: infer types" % kernel.name)

    from functools import partial
    debug = partial(_debug, kernel)

    import time
    start_time = time.time()

    # inference runs on the subst-expanded kernel, but the result is applied
    # to the original one
    unexpanded_kernel = kernel
    if kernel.substitutions:
        from loopy.transform.subst import expand_subst
        kernel = expand_subst(kernel)

    new_temp_vars = kernel.temporary_variables.copy()
    new_arg_dict = kernel.arg_dict.copy()

    # {{{ find names_with_unknown_types

    # contains both arguments and temporaries
    names_for_type_inference = []

    import loopy as lp
    for tv in six.itervalues(kernel.temporary_variables):
        assert tv.dtype is not lp.auto
        if tv.dtype is None:
            names_for_type_inference.append(tv.name)

    for arg in kernel.args:
        assert arg.dtype is not lp.auto
        if arg.dtype is None:
            names_for_type_inference.append(arg.name)

    # }}}

    logger.debug("finding types for {count:d} names".format(
        count=len(names_for_type_inference)))

    writer_map = kernel.writer_map()

    # set for O(1) membership tests in the comprehension below
    # (list membership made dep-graph construction quadratic)
    names_for_type_inference_set = set(names_for_type_inference)

    # type-dependency graph: written variable -> untyped variables read by
    # the instructions that write it
    dep_graph = dict(
            (written_var, set(
                read_var
                for insn_id in writer_map.get(written_var, [])
                for read_var in kernel.id_to_insn[insn_id].read_dependency_names()
                if read_var in names_for_type_inference_set))
            for written_var in names_for_type_inference)

    from loopy.tools import compute_sccs

    # To speed up processing, we sort the variables by computing the SCCs of the
    # type dependency graph. Each SCC represents a set of variables whose types
    # mutually depend on themselves. The SCCs are returned and processed in
    # topological order.
    sccs = compute_sccs(dep_graph)

    item_lookup = _DictUnionView([
        new_temp_vars,
        new_arg_dict
        ])
    type_inf_mapper = TypeInferenceMapper(kernel, item_lookup)

    from loopy.symbolic import SubstitutionRuleExpander
    subst_expander = SubstitutionRuleExpander(kernel.substitutions)

    # {{{ work on type inference queue

    from loopy.kernel.data import TemporaryVariable, KernelArgument

    for var_chain in sccs:
        changed_during_last_queue_run = False
        queue = var_chain[:]
        failed_names = set()

        while queue or changed_during_last_queue_run:
            if not queue and changed_during_last_queue_run:
                changed_during_last_queue_run = False
                # Optimization: If there's a single variable in the SCC without
                # a self-referential dependency, then the type is known after a
                # single iteration (we don't need to look at the expressions
                # again).
                if len(var_chain) == 1:
                    single_var, = var_chain
                    if single_var not in dep_graph[single_var]:
                        break
                queue = var_chain[:]

            name = queue.pop(0)
            item = item_lookup[name]

            debug("inferring type for %s %s", type(item).__name__, item.name)

            result, symbols_with_unavailable_types = (
                    _infer_var_type(
                        kernel, item.name, type_inf_mapper, subst_expander))

            failed = not result
            if not failed:
                new_dtype, = result
                if new_dtype.target is None:
                    new_dtype = new_dtype.with_target(kernel.target)

                debug(" success: %s", new_dtype)

                if new_dtype != item.dtype:
                    debug(" changed from: %s", item.dtype)
                    changed_during_last_queue_run = True

                    if isinstance(item, TemporaryVariable):
                        new_temp_vars[name] = item.copy(dtype=new_dtype)
                    elif isinstance(item, KernelArgument):
                        new_arg_dict[name] = item.copy(dtype=new_dtype)
                    else:
                        raise LoopyError("unexpected item type in type inference")
            else:
                debug(" failure")

            if failed:
                if item.name in failed_names:
                    # this item has failed before, give up.
                    advice = ""
                    if symbols_with_unavailable_types:
                        advice += (
                                " (need type of '%s'--check for missing arguments)"
                                % ", ".join(symbols_with_unavailable_types))

                    if expect_completion:
                        raise LoopyError(
                                "could not determine type of '%s'%s"
                                % (item.name, advice))

                    else:
                        # We're done here.
                        break

                # remember that this item failed
                failed_names.add(item.name)

                if set(queue) == failed_names:
                    # Everything still queued has already failed once; no
                    # further progress is possible.  (Previously dumped state
                    # via a stray print() -- use the logger instead.)
                    debug("type inference stalled on: %s (last item: %s)",
                            ", ".join(sorted(failed_names)), item.name)
                    assert not expect_completion
                    break

                # can't infer type yet, put back into queue
                queue.append(name)
            else:
                # we've made progress, reset failure markers
                failed_names = set()

    # }}}

    end_time = time.time()
    logger.debug("type inference took {dur:.2f} seconds".format(
        dur=end_time - start_time))

    return unexpanded_kernel.copy(
            temporary_variables=new_temp_vars,
            args=[new_arg_dict[arg.name] for arg in kernel.args],
            )
def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
    """Find and hoist common factors out of increments to *var_name*.

    Inspects all assignments to *var_name*, finds product factors common to
    every added term, strips those factors from the stored value, and
    re-multiplies them at every usage site, so that the final value is
    unchanged but the stored intermediate is smaller.

    :arg var_name: name of the temporary or argument being incremented.
    :arg vary_by_axes: axes (indices or dim names) across which the common
        factors may differ; accesses are keyed by their index expressions
        along these axes.
    :raises NameError: if *var_name* is not a temporary or argument.
    :raises LoopyError: on malformed input or when no common factors exist.
    """
    assert isinstance(kernel, LoopKernel)
    # FIXME: Does not understand subst rules for now
    if kernel.substitutions:
        from loopy.transform.subst import expand_subst
        kernel = expand_subst(kernel)

    if var_name in kernel.temporary_variables:
        var_descr = kernel.temporary_variables[var_name]
    elif var_name in kernel.arg_dict:
        var_descr = kernel.arg_dict[var_name]
    else:
        raise NameError("array '%s' was not found" % var_name)

    # {{{ check/normalize vary_by_axes

    if isinstance(vary_by_axes, str):
        vary_by_axes = vary_by_axes.split(",")

    from loopy.kernel.array import ArrayBase
    if isinstance(var_descr, ArrayBase):
        if var_descr.dim_names is not None:
            name_to_index = {
                    name: idx
                    for idx, name in enumerate(var_descr.dim_names)}
        else:
            name_to_index = {}

        def map_ax_name_to_index(ax):
            # resolve named axes to indices; integer axes pass through
            if isinstance(ax, str):
                try:
                    return name_to_index[ax]
                except KeyError:
                    raise LoopyError("axis name '%s' not understood " % ax)
            else:
                return ax

        vary_by_axes = [map_ax_name_to_index(ax) for ax in vary_by_axes]

        # valid axis indices are 0 .. num_user_axes()-1, so an index equal
        # to num_user_axes() is already out of bounds (was '>', an
        # off-by-one that let it slip through)
        if (vary_by_axes
                and (min(vary_by_axes) < 0
                    or max(vary_by_axes) >= var_descr.num_user_axes())):
            raise LoopyError("vary_by_axes refers to out-of-bounds axis index")

    # }}}

    from pymbolic.mapper.substitutor import make_subst_func
    from pymbolic.primitives import (Sum, Product, is_zero,
            flattened_sum, flattened_product, Subscript, Variable)
    from loopy.symbolic import (get_dependencies, SubstitutionMapper,
            UnidirectionalUnifier)

    # {{{ common factor key list maintenance

    # list of (index_key, common factors found)
    common_factors = []

    def find_unifiable_cf_index(index_key):
        # return (position, unification record) of the entry whose key
        # unifies with index_key, or (None, None) if there is none
        for i, (key, _val) in enumerate(common_factors):
            unif = UnidirectionalUnifier(
                    lhs_mapping_candidates=get_dependencies(key))

            unif_result = unif(key, index_key)

            if unif_result:
                assert len(unif_result) == 1
                return i, unif_result[0]

        return None, None

    def extract_index_key(access_expr):
        # key an access by its index expressions along vary_by_axes
        if isinstance(access_expr, Variable):
            return ()
        elif isinstance(access_expr, Subscript):
            index = access_expr.index_tuple
            return tuple(index[ax] for ax in vary_by_axes)
        else:
            raise ValueError("unexpected type of access_expr")

    def is_assignee(insn):
        return var_name in insn.assignee_var_names()

    def iterate_as(cls, expr):
        # iterate children of a Sum/Product, or yield the expr itself
        if isinstance(expr, cls):
            yield from expr.children
        else:
            yield expr

    # }}}

    # {{{ find common factors

    from loopy.kernel.data import Assignment

    for insn in kernel.instructions:
        if not is_assignee(insn):
            continue

        if not isinstance(insn, Assignment):
            raise LoopyError("'%s' modified by non-single-assignment"
                    % var_name)

        lhs = insn.assignee
        rhs = insn.expression

        if is_zero(rhs):
            continue

        index_key = extract_index_key(lhs)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        if cf_index is None:
            # {{{ doesn't exist yet

            assert unif_result is None

            my_common_factors = None

            for term in iterate_as(Sum, rhs):
                if term == lhs:
                    # the self-increment term carries no factors
                    continue

                for part in iterate_as(Product, term):
                    if var_name in get_dependencies(part):
                        raise LoopyError("unexpected dependency on '%s' "
                                "in RHS of instruction '%s'"
                                % (var_name, insn.id))

                product_parts = set(iterate_as(Product, term))

                # intersect factor sets across all added terms
                if my_common_factors is None:
                    my_common_factors = product_parts
                else:
                    my_common_factors = my_common_factors & product_parts

            if my_common_factors is not None:
                common_factors.append((index_key, my_common_factors))

            # }}}
        else:
            # {{{ match, filter existing common factors

            _, my_common_factors = common_factors[cf_index]

            unif_subst_map = SubstitutionMapper(
                    make_subst_func(unif_result.lmap))

            for term in iterate_as(Sum, rhs):
                if term == lhs:
                    continue

                for part in iterate_as(Product, term):
                    if var_name in get_dependencies(part):
                        raise LoopyError("unexpected dependency on '%s' "
                                "in RHS of instruction '%s'"
                                % (var_name, insn.id))

                product_parts = set(iterate_as(Product, term))

                # keep only factors (mapped into this access's index frame)
                # that also occur in this term
                my_common_factors = {
                        cf for cf in my_common_factors
                        if unif_subst_map(cf) in product_parts}

            common_factors[cf_index] = (index_key, my_common_factors)

            # }}}

    # }}}

    # drop entries whose factor set ended up empty
    common_factors = [
            (ik, cf) for ik, cf in common_factors
            if cf]

    if not common_factors:
        raise LoopyError("no common factors found")

    # {{{ remove common factors

    new_insns = []

    for insn in kernel.instructions:
        if not isinstance(insn, Assignment) or not is_assignee(insn):
            new_insns.append(insn)
            continue

        lhs = insn.assignee
        rhs = insn.expression

        if is_zero(rhs):
            new_insns.append(insn)
            continue

        # (a redundant duplicate extract_index_key(insn.assignee) call was
        # removed here; lhs is insn.assignee)
        index_key = extract_index_key(lhs)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        if cf_index is None:
            new_insns.append(insn)
            continue

        _, my_common_factors = common_factors[cf_index]

        unif_subst_map = SubstitutionMapper(
                make_subst_func(unif_result.lmap))

        mapped_my_common_factors = {
                unif_subst_map(cf)
                for cf in my_common_factors}

        new_sum_terms = []

        for term in iterate_as(Sum, rhs):
            if term == lhs:
                new_sum_terms.append(term)
                continue

            # strip the common factors from each added term
            new_sum_terms.append(
                    flattened_product([
                        part
                        for part in iterate_as(Product, term)
                        if part not in mapped_my_common_factors
                        ]))

        new_insns.append(insn.copy(expression=flattened_sum(new_sum_terms)))

    # }}}

    # {{{ substitute common factors into usage sites

    def find_substitution(expr):
        if isinstance(expr, Subscript):
            v = expr.aggregate.name
        elif isinstance(expr, Variable):
            v = expr.name
        else:
            return expr

        if v != var_name:
            return expr

        index_key = extract_index_key(expr)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        # NOTE(review): if a usage site's index key fails to unify with any
        # recorded entry, cf_index/unif_result are None and the next line
        # raises AttributeError -- confirm this is unreachable for read sites
        unif_subst_map = SubstitutionMapper(
                make_subst_func(unif_result.lmap))

        _, my_common_factors = common_factors[cf_index]

        if my_common_factors is not None:
            # re-multiply the hoisted factors at the read site
            return flattened_product(
                    [unif_subst_map(cf) for cf in my_common_factors]
                    + [expr])
        else:
            return expr

    insns = new_insns
    new_insns = []

    subm = SubstitutionMapper(find_substitution)

    for insn in insns:
        if not isinstance(insn, Assignment) or is_assignee(insn):
            # increments themselves were already rewritten above
            new_insns.append(insn)
            continue

        new_insns.append(insn.with_transformed_expressions(subm))

    # }}}

    return kernel.copy(instructions=new_insns)
def preprocess_kernel(kernel, device=None):
    """Run the preprocessing pass pipeline on *kernel* and return a copy in
    the PREPROCESSED state.  Results are memoized in *preprocess_cache* when
    caching is enabled.  The pass order below is load-bearing -- see the
    inline ordering-restriction comments.

    :arg device: deprecated and ignored.
    """
    if device is not None:
        from warnings import warn
        warn("passing 'device' to preprocess_kernel() is deprecated",
                DeprecationWarning, stacklevel=2)

    from loopy.kernel import kernel_state
    if kernel.state != kernel_state.INITIAL:
        raise LoopyError("cannot re-preprocess an already preprocessed "
                "kernel")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED
    if CACHING_ENABLED:
        # keep the pre-preprocessing kernel around as the cache key
        input_kernel = kernel

        try:
            result = preprocess_cache[kernel]
            logger.info("%s: preprocess cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    logger.info("%s: preprocess start" % kernel.name)

    # {{{ check that there are no l.auto-tagged inames

    from loopy.kernel.data import AutoLocalIndexTagBase
    for iname, tag in six.iteritems(kernel.iname_to_tag):
        if (isinstance(tag, AutoLocalIndexTagBase)
                and iname in kernel.all_inames()):
            raise LoopyError("kernel with automatically-assigned "
                    "local axes passed to preprocessing")

    # }}}

    from loopy.transform.subst import expand_subst
    kernel = expand_subst(kernel)

    # Ordering restriction:
    # Type inference and reduction iname uniqueness don't handle substitutions.
    # Get them out of the way.

    kernel = infer_unknown_types(kernel, expect_completion=False)

    check_reduction_iname_uniqueness(kernel)

    kernel = add_default_dependencies(kernel)

    # Ordering restrictions:
    #
    # - realize_reduction must happen after type inference because it needs
    #   to be able to determine the types of the reduced expressions.
    #
    # - realize_reduction must happen after default dependencies are added
    #   because it manipulates the depends_on field, which could prevent
    #   defaults from being applied.

    kernel = realize_reduction(kernel, unknown_types_ok=False)

    # Ordering restriction:
    # add_axes_to_temporaries_for_ilp because reduction accumulators
    # need to be duplicated by this.

    from loopy.transform.ilp import add_axes_to_temporaries_for_ilp_and_vec
    kernel = add_axes_to_temporaries_for_ilp_and_vec(kernel)

    kernel = find_temporary_scope(kernel)

    kernel = find_idempotence(kernel)
    kernel = limit_boostability(kernel)

    # give the target (e.g. CUDA/OpenCL backend) its own preprocessing shot
    kernel = kernel.target.preprocess(kernel)

    logger.info("%s: preprocess done" % kernel.name)

    kernel = kernel.copy(
            state=kernel_state.PREPROCESSED)

    # {{{ prepare for caching

    # PicklableDtype instances for example need to know the target they're working
    # towards in order to pickle and unpickle them. This is the first pass that
    # uses caching, so we need to be ready to pickle. This means propagating
    # this target information.

    if CACHING_ENABLED:
        input_kernel = prepare_for_caching(input_kernel)

    kernel = prepare_for_caching(kernel)

    # }}}

    if CACHING_ENABLED:
        preprocess_cache[input_kernel] = kernel

    return kernel
def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
    """Find product factors common to every term added to *var_name*, strip
    them from the stored value, and re-multiply them at every usage site, so
    that the final value is unchanged but the stored intermediate is smaller.
    """
    # FIXME: Does not understand subst rules for now
    if kernel.substitutions:
        from loopy.transform.subst import expand_subst
        kernel = expand_subst(kernel)

    if var_name in kernel.temporary_variables:
        var_descr = kernel.temporary_variables[var_name]
    elif var_name in kernel.arg_dict:
        var_descr = kernel.arg_dict[var_name]
    else:
        raise NameError("array '%s' was not found" % var_name)

    # {{{ check/normalize vary_by_axes

    if isinstance(vary_by_axes, str):
        vary_by_axes = vary_by_axes.split(",")

    from loopy.kernel.array import ArrayBase
    if isinstance(var_descr, ArrayBase):
        if var_descr.dim_names is not None:
            name_to_index = dict(
                    (name, idx)
                    for idx, name in enumerate(var_descr.dim_names))
        else:
            name_to_index = {}

        def map_ax_name_to_index(ax):
            # resolve named axes to indices; integer axes pass through
            if isinstance(ax, str):
                try:
                    return name_to_index[ax]
                except KeyError:
                    raise LoopyError("axis name '%s' not understood " % ax)
            else:
                return ax

        vary_by_axes = [map_ax_name_to_index(ax) for ax in vary_by_axes]

        # NOTE(review): '>' admits index == num_user_axes(), which looks
        # out of bounds for 0-based axes -- confirm
        if (
                vary_by_axes
                and
                (min(vary_by_axes) < 0
                    or
                    max(vary_by_axes) > var_descr.num_user_axes())):
            raise LoopyError("vary_by_axes refers to out-of-bounds axis index")

    # }}}

    from pymbolic.mapper.substitutor import make_subst_func
    from pymbolic.primitives import (Sum, Product, is_zero,
            flattened_sum, flattened_product, Subscript, Variable)
    from loopy.symbolic import (get_dependencies, SubstitutionMapper,
            UnidirectionalUnifier)

    # {{{ common factor key list maintenance

    # list of (index_key, common factors found)
    common_factors = []

    def find_unifiable_cf_index(index_key):
        # return (position, unification record) of the entry whose key
        # unifies with index_key, or (None, None) if there is none
        for i, (key, val) in enumerate(common_factors):
            unif = UnidirectionalUnifier(
                    lhs_mapping_candidates=get_dependencies(key))

            unif_result = unif(key, index_key)

            if unif_result:
                assert len(unif_result) == 1
                return i, unif_result[0]

        return None, None

    def extract_index_key(access_expr):
        # key an access by its index expressions along vary_by_axes
        if isinstance(access_expr, Variable):
            return ()
        elif isinstance(access_expr, Subscript):
            index = access_expr.index_tuple
            return tuple(index[ax] for ax in vary_by_axes)
        else:
            raise ValueError("unexpected type of access_expr")

    def is_assignee(insn):
        return any(
                lhs == var_name
                for lhs, sbscript in insn.assignees_and_indices())

    def iterate_as(cls, expr):
        # iterate children of a Sum/Product, or yield the expr itself
        if isinstance(expr, cls):
            for ch in expr.children:
                yield ch
        else:
            yield expr

    # }}}

    # {{{ find common factors

    from loopy.kernel.data import Assignment

    for insn in kernel.instructions:
        if not is_assignee(insn):
            continue

        if not isinstance(insn, Assignment):
            raise LoopyError("'%s' modified by non-expression instruction"
                    % var_name)

        lhs = insn.assignee
        rhs = insn.expression

        if is_zero(rhs):
            continue

        index_key = extract_index_key(lhs)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        if cf_index is None:
            # {{{ doesn't exist yet

            assert unif_result is None

            my_common_factors = None

            for term in iterate_as(Sum, rhs):
                if term == lhs:
                    # the self-increment term carries no factors
                    continue

                for part in iterate_as(Product, term):
                    if var_name in get_dependencies(part):
                        raise LoopyError("unexpected dependency on '%s' "
                                "in RHS of instruction '%s'"
                                % (var_name, insn.id))

                product_parts = set(iterate_as(Product, term))

                # intersect factor sets across all added terms
                if my_common_factors is None:
                    my_common_factors = product_parts
                else:
                    my_common_factors = my_common_factors & product_parts

            if my_common_factors is not None:
                common_factors.append((index_key, my_common_factors))

            # }}}
        else:
            # {{{ match, filter existing common factors

            _, my_common_factors = common_factors[cf_index]

            unif_subst_map = SubstitutionMapper(
                    make_subst_func(unif_result.lmap))

            for term in iterate_as(Sum, rhs):
                if term == lhs:
                    continue

                for part in iterate_as(Product, term):
                    if var_name in get_dependencies(part):
                        raise LoopyError("unexpected dependency on '%s' "
                                "in RHS of instruction '%s'"
                                % (var_name, insn.id))

                product_parts = set(iterate_as(Product, term))

                # keep only factors (mapped into this access's index frame)
                # that also occur in this term
                my_common_factors = set(
                        cf for cf in my_common_factors
                        if unif_subst_map(cf) in product_parts)

            common_factors[cf_index] = (index_key, my_common_factors)

            # }}}

    # }}}

    # {{{ remove common factors

    new_insns = []

    for insn in kernel.instructions:
        if not isinstance(insn, Assignment) or not is_assignee(insn):
            new_insns.append(insn)
            continue

        (_, index_key), = insn.assignees_and_indices()

        lhs = insn.assignee
        rhs = insn.expression

        if is_zero(rhs):
            new_insns.append(insn)
            continue

        index_key = extract_index_key(lhs)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        if cf_index is None:
            # no common factors recorded for this access pattern
            new_insns.append(insn)
            continue

        _, my_common_factors = common_factors[cf_index]

        unif_subst_map = SubstitutionMapper(
                make_subst_func(unif_result.lmap))

        mapped_my_common_factors = set(
                unif_subst_map(cf)
                for cf in my_common_factors)

        new_sum_terms = []

        for term in iterate_as(Sum, rhs):
            if term == lhs:
                new_sum_terms.append(term)
                continue

            # strip the common factors from each added term
            new_sum_terms.append(
                    flattened_product([
                        part
                        for part in iterate_as(Product, term)
                        if part not in mapped_my_common_factors
                        ]))

        new_insns.append(
                insn.copy(expression=flattened_sum(new_sum_terms)))

    # }}}

    # {{{ substitute common factors into usage sites

    def find_substitution(expr):
        if isinstance(expr, Subscript):
            v = expr.aggregate.name
        elif isinstance(expr, Variable):
            v = expr.name
        else:
            return expr

        if v != var_name:
            return expr

        index_key = extract_index_key(expr)
        cf_index, unif_result = find_unifiable_cf_index(index_key)

        # NOTE(review): if no entry unifies, unif_result is None and the
        # next line raises AttributeError -- confirm this is unreachable
        unif_subst_map = SubstitutionMapper(
                make_subst_func(unif_result.lmap))

        _, my_common_factors = common_factors[cf_index]

        if my_common_factors is not None:
            # re-multiply the hoisted factors at the read site
            return flattened_product(
                    [unif_subst_map(cf) for cf in my_common_factors]
                    + [expr])
        else:
            return expr

    insns = new_insns
    new_insns = []

    subm = SubstitutionMapper(find_substitution)

    for insn in insns:
        if not isinstance(insn, Assignment) or is_assignee(insn):
            # increments themselves were already rewritten above
            new_insns.append(insn)
            continue

        new_insns.append(insn.with_transformed_expressions(subm))

    # }}}

    return kernel.copy(instructions=new_insns)
def add_prefetch(kernel, var_name, sweep_inames=(), dim_arg_names=None,

        # "None" is a valid value here, distinct from the default.
        default_tag=_not_provided,

        rule_name=None,
        temporary_name=None,
        temporary_scope=None, temporary_is_local=None,
        footprint_subscripts=None,
        fetch_bounding_box=False,
        fetch_outer_inames=None):
    """Prefetch all accesses to the variable *var_name*, with all accesses
    being swept through *sweep_inames*.

    :arg var_name: A string, the name of the variable being prefetched.
        This may be a 'tagged variable name' (such as ``field$mytag``
        to restrict the effect of the operation to only variable accesses
        with a matching tag.

        This may also be a subscripted version of the variable, in which
        case this access dictates the footprint that is prefetched,
        e.g. ``A[:,:]`` or ``field[i,j,:,:]``. In this case, accesses
        in the kernel are disregarded.

    :arg sweep_inames: A list of inames, or a comma-separated string of them.
        This routine 'sweeps' all accesses to *var_name* through all allowed
        values of the *sweep_inames* to generate a footprint. All values
        in this footprint are then stored in a temporary variable, and
        the original variable accesses replaced with accesses to this
        temporary.

    :arg dim_arg_names: List of names representing each fetch axis.
        These names show up as inames in the generated fetch code

    :arg default_tag: The :ref:`implementation tag <iname-tags>` to
        assign to the inames driving the prefetch code. Use *None* to
        leave them undefined (to assign them later by hand). The current
        default will make them local axes and automatically split them to
        fit the work group size, but this default will disappear in favor
        of simply leaving them untagged in 2019.x. For 2018.x, a warning
        will be issued if no *default_tag* is specified.

    :arg rule_name: base name of the generated temporary variable.
    :arg temporary_name: The name of the temporary to be used.
    :arg temporary_scope: The :class:`temp_var_scope` to use for the
        temporary.
    :arg temporary_is_local: Deprecated, use *temporary_scope* instead.
    :arg footprint_subscripts: A list of tuples indicating the index (i.e.
        subscript) tuples used to generate the footprint.

        If only one such set of indices is desired, this may also be specified
        directly by putting an index expression into *var_name*. Substitutions
        such as those occurring in dimension splits are recorded and also
        applied to these indices.

    :arg fetch_bounding_box: To fit within :mod:`loopy`'s execution model,
        the 'footprint' of the fetch currently has to be a convex set.
        Sometimes this is not the case, e.g. for a high-order stencil::

              o
              o
            ooooo
              o
              o

        The footprint of the stencil when 'swept' over a base domain
        would look like this, and because of the 'missing corners',
        this set is not convex::

              oooooooooo
              oooooooooo
            oooooooooooooo
            oooooooooooooo
            oooooooooooooo
            oooooooooooooo
              oooooooooo
              oooooooooo

        Passing ``fetch_bounding_box=True`` gives :mod:`loopy` permission
        to instead fetch the 'bounding box' of the footprint, i.e. this
        set in the stencil example::

            OOooooooooooOO
            OOooooooooooOO
            oooooooooooooo
            oooooooooooooo
            oooooooooooooo
            oooooooooooooo
            OOooooooooooOO
            OOooooooooooOO

        Note the added corners marked with "``O``". The resulting footprint is
        guaranteed to be convex.

    :arg fetch_outer_inames: The inames within which the fetch
        instruction is nested. If *None*, make an educated guess.

    This function internally uses :func:`extract_subst` and :func:`precompute`.
    """
    # NOTE: sweep_inames previously defaulted to a mutable [] -- changed to
    # an (immutable, behaviorally identical) tuple; the value is always
    # copied via split()/list() below before use.

    # {{{ fish indexing out of var_name and into footprint_subscripts

    from loopy.symbolic import parse
    parsed_var_name = parse(var_name)

    from pymbolic.primitives import Variable, Subscript
    if isinstance(parsed_var_name, Variable):
        # nothing to see
        pass
    elif isinstance(parsed_var_name, Subscript):
        if footprint_subscripts is not None:
            raise TypeError("if footprint_subscripts is specified, then var_name "
                    "may not contain a subscript")

        assert isinstance(parsed_var_name.aggregate, Variable)
        footprint_subscripts = [parsed_var_name.index]
        parsed_var_name = parsed_var_name.aggregate
    else:
        raise ValueError("var_name must either be a variable name or a subscript")

    # }}}

    # {{{ fish out tag

    from loopy.symbolic import TaggedVariable
    if isinstance(parsed_var_name, TaggedVariable):
        var_name = parsed_var_name.name
        tag = parsed_var_name.tag
    else:
        var_name = parsed_var_name.name
        tag = None

    # }}}

    c_name = var_name
    if tag is not None:
        c_name = c_name + "_" + tag

    var_name_gen = kernel.get_var_name_generator()

    if rule_name is None:
        rule_name = var_name_gen("%s_fetch_rule" % c_name)
    if temporary_name is None:
        temporary_name = var_name_gen("%s_fetch" % c_name)

    arg = kernel.arg_dict[var_name]

    # {{{ make parameter names and unification template

    parameters = []
    for i in range(arg.num_user_axes()):
        based_on = "%s_dim_%d" % (c_name, i)
        if arg.dim_names is not None:
            based_on = "%s_dim_%s" % (c_name, arg.dim_names[i])
        if dim_arg_names is not None and i < len(dim_arg_names):
            based_on = dim_arg_names[i]

        par_name = var_name_gen(based_on=based_on)
        parameters.append(par_name)

    from pymbolic import var
    uni_template = parsed_var_name
    if len(parameters) > 1:
        uni_template = uni_template.index(
                tuple(var(par_name) for par_name in parameters))
    elif len(parameters) == 1:
        uni_template = uni_template.index(var(parameters[0]))

    # }}}

    from loopy.transform.subst import extract_subst
    kernel = extract_subst(kernel, rule_name, uni_template, parameters)

    if isinstance(sweep_inames, str):
        sweep_inames = [s.strip() for s in sweep_inames.split(",")]
    else:
        # copy, standardize to list
        sweep_inames = list(sweep_inames)

    kernel, subst_use, sweep_inames, inames_to_be_removed = \
            _process_footprint_subscripts(
                    kernel, rule_name, sweep_inames,
                    footprint_subscripts, arg)

    # Our _not_provided is actually a different object from the one in the
    # precompute module, but precompute acutally uses that to adjust its
    # warning message.

    from loopy.transform.precompute import precompute
    new_kernel = precompute(kernel, subst_use, sweep_inames,
            precompute_inames=dim_arg_names,
            default_tag=default_tag, dtype=arg.dtype,
            fetch_bounding_box=fetch_bounding_box,
            temporary_name=temporary_name,
            temporary_scope=temporary_scope,
            temporary_is_local=temporary_is_local,
            precompute_outer_inames=fetch_outer_inames)

    # {{{ remove inames that were temporarily added by slice sweeps

    new_domains = new_kernel.domains[:]

    for iname in inames_to_be_removed:
        home_domain_index = kernel.get_home_domain_index(iname)
        domain = new_domains[home_domain_index]

        dt, idx = domain.get_var_dict()[iname]
        assert dt == dim_type.set

        new_domains[home_domain_index] = domain.project_out(dt, idx, 1)

    new_kernel = new_kernel.copy(domains=new_domains)

    # }}}

    # If the rule survived past precompute() (i.e. some accesses fell outside
    # the footprint), get rid of it before moving on.
    if rule_name in new_kernel.substitutions:
        from loopy.transform.subst import expand_subst
        return expand_subst(new_kernel, "... > id:"+rule_name)
    else:
        return new_kernel
def find_all_insn_inames(kernel):
    """Return a mapping from instruction id to the :class:`frozenset` of
    iname names within which that instruction nests.

    Starting from each instruction's read/write dependencies (intersected
    with the kernel's inames) plus its explicit ``within_inames``, a
    fixed-point iteration propagates inames through variable use and through
    domain parameters. Instructions with ``within_inames_is_final`` set are
    never modified.
    """
    logger.debug("%s: find_all_insn_inames: start" % kernel.name)

    writer_map = kernel.writer_map()

    # insn id -> current estimate of the instruction's nesting inames
    insn_id_to_inames = {}
    # insn id -> inames appearing on the instruction's write side
    insn_assignee_inames = {}

    all_read_deps = {}
    all_write_deps = {}

    # Substitution rules can hide variable accesses; expand them so the
    # dependency sets gathered below are complete.
    from loopy.transform.subst import expand_subst
    kernel = expand_subst(kernel)

    for insn in kernel.instructions:
        all_read_deps[insn.id] = read_deps = insn.read_dependency_names()
        all_write_deps[insn.id] = write_deps = insn.write_dependency_names()
        deps = read_deps | write_deps

        if insn.within_inames_is_final:
            # User pinned the nesting explicitly--take it as-is.
            iname_deps = insn.within_inames
        else:
            # Initial guess: every iname the instruction mentions, plus any
            # explicitly requested ones.
            iname_deps = (
                    deps & kernel.all_inames()
                    | insn.within_inames)

        assert isinstance(read_deps, frozenset), type(insn)
        assert isinstance(write_deps, frozenset), type(insn)
        assert isinstance(iname_deps, frozenset), type(insn)

        logger.debug("%s: find_all_insn_inames: %s (init): %s - "
                "read deps: %s - write deps: %s" % (
                    kernel.name, insn.id, ", ".join(sorted(iname_deps)),
                    ", ".join(sorted(read_deps)),
                    ", ".join(sorted(write_deps)),
                    ))

        insn_id_to_inames[insn.id] = iname_deps
        insn_assignee_inames[insn.id] = write_deps & kernel.all_inames()

    # fixed point iteration until all iname dep sets have converged

    # Why is fixed point iteration necessary here? Consider the following
    # scenario:
    #
    # z = expr(iname)
    # y = expr(z)
    # x = expr(y)
    #
    # x clearly has a dependency on iname, but this is not found until that
    # dependency has propagated all the way up. Doing this recursively is
    # not guaranteed to terminate because of circular dependencies.
    while True:
        did_something = False
        for insn in kernel.instructions:

            if insn.within_inames_is_final:
                continue

            # {{{ depdency-based propagation

            inames_old = insn_id_to_inames[insn.id]
            inames_new = inames_old | guess_iname_deps_based_on_var_use(
                    kernel, insn, insn_id_to_inames)

            insn_id_to_inames[insn.id] = inames_new

            if inames_new != inames_old:
                did_something = True

                warn_with_kernel(kernel, "inferred_iname",
                        "The iname(s) '%s' on instruction '%s' "
                        "was/were automatically added. "
                        "This is deprecated. Please add the iname "
                        "to the instruction "
                        "explicitly, e.g. by adding 'for' loops"
                        % (", ".join(inames_new - inames_old), insn.id))

            # }}}

            # {{{ domain-based propagation

            inames_old = insn_id_to_inames[insn.id]
            inames_new = set(insn_id_to_inames[insn.id])

            for iname in inames_old:
                home_domain = kernel.domains[kernel.get_home_domain_index(
                    iname)]

                for par in home_domain.get_var_names(dim_type.param):
                    # Add all inames occurring in parameters of domains that my
                    # current inames refer to.
                    if par in kernel.all_inames():
                        inames_new.add(intern(par))

                    # If something writes the bounds of a loop in which I'm
                    # sitting, I had better be in the inames that the writer is
                    # in.
                    if par in kernel.temporary_variables:
                        for writer_id in writer_map.get(par, []):
                            inames_new.update(insn_id_to_inames[writer_id])

            if inames_new != inames_old:
                did_something = True
                insn_id_to_inames[insn.id] = frozenset(inames_new)

                warn_with_kernel(kernel, "inferred_iname",
                        "The iname(s) '%s' on instruction '%s' was "
                        "automatically added. "
                        "This is deprecated. Please add the iname "
                        "to the instruction "
                        "explicitly, e.g. by adding 'for' loops"
                        % (", ".join(inames_new - inames_old), insn.id))

            # }}}

        if not did_something:
            break

    logger.debug("%s: find_all_insn_inames: done" % kernel.name)

    for v in six.itervalues(insn_id_to_inames):
        assert isinstance(v, frozenset)

    return insn_id_to_inames
def infer_unknown_types(kernel, expect_completion=False):
    """Infer types on temporaries and arguments.

    :arg expect_completion: if *True*, raise :class:`LoopyError` when the
        type of some variable cannot be determined; otherwise leave such
        variables untyped.
    :returns: a copy of the (unexpanded) *kernel* with inferred dtypes
        applied to its temporaries and arguments.
    """

    logger.debug("%s: infer types" % kernel.name)

    from functools import partial
    debug = partial(_debug, kernel)

    import time
    start_time = time.time()

    # Inference runs on the substitution-expanded kernel, but the result is
    # applied to the original so its substitution rules survive.
    unexpanded_kernel = kernel
    if kernel.substitutions:
        from loopy.transform.subst import expand_subst
        kernel = expand_subst(kernel)

    new_temp_vars = kernel.temporary_variables.copy()
    new_arg_dict = kernel.arg_dict.copy()

    # {{{ find names_with_unknown_types

    # contains both arguments and temporaries
    names_for_type_inference = []

    import loopy as lp
    for tv in six.itervalues(kernel.temporary_variables):
        if tv.dtype is lp.auto:
            names_for_type_inference.append(tv.name)

    for arg in kernel.args:
        if arg.dtype is None:
            names_for_type_inference.append(arg.name)

    # }}}

    logger.debug("finding types for {count:d} names".format(
        count=len(names_for_type_inference)))

    writer_map = kernel.writer_map()

    # Edge written_var -> read_var whenever an instruction writing
    # written_var reads read_var (and read_var's type is also unknown):
    # written_var's type depends on read_var's.
    dep_graph = dict(
            (written_var, set(
                read_var
                for insn_id in writer_map.get(written_var, [])
                for read_var in
                kernel.id_to_insn[insn_id].read_dependency_names()
                if read_var in names_for_type_inference))
            for written_var in names_for_type_inference)

    from loopy.tools import compute_sccs

    # To speed up processing, we sort the variables by computing the SCCs of
    # the type dependency graph. Each SCC represents a set of variables whose
    # types mutually depend on themselves. The SCCs are returned and processed
    # in topological order.
    sccs = compute_sccs(dep_graph)

    item_lookup = _DictUnionView([new_temp_vars, new_arg_dict])
    type_inf_mapper = TypeInferenceMapper(kernel, item_lookup)

    from loopy.symbolic import SubstitutionRuleExpander
    subst_expander = SubstitutionRuleExpander(kernel.substitutions)

    # {{{ work on type inference queue

    from loopy.kernel.data import TemporaryVariable, KernelArgument

    for var_chain in sccs:
        changed_during_last_queue_run = False
        queue = var_chain[:]
        failed_names = set()

        # Keep revisiting the SCC until a full pass changes nothing.
        while queue or changed_during_last_queue_run:
            if not queue and changed_during_last_queue_run:
                changed_during_last_queue_run = False
                # Optimization: If there's a single variable in the SCC without
                # a self-referential dependency, then the type is known after a
                # single iteration (we don't need to look at the expressions
                # again).
                if len(var_chain) == 1:
                    single_var, = var_chain
                    if single_var not in dep_graph[single_var]:
                        break
                queue = var_chain[:]

            name = queue.pop(0)
            item = item_lookup[name]

            debug("inferring type for %s %s", type(item).__name__, item.name)

            result, symbols_with_unavailable_types = (_infer_var_type(
                    kernel, item.name, type_inf_mapper, subst_expander))

            failed = not result
            if not failed:
                new_dtype, = result
                debug(" success: %s", new_dtype)

                if new_dtype != item.dtype:
                    debug(" changed from: %s", item.dtype)
                    changed_during_last_queue_run = True

                    if isinstance(item, TemporaryVariable):
                        new_temp_vars[name] = item.copy(dtype=new_dtype)
                    elif isinstance(item, KernelArgument):
                        new_arg_dict[name] = item.copy(dtype=new_dtype)
                    else:
                        raise LoopyError(
                                "unexpected item type in type inference")
            else:
                debug(" failure")

            if failed:
                if item.name in failed_names:
                    # this item has failed before, give up.
                    advice = ""
                    if symbols_with_unavailable_types:
                        advice += (
                                " (need type of '%s'--check for missing arguments)"
                                % ", ".join(symbols_with_unavailable_types))

                    if expect_completion:
                        raise LoopyError("could not determine type of '%s'%s"
                                % (item.name, advice))
                    else:
                        # We're done here.
                        break

                # remember that this item failed
                failed_names.add(item.name)

                if set(queue) == failed_names:
                    # We did what we could...
                    print(queue, failed_names, item.name)
                    assert not expect_completion
                    break

                # can't infer type yet, put back into queue
                queue.append(name)
            else:
                # we've made progress, reset failure markers
                failed_names = set()

    # }}}

    end_time = time.time()
    logger.debug("type inference took {dur:.2f} seconds".format(
        dur=end_time - start_time))

    return unexpanded_kernel.copy(
            temporary_variables=new_temp_vars,
            args=[new_arg_dict[arg.name] for arg in kernel.args],
            )
def infer_unknown_types(kernel, expect_completion=False):
    """Infer types on temporaries and arguments.

    :arg expect_completion: if *True*, raise :class:`LoopyError` when the
        type of some variable cannot be determined.
    :returns: a copy of the (unexpanded) *kernel* with inferred dtypes
        applied to its temporaries and arguments.
    """

    logger.debug("%s: infer types" % kernel.name)

    def debug(s):
        logger.debug("%s: %s" % (kernel.name, s))

    # Inference runs on the substitution-expanded kernel, but the result is
    # applied to the original so its substitution rules survive.
    unexpanded_kernel = kernel
    if kernel.substitutions:
        from loopy.transform.subst import expand_subst
        kernel = expand_subst(kernel)

    new_temp_vars = kernel.temporary_variables.copy()
    new_arg_dict = kernel.arg_dict.copy()

    # {{{ fill queue

    # queue contains temporary variables
    queue = []

    import loopy as lp
    for tv in six.itervalues(kernel.temporary_variables):
        if tv.dtype is lp.auto:
            queue.append(tv)

    for arg in kernel.args:
        if arg.dtype is None:
            queue.append(arg)

    # }}}

    from loopy.expression import TypeInferenceMapper
    type_inf_mapper = TypeInferenceMapper(kernel,
            _DictUnionView([
                new_temp_vars,
                new_arg_dict
                ]))

    from loopy.symbolic import SubstitutionRuleExpander
    subst_expander = SubstitutionRuleExpander(kernel.substitutions)

    # {{{ work on type inference queue

    from loopy.kernel.data import TemporaryVariable, KernelArgument

    # Items whose types could not be determined on the current sweep; a full
    # round of failures means no further progress is possible.
    failed_names = set()
    while queue:
        item = queue.pop(0)

        debug("inferring type for %s %s" % (type(item).__name__, item.name))

        result, symbols_with_unavailable_types = \
                _infer_var_type(kernel, item.name, type_inf_mapper,
                        subst_expander)

        failed = result is None
        if not failed:
            debug(" success: %s" % result)
            if isinstance(item, TemporaryVariable):
                new_temp_vars[item.name] = item.copy(dtype=result)
            elif isinstance(item, KernelArgument):
                new_arg_dict[item.name] = item.copy(dtype=result)
            else:
                raise LoopyError("unexpected item type in type inference")
        else:
            debug(" failure")

        if failed:
            if item.name in failed_names:
                # this item has failed before, give up.
                advice = ""
                if symbols_with_unavailable_types:
                    advice += (
                            " (need type of '%s'--check for missing arguments)"
                            % ", ".join(symbols_with_unavailable_types))

                if expect_completion:
                    raise LoopyError(
                            "could not determine type of '%s'%s"
                            % (item.name, advice))

                else:
                    # We're done here.
                    break

            # remember that this item failed
            failed_names.add(item.name)

            queue_names = set(qi.name for qi in queue)

            if queue_names == failed_names:
                # We did what we could...
                print(queue_names, failed_names, item.name)
                assert not expect_completion
                break

            # can't infer type yet, put back into queue
            queue.append(item)
        else:
            # we've made progress, reset failure markers
            failed_names = set()

    # }}}

    return unexpanded_kernel.copy(
            temporary_variables=new_temp_vars,
            args=[new_arg_dict[arg.name] for arg in kernel.args],
            )
def find_all_insn_inames(kernel):
    """Return a mapping from instruction id to the :class:`frozenset` of
    iname names within which that instruction nests.

    Starts from each instruction's read/write dependencies (intersected with
    the kernel's inames) plus its explicit ``forced_iname_deps``, then runs a
    fixed-point iteration propagating inames from writers to readers and
    through domain parameters. Instructions with
    ``forced_iname_deps_is_final`` set are never modified.
    """
    logger.debug("%s: find_all_insn_inames: start" % kernel.name)

    writer_map = kernel.writer_map()

    # insn id -> current estimate of the instruction's nesting inames
    insn_id_to_inames = {}
    # insn id -> inames appearing on the instruction's write side
    insn_assignee_inames = {}

    all_read_deps = {}
    all_write_deps = {}

    # Substitution rules can hide variable accesses; expand them so the
    # dependency sets gathered below are complete.
    from loopy.transform.subst import expand_subst
    kernel = expand_subst(kernel)

    for insn in kernel.instructions:
        all_read_deps[insn.id] = read_deps = insn.read_dependency_names()
        all_write_deps[insn.id] = write_deps = insn.write_dependency_names()
        deps = read_deps | write_deps

        if insn.forced_iname_deps_is_final:
            # User pinned the nesting explicitly--take it as-is.
            iname_deps = insn.forced_iname_deps
        else:
            iname_deps = deps & kernel.all_inames() | insn.forced_iname_deps

        assert isinstance(read_deps, frozenset), type(insn)
        assert isinstance(write_deps, frozenset), type(insn)
        assert isinstance(iname_deps, frozenset), type(insn)

        logger.debug(
                "%s: find_all_insn_inames: %s (init): %s - "
                "read deps: %s - write deps: %s" % (
                    kernel.name, insn.id, ", ".join(sorted(iname_deps)),
                    ", ".join(sorted(read_deps)),
                    ", ".join(sorted(write_deps)),
                    ))

        insn_id_to_inames[insn.id] = iname_deps
        insn_assignee_inames[insn.id] = write_deps & kernel.all_inames()

    written_vars = kernel.get_written_variables()

    # fixed point iteration until all iname dep sets have converged

    # Why is fixed point iteration necessary here? Consider the following
    # scenario:
    #
    # z = expr(iname)
    # y = expr(z)
    # x = expr(y)
    #
    # x clearly has a dependency on iname, but this is not found until that
    # dependency has propagated all the way up. Doing this recursively is
    # not guaranteed to terminate because of circular dependencies.
    while True:
        did_something = False
        for insn in kernel.instructions:

            if insn.forced_iname_deps_is_final:
                continue

            # {{{ depdency-based propagation

            # For all variables that insn depends on, find the intersection
            # of iname deps of all writers, and add those to insn's
            # dependencies.

            for tv_name in all_read_deps[insn.id] & written_vars:
                # Intersection over all writers of this variable; only
                # inames common to every writer are safely implied.
                implicit_inames = None

                for writer_id in writer_map[tv_name]:
                    # A writer's assignee inames are not propagated: those
                    # index the value, they do not parameterize it.
                    writer_implicit_inames = (
                            insn_id_to_inames[writer_id]
                            - insn_assignee_inames[writer_id])
                    if implicit_inames is None:
                        implicit_inames = writer_implicit_inames
                    else:
                        implicit_inames = (implicit_inames
                                & writer_implicit_inames)

                inames_old = insn_id_to_inames[insn.id]
                inames_new = (inames_old | implicit_inames) \
                        - insn.reduction_inames()

                insn_id_to_inames[insn.id] = inames_new

                if inames_new != inames_old:
                    did_something = True
                    logger.debug(
                            "%s: find_all_insn_inames: %s -> %s (dep-based)"
                            % (kernel.name, insn.id,
                                ", ".join(sorted(inames_new))))

            # }}}

            # {{{ domain-based propagation

            inames_old = insn_id_to_inames[insn.id]
            inames_new = set(insn_id_to_inames[insn.id])

            for iname in inames_old:
                home_domain = kernel.domains[kernel.get_home_domain_index(
                    iname)]

                for par in home_domain.get_var_names(dim_type.param):
                    # Add all inames occurring in parameters of domains that my
                    # current inames refer to.
                    if par in kernel.all_inames():
                        inames_new.add(intern(par))

                    # If something writes the bounds of a loop in which I'm
                    # sitting, I had better be in the inames that the writer is
                    # in.
                    if par in kernel.temporary_variables:
                        for writer_id in writer_map.get(par, []):
                            inames_new.update(insn_id_to_inames[writer_id])

            if inames_new != inames_old:
                did_something = True
                insn_id_to_inames[insn.id] = frozenset(inames_new)
                logger.debug(
                        "%s: find_all_insn_inames: %s -> %s (domain-based)"
                        % (kernel.name, insn.id,
                            ", ".join(sorted(inames_new))))

            # }}}

        if not did_something:
            break

    logger.debug("%s: find_all_insn_inames: done" % kernel.name)

    for v in six.itervalues(insn_id_to_inames):
        assert isinstance(v, frozenset)

    return insn_id_to_inames
def preprocess_kernel(kernel, device=None):
    """Run the full preprocessing pipeline on *kernel* and return the
    preprocessed copy (state ``PREPROCESSED``).

    The individual passes below are order-dependent; see the inline
    "Ordering restriction" comments before rearranging them.

    :arg device: deprecated, ignored except for a warning.
    :raises LoopyError: if *kernel* is not in the ``INITIAL`` state or has
        automatically-assigned local axes.
    """
    if device is not None:
        from warnings import warn
        warn("passing 'device' to preprocess_kernel() is deprecated",
                DeprecationWarning, stacklevel=2)

    from loopy.kernel import kernel_state
    if kernel.state != kernel_state.INITIAL:
        raise LoopyError("cannot re-preprocess an already preprocessed "
                "kernel")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED
    if CACHING_ENABLED:
        input_kernel = kernel

        try:
            result = preprocess_cache[kernel]
            logger.debug("%s: preprocess cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    logger.info("%s: preprocess start" % kernel.name)

    from loopy.check import check_identifiers_in_subst_rules
    check_identifiers_in_subst_rules(kernel)

    # {{{ check that there are no l.auto-tagged inames

    from loopy.kernel.data import AutoLocalIndexTagBase
    for iname, tag in six.iteritems(kernel.iname_to_tag):
        if (isinstance(tag, AutoLocalIndexTagBase)
                and iname in kernel.all_inames()):
            raise LoopyError("kernel with automatically-assigned "
                    "local axes passed to preprocessing")

    # }}}

    from loopy.transform.subst import expand_subst
    kernel = expand_subst(kernel)

    # Ordering restriction:
    # Type inference and reduction iname uniqueness don't handle substitutions.
    # Get them out of the way.

    kernel = infer_unknown_types(kernel, expect_completion=False)

    check_reduction_iname_uniqueness(kernel)

    kernel = add_default_dependencies(kernel)

    # Ordering restrictions:
    #
    # - realize_reduction must happen after type inference because it needs
    #   to be able to determine the types of the reduced expressions.
    #
    # - realize_reduction must happen after default dependencies are added
    #   because it manipulates the depends_on field, which could prevent
    #   defaults from being applied.

    kernel = realize_reduction(kernel, unknown_types_ok=False)

    # Ordering restriction:
    # add_axes_to_temporaries_for_ilp because reduction accumulators
    # need to be duplicated by this.

    from loopy.transform.ilp import add_axes_to_temporaries_for_ilp_and_vec
    kernel = add_axes_to_temporaries_for_ilp_and_vec(kernel)

    kernel = find_temporary_scope(kernel)

    # boostability should be removed in 2017.x.
    kernel = find_idempotence(kernel)
    kernel = limit_boostability(kernel)

    # Let the target (e.g. OpenCL/CUDA/ISPC) apply its own passes last.
    kernel = kernel.target.preprocess(kernel)

    logger.info("%s: preprocess done" % kernel.name)

    kernel = kernel.copy(
            state=kernel_state.PREPROCESSED)

    # {{{ prepare for caching

    # PicklableDtype instances for example need to know the target they're working
    # towards in order to pickle and unpickle them. This is the first pass that
    # uses caching, so we need to be ready to pickle. This means propagating
    # this target information.

    if CACHING_ENABLED:
        input_kernel = prepare_for_caching(input_kernel)

    kernel = prepare_for_caching(kernel)

    # }}}

    if CACHING_ENABLED:
        preprocess_cache[input_kernel] = kernel

    return kernel
def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx):
    """Infer types on temporaries and arguments.

    :arg clbl_inf_ctx: callables-inference context; updated as calls are
        specialized and returned alongside the kernel.
    :returns: a tuple ``(type_specialized_kernel, clbl_inf_ctx)``.
    """

    logger.debug("%s: infer types" % kernel.name)

    from functools import partial
    debug = partial(_debug, kernel)

    import time
    start_time = time.time()

    # Inference runs on the substitution-expanded kernel, but the result is
    # applied to the original so its substitution rules survive.
    unexpanded_kernel = kernel
    if kernel.substitutions:
        from loopy.transform.subst import expand_subst
        kernel = expand_subst(kernel)

    new_temp_vars = kernel.temporary_variables.copy()
    new_arg_dict = kernel.arg_dict.copy()

    # {{{ find names_with_unknown_types

    # contains both arguments and temporaries
    names_for_type_inference = []

    import loopy as lp
    for tv in kernel.temporary_variables.values():
        assert tv.dtype is not lp.auto
        if tv.dtype is None:
            names_for_type_inference.append(tv.name)

    for arg in kernel.args:
        assert arg.dtype is not lp.auto
        if arg.dtype is None:
            names_for_type_inference.append(arg.name)

    # }}}

    logger.debug("finding types for {count:d} names".format(
        count=len(names_for_type_inference)))

    writer_map = kernel.writer_map()

    # Edge written_var -> read_var whenever an instruction writing
    # written_var reads read_var (and read_var's type is also unknown):
    # written_var's type depends on read_var's.
    dep_graph = {
            written_var: {
                read_var
                for insn_id in writer_map.get(written_var, [])
                for read_var in
                kernel.id_to_insn[insn_id].read_dependency_names()
                if read_var in names_for_type_inference}
            for written_var in names_for_type_inference}

    from pytools.graph import compute_sccs

    # To speed up processing, we sort the variables by computing the SCCs of
    # the type dependency graph. Each SCC represents a set of variables whose
    # types mutually depend on themselves. The SCCs are returned and processed
    # in topological order.
    sccs = compute_sccs(dep_graph)

    item_lookup = _DictUnionView([new_temp_vars, new_arg_dict])
    type_inf_mapper = TypeInferenceMapper(kernel, clbl_inf_ctx, item_lookup)

    from loopy.symbolic import SubstitutionRuleExpander
    subst_expander = SubstitutionRuleExpander(kernel.substitutions)

    # {{{ work on type inference queue

    from loopy.kernel.data import TemporaryVariable, KernelArgument

    old_calls_to_new_calls = {}

    for var_chain in sccs:
        changed_during_last_queue_run = False
        var_queue = var_chain[:]
        failed_names = set()

        # Keep revisiting the SCC until a full pass changes nothing.
        while var_queue or changed_during_last_queue_run:
            if not var_queue and changed_during_last_queue_run:
                changed_during_last_queue_run = False
                # Optimization: If there's a single variable in the SCC without
                # a self-referential dependency, then the type is known after a
                # single iteration (we don't need to look at the expressions
                # again).
                if len(var_chain) == 1:
                    single_var, = var_chain
                    if single_var not in dep_graph[single_var]:
                        break
                var_queue = var_chain[:]

            name = var_queue.pop(0)
            item = item_lookup[name]

            debug("inferring type for %s %s", type(item).__name__, item.name)
            try:
                (result, symbols_with_unknown_types,
                        new_old_calls_to_new_calls, clbl_inf_ctx) = (
                        _infer_var_type(
                            kernel, item.name, type_inf_mapper,
                            subst_expander))
            except DependencyTypeInferenceFailure:
                # Treat an unresolvable dependency as an ordinary failure so
                # the retry logic below gets a chance to run.
                result = ()
                symbols_with_unknown_types = ()

            type_inf_mapper = type_inf_mapper.copy(clbl_inf_ctx=clbl_inf_ctx)

            if result:
                new_dtype, = result
                debug(" success: %s", new_dtype)

                if new_dtype != item.dtype:
                    debug(" changed from: %s", item.dtype)
                    changed_during_last_queue_run = True

                    if isinstance(item, TemporaryVariable):
                        new_temp_vars[name] = item.copy(dtype=new_dtype)
                    elif isinstance(item, KernelArgument):
                        new_arg_dict[name] = item.copy(dtype=new_dtype)
                    else:
                        raise LoopyError(
                                "unexpected item type in type inference")

                old_calls_to_new_calls.update(new_old_calls_to_new_calls)

                # we've made progress, reset failure markers
                failed_names = set()

            else:
                debug(" failure")

                if item.name in failed_names:
                    # this item has failed before, give up.
                    advice = ""
                    if symbols_with_unknown_types:
                        advice += (
                                " (need type of '%s'--check for missing arguments)"
                                % ", ".join(symbols_with_unknown_types))

                    debug("could not determine type of '%s'%s"
                            % (item.name, advice))
                    # We're done here
                    break

                # remember that this item failed
                failed_names.add(item.name)

                if set(var_queue) == failed_names:
                    # We did what we could...
                    print(var_queue, failed_names, item.name)
                    break

                # can't infer type yet, put back into var_queue
                var_queue.append(name)

    # }}}

    # {{{ check if insn missed during type inference

    def _instruction_missed_during_inference(insn):
        # Returns False if any assignee of *insn* is still untyped (in which
        # case re-running the mapper over it would be pointless/unsafe).
        for assignee in insn.assignees:
            if isinstance(assignee, Lookup):
                assignee = assignee.aggregate

            if isinstance(assignee, Variable):
                if assignee.name in kernel.arg_dict:
                    if kernel.arg_dict[assignee.name].dtype is None:
                        return False
                else:
                    assert assignee.name in kernel.temporary_variables
                    if kernel.temporary_variables[assignee.name].dtype is None:
                        return False

            elif isinstance(assignee, (Subscript, LinearSubscript)):
                if assignee.aggregate.name in kernel.arg_dict:
                    if kernel.arg_dict[assignee.aggregate.name].dtype is None:
                        return False
                else:
                    assert assignee.aggregate.name in kernel.temporary_variables
                    if kernel.temporary_variables[
                            assignee.aggregate.name].dtype is None:
                        return False
            else:
                assert isinstance(assignee, SubArrayRef)
                if assignee.subscript.aggregate.name in kernel.arg_dict:
                    if kernel.arg_dict[
                            assignee.subscript.aggregate.name].dtype is None:
                        return False
                else:
                    assert assignee.subscript.aggregate.name in (
                            kernel.temporary_variables)
                    # BUGFIX: previously tested the TemporaryVariable object
                    # itself "is None" (always false, since the name was just
                    # asserted present); check its dtype like the branches
                    # above.
                    if kernel.temporary_variables[
                            assignee.subscript.aggregate.name].dtype is None:
                        return False

        return True

    # }}}

    for insn in kernel.instructions:
        if isinstance(insn, lp.MultiAssignmentBase):
            # just a dummy run over the expression, to pass over all the
            # functions
            if _instruction_missed_during_inference(insn):
                type_inf_mapper(insn.expression,
                        return_tuple=len(insn.assignees) != 1,
                        return_dtype_set=True)
        elif isinstance(insn, (_DataObliviousInstruction, lp.CInstruction)):
            pass
        else:
            raise NotImplementedError("Unknown instructions type %s." % (
                type(insn).__name__))

    clbl_inf_ctx = type_inf_mapper.clbl_inf_ctx
    old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls)

    end_time = time.time()
    logger.debug("type inference took {dur:.2f} seconds".format(
        dur=end_time - start_time))

    pre_type_specialized_knl = unexpanded_kernel.copy(
            temporary_variables=new_temp_vars,
            args=[new_arg_dict[arg.name] for arg in kernel.args],
            )

    type_specialized_kernel = change_names_of_pymbolic_calls(
            pre_type_specialized_knl, old_calls_to_new_calls)

    return type_specialized_kernel, clbl_inf_ctx
def add_prefetch(kernel, var_name, sweep_inames=(), dim_arg_names=None,
        default_tag="l.auto", rule_name=None,
        temporary_name=None, temporary_scope=None, temporary_is_local=None,
        footprint_subscripts=None,
        fetch_bounding_box=False,
        fetch_outer_inames=None):
    """Prefetch all accesses to the variable *var_name*, with all accesses
    being swept through *sweep_inames*.

    :arg dim_arg_names: List of names representing each fetch axis.
    :arg rule_name: base name of the generated temporary variable.
    :arg footprint_subscripts: A list of tuples indicating the index (i.e.
        subscript) tuples used to generate the footprint.

        If only one such set of indices is desired, this may also be specified
        directly by putting an index expression into *var_name*. Substitutions
        such as those occurring in dimension splits are recorded and also
        applied to these indices.

    :arg fetch_outer_inames: The inames within which the fetch
        instruction is nested. If *None*, make an educated guess.

    This function combines :func:`extract_subst` and :func:`precompute`.
    """
    # NOTE: the sweep_inames default was changed from the mutable "[]" to the
    # immutable "()" -- it is normalized to a fresh list below, so behavior
    # is unchanged, but the mutable-default hazard is gone.

    # {{{ fish indexing out of var_name and into footprint_subscripts

    from loopy.symbolic import parse
    parsed_var_name = parse(var_name)

    from pymbolic.primitives import Variable, Subscript
    if isinstance(parsed_var_name, Variable):
        # nothing to see
        pass
    elif isinstance(parsed_var_name, Subscript):
        if footprint_subscripts is not None:
            raise TypeError(
                    "if footprint_subscripts is specified, then var_name "
                    "may not contain a subscript")

        assert isinstance(parsed_var_name.aggregate, Variable)
        footprint_subscripts = [parsed_var_name.index]
        parsed_var_name = parsed_var_name.aggregate
    else:
        raise ValueError(
                "var_name must either be a variable name or a subscript")

    # }}}

    # {{{ fish out tag

    from loopy.symbolic import TaggedVariable
    if isinstance(parsed_var_name, TaggedVariable):
        var_name = parsed_var_name.name
        tag = parsed_var_name.tag
    else:
        var_name = parsed_var_name.name
        tag = None

    # }}}

    # c_name: identifier-safe base used for generated names below.
    c_name = var_name
    if tag is not None:
        c_name = c_name + "_" + tag

    var_name_gen = kernel.get_var_name_generator()

    if rule_name is None:
        rule_name = var_name_gen("%s_fetch_rule" % c_name)
    if temporary_name is None:
        temporary_name = var_name_gen("%s_fetch" % c_name)

    arg = kernel.arg_dict[var_name]

    # {{{ make parameter names and unification template

    parameters = []
    for i in range(arg.num_user_axes()):
        # Preference order: caller-supplied name, the argument's own axis
        # name, then a numbered fallback.
        based_on = "%s_dim_%d" % (c_name, i)
        if arg.dim_names is not None:
            based_on = "%s_dim_%s" % (c_name, arg.dim_names[i])
        if dim_arg_names is not None and i < len(dim_arg_names):
            based_on = dim_arg_names[i]

        par_name = var_name_gen(based_on=based_on)
        parameters.append(par_name)

    from pymbolic import var
    uni_template = parsed_var_name
    if len(parameters) > 1:
        uni_template = uni_template.index(
                tuple(var(par_name) for par_name in parameters))
    elif len(parameters) == 1:
        uni_template = uni_template.index(var(parameters[0]))

    # }}}

    from loopy.transform.subst import extract_subst
    kernel = extract_subst(kernel, rule_name, uni_template, parameters)

    if isinstance(sweep_inames, str):
        sweep_inames = [s.strip() for s in sweep_inames.split(",")]
    else:
        # copy, standardize to list
        sweep_inames = list(sweep_inames)

    kernel, subst_use, sweep_inames, inames_to_be_removed = \
            _process_footprint_subscripts(
                    kernel, rule_name, sweep_inames,
                    footprint_subscripts, arg)

    from loopy.transform.precompute import precompute
    new_kernel = precompute(kernel, subst_use, sweep_inames,
            precompute_inames=dim_arg_names,
            default_tag=default_tag, dtype=arg.dtype,
            fetch_bounding_box=fetch_bounding_box,
            temporary_name=temporary_name,
            temporary_scope=temporary_scope,
            temporary_is_local=temporary_is_local,
            precompute_outer_inames=fetch_outer_inames)

    # {{{ remove inames that were temporarily added by slice sweeps

    new_domains = new_kernel.domains[:]

    for iname in inames_to_be_removed:
        home_domain_index = kernel.get_home_domain_index(iname)
        domain = new_domains[home_domain_index]

        dt, idx = domain.get_var_dict()[iname]
        assert dt == dim_type.set

        new_domains[home_domain_index] = domain.project_out(dt, idx, 1)

    new_kernel = new_kernel.copy(domains=new_domains)

    # }}}

    # If the rule survived past precompute() (i.e. some accesses fell outside
    # the footprint), get rid of it before moving on.
    if rule_name in new_kernel.substitutions:
        from loopy.transform.subst import expand_subst
        return expand_subst(new_kernel, "... > id:" + rule_name)
    else:
        return new_kernel
def find_all_insn_inames(kernel):
    """Return a mapping from instruction id to the :class:`frozenset` of
    iname names within which that instruction nests.

    Starts from each instruction's read/write dependencies (intersected with
    the kernel's inames) plus its explicit ``forced_iname_deps``, then runs a
    fixed-point iteration propagating inames through variable use and through
    domain parameters. Instructions with ``forced_iname_deps_is_final`` set
    are never modified.
    """
    logger.debug("%s: find_all_insn_inames: start" % kernel.name)

    writer_map = kernel.writer_map()

    # insn id -> current estimate of the instruction's nesting inames
    insn_id_to_inames = {}
    # insn id -> inames appearing on the instruction's write side
    insn_assignee_inames = {}

    all_read_deps = {}
    all_write_deps = {}

    # Substitution rules can hide variable accesses; expand them so the
    # dependency sets gathered below are complete.
    from loopy.transform.subst import expand_subst
    kernel = expand_subst(kernel)

    for insn in kernel.instructions:
        all_read_deps[insn.id] = read_deps = insn.read_dependency_names()
        all_write_deps[insn.id] = write_deps = insn.write_dependency_names()
        deps = read_deps | write_deps

        if insn.forced_iname_deps_is_final:
            # User pinned the nesting explicitly--take it as-is.
            iname_deps = insn.forced_iname_deps
        else:
            iname_deps = (
                    deps & kernel.all_inames()
                    | insn.forced_iname_deps)

        assert isinstance(read_deps, frozenset), type(insn)
        assert isinstance(write_deps, frozenset), type(insn)
        assert isinstance(iname_deps, frozenset), type(insn)

        logger.debug("%s: find_all_insn_inames: %s (init): %s - "
                "read deps: %s - write deps: %s" % (
                    kernel.name, insn.id, ", ".join(sorted(iname_deps)),
                    ", ".join(sorted(read_deps)),
                    ", ".join(sorted(write_deps)),
                    ))

        insn_id_to_inames[insn.id] = iname_deps
        insn_assignee_inames[insn.id] = write_deps & kernel.all_inames()

    # fixed point iteration until all iname dep sets have converged

    # Why is fixed point iteration necessary here? Consider the following
    # scenario:
    #
    # z = expr(iname)
    # y = expr(z)
    # x = expr(y)
    #
    # x clearly has a dependency on iname, but this is not found until that
    # dependency has propagated all the way up. Doing this recursively is
    # not guaranteed to terminate because of circular dependencies.
    while True:
        did_something = False
        for insn in kernel.instructions:

            if insn.forced_iname_deps_is_final:
                continue

            # {{{ depdency-based propagation

            inames_old = insn_id_to_inames[insn.id]
            inames_new = inames_old | guess_iname_deps_based_on_var_use(
                    kernel, insn, insn_id_to_inames)

            insn_id_to_inames[insn.id] = inames_new

            if inames_new != inames_old:
                did_something = True

                warn_with_kernel(kernel, "inferred_iname",
                        "The iname(s) '%s' on instruction '%s' in kernel '%s' "
                        "was/were automatically added. "
                        "This is deprecated. Please add the iname "
                        "to the instruction "
                        "explicitly, e.g. by adding 'for' loops"
                        % (", ".join(inames_new-inames_old), insn.id,
                            kernel.name))

            # }}}

            # {{{ domain-based propagation

            inames_old = insn_id_to_inames[insn.id]
            inames_new = set(insn_id_to_inames[insn.id])

            for iname in inames_old:
                home_domain = kernel.domains[kernel.get_home_domain_index(
                    iname)]

                for par in home_domain.get_var_names(dim_type.param):
                    # Add all inames occurring in parameters of domains that my
                    # current inames refer to.
                    if par in kernel.all_inames():
                        inames_new.add(intern(par))

                    # If something writes the bounds of a loop in which I'm
                    # sitting, I had better be in the inames that the writer is
                    # in.
                    if par in kernel.temporary_variables:
                        for writer_id in writer_map.get(par, []):
                            inames_new.update(insn_id_to_inames[writer_id])

            if inames_new != inames_old:
                did_something = True
                insn_id_to_inames[insn.id] = frozenset(inames_new)

                warn_with_kernel(kernel, "inferred_iname",
                        "The iname(s) '%s' on instruction '%s' was "
                        "automatically added. "
                        "This is deprecated. Please add the iname "
                        "to the instruction "
                        "explicitly, e.g. by adding '{inames=...}"
                        % (", ".join(inames_new-inames_old), insn.id))

            # }}}

        if not did_something:
            break

    logger.debug("%s: find_all_insn_inames: done" % kernel.name)

    for v in six.itervalues(insn_id_to_inames):
        assert isinstance(v, frozenset)

    return insn_id_to_inames
def add_prefetch(kernel, var_name, sweep_inames=None, dim_arg_names=None,
        default_tag="l.auto", rule_name=None,
        temporary_name=None,
        temporary_is_local=None,
        footprint_subscripts=None,
        fetch_bounding_box=False):
    """Prefetch all accesses to the variable *var_name*, with all accesses
    being swept through *sweep_inames*.

    :arg sweep_inames: Inames to sweep the fetch over; either a list of
        iname names or a comma-separated string. Defaults to no sweep inames.
    :arg dim_arg_names: List of names representing each fetch axis.
    :arg rule_name: base name of the generated temporary variable.
    :arg footprint_subscripts: A list of tuples indicating the index (i.e.
        subscript) tuples used to generate the footprint.

        If only one such set of indices is desired, this may also be specified
        directly by putting an index expression into *var_name*. Substitutions
        such as those occurring in dimension splits are recorded and also
        applied to these indices.

    This function combines :func:`extract_subst` and :func:`precompute`.
    """

    # Previously a mutable default argument ([]); use None as the default
    # and create a fresh list per call to avoid the shared-default pitfall.
    if sweep_inames is None:
        sweep_inames = []

    # {{{ fish indexing out of var_name and into footprint_subscripts

    from loopy.symbolic import parse
    parsed_var_name = parse(var_name)

    from pymbolic.primitives import Variable, Subscript
    if isinstance(parsed_var_name, Variable):
        # nothing to see
        pass
    elif isinstance(parsed_var_name, Subscript):
        # A subscripted var_name is shorthand for a single footprint tuple,
        # so the two ways of specifying a footprint are mutually exclusive.
        if footprint_subscripts is not None:
            raise TypeError("if footprint_subscripts is specified, then var_name "
                    "may not contain a subscript")

        assert isinstance(parsed_var_name.aggregate, Variable)
        footprint_subscripts = [parsed_var_name.index]
        parsed_var_name = parsed_var_name.aggregate
    else:
        raise ValueError("var_name must either be a variable name or a subscript")

    # }}}

    # {{{ fish out tag

    from loopy.symbolic import TaggedVariable
    if isinstance(parsed_var_name, TaggedVariable):
        var_name = parsed_var_name.name
        tag = parsed_var_name.tag
    else:
        var_name = parsed_var_name.name
        tag = None

    # }}}

    # Base name used for generated identifiers; include the tag (if any)
    # so differently-tagged fetches of the same variable stay distinct.
    c_name = var_name
    if tag is not None:
        c_name = c_name + "_" + tag

    var_name_gen = kernel.get_var_name_generator()

    if rule_name is None:
        rule_name = var_name_gen("%s_fetch_rule" % c_name)
    if temporary_name is None:
        temporary_name = var_name_gen("%s_fetch" % c_name)

    arg = kernel.arg_dict[var_name]

    # {{{ make parameter names and unification template

    # One parameter per user axis of the argument; prefer caller-supplied
    # names, then the argument's own dim_names, then a numeric fallback.
    parameters = []
    for i in range(arg.num_user_axes()):
        based_on = "%s_dim_%d" % (c_name, i)
        if arg.dim_names is not None:
            based_on = "%s_dim_%s" % (c_name, arg.dim_names[i])
        if dim_arg_names is not None and i < len(dim_arg_names):
            based_on = dim_arg_names[i]

        par_name = var_name_gen(based_on=based_on)
        parameters.append(par_name)

    from pymbolic import var
    # Build the access template that extract_subst will unify against:
    # var[(p0, p1, ...)] for multi-axis, var[p0] for one axis, bare var
    # for zero axes.
    uni_template = parsed_var_name
    if len(parameters) > 1:
        uni_template = uni_template.index(
                tuple(var(par_name) for par_name in parameters))
    elif len(parameters) == 1:
        uni_template = uni_template.index(var(parameters[0]))

    # }}}

    from loopy.transform.subst import extract_subst
    kernel = extract_subst(kernel, rule_name, uni_template, parameters)

    if isinstance(sweep_inames, str):
        sweep_inames = [s.strip() for s in sweep_inames.split(",")]
    else:
        # copy, standardize to list
        sweep_inames = list(sweep_inames)

    kernel, subst_use, sweep_inames, inames_to_be_removed = \
            _process_footprint_subscripts(
                    kernel, rule_name, sweep_inames,
                    footprint_subscripts, arg)

    from loopy.transform.precompute import precompute
    new_kernel = precompute(kernel, subst_use, sweep_inames,
            precompute_inames=dim_arg_names,
            default_tag=default_tag, dtype=arg.dtype,
            fetch_bounding_box=fetch_bounding_box,
            temporary_name=temporary_name,
            temporary_is_local=temporary_is_local)

    # {{{ remove inames that were temporarily added by slice sweeps

    new_domains = new_kernel.domains[:]

    for iname in inames_to_be_removed:
        # NOTE(review): home domain index is looked up on the pre-precompute
        # kernel but applied to new_kernel's domain list — relies on
        # precompute preserving domain order; confirm if this ever changes.
        home_domain_index = kernel.get_home_domain_index(iname)
        domain = new_domains[home_domain_index]

        dt, idx = domain.get_var_dict()[iname]
        assert dt == dim_type.set

        new_domains[home_domain_index] = domain.project_out(dt, idx, 1)

    new_kernel = new_kernel.copy(domains=new_domains)

    # }}}

    # If the rule survived past precompute() (i.e. some accesses fell outside
    # the footprint), get rid of it before moving on.
    if rule_name in new_kernel.substitutions:
        from loopy.transform.subst import expand_subst
        return expand_subst(new_kernel, "... > id:"+rule_name)
    else:
        return new_kernel