def __init__(self, codegen_state, type_inf_mapper=None):
    """Capture *codegen_state* and set up type inference.

    :arg codegen_state: code-generation state object; its ``.kernel``
        attribute is stored on ``self`` as well.
    :arg type_inf_mapper: optional pre-built type-inference mapper; when
        omitted, a fresh ``TypeInferenceMapper`` for the kernel is created.
    """
    self.codegen_state = codegen_state
    self.kernel = codegen_state.kernel
    self.type_inf_mapper = (
            type_inf_mapper
            if type_inf_mapper is not None
            else TypeInferenceMapper(self.kernel))
def __init__(self, codegen_state, fortran_abi=False, type_inf_mapper=None):
    """Capture *codegen_state* plus expression-mapping options.

    :arg codegen_state: code-generation state object; ``.kernel`` and
        ``.allow_complex`` are copied onto ``self``.
    :arg fortran_abi: flag stored verbatim on ``self``.
    :arg type_inf_mapper: optional pre-built type-inference mapper; a fresh
        ``TypeInferenceMapper`` for the kernel is created when omitted.
    """
    self.codegen_state = codegen_state
    self.kernel = codegen_state.kernel
    self.allow_complex = codegen_state.allow_complex
    self.fortran_abi = fortran_abi
    self.type_inf_mapper = (
            type_inf_mapper
            if type_inf_mapper is not None
            else TypeInferenceMapper(self.kernel))
def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
    """Rewrites reductions into their imperative form.

    With *insn_id_filter* specified, operate only on the instruction with an
    instruction id matching *insn_id_filter*.

    If *insn_id_filter* is given, only the outermost level of reductions will
    be expanded, inner reductions will be left alone (because they end up in
    a new instruction with a different ID, which doesn't match the filter).

    If *insn_id_filter* is not given, all reductions in all instructions will
    be realized.

    :arg kernel: the kernel whose reductions are to be expanded; a modified
        copy is returned, the input is not mutated.
    :arg unknown_types_ok: when *True*, a failed accumulator type inference
        falls back to ``lp.auto``; when *False*, it raises :class:`LoopyError`.
    """
    logger.debug("%s: realize reduction" % kernel.name)

    new_insns = []
    new_iname_tags = {}

    insn_id_gen = kernel.get_instruction_id_generator()

    var_name_gen = kernel.get_var_name_generator()
    new_temporary_variables = kernel.temporary_variables.copy()

    from loopy.type_inference import TypeInferenceMapper
    type_inf_mapper = TypeInferenceMapper(kernel)

    # NOTE: the map_reduction_* closures below read `insn`, `temp_kernel`,
    # `generated_insns`, `domains` and the `new_insn_add_*` sets from the
    # enclosing scope; these are (re)bound in the work-queue loop at the
    # bottom of this function, so statement order there matters.

    # {{{ sequential

    def map_reduction_seq(expr, rec, nresults, arg_dtype, reduction_dtypes):
        # Expand one reduction into: an init instruction that seeds private
        # accumulator temporaries with the neutral element, plus an update
        # instruction that folds `expr.expr` into them inside the reduction
        # inames. Returns the accumulator variable(s) to splice into the
        # original instruction's expression.
        outer_insn_inames = temp_kernel.insn_inames(insn)

        from pymbolic import var
        # One private accumulator temporary per reduction result.
        acc_var_names = [
                var_name_gen("acc_"+"_".join(expr.inames))
                for i in range(nresults)]
        acc_vars = tuple(var(n) for n in acc_var_names)

        from loopy.kernel.data import TemporaryVariable, temp_var_scope
        for name, dtype in zip(acc_var_names, reduction_dtypes):
            new_temporary_variables[name] = TemporaryVariable(
                    name=name, shape=(), dtype=dtype,
                    scope=temp_var_scope.PRIVATE)

        init_id = insn_id_gen(
                "%s_%s_init" % (insn.id, "_".join(expr.inames)))

        # Initialization happens outside the reduction inames.
        init_insn = make_assignment(
                id=init_id,
                assignees=acc_vars,
                within_inames=outer_insn_inames - frozenset(expr.inames),
                within_inames_is_final=insn.within_inames_is_final,
                depends_on=frozenset(),
                expression=expr.operation.neutral_element(arg_dtype, expr.inames))

        generated_insns.append(init_insn)

        update_id = insn_id_gen(
                based_on="%s_%s_update" % (insn.id, "_".join(expr.inames)))

        # The update runs inside the reduction inames in addition to the
        # instruction's own inames.
        update_insn_iname_deps = temp_kernel.insn_inames(insn) | set(expr.inames)
        if insn.within_inames_is_final:
            update_insn_iname_deps = insn.within_inames | set(expr.inames)

        reduction_insn = make_assignment(
                id=update_id,
                assignees=acc_vars,
                expression=expr.operation(
                    arg_dtype,
                    acc_vars if len(acc_vars) > 1 else acc_vars[0],
                    expr.expr, expr.inames),
                depends_on=frozenset([init_insn.id]) | insn.depends_on,
                within_inames=update_insn_iname_deps,
                within_inames_is_final=insn.within_inames_is_final)

        generated_insns.append(reduction_insn)

        # The rewritten original instruction must wait for the final update.
        new_insn_add_depends_on.add(reduction_insn.id)

        if nresults == 1:
            assert len(acc_vars) == 1
            return acc_vars[0]
        else:
            return acc_vars

    # }}}

    # {{{ local-parallel

    def _get_int_iname_size(iname):
        # Resolve an iname's extent to a compile-time Python integer;
        # asserts the bound is constant.
        from loopy.isl_helpers import static_max_of_pw_aff
        from loopy.symbolic import pw_aff_to_expr
        size = pw_aff_to_expr(
                static_max_of_pw_aff(
                    kernel.get_iname_bounds(iname).size,
                    constants_only=True))
        assert isinstance(size, six.integer_types)
        return size

    def _make_slab_set(iname, size):
        # Build the basic set { [iname] : 0 <= iname < size }.
        v = isl.make_zero_and_vars([iname])
        bs, = (
                v[0].le_set(v[iname])
                &
                v[iname].lt_set(v[0] + size)).get_basic_sets()
        return bs

    def map_reduction_local(expr, rec, nresults, arg_dtype, reduction_dtypes):
        # Expand a reduction over a single local-parallel iname into a
        # local-memory tree reduction: init, neutral-init, transfer, then
        # log2(size) pairwise combining stages.
        red_iname, = expr.inames
        size = _get_int_iname_size(red_iname)

        outer_insn_inames = temp_kernel.insn_inames(insn)

        from loopy.kernel.data import LocalIndexTagBase
        # Surrounding local-tagged inames index into the local accumulator
        # array so that concurrent work items don't collide.
        outer_local_inames = tuple(
                oiname for oiname in outer_insn_inames
                if isinstance(
                    kernel.iname_to_tag.get(oiname),
                    LocalIndexTagBase))

        from pymbolic import var
        outer_local_iname_vars = tuple(
                var(oiname) for oiname in outer_local_inames)

        outer_local_iname_sizes = tuple(
                _get_int_iname_size(oiname)
                for oiname in outer_local_inames)

        # {{{ add separate iname to carry out the reduction

        # Doing this sheds any odd conditionals that may be active
        # on our red_iname.

        base_exec_iname = var_name_gen("red_"+red_iname)
        domains.append(_make_slab_set(base_exec_iname, size))
        new_iname_tags[base_exec_iname] = kernel.iname_to_tag[red_iname]

        # }}}

        neutral_var_names = [
                var_name_gen("neutral_"+red_iname)
                for i in range(nresults)]
        acc_var_names = [
                var_name_gen("acc_"+red_iname)
                for i in range(nresults)]
        acc_vars = tuple(var(n) for n in acc_var_names)

        from loopy.kernel.data import TemporaryVariable, temp_var_scope
        # Accumulators live in local memory, one slot per work item along
        # the reduction axis (plus the surrounding local axes).
        for name, dtype in zip(acc_var_names, reduction_dtypes):
            new_temporary_variables[name] = TemporaryVariable(
                    name=name,
                    shape=outer_local_iname_sizes + (size,),
                    dtype=dtype,
                    scope=temp_var_scope.LOCAL)
        for name, dtype in zip(neutral_var_names, reduction_dtypes):
            new_temporary_variables[name] = TemporaryVariable(
                    name=name,
                    shape=(),
                    dtype=dtype,
                    scope=temp_var_scope.PRIVATE)

        base_iname_deps = outer_insn_inames - frozenset(expr.inames)

        neutral = expr.operation.neutral_element(arg_dtype, expr.inames)

        init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname))
        init_insn = make_assignment(
                id=init_id,
                assignees=tuple(
                    acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
                    for acc_var in acc_vars),
                expression=neutral,
                within_inames=base_iname_deps | frozenset([base_exec_iname]),
                within_inames_is_final=insn.within_inames_is_final,
                depends_on=frozenset())
        generated_insns.append(init_insn)

        def _strip_if_scalar(c):
            # Single-result reductions pass a bare expression rather than a
            # 1-tuple to the reduction operation.
            if len(acc_vars) == 1:
                return c[0]
            else:
                return c

        init_neutral_id = insn_id_gen("%s_%s_init_neutral" % (insn.id, red_iname))
        init_neutral_insn = make_assignment(
                id=init_neutral_id,
                assignees=tuple(var(nvn) for nvn in neutral_var_names),
                expression=neutral,
                within_inames=base_iname_deps | frozenset([base_exec_iname]),
                within_inames_is_final=insn.within_inames_is_final,
                depends_on=frozenset())
        generated_insns.append(init_neutral_insn)

        # Transfer: each work item combines the neutral element with its own
        # contribution and stores the result in its local-memory slot.
        transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname))
        transfer_insn = make_assignment(
                id=transfer_id,
                assignees=tuple(
                    acc_var[outer_local_iname_vars + (var(red_iname),)]
                    for acc_var
                    in acc_vars),
                expression=expr.operation(
                    arg_dtype,
                    _strip_if_scalar(tuple(var(nvn) for nvn in neutral_var_names)),
                    expr.expr, expr.inames),
                within_inames=(
                    (outer_insn_inames - frozenset(expr.inames))
                    | frozenset([red_iname])),
                within_inames_is_final=insn.within_inames_is_final,
                depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on,
                no_sync_with=frozenset([(init_id, "any")]))
        generated_insns.append(transfer_insn)

        # Round the stage size up to the next power of two.
        cur_size = 1
        while cur_size < size:
            cur_size *= 2

        prev_id = transfer_id
        bound = size

        # Tree reduction: each stage halves the active range, combining
        # element i with element i + new_size.
        istage = 0
        while cur_size > 1:

            new_size = cur_size // 2
            assert new_size * 2 == cur_size

            stage_exec_iname = var_name_gen("red_%s_s%d" % (red_iname, istage))
            # bound-new_size (not new_size) slots remain active this stage;
            # this handles the non-power-of-two remainder in the first stage.
            domains.append(_make_slab_set(stage_exec_iname, bound-new_size))
            new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[red_iname]

            stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage))
            stage_insn = make_assignment(
                    id=stage_id,
                    assignees=tuple(
                        acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
                        for acc_var in acc_vars),
                    expression=expr.operation(
                        arg_dtype,
                        _strip_if_scalar(tuple(
                            acc_var[
                                outer_local_iname_vars + (var(stage_exec_iname),)]
                            for acc_var in acc_vars)),
                        _strip_if_scalar(tuple(
                            acc_var[
                                outer_local_iname_vars + (
                                    var(stage_exec_iname) + new_size,)]
                            for acc_var in acc_vars)),
                        expr.inames),
                    within_inames=(
                        base_iname_deps | frozenset([stage_exec_iname])),
                    within_inames_is_final=insn.within_inames_is_final,
                    depends_on=frozenset([prev_id]),
                    )

            generated_insns.append(stage_insn)
            prev_id = stage_id

            cur_size = new_size
            bound = cur_size
            istage += 1

        new_insn_add_depends_on.add(prev_id)
        new_insn_add_no_sync_with.add((prev_id, "any"))
        # NOTE(review): base_exec_iname is a freshly generated (non-empty)
        # name, so it is always truthy and stage_exec_iname is never
        # selected by this `or` -- confirm that this is the intent.
        new_insn_add_within_inames.add(base_exec_iname or stage_exec_iname)

        # The reduction result is read back from slot 0.
        if nresults == 1:
            assert len(acc_vars) == 1
            return acc_vars[0][outer_local_iname_vars + (0,)]
        else:
            return [acc_var[outer_local_iname_vars + (0,)]
                    for acc_var in acc_vars]

    # }}}

    # {{{ seq/par dispatch

    def map_reduction(expr, rec, nresults=1):
        # Only expand one level of reduction at a time, going from outermost to
        # innermost. Otherwise we get the (iname + insn) dependencies wrong.

        # Infer the accumulator dtype; optionally fall back to lp.auto.
        try:
            arg_dtype = type_inf_mapper(expr.expr)
        except DependencyTypeInferenceFailure:
            if unknown_types_ok:
                arg_dtype = lp.auto

                reduction_dtypes = (lp.auto,)*nresults

            else:
                raise LoopyError("failed to determine type of accumulator for "
                        "reduction '%s'" % expr)
        else:
            arg_dtype = arg_dtype.with_target(kernel.target)

            reduction_dtypes = expr.operation.result_dtypes(
                        kernel, arg_dtype, expr.inames)
            reduction_dtypes = tuple(
                    dt.with_target(kernel.target) for dt in reduction_dtypes)

        outer_insn_inames = temp_kernel.insn_inames(insn)
        bad_inames = frozenset(expr.inames) & outer_insn_inames
        if bad_inames:
            raise LoopyError("reduction used within loop(s) that it was "
                    "supposed to reduce over: " + ", ".join(bad_inames))

        # Classify the reduction inames as sequential vs. local-parallel.
        n_sequential = 0
        n_local_par = 0

        from loopy.kernel.data import (
                LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
                ParallelTag)
        for iname in expr.inames:
            iname_tag = kernel.iname_to_tag.get(iname)

            if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)):
                # These are nominally parallel, but we can live with
                # them as sequential.
                n_sequential += 1

            elif isinstance(iname_tag, LocalIndexTagBase):
                n_local_par += 1

            elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
                raise LoopyError("the only form of parallelism supported "
                        "by reductions is 'local'--found iname '%s' "
                        "tagged '%s'"
                        % (iname, type(iname_tag).__name__))

            else:
                n_sequential += 1

        if n_local_par and n_sequential:
            raise LoopyError("Reduction over '%s' contains both parallel and "
                    "sequential inames. It must be split "
                    "(using split_reduction_{in,out}ward) "
                    "before code generation."
                    % ", ".join(expr.inames))

        if n_local_par > 1:
            raise LoopyError("Reduction over '%s' contains more than"
                    "one parallel iname. It must be split "
                    "(using split_reduction_{in,out}ward) "
                    "before code generation."
                    % ", ".join(expr.inames))

        if n_sequential:
            assert n_local_par == 0
            return map_reduction_seq(expr, rec, nresults, arg_dtype,
                    reduction_dtypes)
        elif n_local_par:
            return map_reduction_local(expr, rec, nresults, arg_dtype,
                    reduction_dtypes)
        else:
            # No inames left to reduce over: the reduction degenerates to
            # its body expression.
            from loopy.diagnostic import warn_with_kernel
            warn_with_kernel(kernel, "empty_reduction",
                    "Empty reduction found (no inames to reduce over). "
                    "Eliminating.")

            return expr.expr

    # }}}

    from loopy.symbolic import ReductionCallbackMapper
    cb_mapper = ReductionCallbackMapper(map_reduction)

    insn_queue = kernel.instructions[:]
    insn_id_replacements = {}
    domains = kernel.domains[:]

    # temp_kernel tracks the partially-rewritten kernel so the closures
    # above can query up-to-date dependency information.
    temp_kernel = kernel

    import loopy as lp
    while insn_queue:
        # Per-instruction state shared with the map_reduction_* closures.
        new_insn_add_depends_on = set()
        new_insn_add_no_sync_with = set()
        new_insn_add_within_inames = set()

        generated_insns = []

        insn = insn_queue.pop(0)

        if insn_id_filter is not None and insn.id != insn_id_filter \
                or not isinstance(insn, lp.MultiAssignmentBase):
            new_insns.append(insn)
            continue

        nresults = len(insn.assignees)

        # Run reduction expansion.
        from loopy.symbolic import Reduction
        if isinstance(insn.expression, Reduction) and nresults > 1:
            new_expressions = cb_mapper(insn.expression, nresults=nresults)
        else:
            new_expressions = (cb_mapper(insn.expression),)

        if generated_insns:
            # An expansion happened, so insert the generated stuff plus
            # ourselves back into the queue.

            kwargs = insn.get_copy_kwargs(
                    depends_on=insn.depends_on
                    | frozenset(new_insn_add_depends_on),
                    no_sync_with=insn.no_sync_with
                    | frozenset(new_insn_add_no_sync_with),
                    within_inames=(
                        temp_kernel.insn_inames(insn)
                        | new_insn_add_within_inames))

            # These are replaced below; drop them so get_copy_kwargs output
            # can be passed straight to the replacement constructors.
            kwargs.pop("id")
            kwargs.pop("expression")
            kwargs.pop("assignee", None)
            kwargs.pop("assignees", None)
            kwargs.pop("temp_var_type", None)
            kwargs.pop("temp_var_types", None)

            if isinstance(insn.expression, Reduction) and nresults > 1:
                # Multi-result reduction: one single-assignee instruction
                # per result.
                replacement_insns = [
                        lp.Assignment(
                            id=insn_id_gen(insn.id),
                            assignee=assignee,
                            expression=new_expr,
                            **kwargs)
                        for assignee, new_expr in zip(
                            insn.assignees, new_expressions)]
            else:
                new_expr, = new_expressions
                replacement_insns = [
                        make_assignment(
                            id=insn_id_gen(insn.id),
                            assignees=insn.assignees,
                            expression=new_expr,
                            **kwargs)
                        ]

            insn_id_replacements[insn.id] = [
                    rinsn.id for rinsn in replacement_insns]

            # Requeue so that any inner (not-yet-expanded) reductions in the
            # replacements are processed in a later pass.
            insn_queue = generated_insns + replacement_insns + insn_queue

            # The reduction expander needs an up-to-date kernel
            # object to find dependencies. Keep temp_kernel up-to-date.

            temp_kernel = kernel.copy(
                    instructions=new_insns + insn_queue,
                    temporary_variables=new_temporary_variables,
                    domains=domains)
            temp_kernel = lp.replace_instruction_ids(
                    temp_kernel, insn_id_replacements)

        else:
            # nothing happened, we're done with insn
            assert not new_insn_add_depends_on
            new_insns.append(insn)

    kernel = kernel.copy(
            instructions=new_insns,
            temporary_variables=new_temporary_variables,
            domains=domains)

    kernel = lp.replace_instruction_ids(kernel, insn_id_replacements)

    kernel = lp.tag_inames(kernel, new_iname_tags)

    return kernel