def map_reduction_seq(expr, rec, nresults, arg_dtype, reduction_dtypes):
    """Expand a reduction over sequentially-executed inames.

    Emits (into ``generated_insns``) an *init* instruction that sets fresh
    private scalar accumulators to the operation's neutral element, and an
    *update* instruction that folds one element into the accumulators per
    iteration of the reduction inames.

    Returns the accumulator variable (or a tuple of them when
    *nresults* > 1) to be spliced into the surrounding expression.
    """
    # NOTE(review): updated to the ``within_inames`` naming used by the
    # rest of this file (the driver builds ``within_inames`` kwargs);
    # the previous ``forced_iname_deps`` spelling was stale.
    outer_insn_inames = temp_kernel.insn_inames(insn)

    from pymbolic import var
    acc_var_names = [
            var_name_gen("acc_"+"_".join(expr.inames))
            for i in range(nresults)]
    acc_vars = tuple(var(n) for n in acc_var_names)

    from loopy.kernel.data import TemporaryVariable, temp_var_scope

    # One private scalar temporary per reduction result.
    for name, dtype in zip(acc_var_names, reduction_dtypes):
        new_temporary_variables[name] = TemporaryVariable(
                name=name,
                shape=(),
                dtype=dtype,
                scope=temp_var_scope.PRIVATE)

    init_id = insn_id_gen(
            "%s_%s_init" % (insn.id, "_".join(expr.inames)))

    # Initialization happens outside the reduction inames.
    init_insn = make_assignment(
            id=init_id,
            assignees=acc_vars,
            within_inames=outer_insn_inames - frozenset(expr.inames),
            within_inames_is_final=insn.within_inames_is_final,
            depends_on=frozenset(),
            expression=expr.operation.neutral_element(arg_dtype, expr.inames))

    generated_insns.append(init_insn)

    update_id = insn_id_gen(
            based_on="%s_%s_update" % (insn.id, "_".join(expr.inames)))

    update_insn_iname_deps = temp_kernel.insn_inames(insn) | set(expr.inames)
    if insn.within_inames_is_final:
        update_insn_iname_deps = insn.within_inames | set(expr.inames)

    # The update runs inside the reduction inames and accumulates.
    reduction_insn = make_assignment(
            id=update_id,
            assignees=acc_vars,
            expression=expr.operation(
                arg_dtype,
                acc_vars if len(acc_vars) > 1 else acc_vars[0],
                expr.expr, expr.inames),
            depends_on=frozenset([init_insn.id]) | insn.depends_on,
            within_inames=update_insn_iname_deps,
            within_inames_is_final=insn.within_inames_is_final)

    generated_insns.append(reduction_insn)

    new_insn_add_depends_on.add(reduction_insn.id)

    if nresults == 1:
        assert len(acc_vars) == 1
        return acc_vars[0]
    else:
        return acc_vars
def map_reduction_seq(expr, rec, nresults, arg_dtype, reduction_dtypes):
    """Expand a reduction over sequentially-executed inames.

    Emits (into ``generated_insns``) an *init* instruction that sets fresh
    private scalar accumulators to the operation's neutral element, and an
    *update* instruction that folds one element into the accumulators per
    iteration of the reduction inames.

    Returns the accumulator variable (or a tuple of them when
    *nresults* > 1) to be spliced into the surrounding expression.
    """
    outer_insn_inames = temp_kernel.insn_inames(insn)

    from pymbolic import var
    acc_var_names = [
            var_name_gen("acc_"+"_".join(expr.inames))
            for i in range(nresults)]
    acc_vars = tuple(var(n) for n in acc_var_names)

    from loopy.kernel.data import TemporaryVariable, temp_var_scope

    # One private scalar temporary per reduction result.
    for name, dtype in zip(acc_var_names, reduction_dtypes):
        new_temporary_variables[name] = TemporaryVariable(
                name=name,
                shape=(),
                dtype=dtype,
                scope=temp_var_scope.PRIVATE)

    init_id = insn_id_gen(
            "%s_%s_init" % (insn.id, "_".join(expr.inames)))

    # Initialization happens outside the reduction inames, with no
    # dependencies of its own.
    init_insn = make_assignment(
            id=init_id,
            assignees=acc_vars,
            within_inames=outer_insn_inames - frozenset(expr.inames),
            within_inames_is_final=insn.within_inames_is_final,
            depends_on=frozenset(),
            expression=expr.operation.neutral_element(arg_dtype, expr.inames))

    generated_insns.append(init_insn)

    update_id = insn_id_gen(
            based_on="%s_%s_update" % (insn.id, "_".join(expr.inames)))

    # The update runs inside the reduction inames; when the original
    # instruction pins its inames as final, honor that exactly.
    update_insn_iname_deps = temp_kernel.insn_inames(insn) | set(expr.inames)
    if insn.within_inames_is_final:
        update_insn_iname_deps = insn.within_inames | set(expr.inames)

    reduction_insn = make_assignment(
            id=update_id,
            assignees=acc_vars,
            expression=expr.operation(
                arg_dtype,
                # The operation receives a bare scalar (not a 1-tuple) in
                # the single-result case.
                acc_vars if len(acc_vars) > 1 else acc_vars[0],
                expr.expr, expr.inames),
            depends_on=frozenset([init_insn.id]) | insn.depends_on,
            within_inames=update_insn_iname_deps,
            within_inames_is_final=insn.within_inames_is_final)

    generated_insns.append(reduction_insn)

    # The rewritten original instruction must wait for the accumulation.
    new_insn_add_depends_on.add(reduction_insn.id)

    if nresults == 1:
        assert len(acc_vars) == 1
        return acc_vars[0]
    else:
        return acc_vars
def expand_inner_reduction(id, expr, nresults, depends_on, within_inames, within_inames_is_final):
    """Assign a multi-result ``Call``/``Reduction`` to fresh temporaries.

    Emits a single call instruction (into ``generated_insns``) that stores
    each result of *expr* into a fresh private scalar temporary, and
    returns the tuple of those temporaries as variables.
    """
    from pymbolic import var
    from pymbolic.primitives import Call
    from loopy.symbolic import Reduction
    from loopy.kernel.data import TemporaryVariable, temp_var_scope

    assert isinstance(expr, (Call, Reduction))

    result_names = []
    for result_nr in range(nresults):
        tv_name = var_name_gen(id + "_arg" + str(result_nr))
        result_names.append(tv_name)
        # Scalar private temporary; the dtype is left for type inference.
        new_temporary_variables[tv_name] = TemporaryVariable(
                name=tv_name,
                shape=(),
                dtype=lp.auto,
                scope=temp_var_scope.PRIVATE)

    result_vars = tuple(var(tv_name) for tv_name in result_names)

    rewritten_insn = make_assignment(
            id=id,
            assignees=result_vars,
            expression=expr,
            depends_on=depends_on,
            within_inames=within_inames,
            within_inames_is_final=within_inames_is_final)
    generated_insns.append(rewritten_insn)

    return result_vars
def map_reduction_local(expr, rec, nresults, arg_dtype, reduction_dtypes):
    """Expand a reduction over a single local-parallel iname into a
    tree ("stage") combination over a local-memory accumulator array.

    Emits init/transfer instructions plus log2(size) halving stages into
    ``generated_insns`` and returns the subscripted accumulator(s) holding
    the final value.
    """
    # NOTE(review): two fixes relative to the original:
    # 1. renamed ``forced_iname_deps``/``new_insn_add_forced_iname_deps``
    #    to the ``within_inames`` spelling used by the rest of this file —
    #    ``new_insn_add_forced_iname_deps`` is never defined here (the
    #    driver defines ``new_insn_add_within_inames``), so the old name
    #    was a guaranteed NameError;
    # 2. ``stage_exec_iname`` is now initialized to None so the
    #    ``stage_exec_iname or base_exec_iname`` fallback works when the
    #    stage loop never runs (size == 1) instead of raising NameError.
    red_iname, = expr.inames

    size = _get_int_iname_size(red_iname)

    outer_insn_inames = temp_kernel.insn_inames(insn)

    from loopy.kernel.data import LocalIndexTagBase
    outer_local_inames = tuple(
            oiname
            for oiname in outer_insn_inames
            if isinstance(
                kernel.iname_to_tag.get(oiname),
                LocalIndexTagBase))

    from pymbolic import var
    outer_local_iname_vars = tuple(
            var(oiname) for oiname in outer_local_inames)

    outer_local_iname_sizes = tuple(
            _get_int_iname_size(oiname)
            for oiname in outer_local_inames)

    # {{{ add separate iname to carry out the reduction

    # Doing this sheds any odd conditionals that may be active
    # on our red_iname.

    base_exec_iname = var_name_gen("red_"+red_iname)
    domains.append(_make_slab_set(base_exec_iname, size))
    new_iname_tags[base_exec_iname] = kernel.iname_to_tag[red_iname]

    # }}}

    acc_var_names = [
            var_name_gen("acc_"+red_iname)
            for i in range(nresults)]
    acc_vars = tuple(var(n) for n in acc_var_names)

    from loopy.kernel.data import TemporaryVariable, temp_var_scope
    # Accumulators live in local memory, one slot per work item.
    for name, dtype in zip(acc_var_names, reduction_dtypes):
        new_temporary_variables[name] = TemporaryVariable(
                name=name,
                shape=outer_local_iname_sizes + (size,),
                dtype=dtype,
                scope=temp_var_scope.LOCAL)

    base_iname_deps = outer_insn_inames - frozenset(expr.inames)

    neutral = expr.operation.neutral_element(arg_dtype, expr.inames)

    init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname))
    init_insn = make_assignment(
            id=init_id,
            assignees=tuple(
                acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
                for acc_var in acc_vars),
            expression=neutral,
            within_inames=base_iname_deps | frozenset([base_exec_iname]),
            within_inames_is_final=insn.within_inames_is_final,
            depends_on=frozenset())
    generated_insns.append(init_insn)

    transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname))
    transfer_insn = make_assignment(
            id=transfer_id,
            assignees=tuple(
                acc_var[outer_local_iname_vars + (var(red_iname),)]
                for acc_var in acc_vars),
            expression=expr.operation(
                arg_dtype, neutral, expr.expr, expr.inames),
            within_inames=(
                (outer_insn_inames - frozenset(expr.inames))
                | frozenset([red_iname])),
            within_inames_is_final=insn.within_inames_is_final,
            depends_on=frozenset([init_id]) | insn.depends_on,
            no_sync_with=frozenset([init_id]))
    generated_insns.append(transfer_insn)

    def _strip_if_scalar(c):
        # Pass a bare scalar (not a 1-list) to the operation in the
        # single-result case.
        if len(acc_vars) == 1:
            return c[0]
        else:
            return c

    # Round size up to the next power of two for the halving stages.
    cur_size = 1
    while cur_size < size:
        cur_size *= 2

    prev_id = transfer_id
    bound = size

    istage = 0
    stage_exec_iname = None
    while cur_size > 1:
        new_size = cur_size // 2
        assert new_size * 2 == cur_size

        stage_exec_iname = var_name_gen("red_%s_s%d" % (red_iname, istage))
        domains.append(_make_slab_set(stage_exec_iname, bound-new_size))
        new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[red_iname]

        stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage))
        stage_insn = make_assignment(
                id=stage_id,
                assignees=tuple(
                    acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
                    for acc_var in acc_vars),
                expression=expr.operation(
                    arg_dtype,
                    _strip_if_scalar([
                        acc_var[
                            outer_local_iname_vars + (var(stage_exec_iname),)]
                        for acc_var in acc_vars]),
                    _strip_if_scalar([
                        acc_var[
                            outer_local_iname_vars + (
                                var(stage_exec_iname) + new_size,)]
                        for acc_var in acc_vars]),
                    expr.inames),
                within_inames=(
                    base_iname_deps | frozenset([stage_exec_iname])),
                within_inames_is_final=insn.within_inames_is_final,
                depends_on=frozenset([prev_id]),
                )

        generated_insns.append(stage_insn)
        prev_id = stage_id

        cur_size = new_size
        bound = cur_size
        istage += 1

    new_insn_add_depends_on.add(prev_id)
    new_insn_add_no_sync_with.add(prev_id)
    new_insn_add_within_inames.add(stage_exec_iname or base_exec_iname)

    if nresults == 1:
        assert len(acc_vars) == 1
        return acc_vars[0][outer_local_iname_vars + (0,)]
    else:
        return [acc_var[outer_local_iname_vars + (0,)]
                for acc_var in acc_vars]
def map_reduction_local(expr, rec, nresults, arg_dtype, reduction_dtypes):
    """Expand a reduction over a single local-parallel iname into a
    tree ("stage") combination over a local-memory accumulator array.

    Emits init/transfer instructions plus log2(size) halving stages into
    ``generated_insns`` and returns the subscripted accumulator(s) holding
    the final value.
    """
    # NOTE(review): ``stage_exec_iname`` is now initialized to None so the
    # ``stage_exec_iname or base_exec_iname`` fallback at the bottom works
    # when the stage loop never runs (size == 1); previously that case
    # raised NameError on the unbound name.
    red_iname, = expr.inames

    size = _get_int_iname_size(red_iname)

    outer_insn_inames = temp_kernel.insn_inames(insn)

    from loopy.kernel.data import LocalIndexTagBase
    outer_local_inames = tuple(
            oiname
            for oiname in outer_insn_inames
            if isinstance(
                kernel.iname_to_tag.get(oiname),
                LocalIndexTagBase))

    from pymbolic import var
    outer_local_iname_vars = tuple(
            var(oiname) for oiname in outer_local_inames)

    outer_local_iname_sizes = tuple(
            _get_int_iname_size(oiname)
            for oiname in outer_local_inames)

    # {{{ add separate iname to carry out the reduction

    # Doing this sheds any odd conditionals that may be active
    # on our red_iname.

    base_exec_iname = var_name_gen("red_"+red_iname)
    domains.append(_make_slab_set(base_exec_iname, size))
    new_iname_tags[base_exec_iname] = kernel.iname_to_tag[red_iname]

    # }}}

    acc_var_names = [
            var_name_gen("acc_"+red_iname)
            for i in range(nresults)]
    acc_vars = tuple(var(n) for n in acc_var_names)

    from loopy.kernel.data import TemporaryVariable, temp_var_scope
    # Accumulators live in local memory, one slot per work item.
    for name, dtype in zip(acc_var_names, reduction_dtypes):
        new_temporary_variables[name] = TemporaryVariable(
                name=name,
                shape=outer_local_iname_sizes + (size,),
                dtype=dtype,
                scope=temp_var_scope.LOCAL)

    base_iname_deps = outer_insn_inames - frozenset(expr.inames)

    neutral = expr.operation.neutral_element(arg_dtype, expr.inames)

    init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname))
    init_insn = make_assignment(
            id=init_id,
            assignees=tuple(
                acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
                for acc_var in acc_vars),
            expression=neutral,
            within_inames=base_iname_deps | frozenset([base_exec_iname]),
            within_inames_is_final=insn.within_inames_is_final,
            depends_on=frozenset())
    generated_insns.append(init_insn)

    transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname))
    transfer_insn = make_assignment(
            id=transfer_id,
            assignees=tuple(
                acc_var[outer_local_iname_vars + (var(red_iname),)]
                for acc_var in acc_vars),
            expression=expr.operation(
                arg_dtype, neutral, expr.expr, expr.inames),
            within_inames=(
                (outer_insn_inames - frozenset(expr.inames))
                | frozenset([red_iname])),
            within_inames_is_final=insn.within_inames_is_final,
            depends_on=frozenset([init_id]) | insn.depends_on,
            no_sync_with=frozenset([init_id]))
    generated_insns.append(transfer_insn)

    def _strip_if_scalar(c):
        # Pass a bare scalar (not a 1-list) to the operation in the
        # single-result case.
        if len(acc_vars) == 1:
            return c[0]
        else:
            return c

    # Round size up to the next power of two for the halving stages.
    cur_size = 1
    while cur_size < size:
        cur_size *= 2

    prev_id = transfer_id
    bound = size

    istage = 0
    stage_exec_iname = None
    while cur_size > 1:
        new_size = cur_size // 2
        assert new_size * 2 == cur_size

        stage_exec_iname = var_name_gen("red_%s_s%d" % (red_iname, istage))
        domains.append(_make_slab_set(stage_exec_iname, bound-new_size))
        new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[red_iname]

        stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage))
        stage_insn = make_assignment(
                id=stage_id,
                assignees=tuple(
                    acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
                    for acc_var in acc_vars),
                expression=expr.operation(
                    arg_dtype,
                    _strip_if_scalar([
                        acc_var[
                            outer_local_iname_vars + (var(stage_exec_iname),)]
                        for acc_var in acc_vars]),
                    _strip_if_scalar([
                        acc_var[
                            outer_local_iname_vars + (
                                var(stage_exec_iname) + new_size,)]
                        for acc_var in acc_vars]),
                    expr.inames),
                within_inames=(
                    base_iname_deps | frozenset([stage_exec_iname])),
                within_inames_is_final=insn.within_inames_is_final,
                depends_on=frozenset([prev_id]),
                )

        generated_insns.append(stage_insn)
        prev_id = stage_id

        cur_size = new_size
        bound = cur_size
        istage += 1

    new_insn_add_depends_on.add(prev_id)
    new_insn_add_no_sync_with.add(prev_id)
    new_insn_add_within_inames.add(stage_exec_iname or base_exec_iname)

    if nresults == 1:
        assert len(acc_vars) == 1
        return acc_vars[0][outer_local_iname_vars + (0,)]
    else:
        return [acc_var[outer_local_iname_vars + (0,)]
                for acc_var in acc_vars]
def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
    """Rewrites reductions into their imperative form. With *insn_id_filter*
    specified, operate only on the instruction with an instruction id matching
    *insn_id_filter*.

    If *insn_id_filter* is given, only the outermost level of reductions will be
    expanded, inner reductions will be left alone (because they end up in a new
    instruction with a different ID, which doesn't match the filter).

    If *insn_id_filter* is not given, all reductions in all instructions will
    be realized.
    """

    logger.debug("%s: realize reduction" % kernel.name)

    new_insns = []
    new_iname_tags = {}

    insn_id_gen = kernel.get_instruction_id_generator()

    var_name_gen = kernel.get_var_name_generator()
    new_temporary_variables = kernel.temporary_variables.copy()

    from loopy.expression import TypeInferenceMapper
    type_inf_mapper = TypeInferenceMapper(kernel)

    # {{{ sequential

    def map_reduction_seq(expr, rec, nresults, arg_dtype, reduction_dtypes):
        # Expand a reduction over sequential inames into an init
        # instruction plus an accumulating update instruction, using fresh
        # private scalar temporaries.
        outer_insn_inames = temp_kernel.insn_inames(insn)

        from pymbolic import var
        acc_var_names = [
                var_name_gen("acc_"+"_".join(expr.inames))
                for i in range(nresults)]
        acc_vars = tuple(var(n) for n in acc_var_names)

        from loopy.kernel.data import TemporaryVariable, temp_var_scope

        for name, dtype in zip(acc_var_names, reduction_dtypes):
            new_temporary_variables[name] = TemporaryVariable(
                    name=name,
                    shape=(),
                    dtype=dtype,
                    scope=temp_var_scope.PRIVATE)

        init_id = insn_id_gen(
                "%s_%s_init" % (insn.id, "_".join(expr.inames)))

        # Initialize with the neutral element, outside the reduction inames.
        init_insn = make_assignment(
                id=init_id,
                assignees=acc_vars,
                within_inames=outer_insn_inames - frozenset(expr.inames),
                within_inames_is_final=insn.within_inames_is_final,
                depends_on=frozenset(),
                expression=expr.operation.neutral_element(arg_dtype, expr.inames))

        generated_insns.append(init_insn)

        update_id = insn_id_gen(
                based_on="%s_%s_update" % (insn.id, "_".join(expr.inames)))

        update_insn_iname_deps = temp_kernel.insn_inames(insn) | set(expr.inames)
        if insn.within_inames_is_final:
            update_insn_iname_deps = insn.within_inames | set(expr.inames)

        # Fold one element into the accumulator(s) per iteration.
        reduction_insn = make_assignment(
                id=update_id,
                assignees=acc_vars,
                expression=expr.operation(
                    arg_dtype,
                    acc_vars if len(acc_vars) > 1 else acc_vars[0],
                    expr.expr, expr.inames),
                depends_on=frozenset([init_insn.id]) | insn.depends_on,
                within_inames=update_insn_iname_deps,
                within_inames_is_final=insn.within_inames_is_final)

        generated_insns.append(reduction_insn)

        new_insn_add_depends_on.add(reduction_insn.id)

        if nresults == 1:
            assert len(acc_vars) == 1
            return acc_vars[0]
        else:
            return acc_vars

    # }}}

    # {{{ local-parallel

    def _get_int_iname_size(iname):
        # Constant upper bound of *iname*'s extent, as a Python integer.
        from loopy.isl_helpers import static_max_of_pw_aff
        from loopy.symbolic import pw_aff_to_expr
        size = pw_aff_to_expr(
                static_max_of_pw_aff(
                    kernel.get_iname_bounds(iname).size,
                    constants_only=True))
        assert isinstance(size, six.integer_types)
        return size

    def _make_slab_set(iname, size):
        # Basic set {[iname] : 0 <= iname < size}.
        v = isl.make_zero_and_vars([iname])
        bs, = (
                v[0].le_set(v[iname])
                & v[iname].lt_set(v[0] + size)).get_basic_sets()
        return bs

    def map_reduction_local(expr, rec, nresults, arg_dtype, reduction_dtypes):
        # Expand a reduction over a single local-parallel iname into a
        # tree combination over a local-memory accumulator array.
        red_iname, = expr.inames

        size = _get_int_iname_size(red_iname)

        outer_insn_inames = temp_kernel.insn_inames(insn)

        from loopy.kernel.data import LocalIndexTagBase
        outer_local_inames = tuple(
                oiname
                for oiname in outer_insn_inames
                if isinstance(
                    kernel.iname_to_tag.get(oiname),
                    LocalIndexTagBase))

        from pymbolic import var
        outer_local_iname_vars = tuple(
                var(oiname) for oiname in outer_local_inames)

        outer_local_iname_sizes = tuple(
                _get_int_iname_size(oiname)
                for oiname in outer_local_inames)

        # {{{ add separate iname to carry out the reduction

        # Doing this sheds any odd conditionals that may be active
        # on our red_iname.

        base_exec_iname = var_name_gen("red_"+red_iname)
        domains.append(_make_slab_set(base_exec_iname, size))
        new_iname_tags[base_exec_iname] = kernel.iname_to_tag[red_iname]

        # }}}

        acc_var_names = [
                var_name_gen("acc_"+red_iname)
                for i in range(nresults)]
        acc_vars = tuple(var(n) for n in acc_var_names)

        from loopy.kernel.data import TemporaryVariable, temp_var_scope
        # Accumulators live in local memory, one slot per work item.
        for name, dtype in zip(acc_var_names, reduction_dtypes):
            new_temporary_variables[name] = TemporaryVariable(
                    name=name,
                    shape=outer_local_iname_sizes + (size,),
                    dtype=dtype,
                    scope=temp_var_scope.LOCAL)

        base_iname_deps = outer_insn_inames - frozenset(expr.inames)

        neutral = expr.operation.neutral_element(arg_dtype, expr.inames)

        init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname))
        init_insn = make_assignment(
                id=init_id,
                assignees=tuple(
                    acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
                    for acc_var in acc_vars),
                expression=neutral,
                within_inames=base_iname_deps | frozenset([base_exec_iname]),
                within_inames_is_final=insn.within_inames_is_final,
                depends_on=frozenset())
        generated_insns.append(init_insn)

        transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname))
        transfer_insn = make_assignment(
                id=transfer_id,
                assignees=tuple(
                    acc_var[outer_local_iname_vars + (var(red_iname),)]
                    for acc_var in acc_vars),
                expression=expr.operation(
                    arg_dtype, neutral, expr.expr, expr.inames),
                within_inames=(
                    (outer_insn_inames - frozenset(expr.inames))
                    | frozenset([red_iname])),
                within_inames_is_final=insn.within_inames_is_final,
                depends_on=frozenset([init_id]) | insn.depends_on,
                no_sync_with=frozenset([init_id]))
        generated_insns.append(transfer_insn)

        def _strip_if_scalar(c):
            # Pass a bare scalar (not a 1-list) in the single-result case.
            if len(acc_vars) == 1:
                return c[0]
            else:
                return c

        # Round size up to the next power of two for the halving stages.
        cur_size = 1
        while cur_size < size:
            cur_size *= 2

        prev_id = transfer_id
        bound = size

        istage = 0
        while cur_size > 1:

            new_size = cur_size // 2
            assert new_size * 2 == cur_size

            stage_exec_iname = var_name_gen("red_%s_s%d" % (red_iname, istage))
            domains.append(_make_slab_set(stage_exec_iname, bound-new_size))
            new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[red_iname]

            stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage))
            stage_insn = make_assignment(
                    id=stage_id,
                    assignees=tuple(
                        acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
                        for acc_var in acc_vars),
                    expression=expr.operation(
                        arg_dtype,
                        _strip_if_scalar([
                            acc_var[
                                outer_local_iname_vars + (var(stage_exec_iname),)]
                            for acc_var in acc_vars]),
                        _strip_if_scalar([
                            acc_var[
                                outer_local_iname_vars + (
                                    var(stage_exec_iname) + new_size,)]
                            for acc_var in acc_vars]),
                        expr.inames),
                    within_inames=(
                        base_iname_deps | frozenset([stage_exec_iname])),
                    within_inames_is_final=insn.within_inames_is_final,
                    depends_on=frozenset([prev_id]),
                    )

            generated_insns.append(stage_insn)
            prev_id = stage_id

            cur_size = new_size
            bound = cur_size
            istage += 1

        new_insn_add_depends_on.add(prev_id)
        new_insn_add_no_sync_with.add(prev_id)
        # NOTE(review): ``stage_exec_iname`` is unbound here when the
        # stage loop above never ran (size == 1) — confirm that case
        # cannot occur, or initialize the name before the loop.
        new_insn_add_within_inames.add(stage_exec_iname or base_exec_iname)

        if nresults == 1:
            assert len(acc_vars) == 1
            return acc_vars[0][outer_local_iname_vars + (0,)]
        else:
            return [acc_var[outer_local_iname_vars + (0,)]
                    for acc_var in acc_vars]

    # }}}

    # {{{ seq/par dispatch

    def map_reduction(expr, rec, nresults=1):
        # Only expand one level of reduction at a time, going from outermost to
        # innermost. Otherwise we get the (iname + insn) dependencies wrong.

        try:
            arg_dtype = type_inf_mapper(expr.expr)
        except DependencyTypeInferenceFailure:
            if unknown_types_ok:
                arg_dtype = lp.auto

                reduction_dtypes = (lp.auto,)*nresults

            else:
                raise LoopyError("failed to determine type of accumulator for "
                        "reduction '%s'" % expr)
        else:
            arg_dtype = arg_dtype.with_target(kernel.target)

            reduction_dtypes = expr.operation.result_dtypes(
                        kernel, arg_dtype, expr.inames)
            reduction_dtypes = tuple(
                    dt.with_target(kernel.target) for dt in reduction_dtypes)

        outer_insn_inames = temp_kernel.insn_inames(insn)
        bad_inames = frozenset(expr.inames) & outer_insn_inames
        if bad_inames:
            raise LoopyError("reduction used within loop(s) that it was "
                    "supposed to reduce over: " + ", ".join(bad_inames))

        # Classify the reduction inames to pick an expansion strategy.
        n_sequential = 0
        n_local_par = 0

        from loopy.kernel.data import (
                LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
                ParallelTag)
        for iname in expr.inames:
            iname_tag = kernel.iname_to_tag.get(iname)

            if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)):
                # These are nominally parallel, but we can live with
                # them as sequential.
                n_sequential += 1

            elif isinstance(iname_tag, LocalIndexTagBase):
                n_local_par += 1

            elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
                raise LoopyError("the only form of parallelism supported "
                        "by reductions is 'local'--found iname '%s' "
                        "tagged '%s'"
                        % (iname, type(iname_tag).__name__))

            else:
                n_sequential += 1

        if n_local_par and n_sequential:
            raise LoopyError("Reduction over '%s' contains both parallel and "
                    "sequential inames. It must be split "
                    "(using split_reduction_{in,out}ward) "
                    "before code generation."
                    % ", ".join(expr.inames))

        if n_local_par > 1:
            raise LoopyError("Reduction over '%s' contains more than"
                    "one parallel iname. It must be split "
                    "(using split_reduction_{in,out}ward) "
                    "before code generation."
                    % ", ".join(expr.inames))

        if n_sequential:
            assert n_local_par == 0
            return map_reduction_seq(expr, rec, nresults, arg_dtype,
                    reduction_dtypes)
        elif n_local_par:
            return map_reduction_local(expr, rec, nresults, arg_dtype,
                    reduction_dtypes)
        else:
            from loopy.diagnostic import warn_with_kernel
            warn_with_kernel(kernel, "empty_reduction",
                    "Empty reduction found (no inames to reduce over). "
                    "Eliminating.")

            return expr.expr

    # }}}

    from loopy.symbolic import ReductionCallbackMapper
    cb_mapper = ReductionCallbackMapper(map_reduction)

    insn_queue = kernel.instructions[:]
    insn_id_replacements = {}
    domains = kernel.domains[:]

    temp_kernel = kernel

    import loopy as lp
    while insn_queue:
        # Per-instruction state shared with the mappers above (closures).
        new_insn_add_depends_on = set()
        new_insn_add_no_sync_with = set()
        new_insn_add_within_inames = set()

        generated_insns = []

        insn = insn_queue.pop(0)

        if insn_id_filter is not None and insn.id != insn_id_filter \
                or not isinstance(insn, lp.MultiAssignmentBase):
            new_insns.append(insn)
            continue

        nresults = len(insn.assignees)

        # Run reduction expansion.
        from loopy.symbolic import Reduction
        if isinstance(insn.expression, Reduction) and nresults > 1:
            new_expressions = cb_mapper(insn.expression, nresults=nresults)
        else:
            new_expressions = (cb_mapper(insn.expression),)

        if generated_insns:
            # An expansion happened, so insert the generated stuff plus
            # ourselves back into the queue.

            kwargs = insn.get_copy_kwargs(
                    depends_on=insn.depends_on
                    | frozenset(new_insn_add_depends_on),
                    no_sync_with=insn.no_sync_with
                    | frozenset(new_insn_add_no_sync_with),
                    within_inames=(
                        temp_kernel.insn_inames(insn)
                        | new_insn_add_within_inames))

            # Remove fields that are supplied explicitly (or become
            # invalid) when constructing the replacement instructions.
            kwargs.pop("id")
            kwargs.pop("expression")
            kwargs.pop("assignee", None)
            kwargs.pop("assignees", None)
            kwargs.pop("temp_var_type", None)
            kwargs.pop("temp_var_types", None)

            if isinstance(insn.expression, Reduction) and nresults > 1:
                # A multi-result reduction becomes one single-assignment
                # instruction per result.
                replacement_insns = [
                        lp.Assignment(
                            id=insn_id_gen(insn.id),
                            assignee=assignee,
                            expression=new_expr,
                            **kwargs)
                        for assignee, new_expr in zip(
                            insn.assignees, new_expressions)]

            else:
                new_expr, = new_expressions
                replacement_insns = [
                        make_assignment(
                            id=insn_id_gen(insn.id),
                            assignees=insn.assignees,
                            expression=new_expr,
                            **kwargs)
                        ]

            insn_id_replacements[insn.id] = [
                    rinsn.id for rinsn in replacement_insns]

            insn_queue = generated_insns + replacement_insns + insn_queue

            # The reduction expander needs an up-to-date kernel
            # object to find dependencies. Keep temp_kernel up-to-date.

            temp_kernel = kernel.copy(
                    instructions=new_insns + insn_queue,
                    temporary_variables=new_temporary_variables,
                    domains=domains)
            temp_kernel = lp.replace_instruction_ids(
                    temp_kernel, insn_id_replacements)

        else:
            # nothing happened, we're done with insn
            assert not new_insn_add_depends_on

            new_insns.append(insn)

    kernel = kernel.copy(
            instructions=new_insns,
            temporary_variables=new_temporary_variables,
            domains=domains)

    kernel = lp.replace_instruction_ids(kernel, insn_id_replacements)

    kernel = lp.tag_inames(kernel, new_iname_tags)

    return kernel
def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
    """
    Multi assignment function calls are currently lowered into OpenCL so that
    the function call::

       a, b = segmented_sum(x, y, z, w)

    becomes::

       a = segmented_sum_mangled(x, y, z, w, &b).

    For OpenCL, the scope of "b" is significant, and the preamble generation
    currently assumes the scope is always private. This function forces that to
    be the case by introducing temporary assignments into the kernel.
    """

    insn_id_gen = kernel.get_instruction_id_generator()
    var_name_gen = kernel.get_var_name_generator()

    # Instructions changed or created by this pass, keyed by id.
    new_or_updated_instructions = {}
    new_temporaries = {}

    dep_map = dict(
            (insn.id, insn.depends_on) for insn in kernel.instructions)

    # insn id -> set of ids of instructions that depend on it.
    inverse_dep_map = dict((insn.id, set()) for insn in kernel.instructions)

    import six
    for insn_id, deps in six.iteritems(dep_map):
        for dep in deps:
            inverse_dep_map[dep].add(insn_id)

    del dep_map

    # {{{ utils

    def _add_to_no_sync_with(insn_id, new_no_sync_with_params):
        # Extend no_sync_with of the (possibly already-updated) instruction.
        insn = kernel.id_to_insn.get(insn_id)
        insn = new_or_updated_instructions.get(insn_id, insn)
        new_or_updated_instructions[insn_id] = (
                insn.copy(
                    no_sync_with=(
                        insn.no_sync_with
                        | frozenset(new_no_sync_with_params))))

    def _add_to_depends_on(insn_id, new_depends_on_params):
        # Extend depends_on of the (possibly already-updated) instruction.
        insn = kernel.id_to_insn.get(insn_id)
        insn = new_or_updated_instructions.get(insn_id, insn)
        new_or_updated_instructions[insn_id] = (
                insn.copy(
                    depends_on=insn.depends_on
                    | frozenset(new_depends_on_params)))

    # }}}

    from loopy.kernel.instruction import CallInstruction
    for insn in kernel.instructions:
        if not isinstance(insn, CallInstruction):
            continue

        if len(insn.assignees) <= 1:
            continue

        assignees = insn.assignees
        assignee_var_names = insn.assignee_var_names()

        new_assignees = [assignees[0]]
        newly_added_assignments_ids = set()
        needs_replacement = False

        last_added_insn_id = insn.id

        from loopy.kernel.data import temp_var_scope, TemporaryVariable

        # Assignee 0 is returned by value; assignees from this index on are
        # passed by pointer and are the ones whose scope matters.
        FIRST_POINTER_ASSIGNEE_IDX = 1  # noqa

        for assignee_nr, assignee_var_name, assignee in zip(
                range(FIRST_POINTER_ASSIGNEE_IDX, len(assignees)),
                assignee_var_names[FIRST_POINTER_ASSIGNEE_IDX:],
                assignees[FIRST_POINTER_ASSIGNEE_IDX:]):

            # Already a private temporary: nothing to do for this assignee.
            if (assignee_var_name in kernel.temporary_variables
                    and (kernel.temporary_variables[assignee_var_name].scope
                             == temp_var_scope.PRIVATE)):
                new_assignees.append(assignee)
                continue

            needs_replacement = True

            # {{{ generate a new assignment instruction

            new_assignee_name = var_name_gen(
                    "{insn_id}_retval_{assignee_nr}".format(
                        insn_id=insn.id, assignee_nr=assignee_nr))

            new_assignment_id = insn_id_gen(
                    "{insn_id}_assign_retval_{assignee_nr}".format(
                        insn_id=insn.id, assignee_nr=assignee_nr))

            newly_added_assignments_ids.add(new_assignment_id)

            import loopy as lp
            new_temporaries[new_assignee_name] = (
                    TemporaryVariable(
                        name=new_assignee_name,
                        dtype=lp.auto,
                        scope=temp_var_scope.PRIVATE))

            from pymbolic import var
            new_assignee = var(new_assignee_name)
            new_assignees.append(new_assignee)

            # Copy the private temporary into the original assignee after
            # the call; chain the copies to keep them ordered.
            new_or_updated_instructions[new_assignment_id] = (
                    make_assignment(
                        assignees=(assignee, ),
                        expression=new_assignee,
                        id=new_assignment_id,
                        depends_on=frozenset([last_added_insn_id]),
                        depends_on_is_final=True,
                        no_sync_with=(
                            insn.no_sync_with | frozenset([(insn.id, "any")])),
                        predicates=insn.predicates,
                        within_inames=insn.within_inames))

            last_added_insn_id = new_assignment_id

            # }}}

        if not needs_replacement:
            continue

        # {{{ update originating instruction

        orig_insn = new_or_updated_instructions.get(insn.id, insn)

        new_or_updated_instructions[insn.id] = (
                orig_insn.copy(assignees=tuple(new_assignees)))

        _add_to_no_sync_with(insn.id,
                [(id, "any") for id in newly_added_assignments_ids])

        # }}}

        # {{{ squash spurious memory dependencies amongst new assignments

        for new_insn_id in newly_added_assignments_ids:
            _add_to_no_sync_with(new_insn_id,
                    [(id, "any")
                     for id in newly_added_assignments_ids
                     if id != new_insn_id])

        # }}}

        # {{{ update instructions that depend on the originating instruction

        for inverse_dep in inverse_dep_map[insn.id]:
            _add_to_depends_on(inverse_dep, newly_added_assignments_ids)

            # Propagate any no_sync_with entry that named the original
            # instruction to the newly added copy instructions as well.
            for insn_id, scope in (
                    new_or_updated_instructions[inverse_dep].no_sync_with):
                if insn_id == insn.id:
                    _add_to_no_sync_with(
                            inverse_dep,
                            [(id, scope)
                             for id in newly_added_assignments_ids])

        # }}}

    new_temporary_variables = kernel.temporary_variables.copy()
    new_temporary_variables.update(new_temporaries)

    new_instructions = (
            list(new_or_updated_instructions.values())
            + list(insn
                for insn in kernel.instructions
                if insn.id not in new_or_updated_instructions))

    return kernel.copy(temporary_variables=new_temporary_variables,
                       instructions=new_instructions)