Exemple #1
0
    def map_reduction_seq(expr, rec, nresults, arg_dtype,
            reduction_dtypes):
        outer_insn_inames = temp_kernel.insn_inames(insn)

        from pymbolic import var
        acc_var_names = [
                var_name_gen("acc_"+"_".join(expr.inames))
                for i in range(nresults)]
        acc_vars = tuple(var(n) for n in acc_var_names)

        from loopy.kernel.data import TemporaryVariable, temp_var_scope

        for name, dtype in zip(acc_var_names, reduction_dtypes):
            new_temporary_variables[name] = TemporaryVariable(
                    name=name,
                    shape=(),
                    dtype=dtype,
                    scope=temp_var_scope.PRIVATE)

        init_id = insn_id_gen(
                "%s_%s_init" % (insn.id, "_".join(expr.inames)))

        init_insn = make_assignment(
                id=init_id,
                assignees=acc_vars,
                forced_iname_deps=outer_insn_inames - frozenset(expr.inames),
                forced_iname_deps_is_final=insn.forced_iname_deps_is_final,
                depends_on=frozenset(),
                expression=expr.operation.neutral_element(arg_dtype, expr.inames))

        generated_insns.append(init_insn)

        update_id = insn_id_gen(
                based_on="%s_%s_update" % (insn.id, "_".join(expr.inames)))

        update_insn_iname_deps = temp_kernel.insn_inames(insn) | set(expr.inames)
        if insn.forced_iname_deps_is_final:
            update_insn_iname_deps = insn.forced_iname_deps | set(expr.inames)

        reduction_insn = make_assignment(
                id=update_id,
                assignees=acc_vars,
                expression=expr.operation(
                    arg_dtype,
                    acc_vars if len(acc_vars) > 1 else acc_vars[0],
                    expr.expr, expr.inames),
                depends_on=frozenset([init_insn.id]) | insn.depends_on,
                forced_iname_deps=update_insn_iname_deps,
                forced_iname_deps_is_final=insn.forced_iname_deps_is_final)

        generated_insns.append(reduction_insn)

        new_insn_add_depends_on.add(reduction_insn.id)

        if nresults == 1:
            assert len(acc_vars) == 1
            return acc_vars[0]
        else:
            return acc_vars
Exemple #2
0
    def map_reduction_seq(expr, rec, nresults, arg_dtype,
            reduction_dtypes):
        outer_insn_inames = temp_kernel.insn_inames(insn)

        from pymbolic import var
        acc_var_names = [
                var_name_gen("acc_"+"_".join(expr.inames))
                for i in range(nresults)]
        acc_vars = tuple(var(n) for n in acc_var_names)

        from loopy.kernel.data import TemporaryVariable, temp_var_scope

        for name, dtype in zip(acc_var_names, reduction_dtypes):
            new_temporary_variables[name] = TemporaryVariable(
                    name=name,
                    shape=(),
                    dtype=dtype,
                    scope=temp_var_scope.PRIVATE)

        init_id = insn_id_gen(
                "%s_%s_init" % (insn.id, "_".join(expr.inames)))

        init_insn = make_assignment(
                id=init_id,
                assignees=acc_vars,
                within_inames=outer_insn_inames - frozenset(expr.inames),
                within_inames_is_final=insn.within_inames_is_final,
                depends_on=frozenset(),
                expression=expr.operation.neutral_element(arg_dtype, expr.inames))

        generated_insns.append(init_insn)

        update_id = insn_id_gen(
                based_on="%s_%s_update" % (insn.id, "_".join(expr.inames)))

        update_insn_iname_deps = temp_kernel.insn_inames(insn) | set(expr.inames)
        if insn.within_inames_is_final:
            update_insn_iname_deps = insn.within_inames | set(expr.inames)

        reduction_insn = make_assignment(
                id=update_id,
                assignees=acc_vars,
                expression=expr.operation(
                    arg_dtype,
                    acc_vars if len(acc_vars) > 1 else acc_vars[0],
                    expr.expr, expr.inames),
                depends_on=frozenset([init_insn.id]) | insn.depends_on,
                within_inames=update_insn_iname_deps,
                within_inames_is_final=insn.within_inames_is_final)

        generated_insns.append(reduction_insn)

        new_insn_add_depends_on.add(reduction_insn.id)

        if nresults == 1:
            assert len(acc_vars) == 1
            return acc_vars[0]
        else:
            return acc_vars
Exemple #3
0
    def expand_inner_reduction(id, expr, nresults, depends_on, within_inames,
                               within_inames_is_final):
        from pymbolic.primitives import Call
        from loopy.symbolic import Reduction
        assert isinstance(expr, (Call, Reduction))

        temp_var_names = [
            var_name_gen(id + "_arg" + str(i)) for i in range(nresults)
        ]

        for name in temp_var_names:
            from loopy.kernel.data import TemporaryVariable, temp_var_scope
            new_temporary_variables[name] = TemporaryVariable(
                name=name,
                shape=(),
                dtype=lp.auto,
                scope=temp_var_scope.PRIVATE)

        from pymbolic import var
        temp_vars = tuple(var(n) for n in temp_var_names)

        call_insn = make_assignment(
            id=id,
            assignees=temp_vars,
            expression=expr,
            depends_on=depends_on,
            within_inames=within_inames,
            within_inames_is_final=within_inames_is_final)

        generated_insns.append(call_insn)

        return temp_vars
Exemple #4
0
    def map_reduction_local(expr, rec, nresults, arg_dtype,
            reduction_dtypes):
        red_iname, = expr.inames

        size = _get_int_iname_size(red_iname)

        outer_insn_inames = temp_kernel.insn_inames(insn)

        from loopy.kernel.data import LocalIndexTagBase
        outer_local_inames = tuple(
                oiname
                for oiname in outer_insn_inames
                if isinstance(
                    kernel.iname_to_tag.get(oiname),
                    LocalIndexTagBase))

        from pymbolic import var
        outer_local_iname_vars = tuple(
                var(oiname) for oiname in outer_local_inames)

        outer_local_iname_sizes = tuple(
                _get_int_iname_size(oiname)
                for oiname in outer_local_inames)

        # {{{ add separate iname to carry out the reduction

        # Doing this sheds any odd conditionals that may be active
        # on our red_iname.

        base_exec_iname = var_name_gen("red_"+red_iname)
        domains.append(_make_slab_set(base_exec_iname, size))
        new_iname_tags[base_exec_iname] = kernel.iname_to_tag[red_iname]

        # }}}

        acc_var_names = [
                var_name_gen("acc_"+red_iname)
                for i in range(nresults)]
        acc_vars = tuple(var(n) for n in acc_var_names)

        from loopy.kernel.data import TemporaryVariable, temp_var_scope
        for name, dtype in zip(acc_var_names, reduction_dtypes):
            new_temporary_variables[name] = TemporaryVariable(
                    name=name,
                    shape=outer_local_iname_sizes + (size,),
                    dtype=dtype,
                    scope=temp_var_scope.LOCAL)

        base_iname_deps = outer_insn_inames - frozenset(expr.inames)

        neutral = expr.operation.neutral_element(arg_dtype, expr.inames)

        init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname))
        init_insn = make_assignment(
                id=init_id,
                assignees=tuple(
                    acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
                    for acc_var in acc_vars),
                expression=neutral,
                forced_iname_deps=base_iname_deps | frozenset([base_exec_iname]),
                forced_iname_deps_is_final=insn.forced_iname_deps_is_final,
                depends_on=frozenset())
        generated_insns.append(init_insn)

        transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname))
        transfer_insn = make_assignment(
                id=transfer_id,
                assignees=tuple(
                    acc_var[outer_local_iname_vars + (var(red_iname),)]
                    for acc_var in acc_vars),
                expression=expr.operation(
                    arg_dtype, neutral, expr.expr, expr.inames),
                forced_iname_deps=(
                    (outer_insn_inames - frozenset(expr.inames))
                    | frozenset([red_iname])),
                forced_iname_deps_is_final=insn.forced_iname_deps_is_final,
                depends_on=frozenset([init_id]) | insn.depends_on,
                no_sync_with=frozenset([init_id]))
        generated_insns.append(transfer_insn)

        def _strip_if_scalar(c):
            if len(acc_vars) == 1:
                return c[0]
            else:
                return c

        cur_size = 1
        while cur_size < size:
            cur_size *= 2

        prev_id = transfer_id
        bound = size

        istage = 0
        while cur_size > 1:

            new_size = cur_size // 2
            assert new_size * 2 == cur_size

            stage_exec_iname = var_name_gen("red_%s_s%d" % (red_iname, istage))
            domains.append(_make_slab_set(stage_exec_iname, bound-new_size))
            new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[red_iname]

            stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage))
            stage_insn = make_assignment(
                    id=stage_id,
                    assignees=tuple(
                        acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
                        for acc_var in acc_vars),
                    expression=expr.operation(
                        arg_dtype,
                        _strip_if_scalar([
                            acc_var[
                                outer_local_iname_vars + (var(stage_exec_iname),)]
                            for acc_var in acc_vars]),
                        _strip_if_scalar([
                            acc_var[
                                outer_local_iname_vars + (
                                    var(stage_exec_iname) + new_size,)]
                            for acc_var in acc_vars]),
                        expr.inames),
                    forced_iname_deps=(
                        base_iname_deps | frozenset([stage_exec_iname])),
                    forced_iname_deps_is_final=insn.forced_iname_deps_is_final,
                    depends_on=frozenset([prev_id]),
                    )

            generated_insns.append(stage_insn)
            prev_id = stage_id

            cur_size = new_size
            bound = cur_size
            istage += 1

        new_insn_add_depends_on.add(prev_id)
        new_insn_add_no_sync_with.add(prev_id)
        new_insn_add_forced_iname_deps.add(stage_exec_iname or base_exec_iname)

        if nresults == 1:
            assert len(acc_vars) == 1
            return acc_vars[0][outer_local_iname_vars + (0,)]
        else:
            return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars]
Exemple #5
0
    def map_reduction_local(expr, rec, nresults, arg_dtype,
            reduction_dtypes):
        red_iname, = expr.inames

        size = _get_int_iname_size(red_iname)

        outer_insn_inames = temp_kernel.insn_inames(insn)

        from loopy.kernel.data import LocalIndexTagBase
        outer_local_inames = tuple(
                oiname
                for oiname in outer_insn_inames
                if isinstance(
                    kernel.iname_to_tag.get(oiname),
                    LocalIndexTagBase))

        from pymbolic import var
        outer_local_iname_vars = tuple(
                var(oiname) for oiname in outer_local_inames)

        outer_local_iname_sizes = tuple(
                _get_int_iname_size(oiname)
                for oiname in outer_local_inames)

        # {{{ add separate iname to carry out the reduction

        # Doing this sheds any odd conditionals that may be active
        # on our red_iname.

        base_exec_iname = var_name_gen("red_"+red_iname)
        domains.append(_make_slab_set(base_exec_iname, size))
        new_iname_tags[base_exec_iname] = kernel.iname_to_tag[red_iname]

        # }}}

        acc_var_names = [
                var_name_gen("acc_"+red_iname)
                for i in range(nresults)]
        acc_vars = tuple(var(n) for n in acc_var_names)

        from loopy.kernel.data import TemporaryVariable, temp_var_scope
        for name, dtype in zip(acc_var_names, reduction_dtypes):
            new_temporary_variables[name] = TemporaryVariable(
                    name=name,
                    shape=outer_local_iname_sizes + (size,),
                    dtype=dtype,
                    scope=temp_var_scope.LOCAL)

        base_iname_deps = outer_insn_inames - frozenset(expr.inames)

        neutral = expr.operation.neutral_element(arg_dtype, expr.inames)

        init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname))
        init_insn = make_assignment(
                id=init_id,
                assignees=tuple(
                    acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
                    for acc_var in acc_vars),
                expression=neutral,
                within_inames=base_iname_deps | frozenset([base_exec_iname]),
                within_inames_is_final=insn.within_inames_is_final,
                depends_on=frozenset())
        generated_insns.append(init_insn)

        transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname))
        transfer_insn = make_assignment(
                id=transfer_id,
                assignees=tuple(
                    acc_var[outer_local_iname_vars + (var(red_iname),)]
                    for acc_var in acc_vars),
                expression=expr.operation(
                    arg_dtype, neutral, expr.expr, expr.inames),
                within_inames=(
                    (outer_insn_inames - frozenset(expr.inames))
                    | frozenset([red_iname])),
                within_inames_is_final=insn.within_inames_is_final,
                depends_on=frozenset([init_id]) | insn.depends_on,
                no_sync_with=frozenset([init_id]))
        generated_insns.append(transfer_insn)

        def _strip_if_scalar(c):
            if len(acc_vars) == 1:
                return c[0]
            else:
                return c

        cur_size = 1
        while cur_size < size:
            cur_size *= 2

        prev_id = transfer_id
        bound = size

        istage = 0
        while cur_size > 1:

            new_size = cur_size // 2
            assert new_size * 2 == cur_size

            stage_exec_iname = var_name_gen("red_%s_s%d" % (red_iname, istage))
            domains.append(_make_slab_set(stage_exec_iname, bound-new_size))
            new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[red_iname]

            stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage))
            stage_insn = make_assignment(
                    id=stage_id,
                    assignees=tuple(
                        acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
                        for acc_var in acc_vars),
                    expression=expr.operation(
                        arg_dtype,
                        _strip_if_scalar([
                            acc_var[
                                outer_local_iname_vars + (var(stage_exec_iname),)]
                            for acc_var in acc_vars]),
                        _strip_if_scalar([
                            acc_var[
                                outer_local_iname_vars + (
                                    var(stage_exec_iname) + new_size,)]
                            for acc_var in acc_vars]),
                        expr.inames),
                    within_inames=(
                        base_iname_deps | frozenset([stage_exec_iname])),
                    within_inames_is_final=insn.within_inames_is_final,
                    depends_on=frozenset([prev_id]),
                    )

            generated_insns.append(stage_insn)
            prev_id = stage_id

            cur_size = new_size
            bound = cur_size
            istage += 1

        new_insn_add_depends_on.add(prev_id)
        new_insn_add_no_sync_with.add(prev_id)
        new_insn_add_within_inames.add(stage_exec_iname or base_exec_iname)

        if nresults == 1:
            assert len(acc_vars) == 1
            return acc_vars[0][outer_local_iname_vars + (0,)]
        else:
            return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars]
Exemple #6
0
def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
    """Rewrites reductions into their imperative form. With *insn_id_filter*
    specified, operate only on the instruction with an instruction id matching
    *insn_id_filter*.

    If *insn_id_filter* is given, only the outermost level of reductions will be
    expanded, inner reductions will be left alone (because they end up in a new
    instruction with a different ID, which doesn't match the filter).

    If *insn_id_filter* is not given, all reductions in all instructions will
    be realized.
    """

    logger.debug("%s: realize reduction" % kernel.name)

    new_insns = []
    new_iname_tags = {}

    insn_id_gen = kernel.get_instruction_id_generator()

    var_name_gen = kernel.get_var_name_generator()
    new_temporary_variables = kernel.temporary_variables.copy()

    from loopy.expression import TypeInferenceMapper
    type_inf_mapper = TypeInferenceMapper(kernel)

    # {{{ sequential

    def map_reduction_seq(expr, rec, nresults, arg_dtype,
            reduction_dtypes):
        outer_insn_inames = temp_kernel.insn_inames(insn)

        from pymbolic import var
        acc_var_names = [
                var_name_gen("acc_"+"_".join(expr.inames))
                for i in range(nresults)]
        acc_vars = tuple(var(n) for n in acc_var_names)

        from loopy.kernel.data import TemporaryVariable, temp_var_scope

        for name, dtype in zip(acc_var_names, reduction_dtypes):
            new_temporary_variables[name] = TemporaryVariable(
                    name=name,
                    shape=(),
                    dtype=dtype,
                    scope=temp_var_scope.PRIVATE)

        init_id = insn_id_gen(
                "%s_%s_init" % (insn.id, "_".join(expr.inames)))

        init_insn = make_assignment(
                id=init_id,
                assignees=acc_vars,
                within_inames=outer_insn_inames - frozenset(expr.inames),
                within_inames_is_final=insn.within_inames_is_final,
                depends_on=frozenset(),
                expression=expr.operation.neutral_element(arg_dtype, expr.inames))

        generated_insns.append(init_insn)

        update_id = insn_id_gen(
                based_on="%s_%s_update" % (insn.id, "_".join(expr.inames)))

        update_insn_iname_deps = temp_kernel.insn_inames(insn) | set(expr.inames)
        if insn.within_inames_is_final:
            update_insn_iname_deps = insn.within_inames | set(expr.inames)

        reduction_insn = make_assignment(
                id=update_id,
                assignees=acc_vars,
                expression=expr.operation(
                    arg_dtype,
                    acc_vars if len(acc_vars) > 1 else acc_vars[0],
                    expr.expr, expr.inames),
                depends_on=frozenset([init_insn.id]) | insn.depends_on,
                within_inames=update_insn_iname_deps,
                within_inames_is_final=insn.within_inames_is_final)

        generated_insns.append(reduction_insn)

        new_insn_add_depends_on.add(reduction_insn.id)

        if nresults == 1:
            assert len(acc_vars) == 1
            return acc_vars[0]
        else:
            return acc_vars

    # }}}

    # {{{ local-parallel

    def _get_int_iname_size(iname):
        from loopy.isl_helpers import static_max_of_pw_aff
        from loopy.symbolic import pw_aff_to_expr
        size = pw_aff_to_expr(
                static_max_of_pw_aff(
                    kernel.get_iname_bounds(iname).size,
                    constants_only=True))
        assert isinstance(size, six.integer_types)
        return size

    def _make_slab_set(iname, size):
        v = isl.make_zero_and_vars([iname])
        bs, = (
                v[0].le_set(v[iname])
                &
                v[iname].lt_set(v[0] + size)).get_basic_sets()
        return bs

    def map_reduction_local(expr, rec, nresults, arg_dtype,
            reduction_dtypes):
        red_iname, = expr.inames

        size = _get_int_iname_size(red_iname)

        outer_insn_inames = temp_kernel.insn_inames(insn)

        from loopy.kernel.data import LocalIndexTagBase
        outer_local_inames = tuple(
                oiname
                for oiname in outer_insn_inames
                if isinstance(
                    kernel.iname_to_tag.get(oiname),
                    LocalIndexTagBase))

        from pymbolic import var
        outer_local_iname_vars = tuple(
                var(oiname) for oiname in outer_local_inames)

        outer_local_iname_sizes = tuple(
                _get_int_iname_size(oiname)
                for oiname in outer_local_inames)

        # {{{ add separate iname to carry out the reduction

        # Doing this sheds any odd conditionals that may be active
        # on our red_iname.

        base_exec_iname = var_name_gen("red_"+red_iname)
        domains.append(_make_slab_set(base_exec_iname, size))
        new_iname_tags[base_exec_iname] = kernel.iname_to_tag[red_iname]

        # }}}

        acc_var_names = [
                var_name_gen("acc_"+red_iname)
                for i in range(nresults)]
        acc_vars = tuple(var(n) for n in acc_var_names)

        from loopy.kernel.data import TemporaryVariable, temp_var_scope
        for name, dtype in zip(acc_var_names, reduction_dtypes):
            new_temporary_variables[name] = TemporaryVariable(
                    name=name,
                    shape=outer_local_iname_sizes + (size,),
                    dtype=dtype,
                    scope=temp_var_scope.LOCAL)

        base_iname_deps = outer_insn_inames - frozenset(expr.inames)

        neutral = expr.operation.neutral_element(arg_dtype, expr.inames)

        init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname))
        init_insn = make_assignment(
                id=init_id,
                assignees=tuple(
                    acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
                    for acc_var in acc_vars),
                expression=neutral,
                within_inames=base_iname_deps | frozenset([base_exec_iname]),
                within_inames_is_final=insn.within_inames_is_final,
                depends_on=frozenset())
        generated_insns.append(init_insn)

        transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, red_iname))
        transfer_insn = make_assignment(
                id=transfer_id,
                assignees=tuple(
                    acc_var[outer_local_iname_vars + (var(red_iname),)]
                    for acc_var in acc_vars),
                expression=expr.operation(
                    arg_dtype, neutral, expr.expr, expr.inames),
                within_inames=(
                    (outer_insn_inames - frozenset(expr.inames))
                    | frozenset([red_iname])),
                within_inames_is_final=insn.within_inames_is_final,
                depends_on=frozenset([init_id]) | insn.depends_on,
                no_sync_with=frozenset([init_id]))
        generated_insns.append(transfer_insn)

        def _strip_if_scalar(c):
            if len(acc_vars) == 1:
                return c[0]
            else:
                return c

        cur_size = 1
        while cur_size < size:
            cur_size *= 2

        prev_id = transfer_id
        bound = size

        istage = 0
        while cur_size > 1:

            new_size = cur_size // 2
            assert new_size * 2 == cur_size

            stage_exec_iname = var_name_gen("red_%s_s%d" % (red_iname, istage))
            domains.append(_make_slab_set(stage_exec_iname, bound-new_size))
            new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[red_iname]

            stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage))
            stage_insn = make_assignment(
                    id=stage_id,
                    assignees=tuple(
                        acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
                        for acc_var in acc_vars),
                    expression=expr.operation(
                        arg_dtype,
                        _strip_if_scalar([
                            acc_var[
                                outer_local_iname_vars + (var(stage_exec_iname),)]
                            for acc_var in acc_vars]),
                        _strip_if_scalar([
                            acc_var[
                                outer_local_iname_vars + (
                                    var(stage_exec_iname) + new_size,)]
                            for acc_var in acc_vars]),
                        expr.inames),
                    within_inames=(
                        base_iname_deps | frozenset([stage_exec_iname])),
                    within_inames_is_final=insn.within_inames_is_final,
                    depends_on=frozenset([prev_id]),
                    )

            generated_insns.append(stage_insn)
            prev_id = stage_id

            cur_size = new_size
            bound = cur_size
            istage += 1

        new_insn_add_depends_on.add(prev_id)
        new_insn_add_no_sync_with.add(prev_id)
        new_insn_add_within_inames.add(stage_exec_iname or base_exec_iname)

        if nresults == 1:
            assert len(acc_vars) == 1
            return acc_vars[0][outer_local_iname_vars + (0,)]
        else:
            return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars]
    # }}}

    # {{{ seq/par dispatch

    def map_reduction(expr, rec, nresults=1):
        # Only expand one level of reduction at a time, going from outermost to
        # innermost. Otherwise we get the (iname + insn) dependencies wrong.

        try:
            arg_dtype = type_inf_mapper(expr.expr)
        except DependencyTypeInferenceFailure:
            if unknown_types_ok:
                arg_dtype = lp.auto

                reduction_dtypes = (lp.auto,)*nresults

            else:
                raise LoopyError("failed to determine type of accumulator for "
                        "reduction '%s'" % expr)
        else:
            arg_dtype = arg_dtype.with_target(kernel.target)

            reduction_dtypes = expr.operation.result_dtypes(
                        kernel, arg_dtype, expr.inames)
            reduction_dtypes = tuple(
                    dt.with_target(kernel.target) for dt in reduction_dtypes)

        outer_insn_inames = temp_kernel.insn_inames(insn)
        bad_inames = frozenset(expr.inames) & outer_insn_inames
        if bad_inames:
            raise LoopyError("reduction used within loop(s) that it was "
                    "supposed to reduce over: " + ", ".join(bad_inames))

        n_sequential = 0
        n_local_par = 0

        from loopy.kernel.data import (
                LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
                ParallelTag)
        for iname in expr.inames:
            iname_tag = kernel.iname_to_tag.get(iname)

            if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)):
                # These are nominally parallel, but we can live with
                # them as sequential.
                n_sequential += 1

            elif isinstance(iname_tag, LocalIndexTagBase):
                n_local_par += 1

            elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
                raise LoopyError("the only form of parallelism supported "
                        "by reductions is 'local'--found iname '%s' "
                        "tagged '%s'"
                        % (iname, type(iname_tag).__name__))

            else:
                n_sequential += 1

        if n_local_par and n_sequential:
            raise LoopyError("Reduction over '%s' contains both parallel and "
                    "sequential inames. It must be split "
                    "(using split_reduction_{in,out}ward) "
                    "before code generation."
                    % ", ".join(expr.inames))

        if n_local_par > 1:
            raise LoopyError("Reduction over '%s' contains more than"
                    "one parallel iname. It must be split "
                    "(using split_reduction_{in,out}ward) "
                    "before code generation."
                    % ", ".join(expr.inames))

        if n_sequential:
            assert n_local_par == 0
            return map_reduction_seq(expr, rec, nresults, arg_dtype,
                    reduction_dtypes)
        elif n_local_par:
            return map_reduction_local(expr, rec, nresults, arg_dtype,
                    reduction_dtypes)
        else:
            from loopy.diagnostic import warn_with_kernel
            warn_with_kernel(kernel, "empty_reduction",
                    "Empty reduction found (no inames to reduce over). "
                    "Eliminating.")

            return expr.expr

    # }}}

    from loopy.symbolic import ReductionCallbackMapper
    cb_mapper = ReductionCallbackMapper(map_reduction)

    insn_queue = kernel.instructions[:]
    insn_id_replacements = {}
    domains = kernel.domains[:]

    temp_kernel = kernel

    import loopy as lp
    while insn_queue:
        new_insn_add_depends_on = set()
        new_insn_add_no_sync_with = set()
        new_insn_add_within_inames = set()

        generated_insns = []

        insn = insn_queue.pop(0)

        if insn_id_filter is not None and insn.id != insn_id_filter \
                or not isinstance(insn, lp.MultiAssignmentBase):
            new_insns.append(insn)
            continue

        nresults = len(insn.assignees)

        # Run reduction expansion.
        from loopy.symbolic import Reduction
        if isinstance(insn.expression, Reduction) and nresults > 1:
            new_expressions = cb_mapper(insn.expression, nresults=nresults)
        else:
            new_expressions = (cb_mapper(insn.expression),)

        if generated_insns:
            # An expansion happened, so insert the generated stuff plus
            # ourselves back into the queue.

            kwargs = insn.get_copy_kwargs(
                    depends_on=insn.depends_on
                    | frozenset(new_insn_add_depends_on),
                    no_sync_with=insn.no_sync_with
                    | frozenset(new_insn_add_no_sync_with),
                    within_inames=(
                        temp_kernel.insn_inames(insn)
                        | new_insn_add_within_inames))

            kwargs.pop("id")
            kwargs.pop("expression")
            kwargs.pop("assignee", None)
            kwargs.pop("assignees", None)
            kwargs.pop("temp_var_type", None)
            kwargs.pop("temp_var_types", None)

            if isinstance(insn.expression, Reduction) and nresults > 1:
                replacement_insns = [
                        lp.Assignment(
                            id=insn_id_gen(insn.id),
                            assignee=assignee,
                            expression=new_expr,
                            **kwargs)
                        for assignee, new_expr in zip(
                            insn.assignees, new_expressions)]

            else:
                new_expr, = new_expressions
                replacement_insns = [
                        make_assignment(
                            id=insn_id_gen(insn.id),
                            assignees=insn.assignees,
                            expression=new_expr,
                            **kwargs)
                        ]

            insn_id_replacements[insn.id] = [
                    rinsn.id for rinsn in replacement_insns]

            insn_queue = generated_insns + replacement_insns + insn_queue

            # The reduction expander needs an up-to-date kernel
            # object to find dependencies. Keep temp_kernel up-to-date.

            temp_kernel = kernel.copy(
                    instructions=new_insns + insn_queue,
                    temporary_variables=new_temporary_variables,
                    domains=domains)
            temp_kernel = lp.replace_instruction_ids(
                    temp_kernel, insn_id_replacements)

        else:
            # nothing happened, we're done with insn
            assert not new_insn_add_depends_on

            new_insns.append(insn)

    kernel = kernel.copy(
            instructions=new_insns,
            temporary_variables=new_temporary_variables,
            domains=domains)

    kernel = lp.replace_instruction_ids(kernel, insn_id_replacements)

    kernel = lp.tag_inames(kernel, new_iname_tags)

    return kernel
Exemple #7
0
def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
    """
    Multi assignment function calls are currently lowered into OpenCL so that
    the function call::

       a, b = segmented_sum(x, y, z, w)

    becomes::

       a = segmented_sum_mangled(x, y, z, w, &b).

    For OpenCL, the scope of "b" is significant, and the preamble generation
    currently assumes the scope is always private. This function forces that to
    be the case by introducing temporary assignments into the kernel.
    """

    insn_id_gen = kernel.get_instruction_id_generator()
    var_name_gen = kernel.get_var_name_generator()

    new_or_updated_instructions = {}
    new_temporaries = {}

    dep_map = dict((insn.id, insn.depends_on) for insn in kernel.instructions)

    inverse_dep_map = dict((insn.id, set()) for insn in kernel.instructions)

    import six
    for insn_id, deps in six.iteritems(dep_map):
        for dep in deps:
            inverse_dep_map[dep].add(insn_id)

    del dep_map

    # {{{ utils

    def _add_to_no_sync_with(insn_id, new_no_sync_with_params):
        insn = kernel.id_to_insn.get(insn_id)
        insn = new_or_updated_instructions.get(insn_id, insn)
        new_or_updated_instructions[insn_id] = (insn.copy(
            no_sync_with=(insn.no_sync_with
                          | frozenset(new_no_sync_with_params))))

    def _add_to_depends_on(insn_id, new_depends_on_params):
        insn = kernel.id_to_insn.get(insn_id)
        insn = new_or_updated_instructions.get(insn_id, insn)
        new_or_updated_instructions[insn_id] = (insn.copy(
            depends_on=insn.depends_on | frozenset(new_depends_on_params)))

    # }}}

    from loopy.kernel.instruction import CallInstruction
    for insn in kernel.instructions:
        if not isinstance(insn, CallInstruction):
            continue

        if len(insn.assignees) <= 1:
            continue

        assignees = insn.assignees
        assignee_var_names = insn.assignee_var_names()

        new_assignees = [assignees[0]]
        newly_added_assignments_ids = set()
        needs_replacement = False

        last_added_insn_id = insn.id

        from loopy.kernel.data import temp_var_scope, TemporaryVariable

        FIRST_POINTER_ASSIGNEE_IDX = 1  # noqa

        for assignee_nr, assignee_var_name, assignee in zip(
                range(FIRST_POINTER_ASSIGNEE_IDX, len(assignees)),
                assignee_var_names[FIRST_POINTER_ASSIGNEE_IDX:],
                assignees[FIRST_POINTER_ASSIGNEE_IDX:]):

            if (assignee_var_name in kernel.temporary_variables
                    and (kernel.temporary_variables[assignee_var_name].scope
                         == temp_var_scope.PRIVATE)):
                new_assignees.append(assignee)
                continue

            needs_replacement = True

            # {{{ generate a new assignent instruction

            new_assignee_name = var_name_gen(
                "{insn_id}_retval_{assignee_nr}".format(
                    insn_id=insn.id, assignee_nr=assignee_nr))

            new_assignment_id = insn_id_gen(
                "{insn_id}_assign_retval_{assignee_nr}".format(
                    insn_id=insn.id, assignee_nr=assignee_nr))

            newly_added_assignments_ids.add(new_assignment_id)

            import loopy as lp
            new_temporaries[new_assignee_name] = (TemporaryVariable(
                name=new_assignee_name,
                dtype=lp.auto,
                scope=temp_var_scope.PRIVATE))

            from pymbolic import var
            new_assignee = var(new_assignee_name)
            new_assignees.append(new_assignee)

            new_or_updated_instructions[new_assignment_id] = (make_assignment(
                assignees=(assignee, ),
                expression=new_assignee,
                id=new_assignment_id,
                depends_on=frozenset([last_added_insn_id]),
                depends_on_is_final=True,
                no_sync_with=(insn.no_sync_with
                              | frozenset([(insn.id, "any")])),
                predicates=insn.predicates,
                within_inames=insn.within_inames))

            last_added_insn_id = new_assignment_id

            # }}}

        if not needs_replacement:
            continue

        # {{{ update originating instruction

        orig_insn = new_or_updated_instructions.get(insn.id, insn)

        new_or_updated_instructions[insn.id] = (orig_insn.copy(
            assignees=tuple(new_assignees)))

        _add_to_no_sync_with(insn.id, [(id, "any")
                                       for id in newly_added_assignments_ids])

        # }}}

        # {{{ squash spurious memory dependencies amongst new assignments

        for new_insn_id in newly_added_assignments_ids:
            _add_to_no_sync_with(
                new_insn_id,
                [(id, "any")
                 for id in newly_added_assignments_ids if id != new_insn_id])

        # }}}

        # {{{ update instructions that depend on the originating instruction

        for inverse_dep in inverse_dep_map[insn.id]:
            _add_to_depends_on(inverse_dep, newly_added_assignments_ids)

            for insn_id, scope in (
                    new_or_updated_instructions[inverse_dep].no_sync_with):
                if insn_id == insn.id:
                    _add_to_no_sync_with(
                        inverse_dep,
                        [(id, scope) for id in newly_added_assignments_ids])

        # }}}

    new_temporary_variables = kernel.temporary_variables.copy()
    new_temporary_variables.update(new_temporaries)

    new_instructions = (list(new_or_updated_instructions.values()) +
                        list(insn for insn in kernel.instructions
                             if insn.id not in new_or_updated_instructions))

    return kernel.copy(temporary_variables=new_temporary_variables,
                       instructions=new_instructions)