Esempio n. 1
0
def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context):
    if context is not None:
        context = isl.align_spaces(context, pw_aff.get_domain_space(),
                obj_bigger_ok=True).params()
        pw_aff = pw_aff.gist(context)

    pieces = pw_aff.get_pieces()
    if len(pieces) == 1:
        (_, result), = pieces
        if constants_only and not result.is_cst():
            raise ValueError("a numeric %s was not found for PwAff '%s'"
                    % (what, pw_aff))
        return result

    from pytools import memoize, flatten

    @memoize
    def is_bounded(set):
        assert set.dim(dim_type.set) == 0
        return (set
                .move_dims(dim_type.set, 0,
                    dim_type.param, 0, set.dim(dim_type.param))
                .is_bounded())

    # put constant bounds with unbounded validity first
    order = [
            (True, False),  # constant, unbounded validity
            (False, False),  # nonconstant, unbounded validity
            (True, True),  # constant, bounded validity
            (False, True),  # nonconstant, bounded validity
            ]

    pieces = flatten([
            [(set, aff) for set, aff in pieces
                if aff.is_cst() == want_is_constant
                and is_bounded(set) == want_is_bounded]
            for want_is_constant, want_is_bounded in order])

    reference = pw_aff.get_aggregate_domain()
    if context is not None:
        reference = reference.intersect(context)

    # {{{ find bounds that are also global bounds

    for set, candidate_aff in pieces:
        # gist can be time-consuming, try without first
        for use_gist in [False, True]:
            if use_gist:
                candidate_aff = candidate_aff.gist(set)

            if constants_only and not candidate_aff.is_cst():
                continue

            if reference <= set_method(pw_aff, candidate_aff):
                return candidate_aff

    # }}}

    raise StaticValueFindingError("a static %s was not found for PwAff '%s'"
            % (what, pw_aff))
Esempio n. 2
0
def static_extremum_of_pw_aff(pw_aff, constants_only, set_method, what, context):
    if context is not None:
        context = isl.align_spaces(context, pw_aff.get_domain_space(),
                obj_bigger_ok=True).params()
        pw_aff = pw_aff.gist(context)

    pieces = pw_aff.get_pieces()
    if len(pieces) == 1:
        (_, result), = pieces
        if constants_only and not result.is_cst():
            raise ValueError("a numeric %s was not found for PwAff '%s'"
                    % (what, pw_aff))
        return result

    from pytools import memoize, flatten

    @memoize
    def is_bounded(set):
        assert set.dim(dim_type.set) == 0
        return (set
                .move_dims(dim_type.set, 0,
                    dim_type.param, 0, set.dim(dim_type.param))
                .is_bounded())

    # put constant bounds with unbounded validity first
    order = [
            (True, False),  # constant, unbounded validity
            (False, False),  # nonconstant, unbounded validity
            (True, True),  # constant, bounded validity
            (False, True),  # nonconstant, bounded validity
            ]

    pieces = flatten([
            [(set, aff) for set, aff in pieces
                if aff.is_cst() == want_is_constant
                and is_bounded(set) == want_is_bounded]
            for want_is_constant, want_is_bounded in order])

    reference = pw_aff.get_aggregate_domain()
    if context is not None:
        reference = reference.intersect(context)

    # {{{ find bounds that are also global bounds

    for set, candidate_aff in pieces:
        # gist can be time-consuming, try without first
        for use_gist in [False, True]:
            if use_gist:
                candidate_aff = candidate_aff.gist(set)

            if constants_only and not candidate_aff.is_cst():
                continue

            if reference <= set_method(pw_aff, candidate_aff):
                return candidate_aff

    # }}}

    raise StaticValueFindingError("a static %s was not found for PwAff '%s'"
            % (what, pw_aff))
Esempio n. 3
0
def test_align_spaces():
    m1 = isl.BasicMap("[m,n] -> {[i,j,k]->[l,o]:}")
    m2 = isl.BasicMap("[m,n] -> {[j,k,l,i]->[o]:}")

    result = isl.align_spaces(m1, m2)
    assert result.get_var_dict() == m2.get_var_dict()

    a1 = isl.Aff("[t0, t1, t2] -> { [(32)] }")
    a2 = isl.Aff("[t1, t0] -> { [(0)] }")

    with pytest.raises(isl.Error):
        a1_aligned = isl.align_spaces(a1, a2)

    a1_aligned = isl.align_spaces(a1, a2, obj_bigger_ok=True)
    a2_aligned = isl.align_spaces(a2, a1)

    assert a1_aligned == isl.Aff("[t1, t0, t2] -> { [(32)] }")
    assert a2_aligned == isl.Aff("[t1, t0, t2] -> { [(0)] }")
Esempio n. 4
0
        def __call__(self, check_inames):
            if not check_inames:
                return []

            domain = isl.align_spaces(
                    self.kernel.get_inames_domain(check_inames),
                    self.impl_domain, obj_bigger_ok=True)
            from loopy.codegen.bounds import get_approximate_convex_bounds_checks
            # Each instruction individually gets its bounds checks,
            # so we can safely overapproximate here.
            return get_approximate_convex_bounds_checks(domain,
                    check_inames, self.impl_domain)
Esempio n. 5
0
def _ensure_dim_names_match_and_align(obj_map, tgt_map):
    # (This function is also defined in independent, unmerged branch
    # new-dependency-and-nest-constraint-semantics-development, and used in
    # child branches thereof. Once these branches are all merged, it may make
    # sense to move this function to a location for more general-purpose
    # machinery. In the other branches, this function's name excludes the
    # leading underscore.)
    from islpy import align_spaces
    from islpy import dim_type as dt

    # first make sure names match
    if not all(
            set(obj_map.get_var_names(dt)) == set(tgt_map.get_var_names(dt))
            for dt in [dt.in_, dt.out, dt.param]):
        raise ValueError("Cannot align spaces; names don't match:\n%s\n%s" %
                         (obj_map, tgt_map))

    return align_spaces(obj_map, tgt_map)
Esempio n. 6
0
def iname_rel_aff(space, iname, rel, aff):
    """*aff*'s domain space is allowed to not match *space*."""

    dt, pos = space.get_var_dict()[iname]
    assert dt in [isl.dim_type.set, isl.dim_type.param]
    if dt == isl.dim_type.set:
        dt = isl.dim_type.in_

    from islpy import align_spaces
    aff = align_spaces(aff, isl.Aff.zero_on_domain(space))

    if rel in ["==", "<="]:
        return aff.add_coefficient_val(dt, pos, -1)
    elif rel == ">=":
        return aff.neg().add_coefficient_val(dt, pos, 1)
    elif rel == "<":
        return (aff - 1).add_coefficient_val(dt, pos, -1)
    elif rel == ">":
        return (aff + 1).neg().add_coefficient_val(dt, pos, 1)
    else:
        raise ValueError("unknown value of 'rel': %s" % rel)
Esempio n. 7
0
def iname_rel_aff(space, iname, rel, aff):
    """*aff*'s domain space is allowed to not match *space*."""

    dt, pos = space.get_var_dict()[iname]
    assert dt in [isl.dim_type.set, isl.dim_type.param]
    if dt == isl.dim_type.set:
        dt = isl.dim_type.in_

    from islpy import align_spaces
    aff = align_spaces(aff, isl.Aff.zero_on_domain(space))

    if rel in ["==", "<="]:
        return aff.add_coefficient_val(dt, pos, -1)
    elif rel == ">=":
        return aff.neg().add_coefficient_val(dt, pos, 1)
    elif rel == "<":
        return (aff-1).add_coefficient_val(dt, pos, -1)
    elif rel == ">":
        return (aff+1).neg().add_coefficient_val(dt, pos, 1)
    else:
        raise ValueError("unknown value of 'rel': %s" % rel)
Esempio n. 8
0
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
            dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)):
            if das_iname in usable_inames:
                moved_inames.append(das_iname)
                dt, idx = dom_and_slab.get_var_dict()[das_iname]
                dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param), dt, idx,
                    1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        impl_domain = isl.align_spaces(codegen_state.implemented_domain,
                                       dom_and_slab,
                                       obj_bigger_ok=True).params()

        lbound = (kernel.cache_manager.dim_min(
            dom_and_slab, loop_iname_idx).gist(
                kernel.assumptions).gist(impl_domain).coalesce())
        ubound = (kernel.cache_manager.dim_max(
            dom_and_slab, loop_iname_idx).gist(
                kernel.assumptions).gist(impl_domain).coalesce())

        # }}}

        # {{{ find implemented loop, build inner code

        from loopy.symbolic import pw_aff_to_pw_aff_implemented_by_expr
        impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound)
        impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound)

        # impl_loop may be overapproximated
        from loopy.isl_helpers import make_loop_bounds_from_pwaffs
        impl_loop = make_loop_bounds_from_pwaffs(dom_and_slab.space,
                                                 loop_iname, impl_lbound,
                                                 impl_ubound)

        for moved_iname in moved_inames:
            # move moved_iname to 'set' dim_type in impl_loop
            dt, idx = impl_loop.get_var_dict()[moved_iname]
            impl_loop = impl_loop.move_dims(dim_type.set,
                                            impl_loop.dim(dim_type.set), dt,
                                            idx, 1)

        new_codegen_state = (codegen_state.intersect(impl_loop).copy(
            kernel=intersect_kernel_with_slab(kernel, slab, loop_iname)))

        inner = build_loop_nest(new_codegen_state, sched_index + 1)

        # }}}

        if cmt is not None:
            result.append(codegen_state.ast_builder.emit_comment(cmt))

        astb = codegen_state.ast_builder

        from loopy.symbolic import pw_aff_to_expr

        if impl_ubound.is_equal(impl_lbound):
            # single-trip, generate just a variable assignment, not a loop
            inner = merge_codegen_results(codegen_state, [
                astb.emit_initializer(codegen_state,
                                      kernel.index_dtype,
                                      loop_iname,
                                      ecm(pw_aff_to_expr(lbound), PREC_NONE,
                                          "i"),
                                      is_const=True),
                astb.emit_blank_line(),
                inner,
            ])
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.ast_block_scope_class(
                        inner.current_ast(codegen_state))))

        else:
            inner_ast = inner.current_ast(codegen_state)

            from loopy.isl_helpers import simplify_pw_aff

            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(
                        codegen_state, loop_iname, kernel.index_dtype,
                        pw_aff_to_expr(
                            simplify_pw_aff(lbound, kernel.assumptions)),
                        pw_aff_to_expr(
                            simplify_pw_aff(ubound, kernel.assumptions)),
                        inner_ast)))

    return merge_codegen_results(codegen_state, result)
Esempio n. 9
0
def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(
            kernel, loop_iname, sched_index, codegen_state)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, across_dim_types=True,
                obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
                dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for iname in dom_and_slab.get_var_names(dim_type.set):
            if iname in usable_inames:
                moved_inames.append(iname)
                dt, idx = dom_and_slab.get_var_dict()[iname]
                dom_and_slab = dom_and_slab.move_dims(
                        dim_type.param, dom_and_slab.dim(dim_type.param),
                        dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        from loopy.isl_helpers import (
                static_min_of_pw_aff,
                static_max_of_pw_aff)

        lbound = (
                kernel.cache_manager.dim_min(
                    dom_and_slab, loop_iname_idx)
                .gist(kernel.assumptions)
                .coalesce())
        ubound = (
            kernel.cache_manager.dim_max(
                dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .coalesce())

        static_lbound = static_min_of_pw_aff(
                lbound,
                constants_only=False)
        static_ubound = static_max_of_pw_aff(
                ubound,
                constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        from loopy.isl_helpers import make_slab_from_bound_pwaffs

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(
                dom_and_slab.space,
                loop_iname, static_lbound, static_ubound)

        for iname in moved_inames:
            dt, idx = impl_slab.get_var_dict()[iname]
            impl_slab = impl_slab.move_dims(
                    dim_type.set, impl_slab.dim(dim_type.set),
                    dt, idx, 1)

        new_codegen_state = codegen_state.intersect(impl_slab)

        inner = build_loop_nest(
                intersect_kernel_with_slab(
                    kernel, slab, iname),
                sched_index+1, new_codegen_state)

        # }}}

        if cmt is not None:
            from cgen import Comment
            result.append(Comment(cmt))

        from cgen import Initializer, POD, Const, Line
        from loopy.symbolic import aff_to_expr

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(gen_code_block([
                Initializer(Const(POD(kernel.index_dtype, loop_iname)),
                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
                Line(),
                inner,
                ]))

        else:
            result.append(
                kernel.target.emit_sequential_loop(
                       codegen_state, loop_iname, kernel.index_dtype,
                       static_lbound, static_ubound, inner))

    return gen_code_block(result)
Esempio n. 10
0
def test_align_spaces():
    m1 = isl.BasicMap("[m,n] -> {[i,j,k]->[l,o]:}")
    m2 = isl.BasicMap("[m,n] -> {[j,k,l,i]->[o]:}")

    result = isl.align_spaces(m1, m2)
    assert result.get_var_dict() == m2.get_var_dict()
Esempio n. 11
0
def chunk_iname(kernel,
                split_iname,
                num_chunks,
                outer_iname=None,
                inner_iname=None,
                outer_tag=None,
                inner_tag=None,
                slabs=(0, 0),
                do_tagged_check=True,
                within=None):
    """
    Split *split_iname* into two inames (an 'inner' one and an 'outer' one)
    so that ``split_iname == inner + outer*chunk_length`` and *outer* is of
    fixed length *num_chunks*.

    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.

    .. versionadded:: 2016.2
    """

    size = kernel.get_iname_bounds(split_iname).size
    k0 = isl.Aff.zero_on_domain(size.domain().space)
    chunk_ceil = size.div(k0 + num_chunks).ceil()
    chunk_floor = size.div(k0 + num_chunks).floor()
    chunk_diff = chunk_ceil - chunk_floor
    chunk_mod = size.mod_val(num_chunks)

    from loopy.symbolic import pw_aff_to_expr
    from pymbolic.primitives import Min

    def make_new_loop_index(inner, outer):
        # These two expressions are equivalent. Benchmarking between the
        # two was inconclusive, although one is shorter.

        if 0:
            # Triggers isl issues in check pass.
            return (inner + pw_aff_to_expr(chunk_floor) * outer +
                    pw_aff_to_expr(chunk_diff) * Min(
                        (outer, pw_aff_to_expr(chunk_mod))))
        else:
            return (inner + pw_aff_to_expr(chunk_ceil) * Min(
                (outer, pw_aff_to_expr(chunk_mod))) +
                    pw_aff_to_expr(chunk_floor) * (outer - Min(
                        (outer, pw_aff_to_expr(chunk_mod)))))

    # {{{ check that iname is a box iname

    # Since the linearization used in the constraint used to map the domain
    # does not match the linearization in make_new_loop_index, we can't really
    # tolerate if the iname in question has constraints that make it non-boxy,
    # since these sub-indices would end up in the wrong spot.

    for dom in kernel.domains:
        var_dict = dom.get_var_dict()
        if split_iname not in var_dict:
            continue

        dt, idx = var_dict[split_iname]
        assert dt == dim_type.set

        aff_zero = isl.Aff.zero_on_domain(dom.space)
        aff_split_iname = aff_zero.set_coefficient_val(dim_type.in_, idx, 1)
        aligned_size = isl.align_spaces(size, aff_zero)
        box_dom = (dom.eliminate(dt, idx, 1)
                   & aff_zero.le_set(aff_split_iname)
                   & aff_split_iname.lt_set(aligned_size))

        if not (box_dom <= dom and dom <= box_dom):
            raise LoopyError("domain '%s' is not box-shape about iname "
                             "'%s', cannot use chunk_iname()" %
                             (dom, split_iname))

    # }}}

    return _split_iname_backend(kernel,
                                split_iname,
                                fixed_length=num_chunks,
                                fixed_length_is_inner=False,
                                make_new_loop_index=make_new_loop_index,
                                outer_iname=outer_iname,
                                inner_iname=inner_iname,
                                outer_tag=outer_tag,
                                inner_tag=inner_tag,
                                slabs=slabs,
                                do_tagged_check=do_tagged_check,
                                within=within)
Esempio n. 12
0
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain,
                                          slab,
                                          across_dim_types=True,
                                          obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
            dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for iname in dom_and_slab.get_var_names(dim_type.set):
            if iname in usable_inames:
                moved_inames.append(iname)
                dt, idx = dom_and_slab.get_var_dict()[iname]
                dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param), dt, idx,
                    1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        from loopy.isl_helpers import (static_min_of_pw_aff,
                                       static_max_of_pw_aff)

        lbound = (kernel.cache_manager.dim_min(
            dom_and_slab, loop_iname_idx).gist(kernel.assumptions).coalesce())
        ubound = (kernel.cache_manager.dim_max(
            dom_and_slab, loop_iname_idx).gist(kernel.assumptions).coalesce())

        static_lbound = static_min_of_pw_aff(lbound, constants_only=False)
        static_ubound = static_max_of_pw_aff(ubound, constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        from loopy.isl_helpers import make_slab_from_bound_pwaffs

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(dom_and_slab.space, loop_iname,
                                                static_lbound, static_ubound)

        for iname in moved_inames:
            dt, idx = impl_slab.get_var_dict()[iname]
            impl_slab = impl_slab.move_dims(dim_type.set,
                                            impl_slab.dim(dim_type.set), dt,
                                            idx, 1)

        new_codegen_state = (codegen_state.intersect(impl_slab).copy(
            kernel=intersect_kernel_with_slab(kernel, slab, iname)))

        inner = build_loop_nest(new_codegen_state, sched_index + 1)

        # }}}

        if cmt is not None:
            result.append(codegen_state.ast_builder.emit_comment(cmt))

        from loopy.symbolic import aff_to_expr

        astb = codegen_state.ast_builder

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(
                merge_codegen_results(codegen_state, [
                    astb.emit_initializer(codegen_state,
                                          kernel.index_dtype,
                                          loop_iname,
                                          ecm(aff_to_expr(static_lbound),
                                              PREC_NONE, "i"),
                                          is_const=True),
                    astb.emit_blank_line(),
                    inner,
                ]))

        else:
            inner_ast = inner.current_ast(codegen_state)
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(codegen_state, loop_iname,
                                              kernel.index_dtype,
                                              static_lbound, static_ubound,
                                              inner_ast)))

    return merge_codegen_results(codegen_state, result)
Esempio n. 13
0
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, across_dim_types=True,
                obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
                dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)):
            if das_iname in usable_inames:
                moved_inames.append(das_iname)
                dt, idx = dom_and_slab.get_var_dict()[das_iname]
                dom_and_slab = dom_and_slab.move_dims(
                        dim_type.param, dom_and_slab.dim(dim_type.param),
                        dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        impl_domain = isl.align_spaces(
            codegen_state.implemented_domain,
            dom_and_slab,
            obj_bigger_ok=True,
            across_dim_types=True
            ).params()

        lbound = (
                kernel.cache_manager.dim_min(
                    dom_and_slab, loop_iname_idx)
                .gist(kernel.assumptions)
                .gist(impl_domain)
                .coalesce())
        ubound = (
            kernel.cache_manager.dim_max(
                dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .gist(impl_domain)
            .coalesce())

        # }}}

        # {{{ find implemented loop, build inner code

        from loopy.symbolic import pw_aff_to_pw_aff_implemented_by_expr
        impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound)
        impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound)

        # impl_loop may be overapproximated
        from loopy.isl_helpers import make_loop_bounds_from_pwaffs
        impl_loop = make_loop_bounds_from_pwaffs(
                dom_and_slab.space,
                loop_iname,
                impl_lbound,
                impl_ubound)

        for moved_iname in moved_inames:
            # move moved_iname to 'set' dim_type in impl_loop
            dt, idx = impl_loop.get_var_dict()[moved_iname]
            impl_loop = impl_loop.move_dims(
                    dim_type.set, impl_loop.dim(dim_type.set),
                    dt, idx, 1)

        new_codegen_state = (
                codegen_state
                .intersect(impl_loop)
                .copy(kernel=intersect_kernel_with_slab(
                    kernel, slab, loop_iname)))

        inner = build_loop_nest(new_codegen_state, sched_index+1)

        # }}}

        if cmt is not None:
            result.append(codegen_state.ast_builder.emit_comment(cmt))

        astb = codegen_state.ast_builder

        from loopy.symbolic import pw_aff_to_expr

        if impl_ubound.is_equal(impl_lbound):
            # single-trip, generate just a variable assignment, not a loop
            inner = merge_codegen_results(codegen_state, [
                astb.emit_initializer(
                    codegen_state,
                    kernel.index_dtype, loop_iname,
                    ecm(pw_aff_to_expr(lbound), PREC_NONE, "i"),
                    is_const=True),
                astb.emit_blank_line(),
                inner,
                ])
            result.append(
                    inner.with_new_ast(
                        codegen_state,
                        astb.ast_block_scope_class(
                            inner.current_ast(codegen_state))))

        else:
            inner_ast = inner.current_ast(codegen_state)

            from loopy.isl_helpers import simplify_pw_aff

            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(
                        codegen_state, loop_iname, kernel.index_dtype,
                        pw_aff_to_expr(simplify_pw_aff(lbound, kernel.assumptions)),
                        pw_aff_to_expr(simplify_pw_aff(ubound, kernel.assumptions)),
                        inner_ast)))

    return merge_codegen_results(codegen_state, result)