Esempio n. 1
0
def generate_unroll_loop(codegen_state, sched_index):
    """Generate code for the loop at *sched_index* by unrolling it.

    The loop's iname is taken from ``kernel.schedule[sched_index]``. The
    nested code (``build_loop_nest`` at ``sched_index + 1``) is generated
    once per iteration, each time with the iname fixed to
    ``lower_bound + i`` via ``codegen_state.fix``.

    :arg codegen_state: code generation state; its ``kernel`` attribute
        supplies the schedule and the iname bounds.
    :arg sched_index: index into ``kernel.schedule`` of the loop entry.
    :returns: the per-iteration results combined by
        ``merge_codegen_results``.
    :raises LoopyError: if the length of the loop is not a constant.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (static_max_of_pw_aff,
                                   static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname must be interpolated, otherwise the user sees a
        # literal '%s' in the error message.
        raise LoopyError("length of unrolled loop '%s' is not a constant, "
                         "cannot unroll" % iname)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
            bounds.lower_bound_pw_aff.coalesce(), constants_only=False)
    except Exception as e:
        # Re-raise with the same exception type, naming the iname and
        # keeping the original message so the cause is not lost.
        raise type(e)("while finding lower bound of '%s': %s" % (iname, e))

    # One copy of the inner loop nest per iteration, with the iname pinned
    # to its value for that iteration.
    result = [
        build_loop_nest(
            codegen_state.fix(iname, lower_bound_aff + i), sched_index + 1)
        for i in range(length)]

    return merge_codegen_results(codegen_state, result)
Esempio n. 2
0
def generate_unroll_loop(kernel, sched_index, codegen_state):
    """Generate code for the loop at *sched_index* by unrolling it.

    The loop's iname is taken from ``kernel.schedule[sched_index]``. The
    nested code (``build_loop_nest`` at ``sched_index+1``) is generated
    once per iteration, each time with the iname fixed to
    ``lower_bound + i`` via ``codegen_state.fix``.

    :arg kernel: the kernel supplying the schedule and the iname bounds.
    :arg sched_index: index into ``kernel.schedule`` of the loop entry.
    :arg codegen_state: code generation state in effect outside the loop.
    :returns: the per-iteration code combined by ``gen_code_block``.
    :raises LoopyError: if the length of the loop is not a constant.
    """
    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname must be interpolated, otherwise the user sees a
        # literal '%s' in the error message.
        raise LoopyError(
                "length of unrolled loop '%s' is not a constant, "
                "cannot unroll" % iname)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same exception type, naming the iname and
        # keeping the original message so the cause is not lost.
        raise type(e)("while finding lower bound of '%s': %s" % (iname, e))

    # One copy of the inner loop nest per iteration, with the iname pinned
    # to its value for that iteration.
    result = [
            build_loop_nest(
                kernel, sched_index+1,
                codegen_state.fix(iname, lower_bound_aff + i))
            for i in range(length)]

    return gen_code_block(result)
Esempio n. 3
0
File: loop.py Progetto: tj-sun/loopy
def generate_vectorize_loop(codegen_state, sched_index):
    """Generate code for the loop at *sched_index* as a vectorized loop.

    Vectorization is only attempted when the loop length is a constant and
    the lower bound is zero; otherwise a warning is issued and the loop is
    handed to ``generate_unroll_loop`` instead.

    :arg codegen_state: code generation state; its ``kernel`` attribute
        supplies the schedule, bounds and domains.
    :arg sched_index: index into ``kernel.schedule`` of the loop entry.
    :returns: the result of ``build_loop_nest`` for ``sched_index+1`` under
        a code generation state carrying a ``VectorizationInfo``, or the
        result of ``generate_unroll_loop`` on fallback.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # Interpolate the iname; the message has a '%s' placeholder.
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead" % iname)
        # Argument order follows this function's own
        # (codegen_state, sched_index) convention.
        return generate_unroll_loop(codegen_state, sched_index)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same exception type, naming the iname and
        # keeping the original message so the cause is not lost.
        raise type(e)("while finding lower bound of '%s': %s" % (iname, e))

    if not lower_bound_aff.plain_is_zero():
        # Interpolate the iname; the message has a '%s' placeholder.
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
Esempio n. 4
0
def generate_vectorize_loop(codegen_state, sched_index):
    """Generate code for the loop at *sched_index* as a vectorized loop.

    Vectorization is only attempted when the loop length is a constant and
    the lower bound is zero; otherwise a warning is issued and the loop is
    handed to ``generate_unroll_loop`` instead.

    :arg codegen_state: code generation state; its ``kernel`` attribute
        supplies the schedule, bounds and domains.
    :arg sched_index: index into ``kernel.schedule`` of the loop entry.
    :returns: the result of ``build_loop_nest`` for ``sched_index+1`` under
        a code generation state carrying a ``VectorizationInfo``, or the
        result of ``generate_unroll_loop`` on fallback.
    """
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # Interpolate the iname; the message has a '%s' placeholder.
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise with the same exception type, naming the iname and
        # keeping the original message so the cause is not lost.
        raise type(e)("while finding lower bound of '%s': %s" % (iname, e))

    if not lower_bound_aff.plain_is_zero():
        # Interpolate the iname; the message has a '%s' placeholder.
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead" % iname)
        return generate_unroll_loop(codegen_state, sched_index)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
Esempio n. 5
0
def generate_host_or_device_program(codegen_state, schedule_index):
    """Generate the program starting at *schedule_index*.

    When generating device code, the linearization entry at
    *schedule_index* must be a ``CallKernel`` and code generation goes
    through ``set_up_hw_parallel_loops``; otherwise the loop nest is built
    directly. For device code or entrypoints, the result is additionally
    wrapped into a function definition obtained from the AST builder.

    :arg codegen_state: code generation state providing ``ast_builder``,
        ``kernel``, ``is_generating_device_code`` and ``is_entrypoint``.
    :arg schedule_index: index into ``kernel.linearization``.
    :returns: the resulting codegen result.
    """
    from functools import partial

    from loopy.codegen.control import build_loop_nest

    astb = codegen_state.ast_builder
    temporaries = astb.get_temporary_decls(codegen_state, schedule_index)

    if not codegen_state.is_generating_device_code:
        codegen_result = build_loop_nest(codegen_state, schedule_index)
    else:
        from loopy.schedule import CallKernel
        assert isinstance(codegen_state.kernel.linearization[schedule_index],
                          CallKernel)

        from loopy.codegen.loop import set_up_hw_parallel_loops

        # Code generation continues just past the CallKernel entry.
        continue_after_call = partial(
                build_loop_nest, schedule_index=schedule_index + 1)
        codegen_result = set_up_hw_parallel_loops(
                codegen_state, schedule_index,
                next_func=continue_after_call)

    needs_function_wrapper = (
            codegen_state.is_generating_device_code
            or codegen_state.is_entrypoint)
    if not needs_function_wrapper:
        return codegen_result

    body_parts = (
            astb.generate_top_of_body(codegen_state)
            + temporaries
            + [codegen_result])
    codegen_result = merge_codegen_results(
            codegen_state, body_parts, collapse=False)

    cur_prog = codegen_result.current_program(codegen_state)
    body_ast = cur_prog.ast

    fdecl_ast = astb.get_function_declaration(
            codegen_state, codegen_result, schedule_index)
    fdef_ast = astb.get_function_definition(
            codegen_state, codegen_result,
            schedule_index, fdecl_ast, body_ast)

    processed_fdef = astb.process_ast(fdef_ast)
    processed_body = astb.process_ast(body_ast)

    return codegen_result.with_new_program(
            codegen_state,
            cur_prog.copy(ast=processed_fdef, body_ast=processed_body))
Esempio n. 6
0
def generate_host_or_device_program(codegen_state, schedule_index):
    """Generate the program starting at *schedule_index*.

    When generating device code, the schedule entry at *schedule_index*
    must be a ``CallKernel`` and code generation goes through
    ``set_up_hw_parallel_loops``; otherwise the loop nest is built
    directly. The result is then wrapped into a function definition
    obtained from the AST builder.

    :arg codegen_state: code generation state providing ``ast_builder``,
        ``kernel`` and ``is_generating_device_code``.
    :arg schedule_index: index into ``kernel.schedule``.
    :returns: the resulting codegen result.
    """
    from functools import partial

    from loopy.codegen.control import build_loop_nest

    astb = codegen_state.ast_builder
    temporaries = astb.get_temporary_decls(codegen_state, schedule_index)

    if not codegen_state.is_generating_device_code:
        codegen_result = build_loop_nest(codegen_state, schedule_index)
    else:
        from loopy.schedule import CallKernel
        assert isinstance(codegen_state.kernel.schedule[schedule_index], CallKernel)

        from loopy.codegen.loop import set_up_hw_parallel_loops

        # Code generation continues just past the CallKernel entry.
        continue_after_call = partial(
                build_loop_nest, schedule_index=schedule_index + 1)
        codegen_result = set_up_hw_parallel_loops(
                codegen_state, schedule_index,
                next_func=continue_after_call)

    body_parts = (
            astb.generate_top_of_body(codegen_state)
            + temporaries
            + [codegen_result])
    codegen_result = merge_codegen_results(
            codegen_state, body_parts, collapse=False)

    cur_prog = codegen_result.current_program(codegen_state)
    body_ast = cur_prog.ast

    fdecl_ast = astb.get_function_declaration(
            codegen_state, codegen_result, schedule_index)
    fdef_ast = astb.get_function_definition(
            codegen_state, codegen_result,
            schedule_index, fdecl_ast, body_ast)

    return codegen_result.with_new_program(
            codegen_state,
            cur_prog.copy(ast=fdef_ast, body_ast=body_ast))
Esempio n. 7
0
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    """Generate code for the sequential loop at *sched_index*.

    For every slab returned by ``get_slab_decomposition``, the lower and
    upper bound of the loop iname are determined, the inner loop nest is
    built under a code generation state restricted to those bounds, and
    the result is emitted either as a constant initializer in a block
    scope (when both implemented bounds are equal) or as a sequential
    loop.

    :arg codegen_state: code generation state; its ``kernel`` attribute
        supplies the schedule, domains and assumptions.
    :arg sched_index: index into ``kernel.schedule`` of the loop entry.
    :returns: the per-slab results combined by ``merge_codegen_results``.
    """
    from loopy.codegen.bounds import get_usable_inames_for_conditional
    from loopy.isl_helpers import (make_loop_bounds_from_pwaffs,
                                   simplify_pw_aff)
    from loopy.symbolic import (pw_aff_to_expr,
                                pw_aff_to_pw_aff_implemented_by_expr)

    kernel = codegen_state.kernel
    astb = codegen_state.ast_builder
    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    # A lone slab gets no identifying comment.
    label_slabs = len(slabs) != 1

    result = []

    for slab_name, slab in slabs:
        # {{{ find bounds

        dom_and_slab = (
            isl.align_spaces(domain, slab, obj_bigger_ok=True) & slab)

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
            dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)):
            if das_iname not in usable_inames:
                continue

            moved_inames.append(das_iname)
            dt, idx = dom_and_slab.get_var_dict()[das_iname]
            n_params = dom_and_slab.dim(dim_type.param)
            dom_and_slab = dom_and_slab.move_dims(
                dim_type.param, n_params, dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        impl_domain = isl.align_spaces(
            codegen_state.implemented_domain, dom_and_slab,
            obj_bigger_ok=True).params()

        lbound = kernel.cache_manager.dim_min(dom_and_slab, loop_iname_idx)
        lbound = lbound.gist(kernel.assumptions).gist(impl_domain).coalesce()

        ubound = kernel.cache_manager.dim_max(dom_and_slab, loop_iname_idx)
        ubound = ubound.gist(kernel.assumptions).gist(impl_domain).coalesce()

        # }}}

        # {{{ find implemented loop, build inner code

        impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound)
        impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound)

        # impl_loop may be overapproximated
        impl_loop = make_loop_bounds_from_pwaffs(
            dom_and_slab.space, loop_iname, impl_lbound, impl_ubound)

        for moved_iname in moved_inames:
            # move moved_iname to 'set' dim_type in impl_loop
            dt, idx = impl_loop.get_var_dict()[moved_iname]
            n_set_dims = impl_loop.dim(dim_type.set)
            impl_loop = impl_loop.move_dims(
                dim_type.set, n_set_dims, dt, idx, 1)

        restricted_state = codegen_state.intersect(impl_loop)
        slab_kernel = intersect_kernel_with_slab(kernel, slab, loop_iname)
        new_codegen_state = restricted_state.copy(kernel=slab_kernel)

        inner = build_loop_nest(new_codegen_state, sched_index + 1)

        # }}}

        if label_slabs:
            result.append(
                astb.emit_comment(
                    "%s slab for '%s'" % (slab_name, loop_iname)))

        if impl_ubound.is_equal(impl_lbound):
            # single-trip, generate just a variable assignment, not a loop
            iname_init = astb.emit_initializer(
                codegen_state, kernel.index_dtype, loop_iname,
                ecm(pw_aff_to_expr(lbound), PREC_NONE, "i"),
                is_const=True)
            inner = merge_codegen_results(
                codegen_state,
                [iname_init, astb.emit_blank_line(), inner])
            new_ast = astb.ast_block_scope_class(
                inner.current_ast(codegen_state))
        else:
            inner_ast = inner.current_ast(codegen_state)
            lbound_expr = pw_aff_to_expr(
                simplify_pw_aff(lbound, kernel.assumptions))
            ubound_expr = pw_aff_to_expr(
                simplify_pw_aff(ubound, kernel.assumptions))
            new_ast = astb.emit_sequential_loop(
                codegen_state, loop_iname, kernel.index_dtype,
                lbound_expr, ubound_expr, inner_ast)

        result.append(inner.with_new_ast(codegen_state, new_ast))

    return merge_codegen_results(codegen_state, result)
Esempio n. 8
0
def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
    """Generate code for the sequential loop at *sched_index*.

    For every slab returned by ``get_slab_decomposition``, static lower
    and upper bounds of the loop iname are determined, the inner loop nest
    is built for the kernel intersected with that slab, and the result is
    emitted either as a constant initializer (when both static bounds
    coincide) or as a sequential loop via the kernel's target.

    :arg kernel: the kernel supplying the schedule, domains and
        assumptions.
    :arg sched_index: index into ``kernel.schedule`` of the loop entry.
    :arg codegen_state: code generation state in effect outside the loop.
    :returns: the per-slab code combined by ``gen_code_block``.
    """
    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(
            kernel, loop_iname, sched_index, codegen_state)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            # A lone slab gets no identifying comment.
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, across_dim_types=True,
                obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
                dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in dom_and_slab.get_var_names(dim_type.set):
            if das_iname in usable_inames:
                moved_inames.append(das_iname)
                dt, idx = dom_and_slab.get_var_dict()[das_iname]
                dom_and_slab = dom_and_slab.move_dims(
                        dim_type.param, dom_and_slab.dim(dim_type.param),
                        dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        from loopy.isl_helpers import (
                static_min_of_pw_aff,
                static_max_of_pw_aff)

        lbound = (
                kernel.cache_manager.dim_min(
                    dom_and_slab, loop_iname_idx)
                .gist(kernel.assumptions)
                .coalesce())
        ubound = (
            kernel.cache_manager.dim_max(
                dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .coalesce())

        static_lbound = static_min_of_pw_aff(
                lbound,
                constants_only=False)
        static_ubound = static_max_of_pw_aff(
                ubound,
                constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        from loopy.isl_helpers import make_slab_from_bound_pwaffs

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(
                dom_and_slab.space,
                loop_iname, static_lbound, static_ubound)

        for moved_iname in moved_inames:
            # move moved_iname back to the 'set' dim_type in impl_slab
            dt, idx = impl_slab.get_var_dict()[moved_iname]
            impl_slab = impl_slab.move_dims(
                    dim_type.set, impl_slab.dim(dim_type.set),
                    dt, idx, 1)

        new_codegen_state = codegen_state.intersect(impl_slab)

        # The slab restricts the loop iname, so pass loop_iname explicitly
        # rather than whatever name a helper loop above happened to end on.
        inner = build_loop_nest(
                intersect_kernel_with_slab(
                    kernel, slab, loop_iname),
                sched_index+1, new_codegen_state)

        # }}}

        if cmt is not None:
            from cgen import Comment
            result.append(Comment(cmt))

        from cgen import Initializer, POD, Const, Line
        from loopy.symbolic import aff_to_expr

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(gen_code_block([
                Initializer(Const(POD(kernel.index_dtype, loop_iname)),
                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
                Line(),
                inner,
                ]))

        else:
            result.append(
                kernel.target.emit_sequential_loop(
                       codegen_state, loop_iname, kernel.index_dtype,
                       static_lbound, static_ubound, inner))

    return gen_code_block(result)
Esempio n. 9
0
def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
        hw_inames_left=None):
    """Recursively set up code generation for hardware-parallel inames.

    Each call takes one iname off *hw_inames_left*, restricts the code
    generation state to the range covered by the matching hardware axis,
    assigns the iname its hardware axis expression, and recurses for the
    remaining inames. Once none are left, ``build_loop_nest`` is called.

    :arg kernel: the kernel supplying inames, tags, grid sizes and domains.
    :arg sched_index: schedule index, passed through unchanged.
    :arg codegen_state: code generation state in effect so far.
    :arg hw_inames_left: inames still to be processed; if *None*, all
        inames of *kernel* tagged with a ``HardwareParallelTag``. The
        passed list is not modified.
    :returns: the per-slab code combined by ``gen_code_block``, or the
        result of ``build_loop_nest`` when no inames are left.
    :raises RuntimeError: for an unexpected tag type, or when a slab
        decomposition would be needed on an iname sharing its tag with
        other inames.
    """
    from loopy.kernel.data import (
            UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag)

    if hw_inames_left is None:
        hw_inames_left = [
                candidate
                for candidate in kernel.all_inames()
                if isinstance(
                    kernel.iname_to_tag.get(candidate), HardwareParallelTag)]

    if not hw_inames_left:
        return build_loop_nest(kernel, sched_index, codegen_state)

    global_size, local_size = kernel.get_grid_sizes()

    # Work on a copy so the caller's list stays untouched.
    remaining_inames = hw_inames_left[:]
    iname = remaining_inames.pop()

    tag = kernel.iname_to_tag.get(iname)

    from loopy.symbolic import (
            GroupHardwareAxisIndex, LocalHardwareAxisIndex, pw_aff_to_expr)

    assert isinstance(tag, UniqueTag)
    if isinstance(tag, GroupIndexTag):
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif isinstance(tag, LocalIndexTag):
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    def shares_tag_with_iname(other_iname):
        other_tag = kernel.iname_to_tag.get(other_iname)
        return (
                isinstance(other_tag, UniqueTag)
                and other_tag.key == tag.key
                and other_iname != iname)

    other_inames_with_same_tag = [
            other_iname for other_iname in kernel.all_inames()
            if shares_tag_with_iname(other_iname)]

    # {{{ 'implement' hardware axis boundaries

    if isinstance(tag, LocalIndexTag):
        hw_axis_size = local_size[tag.axis]
    elif isinstance(tag, GroupIndexTag):
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # It's ok to find a bound that's too "loose". The conditional
    # generators will mop up after us.
    from loopy.isl_helpers import static_min_of_pw_aff, make_slab
    lower_bound = static_min_of_pw_aff(
            bounds.lower_bound_pw_aff, constants_only=False)

    # These bounds are 'implemented' by the hardware. Make sure
    # that the downstream conditional generators realize that.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    hw_slab = make_slab(
            domain.get_space(), iname,
            lower_bound, lower_bound+hw_axis_size)
    codegen_state = codegen_state.intersect(hw_slab)

    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(
            kernel, iname, sched_index, codegen_state)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    from loopy.codegen import add_comment, gen_code_block

    # A lone slab gets no identifying comment.
    label_slabs = len(slabs) != 1

    result = []
    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, iname) if label_slabs else None

        # Have the conditional infrastructure generate the
        # slabbing conditionals.
        inner = set_up_hw_parallel_loops(
                intersect_kernel_with_slab(kernel, slab, iname),
                sched_index,
                codegen_state.copy_and_assign(iname, hw_axis_expr),
                remaining_inames)

        result.append(add_comment(cmt, inner))

    return gen_code_block(result)
Esempio n. 10
0
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    """Generate code for the sequential loop at *sched_index*.

    For every slab returned by ``get_slab_decomposition``, static lower
    and upper bounds of the loop iname are determined, the inner loop nest
    is built under a code generation state restricted to those bounds
    (with the kernel intersected with the slab), and the result is emitted
    either as a constant initializer (when both static bounds coincide) or
    as a sequential loop.

    :arg codegen_state: code generation state; its ``kernel`` attribute
        supplies the schedule, domains and assumptions.
    :arg sched_index: index into ``kernel.schedule`` of the loop entry.
    :returns: the per-slab results combined by ``merge_codegen_results``.
    """
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            # A lone slab gets no identifying comment.
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain,
                                          slab,
                                          across_dim_types=True,
                                          obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
            dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in dom_and_slab.get_var_names(dim_type.set):
            if das_iname in usable_inames:
                moved_inames.append(das_iname)
                dt, idx = dom_and_slab.get_var_dict()[das_iname]
                dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param), dt, idx,
                    1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        from loopy.isl_helpers import (static_min_of_pw_aff,
                                       static_max_of_pw_aff)

        lbound = (kernel.cache_manager.dim_min(
            dom_and_slab, loop_iname_idx).gist(kernel.assumptions).coalesce())
        ubound = (kernel.cache_manager.dim_max(
            dom_and_slab, loop_iname_idx).gist(kernel.assumptions).coalesce())

        static_lbound = static_min_of_pw_aff(lbound, constants_only=False)
        static_ubound = static_max_of_pw_aff(ubound, constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        from loopy.isl_helpers import make_slab_from_bound_pwaffs

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(dom_and_slab.space, loop_iname,
                                                static_lbound, static_ubound)

        for moved_iname in moved_inames:
            # move moved_iname back to the 'set' dim_type in impl_slab
            dt, idx = impl_slab.get_var_dict()[moved_iname]
            impl_slab = impl_slab.move_dims(dim_type.set,
                                            impl_slab.dim(dim_type.set), dt,
                                            idx, 1)

        # The slab restricts the loop iname, so pass loop_iname explicitly
        # rather than whatever name a helper loop above happened to end on.
        new_codegen_state = (codegen_state.intersect(impl_slab).copy(
            kernel=intersect_kernel_with_slab(kernel, slab, loop_iname)))

        inner = build_loop_nest(new_codegen_state, sched_index + 1)

        # }}}

        if cmt is not None:
            result.append(codegen_state.ast_builder.emit_comment(cmt))

        from loopy.symbolic import aff_to_expr

        astb = codegen_state.ast_builder

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(
                merge_codegen_results(codegen_state, [
                    astb.emit_initializer(codegen_state,
                                          kernel.index_dtype,
                                          loop_iname,
                                          ecm(aff_to_expr(static_lbound),
                                              PREC_NONE, "i"),
                                          is_const=True),
                    astb.emit_blank_line(),
                    inner,
                ]))

        else:
            inner_ast = inner.current_ast(codegen_state)
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(codegen_state, loop_iname,
                                              kernel.index_dtype,
                                              static_lbound, static_ubound,
                                              inner_ast)))

    return merge_codegen_results(codegen_state, result)
Esempio n. 11
0
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    """Generate code for the sequential loop at *sched_index*.

    For every slab returned by ``get_slab_decomposition``, the lower and
    upper bound of the loop iname are determined, the inner loop nest is
    built under a code generation state restricted to those bounds, and
    the result is emitted either as a constant initializer in a block
    scope (when both implemented bounds are equal) or as a sequential
    loop.

    :arg codegen_state: code generation state; its ``kernel`` attribute
        supplies the schedule, domains and assumptions.
    :arg sched_index: index into ``kernel.schedule`` of the loop entry.
    :returns: the per-slab results combined by ``merge_codegen_results``.
    """
    from loopy.codegen.bounds import get_usable_inames_for_conditional
    from loopy.isl_helpers import (
            make_loop_bounds_from_pwaffs, simplify_pw_aff)
    from loopy.symbolic import (
            pw_aff_to_expr, pw_aff_to_pw_aff_implemented_by_expr)

    kernel = codegen_state.kernel
    astb = codegen_state.ast_builder
    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    # A lone slab gets no identifying comment.
    label_slabs = len(slabs) != 1

    result = []

    for slab_name, slab in slabs:
        # {{{ find bounds

        aligned_domain = isl.align_spaces(
                domain, slab,
                across_dim_types=True, obj_bigger_ok=True)
        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
                dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for das_iname in sorted(dom_and_slab.get_var_names(dim_type.set)):
            if das_iname not in usable_inames:
                continue

            moved_inames.append(das_iname)
            dt, idx = dom_and_slab.get_var_dict()[das_iname]
            n_params = dom_and_slab.dim(dim_type.param)
            dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, n_params, dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        impl_domain = isl.align_spaces(
                codegen_state.implemented_domain, dom_and_slab,
                obj_bigger_ok=True, across_dim_types=True).params()

        lbound = kernel.cache_manager.dim_min(dom_and_slab, loop_iname_idx)
        lbound = lbound.gist(kernel.assumptions).gist(impl_domain).coalesce()

        ubound = kernel.cache_manager.dim_max(dom_and_slab, loop_iname_idx)
        ubound = ubound.gist(kernel.assumptions).gist(impl_domain).coalesce()

        # }}}

        # {{{ find implemented loop, build inner code

        impl_lbound = pw_aff_to_pw_aff_implemented_by_expr(lbound)
        impl_ubound = pw_aff_to_pw_aff_implemented_by_expr(ubound)

        # impl_loop may be overapproximated
        impl_loop = make_loop_bounds_from_pwaffs(
                dom_and_slab.space, loop_iname, impl_lbound, impl_ubound)

        for moved_iname in moved_inames:
            # move moved_iname to 'set' dim_type in impl_loop
            dt, idx = impl_loop.get_var_dict()[moved_iname]
            n_set_dims = impl_loop.dim(dim_type.set)
            impl_loop = impl_loop.move_dims(
                    dim_type.set, n_set_dims, dt, idx, 1)

        restricted_state = codegen_state.intersect(impl_loop)
        slab_kernel = intersect_kernel_with_slab(kernel, slab, loop_iname)
        new_codegen_state = restricted_state.copy(kernel=slab_kernel)

        inner = build_loop_nest(new_codegen_state, sched_index+1)

        # }}}

        if label_slabs:
            result.append(
                    astb.emit_comment(
                        "%s slab for '%s'" % (slab_name, loop_iname)))

        if impl_ubound.is_equal(impl_lbound):
            # single-trip, generate just a variable assignment, not a loop
            iname_init = astb.emit_initializer(
                    codegen_state,
                    kernel.index_dtype, loop_iname,
                    ecm(pw_aff_to_expr(lbound), PREC_NONE, "i"),
                    is_const=True)
            inner = merge_codegen_results(
                    codegen_state,
                    [iname_init, astb.emit_blank_line(), inner])
            new_ast = astb.ast_block_scope_class(
                    inner.current_ast(codegen_state))
        else:
            inner_ast = inner.current_ast(codegen_state)
            lbound_expr = pw_aff_to_expr(
                    simplify_pw_aff(lbound, kernel.assumptions))
            ubound_expr = pw_aff_to_expr(
                    simplify_pw_aff(ubound, kernel.assumptions))
            new_ast = astb.emit_sequential_loop(
                    codegen_state, loop_iname, kernel.index_dtype,
                    lbound_expr, ubound_expr, inner_ast)

        result.append(inner.with_new_ast(codegen_state, new_ast))

    return merge_codegen_results(codegen_state, result)