Example #1
0
def generate_unroll_loop(kernel, sched_index, codegen_state):
    """Generate code for a fully unrolled loop.

    The iname entered by the schedule item at *sched_index* is fixed, in
    turn, to ``lower_bound + i`` for every ``i`` in ``range(length)``, and
    the loop body (everything from ``sched_index+1`` on) is generated once
    for each of those values.

    :arg kernel: the kernel being processed; ``kernel.schedule[sched_index]``
        supplies the iname to unroll.
    :arg sched_index: index of the schedule item that enters the loop.
    :arg codegen_state: state from which the per-iteration states are
        derived through its ``fix`` method.
    :returns: the result of :func:`gen_code_block` applied to the list of
        per-iteration bodies.
    :raises LoopyError: if the loop length is not a constant.
    """
    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        # The iname must actually be substituted into the message, otherwise
        # the user sees a literal '%s'.
        raise LoopyError(
                "length of unrolled loop '%s' is not a constant, "
                "cannot unroll" % iname)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        # Re-raise as the same exception type (callers may depend on it),
        # but keep the original message instead of dropping it.
        raise type(e)("while finding lower bound of '%s': %s" % (iname, e))

    result = []

    for i in range(length):
        idx_aff = lower_bound_aff + i
        new_codegen_state = codegen_state.fix(iname, idx_aff)
        result.append(
                build_loop_nest(kernel, sched_index+1, new_codegen_state))

    return gen_code_block(result)
Example #2
0
                def gen_code(inner_codegen_state):
                    """Run the previously defined generator and guard its
                    output with one conditional made of all bounds checks
                    followed by all predicate checks.

                    Returns a one-element list.
                    """
                    checks = []
                    for constraint in bounds_checks:
                        checks.append(
                                constraint_to_code(
                                    inner_codegen_state
                                    .expression_to_code_mapper,
                                    constraint))
                    checks.extend(pred_checks)

                    guarded_body = gen_code_block(
                            prev_gen_code(inner_codegen_state))

                    return [wrap_in_if(checks, guarded_body)]
Example #3
0
def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
    """Generate code for a sequentially executed loop.

    The loop over the iname entered at *sched_index* is split into the slabs
    returned by :func:`get_slab_decomposition`. For every slab, static lower
    and upper bounds of the loop iname are determined, the loop body is built
    with :func:`build_loop_nest`, and either a loop (via
    ``kernel.target.emit_sequential_loop``) or, if both bounds coincide, a
    constant initializer followed by the body is emitted.

    :arg kernel: the kernel being processed; ``kernel.schedule[sched_index]``
        supplies the loop iname.
    :arg sched_index: index of the schedule item that enters the loop.
    :arg codegen_state: the current code generation state.
    :returns: the result of :func:`gen_code_block` applied to the per-slab
        code (and slab comments, if there is more than one slab).
    """
    # None of these imports depend on the slab, so do them once up front
    # rather than on every loop iteration.
    from cgen import Comment, Initializer, POD, Const, Line
    from loopy.codegen.bounds import get_usable_inames_for_conditional
    from loopy.isl_helpers import (
            make_slab_from_bound_pwaffs,
            static_min_of_pw_aff,
            static_max_of_pw_aff)
    from loopy.symbolic import aff_to_expr

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(
            kernel, loop_iname, sched_index, codegen_state)

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        # Only label slabs when there is more than one of them.
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, across_dim_types=True,
                obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
                dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for set_iname in dom_and_slab.get_var_names(dim_type.set):
            if set_iname in usable_inames:
                moved_inames.append(set_iname)
                dt, idx = dom_and_slab.get_var_dict()[set_iname]
                dom_and_slab = dom_and_slab.move_dims(
                        dim_type.param, dom_and_slab.dim(dim_type.param),
                        dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        lbound = (
                kernel.cache_manager.dim_min(
                    dom_and_slab, loop_iname_idx)
                .gist(kernel.assumptions)
                .coalesce())
        ubound = (
                kernel.cache_manager.dim_max(
                    dom_and_slab, loop_iname_idx)
                .gist(kernel.assumptions)
                .coalesce())

        static_lbound = static_min_of_pw_aff(
                lbound,
                constants_only=False)
        static_ubound = static_max_of_pw_aff(
                ubound,
                constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(
                dom_and_slab.space,
                loop_iname, static_lbound, static_ubound)

        # Undo the parameter move from above so that the moved inames are
        # set dimensions again.
        for moved_iname in moved_inames:
            dt, idx = impl_slab.get_var_dict()[moved_iname]
            impl_slab = impl_slab.move_dims(
                    dim_type.set, impl_slab.dim(dim_type.set),
                    dt, idx, 1)

        new_codegen_state = codegen_state.intersect(impl_slab)

        # The slab belongs to *loop_iname*. The helper loops above use their
        # own variable names so that no leftover loop variable can be passed
        # here by accident.
        inner = build_loop_nest(
                intersect_kernel_with_slab(
                    kernel, slab, loop_iname),
                sched_index+1, new_codegen_state)

        # }}}

        if cmt is not None:
            result.append(Comment(cmt))

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(gen_code_block([
                Initializer(Const(POD(kernel.index_dtype, loop_iname)),
                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
                Line(),
                inner,
                ]))

        else:
            result.append(
                kernel.target.emit_sequential_loop(
                       codegen_state, loop_iname, kernel.index_dtype,
                       static_lbound, static_ubound, inner))

    return gen_code_block(result)
Example #4
0
def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
        hw_inames_left=None):
    """Bind hardware-parallel inames to hardware axis indices.

    Each call handles one iname from *hw_inames_left* and then recurses with
    the remaining ones. Once none are left, code generation is handed to
    :func:`build_loop_nest`.

    :arg kernel: the kernel being processed.
    :arg sched_index: schedule index passed on unchanged to the recursive
        call and to :func:`build_loop_nest`.
    :arg codegen_state: the current code generation state.
    :arg hw_inames_left: inames still to be handled. If *None*, all inames of
        *kernel* whose tag is a ``HardwareParallelTag`` are used. The list
        passed in is not modified.
    :returns: the result of :func:`build_loop_nest` if no inames are left,
        otherwise the result of ``gen_code_block`` over the per-slab code.
    :raises RuntimeError: for an unexpected tag type, or if slab
        decomposition is requested for an iname that shares its tag with
        other inames.
    """
    from loopy.kernel.data import (
            UniqueTag, HardwareParallelTag, LocalIndexTag, GroupIndexTag)

    if hw_inames_left is None:
        hw_inames_left = [
                candidate
                for candidate in kernel.all_inames()
                if isinstance(
                    kernel.iname_to_tag.get(candidate), HardwareParallelTag)]

    if not hw_inames_left:
        return build_loop_nest(kernel, sched_index, codegen_state)

    global_size, local_size = kernel.get_grid_sizes()

    # Copy before popping so the caller's list stays intact.
    remaining_hw_inames = hw_inames_left[:]
    iname = remaining_hw_inames.pop()

    tag = kernel.iname_to_tag.get(iname)

    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex

    assert isinstance(tag, UniqueTag)
    tag_is_group = isinstance(tag, GroupIndexTag)
    tag_is_local = isinstance(tag, LocalIndexTag)

    if tag_is_group:
        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
    elif tag_is_local:
        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
    else:
        raise RuntimeError("unexpected hw tag type")

    other_inames_with_same_tag = []
    for other_iname in kernel.all_inames():
        other_tag = kernel.iname_to_tag.get(other_iname)
        if (isinstance(other_tag, UniqueTag)
                and other_tag.key == tag.key
                and other_iname != iname):
            other_inames_with_same_tag.append(other_iname)

    # {{{ 'implement' hardware axis boundaries

    if tag_is_local:
        hw_axis_size = local_size[tag.axis]
    elif tag_is_group:
        hw_axis_size = global_size[tag.axis]
    else:
        raise RuntimeError("unknown hardware parallel tag")

    bounds = kernel.get_iname_bounds(iname)
    domain = kernel.get_inames_domain(iname)

    # A bound that is too "loose" is acceptable here: the conditional
    # generators further down will add whatever checks are still needed.
    from loopy.isl_helpers import static_min_of_pw_aff
    lower_bound = static_min_of_pw_aff(
            bounds.lower_bound_pw_aff, constants_only=False)

    # The hardware 'implements' these bounds; record that in the codegen
    # state so downstream conditional generators are aware of it.
    if not isinstance(hw_axis_size, int):
        hw_axis_size, lower_bound = isl.align_two(hw_axis_size, lower_bound)

    from loopy.isl_helpers import make_slab
    hw_slab = make_slab(
            domain.get_space(), iname,
            lower_bound, lower_bound+hw_axis_size)
    codegen_state = codegen_state.intersect(hw_slab)

    from loopy.symbolic import pw_aff_to_expr
    hw_axis_expr = hw_axis_expr + pw_aff_to_expr(lower_bound)

    # }}}

    slabs = get_slab_decomposition(
            kernel, iname, sched_index, codegen_state)

    if other_inames_with_same_tag and len(slabs) > 1:
        raise RuntimeError("cannot do slab decomposition on inames that share "
                "a tag with other inames")

    from loopy.codegen import add_comment, gen_code_block

    result = []

    for slab_name, slab in slabs:
        # A lone slab needs no label.
        if len(slabs) == 1:
            cmt = None
        else:
            cmt = "%s slab for '%s'" % (slab_name, iname)

        # Restricting the kernel to the slab makes the conditional
        # infrastructure emit the slabbing conditionals for us.
        slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname)
        new_codegen_state = codegen_state.copy_and_assign(iname, hw_axis_expr)

        inner = set_up_hw_parallel_loops(
                slabbed_kernel, sched_index,
                new_codegen_state, remaining_hw_inames)

        result.append(add_comment(cmt, inner))

    return gen_code_block(result)
Example #5
0
                    def gen_code_wrapper(inner_codegen_state):
                        # gen_code hands back a list, whereas whoever calls
                        # this wrapper needs a single GeneratedCode instance,
                        # hence the extra gen_code_block.
                        generated_items = gen_code(inner_codegen_state)

                        return gen_code_block(generated_items)
Example #6
0
def build_loop_nest(kernel, sched_index, codegen_state):
    """Generate code for the schedule items starting at *sched_index*.

    Covers the item at *sched_index* and its following siblings, i.e.
    everything up to (but excluding) the next ``LeaveLoop`` at this nesting
    level; nested loops are treated as single items. Consecutive items that
    can share bounds/predicate checks are grouped so that the checks are
    emitted once around the whole group.

    :arg kernel: the kernel whose ``schedule`` is being processed.
    :arg sched_index: index of the first schedule item to generate code for.
    :arg codegen_state: the current code generation state.
    :returns: the result of :func:`gen_code_block` applied to the generated
        pieces.
    :raises RuntimeError: if an unexpected schedule item type is encountered.
    """
    # Most of the complexity of this function goes towards finding groups of
    # instructions that can be nested inside a shared conditional.

    # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices

    # i.e. go up to the next LeaveLoop, and skip over inner loops.

    my_sched_indices = []

    i = sched_index
    while i < len(kernel.schedule):
        sched_item = kernel.schedule[i]

        if isinstance(sched_item, LeaveLoop):
            break

        my_sched_indices.append(i)

        if isinstance(sched_item, EnterLoop):
            # Jump past the whole nested loop.
            _, i = gather_schedule_subloop(
                    kernel.schedule, i)
        elif isinstance(sched_item, (Barrier, RunInstruction)):
            i += 1
        else:
            raise RuntimeError("unexpected schedule item type: %s"
                    % type(sched_item))

    del i

    # }}}

    # {{{ pass 2: find admissible conditional inames for each sibling schedule item

    from pytools import Record

    class ScheduleIndexInfo(Record):
        """
        .. attribute:: schedule_indices
        .. attribute:: admissible_cond_inames
        .. attribute:: required_predicates
        .. attribute:: used_inames_within
        """

    from loopy.schedule import find_used_inames_within
    sched_index_info_entries = [
            ScheduleIndexInfo(
                schedule_indices=[i],
                admissible_cond_inames=(
                    get_admissible_conditional_inames_for(kernel, i)),
                required_predicates=get_required_predicates(kernel, i),
                used_inames_within=find_used_inames_within(kernel, i)
                )
            for i in my_sched_indices
            ]

    # Merge entries that agree on all three criteria into one entry carrying
    # the concatenated schedule indices.
    sched_index_info_entries = group_by(
            sched_index_info_entries,
            key=lambda sii: (
                sii.admissible_cond_inames,
                sii.required_predicates,
                sii.used_inames_within),
            merge=lambda sii1, sii2: sii1.copy(
                schedule_indices=(
                    sii1.schedule_indices
                    +
                    sii2.schedule_indices)))

    # }}}

    # {{{ pass 3: greedily group schedule items that share admissible inames

    from pytools import memoize_method

    class BoundsCheckCache:
        """Memoizing callable mapping a collection of inames to the bounds
        checks needed for them, given the already-implemented domain.
        """

        def __init__(self, kernel, impl_domain):
            self.kernel = kernel
            self.impl_domain = impl_domain

        @memoize_method
        def __call__(self, check_inames):
            if not check_inames:
                return []

            domain = isl.align_spaces(
                    self.kernel.get_inames_domain(check_inames),
                    self.impl_domain, obj_bigger_ok=True)
            from loopy.codegen.bounds import get_bounds_checks
            return get_bounds_checks(domain,
                    check_inames, self.impl_domain,

                    # Each instruction individually gets its bounds checks,
                    # so we can safely overapproximate here.
                    overapproximate=True)

    # The default for done_group_lengths is an immutable frozenset so that no
    # mutable object is shared between calls.
    def build_insn_group(sched_index_info_entries, codegen_state,
            done_group_lengths=frozenset()):
        """
        :arg sched_index_info_entries: the entries still to be generated.
        :arg codegen_state: the code generation state for these entries.
        :arg done_group_lengths: A set of group lengths (integers) that grows
            from empty to include the longest found group and downwards with every
            recursive call.  It serves to prevent infinite recursion by preventing
            recursive calls from doing anything about groups that are too small.
        :returns: a list of generated code pieces.
        """

        # The rough plan here is that build_insn_group starts out with the
        # entirety of the current schedule item's downward siblings (i.e. all
        # the ones up to the next LeaveLoop). It will then iterate upward to
        # find the largest usable conditional hoist group.
        #
        # It will then call itself recursively, telling its recursive instances
        # to ignore the hoist group it just found by adding that group length
        # to done_group_lengths. (It'll also chop the set of schedule indices
        # considered down so that a callee cannot find a *longer* hoist group.)
        #
        # Upon return the hoist is wrapped around the returned code and
        # build_insn_group calls itself for the remainder of schedule indices
        # that were not in the hoist group.

        if not sched_index_info_entries:
            return []

        origin_si_entry = sched_index_info_entries[0]
        current_iname_set = origin_si_entry.admissible_cond_inames
        # Predicates that are already implemented need not be checked again.
        current_pred_set = (origin_si_entry.required_predicates
                - codegen_state.implemented_predicates)

        # {{{ grow schedule item group

        # Keep growing schedule item group as long as group fulfills minimum
        # size requirement.

        bounds_check_cache = BoundsCheckCache(
                kernel, codegen_state.implemented_domain)

        found_hoists = []

        candidate_group_length = 1
        while candidate_group_length <= len(sched_index_info_entries):
            if candidate_group_length in done_group_lengths:
                candidate_group_length += 1
                continue

            current_iname_set = (
                    current_iname_set
                    & sched_index_info_entries[candidate_group_length-1]
                    .admissible_cond_inames)
            current_pred_set = (
                    current_pred_set
                    & sched_index_info_entries[candidate_group_length-1]
                    .required_predicates)

            # {{{ see which inames are actually used in group

            # And only generate conditionals for those.
            used_inames = set()
            for sched_index_info_entry in \
                    sched_index_info_entries[0:candidate_group_length]:
                used_inames |= sched_index_info_entry.used_inames_within

            # }}}

            only_unshared_inames = kernel.remove_inames_for_shared_hw_axes(
                    current_iname_set & used_inames)

            bounds_checks = bounds_check_cache(only_unshared_inames)

            if (bounds_checks  # found a bounds check
                    or current_pred_set
                    or candidate_group_length == 1):
                # length-1 must always be an option to reach the recursion base
                # case below
                found_hoists.append((candidate_group_length,
                    bounds_checks, current_pred_set))

            if not bounds_checks and not current_pred_set:
                # already no more checks possible, let's not waste time
                # checking longer groups.
                break

            candidate_group_length += 1

        # }}}

        # pick largest such group; compare by group length only, so that the
        # check lists/sets themselves never take part in the comparison.
        group_length, bounds_checks, pred_checks = max(
                found_hoists, key=lambda hoist: hoist[0])

        # Intersect all bounds checks into one set describing the hoisted
        # conditional.
        check_set = None
        for cns in bounds_checks:
            cns_set = (isl.BasicSet.universe(cns.get_space())
                    .add_constraint(cns))

            if check_set is None:
                check_set = cns_set
            else:
                check_set, cns_set = isl.align_two(check_set, cns_set)
                check_set = check_set.intersect(cns_set)

        if check_set is None:
            new_codegen_state = codegen_state
            is_empty = False
        else:
            is_empty = check_set.is_empty()
            new_codegen_state = codegen_state.intersect(check_set)

        if pred_checks:
            new_codegen_state = new_codegen_state.copy(
                    implemented_predicates=new_codegen_state.implemented_predicates
                    | pred_checks)

        if is_empty:
            # The checks can never all hold: nothing to generate for the group.
            result = []
        else:
            if group_length == 1:
                # group only contains starting schedule item
                def gen_code(inner_codegen_state):
                    pieces = []
                    for i in origin_si_entry.schedule_indices:
                        inner = generate_code_for_sched_index(
                            kernel, i, inner_codegen_state)

                        if inner is not None:
                            pieces.append(inner)

                    return pieces

            else:
                # recurse with a bigger done_group_lengths
                def gen_code(inner_codegen_state):
                    return build_insn_group(
                            sched_index_info_entries[0:group_length],
                            inner_codegen_state,
                            done_group_lengths=(
                                done_group_lengths | set([group_length])))

            # gen_code returns a list

            if bounds_checks or pred_checks:
                from loopy.codegen import wrap_in_if
                from loopy.codegen.bounds import constraint_to_code

                # Keep a handle on the unguarded generator; the gen_code
                # defined next wraps its output in the conditional.
                prev_gen_code = gen_code

                def gen_code(inner_codegen_state):
                    conditionals = [
                            constraint_to_code(
                                inner_codegen_state.expression_to_code_mapper, cns)
                            for cns in bounds_checks] + list(pred_checks)

                    prev_result = prev_gen_code(inner_codegen_state)

                    return [wrap_in_if(
                             conditionals,
                             gen_code_block(prev_result))]

                # A bounds check involving the vectorization iname forces the
                # group to go through unvectorize() below.
                cannot_vectorize = False
                if new_codegen_state.vectorization_info is not None:
                    from loopy.isl_helpers import obj_involves_variable
                    for cond in bounds_checks:
                        if obj_involves_variable(
                                cond,
                                new_codegen_state.vectorization_info.iname):
                            cannot_vectorize = True
                            break

                if cannot_vectorize:
                    def gen_code_wrapper(inner_codegen_state):
                        # gen_code returns a list, but this needs to return a
                        # GeneratedCode instance.

                        return gen_code_block(gen_code(inner_codegen_state))

                    result = [new_codegen_state.unvectorize(gen_code_wrapper)]
                else:
                    result = gen_code(new_codegen_state)

            else:
                result = gen_code(new_codegen_state)

        # Continue with the entries that were not part of the hoist group,
        # using the original (un-narrowed) state.
        return result + build_insn_group(
                sched_index_info_entries[group_length:], codegen_state)

    # }}}

    return gen_code_block(
            build_insn_group(sched_index_info_entries, codegen_state))