Example #1
0
    def base_index_and_length(self, set, iname, context=None):
        if not isinstance(iname, int):
            iname_to_dim = set.space.get_var_dict()
            idx = iname_to_dim[iname][1]
        else:
            idx = iname

        lower_bound_pw_aff = self.dim_min(set, idx)
        upper_bound_pw_aff = self.dim_max(set, idx)

        from loopy.diagnostic import StaticValueFindingError
        from loopy.isl_helpers import (static_max_of_pw_aff,
                                       static_min_of_pw_aff,
                                       static_value_of_pw_aff)
        from loopy.symbolic import pw_aff_to_expr

        # {{{ first: try to find static lower bound value

        try:
            base_index_aff = static_value_of_pw_aff(lower_bound_pw_aff,
                                                    constants_only=False,
                                                    context=context)
        except StaticValueFindingError:
            base_index_aff = None

        if base_index_aff is not None:
            base_index = pw_aff_to_expr(base_index_aff)

            size = pw_aff_to_expr(
                static_max_of_pw_aff(upper_bound_pw_aff - base_index_aff + 1,
                                     constants_only=False,
                                     context=context))

            return base_index, size

        # }}}

        # {{{ if that didn't work, try finding a lower bound

        base_index_aff = static_min_of_pw_aff(lower_bound_pw_aff,
                                              constants_only=False,
                                              context=context)

        base_index = pw_aff_to_expr(base_index_aff)

        size = pw_aff_to_expr(
            static_max_of_pw_aff(upper_bound_pw_aff - base_index_aff + 1,
                                 constants_only=False,
                                 context=context))

        return base_index, size
Example #2
0
    def base_index_and_length(self, set, iname, context=None):
        if not isinstance(iname, int):
            iname_to_dim = set.space.get_var_dict()
            idx = iname_to_dim[iname][1]
        else:
            idx = iname

        lower_bound_pw_aff = self.dim_min(set, idx)
        upper_bound_pw_aff = self.dim_max(set, idx)

        from loopy.diagnostic import StaticValueFindingError
        from loopy.isl_helpers import (
                static_max_of_pw_aff,
                static_min_of_pw_aff,
                static_value_of_pw_aff)
        from loopy.symbolic import pw_aff_to_expr

        # {{{ first: try to find static lower bound value

        try:
            base_index_aff = static_value_of_pw_aff(
                    lower_bound_pw_aff, constants_only=False,
                    context=context)
        except StaticValueFindingError:
            base_index_aff = None

        if base_index_aff is not None:
            base_index = pw_aff_to_expr(base_index_aff)

            size = pw_aff_to_expr(static_max_of_pw_aff(
                    upper_bound_pw_aff - base_index_aff + 1, constants_only=False,
                    context=context))

            return base_index, size

        # }}}

        # {{{ if that didn't work, try finding a lower bound

        base_index_aff = static_min_of_pw_aff(
                lower_bound_pw_aff, constants_only=False,
                context=context)

        base_index = pw_aff_to_expr(base_index_aff)

        size = pw_aff_to_expr(static_max_of_pw_aff(
                upper_bound_pw_aff - base_index_aff + 1, constants_only=False,
                context=context))

        return base_index, size
Example #3
0
def generate_unroll_loop(kernel, sched_index, codegen_state):
    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        raise LoopyError(
                "length of unrolled loop '%s' is not a constant, "
                "cannot unroll")

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        raise type(e)("while finding lower bound of '%s': " % iname)

    result = []

    for i in range(length):
        idx_aff = lower_bound_aff + i
        new_codegen_state = codegen_state.fix(iname, idx_aff)
        result.append(
                build_loop_nest(kernel, sched_index+1, new_codegen_state))

    return gen_code_block(result)
Example #4
0
def generate_unroll_loop(codegen_state, sched_index):
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (static_max_of_pw_aff,
                                   static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        raise LoopyError("length of unrolled loop '%s' is not a constant, "
                         "cannot unroll")

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
            bounds.lower_bound_pw_aff.coalesce(), constants_only=False)
    except Exception as e:
        raise type(e)("while finding lower bound of '%s': " % iname)

    result = []

    for i in range(length):
        idx_aff = lower_bound_aff + i
        new_codegen_state = codegen_state.fix(iname, idx_aff)
        result.append(build_loop_nest(new_codegen_state, sched_index + 1))

    return merge_codegen_results(codegen_state, result)
Example #5
0
 def get_constant_iname_length(self, iname):
     from loopy.isl_helpers import static_max_of_pw_aff
     from loopy.symbolic import aff_to_expr
     return int(
         aff_to_expr(
             static_max_of_pw_aff(self.get_iname_bounds(
                 iname, constants_only=True).size,
                                  constants_only=True)))
Example #6
0
 def _get_int_iname_size(iname):
     from loopy.isl_helpers import static_max_of_pw_aff
     from loopy.symbolic import pw_aff_to_expr
     size = pw_aff_to_expr(
         static_max_of_pw_aff(kernel.get_iname_bounds(iname).size,
                              constants_only=True))
     assert isinstance(size, six.integer_types)
     return size
Example #7
0
 def _get_int_iname_size(iname):
     from loopy.isl_helpers import static_max_of_pw_aff
     from loopy.symbolic import pw_aff_to_expr
     size = pw_aff_to_expr(
             static_max_of_pw_aff(
                 kernel.get_iname_bounds(iname).size,
                 constants_only=True))
     assert isinstance(size, six.integer_types)
     return size
Example #8
0
def generate_vectorize_loop(codegen_state, sched_index):
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead")
        return generate_unroll_loop(codegen_state, sched_index)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        raise type(e)("while finding lower bound of '%s': " % iname)

    if not lower_bound_aff.plain_is_zero():
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead")
        return generate_unroll_loop(codegen_state, sched_index)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
Example #9
0
File: loop.py Project: tj-sun/loopy
def generate_vectorize_loop(codegen_state, sched_index):
    kernel = codegen_state.kernel

    iname = kernel.schedule[sched_index].iname

    bounds = kernel.get_iname_bounds(iname, constants_only=True)

    from loopy.isl_helpers import (
            static_max_of_pw_aff, static_value_of_pw_aff)
    from loopy.symbolic import pw_aff_to_expr

    length_aff = static_max_of_pw_aff(bounds.size, constants_only=True)

    if not length_aff.is_cst():
        warn(kernel, "vec_upper_not_const",
                "upper bound for vectorized loop '%s' is not a constant, "
                "cannot vectorize--unrolling instead")
        return generate_unroll_loop(kernel, sched_index, codegen_state)

    length = int(pw_aff_to_expr(length_aff))

    try:
        lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(),
                constants_only=False)
    except Exception as e:
        raise type(e)("while finding lower bound of '%s': " % iname)

    if not lower_bound_aff.plain_is_zero():
        warn(kernel, "vec_lower_not_0",
                "lower bound for vectorized loop '%s' is not zero, "
                "cannot vectorize--unrolling instead")
        return generate_unroll_loop(kernel, sched_index, codegen_state)

    # {{{ 'implement' vectorization bounds

    domain = kernel.get_inames_domain(iname)

    from loopy.isl_helpers import make_slab
    slab = make_slab(domain.get_space(), iname,
            lower_bound_aff, lower_bound_aff+length)
    codegen_state = codegen_state.intersect(slab)

    # }}}

    from loopy.codegen import VectorizationInfo
    new_codegen_state = codegen_state.copy(
            vectorization_info=VectorizationInfo(
                iname=iname,
                length=length,
                space=length_aff.space))

    return build_loop_nest(new_codegen_state, sched_index+1)
Example #10
0
def determine_temporaries_to_promote(kernel, temporaries, name_gen):
    """
    :returns: A :class:`dict` mapping temporary names from `temporaries` to
              :class:`PromotedTemporary` objects
    """
    new_temporaries = {}

    def_lists, use_lists = get_def_and_use_lists_for_all_temporaries(kernel)

    from loopy.kernel.data import LocalIndexTag

    for temporary in temporaries:
        temporary = kernel.temporary_variables[temporary]
        if temporary.scope == temp_var_scope.GLOBAL:
            # Nothing to be done for global temporaries (I hope)
            continue

        assert temporary.base_storage is None, \
            "Cannot promote temporaries with base_storage to global"

        hw_inames = get_common_hw_inames(kernel,
            def_lists[temporary.name] + use_lists[temporary.name])

        # This takes advantage of the fact that g < l in the alphabet :)
        hw_inames = sorted(hw_inames,
            key=lambda iname: str(kernel.iname_to_tag[iname]))

        shape_prefix = []

        backing_hw_inames = []
        for iname in hw_inames:
            tag = kernel.iname_to_tag[iname]
            is_local_iname = isinstance(tag, LocalIndexTag)
            if is_local_iname and temporary.scope == temp_var_scope.LOCAL:
                # Restrict shape to that of group inames for locals.
                continue
            backing_hw_inames.append(iname)
            from loopy.isl_helpers import static_max_of_pw_aff
            from loopy.symbolic import aff_to_expr
            shape_prefix.append(
                aff_to_expr(
                    static_max_of_pw_aff(
                        kernel.get_iname_bounds(iname).size, False)))

        backing_temporary = PromotedTemporary(
            name=name_gen(temporary.name),
            orig_temporary=temporary,
            shape_prefix=tuple(shape_prefix),
            hw_inames=backing_hw_inames)
        new_temporaries[temporary.name] = backing_temporary

    return new_temporaries
Example #11
0
def determine_temporaries_to_promote(kernel, temporaries, name_gen):
    """
    :returns: A :class:`dict` mapping temporary names from `temporaries` to
              :class:`PromotedTemporary` objects
    """
    new_temporaries = {}

    def_lists, use_lists = get_def_and_use_lists_for_all_temporaries(kernel)

    from loopy.kernel.data import LocalIndexTag

    for temporary in temporaries:
        temporary = kernel.temporary_variables[temporary]
        if temporary.scope == temp_var_scope.GLOBAL:
            # Nothing to be done for global temporaries (I hope)
            continue

        assert temporary.base_storage is None, \
            "Cannot promote temporaries with base_storage to global"

        hw_inames = get_common_hw_inames(
            kernel, def_lists[temporary.name] + use_lists[temporary.name])

        # This takes advantage of the fact that g < l in the alphabet :)
        hw_inames = sorted(hw_inames,
                           key=lambda iname: str(kernel.iname_to_tag[iname]))

        shape_prefix = []

        backing_hw_inames = []
        for iname in hw_inames:
            tag = kernel.iname_to_tag[iname]
            is_local_iname = isinstance(tag, LocalIndexTag)
            if is_local_iname and temporary.scope == temp_var_scope.LOCAL:
                # Restrict shape to that of group inames for locals.
                continue
            backing_hw_inames.append(iname)
            from loopy.isl_helpers import static_max_of_pw_aff
            from loopy.symbolic import aff_to_expr
            shape_prefix.append(
                aff_to_expr(
                    static_max_of_pw_aff(
                        kernel.get_iname_bounds(iname).size, False)))

        backing_temporary = PromotedTemporary(name=name_gen(temporary.name),
                                              orig_temporary=temporary,
                                              shape_prefix=tuple(shape_prefix),
                                              hw_inames=backing_hw_inames)
        new_temporaries[temporary.name] = backing_temporary

    return new_temporaries
Example #12
0
    def base_index_and_length(self, set, iname, context=None):
        if not isinstance(iname, int):
            iname_to_dim = set.space.get_var_dict()
            idx = iname_to_dim[iname][1]
        else:
            idx = iname

        lower_bound_pw_aff = self.dim_min(set, idx)
        upper_bound_pw_aff = self.dim_max(set, idx)

        from loopy.isl_helpers import static_max_of_pw_aff, static_value_of_pw_aff
        from loopy.symbolic import pw_aff_to_expr

        size = pw_aff_to_expr(static_max_of_pw_aff(
                upper_bound_pw_aff - lower_bound_pw_aff + 1, constants_only=False,
                context=context))
        try:
            base_index = pw_aff_to_expr(
                    static_value_of_pw_aff(lower_bound_pw_aff, constants_only=False,
                        context=context))
        except Exception as e:
            raise type(e)("while finding lower bound of '%s': %s" % (iname, str(e)))

        return base_index, size
Example #13
0
 def get_constant_iname_length(self, iname):
     from loopy.isl_helpers import static_max_of_pw_aff
     from loopy.symbolic import aff_to_expr
     return int(aff_to_expr(static_max_of_pw_aff(
             self.get_iname_bounds(iname, constants_only=True).size,
             constants_only=True)))
Example #14
0
def duplicate_private_temporaries_for_ilp_and_vec(kernel):
    logger.debug("%s: duplicate temporaries for ilp" % kernel.name)

    wmap = kernel.writer_map()

    from loopy.kernel.data import IlpBaseTag, VectorizeTag

    var_to_new_ilp_inames = {}

    # {{{ find variables that need extra indices

    for tv in six.itervalues(kernel.temporary_variables):
        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]
            ilp_inames = frozenset(iname
                    for iname in kernel.insn_inames(writer_insn)
                    if isinstance(
                        kernel.iname_to_tag.get(iname),
                        (IlpBaseTag, VectorizeTag)))

            referenced_ilp_inames = (ilp_inames
                    & writer_insn.write_dependency_names())

            new_ilp_inames = ilp_inames - referenced_ilp_inames

            if not new_ilp_inames:
                break

            if tv.name in var_to_new_ilp_inames:
                if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
                    raise LoopyError("instruction '%s' requires adding "
                            "indices for ILP inames '%s' on var '%s', but previous "
                            "instructions required inames '%s'"
                            % (writer_insn_id, ", ".join(new_ilp_inames),
                                ", ".join(var_to_new_ilp_inames[tv.name])))

                continue

            var_to_new_ilp_inames[tv.name] = set(new_ilp_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    ilp_iname_to_length = {}
    for ilp_inames in six.itervalues(var_to_new_ilp_inames):
        for iname in ilp_inames:
            if iname in ilp_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=True)
            ilp_iname_to_length[iname] = int(pw_aff_to_expr(
                        static_max_of_pw_aff(bounds.size, constants_only=True)))

            assert static_max_of_pw_aff(
                    bounds.lower_bound_pw_aff, constants_only=True).plain_is_zero()

    # }}}

    # {{{ change temporary variables

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in six.iteritems(var_to_new_ilp_inames):
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(ilp_iname_to_length[iname] for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape,
                # Forget what you knew about data layout,
                # create from scratch.
                dim_tags=dim_tags)

    # }}}

    from pymbolic import var
    eiii = ExtraInameIndexInserter(
            dict((var_name, tuple(var(iname) for iname in inames))
                for var_name, inames in six.iteritems(var_to_new_ilp_inames)))

    new_insns = [
            insn.with_transformed_expressions(eiii)
            for insn in kernel.instructions]

    return kernel.copy(
        temporary_variables=new_temp_vars,
        instructions=new_insns)
Example #15
0
def guess_arg_shape_if_requested(kernel, default_order):
    new_args = []

    import loopy as lp
    from loopy.kernel.array import ArrayBase
    from loopy.symbolic import SubstitutionRuleExpander, AccessRangeMapper

    submap = SubstitutionRuleExpander(kernel.substitutions)

    for arg in kernel.args:
        if isinstance(arg, ArrayBase) and arg.shape is lp.auto:
            armap = AccessRangeMapper(kernel, arg.name)

            try:
                for insn in kernel.instructions:
                    if isinstance(insn, lp.ExpressionInstruction):
                        armap(submap(insn.assignee), kernel.insn_inames(insn))
                        armap(submap(insn.expression), kernel.insn_inames(insn))
            except TypeError as e:
                from traceback import print_exc
                print_exc()

                from loopy.diagnostic import LoopyError
                raise LoopyError(
                        "Failed to (automatically, as requested) find "
                        "shape/strides for argument '%s'. "
                        "Specifying the shape manually should get rid of this. "
                        "The following error occurred: %s"
                        % (arg.name, str(e)))

            if armap.access_range is None:
                if armap.bad_subscripts:
                    raise RuntimeError("cannot determine access range for '%s': "
                            "undetermined index in subscript(s) '%s'"
                            % (arg.name, ", ".join(
                                    str(i) for i in armap.bad_subscripts)))

                # no subscripts found, let's call it a scalar
                shape = ()
            else:
                from loopy.isl_helpers import static_max_of_pw_aff
                from loopy.symbolic import pw_aff_to_expr

                shape = []
                for i in range(armap.access_range.dim(dim_type.set)):
                    try:
                        shape.append(
                                pw_aff_to_expr(static_max_of_pw_aff(
                                    kernel.cache_manager.dim_max(
                                        armap.access_range, i) + 1,
                                    constants_only=False)))
                    except:
                        print("While trying to find shape axis %d of "
                                "argument '%s', the following "
                                "exception occurred:" % (i, arg.name),
                                file=sys.stderr)
                        raise

                shape = tuple(shape)

            if arg.shape is lp.auto:
                arg = arg.copy(shape=shape)

            try:
                arg.strides
            except AttributeError:
                pass
            else:
                if arg.strides is lp.auto:
                    from loopy.kernel.data import make_strides
                    arg = arg.copy(strides=make_strides(shape, default_order))

        new_args.append(arg)

    return kernel.copy(args=new_args)
Example #16
0
def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(
            kernel, loop_iname, sched_index, codegen_state)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain, slab, across_dim_types=True,
                obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
                dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for iname in dom_and_slab.get_var_names(dim_type.set):
            if iname in usable_inames:
                moved_inames.append(iname)
                dt, idx = dom_and_slab.get_var_dict()[iname]
                dom_and_slab = dom_and_slab.move_dims(
                        dim_type.param, dom_and_slab.dim(dim_type.param),
                        dt, idx, 1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        from loopy.isl_helpers import (
                static_min_of_pw_aff,
                static_max_of_pw_aff)

        lbound = (
                kernel.cache_manager.dim_min(
                    dom_and_slab, loop_iname_idx)
                .gist(kernel.assumptions)
                .coalesce())
        ubound = (
            kernel.cache_manager.dim_max(
                dom_and_slab, loop_iname_idx)
            .gist(kernel.assumptions)
            .coalesce())

        static_lbound = static_min_of_pw_aff(
                lbound,
                constants_only=False)
        static_ubound = static_max_of_pw_aff(
                ubound,
                constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        from loopy.isl_helpers import make_slab_from_bound_pwaffs

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(
                dom_and_slab.space,
                loop_iname, static_lbound, static_ubound)

        for iname in moved_inames:
            dt, idx = impl_slab.get_var_dict()[iname]
            impl_slab = impl_slab.move_dims(
                    dim_type.set, impl_slab.dim(dim_type.set),
                    dt, idx, 1)

        new_codegen_state = codegen_state.intersect(impl_slab)

        inner = build_loop_nest(
                intersect_kernel_with_slab(
                    kernel, slab, iname),
                sched_index+1, new_codegen_state)

        # }}}

        if cmt is not None:
            from cgen import Comment
            result.append(Comment(cmt))

        from cgen import Initializer, POD, Const, Line
        from loopy.symbolic import aff_to_expr

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(gen_code_block([
                Initializer(Const(POD(kernel.index_dtype, loop_iname)),
                    ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
                Line(),
                inner,
                ]))

        else:
            result.append(
                kernel.target.emit_sequential_loop(
                       codegen_state, loop_iname, kernel.index_dtype,
                       static_lbound, static_ubound, inner))

    return gen_code_block(result)
Example #17
0
def determine_temporaries_to_promote(kernel, temporaries, name_gen):
    """
    For each temporary in the passed list of temporaries, construct a
    :class:`PromotedTemporary` which describes how the temporary should
    get promoted into global storage.

    :returns: A :class:`dict` mapping temporary names from `temporaries` to
              :class:`PromotedTemporary` objects
    """
    new_temporaries = {}

    def_lists, use_lists = get_def_and_use_lists_for_all_temporaries(kernel)

    from loopy.kernel.data import LocalIndexTag

    for temporary in temporaries:
        temporary = kernel.temporary_variables[temporary]
        if temporary.scope == temp_var_scope.GLOBAL:
            # Nothing to be done for global temporaries (I hope)
            continue

        assert temporary.base_storage is None, \
            "Cannot promote temporaries with base_storage to global"

        # `hw_inames`: The set of hw-parallel tagged inames that this temporary
        # is associated with. This is used for determining the shape of the
        # global storage needed for saving and restoring the temporary across
        # kernel calls.
        #
        # TODO: Make a policy decision about which dimensions to use. Currently,
        # the code looks at each instruction that defines or uses the temporary,
        # and takes the common set of hw-parallel tagged inames associated with
        # these instructions.
        #
        # Furthermore, in the case of local temporaries, inames that are tagged
        # hw-local do not contribute to the global storage shape.
        hw_inames = get_common_hw_inames(
            kernel, def_lists[temporary.name] + use_lists[temporary.name])

        # This takes advantage of the fact that g < l in the alphabet :)
        hw_inames = sorted(hw_inames,
                           key=lambda iname: str(kernel.iname_to_tag[iname]))

        # Calculate the sizes of the dimensions that get added in front for
        # the global storage of the temporary.
        shape_prefix = []

        backing_hw_inames = []
        for iname in hw_inames:
            tag = kernel.iname_to_tag[iname]
            is_local_iname = isinstance(tag, LocalIndexTag)
            if is_local_iname and temporary.scope == temp_var_scope.LOCAL:
                # Restrict shape to that of group inames for locals.
                continue
            backing_hw_inames.append(iname)
            from loopy.isl_helpers import static_max_of_pw_aff
            from loopy.symbolic import aff_to_expr
            shape_prefix.append(
                aff_to_expr(
                    static_max_of_pw_aff(
                        kernel.get_iname_bounds(iname).size, False)))

        backing_temporary = PromotedTemporary(name=name_gen(temporary.name),
                                              orig_temporary=temporary,
                                              shape_prefix=tuple(shape_prefix),
                                              hw_inames=backing_hw_inames)
        new_temporaries[temporary.name] = backing_temporary

    return new_temporaries
Example #18
0
    def base_index_and_length(self,
                              set,
                              iname,
                              context=None,
                              n_allowed_params_in_length=None):
        """
        :arg n_allowed_params_in_length: Simplifies the 'length'
            argument so that only the first that many params
            (in the domain of *set*) occur.
        """
        if not isinstance(iname, int):
            iname_to_dim = set.space.get_var_dict()
            idx = iname_to_dim[iname][1]
        else:
            idx = iname

        lower_bound_pw_aff = self.dim_min(set, idx)
        upper_bound_pw_aff = self.dim_max(set, idx)

        from loopy.diagnostic import StaticValueFindingError
        from loopy.isl_helpers import (static_max_of_pw_aff,
                                       static_min_of_pw_aff,
                                       static_value_of_pw_aff,
                                       find_max_of_pwaff_with_params)
        from loopy.symbolic import pw_aff_to_expr

        # {{{ first: try to find static lower bound value

        try:
            base_index_aff = static_value_of_pw_aff(lower_bound_pw_aff,
                                                    constants_only=False,
                                                    context=context)
        except StaticValueFindingError:
            base_index_aff = None

        if base_index_aff is not None:
            base_index = pw_aff_to_expr(base_index_aff)

            length = find_max_of_pwaff_with_params(
                upper_bound_pw_aff - base_index_aff + 1,
                n_allowed_params_in_length)
            length = pw_aff_to_expr(
                static_max_of_pw_aff(length,
                                     constants_only=False,
                                     context=context))

            return base_index, length

        # }}}

        # {{{ if that didn't work, try finding a lower bound

        base_index_aff = static_min_of_pw_aff(lower_bound_pw_aff,
                                              constants_only=False,
                                              context=context)

        base_index = pw_aff_to_expr(base_index_aff)

        length = find_max_of_pwaff_with_params(
            upper_bound_pw_aff - base_index_aff + 1,
            n_allowed_params_in_length)
        length = pw_aff_to_expr(
            static_max_of_pw_aff(length, constants_only=False,
                                 context=context))

        return base_index, length
Example #19
0
    def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False):
        """Return a tuple (global_size, local_size) containing a grid that
        could accommodate execution of all instructions whose IDs are given
        in *insn_ids*.

        :arg insn_ids: a :class:`frozenset` of instruction IDs

        *global_size* and *local_size* are :class:`islpy.PwAff` objects.
        """

        all_inames_by_insns = set()
        for insn_id in insn_ids:
            all_inames_by_insns |= self.insn_inames(insn_id)

        if not all_inames_by_insns <= self.all_inames():
            raise RuntimeError("some inames collected from instructions (%s) "
                    "are not present in domain (%s)"
                    % (", ".join(sorted(all_inames_by_insns)),
                        ", ".join(sorted(self.all_inames()))))

        global_sizes = {}
        local_sizes = {}

        from loopy.kernel.data import (
                GroupIndexTag, LocalIndexTag,
                AutoLocalIndexTagBase)

        for iname in all_inames_by_insns:
            tag = self.iname_to_tag.get(iname)

            if isinstance(tag, GroupIndexTag):
                tgt_dict = global_sizes
            elif isinstance(tag, LocalIndexTag):
                tgt_dict = local_sizes
            elif isinstance(tag, AutoLocalIndexTagBase) and not ignore_auto:
                raise RuntimeError("cannot find grid sizes if automatic "
                        "local index tags are present")
            else:
                tgt_dict = None

            if tgt_dict is None:
                continue

            size = self.get_iname_bounds(iname).size

            if tag.axis in tgt_dict:
                size = tgt_dict[tag.axis].max(size)

            from loopy.isl_helpers import static_max_of_pw_aff
            try:
                # insist block size is constant
                size = static_max_of_pw_aff(size,
                        constants_only=isinstance(tag, LocalIndexTag))
            except ValueError:
                pass

            tgt_dict[tag.axis] = size

        def to_dim_tuple(size_dict, which, forced_sizes={}):
            forced_sizes = forced_sizes.copy()

            size_list = []
            sorted_axes = sorted(six.iterkeys(size_dict))

            while sorted_axes or forced_sizes:
                if sorted_axes:
                    cur_axis = sorted_axes.pop(0)
                else:
                    cur_axis = None

                if len(size_list) in forced_sizes:
                    size_list.append(forced_sizes.pop(len(size_list)))
                    continue

                assert cur_axis is not None

                if cur_axis > len(size_list):
                    raise RuntimeError("%s axis %d unused" % (
                        which, len(size_list)))

                size_list.append(size_dict[cur_axis])

            return tuple(size_list)

        return (to_dim_tuple(global_sizes, "global"),
                to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes))
Example #20
0
File: ilp.py Project: spillai/loopy
def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
    if iname is not None:
        logger.debug("%s: add axes to temporaries for ilp" % kernel.name)

    wmap = kernel.writer_map()

    from loopy.kernel.data import IlpBaseTag, VectorizeTag

    var_to_new_ilp_inames = {}

    # {{{ find variables that need extra indices

    for tv in six.itervalues(kernel.temporary_variables):
        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]

            if iname is None:
                ilp_inames = frozenset(
                    iname for iname in kernel.insn_inames(writer_insn)
                    if isinstance(kernel.iname_to_tag.get(iname), (
                        IlpBaseTag, VectorizeTag)))
            else:
                if not isinstance(kernel.iname_to_tag.get(iname),
                                  (IlpBaseTag, VectorizeTag)):
                    raise LoopyError("'%s' is not an ILP iname" % iname)

                ilp_inames = frozenset([iname])

            referenced_ilp_inames = (ilp_inames
                                     & writer_insn.write_dependency_names())

            new_ilp_inames = ilp_inames - referenced_ilp_inames

            if not new_ilp_inames:
                break

            if tv.name in var_to_new_ilp_inames:
                if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
                    raise LoopyError(
                        "instruction '%s' requires adding "
                        "indices for ILP inames '%s' on var '%s', but previous "
                        "instructions required inames '%s'" %
                        (writer_insn_id, ", ".join(new_ilp_inames), ", ".join(
                            var_to_new_ilp_inames[tv.name])))

                continue

            var_to_new_ilp_inames[tv.name] = set(new_ilp_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    ilp_iname_to_length = {}
    for ilp_inames in six.itervalues(var_to_new_ilp_inames):
        for iname in ilp_inames:
            if iname in ilp_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=True)
            ilp_iname_to_length[iname] = int(
                pw_aff_to_expr(
                    static_max_of_pw_aff(bounds.size, constants_only=True)))

            assert static_max_of_pw_aff(bounds.lower_bound_pw_aff,
                                        constants_only=True).plain_is_zero()

    # }}}

    # {{{ change temporary variables

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in six.iteritems(var_to_new_ilp_inames):
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(ilp_iname_to_length[iname] for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if isinstance(kernel.iname_to_tag.get(iname), VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(
            shape=shape + extra_shape,
            # Forget what you knew about data layout,
            # create from scratch.
            dim_tags=dim_tags,
            dim_names=None)

    # }}}

    from pymbolic import var
    eiii = ExtraInameIndexInserter(
        dict((var_name, tuple(var(iname) for iname in inames))
             for var_name, inames in six.iteritems(var_to_new_ilp_inames)))

    new_insns = [
        insn.with_transformed_expressions(eiii) for insn in kernel.instructions
    ]

    return kernel.copy(temporary_variables=new_temp_vars,
                       instructions=new_insns)
Example #21
0
def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
    """
    :arg inames: fastest varying last
    :arg within: a stack match as understood by
        :func:`loopy.match.parse_stack_match`.
    """

    # now fastest varying first
    inames = inames[::-1]

    if new_iname is None:
        new_iname = kernel.get_var_name_generator()("_and_".join(inames))

    from loopy.kernel.tools import DomainChanger
    domch = DomainChanger(kernel, frozenset(inames))
    for iname in inames:
        if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
            raise LoopyError("iname '%s' is not 'at home' in the "
                             "join's leaf domain" % iname)

    new_domain = domch.domain
    new_dim_idx = new_domain.dim(dim_type.set)
    new_domain = new_domain.add_dims(dim_type.set, 1)
    new_domain = new_domain.set_dim_name(dim_type.set, new_dim_idx, new_iname)

    joint_aff = zero = isl.Aff.zero_on_domain(new_domain.space)
    subst_dict = {}
    base_divisor = 1

    from pymbolic import var

    for i, iname in enumerate(inames):
        iname_dt, iname_idx = zero.get_space().get_var_dict()[iname]
        iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1)

        joint_aff = joint_aff + base_divisor * iname_aff

        bounds = kernel.get_iname_bounds(iname, constants_only=True)

        from loopy.isl_helpers import (static_max_of_pw_aff,
                                       static_value_of_pw_aff)
        from loopy.symbolic import pw_aff_to_expr

        length = int(
            pw_aff_to_expr(
                static_max_of_pw_aff(bounds.size, constants_only=True)))

        try:
            lower_bound_aff = static_value_of_pw_aff(
                bounds.lower_bound_pw_aff.coalesce(), constants_only=False)
        except Exception as e:
            raise type(e)("while finding lower bound of '%s': " % iname)

        my_val = var(new_iname) // base_divisor
        if i + 1 < len(inames):
            my_val %= length
        my_val += pw_aff_to_expr(lower_bound_aff)
        subst_dict[iname] = my_val

        base_divisor *= length

    from loopy.isl_helpers import iname_rel_aff
    new_domain = new_domain.add_constraint(
        isl.Constraint.equality_from_aff(
            iname_rel_aff(new_domain.get_space(), new_iname, "==", joint_aff)))

    for i, iname in enumerate(inames):
        iname_to_dim = new_domain.get_space().get_var_dict()
        iname_dt, iname_idx = iname_to_dim[iname]

        if within is None:
            new_domain = new_domain.project_out(iname_dt, iname_idx, 1)

    def subst_within_inames(fid):
        result = set()
        for iname in fid:
            if iname in inames:
                result.add(new_iname)
            else:
                result.add(iname)

        return frozenset(result)

    new_insns = [
        insn.copy(within_inames=subst_within_inames(insn.within_inames))
        for insn in kernel.instructions
    ]

    kernel = (kernel.copy(
        instructions=new_insns,
        domains=domch.get_domains_with(new_domain),
        applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_dict]))

    from loopy.match import parse_stack_match
    within = parse_stack_match(within)

    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
        kernel.substitutions, kernel.get_var_name_generator())
    ijoin = _InameJoiner(rule_mapping_context, within,
                         make_subst_func(subst_dict), inames, new_iname)

    kernel = rule_mapping_context.finish_kernel(ijoin.map_kernel(kernel))

    if tag is not None:
        kernel = tag_inames(kernel, {new_iname: tag})

    return kernel
Example #22
0
    def auto_promote_temporary(self, temporary_name):
        temporary = self.kernel.temporary_variables[temporary_name]

        if temporary.scope == temp_var_scope.GLOBAL:
            # Nothing to be done for global temporaries (I hope)
            return None

        if temporary.initializer is not None:
            # Temporaries with initializers do not need saving/reloading - the
            # code generation takes care of emitting the initializers.
            assert temporary.read_only
            return None

        if temporary.base_storage is not None:
            raise ValueError(
                "Cannot promote temporaries with base_storage to global")

        # `hw_inames`: The set of hw-parallel tagged inames that this temporary
        # is associated with. This is used for determining the shape of the
        # global storage needed for saving and restoring the temporary across
        # kernel calls.
        #
        # TODO: Make a policy decision about which dimensions to use. Currently,
        # the code looks at each instruction that defines or uses the temporary,
        # and takes the common set of hw-parallel tagged inames associated with
        # these instructions.
        #
        # Furthermore, in the case of local temporaries, inames that are tagged
        # hw-local do not contribute to the global storage shape.
        hw_inames = self.insn_query.common_hw_inames(
            self.insn_query.insns_reading_or_writing(temporary.name))

        # We want hw_inames to be arranged according to the order:
        #    g.0 < g.1 < ... < l.0 < l.1 < ...
        # Sorting lexicographically accomplishes this.
        hw_inames = sorted(
            hw_inames, key=lambda iname: str(self.kernel.iname_to_tag[iname]))

        # Calculate the sizes of the dimensions that get added in front for
        # the global storage of the temporary.
        hw_dims = []

        backing_hw_inames = []

        for iname in hw_inames:
            tag = self.kernel.iname_to_tag[iname]
            from loopy.kernel.data import LocalIndexTag
            is_local_iname = isinstance(tag, LocalIndexTag)
            if is_local_iname and temporary.scope == temp_var_scope.LOCAL:
                # Restrict shape to that of group inames for locals.
                continue
            backing_hw_inames.append(iname)
            from loopy.isl_helpers import static_max_of_pw_aff
            from loopy.symbolic import aff_to_expr
            hw_dims.append(
                aff_to_expr(
                    static_max_of_pw_aff(
                        self.kernel.get_iname_bounds(iname).size, False)))

        non_hw_dims = temporary.shape

        if len(non_hw_dims) == 0 and len(hw_dims) == 0:
            # Scalar not in hardware: ensure at least one dimension.
            non_hw_dims = (1, )

        backing_temporary = self.PromotedTemporary(
            name=self.var_name_gen(temporary.name + "_save_slot"),
            orig_temporary=temporary,
            hw_dims=tuple(hw_dims),
            non_hw_dims=non_hw_dims,
            hw_inames=backing_hw_inames)

        return backing_temporary
Example #23
0
def privatize_temporaries_with_inames(
        kernel, privatizing_inames, only_var_names=None):
    """This function provides each loop iteration of the *privatizing_inames*
    with its own private entry in the temporaries it accesses (possibly
    restricted to *only_var_names*).

    This is accomplished implicitly as part of generating instruction-level
    parallelism by the "ILP" tag and accessible separately through this
    transformation.

    Example::

        for imatrix, i
            acc = 0
            for k
                acc = acc + a[imatrix, i, k] * vec[k]
            end
        end

    might become::

        for imatrix, i
            acc[imatrix] = 0
            for k
                acc[imatrix] = acc[imatrix] + a[imatrix, i, k] * vec[k]
            end
        end

    facilitating loop interchange of the *imatrix* loop.
    .. versionadded:: 2018.1
    """

    if isinstance(privatizing_inames, str):
        privatizing_inames = frozenset(
                s.strip()
                for s in privatizing_inames.split(","))

    if isinstance(only_var_names, str):
        only_var_names = frozenset(
                s.strip()
                for s in only_var_names.split(","))

    wmap = kernel.writer_map()

    var_to_new_priv_axis_iname = {}

    # {{{ find variables that need extra indices

    for tv in six.itervalues(kernel.temporary_variables):
        if only_var_names is not None and tv.name not in only_var_names:
            continue

        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]

            priv_axis_inames = kernel.insn_inames(writer_insn) & privatizing_inames

            referenced_priv_axis_inames = (priv_axis_inames
                    & writer_insn.write_dependency_names())

            new_priv_axis_inames = priv_axis_inames - referenced_priv_axis_inames

            if not new_priv_axis_inames:
                break

            if tv.name in var_to_new_priv_axis_iname:
                if new_priv_axis_inames != set(var_to_new_priv_axis_iname[tv.name]):
                    raise LoopyError("instruction '%s' requires adding "
                            "indices for privatizing var '%s' on iname(s) '%s', "
                            "but previous instructions required inames '%s'"
                            % (writer_insn_id, tv.name,
                                ", ".join(new_priv_axis_inames),
                                ", ".join(var_to_new_priv_axis_iname[tv.name])))

                continue

            var_to_new_priv_axis_iname[tv.name] = set(new_priv_axis_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    priv_axis_iname_to_length = {}
    for priv_axis_inames in six.itervalues(var_to_new_priv_axis_iname):
        for iname in priv_axis_inames:
            if iname in priv_axis_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=False)
            priv_axis_iname_to_length[iname] = pw_aff_to_expr(
                        static_max_of_pw_aff(bounds.size, constants_only=False))

            assert static_max_of_pw_aff(
                    bounds.lower_bound_pw_aff, constants_only=True).plain_is_zero()

    # }}}

    # {{{ change temporary variables

    from loopy.kernel.data import VectorizeTag

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in six.iteritems(var_to_new_priv_axis_iname):
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(priv_axis_iname_to_length[iname] for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if kernel.iname_tags_of_type(iname, VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(shape=shape + extra_shape,
                # Forget what you knew about data layout,
                # create from scratch.
                dim_tags=dim_tags,
                dim_names=None)

    # }}}

    from pymbolic import var
    var_to_extra_iname = dict(
            (var_name, tuple(var(iname) for iname in inames))
            for var_name, inames in six.iteritems(var_to_new_priv_axis_iname))

    new_insns = []

    for insn in kernel.instructions:
        eiii = ExtraInameIndexInserter(var_to_extra_iname)
        new_insn = insn.with_transformed_expressions(eiii)
        if not eiii.seen_priv_axis_inames <= insn.within_inames:
            raise LoopyError(
                    "Kernel '%s': Instruction '%s': touched variable that "
                    "(for privatization, e.g. as performed for ILP) "
                    "required iname(s) '%s', but that the instruction was not "
                    "previously within the iname(s). To remedy this, first promote"
                    "the instruction into the iname."
                    % (kernel.name, insn.id, ", ".join(
                        eiii.seen_priv_axis_inames - insn.within_inames)))

        new_insns.append(new_insn)

    return kernel.copy(
        temporary_variables=new_temp_vars,
        instructions=new_insns)
Example #24
0
def generate_sequential_loop_dim_code(codegen_state, sched_index):
    kernel = codegen_state.kernel

    ecm = codegen_state.expression_to_code_mapper
    loop_iname = kernel.schedule[sched_index].iname

    slabs = get_slab_decomposition(kernel, loop_iname)

    from loopy.codegen.bounds import get_usable_inames_for_conditional

    # Note: this does not include loop_iname itself!
    usable_inames = get_usable_inames_for_conditional(kernel, sched_index)
    domain = kernel.get_inames_domain(loop_iname)

    result = []

    for slab_name, slab in slabs:
        cmt = "%s slab for '%s'" % (slab_name, loop_iname)
        if len(slabs) == 1:
            cmt = None

        # {{{ find bounds

        aligned_domain = isl.align_spaces(domain,
                                          slab,
                                          across_dim_types=True,
                                          obj_bigger_ok=True)

        dom_and_slab = aligned_domain & slab

        assumptions_non_param = isl.BasicSet.from_params(kernel.assumptions)
        dom_and_slab, assumptions_non_param = isl.align_two(
            dom_and_slab, assumptions_non_param)
        dom_and_slab = dom_and_slab & assumptions_non_param

        # move inames that are usable into parameters
        moved_inames = []
        for iname in dom_and_slab.get_var_names(dim_type.set):
            if iname in usable_inames:
                moved_inames.append(iname)
                dt, idx = dom_and_slab.get_var_dict()[iname]
                dom_and_slab = dom_and_slab.move_dims(
                    dim_type.param, dom_and_slab.dim(dim_type.param), dt, idx,
                    1)

        _, loop_iname_idx = dom_and_slab.get_var_dict()[loop_iname]

        from loopy.isl_helpers import (static_min_of_pw_aff,
                                       static_max_of_pw_aff)

        lbound = (kernel.cache_manager.dim_min(
            dom_and_slab, loop_iname_idx).gist(kernel.assumptions).coalesce())
        ubound = (kernel.cache_manager.dim_max(
            dom_and_slab, loop_iname_idx).gist(kernel.assumptions).coalesce())

        static_lbound = static_min_of_pw_aff(lbound, constants_only=False)
        static_ubound = static_max_of_pw_aff(ubound, constants_only=False)

        # }}}

        # {{{ find implemented slab, build inner code

        from loopy.isl_helpers import make_slab_from_bound_pwaffs

        # impl_slab may be overapproximated
        impl_slab = make_slab_from_bound_pwaffs(dom_and_slab.space, loop_iname,
                                                static_lbound, static_ubound)

        for iname in moved_inames:
            dt, idx = impl_slab.get_var_dict()[iname]
            impl_slab = impl_slab.move_dims(dim_type.set,
                                            impl_slab.dim(dim_type.set), dt,
                                            idx, 1)

        new_codegen_state = (codegen_state.intersect(impl_slab).copy(
            kernel=intersect_kernel_with_slab(kernel, slab, iname)))

        inner = build_loop_nest(new_codegen_state, sched_index + 1)

        # }}}

        if cmt is not None:
            result.append(codegen_state.ast_builder.emit_comment(cmt))

        from loopy.symbolic import aff_to_expr

        astb = codegen_state.ast_builder

        if (static_ubound - static_lbound).plain_is_zero():
            # single-trip, generate just a variable assignment, not a loop
            result.append(
                merge_codegen_results(codegen_state, [
                    astb.emit_initializer(codegen_state,
                                          kernel.index_dtype,
                                          loop_iname,
                                          ecm(aff_to_expr(static_lbound),
                                              PREC_NONE, "i"),
                                          is_const=True),
                    astb.emit_blank_line(),
                    inner,
                ]))

        else:
            inner_ast = inner.current_ast(codegen_state)
            result.append(
                inner.with_new_ast(
                    codegen_state,
                    astb.emit_sequential_loop(codegen_state, loop_iname,
                                              kernel.index_dtype,
                                              static_lbound, static_ubound,
                                              inner_ast)))

    return merge_codegen_results(codegen_state, result)
Example #25
0
def guess_var_shape(kernel, var_name):
    from loopy.symbolic import SubstitutionRuleExpander, AccessRangeMapper

    armap = AccessRangeMapper(kernel, var_name)

    submap = SubstitutionRuleExpander(kernel.substitutions)

    def run_through_armap(expr):
        armap(submap(expr), kernel.insn_inames(insn))
        return expr

    try:
        for insn in kernel.instructions:
            insn.with_transformed_expressions(run_through_armap)
    except TypeError as e:
        from traceback import print_exc
        print_exc()

        raise LoopyError(
            "Failed to (automatically, as requested) find "
            "shape/strides for variable '%s'. "
            "Specifying the shape manually should get rid of this. "
            "The following error occurred: %s" % (var_name, str(e)))

    if armap.access_range is None:
        if armap.bad_subscripts:
            from loopy.symbolic import LinearSubscript
            if any(
                    isinstance(sub, LinearSubscript)
                    for sub in armap.bad_subscripts):
                raise LoopyError(
                    "cannot determine access range for '%s': "
                    "linear subscript(s) in '%s'" %
                    (var_name, ", ".join(str(i)
                                         for i in armap.bad_subscripts)))

            n_axes_in_subscripts = set(
                len(sub.index_tuple) for sub in armap.bad_subscripts)

            if len(n_axes_in_subscripts) != 1:
                raise RuntimeError("subscripts of '%s' with differing "
                                   "numbers of axes were found" % var_name)

            n_axes, = n_axes_in_subscripts

            if n_axes == 1:
                # Leave shape undetermined--we can live with that for 1D.
                shape = (None, )
            else:
                raise LoopyError(
                    "cannot determine access range for '%s': "
                    "undetermined index in subscript(s) '%s'" %
                    (var_name, ", ".join(str(i)
                                         for i in armap.bad_subscripts)))

        else:
            # no subscripts found, let's call it a scalar
            shape = ()
    else:
        from loopy.isl_helpers import static_max_of_pw_aff
        from loopy.symbolic import pw_aff_to_expr

        shape = []
        for i in range(armap.access_range.dim(dim_type.set)):
            try:
                shape.append(
                    pw_aff_to_expr(
                        static_max_of_pw_aff(kernel.cache_manager.dim_max(
                            armap.access_range, i) + 1,
                                             constants_only=False)))
            except:
                print("While trying to find shape axis %d of "
                      "variable '%s', the following "
                      "exception occurred:" % (i, var_name),
                      file=sys.stderr)
                print("*** ADVICE: You may need to manually specify the "
                      "shape of argument '%s'." % (var_name),
                      file=sys.stderr)
                raise

        shape = tuple(shape)

    return shape
Example #26
0
def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
    """
    :arg inames: fastest varying last
    :arg within: a stack match as understood by
        :func:`loopy.context_matching.parse_stack_match`.
    """

    # now fastest varying first
    inames = inames[::-1]

    if new_iname is None:
        new_iname = kernel.get_var_name_generator()("_and_".join(inames))

    from loopy.kernel.tools import DomainChanger
    domch = DomainChanger(kernel, frozenset(inames))
    for iname in inames:
        if kernel.get_home_domain_index(iname) != domch.leaf_domain_index:
            raise LoopyError("iname '%s' is not 'at home' in the "
                    "join's leaf domain" % iname)

    new_domain = domch.domain
    new_dim_idx = new_domain.dim(dim_type.set)
    new_domain = new_domain.add_dims(dim_type.set, 1)
    new_domain = new_domain.set_dim_name(dim_type.set, new_dim_idx, new_iname)

    joint_aff = zero = isl.Aff.zero_on_domain(new_domain.space)
    subst_dict = {}
    base_divisor = 1

    from pymbolic import var

    for i, iname in enumerate(inames):
        iname_dt, iname_idx = zero.get_space().get_var_dict()[iname]
        iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1)

        joint_aff = joint_aff + base_divisor*iname_aff

        bounds = kernel.get_iname_bounds(iname, constants_only=True)

        from loopy.isl_helpers import (
                static_max_of_pw_aff, static_value_of_pw_aff)
        from loopy.symbolic import pw_aff_to_expr

        length = int(pw_aff_to_expr(
            static_max_of_pw_aff(bounds.size, constants_only=True)))

        try:
            lower_bound_aff = static_value_of_pw_aff(
                    bounds.lower_bound_pw_aff.coalesce(),
                    constants_only=False)
        except Exception as e:
            raise type(e)("while finding lower bound of '%s': " % iname)

        my_val = var(new_iname) // base_divisor
        if i+1 < len(inames):
            my_val %= length
        my_val += pw_aff_to_expr(lower_bound_aff)
        subst_dict[iname] = my_val

        base_divisor *= length

    from loopy.isl_helpers import iname_rel_aff
    new_domain = new_domain.add_constraint(
            isl.Constraint.equality_from_aff(
                iname_rel_aff(new_domain.get_space(), new_iname, "==", joint_aff)))

    for i, iname in enumerate(inames):
        iname_to_dim = new_domain.get_space().get_var_dict()
        iname_dt, iname_idx = iname_to_dim[iname]

        if within is None:
            new_domain = new_domain.project_out(iname_dt, iname_idx, 1)

    def subst_forced_iname_deps(fid):
        result = set()
        for iname in fid:
            if iname in inames:
                result.add(new_iname)
            else:
                result.add(iname)

        return frozenset(result)

    new_insns = [
            insn.copy(
                forced_iname_deps=subst_forced_iname_deps(insn.forced_iname_deps))
            for insn in kernel.instructions]

    kernel = (kernel
            .copy(
                instructions=new_insns,
                domains=domch.get_domains_with(new_domain),
                applied_iname_rewrites=kernel.applied_iname_rewrites + [subst_dict]
                ))

    from loopy.context_matching import parse_stack_match
    within = parse_stack_match(within)

    from pymbolic.mapper.substitutor import make_subst_func
    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, kernel.get_var_name_generator())
    ijoin = _InameJoiner(rule_mapping_context, within,
            make_subst_func(subst_dict),
            inames, new_iname)

    kernel = rule_mapping_context.finish_kernel(
            ijoin.map_kernel(kernel))

    if tag is not None:
        kernel = tag_inames(kernel, {new_iname: tag})

    return kernel
Example #27
0
def privatize_temporaries_with_inames(kernel,
                                      privatizing_inames,
                                      only_var_names=None):
    """This function provides each loop iteration of the *privatizing_inames*
    with its own private entry in the temporaries it accesses (possibly
    restricted to *only_var_names*).

    This is accomplished implicitly as part of generating instruction-level
    parallelism by the "ILP" tag and accessible separately through this
    transformation.

    Example::

        for imatrix, i
            acc = 0
            for k
                acc = acc + a[imatrix, i, k] * vec[k]
            end
        end

    might become::

        for imatrix, i
            acc[imatrix] = 0
            for k
                acc[imatrix] = acc[imatrix] + a[imatrix, i, k] * vec[k]
            end
        end

    facilitating loop interchange of the *imatrix* loop.
    .. versionadded:: 2018.1
    """

    if isinstance(privatizing_inames, str):
        privatizing_inames = frozenset(s.strip()
                                       for s in privatizing_inames.split(","))

    if isinstance(only_var_names, str):
        only_var_names = frozenset(s.strip()
                                   for s in only_var_names.split(","))

    wmap = kernel.writer_map()

    var_to_new_priv_axis_iname = {}

    # {{{ find variables that need extra indices

    for tv in kernel.temporary_variables.values():
        if only_var_names is not None and tv.name not in only_var_names:
            continue

        for writer_insn_id in wmap.get(tv.name, []):
            writer_insn = kernel.id_to_insn[writer_insn_id]

            priv_axis_inames = writer_insn.within_inames & privatizing_inames

            referenced_priv_axis_inames = (
                priv_axis_inames
                & writer_insn.write_dependency_names())

            new_priv_axis_inames = priv_axis_inames - referenced_priv_axis_inames

            if not new_priv_axis_inames:
                break

            if tv.name in var_to_new_priv_axis_iname:
                if new_priv_axis_inames != set(
                        var_to_new_priv_axis_iname[tv.name]):
                    raise LoopyError(
                        "instruction '%s' requires adding "
                        "indices for privatizing var '%s' on iname(s) '%s', "
                        "but previous instructions required inames '%s'" %
                        (writer_insn_id, tv.name,
                         ", ".join(new_priv_axis_inames), ", ".join(
                             var_to_new_priv_axis_iname[tv.name])))

                continue

            var_to_new_priv_axis_iname[tv.name] = set(new_priv_axis_inames)

    # }}}

    # {{{ find ilp iname lengths

    from loopy.isl_helpers import static_max_of_pw_aff
    from loopy.symbolic import pw_aff_to_expr

    priv_axis_iname_to_length = {}
    iname_to_lbound = {}
    for priv_axis_inames in var_to_new_priv_axis_iname.values():
        for iname in priv_axis_inames:
            if iname in priv_axis_iname_to_length:
                continue

            bounds = kernel.get_iname_bounds(iname, constants_only=False)
            priv_axis_iname_to_length[iname] = pw_aff_to_expr(
                static_max_of_pw_aff(bounds.size, constants_only=False))
            iname_to_lbound[iname] = pw_aff_to_expr(bounds.lower_bound_pw_aff)

    # }}}

    # {{{ change temporary variables

    from loopy.kernel.data import VectorizeTag

    new_temp_vars = kernel.temporary_variables.copy()
    for tv_name, inames in var_to_new_priv_axis_iname.items():
        tv = new_temp_vars[tv_name]
        extra_shape = tuple(priv_axis_iname_to_length[iname]
                            for iname in inames)

        shape = tv.shape
        if shape is None:
            shape = ()

        dim_tags = ["c"] * (len(shape) + len(extra_shape))
        for i, iname in enumerate(inames):
            if kernel.iname_tags_of_type(iname, VectorizeTag):
                dim_tags[len(shape) + i] = "vec"

        new_temp_vars[tv.name] = tv.copy(
            shape=shape + extra_shape,
            # Forget what you knew about data layout,
            # create from scratch.
            dim_tags=dim_tags,
            dim_names=None)

    # }}}

    from pymbolic import var
    var_to_extra_iname = {
        var_name: tuple(var(iname) for iname in inames)
        for var_name, inames in var_to_new_priv_axis_iname.items()
    }

    new_insns = []

    for insn in kernel.instructions:
        eiii = ExtraInameIndexInserter(var_to_extra_iname, iname_to_lbound)
        new_insn = insn.with_transformed_expressions(eiii)
        if not eiii.seen_priv_axis_inames <= insn.within_inames:
            raise LoopyError(
                "Kernel '%s': Instruction '%s': touched variable that "
                "(for privatization, e.g. as performed for ILP) "
                "required iname(s) '%s', but that the instruction was not "
                "previously within the iname(s). To remedy this, first promote"
                "the instruction into the iname." %
                (kernel.name, insn.id,
                 ", ".join(eiii.seen_priv_axis_inames - insn.within_inames)))

        new_insns.append(new_insn)

    return kernel.copy(temporary_variables=new_temp_vars,
                       instructions=new_insns)
Example #28
0
    def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False):
        """Return a tuple (global_size, local_size) containing a grid that
        could accommodate execution of all instructions whose IDs are given
        in *insn_ids*.

        :arg insn_ids: a :class:`frozenset` of instruction IDs

        *global_size* and *local_size* are :class:`islpy.PwAff` objects.
        """

        all_inames_by_insns = set()
        for insn_id in insn_ids:
            all_inames_by_insns |= self.insn_inames(insn_id)

        if not all_inames_by_insns <= self.all_inames():
            raise RuntimeError("some inames collected from instructions (%s) "
                               "are not present in domain (%s)" %
                               (", ".join(sorted(all_inames_by_insns)),
                                ", ".join(sorted(self.all_inames()))))

        global_sizes = {}
        local_sizes = {}

        from loopy.kernel.data import (GroupIndexTag, LocalIndexTag,
                                       AutoLocalIndexTagBase)

        for iname in all_inames_by_insns:
            tag = self.iname_to_tag.get(iname)

            if isinstance(tag, GroupIndexTag):
                tgt_dict = global_sizes
            elif isinstance(tag, LocalIndexTag):
                tgt_dict = local_sizes
            elif isinstance(tag, AutoLocalIndexTagBase) and not ignore_auto:
                raise RuntimeError("cannot find grid sizes if automatic "
                                   "local index tags are present")
            else:
                tgt_dict = None

            if tgt_dict is None:
                continue

            size = self.get_iname_bounds(iname).size

            if tag.axis in tgt_dict:
                size = tgt_dict[tag.axis].max(size)

            from loopy.isl_helpers import static_max_of_pw_aff
            try:
                # insist block size is constant
                size = static_max_of_pw_aff(size,
                                            constants_only=isinstance(
                                                tag, LocalIndexTag))
            except ValueError:
                pass

            tgt_dict[tag.axis] = size

        def to_dim_tuple(size_dict, which, forced_sizes={}):
            forced_sizes = forced_sizes.copy()

            size_list = []
            sorted_axes = sorted(six.iterkeys(size_dict))

            while sorted_axes or forced_sizes:
                if sorted_axes:
                    cur_axis = sorted_axes.pop(0)
                else:
                    cur_axis = None

                if len(size_list) in forced_sizes:
                    size_list.append(forced_sizes.pop(len(size_list)))
                    continue

                assert cur_axis is not None

                if cur_axis > len(size_list):
                    raise RuntimeError("%s axis %d unused" %
                                       (which, len(size_list)))

                size_list.append(size_dict[cur_axis])

            return tuple(size_list)

        return (to_dim_tuple(global_sizes, "global"),
                to_dim_tuple(local_sizes,
                             "local",
                             forced_sizes=self.local_sizes))