Beispiel #1
0
def check_that_temporaries_are_defined_in_subkernels_where_used(kernel):
    """Verify that every temporary read in a subkernel is defined there.

    For each subkernel of *kernel*, every temporary that is read but not
    written in that subkernel is accepted if one of the following holds:

    * it has an initializer,
    * it has a base storage that is also the base storage of a temporary
      written in the same subkernel (i.e. an alias is defined), or
    * it has no base storage and its address space is neither private
      nor local.

    :arg kernel: the kernel to check. Its ``temporary_variables`` mapping
        is indexed by temporary name.
    :raises loopy.diagnostic.MissingDefinitionError: if a temporary is
        read in a subkernel in which neither it nor any of its aliases
        is defined.
    """
    # These imports do not depend on the loop variables, so they are done
    # once up front rather than once per subkernel / per error site.
    from loopy.diagnostic import MissingDefinitionError
    from loopy.kernel.data import AddressSpace
    from loopy.kernel.tools import get_subkernels
    from loopy.schedule.tools import (temporaries_written_in_subkernel,
                                      temporaries_read_in_subkernel)

    undefined_address_spaces = (AddressSpace.PRIVATE, AddressSpace.LOCAL)

    for subkernel in get_subkernels(kernel):
        # Computed once per subkernel: needed both to collect the defined
        # base storages and to find the temporaries that are only read.
        written = temporaries_written_in_subkernel(kernel, subkernel)

        defined_base_storage = set()
        for temporary in written:
            tval = kernel.temporary_variables[temporary]
            if tval.base_storage is not None:
                defined_base_storage.add(tval.base_storage)

        for temporary in temporaries_read_in_subkernel(kernel, subkernel) - written:
            tval = kernel.temporary_variables[temporary]

            # Temporaries with an initializer carry their own definition.
            if tval.initializer is not None:
                continue

            # For aliased temporaries, check if there is an aliased definition.
            if tval.base_storage is not None:
                if tval.base_storage not in defined_base_storage:
                    raise MissingDefinitionError(
                        "temporary variable '%s' gets "
                        "used in subkernel '%s' and neither it nor its "
                        "aliases have a definition" % (temporary, subkernel))
                continue

            if tval.address_space in undefined_address_spaces:
                raise MissingDefinitionError(
                    "temporary variable '%s' gets used "
                    "in subkernel '%s' without a definition (maybe you forgot "
                    "to call loopy.save_and_reload_temporaries?)" %
                    (temporary, subkernel))
Beispiel #2
0
def check_that_temporaries_are_defined_in_subkernels_where_used(kernel):
    """Check that each subkernel defines the temporaries it reads.

    A temporary that is read but not written within a subkernel passes the
    check when it has an initializer, when some temporary written in the
    same subkernel shares its base storage, or when it has no base storage
    and lives in an address space other than private or local.

    :arg kernel: the kernel whose subkernels are checked; temporaries are
        looked up by name in ``kernel.temporary_variables``.
    :raises loopy.diagnostic.MissingDefinitionError: when a temporary is
        read in a subkernel without a definition of it (or of one of its
        aliases) in that subkernel.
    """
    # Hoisted out of the loops: none of these depend on the subkernel.
    from loopy.diagnostic import MissingDefinitionError
    from loopy.kernel.data import AddressSpace
    from loopy.kernel.tools import get_subkernels
    from loopy.schedule.tools import (
            temporaries_written_in_subkernel, temporaries_read_in_subkernel)

    for subkernel in get_subkernels(kernel):
        # Query the written set a single time and reuse it below instead
        # of recomputing it for the set difference.
        written_temps = temporaries_written_in_subkernel(kernel, subkernel)
        read_only_temps = (
                temporaries_read_in_subkernel(kernel, subkernel) - written_temps)

        # Base storages that receive a definition through any written alias.
        defined_base_storage = set()
        for temporary in written_temps:
            tval = kernel.temporary_variables[temporary]
            if tval.base_storage is not None:
                defined_base_storage.add(tval.base_storage)

        for temporary in read_only_temps:
            tval = kernel.temporary_variables[temporary]

            if tval.initializer is not None:
                # The initializer serves as the definition.
                continue

            # For aliased temporaries, check if there is an aliased definition.
            if tval.base_storage is not None:
                if tval.base_storage not in defined_base_storage:
                    raise MissingDefinitionError("temporary variable '%s' gets "
                            "used in subkernel '%s' and neither it nor its "
                            "aliases have a definition" % (temporary, subkernel))
                continue

            if tval.address_space in (AddressSpace.PRIVATE, AddressSpace.LOCAL):
                raise MissingDefinitionError("temporary variable '%s' gets used "
                        "in subkernel '%s' without a definition (maybe you forgot "
                        "to call loopy.save_and_reload_temporaries?)"
                        % (temporary, subkernel))
Beispiel #3
0
def save_and_reload_temporaries(knl):
    """
    Insert save and reload instructions for temporaries that remain live
    across kernel calls.

    Schedule segments of the form::

        t = <...>
        <return followed by call>
        <...> = t

    are rewritten to::

        t = <...>
        t_save_slot = t
        <return followed by call>
        t = t_save_slot
        <...> = t

    with `t_save_slot` being a newly-created global temporary variable.

    :arg knl: the kernel to transform; its ``schedule`` is walked item by
        item.
    :returns: The resulting kernel, as produced by ``TemporarySaver.finish``
    """
    from loopy.schedule.tools import (temporaries_read_in_subkernel,
                                      temporaries_written_in_subkernel)

    liveness = LivenessAnalysis(knl)
    saver = TemporarySaver(knl)

    for sched_idx, sched_item in enumerate(knl.schedule):

        if isinstance(sched_item, CallKernel):
            name = sched_item.kernel_name

            # Partial writes are possible, so a live-out temporary must be
            # reloaded if the subkernel reads *or* writes it. Nothing is
            # live at the very first schedule item (kernel entry).
            candidates = (
                set() if sched_idx == 0
                else (temporaries_read_in_subkernel(knl, name)
                      | temporaries_written_in_subkernel(knl, name)))

            to_reload = liveness[sched_idx].live_out & candidates
            for temporary in to_reload:
                logger.info(
                    "reloading {0} at entry of {1}".format(temporary, name))
                saver.reload(temporary, name)

        elif isinstance(sched_item, ReturnFromKernel):
            name = sched_item.kernel_name

            # Only temporaries written by the subkernel need saving; at
            # the final schedule item (kernel exit) nothing is live.
            candidates = (
                set() if sched_idx == len(knl.schedule) - 1
                else temporaries_written_in_subkernel(knl, name))

            to_save = liveness[sched_idx].live_in & candidates
            for temporary in to_save:
                logger.info(
                    "saving {0} before return of {1}".format(temporary, name))
                saver.save(temporary, name)

    return saver.finish()
Beispiel #4
0
    def get_temporary_decls(self, codegen_state, schedule_index):
        """Build the declarations of temporary variables for one subkernel.

        :arg codegen_state: the code generation state; its ``kernel``
            attribute supplies the temporaries and the schedule.
        :arg schedule_index: index into ``kernel.schedule`` of the item whose
            ``kernel_name`` identifies the current subkernel.
        :returns: a list holding the base storage declarations followed by
            the per-temporary declarations. If the list is non-empty, a
            ``cgen.Line()`` is appended at the end.
        """
        from loopy.kernel.data import AddressSpace

        kernel = codegen_state.kernel

        base_storage_decls = []
        temp_decls = []

        # {{{ declare temporaries

        # All three dicts are keyed by base storage name and collect one
        # entry per temporary that uses that base storage.
        base_storage_sizes = {}
        base_storage_to_scope = {}
        base_storage_to_align_bytes = {}

        from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line
        # Getting the temporary variables that are needed for the current
        # sub-kernel.
        from loopy.schedule.tools import (
                temporaries_read_in_subkernel,
                temporaries_written_in_subkernel)
        subkernel = kernel.schedule[schedule_index].kernel_name
        sub_knl_temps = (
                temporaries_read_in_subkernel(kernel, subkernel) |
                temporaries_written_in_subkernel(kernel, subkernel))

        # Iterate in name order so the emitted declarations have a
        # deterministic order.
        for tv in sorted(
                six.itervalues(kernel.temporary_variables),
                key=lambda tv: tv.name):
            decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype)

            if not tv.base_storage:
                # Temporary with its own storage: declared directly, but only
                # if it is non-global and used in this subkernel.
                for idi in decl_info:
                    # global temp vars are mapped to arguments or global declarations
                    if tv.address_space != AddressSpace.GLOBAL and (
                            tv.name in sub_knl_temps):
                        decl = self.wrap_temporary_decl(
                                self.get_temporary_decl(
                                    codegen_state, schedule_index, tv, idi),
                                tv.address_space)

                        if tv.initializer is not None:
                            # Initialized temporaries are expected to be
                            # read-only; the initializer becomes an array
                            # literal attached to the declaration.
                            assert tv.read_only
                            decl = Initializer(decl, generate_array_literal(
                                codegen_state, tv, tv.initializer))

                        temp_decls.append(decl)

            else:
                # Temporary living in a shared base storage: it is declared
                # as a pointer into that storage. Note that this branch does
                # not consult sub_knl_temps.
                assert tv.initializer is None

                # Byte offset into the base storage; restarts at zero for
                # every temporary and advances across its decl_info entries.
                offset = 0
                base_storage_sizes.setdefault(tv.base_storage, []).append(
                        tv.nbytes)
                base_storage_to_scope.setdefault(tv.base_storage, []).append(
                        tv.address_space)

                # Alignment requirement: the item size, scaled by the length
                # of every axis tagged as a vector axis.
                align_size = tv.dtype.itemsize

                from loopy.kernel.array import VectorArrayDimTag
                for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
                    if isinstance(dim_tag, VectorArrayDimTag):
                        align_size *= axis_len

                base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(
                        align_size)

                for idi in decl_info:
                    # cast_decl is an unnamed declarator used only to spell
                    # the cast type; temp_var_decl names the actual variable.
                    cast_decl = POD(self, idi.dtype, "")
                    temp_var_decl = POD(self, idi.dtype, idi.name)

                    cast_decl = self.wrap_temporary_decl(cast_decl, tv.address_space)
                    temp_var_decl = self.wrap_temporary_decl(
                            temp_var_decl, tv.address_space)

                    if tv._base_storage_access_may_be_aliasing:
                        ptrtype = _ConstPointer
                    else:
                        # The 'restrict' part of this is a complete lie--of course
                        # all these temporaries are aliased. But we're promising to
                        # not use them to shovel data from one representation to the
                        # other. That counts, right?
                        ptrtype = _ConstRestrictPointer

                    cast_decl = ptrtype(cast_decl)
                    temp_var_decl = ptrtype(temp_var_decl)

                    # Initialize the pointer with
                    # "(<cast type>) (<base storage> + <offset>)".
                    cast_tp, cast_d = cast_decl.get_decl_pair()
                    temp_var_decl = Initializer(
                            temp_var_decl,
                            "(%s %s) (%s + %s)" % (
                                " ".join(cast_tp), cast_d,
                                tv.base_storage,
                                offset))

                    temp_decls.append(temp_var_decl)

                    # Advance past this entry: item size times the number of
                    # elements given by its shape.
                    from pytools import product
                    offset += (
                            idi.dtype.itemsize
                            * product(si for si in idi.shape))

        ecm = self.get_expression_to_code_mapper(codegen_state)

        # Emit one "char" array per base storage, in name order.
        for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
            bs_var_decl = Value("char", bs_name)
            from pytools import single_valued
            # NOTE(review): single_valued presumably requires all temporaries
            # sharing this base storage to agree on the address space --
            # confirm against the pytools documentation.
            bs_var_decl = self.wrap_temporary_decl(
                    bs_var_decl, single_valued(base_storage_to_scope[bs_name]))

            # The storage must fit the largest temporary placed in it. With
            # only integer sizes the maximum is computed right here;
            # otherwise a symbolic Max is handed to the expression mapper.
            # FIXME: Could try to use isl knowledge to simplify max.
            if all(isinstance(bs, int) for bs in bs_sizes):
                bs_size_max = max(bs_sizes)
            else:
                bs_size_max = p.Max(tuple(bs_sizes))

            bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max))

            # Use the largest alignment recorded for this base storage.
            alignment = max(base_storage_to_align_bytes[bs_name])
            bs_var_decl = AlignedAttribute(alignment, bs_var_decl)

            base_storage_decls.append(bs_var_decl)

        # }}}

        # Base storages come first because the pointer declarations in
        # temp_decls refer to them by name.
        result = base_storage_decls + temp_decls

        if result:
            result.append(Line())

        return result
Beispiel #5
0
    def get_temporary_decls(self, codegen_state, schedule_index):
        """Return the temporary-variable declarations for a subkernel.

        :arg codegen_state: code generation state whose ``kernel`` attribute
            provides ``temporary_variables``, ``schedule`` and
            ``index_dtype``.
        :arg schedule_index: position in ``kernel.schedule`` of the item
            whose ``kernel_name`` names the subkernel being generated.
        :returns: a list of declarations: first one aligned ``char`` array
            per base storage, then the declarations of the individual
            temporaries. A trailing ``cgen.Line()`` is added when the list
            is not empty.
        """
        from loopy.kernel.data import AddressSpace

        kernel = codegen_state.kernel

        base_storage_decls = []
        temp_decls = []

        # {{{ declare temporaries

        # Per-base-storage bookkeeping (keyed by base storage name); each
        # temporary using a base storage appends one value to each list.
        base_storage_sizes = {}
        base_storage_to_scope = {}
        base_storage_to_align_bytes = {}

        from cgen import ArrayOf, Initializer, AlignedAttribute, Value, Line
        # Getting the temporary variables that are needed for the current
        # sub-kernel.
        from loopy.schedule.tools import (
                temporaries_read_in_subkernel,
                temporaries_written_in_subkernel)
        subkernel = kernel.schedule[schedule_index].kernel_name
        sub_knl_temps = (
                temporaries_read_in_subkernel(kernel, subkernel) |
                temporaries_written_in_subkernel(kernel, subkernel))

        # Sorting by name keeps the generated declaration order stable.
        for tv in sorted(
                six.itervalues(kernel.temporary_variables),
                key=lambda tv: tv.name):
            decl_info = tv.decl_info(self.target, index_dtype=kernel.index_dtype)

            if not tv.base_storage:
                # No base storage: emit a plain declaration, restricted to
                # non-global temporaries that this subkernel reads or writes.
                for idi in decl_info:
                    # global temp vars are mapped to arguments or global declarations
                    if tv.address_space != AddressSpace.GLOBAL and (
                            tv.name in sub_knl_temps):
                        decl = self.wrap_temporary_decl(
                                self.get_temporary_decl(
                                    codegen_state, schedule_index, tv, idi),
                                tv.address_space)

                        if tv.initializer is not None:
                            # A temporary with an initializer must be
                            # read-only; its data is emitted as an array
                            # literal.
                            assert tv.read_only
                            decl = Initializer(decl, generate_array_literal(
                                codegen_state, tv, tv.initializer))

                        temp_decls.append(decl)

            else:
                # Base storage in use: the temporary becomes a pointer into
                # the shared storage. Membership in sub_knl_temps is not
                # checked on this path.
                assert tv.initializer is None

                # Running byte offset within the base storage, reset for
                # each temporary.
                offset = 0
                base_storage_sizes.setdefault(tv.base_storage, []).append(
                        tv.nbytes)
                base_storage_to_scope.setdefault(tv.base_storage, []).append(
                        tv.address_space)

                # Start from the item size and multiply in the length of
                # each vector-tagged axis to obtain the required alignment.
                align_size = tv.dtype.itemsize

                from loopy.kernel.array import VectorArrayDimTag
                for dim_tag, axis_len in zip(tv.dim_tags, tv.shape):
                    if isinstance(dim_tag, VectorArrayDimTag):
                        align_size *= axis_len

                base_storage_to_align_bytes.setdefault(tv.base_storage, []).append(
                        align_size)

                for idi in decl_info:
                    # Two declarators of the same type: a nameless one that
                    # provides the cast type, and the named variable itself.
                    cast_decl = POD(self, idi.dtype, "")
                    temp_var_decl = POD(self, idi.dtype, idi.name)

                    cast_decl = self.wrap_temporary_decl(cast_decl, tv.address_space)
                    temp_var_decl = self.wrap_temporary_decl(
                            temp_var_decl, tv.address_space)

                    if tv._base_storage_access_may_be_aliasing:
                        ptrtype = _ConstPointer
                    else:
                        # The 'restrict' part of this is a complete lie--of course
                        # all these temporaries are aliased. But we're promising to
                        # not use them to shovel data from one representation to the
                        # other. That counts, right?
                        ptrtype = _ConstRestrictPointer

                    cast_decl = ptrtype(cast_decl)
                    temp_var_decl = ptrtype(temp_var_decl)

                    # The pointer is initialized from the base storage name
                    # plus the current offset, cast to the pointer type.
                    cast_tp, cast_d = cast_decl.get_decl_pair()
                    temp_var_decl = Initializer(
                            temp_var_decl,
                            "(%s %s) (%s + %s)" % (
                                " ".join(cast_tp), cast_d,
                                tv.base_storage,
                                offset))

                    temp_decls.append(temp_var_decl)

                    # Skip over the bytes taken by this entry (item size
                    # times the product of its shape).
                    from pytools import product
                    offset += (
                            idi.dtype.itemsize
                            * product(si for si in idi.shape))

        ecm = self.get_expression_to_code_mapper(codegen_state)

        # Declare each base storage as a "char" array, in name order.
        for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
            bs_var_decl = Value("char", bs_name)
            from pytools import single_valued
            # NOTE(review): presumably single_valued enforces that every
            # temporary in this base storage has the same address space --
            # verify against pytools.
            bs_var_decl = self.wrap_temporary_decl(
                    bs_var_decl, single_valued(base_storage_to_scope[bs_name]))

            # Size the array for the largest temporary sharing it: a plain
            # max() when all sizes are ints, a symbolic Max otherwise.
            # FIXME: Could try to use isl knowledge to simplify max.
            if all(isinstance(bs, int) for bs in bs_sizes):
                bs_size_max = max(bs_sizes)
            else:
                bs_size_max = p.Max(tuple(bs_sizes))

            bs_var_decl = ArrayOf(bs_var_decl, ecm(bs_size_max))

            # The strictest (largest) recorded alignment wins.
            alignment = max(base_storage_to_align_bytes[bs_name])
            bs_var_decl = AlignedAttribute(alignment, bs_var_decl)

            base_storage_decls.append(bs_var_decl)

        # }}}

        # Base storage arrays precede the temporaries, whose pointer
        # initializers mention the base storage names.
        result = base_storage_decls + temp_decls

        if result:
            result.append(Line())

        return result
Beispiel #6
0
def save_and_reload_temporaries(program, entrypoint=None):
    """
    Insert save and reload instructions for temporaries of one entrypoint
    kernel that remain live across kernel calls.

    Schedule segments of the form::

        t = <...>
        <return followed by call>
        <...> = t

    are rewritten to::

        t = <...>
        t_save_slot = t
        <return followed by call>
        t = t_save_slot
        <...> = t

    with `t_save_slot` being a newly-created global temporary variable.

    :arg program: the program containing the kernel to transform.
    :arg entrypoint: name of the kernel in *program* to operate on. May be
        *None* if *program* has exactly one entrypoint, in which case that
        one is used.
    :raises LoopyError: if *entrypoint* is *None* and *program* does not
        have exactly one entrypoint.
    :returns: the result of ``program.with_kernel`` applied to the
        transformed kernel.
    """
    from loopy.schedule.tools import (temporaries_read_in_subkernel,
                                      temporaries_written_in_subkernel)

    if entrypoint is None:
        if len(program.entrypoints) != 1:
            raise LoopyError("Missing argument 'entrypoint'.")
        entrypoint = next(iter(program.entrypoints))

    knl = program[entrypoint]

    if not knl.linearization:
        # The pass walks the linearization, so obtain one first.
        from loopy.schedule import get_one_linearized_kernel
        program = lp.preprocess_program(program)
        knl = get_one_linearized_kernel(
            program[entrypoint], program.callables_table)

    assert knl.linearization is not None

    liveness = LivenessAnalysis(knl)
    saver = TemporarySaver(knl, program.callables_table)

    for sched_idx, sched_item in enumerate(knl.linearization):

        if isinstance(sched_item, CallKernel):
            name = sched_item.kernel_name

            # Partial writes are possible, so a live-out temporary must be
            # reloaded if the subkernel reads *or* writes it. Nothing is
            # live at the very first item (kernel entry).
            candidates = (
                set() if sched_idx == 0
                else (temporaries_read_in_subkernel(knl, name)
                      | temporaries_written_in_subkernel(knl, name)))

            to_reload = liveness[sched_idx].live_out & candidates
            for temporary in to_reload:
                logger.info(
                    "reloading {} at entry of {}".format(temporary, name))
                saver.reload(temporary, name)

        elif isinstance(sched_item, ReturnFromKernel):
            name = sched_item.kernel_name

            # Only temporaries written by the subkernel need saving; at
            # the final item (kernel exit) nothing is live.
            candidates = (
                set() if sched_idx == len(knl.linearization) - 1
                else temporaries_written_in_subkernel(knl, name))

            to_save = liveness[sched_idx].live_in & candidates
            for temporary in to_save:
                logger.info(
                    "saving {} before return of {}".format(temporary, name))
                saver.save(temporary, name)

    transformed = saver.finish()
    return program.with_kernel(transformed)
Beispiel #7
0
def save_and_reload_temporaries(knl):
    """
    Add the instructions needed to preserve temporaries whose lifetime
    spans more than one kernel call.

    A schedule segment like::

        t = <...>
        <return followed by call>
        <...> = t

    becomes::

        t = <...>
        t_save_slot = t
        <return followed by call>
        t = t_save_slot
        <...> = t

    Here `t_save_slot` is a newly-created global temporary variable.

    :arg knl: the kernel whose ``schedule`` is processed.
    :returns: The resulting kernel, obtained from ``TemporarySaver.finish``
    """
    from loopy.schedule.tools import (
        temporaries_read_in_subkernel, temporaries_written_in_subkernel)

    liveness = LivenessAnalysis(knl)
    saver = TemporarySaver(knl)

    for idx, item in enumerate(knl.schedule):

        if isinstance(item, CallKernel):
            # At kernel entry (index 0) nothing is live. Everywhere else,
            # any temporary the subkernel reads or writes is of interest:
            # written ones too, because a write may be only partial.
            relevant = set()
            if idx != 0:
                reads = temporaries_read_in_subkernel(knl, item.kernel_name)
                writes = temporaries_written_in_subkernel(
                    knl, item.kernel_name)
                relevant = reads | writes

            for tmp in liveness[idx].live_out & relevant:
                message = "reloading {0} at entry of {1}".format(
                    tmp, item.kernel_name)
                logger.info(message)
                saver.reload(tmp, item.kernel_name)

        elif isinstance(item, ReturnFromKernel):
            # At kernel exit (last index) nothing is live. Otherwise only
            # the temporaries written by the subkernel have to be saved.
            relevant = set()
            if idx != len(knl.schedule) - 1:
                relevant = temporaries_written_in_subkernel(
                    knl, item.kernel_name)

            for tmp in liveness[idx].live_in & relevant:
                message = "saving {0} before return of {1}".format(
                    tmp, item.kernel_name)
                logger.info(message)
                saver.save(tmp, item.kernel_name)

    return saver.finish()