Example no. 1
    def get_typed_and_scheduled_kernel(self, var_to_dtype_set):
        kernel = self.kernel

        from loopy.kernel.tools import add_dtypes

        if var_to_dtype_set:
            var_to_dtype = {}
            for var, dtype in var_to_dtype_set:
                try:
                    dest_name = kernel.impl_arg_to_arg[var].name
                except KeyError:
                    dest_name = var

                try:
                    var_to_dtype[dest_name] = dtype
                except KeyError:
                    raise LoopyError("cannot set type for '%s': "
                            "no known variable/argument with that name"
                            % var)

            kernel = add_dtypes(kernel, var_to_dtype)

            from loopy.preprocess import infer_unknown_types
            kernel = infer_unknown_types(kernel, expect_completion=True)

        if kernel.schedule is None:
            from loopy.preprocess import preprocess_kernel
            kernel = preprocess_kernel(kernel)

            from loopy.schedule import get_one_scheduled_kernel
            kernel = get_one_scheduled_kernel(kernel)

        return kernel
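
A minimal standalone sketch of the same typing-and-scheduling pipeline, assuming a small made-up kernel (the names ``a``, ``out`` and the parameter ``n`` are illustrative, not taken from the original):

import numpy as np
import loopy as lp
from loopy.kernel.tools import add_dtypes
from loopy.preprocess import infer_unknown_types, preprocess_kernel
from loopy.schedule import get_one_scheduled_kernel

knl = lp.make_kernel("{[i]: 0<=i<n}", "out[i] = 2*a[i]")
knl = add_dtypes(knl, {"a": np.float32})                 # pin down one dtype
knl = infer_unknown_types(knl, expect_completion=True)   # infer the rest
knl = preprocess_kernel(knl)
knl = get_one_scheduled_kernel(knl)                      # schedule, as above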
Example no. 2
def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
    """Return a dictionary mapping ``(var_name, direction)`` to
    :class:`islpy.PwQPolynomial` instances capturing the number of bytes  are
    read/written (where *direction* is either ``read`` or ``write`` on array
    *var_name*

    :arg ignore_uncountable: If *True*, an error will be raised for
        accesses on which the footprint cannot be determined (e.g.
        data-dependent or nonlinear indices)
    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.kernel import kernel_state
    if kernel.state < kernel_state.PREPROCESSED:
        kernel = preprocess_kernel(kernel)

    result = {}
    fp = gather_access_footprints(kernel, ignore_uncountable=ignore_uncountable)

    for key, var_fp in fp.items():
        vname, direction = key

        var_descr = kernel.get_var_descriptor(vname)
        bytes_transferred = (
                int(var_descr.dtype.numpy_dtype.itemsize)
                * count(kernel, var_fp))
        if key in result:
            result[key] += bytes_transferred
        else:
            result[key] = bytes_transferred

    return result
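
A hedged usage sketch for the function above; the kernel ``knl``, the array name "a", and the parameter "n" are assumptions for illustration, and the returned polynomials are evaluated the same way as in the other docstring examples in this collection:

bytes_map = gather_access_footprint_bytes(knl)   # knl: a typed loopy kernel (assumed)
params = {"n": 512}
bytes_read_a = bytes_map[("a", "read")].eval_with_dict(params)
bytes_written_a = bytes_map[("a", "write")].eval_with_dict(params)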
Example no. 4
def get_op_poly(knl, numpy_types=True):

    """Count the number of operations in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted.

    :return: A mapping of **{(** *type* **,** :class:`string` **)**
             **:** :class:`islpy.PwQPolynomial` **}**.

             - The *type* specifies the type of the data being
               accessed. This can be a :class:`numpy.dtype` if
               *numpy_types* is True, otherwise the internal
               loopy type.

             - The string specifies the operation type as
               *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.

             - The :class:`islpy.PwQPolynomial` holds the number of operations of
               the kind specified in the key (in terms of the
               :class:`loopy.LoopKernel` *parameter inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        poly = get_op_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
        f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)

        # (now use these counts to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    op_poly = ToCountMap()
    op_counter = ExpressionOpCounter(knl)
    for insn in knl.instructions:
        # how many times is this instruction executed?
        # check domain size:
        insn_inames = knl.insn_inames(insn)
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
        ops = op_counter(insn.assignee) + op_counter(insn.expression)
        op_poly = op_poly + ops*count(knl, domain)
    result = op_poly.dict

    if numpy_types:
        result = dict(
                ((dtype.numpy_dtype, kind), count)
                for (dtype, kind), count in six.iteritems(result))

    return result
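
As a hedged follow-up to the docstring example above, the per-``(dtype, kind)`` counts can also be aggregated; the kernel ``knl`` and the parameter values are the same illustrative ones used in the docstring:

import numpy as np

poly = get_op_poly(knl)
params = {'n': 512, 'm': 256, 'l': 128}
# total float32 operations of all kinds (add, mul, ...)
total_f32_ops = sum(
    p.eval_with_dict(params)
    for (dtype, kind), p in poly.items()
    if dtype == np.dtype(np.float32))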
Example no. 6
def add_and_infer_dtypes(knl, dtype_dict):
    processed_dtype_dict = {}

    for k, v in six.iteritems(dtype_dict):
        for subkey in k.split(","):
            subkey = subkey.strip()
            if subkey:
                processed_dtype_dict[subkey] = v

    knl = add_dtypes(knl, processed_dtype_dict)

    from loopy.preprocess import infer_unknown_types
    return infer_unknown_types(knl, expect_completion=True)
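
A minimal usage sketch: comma-separated keys in *dtype_dict* assign one dtype to several names at once. The kernel below is a made-up example, not from the original source:

import numpy as np
import loopy as lp

knl = lp.make_kernel("{[i]: 0<=i<n}", "c[i] = a[i] + b[i]")
knl = add_and_infer_dtypes(knl, {"a,b": np.float32})   # types a and b; c is inferred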
Example no. 7
def gather_access_footprints(kernel, ignore_uncountable=False):
    """Return a dictionary mapping ``(var_name, direction)``
    to :class:`islpy.Set` instances capturing which indices
    of each the array *var_name* are read/written (where
    *direction* is either ``read`` or ``write``.

    :arg ignore_uncountable: If *True*, an error will be raised for
        accesses on which the footprint cannot be determined (e.g.
        data-dependent or nonlinear indices)
    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)
    kernel = preprocess_kernel(kernel)

    write_footprints = []
    read_footprints = []

    for insn in kernel.instructions:
        if not isinstance(insn, MultiAssignmentBase):
            warn(
                kernel, "count_non_assignment",
                "Non-assignment instruction encountered in "
                "gather_access_footprints, not counted")
            continue

        insn_inames = kernel.insn_inames(insn)
        inames_domain = kernel.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames,
                                                   [dim_type.set]))

        afg = AccessFootprintGatherer(kernel,
                                      domain,
                                      ignore_uncountable=ignore_uncountable)

        for assignee in insn.assignees:
            write_footprints.append(afg(assignee))
        read_footprints.append(afg(insn.expression))

    write_footprints = AccessFootprintGatherer.combine(write_footprints)
    read_footprints = AccessFootprintGatherer.combine(read_footprints)

    result = {}

    for vname, footprint in six.iteritems(write_footprints):
        result[(vname, "write")] = footprint

    for vname, footprint in six.iteritems(read_footprints):
        result[(vname, "read")] = footprint

    return result
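
A hedged sketch of how the returned footprints might be inspected; the kernel ``knl`` and the array name "c" are assumptions for illustration:

footprints = gather_access_footprints(knl)
written = footprints[("c", "write")]   # islpy.Set of indices written to c
print(written)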
Example no. 8
def gather_access_footprints(kernel, ignore_uncountable=False):
    """Return a dictionary mapping ``(var_name, direction)``
    to :class:`islpy.Set` instances capturing which indices
    of each the array *var_name* are read/written (where
    *direction* is either ``read`` or ``write``.

    :arg ignore_uncountable: If *True*, an error will be raised for
        accesses on which the footprint cannot be determined (e.g.
        data-dependent or nonlinear indices)
    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)
    kernel = preprocess_kernel(kernel)

    write_footprints = []
    read_footprints = []

    for insn in kernel.instructions:
        if not isinstance(insn, MultiAssignmentBase):
            warn(kernel, "count_non_assignment",
                    "Non-assignment instruction encountered in "
                    "gather_access_footprints, not counted")
            continue

        insn_inames = kernel.insn_inames(insn)
        inames_domain = kernel.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))

        afg = AccessFootprintGatherer(kernel, domain,
                ignore_uncountable=ignore_uncountable)

        for assignee in insn.assignees:
            write_footprints.append(afg(assignee))
        read_footprints.append(afg(insn.expression))

    write_footprints = AccessFootprintGatherer.combine(write_footprints)
    read_footprints = AccessFootprintGatherer.combine(read_footprints)

    result = {}

    for vname, footprint in six.iteritems(write_footprints):
        result[(vname, "write")] = footprint

    for vname, footprint in six.iteritems(read_footprints):
        result[(vname, "read")] = footprint

    return result
Example no. 9
def get_barrier_poly(knl):

    """Count the number of barriers each thread encounters in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.

    :return: An :class:`islpy.PwQPolynomial` holding the number of barrier calls
             made (in terms of the :class:`loopy.LoopKernel` *inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        barrier_poly = get_barrier_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        barrier_count = barrier_poly.eval_with_dict(params)

        # (now use this count to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import EnterLoop, LeaveLoop, Barrier
    from operator import mul
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    iname_list = []
    barrier_poly = isl.PwQPolynomial('{ 0 }')

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.append(sched_item.iname)
        elif isinstance(sched_item, LeaveLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.pop()
        elif isinstance(sched_item, Barrier):
            if iname_list:  # (if iname_list is not empty)
                ct = (count(knl, (
                                knl.get_inames_domain(iname_list).
                                project_out_except(iname_list, [dim_type.set])
                                )), )
                barrier_poly += reduce(mul, ct)
            else:
                barrier_poly += isl.PwQPolynomial('{ 1 }')

    return barrier_poly
Example no. 10
def generate_body(kernel):
    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel

        kernel = get_one_scheduled_kernel(kernel)
    from loopy.kernel import kernel_state

    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been " "scheduled")

    from loopy.preprocess import infer_unknown_types

    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks

    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    allow_complex = False
    for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
        if var.dtype.kind == "c":
            allow_complex = True

    seen_dtypes = set()
    seen_functions = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
    codegen_state = CodeGenerationState(
        kernel=kernel,
        implemented_domain=initial_implemented_domain,
        implemented_predicates=frozenset(),
        seen_dtypes=seen_dtypes,
        seen_functions=seen_functions,
        var_subst_map={},
        allow_complex=allow_complex,
    )

    code_str, implemented_domains = kernel.target.generate_body(kernel, codegen_state)

    from loopy.check import check_implemented_domains

    assert check_implemented_domains(kernel, implemented_domains, code_str)

    logger.info("%s: generate code: done" % kernel.name)

    return code_str
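
A minimal usage sketch, assuming ``knl`` is a loopy kernel with dtypes specified that has been preprocessed (or already scheduled); per the checks above, the kernel is scheduled here if it is not already:

body_str = generate_body(knl)
print(body_str)   # generated body source as a string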
Example no. 11
def get_op_poly(knl):
    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    op_poly = 0
    op_counter = ExpressionOpCounter(knl)
    for insn in knl.instructions:
        # how many times is this instruction executed?
        # check domain size:
        insn_inames = knl.insn_inames(insn)
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
        ops = op_counter(insn.expression)
        op_poly = op_poly + ops*count(knl, domain)
    return op_poly
Example no. 12
def estimate_regs_per_thread(knl):

    """Estimate registers per thread usage by a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose reg usage will be estimated.

    :return: An :class:`integer` holding an estimate for the number of registers
             used per thread. This number will most likely be too low, but will
             hopefully be consistently too low by the same constant factor.

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import EnterLoop, LeaveLoop, Barrier, RunInstruction  # noqa
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    max_regs = 0
    block_reg_totals = [0]
    # counters to track nested sets of previously used iname+index combinations
    reg_counters = [RegisterUsageEstimator(knl)]

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            block_reg_totals.append(0)
            # start a new estimator
            reg_counters.append(RegisterUsageEstimator(knl))

        elif isinstance(sched_item, LeaveLoop):
            if block_reg_totals[-1] > max_regs:
                max_regs = block_reg_totals[-1]
            # pop to resume previous total
            block_reg_totals.pop()
            reg_counters.pop()

        elif isinstance(sched_item, RunInstruction):
            insn = knl.id_to_insn[sched_item.insn_id]
            block_reg_totals[-1] += reg_counters[-1](insn.assignee) + \
                                    reg_counters[-1](insn.expression)

    # finished looping, check outer block
    if block_reg_totals[-1] > max_regs:
        max_regs = block_reg_totals[-1]

    return max_regs
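
A brief usage sketch; per the docstring, the estimate is most useful for relative comparisons between kernel variants (the kernel ``knl`` is assumed):

regs = estimate_regs_per_thread(knl)
print("estimated registers per thread:", regs)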
Example no. 13
def get_synchronization_poly(knl):

    """Count the number of synchronization events each thread encounters in a
    loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.

    :return: A dictionary mapping each type of synchronization event to a
            :class:`islpy.PwQPolynomial` holding the number of such events
            per thread.

            Possible keys include ``barrier_local``, ``barrier_global``
            (if supported by the target) and ``kernel_launch``.

    Example usage::

        # (first create loopy kernel and specify array data types)

        sync_map = get_synchronization_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        barrier_count = sync_map['barrier_local'].eval_with_dict(params)

        # (now use this count to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import (EnterLoop, LeaveLoop, Barrier,
            CallKernel, ReturnFromKernel, RunInstruction)
    from operator import mul
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    iname_list = []

    result = ToCountMap()

    one = isl.PwQPolynomial('{ 1 }')

    def get_count_poly(iname_list):
        if iname_list:  # (if iname_list is not empty)
            ct = (count(knl, (
                            knl.get_inames_domain(iname_list).
                            project_out_except(iname_list, [dim_type.set])
                            )), )
            return reduce(mul, ct)
        else:
            return one

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.append(sched_item.iname)
        elif isinstance(sched_item, LeaveLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.pop()

        elif isinstance(sched_item, Barrier):
            result = result + ToCountMap(
                    {"barrier_%s" % sched_item.kind: get_count_poly(iname_list)})

        elif isinstance(sched_item, CallKernel):
            result = result + ToCountMap(
                    {"kernel_launch": get_count_poly(iname_list)})

        elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)):
            pass

        else:
            raise LoopyError("unexpected schedule item: %s"
                    % type(sched_item).__name__)

    return result.dict
Example no. 14
def generate_code_v2(kernel):
    """
    :returns: a :class:`CodeGenerationResult`
    """

    from loopy.kernel import kernel_state
    if kernel.state == kernel_state.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                         "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        input_kernel = kernel
        try:
            result = code_gen_cache[input_kernel]
            logger.debug("%s: code generation cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    from loopy.preprocess import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    implemented_data_info = []

    for arg in kernel.args:
        is_written = arg.name in kernel.get_written_variables()
        if isinstance(arg, ArrayBase):
            implemented_data_info.extend(
                arg.decl_info(kernel.target,
                              is_written=is_written,
                              index_dtype=kernel.index_dtype))

        elif isinstance(arg, ValueArg):
            implemented_data_info.append(
                ImplementedDataInfo(target=kernel.target,
                                    name=arg.name,
                                    dtype=arg.dtype,
                                    arg_class=ValueArg,
                                    is_written=is_written))

        else:
            raise ValueError("argument type not understood: '%s'" % type(arg))

    allow_complex = False
    for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
        if var.dtype.involves_complex():
            allow_complex = True

    # }}}

    seen_dtypes = set()
    seen_functions = set()
    seen_atomic_dtypes = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
    codegen_state = CodeGenerationState(
        kernel=kernel,
        implemented_data_info=implemented_data_info,
        implemented_domain=initial_implemented_domain,
        implemented_predicates=frozenset(),
        seen_dtypes=seen_dtypes,
        seen_functions=seen_functions,
        seen_atomic_dtypes=seen_atomic_dtypes,
        var_subst_map={},
        allow_complex=allow_complex,
        var_name_generator=kernel.get_var_name_generator(),
        is_generating_device_code=False,
        gen_program_name=(kernel.target.host_program_name_prefix +
                          kernel.name +
                          kernel.target.host_program_name_suffix),
        schedule_index_end=len(kernel.schedule))

    from loopy.codegen.result import generate_host_or_device_program
    codegen_result = generate_host_or_device_program(codegen_state,
                                                     schedule_index=0)

    device_code_str = codegen_result.device_code()

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel,
                                     codegen_result.implemented_domains,
                                     device_code_str)

    # {{{ handle preambles

    for arg in kernel.args:
        seen_dtypes.add(arg.dtype)
    for tv in six.itervalues(kernel.temporary_variables):
        seen_dtypes.add(tv.dtype)

    preambles = kernel.preambles[:]

    preamble_info = PreambleInfo(
        kernel=kernel,
        seen_dtypes=seen_dtypes,
        seen_functions=seen_functions,
        # a set of LoopyTypes (!)
        seen_atomic_dtypes=seen_atomic_dtypes)

    preamble_generators = (
        kernel.preamble_generators +
        kernel.target.get_device_ast_builder().preamble_generators())
    for prea_gen in preamble_generators:
        preambles.extend(prea_gen(preamble_info))

    codegen_result = codegen_result.copy(device_preambles=preambles)

    # }}}

    logger.info("%s: generate code: done" % kernel.name)

    if CACHING_ENABLED:
        code_gen_cache[input_kernel] = codegen_result

    return codegen_result
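
A minimal usage sketch for ``generate_code_v2``, assuming ``knl`` is a loopy kernel with dtypes specified; ``device_code()`` is the same accessor used on the result inside the function above:

codegen_result = generate_code_v2(knl)
print(codegen_result.device_code())   # generated device source as a string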
Example no. 15
def get_synchronization_poly(knl):
    """Count the number of synchronization events each thread encounters in a
    loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.

    :return: A dictionary mapping each type of synchronization event to a
            :class:`islpy.PwQPolynomial` holding the number of such events
            per thread.

            Possible keys include ``barrier_local``, ``barrier_global``
            (if supported by the target) and ``kernel_launch``.

    Example usage::

        # (first create loopy kernel and specify array data types)

        sync_map = get_synchronization_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        barrier_count = sync_map['barrier_local'].eval_with_dict(params)

        # (now use this count to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, CallKernel,
                                ReturnFromKernel, RunInstruction)
    from operator import mul
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    iname_list = []

    result = ToCountMap()

    one = isl.PwQPolynomial('{ 1 }')

    def get_count_poly(iname_list):
        if iname_list:  # (if iname_list is not empty)
            ct = (count(knl,
                        (knl.get_inames_domain(iname_list).project_out_except(
                            iname_list, [dim_type.set]))), )
            return reduce(mul, ct)
        else:
            return one

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.append(sched_item.iname)
        elif isinstance(sched_item, LeaveLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.pop()

        elif isinstance(sched_item, Barrier):
            result = result + ToCountMap(
                {"barrier_%s" % sched_item.kind: get_count_poly(iname_list)})

        elif isinstance(sched_item, CallKernel):
            result = result + ToCountMap(
                {"kernel_launch": get_count_poly(iname_list)})

        elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)):
            pass

        else:
            raise LoopyError("unexpected schedule item: %s" %
                             type(sched_item).__name__)

    return result.dict
Example no. 16
def get_gmem_access_poly(knl,
                         numpy_types=True):  # for now just counting subscripts
    """Count the number of global memory accesses in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
                    counted.

    :return: A mapping of **{(** *type* **,** :class:`string` **,**
             :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**.

             - The *type* specifies the type of the data being
               accessed. This can be a :class:`numpy.dtype` if
               *numpy_types* is True, otherwise the internal
               loopy type.

             - The first string in the map key specifies the global memory
               access type as
               *consecutive*, *nonconsecutive*, or *uniform*.

             - The second string in the map key specifies the global memory
               access type as a
               *load*, or a *store*.

             - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses
               with the characteristics specified in the key (in terms of the
               :class:`loopy.LoopKernel` *inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        subscript_map = get_gmem_access_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}

        f32_uncoalesced_load = subscript_map.dict[
                            (np.dtype(np.float32), 'nonconsecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_load = subscript_map.dict[
                            (np.dtype(np.float32), 'consecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_store = subscript_map.dict[
                            (np.dtype(np.float32), 'consecutive', 'store')
                            ].eval_with_dict(params)

        # (now use these counts to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types

    class CacheHolder(object):
        pass

    cache_holder = CacheHolder()

    @memoize_in(cache_holder, "insn_count")
    def get_insn_count(knl, insn_inames, uniform=False):
        if uniform:
            from loopy.kernel.data import LocalIndexTag
            insn_inames = [
                iname for iname in insn_inames
                if not isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)
            ]
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames,
                                                   [dim_type.set]))
        return count(knl, domain)

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    subs_poly = ToCountMap()
    subscript_counter = GlobalSubscriptCounter(knl)
    for insn in knl.instructions:
        # count subscripts, distinguishing loads and stores
        subs_expr = subscript_counter(insn.expression)
        subs_expr = ToCountMap(
            dict((key + ("load", ), val)
                 for key, val in six.iteritems(subs_expr.dict)))
        subs_assignee = subscript_counter(insn.assignee)
        subs_assignee = ToCountMap(
            dict((key + ("store", ), val)
                 for key, val in six.iteritems(subs_assignee.dict)))

        insn_inames = knl.insn_inames(insn)

        # use count excluding local index tags for uniform accesses
        for key in subs_expr.dict:
            poly = ToCountMap({key: subs_expr.dict[key]})
            if key[1] == "uniform":
                subs_poly = subs_poly + poly * get_insn_count(
                    knl, insn_inames, True)
            else:
                subs_poly = subs_poly + poly * get_insn_count(knl, insn_inames)
        for key in subs_assignee.dict:
            poly = ToCountMap({key: subs_assignee.dict[key]})
            if key[1] == "uniform":
                subs_poly = subs_poly + poly * get_insn_count(
                    knl, insn_inames, True)
            else:
                subs_poly = subs_poly + poly * get_insn_count(knl, insn_inames)

    result = subs_poly.dict

    if numpy_types:
        result = dict(
            ((dtype.numpy_dtype, kind, direction), count)
            for (dtype, kind, direction), count in six.iteritems(result))

    return result
Example no. 17
def auto_test_vs_ref(ref_knl,
                     ctx,
                     test_knl=None,
                     op_count=[],
                     op_label=[],
                     parameters={},
                     print_ref_code=False,
                     print_code=True,
                     warmup_rounds=2,
                     dump_binary=False,
                     fills_entire_output=None,
                     do_check=True,
                     check_result=None,
                     max_test_kernel_count=1,
                     quiet=False,
                     blacklist_ref_vendors=[]):
    """Compare results of `ref_knl` to the kernels generated by
    scheduling *test_knl*.

    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a tuple (:class:`bool`,
        message) indicating correctness/acceptability of the result
    :arg max_test_kernel_count: Stop testing after this many schedules of
        *test_knl* have been tried
    """

    import pyopencl as cl

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                         "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError(
                "ref_knl and test_knl argument lists disagree at index "
                "%d (1-based)" % (i + 1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError(
                "ref_knl and test_knl argument lists disagree at index "
                "%d (1-based)" % (i + 1))

    from loopy.compiled import CompiledKernel, get_highlighted_cl_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    from time import time

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated",
             DeprecationWarning,
             stacklevel=2)

    # {{{ compile and run reference code

    from loopy.preprocess import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors):
        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(
            ref_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        for knl in lp.generate_loop_schedules(pp_ref_knl):
            ref_sched_kernel = knl
            break

        logger.info("%s (ref): trying %s for the reference calculation" %
                    (ref_knl.name, dev))

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75 * "-")
            print("Reference Code:")
            print(75 * "-")
            print(get_highlighted_cl_code(ref_compiled.code))
            print(75 * "-")

        ref_cl_kernel_info = ref_compiled.cl_kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_cl_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                import traceback
                ref_errors.append("\n".join([
                    75 * "-",
                    "On %s:" % dev, 75 * "-",
                    traceback.format_exc(), 75 * "-"
                ]))

                continue
            else:
                raise

        found_ref_device = True

        if not do_check:
            break

        ref_queue.finish()

        logger.info("%s (ref): using %s for the reference calculation" %
                    (ref_knl.name, dev))
        logger.info("%s (ref): run" % ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop - ref_start

        logger.info("%s (ref): run done" % ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9 * (ref_evt.profile.END -
                                    ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                         "reference computation.\n"
                         "These errors were encountered:\n" +
                         "\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    args = None
    from loopy.kernel import kernel_state
    if test_knl.state not in [
            kernel_state.PREPROCESSED, kernel_state.SCHEDULED
    ]:
        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    test_kernel_count = 0

    from loopy.preprocess import infer_unknown_types
    for i, kernel in enumerate(test_kernels):
        test_kernel_count += 1
        if test_kernel_count > max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        if args is None:
            cl_kernel_info = compiled.cl_kernel_info(frozenset())

            args = make_args(kernel, cl_kernel_info.implemented_data_info,
                             queue, ref_arg_data, parameters)
        args["out_host"] = False

        if not quiet:
            print(75 * "-")
            print("Kernel #%d:" % i)
            print(75 * "-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75 * "-")
            if dump_binary:
                print(type(compiled.cl_program))
                print(compiled.cl_program.binaries[0])
                print(75 * "-")

        logger.info("%s: run warmup" % (knl.name))

        for i in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    from pyopencl.compyte.array import as_strided
                    ref_ary = as_strided(
                        arg_desc.ref_storage_array.get(),
                        shape=arg_desc.ref_shape,
                        strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                        arg_desc.test_storage_array.get(),
                        shape=arg_desc.test_shape,
                        strides=arg_desc.test_numpy_strides).flatten()
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                    need_check = False

        events = []
        queue.finish()

        logger.info("%s: warmup done" % (knl.name))

        logger.info("%s: timing run" % (knl.name))

        timing_rounds = warmup_rounds

        while True:
            from time import time
            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for i in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9 * evt_end.profile.START -
                                         1e-9 * evt_start.profile.START) /
                                        timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time - start_time) / timing_rounds

            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done" % (knl.name))

        rates = ""
        for cnt, lbl in zip(op_count, op_label):
            rates += " %g %s/s" % (cnt / elapsed_wall, lbl)

        if not quiet:

            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                  "(%d rounds)%s" %
                  (format_float_or_none(elapsed_event),
                   format_float_or_none(elapsed_event_marker),
                   format_float_or_none(elapsed_wall), timing_rounds, rates))

        if do_check:
            ref_rates = ""
            for cnt, lbl in zip(op_count, op_label):
                ref_rates += " %g %s/s" % (cnt / ref_elapsed_event, lbl)
            if not quiet:
                print("ref: elapsed: %g s event, %g s wall%s" %
                      (ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict
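
A hedged usage sketch: compare a transformed kernel against an untransformed reference copy on an OpenCL context. The kernel, the transformation, and the parameter name below are assumptions for illustration:

import numpy as np
import pyopencl as cl
import loopy as lp

ctx = cl.create_some_context()

knl = lp.make_kernel("{[i]: 0<=i<n}", "out[i] = 2*a[i]")
knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

ref_knl = knl   # keep an untransformed copy as the reference
knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")

auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 512})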
Example no. 18
def get_gmem_access_poly(knl):  # for now just counting subscripts

    """Count the number of global memory accesses in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
                    counted.

    :return: A mapping of **{(** :class:`numpy.dtype` **,** :class:`string` **,**
             :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**.

             - The :class:`numpy.dtype` specifies the type of the data being
               accessed.

             - The first string in the map key specifies the global memory
               access type as
               *consecutive*, *nonconsecutive*, or *uniform*.

             - The second string in the map key specifies the global memory
               access type as a
               *load*, or a *store*.

             - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses
               with the characteristics specified in the key (in terms of the
               :class:`loopy.LoopKernel` *inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        subscript_map = get_gmem_access_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}

        f32_uncoalesced_load = subscript_map.dict[
                            (np.dtype(np.float32), 'nonconsecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_load = subscript_map.dict[
                            (np.dtype(np.float32), 'consecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_store = subscript_map.dict[
                            (np.dtype(np.float32), 'consecutive', 'store')
                            ].eval_with_dict(params)

        # (now use these counts to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    subs_poly = ToCountMap()
    subscript_counter = GlobalSubscriptCounter(knl)
    for insn in knl.instructions:
        insn_inames = knl.insn_inames(insn)
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
        subs_expr = subscript_counter(insn.expression)
        subs_expr = ToCountMap(dict(
            (key + ("load",), val)
            for key, val in six.iteritems(subs_expr.dict)))

        subs_assignee = subscript_counter(insn.assignee)
        subs_assignee = ToCountMap(dict(
            (key + ("store",), val)
            for key, val in six.iteritems(subs_assignee.dict)))

        subs_poly = subs_poly + (subs_expr + subs_assignee)*count(knl, domain)
    return subs_poly.dict
Example no. 19
def generate_code(kernel, device=None):
    if device is not None:
        from warnings import warn
        warn("passing 'device' to generate_code() is deprecated",
                DeprecationWarning, stacklevel=2)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)
    from loopy.kernel import kernel_state
    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        input_kernel = kernel
        try:
            result = code_gen_cache[input_kernel]
            logger.info("%s: code generation cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    from loopy.preprocess import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    impl_arg_info = []

    for arg in kernel.args:
        if isinstance(arg, ArrayBase):
            impl_arg_info.extend(
                    arg.decl_info(
                        kernel.target,
                        is_written=arg.name in kernel.get_written_variables(),
                        index_dtype=kernel.index_dtype))

        elif isinstance(arg, ValueArg):
            impl_arg_info.append(ImplementedDataInfo(
                target=kernel.target,
                name=arg.name,
                dtype=arg.dtype,
                cgen_declarator=arg.get_arg_decl(kernel.target),
                arg_class=ValueArg))

        else:
            raise ValueError("argument type not understood: '%s'" % type(arg))

    allow_complex = False
    for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
        if var.dtype.kind == "c":
            allow_complex = True

    # }}}

    seen_dtypes = set()
    seen_functions = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_domain=initial_implemented_domain,
            implemented_predicates=frozenset(),
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            var_subst_map={},
            allow_complex=allow_complex)

    code_str, implemented_domains = kernel.target.generate_code(
            kernel, codegen_state, impl_arg_info)

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, implemented_domains,
            code_str)

    # {{{ handle preambles

    for arg in kernel.args:
        seen_dtypes.add(arg.dtype)
    for tv in six.itervalues(kernel.temporary_variables):
        seen_dtypes.add(tv.dtype)

    preambles = kernel.preambles[:]

    preamble_generators = (kernel.preamble_generators
            + kernel.target.preamble_generators())
    for prea_gen in preamble_generators:
        preambles.extend(prea_gen(kernel, seen_dtypes, seen_functions))

    seen_preamble_tags = set()
    dedup_preambles = []

    for tag, preamble in sorted(preambles, key=lambda tag_code: tag_code[0]):
        if tag in seen_preamble_tags:
            continue

        seen_preamble_tags.add(tag)
        dedup_preambles.append(preamble)

    from loopy.tools import remove_common_indentation
    preamble_codes = [
            remove_common_indentation(lines) + "\n"
            for lines in dedup_preambles]

    code_str = "".join(preamble_codes) + code_str

    # }}}

    logger.info("%s: generate code: done" % kernel.name)

    result = code_str, impl_arg_info

    if CACHING_ENABLED:
        code_gen_cache[input_kernel] = result

    return result
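
A minimal usage sketch, assuming ``knl`` is a loopy kernel with dtypes specified; ``generate_code`` returns the generated source together with the implemented-argument info, matching the ``result`` tuple built above:

code_str, impl_arg_info = generate_code(knl)
print(code_str)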
Example no. 20
def get_gmem_access_poly(knl, numpy_types=True):  # for now just counting subscripts

    """Count the number of global memory accesses in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
                    counted.

    :return: A mapping of **{(** *type* **,** :class:`string` **,**
             :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**.

             - The *type* specifies the type of the data being
               accessed. This can be a :class:`numpy.dtype` if
               *numpy_types* is True, otherwise the internal
               loopy type.

             - The first string in the map key specifies the global memory
               access type as
               *consecutive*, *nonconsecutive*, or *uniform*.

             - The second string in the map key specifies the global memory
               access type as a
               *load*, or a *store*.

             - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses
               with the characteristics specified in the key (in terms of the
               :class:`loopy.LoopKernel` *inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        subscript_map = get_gmem_access_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}

        f32_uncoalesced_load = subscript_map.dict[
                            (np.dtype(np.float32), 'nonconsecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_load = subscript_map.dict[
                            (np.dtype(np.float32), 'consecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_store = subscript_map.dict[
                            (np.dtype(np.float32), 'consecutive', 'store')
                            ].eval_with_dict(params)

        # (now use these counts to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types

    class CacheHolder(object):
        pass

    cache_holder = CacheHolder()

    @memoize_in(cache_holder, "insn_count")
    def get_insn_count(knl, insn_inames, uniform=False):
        if uniform:
            from loopy.kernel.data import LocalIndexTag
            insn_inames = [iname for iname in insn_inames if not
                           isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)]
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(
                                insn_inames, [dim_type.set]))
        return count(knl, domain)

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    subs_poly = ToCountMap()
    subscript_counter = GlobalSubscriptCounter(knl)
    for insn in knl.instructions:
        # count subscripts, distinguishing loads and stores
        subs_expr = subscript_counter(insn.expression)
        subs_expr = ToCountMap(dict(
            (key + ("load",), val)
            for key, val in six.iteritems(subs_expr.dict)))
        subs_assignee = subscript_counter(insn.assignee)
        subs_assignee = ToCountMap(dict(
            (key + ("store",), val)
            for key, val in six.iteritems(subs_assignee.dict)))

        insn_inames = knl.insn_inames(insn)

        # use count excluding local index tags for uniform accesses
        for key in subs_expr.dict:
            poly = ToCountMap({key: subs_expr.dict[key]})
            if key[1] == "uniform":
                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
            else:
                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
        for key in subs_assignee.dict:
            poly = ToCountMap({key: subs_assignee.dict[key]})
            if key[1] == "uniform":
                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
            else:
                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)

    result = subs_poly.dict

    if numpy_types:
        result = dict(
                ((dtype.numpy_dtype, kind, direction), count)
                for (dtype, kind, direction), count in six.iteritems(result))

    return result
Example no. 21
def _add_and_infer_dtypes_overdetermined(knl, dtype_dict):
    knl = _add_dtypes_overdetermined(knl, dtype_dict)

    from loopy.preprocess import infer_unknown_types
    return infer_unknown_types(knl, expect_completion=True)
Example no. 22
def auto_test_vs_ref(
        ref_knl, ctx, test_knl=None, op_count=[], op_label=[], parameters={},
        print_ref_code=False, print_code=True, warmup_rounds=2,
        dump_binary=False,
        fills_entire_output=None, do_check=True, check_result=None,
        max_test_kernel_count=1,
        quiet=False, blacklist_ref_vendors=[]):
    """Compare results of `ref_knl` to the kernels generated by
    scheduling *test_knl*.

    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a tuple (:class:`bool`,
        message) indicating correctness/acceptability of the result
    :arg max_test_kernel_count: Stop testing after this many schedules of
        *test_knl* have been tried
    """

    import pyopencl as cl

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i+1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i+1))

    from loopy.compiled import CompiledKernel, get_highlighted_cl_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    from time import time

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated", DeprecationWarning, stacklevel=2)

    # {{{ compile and run reference code

    from loopy.preprocess import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors):
        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(ref_ctx,
                properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        for knl in lp.generate_loop_schedules(pp_ref_knl):
            ref_sched_kernel = knl
            break

        logger.info("%s (ref): trying %s for the reference calculation" % (
            ref_knl.name, dev))

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75*"-")
            print("Reference Code:")
            print(75*"-")
            print(get_highlighted_cl_code(ref_compiled.code))
            print(75*"-")

        ref_cl_kernel_info = ref_compiled.cl_kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_cl_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                import traceback
                ref_errors.append("\n".join([
                    75*"-",
                    "On %s:" % dev,
                    75*"-",
                    traceback.format_exc(),
                    75*"-"]))

                continue
            else:
                raise

        found_ref_device = True

        if not do_check:
            break

        ref_queue.finish()

        logger.info("%s (ref): using %s for the reference calculation" % (
            ref_knl.name, dev))
        logger.info("%s (ref): run" % ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop-ref_start

        logger.info("%s (ref): run done" % ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                "reference computation.\n"
                "These errors were encountered:\n"+"\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    args = None
    from loopy.kernel import kernel_state
    if test_knl.state not in [
            kernel_state.PREPROCESSED,
            kernel_state.SCHEDULED]:
        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    test_kernel_count = 0

    from loopy.preprocess import infer_unknown_types
    for i, kernel in enumerate(test_kernels):
        test_kernel_count += 1
        if test_kernel_count > max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        if args is None:
            cl_kernel_info = compiled.cl_kernel_info(frozenset())

            args = make_args(kernel,
                    cl_kernel_info.implemented_data_info,
                    queue, ref_arg_data, parameters)
        args["out_host"] = False

        if not quiet:
            print(75*"-")
            print("Kernel #%d:" % i)
            print(75*"-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75*"-")
            if dump_binary:
                print(type(compiled.cl_program))
                print(compiled.cl_program.binaries[0])
                print(75*"-")

        logger.info("%s: run warmup" % (knl.name))

        for i in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    from pyopencl.compyte.array import as_strided
                    ref_ary = as_strided(
                            arg_desc.ref_storage_array.get(),
                            shape=arg_desc.ref_shape,
                            strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                            arg_desc.test_storage_array.get(),
                            shape=arg_desc.test_shape,
                            strides=arg_desc.test_numpy_strides).flatten()
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                    need_check = False

        events = []
        queue.finish()

        logger.info("%s: warmup done" % (knl.name))

        logger.info("%s: timing run" % (knl.name))

        timing_rounds = warmup_rounds

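        # Adaptively lengthen the timing run: keep quadrupling timing_rounds
        # until the aggregate wall time reaches roughly 0.3 s.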
        while True:
            from time import time
            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for i in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9*evt_end.profile.START
                            - 1e-9*evt_start.profile.START)
                        / timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time-start_time)/timing_rounds

            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done" % (knl.name))

        rates = ""
        for cnt, lbl in zip(op_count, op_label):
            rates += " %g %s/s" % (cnt/elapsed_wall, lbl)

        if not quiet:
            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                    "(%d rounds)%s" % (
                        format_float_or_none(elapsed_event),
                        format_float_or_none(elapsed_event_marker),
                        format_float_or_none(elapsed_wall), timing_rounds, rates))

        if do_check:
            ref_rates = ""
            for cnt, lbl in zip(op_count, op_label):
                ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl)
            if not quiet:
                print("ref: elapsed: %g s event, %g s wall%s" % (
                        ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict
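A hedged usage sketch of `auto_test_vs_ref`; the PyOpenCL context, the kernels and the tolerance are assumptions made for illustration, not taken from the listing above.

import numpy as np
import numpy.linalg as la
import pyopencl as cl

def relative_l2_check(result, reference_result):
    # Hypothetical acceptance test: relative l2 error below 1e-10.
    err = la.norm(result - reference_result) / la.norm(reference_result)
    if err < 1e-10:
        return True, None
    return False, "relative error too large: %g" % err

ctx = cl.create_some_context()
# ref_knl and test_knl are assumed to have been built elsewhere.
auto_test_vs_ref(
        ref_knl, ctx, test_knl,
        parameters={"n": 1024},
        check_result=relative_l2_check,
        print_code=False)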
Example no. 23
def generate_code_v2(kernel):
    """
    :returns: a :class:`CodeGenerationResult`
    """

    from loopy.kernel import kernel_state
    if kernel.state == kernel_state.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        input_kernel = kernel
        try:
            result = code_gen_cache[input_kernel]
            logger.info("%s: code generation cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    from loopy.preprocess import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    implemented_data_info = []

    for arg in kernel.args:
        is_written = arg.name in kernel.get_written_variables()
        if isinstance(arg, ArrayBase):
            implemented_data_info.extend(
                    arg.decl_info(
                        kernel.target,
                        is_written=is_written,
                        index_dtype=kernel.index_dtype))

        elif isinstance(arg, ValueArg):
            implemented_data_info.append(ImplementedDataInfo(
                target=kernel.target,
                name=arg.name,
                dtype=arg.dtype,
                arg_class=ValueArg,
                is_written=is_written))

        else:
            raise ValueError("argument type not understood: '%s'" % type(arg))

    allow_complex = False
    for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
        if var.dtype.involves_complex():
            allow_complex = True

    # }}}

    seen_dtypes = set()
    seen_functions = set()
    seen_atomic_dtypes = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_data_info=implemented_data_info,
            implemented_domain=initial_implemented_domain,
            implemented_predicates=frozenset(),
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            seen_atomic_dtypes=seen_atomic_dtypes,
            var_subst_map={},
            allow_complex=allow_complex,
            var_name_generator=kernel.get_var_name_generator(),
            is_generating_device_code=False,
            gen_program_name=(
                kernel.target.host_program_name_prefix
                + kernel.name
                + kernel.target.host_program_name_suffix),
            schedule_index_end=len(kernel.schedule))

    from loopy.codegen.result import generate_host_or_device_program
    codegen_result = generate_host_or_device_program(
            codegen_state,
            schedule_index=0)

    device_code_str = codegen_result.device_code()

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, codegen_result.implemented_domains,
            device_code_str)

    # {{{ handle preambles

    for arg in kernel.args:
        seen_dtypes.add(arg.dtype)
    for tv in six.itervalues(kernel.temporary_variables):
        seen_dtypes.add(tv.dtype)

    preambles = kernel.preambles[:]

    from pytools import Record

    class PreambleInfo(Record):
        pass

    preamble_info = PreambleInfo(
            kernel=kernel,
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            # a set of LoopyTypes (!)
            seen_atomic_dtypes=seen_atomic_dtypes)

    preamble_generators = (kernel.preamble_generators
            + kernel.target.get_device_ast_builder().preamble_generators())
    for prea_gen in preamble_generators:
        preambles.extend(prea_gen(preamble_info))

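    # Deduplicate preambles by tag; sorting by tag keeps the emitted
    # preamble order deterministic.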
    seen_preamble_tags = set()
    dedup_preambles = []

    for tag, preamble in sorted(preambles, key=lambda tag_code: tag_code[0]):
        if tag in seen_preamble_tags:
            continue

        seen_preamble_tags.add(tag)
        dedup_preambles.append(preamble)

    from loopy.tools import remove_common_indentation
    preamble_codes = [
            remove_common_indentation(lines) + "\n"
            for lines in dedup_preambles]

    codegen_result = codegen_result.copy(
            device_preambles=preamble_codes)

    # }}}

    logger.info("%s: generate code: done" % kernel.name)

    if CACHING_ENABLED:
        code_gen_cache[input_kernel] = codegen_result

    return codegen_result
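A hedged sketch of calling the public `lp.generate_code_v2` entry point on a small, illustrative kernel (in typical use, code is generated on demand when the kernel is executed); the kernel contents and dtypes are assumptions.

import numpy as np
import loopy as lp

# Illustrative kernel; assumes the default target is acceptable.
knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "out[i] = a[i] + 1")
knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

codegen_result = lp.generate_code_v2(knl)
print(codegen_result.device_code())  # generated device source as a string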