Example #1
0
def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
    """Return a dictionary mapping ``(var_name, direction)`` to
    :class:`islpy.PwQPolynomial` instances capturing the number of bytes  are
    read/written (where *direction* is either ``read`` or ``write`` on array
    *var_name*

    :arg ignore_uncountable: If *True*, an error will be raised for
        accesses on which the footprint cannot be determined (e.g.
        data-dependent or nonlinear indices)
    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.kernel import kernel_state
    if kernel.state < kernel_state.PREPROCESSED:
        kernel = preprocess_kernel(kernel)

    result = {}
    fp = gather_access_footprints(kernel, ignore_uncountable=ignore_uncountable)

    for key, var_fp in fp.items():
        vname, direction = key

        var_descr = kernel.get_var_descriptor(vname)
        bytes_transferred = (
                int(var_descr.dtype.numpy_dtype.itemsize)
                * count(kernel, var_fp))
        if key in result:
            result[key] += bytes_transferred
        else:
            result[key] = bytes_transferred

    return result
Example #2
0
    def get_typed_and_scheduled_kernel(self, var_to_dtype_set):
        kernel = self.kernel

        from loopy.kernel.tools import add_dtypes

        if var_to_dtype_set:
            var_to_dtype = {}
            for var, dtype in var_to_dtype_set:
                try:
                    dest_name = kernel.impl_arg_to_arg[var].name
                except KeyError:
                    dest_name = var

                try:
                    var_to_dtype[dest_name] = dtype
                except KeyError:
                    raise LoopyError("cannot set type for '%s': "
                            "no known variable/argument with that name"
                            % var)

            kernel = add_dtypes(kernel, var_to_dtype)

            from loopy.preprocess import infer_unknown_types
            kernel = infer_unknown_types(kernel, expect_completion=True)

        if kernel.schedule is None:
            from loopy.preprocess import preprocess_kernel
            kernel = preprocess_kernel(kernel)

            from loopy.schedule import get_one_scheduled_kernel
            kernel = get_one_scheduled_kernel(kernel)

        return kernel
Example #3
0
def test_kernel_splitting_with_loop(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i,k]: 0<=i<n and 0<=k<3 }",
            """
            c[k,i] = a[k, i + 1]
            out[k,i] = c[k,i]
            """)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    knl = preprocess_kernel(knl)

    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(knl)

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
Example #4
0
    def get_typed_and_scheduled_kernel(self, var_to_dtype_set):
        kernel = self.kernel

        from loopy.kernel.tools import add_dtypes

        if var_to_dtype_set:
            var_to_dtype = {}
            for var, dtype in var_to_dtype_set:
                try:
                    dest_name = kernel.impl_arg_to_arg[var].name
                except KeyError:
                    dest_name = var

                try:
                    var_to_dtype[dest_name] = dtype
                except KeyError:
                    raise LoopyError("cannot set type for '%s': "
                            "no known variable/argument with that name"
                            % var)

            kernel = add_dtypes(kernel, var_to_dtype)

            from loopy.preprocess import infer_unknown_types
            kernel = infer_unknown_types(kernel, expect_completion=True)

        if kernel.schedule is None:
            from loopy.preprocess import preprocess_kernel
            kernel = preprocess_kernel(kernel)

            from loopy.schedule import get_one_scheduled_kernel
            kernel = get_one_scheduled_kernel(kernel)

        return kernel
Example #5
0
def get_op_poly(knl, numpy_types=True):

    """Count the number of operations in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted.

    :return: A mapping of **{(** *type* **,** :class:`string` **)**
             **:** :class:`islpy.PwQPolynomial` **}**.

             - The *type* specifies the type of the data being
               accessed. This can be a :class:`numpy.dtype` if
               *numpy_types* is True, otherwise the internal
               loopy type.

             - The string specifies the operation type as
               *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.

             - The :class:`islpy.PwQPolynomial` holds the number of operations of
               the kind specified in the key (in terms of the
               :class:`loopy.LoopKernel` *parameter inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        poly = get_op_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
        f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)

        # (now use these counts to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    op_poly = ToCountMap()
    op_counter = ExpressionOpCounter(knl)
    for insn in knl.instructions:
        # how many times is this instruction executed?
        # check domain size:
        insn_inames = knl.insn_inames(insn)
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
        ops = op_counter(insn.assignee) + op_counter(insn.expression)
        op_poly = op_poly + ops*count(knl, domain)
    result = op_poly.dict

    if numpy_types:
        result = dict(
                ((dtype.numpy_dtype, kind), count)
                for (dtype, kind), count in six.iteritems(result))

    return result
Example #6
0
def get_op_poly(knl, numpy_types=True):

    """Count the number of operations in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted.

    :return: A mapping of **{(** *type* **,** :class:`string` **)**
             **:** :class:`islpy.PwQPolynomial` **}**.

             - The *type* specifies the type of the data being
               accessed. This can be a :class:`numpy.dtype` if
               *numpy_types* is True, otherwise the internal
               loopy type.

             - The string specifies the operation type as
               *add*, *sub*, *mul*, *div*, *pow*, *shift*, *bw* (bitwise), etc.

             - The :class:`islpy.PwQPolynomial` holds the number of operations of
               the kind specified in the key (in terms of the
               :class:`loopy.LoopKernel` *parameter inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        poly = get_op_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
        f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)

        # (now use these counts to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    op_poly = ToCountMap()
    op_counter = ExpressionOpCounter(knl)
    for insn in knl.instructions:
        # how many times is this instruction executed?
        # check domain size:
        insn_inames = knl.insn_inames(insn)
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
        ops = op_counter(insn.assignee) + op_counter(insn.expression)
        op_poly = op_poly + ops*count(knl, domain)
    result = op_poly.dict

    if numpy_types:
        result = dict(
                ((dtype.numpy_dtype, kind), count)
                for (dtype, kind), count in six.iteritems(result))

    return result
Example #7
0
def gather_access_footprints(kernel, ignore_uncountable=False):
    """Return a dictionary mapping ``(var_name, direction)``
    to :class:`islpy.Set` instances capturing which indices
    of each the array *var_name* are read/written (where
    *direction* is either ``read`` or ``write``.

    :arg ignore_uncountable: If *True*, an error will be raised for
        accesses on which the footprint cannot be determined (e.g.
        data-dependent or nonlinear indices)
    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)
    kernel = preprocess_kernel(kernel)

    write_footprints = []
    read_footprints = []

    for insn in kernel.instructions:
        if not isinstance(insn, MultiAssignmentBase):
            warn(
                kernel, "count_non_assignment",
                "Non-assignment instruction encountered in "
                "gather_access_footprints, not counted")
            continue

        insn_inames = kernel.insn_inames(insn)
        inames_domain = kernel.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames,
                                                   [dim_type.set]))

        afg = AccessFootprintGatherer(kernel,
                                      domain,
                                      ignore_uncountable=ignore_uncountable)

        for assignee in insn.assignees:
            write_footprints.append(afg(insn.assignees))
        read_footprints.append(afg(insn.expression))

    write_footprints = AccessFootprintGatherer.combine(write_footprints)
    read_footprints = AccessFootprintGatherer.combine(read_footprints)

    result = {}

    for vname, footprint in six.iteritems(write_footprints):
        result[(vname, "write")] = footprint

    for vname, footprint in six.iteritems(read_footprints):
        result[(vname, "read")] = footprint

    return result
Example #8
0
def get_barrier_poly(knl):

    """Count the number of barriers each thread encounters in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.

    :return: An :class:`islpy.PwQPolynomial` holding the number of barrier calls
             made (in terms of the :class:`loopy.LoopKernel` *inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        barrier_poly = get_barrier_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        barrier_count = barrier_poly.eval_with_dict(params)

        # (now use this count to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import EnterLoop, LeaveLoop, Barrier
    from operator import mul
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    iname_list = []
    barrier_poly = isl.PwQPolynomial('{ 0 }')

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.append(sched_item.iname)
        elif isinstance(sched_item, LeaveLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.pop()
        elif isinstance(sched_item, Barrier):
            if iname_list:  # (if iname_list is not empty)
                ct = (count(knl, (
                                knl.get_inames_domain(iname_list).
                                project_out_except(iname_list, [dim_type.set])
                                )), )
                barrier_poly += reduce(mul, ct)
            else:
                barrier_poly += isl.PwQPolynomial('{ 1 }')

    return barrier_poly
Example #9
0
def gather_access_footprints(kernel, ignore_uncountable=False):
    """Return a dictionary mapping ``(var_name, direction)``
    to :class:`islpy.Set` instances capturing which indices
    of each the array *var_name* are read/written (where
    *direction* is either ``read`` or ``write``.

    :arg ignore_uncountable: If *True*, an error will be raised for
        accesses on which the footprint cannot be determined (e.g.
        data-dependent or nonlinear indices)
    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)
    kernel = preprocess_kernel(kernel)

    write_footprints = []
    read_footprints = []

    for insn in kernel.instructions:
        if not isinstance(insn, MultiAssignmentBase):
            warn(kernel, "count_non_assignment",
                    "Non-assignment instruction encountered in "
                    "gather_access_footprints, not counted")
            continue

        insn_inames = kernel.insn_inames(insn)
        inames_domain = kernel.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))

        afg = AccessFootprintGatherer(kernel, domain,
                ignore_uncountable=ignore_uncountable)

        for assignee in insn.assignees:
            write_footprints.append(afg(insn.assignees))
        read_footprints.append(afg(insn.expression))

    write_footprints = AccessFootprintGatherer.combine(write_footprints)
    read_footprints = AccessFootprintGatherer.combine(read_footprints)

    result = {}

    for vname, footprint in six.iteritems(write_footprints):
        result[(vname, "write")] = footprint

    for vname, footprint in six.iteritems(read_footprints):
        result[(vname, "read")] = footprint

    return result
Example #10
0
def get_op_poly(knl):
    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    op_poly = 0
    op_counter = ExpressionOpCounter(knl)
    for insn in knl.instructions:
        # how many times is this instruction executed?
        # check domain size:
        insn_inames = knl.insn_inames(insn)
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
        ops = op_counter(insn.expression)
        op_poly = op_poly + ops*count(knl, domain)
    return op_poly
Example #11
0
def estimate_regs_per_thread(knl):

    """Estimate registers per thread usage by a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose reg usage will be estimated.

    :return: An :class:`integer` holding an estimate for the number of registers
             used per thread. This number will most likely be too low, but will
             hopefully be consistantly too low by the same constant factor.

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import EnterLoop, LeaveLoop, Barrier, RunInstruction  # noqa
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    max_regs = 0
    block_reg_totals = [0]
    # counters to track nested sets of previously used iname+index combinations
    reg_counters = [RegisterUsageEstimator(knl)]

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            block_reg_totals.append(0)
            # start a new estimator
            reg_counters.append(RegisterUsageEstimator(knl))

        elif isinstance(sched_item, LeaveLoop):
            if block_reg_totals[-1] > max_regs:
                max_regs = block_reg_totals[-1]
            # pop to resume previous total
            block_reg_totals.pop()
            reg_counters.pop()

        elif isinstance(sched_item, RunInstruction):
            insn = knl.id_to_insn[sched_item.insn_id]
            block_reg_totals[-1] += reg_counters[-1](insn.assignee) + \
                                    reg_counters[-1](insn.expression)

    # finished looping, check outer block
    if block_reg_totals[-1] > max_regs:
        max_regs = block_reg_totals[-1]

    return max_regs
Example #12
0
def test_kernel_splitting_with_loop_and_private_temporary(ctx_factory):
    ctx = ctx_factory()

    pytest.xfail("spilling doesn't yet use local axes")

    knl = lp.make_kernel(
        "{ [i,k]: 0<=i<n and 0<=k<3 }", """
            <> t_private_scalar = a[k,i+1]
            <> t_private_array[i % 2] = a[k,i+1]
            c[k,i] = a[k,i+1]
            out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2]
            """)

    knl = lp.add_and_infer_dtypes(knl, {
        "a": np.float32,
        "c": np.float32,
        "out": np.float32,
        "n": np.int32
    })
    knl = lp.set_temporary_scope(knl, "t_private_scalar", "private")
    knl = lp.set_temporary_scope(knl, "t_private_array", "private")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    knl = preprocess_kernel(knl)

    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(knl)

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
Example #13
0
def test_kernel_splitting_with_loop(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{ [i,k]: 0<=i<n and 0<=k<3 }", """
            c[k,i] = a[k, i + 1]
            out[k,i] = c[k,i]
            """)

    knl = lp.add_and_infer_dtypes(knl, {
        "a": np.float32,
        "c": np.float32,
        "out": np.float32,
        "n": np.int32
    })

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    knl = preprocess_kernel(knl)

    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(knl)

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
Example #14
0
def get_synchronization_poly(knl):
    """Count the number of synchronization events each thread encounters in a
    loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.

    :return: A dictionary mapping each type of synchronization event to a
            :class:`islpy.PwQPolynomial` holding the number of such events
            per thread.

            Possible keys include ``barrier_local``, ``barrier_global``
            (if supported by the target) and ``kernel_launch``.

    Example usage::

        # (first create loopy kernel and specify array data types)

        barrier_poly = get_barrier_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        barrier_count = barrier_poly.eval_with_dict(params)

        # (now use this count to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, CallKernel,
                                ReturnFromKernel, RunInstruction)
    from operator import mul
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    iname_list = []

    result = ToCountMap()

    one = isl.PwQPolynomial('{ 1 }')

    def get_count_poly(iname_list):
        if iname_list:  # (if iname_list is not empty)
            ct = (count(knl,
                        (knl.get_inames_domain(iname_list).project_out_except(
                            iname_list, [dim_type.set]))), )
            return reduce(mul, ct)
        else:
            return one

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.append(sched_item.iname)
        elif isinstance(sched_item, LeaveLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.pop()

        elif isinstance(sched_item, Barrier):
            result = result + ToCountMap(
                {"barrier_%s" % sched_item.kind: get_count_poly(iname_list)})

        elif isinstance(sched_item, CallKernel):
            result = result + ToCountMap(
                {"kernel_launch": get_count_poly(iname_list)})

        elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)):
            pass

        else:
            raise LoopyError("unexpected schedule item: %s" %
                             type(sched_item).__name__)

    return result.dict
Example #15
0
def get_gmem_access_poly(knl,
                         numpy_types=True):  # for now just counting subscripts
    """Count the number of global memory accesses in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
                    counted.

    :return: A mapping of **{(** *type* **,** :class:`string` **,**
             :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**.

             - The *type* specifies the type of the data being
               accessed. This can be a :class:`numpy.dtype` if
               *numpy_types* is True, otherwise the internal
               loopy type.

             - The first string in the map key specifies the global memory
               access type as
               *consecutive*, *nonconsecutive*, or *uniform*.

             - The second string in the map key specifies the global memory
               access type as a
               *load*, or a *store*.

             - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses
               with the characteristics specified in the key (in terms of the
               :class:`loopy.LoopKernel` *inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        subscript_map = get_gmem_access_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}

        f32_uncoalesced_load = subscript_map.dict[
                            (np.dtype(np.float32), 'nonconsecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_load = subscript_map.dict[
                            (np.dtype(np.float32), 'consecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_store = subscript_map.dict[
                            (np.dtype(np.float32), 'consecutive', 'store')
                            ].eval_with_dict(params)

        # (now use these counts to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types

    class CacheHolder(object):
        pass

    cache_holder = CacheHolder()

    @memoize_in(cache_holder, "insn_count")
    def get_insn_count(knl, insn_inames, uniform=False):
        if uniform:
            from loopy.kernel.data import LocalIndexTag
            insn_inames = [
                iname for iname in insn_inames
                if not isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)
            ]
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames,
                                                   [dim_type.set]))
        return count(knl, domain)

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    subs_poly = ToCountMap()
    subscript_counter = GlobalSubscriptCounter(knl)
    for insn in knl.instructions:
        # count subscripts, distinguishing loads and stores
        subs_expr = subscript_counter(insn.expression)
        subs_expr = ToCountMap(
            dict((key + ("load", ), val)
                 for key, val in six.iteritems(subs_expr.dict)))
        subs_assignee = subscript_counter(insn.assignee)
        subs_assignee = ToCountMap(
            dict((key + ("store", ), val)
                 for key, val in six.iteritems(subs_assignee.dict)))

        insn_inames = knl.insn_inames(insn)

        # use count excluding local index tags for uniform accesses
        for key in subs_expr.dict:
            poly = ToCountMap({key: subs_expr.dict[key]})
            if key[1] == "uniform":
                subs_poly = subs_poly + poly * get_insn_count(
                    knl, insn_inames, True)
            else:
                subs_poly = subs_poly + poly * get_insn_count(knl, insn_inames)
        for key in subs_assignee.dict:
            poly = ToCountMap({key: subs_assignee.dict[key]})
            if key[1] == "uniform":
                subs_poly = subs_poly + poly * get_insn_count(
                    knl, insn_inames, True)
            else:
                subs_poly = subs_poly + poly * get_insn_count(knl, insn_inames)

    result = subs_poly.dict

    if numpy_types:
        result = dict(
            ((dtype.numpy_dtype, kind, direction), count)
            for (dtype, kind, direction), count in six.iteritems(result))

    return result
        """
        for i, k
            ... gbarrier
            c[k,i] = a[k, i + 1]
            ... gbarrier
            out[k,i] = c[k,i]
        end
        """, seq_dependencies=True)

# transform
knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
knl = lp.add_and_infer_dtypes(knl,
        {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})

# schedule
from loopy.preprocess import preprocess_kernel
knl = preprocess_kernel(knl)

from loopy.schedule import get_one_scheduled_kernel
knl = knl.with_kernel(get_one_scheduled_kernel(knl["loopy_kernel"],
                                               knl.callables_table))


# map schedule onto host or device
print(knl)

cgr = lp.generate_code_v2(knl)

print(cgr.device_code())
print(cgr.host_code())
Example #17
0
def get_synchronization_poly(knl):

    """Count the number of synchronization events each thread encounters in a
    loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted.

    :return: A dictionary mapping each type of synchronization event to a
            :class:`islpy.PwQPolynomial` holding the number of such events
            per thread.

            Possible keys include ``barrier_local``, ``barrier_global``
            (if supported by the target) and ``kernel_launch``.

    Example usage::

        # (first create loopy kernel and specify array data types)

        barrier_poly = get_barrier_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}
        barrier_count = barrier_poly.eval_with_dict(params)

        # (now use this count to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    from loopy.schedule import (EnterLoop, LeaveLoop, Barrier,
            CallKernel, ReturnFromKernel, RunInstruction)
    from operator import mul
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    iname_list = []

    result = ToCountMap()

    one = isl.PwQPolynomial('{ 1 }')

    def get_count_poly(iname_list):
        if iname_list:  # (if iname_list is not empty)
            ct = (count(knl, (
                            knl.get_inames_domain(iname_list).
                            project_out_except(iname_list, [dim_type.set])
                            )), )
            return reduce(mul, ct)
        else:
            return one

    for sched_item in knl.schedule:
        if isinstance(sched_item, EnterLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.append(sched_item.iname)
        elif isinstance(sched_item, LeaveLoop):
            if sched_item.iname:  # (if not empty)
                iname_list.pop()

        elif isinstance(sched_item, Barrier):
            result = result + ToCountMap(
                    {"barrier_%s" % sched_item.kind: get_count_poly(iname_list)})

        elif isinstance(sched_item, CallKernel):
            result = result + ToCountMap(
                    {"kernel_launch": get_count_poly(iname_list)})

        elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)):
            pass

        else:
            raise LoopyError("unexpected schedule item: %s"
                    % type(sched_item).__name__)

    return result.dict
Example #18
0
def get_gmem_access_poly(knl, numpy_types=True):  # for now just counting subscripts

    """Count the number of global memory accesses in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
                    counted.

    :return: A mapping of **{(** *type* **,** :class:`string` **,**
             :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**.

             - The *type* specifies the type of the data being
               accessed. This can be a :class:`numpy.dtype` if
               *numpy_types* is True, otherwise the internal
               loopy type.

             - The first string in the map key specifies the global memory
               access type as
               *consecutive*, *nonconsecutive*, or *uniform*.

             - The second string in the map key specifies the global memory
               access type as a
               *load*, or a *store*.

             - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses
               with the characteristics specified in the key (in terms of the
               :class:`loopy.LoopKernel` *inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        subscript_map = get_gmem_access_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}

        f32_uncoalesced_load = subscript_map.dict[
                            (np.dtype(np.float32), 'nonconsecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_load = subscript_map.dict[
                            (np.dtype(np.float32), 'consecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_store = subscript_map.dict[
                            (np.dtype(np.float32), 'consecutive', 'store')
                            ].eval_with_dict(params)

        # (now use these counts to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types

    class CacheHolder(object):
        pass

    cache_holder = CacheHolder()

    @memoize_in(cache_holder, "insn_count")
    def get_insn_count(knl, insn_inames, uniform=False):
        if uniform:
            from loopy.kernel.data import LocalIndexTag
            insn_inames = [iname for iname in insn_inames if not
                           isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)]
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(
                                insn_inames, [dim_type.set]))
        return count(knl, domain)

    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    subs_poly = ToCountMap()
    subscript_counter = GlobalSubscriptCounter(knl)
    for insn in knl.instructions:
        # count subscripts, distinguishing loads and stores
        subs_expr = subscript_counter(insn.expression)
        subs_expr = ToCountMap(dict(
            (key + ("load",), val)
            for key, val in six.iteritems(subs_expr.dict)))
        subs_assignee = subscript_counter(insn.assignee)
        subs_assignee = ToCountMap(dict(
            (key + ("store",), val)
            for key, val in six.iteritems(subs_assignee.dict)))

        insn_inames = knl.insn_inames(insn)

        # use count excluding local index tags for uniform accesses
        for key in subs_expr.dict:
            poly = ToCountMap({key: subs_expr.dict[key]})
            if key[1] == "uniform":
                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
            else:
                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)
        for key in subs_assignee.dict:
            poly = ToCountMap({key: subs_assignee.dict[key]})
            if key[1] == "uniform":
                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames, True)
            else:
                subs_poly = subs_poly + poly*get_insn_count(knl, insn_inames)

    result = subs_poly.dict

    if numpy_types:
        result = dict(
                ((dtype.numpy_dtype, kind, direction), count)
                for (dtype, kind, direction), count in six.iteritems(result))

    return result
Example #19
0
def generate_code_v2(kernel):
    """
    :returns: a :class:`CodeGenerationResult`
    """

    from loopy.kernel import KernelState
    if kernel.state == KernelState.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != KernelState.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        input_kernel = kernel
        try:
            result = code_gen_cache[input_kernel]
            logger.debug("%s: code generation cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    from loopy.type_inference import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    implemented_data_info = []

    for arg in kernel.args:
        is_written = arg.name in kernel.get_written_variables()
        if isinstance(arg, ArrayBase):
            implemented_data_info.extend(
                    arg.decl_info(
                        kernel.target,
                        is_written=is_written,
                        index_dtype=kernel.index_dtype))

        elif isinstance(arg, ValueArg):
            implemented_data_info.append(ImplementedDataInfo(
                target=kernel.target,
                name=arg.name,
                dtype=arg.dtype,
                arg_class=ValueArg,
                is_written=is_written))

        else:
            raise ValueError("argument type not understood: '%s'" % type(arg))

    allow_complex = False
    for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
        if var.dtype.involves_complex():
            allow_complex = True

    # }}}

    seen_dtypes = set()
    seen_functions = set()
    seen_atomic_dtypes = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_data_info=implemented_data_info,
            implemented_domain=initial_implemented_domain,
            implemented_predicates=frozenset(),
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            seen_atomic_dtypes=seen_atomic_dtypes,
            var_subst_map={},
            allow_complex=allow_complex,
            var_name_generator=kernel.get_var_name_generator(),
            is_generating_device_code=False,
            gen_program_name=(
                kernel.target.host_program_name_prefix
                + kernel.name
                + kernel.target.host_program_name_suffix),
            schedule_index_end=len(kernel.schedule))

    from loopy.codegen.result import generate_host_or_device_program
    codegen_result = generate_host_or_device_program(
            codegen_state,
            schedule_index=0)

    device_code_str = codegen_result.device_code()

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, codegen_result.implemented_domains,
            device_code_str)

    # {{{ handle preambles

    for arg in kernel.args:
        seen_dtypes.add(arg.dtype)
    for tv in six.itervalues(kernel.temporary_variables):
        seen_dtypes.add(tv.dtype)

    preambles = kernel.preambles[:]

    preamble_info = PreambleInfo(
            kernel=kernel,
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            # a set of LoopyTypes (!)
            seen_atomic_dtypes=seen_atomic_dtypes,
            codegen_state=codegen_state
            )

    preamble_generators = (kernel.preamble_generators
            + kernel.target.get_device_ast_builder().preamble_generators())
    for prea_gen in preamble_generators:
        preambles.extend(prea_gen(preamble_info))

    codegen_result = codegen_result.copy(device_preambles=preambles)

    # }}}

    # For faster unpickling in the common case when implemented_domains isn't needed.
    from loopy.tools import LazilyUnpicklingDict
    codegen_result = codegen_result.copy(
            implemented_domains=LazilyUnpicklingDict(
                    codegen_result.implemented_domains))

    logger.info("%s: generate code: done" % kernel.name)

    if CACHING_ENABLED:
        code_gen_cache.store_if_not_present(input_kernel, codegen_result)

    return codegen_result
Example #20
0
def get_gmem_access_poly(knl):  # for now just counting subscripts

    """Count the number of global memory accesses in a loopy kernel.

    :parameter knl: A :class:`loopy.LoopKernel` whose DRAM accesses are to be
                    counted.

    :return: A mapping of **{(** :class:`numpy.dtype` **,** :class:`string` **,**
             :class:`string` **)** **:** :class:`islpy.PwQPolynomial` **}**.

             - The :class:`numpy.dtype` specifies the type of the data being
               accessed.

             - The first string in the map key specifies the global memory
               access type as
               *consecutive*, *nonconsecutive*, or *uniform*.

             - The second string in the map key specifies the global memory
               access type as a
               *load*, or a *store*.

             - The :class:`islpy.PwQPolynomial` holds the number of DRAM accesses
               with the characteristics specified in the key (in terms of the
               :class:`loopy.LoopKernel` *inames*).

    Example usage::

        # (first create loopy kernel and specify array data types)

        subscript_map = get_gmem_access_poly(knl)
        params = {'n': 512, 'm': 256, 'l': 128}

        f32_uncoalesced_load = subscript_map.dict[
                            (np.dtype(np.float32), 'nonconsecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_load = subscript_map.dict[
                            (np.dtype(np.float32), 'consecutive', 'load')
                            ].eval_with_dict(params)
        f32_coalesced_store = subscript_map.dict[
                            (np.dtype(np.float32), 'consecutive', 'store')
                            ].eval_with_dict(params)

        # (now use these counts to predict performance)

    """

    from loopy.preprocess import preprocess_kernel, infer_unknown_types
    knl = infer_unknown_types(knl, expect_completion=True)
    knl = preprocess_kernel(knl)

    subs_poly = ToCountMap()
    subscript_counter = GlobalSubscriptCounter(knl)
    for insn in knl.instructions:
        insn_inames = knl.insn_inames(insn)
        inames_domain = knl.get_inames_domain(insn_inames)
        domain = (inames_domain.project_out_except(insn_inames, [dim_type.set]))
        subs_expr = subscript_counter(insn.expression)
        subs_expr = ToCountMap(dict(
            (key + ("load",), val)
            for key, val in six.iteritems(subs_expr.dict)))

        subs_assignee = subscript_counter(insn.assignee)
        subs_assignee = ToCountMap(dict(
            (key + ("store",), val)
            for key, val in six.iteritems(subs_assignee.dict)))

        subs_poly = subs_poly + (subs_expr + subs_assignee)*count(knl, domain)
    return subs_poly.dict
Example #21
0
def generate_code_v2(kernel):
    """
    :returns: a :class:`CodeGenerationResult`
    """

    from loopy.kernel import kernel_state
    if kernel.state == kernel_state.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != kernel_state.SCHEDULED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        input_kernel = kernel
        try:
            result = code_gen_cache[input_kernel]
            logger.debug("%s: code generation cache hit" % kernel.name)
            return result
        except KeyError:
            pass

    # }}}

    from loopy.type_inference import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    logger.info("%s: generate code: start" % kernel.name)

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    implemented_data_info = []

    for arg in kernel.args:
        is_written = arg.name in kernel.get_written_variables()
        if isinstance(arg, ArrayBase):
            implemented_data_info.extend(
                    arg.decl_info(
                        kernel.target,
                        is_written=is_written,
                        index_dtype=kernel.index_dtype))

        elif isinstance(arg, ValueArg):
            implemented_data_info.append(ImplementedDataInfo(
                target=kernel.target,
                name=arg.name,
                dtype=arg.dtype,
                arg_class=ValueArg,
                is_written=is_written))

        else:
            raise ValueError("argument type not understood: '%s'" % type(arg))

    allow_complex = False
    for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
        if var.dtype.involves_complex():
            allow_complex = True

    # }}}

    seen_dtypes = set()
    seen_functions = set()
    seen_atomic_dtypes = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
    codegen_state = CodeGenerationState(
            kernel=kernel,
            implemented_data_info=implemented_data_info,
            implemented_domain=initial_implemented_domain,
            implemented_predicates=frozenset(),
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            seen_atomic_dtypes=seen_atomic_dtypes,
            var_subst_map={},
            allow_complex=allow_complex,
            var_name_generator=kernel.get_var_name_generator(),
            is_generating_device_code=False,
            gen_program_name=(
                kernel.target.host_program_name_prefix
                + kernel.name
                + kernel.target.host_program_name_suffix),
            schedule_index_end=len(kernel.schedule))

    from loopy.codegen.result import generate_host_or_device_program
    codegen_result = generate_host_or_device_program(
            codegen_state,
            schedule_index=0)

    device_code_str = codegen_result.device_code()

    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel, codegen_result.implemented_domains,
            device_code_str)

    # {{{ handle preambles

    for arg in kernel.args:
        seen_dtypes.add(arg.dtype)
    for tv in six.itervalues(kernel.temporary_variables):
        seen_dtypes.add(tv.dtype)

    preambles = kernel.preambles[:]

    preamble_info = PreambleInfo(
            kernel=kernel,
            seen_dtypes=seen_dtypes,
            seen_functions=seen_functions,
            # a set of LoopyTypes (!)
            seen_atomic_dtypes=seen_atomic_dtypes)

    preamble_generators = (kernel.preamble_generators
            + kernel.target.get_device_ast_builder().preamble_generators())
    for prea_gen in preamble_generators:
        preambles.extend(prea_gen(preamble_info))

    codegen_result = codegen_result.copy(device_preambles=preambles)

    # }}}

    logger.info("%s: generate code: done" % kernel.name)

    if CACHING_ENABLED:
        code_gen_cache[input_kernel] = codegen_result

    return codegen_result
Example #22
0
knl = lp.make_kernel(
        "{ [i,k]: 0<=i<n and 0<=k<3 }",
        """
        for i, k
            ... gbarrier
            c[k,i] = a[k, i + 1]
            ... gbarrier
            out[k,i] = c[k,i]
        end
        """, seq_dependencies=True)

# transform
knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
knl = lp.add_and_infer_dtypes(knl,
        {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})

# schedule
from loopy.preprocess import preprocess_kernel
knl = preprocess_kernel(knl)

from loopy.schedule import get_one_scheduled_kernel
knl = get_one_scheduled_kernel(knl)

# map schedule onto host or device
print(knl)

cgr = lp.generate_code_v2(knl)

print(cgr.device_code())
print(cgr.host_code())