コード例 #1
0
ファイル: test_domain.py プロジェクト: inducer/loopy
def test_dependent_loop_bounds_4():
    """Generate code for nested loops whose bounds are set inside outer loops.

    The bounds of ``b`` and ``c`` (``b_start``/``b_end``, ``c_start``/``c_end``)
    are assigned inside the enclosing loops.
    See https://gitlab.tiker.net/inducer/loopy/issues/23
    """
    import loopy as lp

    domains = [
        "{[a]: 0<=a<10}",
        "{[b]: b_start<=b<b_end}",
        "{[c,idim]: c_start<=c<c_end and 0<=idim<dim}",
    ]
    instructions = """
        for a
         <> b_start = 1
         <> b_end = 2
         for b
          <> c_start = 1
          <> c_end = 2

          for c
           ... nop
          end

          <>t[idim] = 1
         end
        end
        """

    kernel = lp.make_kernel(
        domains, instructions, "...", seq_dependencies=True)
    kernel = lp.fix_parameters(kernel, dim=3)

    # The test only requires that code generation completes under
    # lp.CacheMode(False); the generated code itself is not inspected.
    with lp.CacheMode(False):
        lp.generate_code_v2(kernel)
コード例 #2
0
ファイル: test_target.py プロジェクト: inducer/loopy
def test_ispc_streaming_stores():
    """Build a stream-triad kernel for the ISPC target and generate its code.

    The single instruction is tagged ``!streaming_store`` before the kernel is
    preprocessed, scheduled and passed to code generation.
    """
    stream_dtype = np.float32
    index_dtype = np.int32

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "a[i] = b[i] + scalar * c[i]",
        target=lp.ISPCTarget(),
        index_dtype=index_dtype,
        name="stream_triad")

    # Names of all kernel arguments that share the streaming dtype.
    arg_names = ["a", "b", "c", "scalar"]

    knl = lp.assume(knl, "n>0")
    knl = lp.split_iname(knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")
    knl = lp.tag_instructions(knl, "!streaming_store")

    knl = lp.add_and_infer_dtypes(
        knl, dict.fromkeys(arg_names, stream_dtype))
    knl = lp.set_argument_order(knl, arg_names + ["n"])

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    codegen_result = lp.generate_code_v2(knl)
    codegen_result.all_code()
コード例 #3
0
def test_check_bounds_with_caller_assumptions(ctx_factory):
    """Check that an assumption on the caller's ``N`` affects bounds checking.

    Without any assumption on ``N``, code generation is expected to raise
    ``LoopyIndexError``.  After ``N <= 20`` is assumed on the entrypoint, the
    program is run through ``lp.auto_test_vs_ref`` with ``N = 15``.
    """
    import islpy as isl
    from loopy.diagnostic import LoopyIndexError

    callee = lp.make_function(
        "{[i]: 0<=i<n}",
        """
        y[i] = i
        """,
        name="arange")

    caller_args = [
        lp.GlobalArg("Y", shape=(20, )),
        lp.ValueArg("N", dtype=np.int32),
    ]
    caller = lp.make_kernel(
        "{[i]: 0<=i<20}",
        """
        [i]: Y[i] = arange(N)
        """,
        caller_args,
        name="epoint")

    prog = lp.merge([caller, callee])

    # No bound on N yet: code generation must reject the program.
    with pytest.raises(LoopyIndexError):
        lp.generate_code_v2(prog)

    n_bound = isl.BasicSet("[N] -> { : N <= 20}")
    prog = prog.with_kernel(lp.assume(prog.default_entrypoint, n_bound))

    lp.auto_test_vs_ref(prog, ctx_factory(), parameters={"N": 15})
コード例 #4
0
ファイル: test_domain.py プロジェクト: connorjward/loopy
def test_dependent_loop_bounds_3(ctx_factory):
    """Test a domain dependency carried only by the ``row_len`` temporary.

    The point of this test is that it shows a dependency between domains that
    is exclusively mediated by ``row_len``.  It also makes sure that
    ``row_len`` gets read before any conditionals use it.  Finally, splitting
    ``jj`` with the ``g.1``/``l.1`` tags is expected to make code generation
    raise ``RuntimeError``.
    """
    dtype = np.dtype(np.float32)
    ctx = ctx_factory()

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    instructions = [
        "<> row_len = a_row_lengths[i]",
        "a[i,jj] = 1",
    ]
    kernel_data = [
        lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
        lp.GlobalArg("a", dtype, shape="n,n", order="C"),
        lp.ValueArg("n", np.int32),
    ]

    knl = lp.make_kernel(
        domains, instructions, kernel_data,
        target=lp.PyOpenCLTarget(ctx.devices[0]),
        name="loopy_kernel")

    # The jj domain (index 1) must report the i domain (index 0) as parent.
    parents = knl["loopy_kernel"].parents_per_domain()
    assert parents[1] == 0

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    device_code = lp.generate_code_v2(knl).device_code()
    print(device_code)

    knl_bad = lp.split_iname(
        knl, "jj", 128, outer_tag="g.1", inner_tag="l.1")

    with pytest.raises(RuntimeError):
        list(lp.generate_code_v2(knl_bad))
コード例 #5
0
ファイル: test_target.py プロジェクト: yueyedeai/loopy
def test_ispc_streaming_stores():
    """Generate ISPC code for a stream-triad kernel tagged ``!streaming_store``.

    The test passes when preprocessing, scheduling and code generation all
    complete; the generated code is not inspected.
    """
    stream_dtype = np.float32
    index_dtype = np.int32

    triad = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "a[i] = b[i] + scalar * c[i]",
        target=lp.ISPCTarget(),
        index_dtype=index_dtype,
        name="stream_triad")

    # Every one of these arguments is given the streaming dtype below.
    typed_args = ["a", "b", "c", "scalar"]

    triad = lp.assume(triad, "n>0")
    triad = lp.split_iname(
        triad, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
    triad = lp.split_iname(triad, "i_inner", 8, inner_tag="l.0")
    triad = lp.tag_instructions(triad, "!streaming_store")

    dtype_map = {name: stream_dtype for name in typed_args}
    triad = lp.add_and_infer_dtypes(triad, dtype_map)

    triad = lp.set_argument_order(triad, typed_args + ["n"])

    triad = lp.preprocess_kernel(triad)
    triad = lp.get_one_scheduled_kernel(triad)
    lp.generate_code_v2(triad).all_code()
コード例 #6
0
def test_dependent_loop_bounds_4():
    """Run code generation on loops with bounds assigned in enclosing loops.

    Reference: https://gitlab.tiker.net/inducer/loopy/issues/23
    """
    import loopy as lp

    loop_domains = [
        "{[a]: 0<=a<10}",
        "{[b]: b_start<=b<b_end}",
        "{[c,idim]: c_start<=c<c_end and 0<=idim<dim}",
    ]
    loop_body = """
        for a
         <> b_start = 1
         <> b_end = 2
         for b
          <> c_start = 1
          <> c_end = 2

          for c
           ... nop
          end

          <>t[idim] = 1
         end
        end
        """

    loopy_knl = lp.fix_parameters(
        lp.make_kernel(loop_domains, loop_body, "...",
                       seq_dependencies=True),
        dim=3)

    # Only successful completion of code generation is checked.
    with lp.CacheMode(False):
        lp.generate_code_v2(loopy_knl)
コード例 #7
0
def test_double_hw_axes_used_in_knl_call(inline):
    """Expect ``LoopyError`` when caller and callee both tag ``i`` as ``l.0``.

    :param inline: when true, the ``twice`` callee is inlined into the caller
        before code generation is attempted.
    """
    from loopy.diagnostic import LoopyError

    callee = lp.make_function(
        "{[i]: 0<=i<10}",
        """
            y[i] = 2*x[i]
            """,
        name="twice")

    caller_args = [
        lp.GlobalArg("x", shape=(10, 10), dtype=float),
        lp.GlobalArg("y", shape=(10, 10)),
    ]
    caller = lp.make_kernel(
        "{[i]: 0<=i<10}",
        """
            y[:, i] = twice(x[:, i])
            """,
        caller_args,
        name="outer")

    # Both kernels map their "i" iname onto the same hardware axis.
    callee = lp.tag_inames(callee, {"i": "l.0"})
    caller = lp.tag_inames(caller, {"i": "l.0"})
    prog = lp.merge([caller, callee])

    if inline:
        prog = lp.inline_callable_kernel(prog, "twice")

    with pytest.raises(LoopyError):
        lp.generate_code_v2(prog)
コード例 #8
0
def test_idempotency(form):
    """Compiling the same form twice must produce identical generated code.

    The comparison is made once on the default ``compile_form`` output (via
    ``ast.gencode()``) and once with ``coffee=False`` (via loopy device code).
    """
    first = compile_form(form)[0]
    second = compile_form(form)[0]

    assert first.ast.gencode() == second.ast.gencode()

    # Test loopy backend
    import loopy
    first = compile_form(form, coffee=False)[0]
    second = compile_form(form, coffee=False)[0]

    first_code = loopy.generate_code_v2(first.ast).device_code()
    second_code = loopy.generate_code_v2(second.ast).device_code()
    assert first_code == second_code
コード例 #9
0
def test_lpy_wide_array_splitter(opts):
    """Check shapes and indexing after ``array_splitter`` splits ``a1``/``a2``.

    The kernel loops use pre-split ``j_outer``/``j_inner`` inames.  After
    splitting, each argument's shape must equal ``asplit.split_shape`` of the
    saved original, and each assignee must be subscripted with the index
    tuple expected for ``opts.order``.
    """
    from pymbolic.primitives import Subscript, Variable

    asplit = array_splitter(opts)

    # create a test kernel
    arg1 = lp.GlobalArg('a1', shape=(10, 10), order=opts.order)
    arg2 = lp.GlobalArg('a2', shape=(16, 16), order=opts.order)

    n_outer = int(np.ceil(10 / VECTOR_WIDTH))
    domains = [
        '{[i]: 0 <= i < 10}',
        '{{[j_outer]: 0 <= j_outer < {}}}'.format(n_outer),
        '{{[j_inner]: 0 <= j_inner < {}}}'.format(VECTOR_WIDTH),
    ]
    k = lp.make_kernel(
        domains,
        """
        for i, j_outer, j_inner
            a1[j_outer, i] = 1 {id=a1}
            a2[j_outer, i] = 1 {id=a2}
        end
        """,
        [arg1, arg2],
        silenced_warnings=['no_device_in_pre_codegen_checks'],
        target=lp.OpenCLTarget())

    # Keep copies of the unsplit arguments to compute the expected shapes.
    a1_hold = k.arg_dict['a1'].copy()
    a2_hold = k.arg_dict['a2'].copy()
    k = asplit.split_loopy_arrays(k)
    inner_tag = 'vec' if opts.is_simd else 'l.0'
    k = lp.tag_inames(k, {'j_inner': inner_tag})

    # ensure there's no loopy errors
    lp.generate_code_v2(k).device_code()

    def expected_index():
        # Index tuple expected on the split arrays for the configured order.
        if opts.order == 'C':
            return (Variable('j_outer'), Variable('i'), Variable('j_inner'))
        return (Variable('j_inner'), Variable('j_outer'), Variable('i'))

    # a1 is checked first, then a2.
    for name, held in (('a1', a1_hold), ('a2', a2_hold)):
        arg = k.arg_dict[name]
        assert arg.shape == asplit.split_shape(held)[0]
        assignee = next(
            insn.assignee for insn in k.instructions if insn.id == name)
        assert isinstance(assignee, Subscript) \
            and assignee.index == expected_index()
コード例 #10
0
def test_lpy_deep_array_splitter(opts):
    """Check shapes and indexing after ``array_splitter`` splits ``a1``/``a2``.

    Here ``i`` is split by ``VECTOR_WIDTH`` with ``lp.split_iname`` before the
    arrays are split.  Afterwards each argument's shape must equal
    ``asplit.split_shape`` of the saved original, and each assignee must be
    subscripted with the index tuple expected for ``opts.order``.
    """
    from pymbolic.primitives import Subscript, Variable

    asplit = array_splitter(opts)

    # create a test kernel
    size = VECTOR_WIDTH * 3
    loop_bound = VECTOR_WIDTH * 2
    arg1 = lp.GlobalArg('a1', shape=(size, size), order=opts.order)
    arg2 = lp.GlobalArg('a2', shape=(16, 16), order=opts.order)

    domain = '{{[i]: 0 <= i < {}}}'.format(loop_bound)
    k = lp.make_kernel(
        domain,
        """
            a1[0, i] = 1 {id=a1}
            a2[0, i] = 1 {id=a2}
        """,
        [arg1, arg2],
        silenced_warnings=['no_device_in_pre_codegen_checks'],
        target=lp.OpenCLTarget())

    inner_tag = 'vec' if opts.is_simd else 'l.0'
    k = lp.split_iname(k, 'i', VECTOR_WIDTH, inner_tag=inner_tag)

    # Keep copies of the unsplit arguments to compute the expected shapes.
    a1_hold = k.arg_dict['a1'].copy()
    a2_hold = k.arg_dict['a2'].copy()
    k = asplit.split_loopy_arrays(k)

    # ensure there's no loopy errors
    lp.generate_code_v2(k).device_code()

    def expected_index():
        # Index tuple expected on the split arrays for the configured order.
        if opts.order == 'C':
            return (0, Variable('i_outer'), Variable('i_inner'))
        return (Variable('i_inner'), 0, Variable('i_outer'))

    # a1 is checked first, then a2.
    for name, held in (('a1', a1_hold), ('a2', a2_hold)):
        arg = k.arg_dict[name]
        assert arg.shape == asplit.split_shape(held)[0]
        assignee = next(
            insn.assignee for insn in k.instructions if insn.id == name)
        assert isinstance(assignee, Subscript) \
            and assignee.index == expected_index()
コード例 #11
0
ファイル: test_loopy.py プロジェクト: dokempf/loopy
def test_global_temporary(ctx_factory):
    """Test a kernel whose temporary ``c`` is given the "global" scope.

    Code generation must yield exactly two device programs, and the split
    kernel is compared against the unsplit one with ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    instructions = """
            <> c[i] = a[i + 1]
            out[i] = c[i]
            """
    knl = lp.make_kernel("{ [i]: 0<=i<n}", instructions)

    dtypes = {
        "a": np.float32,
        "c": np.float32,
        "out": np.float32,
        "n": np.int32,
    }
    knl = lp.add_and_infer_dtypes(knl, dtypes)
    knl = lp.set_temporary_scope(knl, "c", "global")

    # Untransformed kernel, used as the reference in the comparison below.
    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    cgr = lp.generate_code_v2(knl)
    n_device_programs = len(cgr.device_programs)
    assert n_device_programs == 2

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
コード例 #12
0
ファイル: test_loopy.py プロジェクト: dokempf/loopy
def test_global_temporary(ctx_factory):
    """Check code generation and results with temporary ``c`` scoped "global".

    Asserts that two device programs are generated, then compares the split
    kernel against the original via ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    original = lp.make_kernel(
        "{ [i]: 0<=i<n}",
        """
            <> c[i] = a[i + 1]
            out[i] = c[i]
            """)

    float_args = dict.fromkeys(("a", "c", "out"), np.float32)
    original = lp.add_and_infer_dtypes(
        original, dict(float_args, n=np.int32))
    original = lp.set_temporary_scope(original, "c", "global")

    # The unsplit kernel doubles as the reference implementation.
    ref_knl = original

    split = lp.split_iname(
        original, "i", 128, outer_tag="g.0", inner_tag="l.0")

    cgr = lp.generate_code_v2(split)

    assert len(cgr.device_programs) == 2

    lp.auto_test_vs_ref(ref_knl, ctx, split, parameters=dict(n=5))
コード例 #13
0
def test_c_execution_with_global_temporaries():
    """Run a bare ``ExecutableCTarget`` kernel with a global read-only temporary.

    Two things are checked: the text ``int b[<n>]`` does not appear in the
    generated host code, and executing the kernel copies the initializer of
    ``b`` (``arange(n)``) into ``a``.
    """
    from loopy.target.c import ExecutableCTarget
    AS = lp.AddressSpace  # noqa
    n = 10

    out_arg = lp.GlobalArg("a", shape=(n, ), dtype=np.int32)
    const_tmp = lp.TemporaryVariable(
        "b",
        shape=(n, ),
        initializer=np.arange(n, dtype=np.int32),
        dtype=np.int32,
        read_only=True,
        address_space=AS.GLOBAL)

    knl = lp.make_kernel(
        "{[i]: 0 <= i < n}",
        """
            a[i] = b[i]
        """,
        [out_arg, const_tmp],
        target=ExecutableCTarget())

    knl = lp.fix_parameters(knl, n=n)

    host_code = lp.generate_code_v2(knl).host_code()
    assert ("int b[%d]" % n) not in host_code

    # Element [1] of the call result is compared against arange(10).
    result = knl(a=np.zeros(10, dtype=np.int32))[1]
    assert np.allclose(result, np.arange(10))
コード例 #14
0
def test_missing_compilers():
    """Exercise C code generation and execution with missing or bad compilers.

    In order: device code is generated for a plain ``CTarget``; execution with
    an empty ``PATH`` must raise ``ExecError``; execution is repeated once
    ``PATH`` is restored; deleting ``toolchain.undefines`` must raise
    ``AttributeError``; a made-up compiler name ``"foo"`` is accepted by
    ``CCompiler`` but must raise ``ExecError`` when used.
    """
    from loopy.target.c import ExecutableCTarget, CTarget
    from loopy.target.c.c_execution import CCompiler
    from codepy.toolchain import GCCToolchain

    def build_and_eval(evalfunc, target, **targetargs):
        # Build the copy kernel for target(**targetargs), pass it to evalfunc.
        n = 10
        kernel_args = [
            lp.GlobalArg("a", shape=(n, ), dtype=np.int32),
            lp.GlobalArg("b", shape=(n, ), dtype=np.int32),
        ]
        knl = lp.make_kernel(
            "{[i]: 0 <= i < n}",
            """
                a[i] = b[i]
            """,
            kernel_args,
            target=target(**targetargs))
        return evalfunc(lp.fix_parameters(knl, n=n))

    def device_code_of(knl):
        return lp.generate_code_v2(knl).device_code()

    assert build_and_eval(device_code_of, CTarget)

    from pytools.prefork import ExecError

    def copies_input(knl):
        # True when running the kernel copies b = arange(10) into a.
        outputs = knl(a=np.zeros(10, dtype=np.int32),
                      b=np.arange(10, dtype=np.int32))
        return np.allclose(outputs[1], np.arange(10))

    import os
    saved_path = os.environ["PATH"]
    ccomp = None
    try:
        # test with path wiped out such that we can't find gcc
        with pytest.raises(ExecError):
            os.environ["PATH"] = ""
            ccomp = CCompiler()
            build_and_eval(copies_input, ExecutableCTarget, compiler=ccomp)
    finally:
        # make sure we restore the path
        os.environ["PATH"] = saved_path
        # and, with the path restored we should now be able to properly
        # execute with the default (non-guessed) toolchain!
        build_and_eval(copies_input, ExecutableCTarget, compiler=ccomp)

    # and test that we will fail if we remove a required attribute
    del ccomp.toolchain.undefines
    with pytest.raises(AttributeError):
        build_and_eval(copies_input, ExecutableCTarget, compiler=ccomp)

    # next test that some made up compiler can be specified
    ccomp = CCompiler(cc="foo")
    assert isinstance(ccomp.toolchain, GCCToolchain)
    assert ccomp.toolchain.cc == "foo"

    # and that said made up compiler errors out
    with pytest.raises(ExecError):
        build_and_eval(copies_input, ExecutableCTarget, compiler=ccomp)
コード例 #15
0
    def make_kernels(self, Vf, Vc):
        """
        Build prolongation and restriction kernels between arbitrary elements.

        Sets ``self.prolong_kernel`` and ``self.restrict_kernel``.  The
        restriction kernel is C code that fills the local matrix ``Afc`` with
        the generated ``expression_kernel`` and accumulates
        ``Afc[i, j] * Rf[i] * w[i]`` into ``Rc[j]``.

        This is temporary while we wait for structure-preserving tfsc kernels.
        """
        build_action = self.prolongation_transfer_kernel_action
        self.prolong_kernel = build_action(Vf, self.uc)
        matrix_kernel = build_action(Vf, firedrake.TestFunction(Vc))

        # The way we transpose the prolongation kernel is suboptimal.
        # A local matrix is generated each time the kernel is executed.
        generated = loopy.generate_code_v2(matrix_kernel.code)
        # Prefix the generated element kernel's definition with "static".
        element_kernel = generated.device_code().replace(
            "void expression_kernel", "static void expression_kernel")

        def value_dimension(V):
            # Element space dimension multiplied by the space's value size.
            return V.finat_element.space_dimension() * V.value_size

        dimc = value_dimension(Vc)
        dimf = value_dimension(Vf)
        restrict_code = f"""
        {element_kernel}

        void restriction({ScalarType_c} *restrict Rc, const {ScalarType_c} *restrict Rf, const {ScalarType_c} *restrict w)
        {{
            {ScalarType_c} Afc[{dimf}*{dimc}] = {{0}};
            expression_kernel(Afc);
            for ({IntType_c} i = 0; i < {dimf}; i++)
               for ({IntType_c} j = 0; j < {dimc}; j++)
                   Rc[j] += Afc[i*{dimc} + j] * Rf[i] * w[i];
        }}
        """
        self.restrict_kernel = op2.Kernel(restrict_code, "restriction")
コード例 #16
0
def test_recursive_nested_dependent_reduction(ctx_factory):
    """Print device code for nested sums with a data-dependent inner bound.

    The bound of the inner ``isrc`` reduction, ``npart``, is read from
    ``nparticles_per_box`` inside the ``isrc_box`` loop.  The test passes
    when code generation completes.
    """
    dtype = np.dtype(np.int32)
    ctx = ctx_factory()

    domains = [
        "{[itgt]: 0 <= itgt < ntgts}",
        "{[isrc_box]: 0 <= isrc_box < nboxes}",
        "{[isrc]: 0 <= isrc < npart}",
    ]
    instructions = """
            for itgt
                for isrc_box
                    <> npart = nparticles_per_box[isrc_box]
                    <> boxsum = sum(isrc, isrc+isrc_box+itgt)
                end
                a[itgt] = sum(isrc_box, boxsum)
            end
            """
    kernel_data = [
        lp.ValueArg("n", np.int32),
        lp.GlobalArg("a", dtype, ("n", )),
        lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes", )),
        lp.ValueArg("ntgts", np.int32),
        lp.ValueArg("nboxes", np.int32),
    ]

    knl = lp.make_kernel(
        domains, instructions, kernel_data,
        assumptions="ntgts>=1",
        target=lp.PyOpenCLTarget(ctx.devices[0]))

    device_code = lp.generate_code_v2(knl).device_code()
    print(device_code)
コード例 #17
0
def test_c_execution_with_global_temporaries():
    """Run a bare ``ExecutableCTarget`` kernel with a global read-only temporary.

    Two things are checked: the text ``int b[<n>]`` does not appear in the
    generated host code, and executing the kernel copies the initializer of
    ``b`` (``arange(n)``) into ``a``.
    """
    from loopy.target.c import ExecutableCTarget
    from loopy.kernel.data import temp_var_scope as scopes
    n = 10

    out_arg = lp.GlobalArg('a', shape=(n, ), dtype=np.int32)
    const_tmp = lp.TemporaryVariable(
        'b',
        shape=(n, ),
        initializer=np.arange(n, dtype=np.int32),
        dtype=np.int32,
        read_only=True,
        scope=scopes.GLOBAL)

    knl = lp.make_kernel(
        '{[i]: 0 <= i < n}',
        """
            a[i] = b[i]
        """,
        [out_arg, const_tmp],
        target=ExecutableCTarget())

    knl = lp.fix_parameters(knl, n=n)

    host_code = lp.generate_code_v2(knl).host_code()
    assert ('int b[%d]' % n) not in host_code

    # Element [1] of the call result is compared against arange(10).
    result = knl(a=np.zeros(10, dtype=np.int32))[1]
    assert np.allclose(result, np.arange(10))
コード例 #18
0
ファイル: test_scan.py プロジェクト: shwina/loopy
def test_automatic_scan_detection():
    """Assert that "scan" does not occur in the device code of this kernel.

    The reduction over ``j`` has a bound (``2*i``) that depends on ``i``.
    """
    domains = ["[n] -> {[i]: 0<=i<n}", "{[j]: 0<=j<=2*i}"]
    instructions = """
        a[i] = sum(j, j**2)
        """
    knl = lp.make_kernel(domains, instructions)

    device_code = lp.generate_code_v2(knl).device_code()
    assert "scan" not in device_code
コード例 #19
0
ファイル: sequential.py プロジェクト: OP2/PyOP2
def generate_single_cell_wrapper(iterset, args, forward_args=(), kernel_name=None, wrapper_name=None):
    """Generate the wrapper for a single cell.

    The wrapper has no iteration loop but still extracts cellwise data; the
    cell is expected as an argument to the wrapper.  For extruded, the
    numbering of the cells is columnwise continuous, bottom to top.

    :param iterset: The iteration set
    :param args: :class:`Arg`s
    :param forward_args: iterable of strings giving the C types of arguments
                         that the wrapper forwards unprocessed to the kernel.
    :param kernel_name: Kernel function name
    :param wrapper_name: Wrapper function name

    :return: string containing the C code for the single-cell wrapper
    """
    from pyop2.codegen.builder import WrapperBuilder
    from pyop2.codegen.rep2loopy import generate
    from loopy.types import OpaqueType

    # Each forwarded C type string is wrapped in a loopy OpaqueType.
    opaque_types = list(map(OpaqueType, forward_args))
    builder = WrapperBuilder(
        iterset=iterset,
        single_cell=True,
        forward_arg_types=opaque_types)

    for wrapper_arg in args:
        builder.add_argument(wrapper_arg)

    # The kernel is registered with an empty code string and the given name.
    builder.set_kernel(Kernel("", kernel_name))

    wrapper = generate(builder, wrapper_name)
    return loopy.generate_code_v2(wrapper).device_code()
コード例 #20
0
ファイル: test_loopy.py プロジェクト: dokempf/loopy
def test_kernel_splitting_with_loop(ctx_factory):
    """Check that code generation splits this kernel into two device programs.

    The kernel is preprocessed and scheduled explicitly before code
    generation; the result is then compared against the untransformed kernel
    with ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    instructions = """
            c[k,i] = a[k, i + 1]
            out[k,i] = c[k,i]
            """
    knl = lp.make_kernel("{ [i,k]: 0<=i<n and 0<=k<3 }", instructions)

    dtypes = {
        "a": np.float32,
        "c": np.float32,
        "out": np.float32,
        "n": np.int32,
    }
    knl = lp.add_and_infer_dtypes(knl, dtypes)

    # Untransformed kernel, used as the reference in the comparison below.
    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(preprocess_kernel(knl))

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)
    n_device_programs = len(cgr.device_programs)
    assert n_device_programs == 2

    for generated_code in (cgr.device_code(), cgr.host_code()):
        print(generated_code)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
コード例 #21
0
ファイル: sequential.py プロジェクト: JaroslavHron/PyOP2
def generate_single_cell_wrapper(iterset,
                                 args,
                                 forward_args=(),
                                 kernel_name=None,
                                 wrapper_name=None):
    """Generate the wrapper for a single cell.

    The wrapper has no iteration loop but still extracts cellwise data; the
    cell is expected as an argument to the wrapper.  For extruded, the
    numbering of the cells is columnwise continuous, bottom to top.

    :param iterset: The iteration set
    :param args: :class:`Arg`s
    :param forward_args: iterable of strings giving the C types of arguments
                         that the wrapper forwards unprocessed to the kernel.
    :param kernel_name: Kernel function name
    :param wrapper_name: Wrapper function name

    :return: string containing the C code for the single-cell wrapper
    """
    from pyop2.codegen.builder import WrapperBuilder
    from pyop2.codegen.rep2loopy import generate
    from loopy.types import OpaqueType

    # Each forwarded C type string is wrapped in a loopy OpaqueType.
    builder = WrapperBuilder(
        iterset=iterset,
        single_cell=True,
        forward_arg_types=[OpaqueType(c_type) for c_type in forward_args])

    for wrapper_arg in args:
        builder.add_argument(wrapper_arg)

    # The kernel is registered with an empty code string and the given name.
    builder.set_kernel(Kernel("", kernel_name))

    generated = loopy.generate_code_v2(generate(builder, wrapper_name))
    return generated.device_code()
コード例 #22
0
def build_loopy_kernel_A_text():
    """Generate the C source and header text of the ``kernel_tensor_A`` kernel.

    :return: tuple ``(knl_name, knl_call, knl_c, knl_h)``: the kernel name, a
        C statement calling it, the generated C code and the generated header,
        both with ``__restrict__`` replaced by ``restrict``.
    """
    knl_name = "kernel_tensor_A"

    instructions = """
            A[i,j] = c*sum(k, B[k,i]*B[k,j])
        """
    knl = lp.make_kernel(
        "{ [i,j,k]: 0<=i,j<n and 0<=k<m }",
        instructions,
        name=knl_name,
        assumptions="n >= 1 and m >= 1",
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
        target=lp.CTarget())

    double = np.dtype(np.double)
    knl = lp.add_and_infer_dtypes(knl, {"A": double, "B": double, "c": double})
    knl = lp.fix_parameters(knl, n=3, m=2)
    knl = lp.prioritize_loops(knl, "i,j")

    knl_c = lp.generate_code_v2(knl).device_code()
    knl_h = str(lp.generate_header(knl)[0])

    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(knl_c, replacements)
    knl_h = utils.replace_strings(knl_h, replacements)

    knl_call = "kernel_tensor_A(A, &B[0][0], 1.0/(2.0*Ae));"

    return knl_name, knl_call, knl_c, knl_h
コード例 #23
0
def build_loopy_kernel_b_text():
    """Generate the C source and header text of the ``kernel_tensor_b`` kernel.

    :return: tuple ``(knl_name, knl_call, knl_c, knl_h)``: the kernel name, a
        C statement calling it, the generated C code and the generated header,
        both with ``__restrict__`` replaced by ``restrict``.
    """
    knl_name = "kernel_tensor_b"

    instructions = """
            b[i] = c
        """
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        instructions,
        name="kernel_tensor_b",
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
        target=lp.CTarget())

    double = np.dtype(np.double)
    knl = lp.add_and_infer_dtypes(knl, {"b": double, "c": double})
    knl = lp.fix_parameters(knl, n=3)

    knl_c = lp.generate_code_v2(knl).device_code()
    knl_h = str(lp.generate_header(knl)[0])

    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(knl_c, replacements)
    knl_h = utils.replace_strings(knl_h, replacements)

    knl_call = "kernel_tensor_b(b, Ae / 6.0);"

    return knl_name, knl_call, knl_c, knl_h
コード例 #24
0
def test_generate_c_snippet():
    """Build a two-reduction kernel for ``CTarget`` and print the result.

    The instructions are constructed from pymbolic variables instead of being
    parsed from text.  ``k`` is split by 4 with its inner part tagged "unr".
    """
    from functools import partial
    from pymbolic import var

    I = var("I")  # noqa
    f, df = var("f"), var("df")
    q_v = var("q_v")
    eN = var("eN")  # noqa
    k, u = var("k"), var("u")

    # Sum reduction that permits simultaneous use of its iname.
    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    Instr = lp.Assignment  # noqa

    instructions = [
        Instr(f[I], l_sum(k, q_v[k, I] * u)),
        Instr(df[I], l_sum(k, q_v[k, I])),
    ]
    kernel_data = [
        lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
        lp.GlobalArg("f,df", np.float64, shape="nSpace"),
        lp.ValueArg("u", np.float64),
        "...",
    ]
    knl = lp.make_kernel(
        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        instructions,
        kernel_data,
        target=CTarget(),
        assumptions="nQuad>=1")

    if 0:  # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.prioritize_loops(knl, "I,k_outer,k_inner")

    codegen_result = lp.generate_code_v2(knl)
    print(codegen_result)
コード例 #25
0
    def test_solve_callable(self, zero_vec, solve_mat, solve_vec):
        """Check the ``solve`` callable against ``np.linalg.solve``.

        A loopy kernel computing ``x[:] = solve(A[:,:], b[:])`` is generated
        for the C target, wrapped in an ``op2.Kernel`` linked with
        ``-llapack`` and executed with ``op2.par_loop``.  The data of
        ``zero_vec`` must then be close to
        ``np.linalg.solve(solve_mat.data, solve_vec.data)``.

        :param zero_vec: fixture passed as the kernel's first argument (``x``)
        :param solve_mat: fixture passed as the kernel's second argument (``A``)
        :param solve_vec: fixture passed as the kernel's third argument (``b``)
        """
        loopy.set_caching_enabled(False)

        k = loopy.make_kernel(
            ["{[i,j] : 0 <= i,j < 2}"],
            """
            x[:] = solve(A[:,:], b[:])
            """, [
                loopy.GlobalArg('x', dtype=np.float64, shape=(2, )),
                loopy.GlobalArg('A', dtype=np.float64, shape=(2, 2)),
                loopy.GlobalArg(
                    'b',
                    dtype=np.float64,
                    shape=(2, ),
                )
            ],
            target=loopy.CTarget(),
            name="callable_kernel2",
            lang_version=(2018, 2))

        k = loopy.register_function_id_to_in_knl_callable_mapper(
            k, solve_fn_lookup)
        code = loopy.generate_code_v2(k).device_code()
        # str.replace returns a new string and leaves ``code`` untouched, so
        # the result has to be assigned for the kernel to become static.
        # Skip the rewrite if the generated code is already static, to avoid
        # emitting "static static void".
        if 'static void callable_kernel2' not in code:
            code = code.replace('void callable_kernel2',
                                'static void callable_kernel2')
        loopykernel = op2.Kernel(code, k.name, ldargs=["-llapack"])
        args = [zero_vec(op2.READ), solve_mat(op2.READ), solve_vec(op2.WRITE)]

        op2.par_loop(loopykernel, solve_mat.dataset.set, *args)
        expected = np.linalg.solve(solve_mat.data, solve_vec.data)
        assert np.allclose(expected, zero_vec.data)
コード例 #26
0
    def test_inverse_callable(self, zero_mat, inv_mat):
        """Check the ``inv`` callable against ``np.linalg.inv``.

        A loopy kernel computing ``B[:,:] = inv(A[:,:])`` is generated for the
        C target, wrapped in an ``op2.Kernel`` linked with ``-llapack`` and
        executed with ``op2.par_loop``.  The data of ``zero_mat`` must then be
        close to ``np.linalg.inv(inv_mat.data)``.

        :param zero_mat: fixture passed with ``op2.WRITE`` access (``B``)
        :param inv_mat: fixture passed with ``op2.READ`` access (``A``)
        """
        loopy.set_caching_enabled(False)

        k = loopy.make_kernel(
            ["{[i,j] : 0 <= i,j < 2}"],
            """
            B[:,:] = inv(A[:,:])
            """, [
                loopy.GlobalArg('B', dtype=np.float64, shape=(2, 2)),
                loopy.GlobalArg('A', dtype=np.float64, shape=(2, 2))
            ],
            target=loopy.CTarget(),
            name="callable_kernel",
            lang_version=(2018, 2))

        k = loopy.register_function_id_to_in_knl_callable_mapper(
            k, inv_fn_lookup)
        code = loopy.generate_code_v2(k).device_code()
        # str.replace returns a new string and leaves ``code`` untouched, so
        # the result has to be assigned for the kernel to become static.
        # Skip the rewrite if the generated code is already static, to avoid
        # emitting "static static void".
        if 'static void callable_kernel' not in code:
            code = code.replace('void callable_kernel',
                                'static void callable_kernel')

        loopykernel = op2.Kernel(code, k.name, ldargs=["-llapack"])

        op2.par_loop(loopykernel, zero_mat.dataset.set, zero_mat(op2.WRITE),
                     inv_mat(op2.READ))
        expected = np.linalg.inv(inv_mat.data)
        assert np.allclose(expected, zero_mat.data)
コード例 #27
0
def loopy_example():
    """Print the C device code loopy generates for ``out[i] = 2*a[i]``."""
    doubling = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
        target=lp.CTarget())

    # Only the dtype of "a" is given explicitly.
    doubling = lp.add_and_infer_dtypes(doubling, {"a": np.dtype(np.float32)})

    device_code = lp.generate_code_v2(doubling).device_code()
    print(device_code)
コード例 #28
0
ファイル: test_target.py プロジェクト: tj-sun/loopy
def test_numba_target():
    """Print the device code generated for a distance kernel on ``NumbaTarget``."""
    domain = "{[i,j,k]: 0<=i,j<M and 0<=k<N}"
    instruction = "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))"
    knl = lp.make_kernel(domain, instruction, target=lp.NumbaTarget())

    # Only the dtype of "X" is given explicitly.
    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})

    device_code = lp.generate_code_v2(knl).device_code()
    print(device_code)
コード例 #29
0
ファイル: loopy_utils.py プロジェクト: zjuchenll/polymath
def make_domain(prog):
    """Print the implemented domains of the code generated for *prog*.

    For every entry of the code generation result's ``implemented_domains``
    mapping, the first element's ``to_str()`` output is printed, followed by
    its ``dir()`` listing.  Nothing is returned.
    """
    codegen_result = lp.generate_code_v2(prog)

    for _key, domains in codegen_result.implemented_domains.items():
        first_domain = domains[0]
        print(first_domain.to_str())
        print(dir(first_domain))
コード例 #30
0
def write_ptx(ctx, knl, filename=None):
    """Build *knl* as an OpenCL program and write its first binary to a file.

    :param ctx: context passed to ``cl.Program``
    :param knl: loopy kernel; its generated device code is compiled with
        ``knl.options.cl_build_options``
    :param filename: output path; when empty or ``None`` it defaults to
        ``"ptx_" + knl.name + ".ptx"``
    """
    device_code = lp.generate_code_v2(knl).device_code()
    cl_program = cl.Program(ctx, device_code).build(
        options=knl.options.cl_build_options)

    # NOTE(review): presumably the first binary is PTX text (NVIDIA OpenCL)
    # -- confirm for other platforms.
    ptx_src = cl_program.binaries[0]
    if not filename:
        filename = "ptx_" + knl.name + ".ptx"

    # The context manager closes the file even if the write fails; the
    # original never closed the handle.  The explicit encoding matches the
    # utf-8 decode and avoids locale-dependent encoding errors.
    with open(filename, 'w', encoding='utf-8') as ptx_src_file:
        ptx_src_file.write(ptx_src.decode('utf-8', 'ignore'))
コード例 #31
0
ファイル: test_target.py プロジェクト: inducer/loopy
def test_numba_target():
    """Generate and print ``NumbaTarget`` device code for a distance kernel."""
    distance_knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
        "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
        target=lp.NumbaTarget(),
    )

    # Only the dtype of "X" is given explicitly.
    known_dtypes = {"X": np.float32}
    distance_knl = lp.add_and_infer_dtypes(distance_knl, known_dtypes)

    codegen_result = lp.generate_code_v2(distance_knl)
    print(codegen_result.device_code())
コード例 #32
0
ファイル: test_c_execution.py プロジェクト: inducer/loopy
def test_missing_compilers():
    """Exercise C code generation and execution with missing or bad compilers.

    In order: device code is generated for a plain ``CTarget``; execution with
    an empty ``PATH`` must raise ``ExecError``; execution is repeated once
    ``PATH`` is restored; deleting ``toolchain.undefines`` must raise
    ``AttributeError``; a made-up compiler name ``'foo'`` is accepted by
    ``CCompiler`` but must raise ``ExecError`` when used.
    """
    from loopy.target.c import ExecutableCTarget, CTarget
    from loopy.target.c.c_execution import CCompiler
    from codepy.toolchain import GCCToolchain

    def run_with_target(evalfunc, target, **targetargs):
        # Build the copy kernel for target(**targetargs), pass it to evalfunc.
        n = 10
        kernel_args = [
            lp.GlobalArg('a', shape=(n,), dtype=np.int32),
            lp.GlobalArg('b', shape=(n,), dtype=np.int32),
        ]
        knl = lp.make_kernel(
            '{[i]: 0 <= i < n}',
            """
                a[i] = b[i]
            """,
            kernel_args,
            target=target(**targetargs))

        knl = lp.fix_parameters(knl, n=n)
        return evalfunc(knl)

    assert run_with_target(
        lambda knl: lp.generate_code_v2(knl).device_code(), CTarget)

    from pytools.prefork import ExecError

    def executes_correctly(knl):
        # True when running the kernel copies b = arange(10) into a.
        zeros = np.zeros(10, dtype=np.int32)
        source = np.arange(10, dtype=np.int32)
        return np.allclose(knl(a=zeros, b=source)[1], np.arange(10))

    import os
    original_path = os.environ["PATH"]
    ccomp = None
    try:
        # test with path wiped out such that we can't find gcc
        with pytest.raises(ExecError):
            os.environ["PATH"] = ''
            ccomp = CCompiler()
            run_with_target(
                executes_correctly, ExecutableCTarget, compiler=ccomp)
    finally:
        # make sure we restore the path
        os.environ["PATH"] = original_path
        # and, with the path restored we should now be able to properly
        # execute with the default (non-guessed) toolchain!
        run_with_target(executes_correctly, ExecutableCTarget, compiler=ccomp)

    # and test that we will fail if we remove a required attribute
    del ccomp.toolchain.undefines
    with pytest.raises(AttributeError):
        run_with_target(executes_correctly, ExecutableCTarget, compiler=ccomp)

    # next test that some made up compiler can be specified
    ccomp = CCompiler(cc='foo')
    assert isinstance(ccomp.toolchain, GCCToolchain)
    assert ccomp.toolchain.cc == 'foo'

    # and that said made up compiler errors out
    with pytest.raises(ExecError):
        run_with_target(executes_correctly, ExecutableCTarget, compiler=ccomp)
コード例 #33
0
ファイル: test_domain.py プロジェクト: connorjward/loopy
def test_triangle_domain(ctx_factory):
    """Generate code for a kernel over the triangular domain ``i <= j``.

    The kernel and its generated device code are printed; the test passes
    as long as kernel creation and code generation do not raise.
    """
    device = ctx_factory().devices[0]

    triangle_knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n and i <= j}",
        "a[i,j] = 17",
        assumptions="n>=1",
        target=lp.PyOpenCLTarget(device))

    print(triangle_knl)

    codegen_result = lp.generate_code_v2(triangle_knl)
    print(codegen_result.device_code())
コード例 #34
0
ファイル: test_target.py プロジェクト: sailfish009/loopy
def test_math_function(target, tp):
    """Check which ``min``/``max`` spellings appear in the generated code.

    For each of ``min`` and ``max`` a kernel ``x[i] = func(y[i], z[i])`` is
    built for *target* with the data type selected by *tp* (``"f32"`` or
    ``"f64"``).  The generated device code must contain ``fmin``/``fmax``;
    the ``f``-suffixed variants (``fminf``/``fmaxf``) must be present only
    for ``tp == "f32"`` combined with ``CTarget``.
    """
    data_type = {"f32": np.float32, "f64": np.float64}[tp]

    import pymbolic.primitives as p

    def subscript(array_name):
        # array_name[i] as a pymbolic expression
        return p.Subscript(p.Variable(array_name), p.Variable("i"))

    xi, yi, zi = subscript("x"), subscript("y"), subscript("z")

    n = 100
    domain = "{[i]: 0<=i<%d}" % n
    data = [lp.GlobalArg(name, data_type, shape=(n, ))
            for name in ("x", "y", "z")]

    # Same procedure for both functions: first "min", then "max".
    for func_name in ("min", "max"):
        inst = [lp.Assignment(xi, p.Variable(func_name)(yi, zi))]
        knl = lp.make_kernel(domain, inst, data, target=target())
        code = lp.generate_code_v2(knl).device_code()

        c_name = "f" + func_name          # "fmin" / "fmax"
        c_name_f32 = c_name + "f"         # "fminf" / "fmaxf"

        assert c_name in code

        if tp == "f32" and target == CTarget:
            assert c_name_f32 in code
        else:
            assert c_name_f32 not in code
コード例 #35
0
def test_lpy_iname_presplit(opts):
    """
    Checks that accesses through already-split inames into loopy arrays that
    are excluded from splitting are handled correctly.
    """
    from pymbolic.primitives import Subscript, Variable

    splitter = array_splitter(opts)

    # two arrays that will both be listed in `dont_split`
    arg1 = lp.GlobalArg('a1', shape=(20, 10), order=opts.order)
    arg2 = lp.GlobalArg('a2', shape=(16, 16), order=opts.order)

    # `i` plus a `j` that is already split by hand into outer / inner parts
    outer_extent = int(np.ceil(10 / VECTOR_WIDTH))
    domains = [
        '{[i]: 0 <= i < 10}',
        '{{[j_outer]: 0 <= j_outer < {}}}'.format(outer_extent),
        '{{[j_inner]: 0 <= j_inner < {}}}'.format(VECTOR_WIDTH),
    ]
    insns = """
            a1[j_outer, i] = 1 {id=a1}
            a2[j_outer, i] = 1 {id=a2}
        """

    knl = lp.make_kernel(
        domains, insns, [arg1, arg2],
        silenced_warnings=['no_device_in_pre_codegen_checks'],
        target=lp.OpenCLTarget())

    knl = splitter.split_loopy_arrays(knl, dont_split=['a1', 'a2'])

    # code generation must go through without loopy errors
    lp.generate_code_v2(knl).device_code()

    # index expected on the assignee of both instructions
    expected_index = (
        Variable('j_outer') * VECTOR_WIDTH + Variable('j_inner'),
        Variable('i'))

    # 'a1' first, then the evenly sized 'a2'
    for insn_id in ('a1', 'a2'):
        assignee = next(
            insn.assignee for insn in knl.instructions if insn.id == insn_id)
        assert isinstance(assignee, Subscript) \
            and assignee.index == expected_index
コード例 #36
0
ファイル: test_target.py プロジェクト: inducer/loopy
def test_math_function(target, tp):
    """Verify the math function names emitted for ``min`` and ``max``.

    *tp* selects the data type (``"f32"`` or ``"f64"``) and *target* is the
    target class to instantiate.  ``fmin``/``fmax`` must occur in the device
    code; ``fminf``/``fmaxf`` must occur exactly when ``tp == "f32"`` and
    ``target == CTarget``.
    """
    data_type = {"f32": np.float32,
                 "f64": np.float64}[tp]

    import pymbolic.primitives as p

    index = p.Variable("i")
    lhs = p.Subscript(p.Variable("x"), index)
    arg_a = p.Subscript(p.Variable("y"), index)
    arg_b = p.Subscript(p.Variable("z"), index)

    n = 100
    domain = "{[i]: 0<=i<%d}" % n
    data = [lp.GlobalArg("x", data_type, shape=(n,)),
            lp.GlobalArg("y", data_type, shape=(n,)),
            lp.GlobalArg("z", data_type, shape=(n,))]

    def device_code_for(func_name):
        # Build x[i] = func_name(y[i], z[i]) and return its device code.
        inst = [lp.Assignment(lhs, p.Variable(func_name)(arg_a, arg_b))]
        knl = lp.make_kernel(domain, inst, data, target=target())
        return lp.generate_code_v2(knl).device_code()

    # (loopy function, expected name, single-precision name)
    cases = (
        ("min", "fmin", "fminf"),
        ("max", "fmax", "fmaxf"),
    )

    for func_name, plain_name, suffixed_name in cases:
        code = device_code_for(func_name)

        assert plain_name in code

        if tp == "f32" and target == CTarget:
            assert suffixed_name in code
        else:
            assert suffixed_name not in code
コード例 #37
0
def test_rob_stroud_bernstein(ctx_factory):
    """Build, transform and print code for a triply nested accumulation kernel.

    For every ``(el, i2)`` pair the kernel walks ``alpha1`` and ``alpha2``,
    adding ``w * coeffs[aind]`` to ``tmp[el, alpha1, i2]`` while updating the
    weight ``w`` and the running index ``aind``.  The test only prints the
    result of code generation; it makes no assertions about the output.

    :arg ctx_factory: callable returning a context whose first device is
        used for the ``PyOpenCLTarget``.
    """
    ctx = ctx_factory()

    # NOTE: tmp would have to be zero-filled beforehand
    # (the kernel accumulates into tmp rather than initializing it)

    knl = lp.make_kernel(
            "{[el, i2, alpha1,alpha2]: \
                    0 <= el < nels and \
                    0 <= i2 < nqp1d and \
                    0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }",
            """
            for el,i2
                <> xi = qpts[1, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init}

                for alpha1
                    <> w = s**(deg-alpha1) {id=init_w}

                    for alpha2
                        tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \
                                {id=write_tmp,dep=init_w:aind_init}
                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                                {id=update_w,dep=init_w:write_tmp}
                        aind = aind + 1 \
                                {id=aind_incr,dep=aind_init:write_tmp:update_w}
                    end
                end
            end
            """,
            [
                # Must declare coeffs to have "no" shape, to keep loopy
                # from trying to figure out the shape automatically.

                lp.GlobalArg("coeffs", None, shape=None),
                "..."
                ],
            assumptions="deg>=0 and nels>=1",
            target=lp.PyOpenCLTarget(ctx.devices[0])
            )

    # Fix the quadrature point count and the degree to concrete values.
    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)
    # Split `el` twice: first by 16 (inner part tagged "l.0"), then the
    # resulting outer part by 2 (outer tagged "g.0", inner tagged "ilp").
    knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
    knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
            slabs=(0, 1))
    # Tag the remaining inames: i2 as "l.1", both alpha loops as "unr".
    knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))
    # All three arrays are single precision.
    knl = lp.add_dtypes(knl, dict(
                qpts=np.float32,
                coeffs=np.float32,
                tmp=np.float32,
                ))
    print(lp.generate_code_v2(knl))
コード例 #38
0
 def create_rand_args(ctx, knl, param_dict):
     """Return the arguments ``lp.auto_test.make_args`` builds for *knl*.

     A command queue is created on *ctx*, and the implemented data info of
     the generated code is used to first build reference arguments.  Only
     the argument data from that step is passed on to ``make_args``; the
     reference arguments themselves and the argument data are emptied and
     dropped as soon as they are no longer needed.
     """
     cmd_queue = cl.CommandQueue(ctx)
     impl_info = lp.generate_code_v2(knl).implemented_data_info

     ref_args, ref_arg_data = lp.auto_test.make_ref_args(
         knl, impl_info, cmd_queue, param_dict)

     # The reference arguments are not used; empty and release them now.
     ref_args.clear()
     del ref_args

     result = lp.auto_test.make_args(
         knl, impl_info, cmd_queue, ref_arg_data, param_dict)

     # Likewise release the argument data once the result is built.
     del ref_arg_data[:]
     del ref_arg_data

     return result
コード例 #39
0
ファイル: test_scan.py プロジェクト: inducer/loopy
def test_automatic_scan_detection():
    """Assert that the word ``scan`` does not appear in the device code.

    The kernel sums ``j**2`` over a ``j`` range whose upper bound
    (``2*i``) depends on the outer iname ``i``.
    """
    domains = [
        "[n] -> {[i]: 0<=i<n}",
        "{[j]: 0<=j<=2*i}",
    ]
    insns = """
        a[i] = sum(j, j**2)
        """

    knl = lp.make_kernel(domains, insns)

    device_code = lp.generate_code_v2(knl).device_code()
    assert "scan" not in device_code
コード例 #40
0
ファイル: test_target.py プロジェクト: inducer/loopy
def test_cuda_short_vector():
    """Print CUDA device code for ``out[i] = 2*a[i]`` with "vec"-tagged axes.

    The iname ``i`` and axis 0 of both arrays are split by 4, and the new
    inner iname / array axis are tagged ``vec``.  No assertions are made on
    the generated code.
    """
    vec_width = 4
    arrays = "a,out"

    kernel = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        target=lp.CudaTarget())

    kernel = lp.set_options(kernel, write_code=True)

    # Split the loop and the arrays consistently, then tag the array axes.
    kernel = lp.split_iname(
        kernel, "i", vec_width, slabs=(0, 1), inner_tag="vec")
    kernel = lp.split_array_axis(kernel, arrays, axis_nr=0, count=vec_width)
    kernel = lp.tag_array_axes(kernel, arrays, "C,vec")

    kernel = lp.set_options(kernel, write_wrapper=True)
    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.float32})

    codegen_result = lp.generate_code_v2(kernel)
    print(codegen_result.device_code())
コード例 #41
0
ファイル: test_target.py プロジェクト: inducer/loopy
def test_numba_cuda_target():
    """Print all code generated for a distance kernel on ``NumbaCudaTarget``.

    ``D[i,j]`` is the square root of the sum over ``k`` of the squared
    differences between ``X[i, k]`` and ``X[j, k]``.  The kernel goes
    through a fixed sequence of transformations before code generation; no
    assertions are made on the output.
    """
    dist_knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
        "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
        target=lp.NumbaCudaTarget())
    dist_knl = lp.assume(dist_knl, "M>0")

    # Split both output inames and tag the resulting pieces.
    dist_knl = lp.split_iname(dist_knl, "i", 16, outer_tag="g.0")
    dist_knl = lp.split_iname(
        dist_knl, "j", 128, inner_tag="l.0", slabs=(0, 1))

    # Prefetch a row of X, then fix the reduction length.
    dist_knl = lp.add_prefetch(dist_knl, "X[i,:]", default_tag="l.auto")
    dist_knl = lp.fix_parameters(dist_knl, N=3)

    # Loop ordering and remaining tags.
    dist_knl = lp.prioritize_loops(dist_knl, "i_inner,j_outer")
    dist_knl = lp.tag_inames(dist_knl, "k:unr")
    dist_knl = lp.tag_array_axes(dist_knl, "X", "N0,N1")

    dist_knl = lp.add_and_infer_dtypes(dist_knl, {"X": np.float32})

    codegen_result = lp.generate_code_v2(dist_knl)
    print(codegen_result.all_code())
コード例 #42
0
ファイル: sequential.py プロジェクト: OP2/PyOP2
    def code_to_compile(self):
        """Return the source text generated for this object's wrapper.

        A ``WrapperBuilder`` is populated with the iteration set, iteration
        region, arguments and kernel held on ``self``; the wrapper produced
        from it is then run through loopy code generation.

        If the kernel is not flagged as C++ (``self._kernel._cpp``), the
        device code is returned unchanged.  Otherwise the processed device
        preambles are emitted first and the device programs are wrapped in
        an ``extern "C"`` block.
        """
        from pyop2.codegen.builder import WrapperBuilder
        from pyop2.codegen.rep2loopy import generate

        builder = WrapperBuilder(
            iterset=self._iterset,
            iteration_region=self._iteration_region,
            pass_layer_to_kernel=self._pass_layer_arg)
        for wrapper_arg in self._args:
            builder.add_argument(wrapper_arg)
        builder.set_kernel(self._kernel)

        code = loopy.generate_code_v2(generate(builder))

        if not self._kernel._cpp:
            return code.device_code()

        # C++ path: preambles stay outside the extern "C" block.
        from loopy.codegen.result import process_preambles

        preambles = getattr(code, "device_preambles", [])
        preamble_text = "".join(process_preambles(preambles))
        programs_text = "\n\n".join(
            str(program.ast) for program in code.device_programs)

        return preamble_text + '\nextern "C" {\n' + programs_text + "\n}\n"
コード例 #43
0
ファイル: test_reduction.py プロジェクト: inducer/loopy
def test_reduction_with_conditional():
    """Check that the conditional is emitted before the reduction loop.

    The 'l' iname exists to force the whole kernel, predicate included,
    into device code.
    """
    insns = """
                if l > 0
                    b[l] = sum(i, l*a[i])
                end
                """
    kernel_data = [lp.ValueArg("n", dtype=np.int32), "..."]

    knl = lp.make_kernel("{ [l,i] : 0<=l,i<42 }", insns, kernel_data)
    knl = lp.tag_inames(knl, "l:g.0")
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    code = lp.generate_code_v2(knl).device_code()
    print(code)

    # The first "if" must come before the first "for", i.e. before the loop
    # that realizes the reduction.
    if_position = code.index("if")
    for_position = code.index("for")
    assert if_position < for_position
コード例 #44
0
ファイル: test_target.py プロジェクト: cmsquared/loopy
def test_ispc_target(occa_mode=False):
    """Print device and host code generated for ``ISPCTarget``.

    :arg occa_mode: forwarded unchanged to the ``ISPCTarget`` constructor.

    The kernel ``out[i] = 2*a[i]`` is split twice, given a prefetch of
    ``a``, then preprocessed and scheduled explicitly before code
    generation.  No assertions are made on the output.
    """
    from loopy.target.ispc import ISPCTarget

    kernel_data = [
        lp.GlobalArg("out,a", np.float32, shape=lp.auto),
        "...",
    ]
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        kernel_data,
        target=ISPCTarget(occa_mode=occa_mode))

    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])

    # preprocess -> schedule -> generate, one step at a time
    preprocessed = lp.preprocess_kernel(knl)
    scheduled = lp.get_one_scheduled_kernel(preprocessed)
    codegen_result = lp.generate_code_v2(scheduled)

    print(codegen_result.device_code())
    print(codegen_result.host_code())
コード例 #45
0
ファイル: test_c_execution.py プロジェクト: inducer/loopy
def test_c_execution_with_global_temporaries():
    """Run a bare ``ExecutableCTarget`` kernel reading a global temporary.

    ``b`` is a read-only temporary in global scope, initialized with
    ``arange(n)``.  The test asserts that the declaration ``int b[n]`` does
    not show up in the generated host code, and that executing the kernel
    copies the initializer into ``a``.
    """
    from loopy.target.c import ExecutableCTarget
    from loopy.kernel.data import temp_var_scope as scopes

    n = 10

    output_arg = lp.GlobalArg('a', shape=(n,), dtype=np.int32)
    constant_temp = lp.TemporaryVariable(
        'b',
        shape=(n,),
        initializer=np.arange(n, dtype=np.int32),
        dtype=np.int32,
        read_only=True,
        scope=scopes.GLOBAL)

    knl = lp.make_kernel(
        '{[i]: 0 <= i < n}',
        """
            a[i] = b[i]
        """,
        [output_arg, constant_temp],
        target=ExecutableCTarget())
    knl = lp.fix_parameters(knl, n=n)

    host_code = lp.generate_code_v2(knl).host_code()
    assert ('int b[%d]' % n) not in host_code

    # Element 1 of the call result is compared against 0..9.
    outputs = knl(a=np.zeros(10, dtype=np.int32))[1]
    assert np.allclose(outputs, np.arange(10))
コード例 #46
0
def gen_code(knl):
    """Return the device code and host code of *knl*, separated by a newline.

    The kernel is preprocessed and scheduled explicitly before code
    generation.
    """
    scheduled = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
    result = lp.generate_code_v2(scheduled)

    device_part = result.device_code()
    host_part = result.host_code()
    return device_part + "\n" + host_part
コード例 #47
0
ファイル: hello-loopy.py プロジェクト: inducer/loopy
import loopy as lp
import pyopencl as cl
import pyopencl.array
from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2

# setup
# -----
# Create an OpenCL context and a command queue on it.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Input: a pyopencl array built with arange over n float32 entries.
n = 15 * 10**6
a = cl.array.arange(queue, n, dtype=np.float32)

# create
# ------
# One instruction over the iname i: every output entry is twice the input.
knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]")

# transform
# ---------
# Split i into chunks of 128; the outer part is tagged "g.0" and the inner
# part "l.0".
knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

# execute
# -------
# Calling the kernel yields an event and a one-element tuple of outputs.
evt, (out,) = knl(queue, a=a)
# ENDEXAMPLE

# Outside the example proper: give `a` an explicit dtype, then print the
# generated device code.
knl = lp.add_and_infer_dtypes(knl, {"a": np.dtype(np.float32)})
print(lp.generate_code_v2(knl).device_code())
コード例 #48
0
# create
# Two assignments inside the i/k loops, each preceded by a `... gbarrier`
# line; seq_dependencies=True is passed so instructions depend on their
# predecessors in program order.
knl = lp.make_kernel(
        "{ [i,k]: 0<=i<n and 0<=k<3 }",
        """
        for i, k
            ... gbarrier
            c[k,i] = a[k, i + 1]
            ... gbarrier
            out[k,i] = c[k,i]
        end
        """, seq_dependencies=True)

# transform
# Split i into chunks of 128 (outer tagged "g.0", inner tagged "l.0") and
# supply dtypes for all arrays and for n.
knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
knl = lp.add_and_infer_dtypes(knl,
        {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})

# schedule
# Preprocessing and scheduling are run explicitly so the scheduled kernel
# can be printed before code generation.
from loopy.preprocess import preprocess_kernel
knl = preprocess_kernel(knl)

from loopy.schedule import get_one_scheduled_kernel
knl = get_one_scheduled_kernel(knl)

# map schedule onto host or device
print(knl)

# Generate code once and print the device and host parts separately.
cgr = lp.generate_code_v2(knl)

print(cgr.device_code())
print(cgr.host_code())