Ejemplo n.º 1
0
        def get_2d_knl(context, dtype):
            """Build and compile the kernel that fills the 2D ``x``/``y`` arrays.

            For every ``(i, j)`` with ``0 <= i, j < n`` the instructions scale
            the indices by ``4/(n-1)``, combine them with the sine and cosine
            of the fixed angle 0.3 and subtract 2 from both results.

            *context* is handed to ``lp.CompiledKernel``; *dtype* is the
            element type of the ``x`` and ``y`` arguments.
            """
            instructions = """
                    <> xx = 4*i/(n-1)
                    <> yy = 4*j/(n-1)
                    <float64> angle = 0.3
                    <> s = sin(angle)
                    <> c = cos(angle)
                    x[i,j] = c*xx + s*yy - 2
                    y[i,j] = -s*xx + c*yy - 2
                    """
            kernel_data = [
                lp.GlobalArg("x,y", dtype, shape=lp.auto),
                lp.ValueArg("n", np.int32),
            ]
            kernel = lp.make_kernel(
                "{[i,j]: 0<=i,j<n}", instructions, kernel_data,
                assumptions="n>0")

            # Split both inames into chunks of 16: (iname, outer tag, inner tag).
            split_plan = (
                ("i", "g.1", "l.1"),
                ("j", "g.0", "l.0"),
            )
            for iname, outer, inner in split_plan:
                kernel = lp.split_iname(
                    kernel, iname, 16, outer_tag=outer, inner_tag=inner)

            return lp.CompiledKernel(context, kernel)
Ejemplo n.º 2
0
def test_recursive_nested_dependent_reduction(ctx_factory):
    """Print generated code for a reduction that consumes another reduction.

    ``boxsum`` is a sum over ``isrc`` whose upper bound ``npart`` is read
    from ``nparticles_per_box`` inside the ``isrc_box`` loop; ``a[itgt]``
    then sums ``boxsum`` over ``isrc_box``.  The kernel is only compiled
    and its code printed; nothing is executed or compared.
    """
    int_dtype = np.dtype(np.int32)
    context = ctx_factory()

    domains = [
        "{[itgt]: 0 <= itgt < ntgts}",
        "{[isrc_box]: 0 <= isrc_box < nboxes}",
        "{[isrc]: 0 <= isrc < npart}",
    ]
    instructions = """
            for itgt
                for isrc_box
                    <> npart = nparticles_per_box[isrc_box]
                    <> boxsum = sum(isrc, isrc+isrc_box+itgt)
                end
                a[itgt] = sum(isrc_box, boxsum)
            end
            """
    kernel_data = [
        lp.ValueArg("n", np.int32),
        lp.GlobalArg("a", int_dtype, ("n",)),
        lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes",)),
        lp.ValueArg("ntgts", np.int32),
        lp.ValueArg("nboxes", np.int32),
    ]

    kernel = lp.make_kernel(
        domains, instructions, kernel_data, assumptions="ntgts>=1")

    compiled = lp.CompiledKernel(context, kernel)
    print(compiled.get_code())
Ejemplo n.º 3
0
def test_offsets_and_slicing(ctx_factory):
    """Run ``b[i,j] = 2*a[i,j]`` on sliced views of two larger device arrays.

    Two 20x20 random float64 arrays are created on the device.  The kernel
    is invoked on a window of each, and the complete ``b`` array is then
    compared against a host-side reference in which only the destination
    window was overwritten.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    size = 20

    kernel = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<m }",
        """
                b[i,j] = 2*a[i,j]
                """,
        assumptions="n>=1 and m>=1",
        default_offset=lp.auto)
    kernel = lp.tag_data_axes(kernel, "a,b", "stride:auto,stride:1")
    compiled = lp.CompiledKernel(context, kernel)

    # Full device arrays plus host copies used to build the reference.
    a_dev = cl.clrandom.rand(queue, (size, size), np.float64)
    a_host = a_dev.get()
    b_dev = cl.clrandom.rand(queue, (size, size), np.float64)
    b_host = b_dev.get()

    # The destination window has the same extent as the source window but
    # is shifted by 3 rows and 4 columns.
    src_window = (slice(3, 10), slice(5, 10))
    dst_window = (slice(6, 13), slice(9, 14))
    a_view = a_dev[src_window]
    b_view = b_dev[dst_window]

    b_host[dst_window] = 2 * a_host[src_window]

    print(compiled.get_highlighted_code({"a": a_view.dtype}))
    compiled(queue, a=a_view, b=b_view)

    mismatch = np.linalg.norm(b_dev.get() - b_host)
    assert mismatch < 1e-13
Ejemplo n.º 4
0
        def get_3d_knl(context, dtype):
            """Build and compile the kernel that fills ``x``, ``y`` and ``z``.

            For ``0 <= i, j < n`` the instructions derive two angles,
            ``phi = 2*M_PI/n * i`` and ``theta = 2*M_PI/n * j``, and write
            ``5*cos(phi)*(3 + cos(theta))``, ``5*sin(phi)*(3 + cos(theta))``
            and ``5*sin(theta)`` to the three outputs.

            *context* is handed to ``lp.CompiledKernel``; *dtype* is the
            element type of the output arguments.
            """
            instructions = """
                    <> phi = 2*M_PI/n * i
                    <> theta = 2*M_PI/n * j
                    x[i,j] = 5*cos(phi) * (3 + cos(theta))
                    y[i,j] = 5*sin(phi) * (3 + cos(theta))
                    z[i,j] = 5*sin(theta)
                    """
            kernel_data = [
                # NOTE(review): the name list ends in a trailing comma --
                # confirm that the empty last entry is ignored as intended.
                lp.GlobalArg("x,y,z,", dtype, shape=lp.auto),
                lp.ValueArg("n", np.int32),
            ]
            kernel = lp.make_kernel(
                "{[i,j]: 0<=i,j<n}", instructions, kernel_data)

            # Split both inames into chunks of 16: (iname, outer tag, inner tag).
            split_plan = (
                ("i", "g.1", "l.1"),
                ("j", "g.0", "l.0"),
            )
            for iname, outer, inner in split_plan:
                kernel = lp.split_iname(
                    kernel, iname, 16, outer_tag=outer, inner_tag=inner)

            return lp.CompiledKernel(context, kernel)
Ejemplo n.º 5
0
def test_dependent_loop_bounds_2(ctx_factory):
    """Print highlighted code for a row sum whose length is read per row.

    ``row_start`` and ``row_len`` are temporaries computed from
    ``a_rowstarts``; ``row_len`` bounds the ``jj`` domain of the sum.  The
    ``i`` iname is split by 128 before compilation.  Nothing is executed.
    """
    value_dtype = np.dtype(np.float32)
    context = ctx_factory()

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    instructions = [
        "<> row_start = a_rowstarts[i]",
        "<> row_len = a_rowstarts[i+1] - row_start",
        "ax[i] = sum(jj, a_values[[row_start+jj]])",
    ]
    kernel_data = [
        lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto),
        lp.GlobalArg("a_indices", np.int32, shape=lp.auto),
        lp.GlobalArg("a_values", value_dtype, strides=(1, )),
        lp.GlobalArg("ax", value_dtype, shape=lp.auto),
        lp.ValueArg("n", np.int32),
    ]

    kernel = lp.make_kernel(
        domains, instructions, kernel_data,
        assumptions="n>=1 and row_len>=1")
    kernel = lp.split_iname(
        kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")

    compiled = lp.CompiledKernel(context, kernel)
    rule = "---------------------------------------------------"
    print(rule)
    print(compiled.get_highlighted_code())
    print(rule)
Ejemplo n.º 6
0
def test_dependent_domain_insn_iname_finding(ctx_factory):
    """Check that ``set_strength`` is placed inside the ``isrc_box`` iname.

    The bounds of the ``isrc`` domain (``isrc_start``, ``isrc_end``) are
    temporaries computed per ``isrc_box``.  The test asserts that the
    instruction reading ``strengths[isrc]`` reports ``isrc_box`` among its
    inames, then prints the highlighted code for explicit argument dtypes.
    """
    context = ctx_factory()

    domains = [
        "{[isrc_box]: 0<=isrc_box<nsrc_boxes}",
        "{[isrc,idim]: isrc_start<=isrc<isrc_end and 0<=idim<dim}",
    ]
    instructions = """
                <> src_ibox = source_boxes[isrc_box]
                <> isrc_start = box_source_starts[src_ibox]
                <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]
                <> strength = strengths[isrc] {id=set_strength}
                """
    kernel_data = [
        lp.GlobalArg(
            "box_source_starts,box_source_counts_nonchild", None, shape=None),
        lp.GlobalArg("strengths", None, shape="nsources"),
        "...",
    ]
    kernel = lp.make_kernel(domains, instructions, kernel_data)

    print(kernel)
    assert "isrc_box" in kernel.insn_inames("set_strength")

    arg_dtypes = {
        "source_boxes": np.int32,
        "box_source_starts": np.int32,
        "box_source_counts_nonchild": np.int32,
        "strengths": np.float64,
        "nsources": np.int32,
    }
    compiled = lp.CompiledKernel(context, kernel)
    print(compiled.get_highlighted_code(arg_dtypes))
Ejemplo n.º 7
0
def test_independent_multi_domain(ctx_factory):
    """Run two unrelated loops, ``a[i] = 1`` and ``b[j] = 2``, in one kernel.

    Both inames are split by 16 with the same ``g.0``/``l.0`` tags.  The test
    asserts that neither domain has a parent, executes the kernel with
    ``n = 50`` and checks shape and contents of both outputs.
    """
    value_dtype = np.dtype(np.float32)
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    domains = [
        "{[i]: 0<=i<n}",
        "{[j]: 0<=j<n}",
    ]
    instructions = [
        "a[i] = 1",
        "b[j] = 2",
    ]
    kernel_data = [
        lp.GlobalArg("a", value_dtype, shape=("n"), order="C"),
        lp.GlobalArg("b", value_dtype, shape=("n"), order="C"),
        lp.ValueArg("n", np.int32),
    ]
    kernel = lp.make_kernel(domains, instructions, kernel_data)

    for iname in ("i", "j"):
        kernel = lp.split_iname(
            kernel, iname, 16, outer_tag="g.0", inner_tag="l.0")
    assert kernel.parents_per_domain() == [None, None]

    size = 50
    compiled = lp.CompiledKernel(context, kernel)
    _, (a_out, b_out) = compiled(queue, n=size, out_host=True)

    assert a_out.shape == (50, )
    assert b_out.shape == (50, )
    assert (a_out == 1).all()
    assert (b_out == 2).all()
Ejemplo n.º 8
0
def test_multi_nested_dependent_reduction(ctx_factory):
    """Print generated code for a single reduction over two inames.

    ``a[itgt]`` sums the constant 1 over ``(isrc_box, isrc)``, where the
    bound ``npart`` of ``isrc`` is read from ``nparticles_per_box``.  The
    kernel is only compiled and its code printed.
    """
    int_dtype = np.dtype(np.int32)
    context = ctx_factory()

    domains = [
        "{[itgt]: 0 <= itgt < ntgts}",
        "{[isrc_box]: 0 <= isrc_box < nboxes}",
        "{[isrc]: 0 <= isrc < npart}",
    ]
    instructions = [
        "<> npart = nparticles_per_box[isrc_box]",
        "a[itgt] = sum((isrc_box, isrc), 1)",
    ]
    kernel_data = [
        lp.ValueArg("n", np.int32),
        lp.GlobalArg("a", int_dtype, ("n",)),
        lp.GlobalArg("nparticles_per_box", np.int32, ("nboxes",)),
        lp.ValueArg("ntgts", np.int32),
        lp.ValueArg("nboxes", np.int32),
    ]

    kernel = lp.make_kernel(
        domains, instructions, kernel_data, assumptions="ntgts>=1")

    compiled = lp.CompiledKernel(context, kernel)
    print(compiled.get_code())
Ejemplo n.º 9
0
    def time_kernel(
        self,
        knl,
        param_dict,
    ):
        """Return the average wall-clock time of executing *knl*.

        The kernel is compiled once and then run
        ``self.n_time_trials + self.n_warmup_time_trials`` times on randomly
        generated arguments (``create_rand_args``).  The queue is drained
        before and after every run so that each sample covers exactly one
        execution.  The first ``self.n_warmup_time_trials`` samples are
        discarded and the remaining ones averaged.

        :arg knl: the kernel to time.
        :arg param_dict: kernel parameters forwarded to ``create_rand_args``.
        :returns: the mean of the retained samples, in the timer's units
            (seconds).
        :raises ValueError: if *param_dict* is ``None``.
        """

        if param_dict is None:
            raise ValueError(
                "Wall time requires dictionary of kernel parameters.")

        # default_timer is the platform's benchmarking clock, unlike
        # time.time(), which follows the (adjustable) system clock.
        from timeit import default_timer

        import numpy as np

        ctx = self.get_cl_context(knl)
        queue = cl.CommandQueue(ctx)

        arg_arrays = create_rand_args(ctx, knl, param_dict)
        knl = lp.set_options(knl, no_numpy=True)
        compiled = lp.CompiledKernel(ctx, knl)

        n_warmup = self.n_warmup_time_trials
        wtimes = []
        for _ in range(self.n_time_trials + n_warmup):
            queue.finish()
            tstart = default_timer()
            compiled(queue, **arg_arrays)
            queue.finish()
            wtimes.append(default_timer() - tstart)

        return np.average(wtimes[n_warmup:])
Ejemplo n.º 10
0
def test_nested_dependent_reduction(ctx_factory):
    """Execute a sum whose upper bound depends on the outer iname and data.

    For each ``i`` the kernel sums ``j`` over ``0 <= j < i + ell[i]``.  With
    ``ell = arange(n)`` the bound is ``2*i``, so the expected result is
    ``(2*i - 1) * 2*i / 2``.
    """
    int_dtype = np.dtype(np.int32)
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    domains = [
        "{[i]: 0<=i<n}",
        "{[j]: 0<=j<i+sumlen}",
    ]
    instructions = [
        "<> sumlen = ell[i]",
        "a[i] = sum(j, j)",
    ]
    kernel_data = [
        lp.ValueArg("n", np.int32),
        lp.GlobalArg("a", int_dtype, ("n",)),
        lp.GlobalArg("ell", np.int32, ("n",)),
    ]
    kernel = lp.make_kernel(domains, instructions, kernel_data)
    compiled = lp.CompiledKernel(context, kernel)

    size = 330
    lengths = np.arange(size, dtype=np.int32)
    _, (result,) = compiled(queue, ell=lengths, n=size, out_host=True)

    expected = (2*lengths-1)*2*lengths/2
    assert (result == expected).all()
Ejemplo n.º 11
0
def test_dependent_loop_bounds(ctx_factory):
    """Print highlighted code for a row sum bounded by a data-dependent length.

    ``row_len`` is a temporary computed from consecutive ``a_rowstarts``
    entries and bounds the ``jj`` domain of the sum written to ``a_sum``.
    The kernel is only compiled; nothing is executed.
    """
    value_dtype = np.dtype(np.float32)
    context = ctx_factory()

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    instructions = [
        "<> row_len = a_rowstarts[i+1] - a_rowstarts[i]",
        "a_sum[i] = sum(jj, a_values[[a_rowstarts[i]+jj]])",
    ]
    kernel_data = [
        lp.GlobalArg("a_rowstarts", np.int32, shape=lp.auto),
        lp.GlobalArg("a_indices", np.int32, shape=lp.auto),
        lp.GlobalArg("a_values", value_dtype),
        lp.GlobalArg("a_sum", value_dtype, shape=lp.auto),
        lp.ValueArg("n", np.int32),
    ]

    kernel = lp.make_kernel(
        domains, instructions, kernel_data,
        assumptions="n>=1 and row_len>=1")

    compiled = lp.CompiledKernel(context, kernel)
    rule = "---------------------------------------------------"
    print(rule)
    print(compiled.get_highlighted_code())
    print(rule)
Ejemplo n.º 12
0
def test_triangle_domain(ctx_factory):
    """Print the kernel and highlighted code for a domain with ``i <= j``.

    The single instruction writes 17 to ``a[i,j]`` for all ``0 <= i, j < n``
    that satisfy ``i <= j``.
    """
    context = ctx_factory()

    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<n and i <= j}",
        "a[i,j] = 17",
        assumptions="n>=1")

    print(kernel)
    compiled = lp.CompiledKernel(context, kernel)
    print(compiled.get_highlighted_code())
Ejemplo n.º 13
0
def solve_it(n, r, ctx, a, b):
    """Run the ``LU_decomposition`` kernel, then the ``LU_solver`` kernel.

    The first output of the decomposition kernel is cast to float64 and
    passed, together with a copy of *b*, to the solver kernel.  Both kernels
    run on one profiling-enabled command queue.

    Note the parameter mapping: both kernels receive *r* as their ``n``
    argument, and the solver receives *n* as its ``r`` argument.

    :arg n: passed to the solver kernel as ``r``.
    :arg r: passed to both kernels as ``n``.
    :arg ctx: context used to build the kernels and the queue.
    :arg a: passed to the decomposition kernel as ``syst``.
    :arg b: copied and passed to the solver kernel as ``bcopy``.
    :returns: the solver's first output, fetched with ``get()``, transposed
        and returned as a float64 copy.
    """
    bcopy = b.copy()

    decompose_knl = LU_decomposition(ctx)
    # A single queue serves both kernel launches.
    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    decompose = lp.CompiledKernel(ctx, decompose_knl)
    # The second element of the result is the sequence of kernel outputs.
    _, decompose_out = decompose(queue, syst=a, n=r)
    lu = decompose_out[0].astype(np.float64)

    solve_knl = LU_solver(ctx)
    solve = lp.CompiledKernel(ctx, solve_knl)
    _, solve_out = solve(queue, LU=lu, bcopy=bcopy, n=r, r=n)

    return solve_out[0].get().transpose().astype(np.float64).copy()
Ejemplo n.º 14
0
def test_rob_stroud_bernstein(ctx_factory):
    """Print highlighted code for a kernel accumulating ``w * coeffs[aind]``.

    Inside the ``el``/``i2`` loops a running index ``aind`` and a weight ``w``
    are updated across the ``alpha1``/``alpha2`` loops, with explicit
    instruction ids and dependencies fixing the order.  ``nqp1d`` and
    ``deg`` are fixed to 7 and 4, ``el`` is split twice and the remaining
    inames are tagged before the code is generated for float32 arguments.
    """
    context = ctx_factory()

    # NOTE: tmp would have to be zero-filled beforehand

    domain = "{[el, i2, alpha1,alpha2]: \
                    0 <= el < nels and \
                    0 <= i2 < nqp1d and \
                    0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }"
    instructions = """
            for el,i2
                <> xi = qpts[1, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init}

                for alpha1
                    <> w = s**(deg-alpha1) {id=init_w}

                    for alpha2
                        tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \
                                {id=write_tmp}
                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                                {id=update_w,dep=init_w:write_tmp}
                        aind = aind + 1 \
                                {id=aind_incr,dep=aind_init:write_tmp:update_w}
                    end
                end
            end
            """
    kernel_data = [
        # coeffs must be declared with "no" shape so that loopy does not
        # try to determine the shape automatically.
        lp.GlobalArg("coeffs", None, shape=None),
        "...",
    ]
    kernel = lp.make_kernel(
        domain, instructions, kernel_data,
        assumptions="deg>=0 and nels>=1")

    kernel = lp.fix_parameters(kernel, nqp1d=7, deg=4)
    kernel = lp.split_iname(kernel, "el", 16, inner_tag="l.0")
    kernel = lp.split_iname(
        kernel, "el_outer", 2,
        outer_tag="g.0", inner_tag="ilp", slabs=(0, 1))
    kernel = lp.tag_inames(
        kernel, {"i2": "l.1", "alpha1": "unr", "alpha2": "unr"})

    arg_dtypes = {
        "qpts": np.float32,
        "coeffs": np.float32,
        "tmp": np.float32,
    }
    compiled = lp.CompiledKernel(context, kernel)
    print(compiled.get_highlighted_code(arg_dtypes))
Ejemplo n.º 15
0
def test_modulo_indexing(ctx_factory):
    """Print the kernel and highlighted code for an index taken modulo ``n``.

    ``b[i]`` sums ``a[(i+j)%n]`` over ``0 <= j < 5``; the code is generated
    with ``a`` typed as float32.
    """
    context = ctx_factory()

    instructions = """
                b[i] = sum(j, a[(i+j)%n])
                """
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        instructions,
        [lp.GlobalArg("a", None, shape="n"), "..."])

    print(kernel)
    compiled = lp.CompiledKernel(context, kernel)
    print(compiled.get_highlighted_code({"a": np.float32}))
Ejemplo n.º 16
0
def test_arg_guessing(ctx_factory):
    """Build a kernel without any declared arguments and print its code.

    No kernel data is passed to ``lp.make_kernel``; the names ``a``, ``b``,
    ``c`` and ``n`` appear only in the domain and the instructions.
    """
    context = ctx_factory()

    instructions = """
                a = 1.5 + sum((i,j), i*j)
                b[i, j] = i*j
                c[i+j, j] = b[j,i]
                """
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<n }", instructions, assumptions="n>=1")

    print(kernel)
    compiled = lp.CompiledKernel(context, kernel)
    print(compiled.get_highlighted_code())
Ejemplo n.º 17
0
def test_triangle_domain(ctx):
    """Build the ``i <= j`` kernel, print it and its code, and return it.

    The single instruction writes 17 to ``a[i,j]`` for all ``0 <= i, j < n``
    that satisfy ``i <= j``.

    :arg ctx: context whose first device is passed to ``lp.make_kernel`` and
        which is used to compile the kernel.
    :returns: the kernel created by ``lp.make_kernel``.
    """
    # NOTE(review): this call passes a device as the first argument of
    # make_kernel -- confirm that this matches the loopy version in use.
    knl = lp.make_kernel(
        ctx.devices[0],
        [
            "{[i,j]: 0<=i,j<n and i <= j}",
        ],
        "a[i,j] = 17",
        assumptions="n>=1")

    # print() calls with a single argument behave the same on Python 2 and 3.
    print(knl)
    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
    return knl
Ejemplo n.º 18
0
def test_owed_barriers(ctx_factory):
    """Print the code of every schedule generated for a tagged copy kernel.

    The kernel copies ``a[i]`` into a float32 temporary ``z[i]`` for
    ``0 <= i < 100`` with ``i`` tagged ``l.0``.
    """
    context = ctx_factory()

    kernel = lp.make_kernel(
        "{[i]: 0<=i<100}",
        ["<float32> z[i] = a[i]"],
        [lp.GlobalArg("a", np.float32, shape=(100, ))])
    kernel = lp.tag_inames(kernel, {"i": "l.0"})
    kernel = lp.preprocess_kernel(kernel, context.devices[0])

    for scheduled in lp.generate_loop_schedules(kernel):
        print(lp.CompiledKernel(context, scheduled).get_code())
Ejemplo n.º 19
0
def test_nonlinear_index(ctx_factory):
    """Print the kernel and highlighted code for the subscript ``a[i*i]``."""
    context = ctx_factory()

    instructions = """
                a[i*i] = 17
                """
    kernel_data = [
        lp.GlobalArg("a", shape="n"),
        lp.ValueArg("n"),
    ]
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<n }", instructions, kernel_data,
        assumptions="n>=1")

    print(kernel)
    compiled = lp.CompiledKernel(context, kernel)
    print(compiled.get_highlighted_code())
Ejemplo n.º 20
0
def test_arg_guessing_with_reduction(ctx_factory):
    """Build an argument-free kernel using ``simul_reduce`` and print its code.

    No kernel data is passed to ``lp.make_kernel``; all names appear only in
    the domain and the instructions.
    """
    # For debugging: logging.basicConfig(level=logging.DEBUG)
    context = ctx_factory()

    instructions = """
                a = 1.5 + simul_reduce(sum, (i,j), i*j)
                d = 1.5 + simul_reduce(sum, (i,j), b[i,j])
                b[i, j] = i*j
                c[i+j, j] = b[j,i]
                """
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<n }", instructions, assumptions="n>=1")

    print(kernel)
    compiled = lp.CompiledKernel(context, kernel)
    print(compiled.get_highlighted_code())
Ejemplo n.º 21
0
def test_simple_side_effect(ctx_factory):
    """Print every schedule, and its code, for the update ``a[i] = a[i] + 1``.

    The domain declares both ``i`` and ``j`` over ``0..99``; only ``i``
    appears in the instruction.
    """
    context = ctx_factory()

    instructions = """
                a[i] = a[i] + 1
                """
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<100}",
        instructions,
        [lp.GlobalArg("a", np.float32, shape=(100, ))])
    kernel = lp.preprocess_kernel(kernel, context.devices[0])

    for scheduled in lp.generate_loop_schedules(kernel):
        print(scheduled)
        print(lp.CompiledKernel(context, scheduled).get_code())
Ejemplo n.º 22
0
def test_fuzz_code_generator(ctx_factory):
    """Compare loopy's result for random expressions with ``pymbolic.evaluate``.

    For each of 50 examples from ``generate_random_fuzz_examples`` the
    expression is evaluated on the host and through a one-assignment kernel
    writing a complex128 scalar ``value``.  Examples whose host evaluation
    raises ``ZeroDivisionError`` are skipped.  If the relative error exceeds
    1e-10, diagnostics are printed and the test fails.

    The test is skipped on platforms whose vendor string starts with
    "Advanced Micro".

    :raises AssertionError: when a kernel result deviates from the
        reference value.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if ctx.devices[0].platform.vendor.startswith("Advanced Micro"):
        pytest.skip("crashes on AMD 15.12")

    # Loop-invariant, so imported and defined once before the loop.
    from pymbolic import evaluate

    def get_dtype(x):
        """Return complex128 for complex values and float64 otherwise."""
        if isinstance(x, (complex, np.complexfloating)):
            return np.complex128
        return np.float64

    separator = 80 * "-"

    # Alternative example source: expr_fuzz.get_fuzz_examples()
    for expr, var_values in generate_random_fuzz_examples(50):
        try:
            true_value = evaluate(expr, var_values)
        except ZeroDivisionError:
            continue

        knl = lp.make_kernel("{ : }", [lp.Assignment("value", expr)],
                             [lp.GlobalArg("value", np.complex128, shape=())] +
                             [
                                 lp.ValueArg(name, get_dtype(val))
                                 for name, val in six.iteritems(var_values)
                             ])
        ck = lp.CompiledKernel(ctx, knl)
        evt, (lp_value, ) = ck(queue, out_host=True, **var_values)

        # NOTE(review): a reference value of exactly zero makes this
        # relative error undefined -- confirm whether that case can occur.
        err = abs(true_value - lp_value) / abs(true_value)
        if abs(err) > 1e-10:
            print(separator)
            print("WRONG: rel error=%g" % err)
            print("true=%r" % true_value)
            print("loopy=%r" % lp_value)
            print(separator)
            print(ck.get_code())
            print(separator)
            print(var_values)
            print(separator)
            print(repr(expr))
            print(separator)
            print(expr)
            print(separator)
            # Fail explicitly instead of provoking a ZeroDivisionError.
            raise AssertionError(
                "loopy result deviates from reference (rel error=%g)" % err)
Ejemplo n.º 23
0
def test_wg_too_small(ctx_factory):
    """Expect ``RuntimeError`` when generating code for each schedule.

    The kernel copies 100 entries with ``i`` tagged ``l.0`` while
    ``local_sizes`` is set to ``{0: 16}``.
    """
    import pytest

    context = ctx_factory()

    kernel = lp.make_kernel(
        "{[i]: 0<=i<100}",
        ["<float32> z[i] = a[i] {id=copy}"],
        [lp.GlobalArg("a", np.float32, shape=(100, ))],
        local_sizes={0: 16})
    kernel = lp.tag_inames(kernel, {"i": "l.0"})
    kernel = lp.preprocess_kernel(kernel, context.devices[0])

    for scheduled in lp.generate_loop_schedules(kernel):
        with pytest.raises(RuntimeError):
            lp.CompiledKernel(context, scheduled).get_code()
Ejemplo n.º 24
0
def test_multi_cse(ctx_factory):
    """Print the code of every schedule for ``z[i] = a[i] + a[i]**2``.

    ``i`` is split by 16 with the inner part tagged ``l.0``, and
    ``lp.add_prefetch`` is applied to ``a`` with an empty sweep list.
    """
    context = ctx_factory()

    kernel = lp.make_kernel(
        "{[i]: 0<=i<100}",
        ["<float32> z[i] = a[i] + a[i]**2"],
        [lp.GlobalArg("a", np.float32, shape=(100, ))],
        local_sizes={0: 16})
    kernel = lp.split_iname(kernel, "i", 16, inner_tag="l.0")
    kernel = lp.add_prefetch(kernel, "a", [])
    kernel = lp.preprocess_kernel(kernel, context.devices[0])

    for scheduled in lp.generate_loop_schedules(kernel):
        print(lp.CompiledKernel(context, scheduled).get_code())
Ejemplo n.º 25
0
        def get_2d_knl(context, dtype):
            """Build and compile the kernel that fills the 1D ``x``/``y`` arrays.

            For ``0 <= i < n`` the instructions compute
            ``phi = 2*M_PI/n * i`` and write
            ``0.5*(3*cos(phi) + 2*sin(3*phi))`` to ``x[i]`` and
            ``0.5*(sin(phi) + 1.5*sin(2*phi))`` to ``y[i]``.

            *context* is handed to ``lp.CompiledKernel``; *dtype* is the
            element type of the ``x`` and ``y`` arguments.
            """
            instructions = """
                    <> phi = 2*M_PI/n * i
                    x[i] = 0.5* (3*cos(phi) + 2*sin(3*phi))
                    y[i] = 0.5* (1*sin(phi) + 1.5*sin(2*phi))
                    """
            kernel_data = [
                lp.GlobalArg("x,y", dtype, shape=lp.auto),
                lp.ValueArg("n", np.int32),
            ]
            kernel = lp.make_kernel(
                "{[i]: 0<=i<n}", instructions, kernel_data)

            kernel = lp.split_iname(
                kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")

            return lp.CompiledKernel(context, kernel)
Ejemplo n.º 26
0
def test_assume(ctx_factory):
    """Check that the added assumptions leave no ``if`` in the generated code.

    After splitting ``i`` by 16, the assumptions ``n mod 16 = 0`` and
    ``n > 10`` are added.  For every generated schedule the test prints the
    schedule and its code and asserts that the code does not contain the
    substring ``"if"``.
    """
    context = ctx_factory()

    kernel = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "a[i] = a[i] + 1",
        [lp.GlobalArg("a", np.float32, shape="n"), "..."])

    kernel = lp.split_iname(kernel, "i", 16)
    kernel = lp.set_loop_priority(kernel, "i_outer,i_inner")
    for assumption in ("n mod 16 = 0", "n > 10"):
        kernel = lp.assume(kernel, assumption)
    kernel = lp.preprocess_kernel(kernel, context.devices[0])

    for scheduled in lp.generate_loop_schedules(kernel):
        print(scheduled)
        compiled = lp.CompiledKernel(context, scheduled)
        print(compiled.get_code())
        assert "if" not in compiled.get_code()
Ejemplo n.º 27
0
def test_arg_shape_guessing(ctx_factory):
    """Build a kernel whose array arguments all use ``shape=lp.auto``.

    ``a``, ``b`` and ``c`` are declared without dtype and with automatic
    shape; the kernel and its highlighted code are printed.
    """
    context = ctx_factory()

    instructions = """
                a = 1.5 + sum((i,j), i*j)
                b[i, j] = i*j
                c[i+j, j] = b[j,i]
                """
    kernel_data = [
        lp.GlobalArg("a", shape=lp.auto),
        lp.GlobalArg("b", shape=lp.auto),
        lp.GlobalArg("c", shape=lp.auto),
        lp.ValueArg("n"),
    ]
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<n }", instructions, kernel_data,
        assumptions="n>=1")

    print(kernel)
    compiled = lp.CompiledKernel(context, kernel)
    print(compiled.get_highlighted_code())
Ejemplo n.º 28
0
def test_write_parameter(ctx_factory):
    """Expect ``RuntimeError`` for a kernel that assigns to its parameter ``n``.

    ``n`` bounds the domain and is declared as a value argument, yet the
    instructions contain ``n = 15``.
    """
    import pytest

    value_dtype = np.float32
    context = ctx_factory()

    instructions = """
                a = sum((i,j), i*j)
                b = sum(i, sum(j, i*j))
                n = 15
                """
    kernel_data = [
        lp.GlobalArg("a", value_dtype, shape=()),
        lp.GlobalArg("b", value_dtype, shape=()),
        lp.ValueArg("n", np.int32, approximately=1000),
    ]
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<n }", instructions, kernel_data,
        assumptions="n>=1")

    with pytest.raises(RuntimeError):
        lp.CompiledKernel(context, kernel).get_code()
Ejemplo n.º 29
0
        def get_3d_knl(context, dtype):
            """Build and compile the kernel that fills the 3D ``x``/``y``/``z``.

            For ``0 <= i, j, k < n`` the instructions scale the indices by
            ``1/(n-1)``, combine them using the sines and cosines of the two
            fixed angles 0.3 and 0.7, multiply by 4 and subtract 2.

            Only the ``j`` and ``k`` inames are split (by 16); ``i`` is left
            as it is.  *context* is handed to ``lp.CompiledKernel``; *dtype*
            is the element type of the output arguments.
            """
            instructions = """
                    <> xx = i/(n-1)
                    <> yy = j/(n-1)
                    <> zz = k/(n-1)

                    <float64> phi = 0.3
                    <> s1 = sin(phi)
                    <> c1 = cos(phi)

                    <> xxx = c1*xx + s1*yy
                    <> yyy = -s1*xx + c1*yy
                    <> zzz = zz

                    <float64> theta = 0.7
                    <> s2 = sin(theta)
                    <> c2 = cos(theta)

                    x[i,j,k] = 4 * (c2*xxx + s2*zzz) - 2
                    y[i,j,k] = 4 * yyy - 2
                    z[i,j,k] = 4 * (-s2*xxx + c2*zzz) - 2
                    """
            kernel_data = [
                lp.GlobalArg("x,y,z", dtype, shape=lp.auto),
                lp.ValueArg("n", np.int32),
            ]
            kernel = lp.make_kernel(
                "{[i,j,k]: 0<=i,j,k<n}", instructions, kernel_data,
                assumptions="n>0")

            # (iname, outer tag, inner tag)
            split_plan = (
                ("j", "g.1", "l.1"),
                ("k", "g.0", "l.0"),
            )
            for iname, outer, inner in split_plan:
                kernel = lp.split_iname(
                    kernel, iname, 16, outer_tag=outer, inner_tag=inner)

            return lp.CompiledKernel(context, kernel)
Ejemplo n.º 30
0
def test_dependent_loop_bounds_3(ctx_factory):
    """Exercise a domain dependency carried only by the ``row_len`` temporary.

    The point of this test is that it shows a dependency between domains
    that is exclusively mediated by the row_len temporary.  It also makes
    sure that row_len gets read before any conditionals use it.

    After printing the code for the kernel with ``i`` split, ``jj`` is split
    and tagged as well, and scheduling that variant is expected to raise
    ``RuntimeError``.
    """
    value_dtype = np.dtype(np.float32)
    context = ctx_factory()

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    instructions = [
        "<> row_len = a_row_lengths[i]",
        "a[i,jj] = 1",
    ]
    kernel_data = [
        lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
        lp.GlobalArg("a", value_dtype, shape=("n,n"), order="C"),
        lp.ValueArg("n", np.int32),
    ]
    kernel = lp.make_kernel(domains, instructions, kernel_data)

    # The jj domain (index 1) must be nested inside the i domain (index 0).
    assert kernel.parents_per_domain()[1] == 0

    kernel = lp.split_iname(
        kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")

    compiled = lp.CompiledKernel(context, kernel)
    rule = "---------------------------------------------------"
    print(rule)
    print(compiled.get_highlighted_code())
    print(rule)

    bad_kernel = lp.split_iname(
        kernel, "jj", 128, outer_tag="g.1", inner_tag="l.1")

    # NOTE(review): the kernel preprocessed here is not the one scheduled
    # below, and the result is unused -- confirm whether bad_kernel was
    # meant to be preprocessed instead.
    kernel = lp.preprocess_kernel(kernel, context.devices[0])

    with pytest.raises(RuntimeError):
        list(lp.generate_loop_schedules(bad_kernel))