Beispiel #1
0
def test_variable_size_matrix_mul(ctx_factory):
    """Compare a transformed matmul of run-time size ``n`` with its reference.

    ``i`` and ``j`` are split by 16 onto group/local axes, ``k`` is split by
    8, and prefetches of ``a`` and ``b`` are added before the result is
    checked via ``lp.auto_test_vs_ref`` against the untransformed kernel.

    The test is skipped when the first device reports no image support or
    when its platform is named "Portable Computing Language".
    """
    ctx = ctx_factory()

    lacks_images = not ctx.devices[0].image_support
    if (lacks_images
            or ctx.devices[0].platform.name == "Portable Computing Language"):
        pytest.skip("crashes on pocl")

    n = get_suitable_size(ctx)

    ref_knl = lp.add_dtypes(
            lp.make_kernel(
                "{[i,j,k]: 0<=i,j,k<n}",
                "c[i, j] = sum(k, a[i, k]*b[k, j])"),
            {"a": np.float32, "b": np.float32})

    tiled = lp.split_iname(
            ref_knl, "i", 16, outer_tag="g.0", inner_tag="l.1", slabs=(0, 1))
    tiled = lp.split_iname(
            tiled, "j", 16, outer_tag="g.1", inner_tag="l.0", slabs=(0, 1))
    tiled = lp.split_iname(tiled, "k", 8, slabs=(0, 1))

    tiled = lp.add_prefetch(
            tiled, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    tiled = lp.add_prefetch(
            tiled, "b", ["j_inner", "k_inner"], default_tag="l.auto")

    gflop_count = 2*n**3/1e9
    lp.auto_test_vs_ref(
            ref_knl, ctx, tiled,
            op_count=[gflop_count], op_label=["GFlops"],
            parameters={"n": n})
Beispiel #2
0
def test_troublesome_premagma_fermi_matrix_mul(ctx_factory):
    """Check a doubly split matmul of fixed size against the plain kernel.

    ``i`` and ``j`` are each split twice (first by 2*16 onto a group axis,
    then the inner part by 2 onto a local axis with an "ilp" innermost
    iname), ``k`` is split by 16, and a prefetch of ``a`` is added.
    """
    ctx = ctx_factory()

    dtype = np.float32
    order = "C"
    n = 6*16*2

    matrix_args = [
            lp.GlobalArg(name, dtype, shape=(n, n), order=order)
            for name in ("a", "b", "c")]

    seq_knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
            matrix_args,
            name="matmul")

    # Both axes use the same blocking: 2 entries per work item, 16 chunks.
    regs = 2
    chunks = 16

    knl = seq_knl
    for iname, group_tag, local_tag in [
            ("i", "g.0", "l.0"),
            ("j", "g.1", "l.1"),
            ]:
        knl = lp.split_iname(knl, iname, regs*chunks, outer_tag=group_tag)
        knl = lp.split_iname(
                knl, iname + "_inner", regs,
                outer_tag=local_tag, inner_tag="ilp")

    knl = lp.split_iname(knl, "k", 16)
    knl = lp.add_prefetch(
            knl, "a", ["k_inner", "i_inner_inner", "i_inner_outer"])

    lp.auto_test_vs_ref(
            seq_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={})
Beispiel #3
0
def test_fancy_matrix_mul(ctx_factory):
    """Check a transformed matmul with symbolic size ``n`` against the plain one.

    The kernel declares ``n`` as a value argument (with ``n>=1`` assumed),
    splits ``i``, ``j`` and ``k`` by 16 and adds prefetches of ``a`` and
    ``b`` before comparing with ``lp.auto_test_vs_ref``.
    """
    ctx = ctx_factory()

    dtype = np.float32
    order = "C"
    n = get_suitable_size(ctx)

    kernel_args = [
            lp.GlobalArg(name, dtype, shape="(n, n)", order=order)
            for name in ("a", "b", "c")]
    kernel_args.append(lp.ValueArg("n", np.int32, approximately=1000))

    seq_knl = lp.make_kernel(
            "[n] -> {[i,j,k]: 0<=i,j,k<n }",
            ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
            kernel_args,
            name="fancy_matmul", assumptions="n>=1")

    par_knl = lp.split_iname(
            seq_knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    par_knl = lp.split_iname(
            par_knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    par_knl = lp.split_iname(par_knl, "k", 16, slabs=(0, 1))
    par_knl = lp.add_prefetch(par_knl, "a", ["i_inner", "k_inner"])
    par_knl = lp.add_prefetch(par_knl, "b", ["k_inner", "j_inner"])

    lp.auto_test_vs_ref(
            seq_knl, ctx, par_knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={"n": n})
Beispiel #4
0
def get_tensor(ctx):
  """Build the kernel ``res[i,j,k] = sum(alpha, u[alpha,i]*v[alpha,j]*w[alpha,k])``.

  The kernel is created for ``ctx.devices[0]``; ``i``, ``j`` and ``k`` are
  each split by 8 onto group/local axes 0, 1 and 2, and ``alpha`` is split
  by 2.  Returns the transformed kernel.
  """
  dtype = np.float32
  order = "C"

  domains = ["{[j,i,alpha,k]: 0<=alpha<r and 0<=i,j,k<n}"]
  instructions = ["res[i,j,k]=sum((alpha), u[alpha,i]*v[alpha,j]*w[alpha,k])"]

  kernel_args = [lp.GlobalArg("res", dtype, shape="n, n, n", order=order)]
  kernel_args.extend(
      lp.GlobalArg(factor, dtype, shape="r, n", order=order)
      for factor in ("v", "u", "w"))
  kernel_args.extend([
      lp.ValueArg("n", np.int32),
      lp.ValueArg("r", np.int32),
  ])

  tensor_knl = lp.make_kernel(
      ctx.devices[0], domains, instructions, kernel_args,
      assumptions="n>=1")

  tensor_knl = lp.split_iname(
      tensor_knl, "i", 8, outer_tag="g.0", inner_tag="l.0")
  tensor_knl = lp.split_iname(
      tensor_knl, "j", 8, outer_tag="g.1", inner_tag="l.1")
  tensor_knl = lp.split_iname(tensor_knl, "alpha", 2)
  tensor_knl = lp.split_iname(
      tensor_knl, "k", 8, outer_tag="g.2", inner_tag="l.2")

  return tensor_knl
Beispiel #5
0
 def variant_2(knl):
     """Split ``i``/``j`` by 16, prefetch ``a`` by bounding box, order the fetch loops.

     Returns the transformed kernel.
     """
     tiled = lp.split_iname(
             knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
     tiled = lp.split_iname(
             tiled, "j", 16, outer_tag="g.0", inner_tag="l.0")
     with_fetch = lp.add_prefetch(
             tiled, "a", ["i_inner", "j_inner"], fetch_bounding_box=True)
     return lp.set_loop_priority(
             with_fetch, ["a_dim_0_outer", "a_dim_1_outer"])
Beispiel #6
0
def test_plain_matrix_mul(ctx_factory):
    """Check a transformed fixed-size matmul for float4 and float32 entries.

    For each entry type the kernel is split by 16 along ``i``, ``j`` and
    ``k``, prefetches of ``a`` and ``b`` are added, and the result is
    compared with the untransformed kernel (using ``check_float4`` as the
    result checker in the float4 case).
    """
    ctx = ctx_factory()
    order = "C"
    n = get_suitable_size(ctx)

    # (entry dtype, custom result checker or None, scalars per entry)
    cases = [
            (cl_array.vec.float4, check_float4, 4),
            (np.float32, None, 1),
            ]

    for dtype, check, vec_size in cases:
        matrix_args = [
                lp.GlobalArg(name, dtype, shape=(n, n), order=order)
                for name in ("a", "b", "c")]

        ref_knl = lp.make_kernel(
                "{[i,j,k]: 0<=i,j,k<%d}" % n,
                ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
                matrix_args,
                name="matmul")

        tiled = lp.split_iname(
                ref_knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
        tiled = lp.split_iname(
                tiled, "j", 16, outer_tag="g.1", inner_tag="l.0")
        tiled = lp.split_iname(tiled, "k", 16)
        tiled = lp.add_prefetch(tiled, "a", ["k_inner", "i_inner"])
        tiled = lp.add_prefetch(tiled, "b", ["j_inner", "k_inner"])

        lp.auto_test_vs_ref(
                ref_knl, ctx, tiled,
                op_count=[vec_size*2*n**3/1e9], op_label=["GFlops"],
                parameters={"n": n}, check_result=check)
Beispiel #7
0
def test_transpose(ctx_factory):
    """Check a transformed ``b[i, j] = a[j, i]`` kernel against the plain one.

    ``i`` and ``j`` are split by 16 onto group/local axes and a prefetch of
    ``a`` is added; the reported operation count is the number of bytes
    read plus written, in units of 1e9.
    """
    ctx = ctx_factory()

    dtype = np.dtype(np.float32)
    order = "C"
    n = get_suitable_size(ctx)

    seq_knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<%d}" % n,
            ["b[i, j] = a[j, i]"],
            [
                lp.GlobalArg(name, dtype, shape=(n, n), order=order)
                for name in ("a", "b")
                ],
            name="transpose")

    tiled = lp.split_iname(
            seq_knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    tiled = lp.split_iname(
            tiled, "j", 16, outer_tag="g.1", inner_tag="l.0")
    tiled = lp.add_prefetch(tiled, "a", ["i_inner", "j_inner"])

    gbytes_moved = dtype.itemsize*n**2*2/1e9
    lp.auto_test_vs_ref(
            seq_knl, ctx, tiled,
            op_count=[gbytes_moved], op_label=["GByte"],
            parameters={})
Beispiel #8
0
def test_variable_size_matrix_mul(ctx_factory):
    """Compare a transformed matmul of run-time size ``n`` with its reference.

    ``i`` and ``j`` are split by 16 onto group/local axes, ``k`` is split by
    8, and prefetches of ``a`` and ``b`` are added before the result is
    checked via ``lp.auto_test_vs_ref`` against the untransformed kernel.
    """
    ctx = ctx_factory()
    n = get_suitable_size(ctx)

    ref_knl = lp.add_dtypes(
            lp.make_kernel(
                "{[i,j,k]: 0<=i,j,k<n}",
                "c[i, j] = sum(k, a[i, k]*b[k, j])"),
            {"a": np.float32, "b": np.float32})

    tiled = lp.split_iname(
            ref_knl, "i", 16, outer_tag="g.0", inner_tag="l.1", slabs=(0, 1))
    tiled = lp.split_iname(
            tiled, "j", 16, outer_tag="g.1", inner_tag="l.0", slabs=(0, 1))
    tiled = lp.split_iname(tiled, "k", 8, slabs=(0, 1))

    tiled = lp.add_prefetch(tiled, "a", ["k_inner", "i_inner"])
    tiled = lp.add_prefetch(tiled, "b", ["j_inner", "k_inner"])

    gflop_count = 2*n**3/1e9
    lp.auto_test_vs_ref(
            ref_knl, ctx, tiled,
            op_count=[gflop_count], op_label=["GFlops"],
            parameters={"n": n})
Beispiel #9
0
 def variant_2(knl):
     """Split ``im_x``/``im_y`` by 16, tag ``ifeat`` as "g.2", prefetch ``f`` and ``img``.

     Returns the transformed kernel.
     """
     result = lp.split_iname(
             knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0")
     result = lp.split_iname(
             result, "im_y", 16, outer_tag="g.1", inner_tag="l.1")
     result = lp.tag_inames(result, {"ifeat": "g.2"})
     result = lp.add_prefetch(result, "f[ifeat,:,:,:]")
     return lp.add_prefetch(
             result, "img", "im_x_inner, im_y_inner, f_x, f_y")
Beispiel #10
0
        def get_3d_knl(context, dtype):
            """Return a compiled kernel that fills ``x``, ``y`` and ``z`` on an n^3 grid.

            The instructions scale the indices by ``n-1``, apply two
            sine/cosine combinations (angles 0.3 and 0.7) and write
            ``4*value - 2`` into the three output arrays.  ``j`` and ``k``
            are split by 16 onto group/local axes 1 and 0.
            """
            output_args = [
                lp.GlobalArg("x,y,z", dtype, shape=lp.auto),
                lp.ValueArg("n", np.int32),
                ]

            grid_knl = lp.make_kernel(
                "{[i,j,k]: 0<=i,j,k<n}",
                """
                    <> xx = i/(n-1)
                    <> yy = j/(n-1)
                    <> zz = k/(n-1)

                    <float64> phi = 0.3
                    <> s1 = sin(phi)
                    <> c1 = cos(phi)

                    <> xxx = c1*xx + s1*yy
                    <> yyy = -s1*xx + c1*yy
                    <> zzz = zz

                    <float64> theta = 0.7
                    <> s2 = sin(theta)
                    <> c2 = cos(theta)

                    x[i,j,k] = 4 * (c2*xxx + s2*zzz) - 2
                    y[i,j,k] = 4 * yyy - 2
                    z[i,j,k] = 4 * (-s2*xxx + c2*zzz) - 2
                    """,
                output_args,
                assumptions="n>0")

            grid_knl = lp.split_iname(
                grid_knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
            grid_knl = lp.split_iname(
                grid_knl, "k", 16, outer_tag="g.0", inner_tag="l.0")

            compiled = lp.CompiledKernel(context, grid_knl)
            return compiled
Beispiel #11
0
def no_test_global_parallel_reduction(ctx_factory, size):
    """Work-in-progress experiment with a split global reduction.

    Builds a kernel that sums the four components of ``philox4x32_f32``
    output over ``i``, splits ``i`` twice, splits the reduction inward and
    precomputes the per-``i_outer`` reduction argument.

    NOTE: the function prints the kernel and then deliberately fails with
    ``1/0`` (ZeroDivisionError); the statements after that line are
    currently unreachable.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            <> key = make_uint2(i, 324830944)  {inames=i}
            <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
            """)

    # ref_knl = knl

    # Split i by gsize*20, then split the inner part by gsize with the
    # resulting outer iname tagged "l.0".
    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer")
    print(knl)
    # Intentional hard stop: everything below never runs.
    1/0
    knl = lp.realize_reduction(knl)

    evt, (z,) = knl(queue, n=size)
Beispiel #12
0
def test_equality_constraints(ctx_factory):
    """Check a kernel whose second domain pins ``k`` via ``k = i+5``.

    ``a[i,j]`` is set to 5 everywhere, then ``b[i,k]`` is set to 22; the
    version with ``i`` and ``j`` split by 16 onto group/local axes is
    compared with the untransformed kernel for ``n = 10``.
    """
    ctx = ctx_factory()

    dtype = np.float32
    order = "C"
    n = 10

    domains = [
            "[n] -> {[i,j]: 0<=i,j<n }",
            "{[k]: k =i+5 and k < n}",
            ]
    instructions = [
            "a[i,j] = 5 {id=set_all}",
            "b[i,k] = 22 {dep=set_all}",
            ]
    kernel_args = [
            lp.GlobalArg("a,b", dtype, shape="n, n", order=order),
            lp.ValueArg("n", np.int32, approximately=1000),
            ]

    seq_knl = lp.make_kernel(
            domains, instructions, kernel_args,
            name="equality_constraints", assumptions="n>=1")

    par_knl = lp.split_iname(
            seq_knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
    par_knl = lp.split_iname(
            par_knl, "j", 16, outer_tag="g.1", inner_tag="l.1")

    lp.auto_test_vs_ref(
            seq_knl, ctx, par_knl,
            parameters={"n": n}, print_ref_code=True)
Beispiel #13
0
def test_fd_demo():
    """Generate code for a five-point stencil kernel and check it avoids "double".

    ``i`` and ``j`` are split by 16 onto group/local axes, ``u`` is
    prefetched with ``fetch_bounding_box=True``, and the generated code is
    printed and required not to contain the substring "double".
    """
    stencil = lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \
                + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
                + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]")

    stencil = lp.split_iname(
            stencil, "i", 16, outer_tag="g.1", inner_tag="l.1")
    stencil = lp.split_iname(
            stencil, "j", 16, outer_tag="g.0", inner_tag="l.0")
    stencil = lp.add_prefetch(
            stencil, "u", ["i_inner", "j_inner"],
            fetch_bounding_box=True, default_tag="l.auto")

    stencil = lp.set_options(stencil, write_cl=True)
    stencil = lp.add_and_infer_dtypes(stencil, {"u": np.float32})

    # Only the generated source is inspected; the second result is unused.
    code, _ = lp.generate_code(stencil)
    print(code)

    assert "double" not in code
Beispiel #14
0
def test_ilp_race_matmul(ctx_factory):
    """Check that an "ilp"-tagged split of ``j`` triggers a write-race warning.

    A 9x9 matmul reading ``a`` and ``b`` as images has ``j`` split by 2 with
    the outer iname tagged "ilp", ``k`` split by 2 and ``b`` prefetched.
    Preprocessing and scheduling are run with caching disabled, and the
    test asserts that at least one ``WriteRaceConditionWarning`` was
    recorded.

    ``ctx_factory`` is accepted for signature compatibility with the other
    tests but is not used.
    """
    dtype = np.float32
    order = "C"

    n = 9

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.ImageArg("a", dtype, shape=(n, n)),
                lp.ImageArg("b", dtype, shape=(n, n)),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                ],
            name="matmul")

    knl = lp.split_iname(knl, "j", 2, outer_tag="ilp", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 2)
    knl = lp.add_prefetch(knl, 'b', ["k_inner"])

    with lp.CacheMode(False):
        from loopy.diagnostic import WriteRaceConditionWarning
        from warnings import catch_warnings, simplefilter
        with catch_warnings(record=True) as warn_list:
            # Record every warning regardless of filters inherited from the
            # environment (e.g. "-W ignore"); otherwise the expected warning
            # could be dropped and the assertion below would fail spuriously.
            simplefilter("always")

            knl = lp.preprocess_kernel(knl)
            list(lp.generate_loop_schedules(knl))

            assert any(isinstance(w.message, WriteRaceConditionWarning)
                    for w in warn_list)
Beispiel #15
0
def right_u(ctx):
    """Build the kernel ``f[i,s] = sum((j,k), a[i,j,k]*v[j,s]*w[k,s])``.

    Sizes are fixed at ``n = 128`` and ``r = 3``.  The kernel is created for
    ``ctx.devices[0]``; ``i`` is split by 2 and ``s`` by ``r`` onto
    group/local axes, ``j`` and ``k`` are split by 16, and prefetches of
    ``v`` and ``w`` are added.  Returns the transformed kernel.
    """
    order = "C"
    dtype = np.float32
    n = 128
    r = 3

    knl = lp.make_kernel(
            ctx.devices[0],
            "{[i,j,k,s]: 0<=i<%d and 0<=j<%d and 0<=k<%d and 0<=s<%d}"
            % (n, n, n, r),
            [
                "f[i,s] = sum((j,k), a[i,j,k]*v[j,s]*w[k,s])",
                ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n, n), order=order),
                lp.GlobalArg("v", dtype, shape=(n, r), order=order),
                lp.GlobalArg("w", dtype, shape=(n, r), order=order),
                lp.GlobalArg("f", dtype, shape=(n, r), order=order),
                ],
            name="pravaya")

    knl = lp.split_iname(knl, "i", 2, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "s", r, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 16)
    knl = lp.split_iname(knl, "k", 16)

    # Only v and w are prefetched; no prefetch of "a" is applied.
    knl = lp.add_prefetch(knl, "v", ["s_inner", "j_inner"])
    knl = lp.add_prefetch(knl, "w", ["s_inner", "k_inner"])
    return knl
Beispiel #16
0
def test_independent_multi_domain(ctx_factory):
    """Run a kernel with two unrelated domains and verify both outputs.

    ``a[i] = 1`` and ``b[j] = 2`` live in separate domains; both inames are
    split by 16 onto "g.0"/"l.0".  The test asserts that neither domain has
    a parent, executes the kernel with ``n = 50`` and checks shapes and
    contents of the results.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    dtype = np.dtype(np.float32)

    domains = ["{[i]: 0<=i<n}", "{[j]: 0<=j<n}"]
    instructions = ["a[i] = 1", "b[j] = 2"]
    kernel_args = [
            lp.GlobalArg("a", dtype, shape="n", order="C"),
            lp.GlobalArg("b", dtype, shape="n", order="C"),
            lp.ValueArg("n", np.int32),
            ]

    knl = lp.make_kernel(domains, instructions, kernel_args)

    for iname in ("i", "j"):
        knl = lp.split_iname(
                knl, iname, 16, outer_tag="g.0", inner_tag="l.0")

    assert knl.parents_per_domain() == [None, None]

    n = 50
    compiled = lp.CompiledKernel(ctx, knl)
    evt, (a, b) = compiled(queue, n=n, out_host=True)

    expected_shape = (50,)
    assert a.shape == expected_shape
    assert b.shape == expected_shape
    assert (a == 1).all()
    assert (b == 2).all()
Beispiel #17
0
def test_ispc_streaming_stores():
    """Generate ISPC code for a stream-triad kernel tagged "!streaming_store".

    The kernel ``a[i] = b[i] + scalar * c[i]`` is built for
    ``lp.ISPCTarget()``, split by 2**18 and then 8, given float32 data and
    an explicit argument order, then preprocessed, scheduled and passed
    through code generation.  The test only requires that this completes.
    """
    index_dtype = np.int32
    stream_dtype = np.float32

    triad = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "a[i] = b[i] + scalar * c[i]",
            target=lp.ISPCTarget(), index_dtype=index_dtype,
            name="stream_triad")

    arg_names = ["a", "b", "c", "scalar"]

    triad = lp.assume(triad, "n>0")
    triad = lp.split_iname(
            triad, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
    triad = lp.split_iname(triad, "i_inner", 8, inner_tag="l.0")
    triad = lp.tag_instructions(triad, "!streaming_store")

    triad = lp.add_and_infer_dtypes(
            triad, dict.fromkeys(arg_names, stream_dtype))
    triad = lp.set_argument_order(triad, arg_names + ["n"])

    scheduled = lp.get_one_scheduled_kernel(lp.preprocess_kernel(triad))
    lp.generate_code_v2(scheduled).all_code()
Beispiel #18
0
def left_W(ctx):
  """Build the kernel for ``l[alpha,alpha1]`` as a product of two sums over ``u`` and ``v``.

  The kernel is created for ``ctx.devices[0]``; ``alpha1`` is split by 16
  and ``alpha`` by 3 onto group/local axes, and ``j`` and ``i`` are split
  by 16.  Returns the transformed kernel.
  """
  dtype = np.float32
  order = "C"

  domains = ["{[j,i,alpha,alpha1]: 0<=alpha,alpha1<r and 0<=j,i<n}"]
  instructions = [
    "l[alpha,alpha1]=sum((i), u[alpha,i]*u[alpha1,i])*sum((j),v[alpha,j]*v[alpha1,j])",
  ]
  kernel_args = [
    lp.GlobalArg("v", dtype, shape="r, n", order=order),
    lp.GlobalArg("u", dtype, shape="r, n", order=order),
    lp.GlobalArg("l", dtype, shape="r, r", order=order),
    lp.ValueArg("n", np.int64),
    lp.ValueArg("r", np.int64),
  ]

  gram_knl = lp.make_kernel(
    ctx.devices[0], domains, instructions, kernel_args,
    assumptions="n>=1")

  gram_knl = lp.split_iname(
    gram_knl, "alpha1", 16, outer_tag="g.0", inner_tag="l.0")
  gram_knl = lp.split_iname(
    gram_knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
  for reduction_iname in ("j", "i"):
    gram_knl = lp.split_iname(gram_knl, reduction_iname, 16)

  return gram_knl
Beispiel #19
0
def test_atomic(ctx_factory, dtype):
    """Check an atomic accumulation kernel against its untransformed version.

    The kernel adds ``2*a[i]`` into ``out[i%20]`` with an ``{atomic}``
    instruction.  It is skipped for 8-byte dtypes when the first device
    lacks "cl_khr_int64_base_atomics", and for ``np.int64`` when the
    PyOpenCL version is older than (2015, 2).
    """
    ctx = ctx_factory()

    if np.dtype(dtype).itemsize == 8:
        if "cl_khr_int64_base_atomics" not in ctx.devices[0].extensions:
            pytest.skip("64-bit atomics not supported on device")

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2015, 2):
        if dtype == np.int64:
            pytest.skip("int64 RNG not supported in PyOpenCL < 2015.2")

    kernel_args = [
            lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
            lp.GlobalArg("a", dtype, shape=lp.auto),
            "...",
            ]

    ref_knl = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            "out[i%20] = out[i%20] + 2*a[i] {atomic}",
            kernel_args,
            assumptions="n>0")

    par_knl = lp.split_iname(ref_knl, "i", 512)
    par_knl = lp.split_iname(
            par_knl, "i_inner", 128, outer_tag="unr", inner_tag="g.0")

    lp.auto_test_vs_ref(ref_knl, ctx, par_knl, parameters={"n": 10000})
Beispiel #20
0
def left_V(ctx):
  """Build the kernel for ``l[alpha,alpha1]`` as a product of two sums over ``u`` and ``w``.

  Uses float64 data.  The kernel is created for ``ctx.devices[0]``;
  ``alpha1`` is split by 16 and ``alpha`` by 3 onto group/local axes, and
  ``i`` and ``k`` are split by 16.  Returns the transformed kernel.
  """
  dtype = np.float64
  order = "C"

  domains = ["{[i,k,alpha,alpha1]: 0<=alpha,alpha1<r and 0<=i,k<n}"]
  instructions = [
    "l[alpha,alpha1]=sum((i), u[i,alpha]*u[i,alpha1])*sum((k),w[k,alpha]*w[k,alpha1])",
  ]
  kernel_args = [
    lp.GlobalArg("u", dtype, shape="n, r", order=order),
    lp.GlobalArg("w", dtype, shape="n, r", order=order),
    lp.GlobalArg("l", dtype, shape="r, r", order=order),
    lp.ValueArg("n", np.int64),
    lp.ValueArg("r", np.int64),
  ]

  gram_knl = lp.make_kernel(
    ctx.devices[0], domains, instructions, kernel_args,
    assumptions="n>=1")

  gram_knl = lp.split_iname(
    gram_knl, "alpha1", 16, outer_tag="g.0", inner_tag="l.0")
  gram_knl = lp.split_iname(
    gram_knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
  for reduction_iname in ("i", "k"):
    gram_knl = lp.split_iname(gram_knl, reduction_iname, 16)

  return gram_knl
Beispiel #21
0
def LU_solver(ctx):
  """Build a kernel that updates ``bcopy`` in place using the entries of ``LU``.

  Four dependent instructions (ids ``lab1``, ``l2``, ``l3``, ``l4``) operate
  on ``bcopy`` for each column ``l``.  ``k`` is split by 1, ``i`` and ``j``
  by 32, and ``l`` by 32 onto "g.0"/"l.0".  Returns the transformed kernel.
  """
  dtype = np.float32
  order = "C"

  domains = [
    "{[l,k,i,j,m]: 0<=l<r and 0<=k<n-1 and k+1<=i<n and 0<=j<n-1 and 0<=m<n-1-j}",
  ]
  instructions = [
    "bcopy[i,l] = bcopy[i,l]-bcopy[k,l]*LU[i,k] {id=lab1}",
    "bcopy[n-1-j,l]=bcopy[n-j-1,l]/LU[n-j-1,n-1-j] {id=l2, dep=lab1}",
    "bcopy[m,l]= bcopy[m,l]-bcopy[n-j-1,l]*LU[m,n-1-j] {id=l3, dep =l2}",
    "bcopy[0,l]=bcopy[0,l]/LU[0,0]{id=l4, dep=l2}",
  ]
  kernel_args = [
    lp.GlobalArg("LU", dtype, shape="n, n", order=order),
    lp.GlobalArg("bcopy", dtype, shape="n, r", order=order),
    lp.ValueArg("n", np.int64),
    lp.ValueArg("r", np.int64),
  ]

  solver_knl = lp.make_kernel(
    ctx.devices[0], domains, instructions, kernel_args,
    assumptions="n>=1")

  for iname, length in (("k", 1), ("i", 32), ("j", 32)):
    solver_knl = lp.split_iname(solver_knl, iname, length)
  solver_knl = lp.split_iname(
    solver_knl, "l", 32, outer_tag="g.0", inner_tag="l.0")

  return solver_knl
Beispiel #22
0
def LU_decomposition(ctx):
  """Build a kernel that updates ``syst`` in place with two dependent steps.

  The first instruction (id ``lab1``) divides ``syst[i,k]`` by
  ``syst[k,k]``; the second, which depends on it, subtracts
  ``syst[l,k]*syst[k,j]`` from ``syst[l,j]``.  The kernel is created for
  ``ctx.devices[0]`` and returned after splitting ``k``, ``i``, ``j`` and
  ``l``.
  """
  order='C'
  dtype = np.float32
  knl = lp.make_kernel(ctx.devices[0], 
  [
    "{[k,i]: 0<=k<n-1 and k+1<=i<n}",
    "{[j,l]: 0<=k<n-1 and k+1<=j,l<n}",
  ],
  [
  "syst[i,k] = syst[i,k]/syst[k,k] {id=lab1}",
  "syst[l,j]= syst[l,j] - syst[l,k]*syst[k,j] {dep=lab1}",
  ],
  [
  lp.GlobalArg("syst", dtype, shape = "n, n" , order=order),
  lp.ValueArg("n", np.int32),
  ],
  assumptions="n>=1")
  # NOTE(review): `n` is not defined inside this function, so this line
  # relies on a module-level `n` (not visible here) and raises NameError
  # otherwise -- confirm the intended split length.
  knl = lp.split_iname(knl, "k", n)
  knl = lp.split_iname(knl, "i", 32)
  knl = lp.split_iname(knl, "j", 32)
  knl = lp.split_iname(knl, "l", 32)

#  print knl
#  print lp.CompiledKernel(ctx, knl).get_highlighted_code()   
  return knl
Beispiel #23
0
def Prav_V(ctx):
  """Build the kernel ``f[alpha,j] = sum((k,i), a[i,j,k]*w[alpha,k]*u[alpha,i])``.

  The kernel is created for ``ctx.devices[0]``; ``j`` is split by 16 and
  ``alpha`` by 3 onto group/local axes, and ``i`` and ``k`` are split by
  16.  Returns the transformed kernel.
  """
  dtype = np.float32
  order = "C"

  domains = ["{[i,j,k,alpha]: 0<=alpha<r and 0<=i,j,k<n}"]
  instructions = ["f[alpha,j]=sum((k,i), a[i,j,k]*w[alpha, k]*u[alpha, i])"]

  kernel_args = [lp.GlobalArg("a", dtype, shape="n, n, n", order=order)]
  kernel_args.extend(
    lp.GlobalArg(name, dtype, shape="r, n", order=order)
    for name in ("u", "w", "f"))
  kernel_args.extend([
    lp.ValueArg("n", np.int64),
    lp.ValueArg("r", np.int64),
  ])

  rhs_knl = lp.make_kernel(
    ctx.devices[0], domains, instructions, kernel_args,
    assumptions="n>=1")

  rhs_knl = lp.split_iname(
    rhs_knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
  rhs_knl = lp.split_iname(
    rhs_knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
  for reduction_iname in ("i", "k"):
    rhs_knl = lp.split_iname(rhs_knl, reduction_iname, 16)

  return rhs_knl
Beispiel #24
0
 def variant_gpu(knl):
     """Split ``i`` by 4*256 onto "g.0", then the inner part by 256 ("unr" / "l.0").

     Returns the transformed kernel.
     """
     block_size = 256
     unroll = 4
     outer_split = lp.split_iname(
             knl, "i", unroll*block_size, outer_tag="g.0", slabs=(0, 1))
     return lp.split_iname(
             outer_split, "i_inner", block_size,
             outer_tag="unr", inner_tag="l.0")
Beispiel #25
0
def test_red2d(ctx_factory):
    """Check a transformed 2D "semlap2D" kernel against its plain version.

    The kernel computes ``lap[e,i,j]`` from ``u``, ``G`` and ``D`` with
    ``n = 16`` points per direction and a run-time number of elements
    ``K``.  Prefetches and precomputes are added, ``e``, ``j`` and ``i``
    are split, ``o`` and ``m`` are tagged "unr", and every generated
    schedule is checked and then compared with the untransformed kernel
    for ``K = 1000``.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 16
    # Element count used both for checking the schedules and for the run.
    K = 1000

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
           [
            "ue(a,b) := u[e,a,b]",
            "ur(a,b) := sum_float32(@o, D[a,o]*ue(o,b))",
            "us(a,b) := sum_float32(@o, D[b,o]*ue(a,o))",
            "lap[e,i,j]  = "
            "  sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j)+G[1,e,m,j]*us(m,j)))"
            "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m)+G[2,e,i,m]*us(i,m)))"
            ],
            [
            lp.ArrayArg("u", dtype, shape=field_shape, order=order),
            lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
            lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
            lp.ArrayArg("D", dtype, shape=(n, n), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
            ],
             name="semlap2D", assumptions="K>=1")

    seq_knl = knl
    knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "o"])
    knl = lp.add_prefetch(knl, "u", ["i", "j", "o"])
    knl = lp.precompute(knl, "ue", np.float32, ["a", "b", "m"])
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"])
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"])
    knl = lp.split_iname(knl, "e", 2, outer_tag="g.0")
    knl = lp.split_iname(knl, "j", n, inner_tag="l.0")
    knl = lp.split_iname(knl, "i", n, inner_tag="l.1")

    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))

    knl = lp.add_prefetch(knl, "G", [2, 3])  # axis/argument indices on G

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=K))

    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*((n**3)*2*2 + n*n*2*3 + (n**3)*2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
Beispiel #26
0
    def variant_3(knl):
        """Split ``i``/``j`` by 16 onto group/local axes, then prefetch ``a`` and ``b``.

        ``a`` is prefetched over ``i_inner`` and ``b`` over ``j_inner``.
        Returns the transformed kernel.
        """
        tiled = lp.split_iname(
                knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
        tiled = lp.split_iname(
                tiled, "j", 16, outer_tag="g.1", inner_tag="l.1")

        for array_name, sweep_iname in (("a", "i_inner"), ("b", "j_inner")):
            tiled = lp.add_prefetch(tiled, array_name, [sweep_iname])
        return tiled
Beispiel #27
0
 def variant_overfetch(knl):
     """Split ``i``/``j`` by 16 with ``slabs=(1, 1)`` and prefetch ``a`` by bounding box.

     The two fetch loops ``a_dim_0_outer`` and ``a_dim_1_outer`` are
     prioritized afterwards.  Returns the transformed kernel.
     """
     tiled = lp.split_iname(
             knl, "i", 16, outer_tag="g.1", inner_tag="l.1", slabs=(1, 1))
     tiled = lp.split_iname(
             tiled, "j", 16, outer_tag="g.0", inner_tag="l.0", slabs=(1, 1))
     with_fetch = lp.add_prefetch(
             tiled, "a", ["i_inner", "j_inner"],
             fetch_bounding_box=True, default_tag="l.auto")
     return lp.prioritize_loops(
             with_fetch, ["a_dim_0_outer", "a_dim_1_outer"])
Beispiel #28
0
    def variant_2(knl):
        """Split ``i``/``j`` by 16, set the loop priority, then prefetch ``a`` and ``b``.

        Returns the transformed kernel.
        """
        knl = lp.split_iname(knl, "i", 16,
                outer_tag="g.0", inner_tag="l.0")
        knl = lp.split_iname(knl, "j", 16,
                outer_tag="g.1", inner_tag="l.1")

        # set_loop_priority is a module-level transformation taking the
        # kernel as its first argument; calling it as a method on the kernel
        # passed the kernel twice.
        knl = lp.set_loop_priority(knl, ["i", "j"])
        knl = lp.add_prefetch(knl, "a")
        knl = lp.add_prefetch(knl, "b")
        return knl
Beispiel #29
0
def test_all_counters_parallel_matmul():
    """Check synchronization, operation and global-memory counts of a split matmul.

    The kernel ``c[i, j] = sum(k, a[i, k]*b[k, j])`` has ``i`` and ``j``
    split by 16 onto group/local axes; the counting polynomials are
    evaluated at n=512, m=256, l=128 and compared with closed-form values.
    """
    matmul = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
            name="matmul", assumptions="n,m,l >= 1")
    matmul = lp.add_and_infer_dtypes(
            matmul, {"a": np.float32, "b": np.float32})
    matmul = lp.split_iname(
            matmul, "i", 16, outer_tag="g.0", inner_tag="l.1")
    matmul = lp.split_iname(
            matmul, "j", 16, outer_tag="g.1", inner_tag="l.0")

    n, m, l_size = 512, 256, 128
    params = {"n": n, "m": m, "l": l_size}

    def evaluate(poly_map, *key):
        # Look up the polynomial for ``key`` and evaluate it at ``params``.
        return poly_map[key].eval_with_dict(params)

    sync_poly = lp.get_synchronization_poly(matmul)
    assert len(sync_poly) == 1
    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1

    op_map = lp.get_op_poly(matmul)
    f32mul = evaluate(op_map, np.dtype(np.float32), "mul")
    f32add = evaluate(op_map, np.dtype(np.float32), "add")
    i32ops = evaluate(op_map, np.dtype(np.int32), "add")
    i32ops += evaluate(op_map, np.dtype(np.int32), "mul")

    assert f32mul+f32add == n*m*l_size*2
    assert i32ops == n*m*l_size*4 + l_size*n*4

    subscript_map = lp.get_gmem_access_poly(matmul)
    f32uncoal = evaluate(
            subscript_map, np.dtype(np.float32), "nonconsecutive", "load")
    f32coal = evaluate(
            subscript_map, np.dtype(np.float32), "consecutive", "load")

    assert f32uncoal == n*m*l_size
    assert f32coal == n*m*l_size

    f32coal = evaluate(
            subscript_map, np.dtype(np.float32), "consecutive", "store")

    assert f32coal == n*l_size
Beispiel #30
0
 def variant_gpu(knl):
     """Expand substitutions, split ``i``/``j`` by 256 and prefetch ``x`` twice.

     The first prefetch (``x[j,k]``) gets the fetch inames ``x_fetch_j``
     and ``x_fetch_k``, which are then tagged "l.0" and "unr"; the second
     prefetches ``x[i,k]`` over ``k``.  Finally ``j_outer`` and ``j_inner``
     are prioritized.  Returns the transformed kernel.
     """
     result = lp.expand_subst(knl)
     result = lp.split_iname(
             result, "i", 256, outer_tag="g.0", inner_tag="l.0")
     result = lp.split_iname(result, "j", 256)
     result = lp.add_prefetch(
             result, "x[j,k]", ["j_inner", "k"],
             ["x_fetch_j", "x_fetch_k"], default_tag=None)
     result = lp.tag_inames(
             result, {"x_fetch_k": "unr", "x_fetch_j": "l.0"})
     result = lp.add_prefetch(result, "x[i,k]", ["k"], default_tag=None)
     return lp.prioritize_loops(result, ["j_outer", "j_inner"])
Beispiel #31
0
 def variant_more_per_work_group(knl):
     """Tag ``n`` as "l.0" and split ``k`` by 3 onto "g.0"/"l.1".

     Returns the transformed kernel.
     """
     tagged = lp.tag_inames(knl, {"n": "l.0"})
     return lp.split_iname(
             tagged, "k", 3, outer_tag="g.0", inner_tag="l.1")
Beispiel #32
0
def test_all_counters_parallel_matmul():
    """Check synchronization, operation and memory-access counts of a tiled matmul.

    The kernel ``c[i, j] = sum(k, a[i, k]*b[k, j])`` is split by 16 along
    ``i``, ``j`` and ``k`` with prefetches of ``a`` and ``b``; the counting
    maps are evaluated at n=512, m=256, ell=128 and compared with
    closed-form values.
    """
    bsize = 16

    matmul = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
            name="matmul",
            assumptions="n,m,ell >= 1")
    matmul = lp.add_and_infer_dtypes(
            matmul, {"a": np.float32, "b": np.float32})
    matmul = lp.split_iname(
            matmul, "i", bsize, outer_tag="g.0", inner_tag="l.1")
    matmul = lp.split_iname(
            matmul, "j", bsize, outer_tag="g.1", inner_tag="l.0")
    matmul = lp.split_iname(matmul, "k", bsize)
    matmul = lp.add_prefetch(matmul, "a", ["k_inner", "i_inner"])
    matmul = lp.add_prefetch(matmul, "b", ["j_inner", "k_inner"])

    n, m, ell = 512, 256, 128
    params = {"n": n, "m": m, "ell": ell}

    sync_map = lp.get_synchronization_map(matmul)
    assert len(sync_map) == 2
    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
    assert sync_map["barrier_local"].eval_with_dict(params) == 2 * m / bsize

    op_map = lp.get_op_map(matmul, count_redundant_work=True)
    f32mul = op_map[lp.Op(np.float32, "mul")].eval_with_dict(params)
    f32add = op_map[lp.Op(np.float32, "add")].eval_with_dict(params)
    # The integer counts are evaluated but not asserted on.
    i32ops = op_map[lp.Op(np.int32, "add")].eval_with_dict(params)
    i32ops += op_map[lp.Op(np.dtype(np.int32), "mul")].eval_with_dict(params)

    assert f32mul + f32add == n * m * ell * 2

    mem_map = lp.get_mem_access_map(matmul, count_redundant_work=True)

    def global_f32_stride1(direction, variable):
        # Count of stride-1 global float32 accesses for one variable.
        key = lp.MemAccess(
                "global", np.float32,
                stride=1, direction=direction, variable=variable)
        return mem_map[key].eval_with_dict(params)

    f32s1lb = global_f32_stride1("load", "b")
    f32s1la = global_f32_stride1("load", "a")

    assert f32s1lb == n * m * ell / bsize
    assert f32s1la == n * m * ell / bsize

    f32coal = global_f32_stride1("store", "c")

    assert f32coal == n * ell

    local_mem_map = lp.get_mem_access_map(
            matmul, count_redundant_work=True).filter_by(mtype=["local"])
    local_load_key = lp.MemAccess(
            "local", np.dtype(np.float32), direction="load")
    local_mem_l = local_mem_map[local_load_key].eval_with_dict(params)
    assert local_mem_l == n * m * ell * 2
Beispiel #33
0
def test_advect_dealias(ctx_factory):
    """Unfinished test for a dealiased SEM advection kernel (``sem_advect``).

    The very first statement divides by zero on purpose, so nothing below it
    ever runs; the remainder is a work-in-progress sketch that is kept for
    reference only.

    :arg ctx_factory: callable returning the context whose first device is
        handed to ``lp.make_kernel``.
    :raises ZeroDivisionError: always, from the "not ready" guard.
    """
    1 / 0  # not ready: deliberate hard stop, everything below is unreachable

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    # N and M size the two index ranges of the domain string and the shapes of
    # the D, I and V operator matrices declared below.
    N = 8
    M = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (N, N, N, K_sym)
    # NOTE(review): interim_field_shape is never used in this function.
    interim_field_shape = (M, M, M, K_sym)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal
    # 3. transform back (direction-by-direction)
    # NOTE(review): the three steps above do not obviously match the
    # instructions below -- possibly copied from another test; confirm.

    # K - run-time symbolic
    # NOTE(review): `"..." % M % N[...]` below looks like a missing tuple and
    # comma (probably meant `% (M, N), [`): as written it formats a two-%d
    # string with one value and then subscripts the int N. The domain string
    # also mixes "AND" with a missing conjunction before "0<=e<K". All of this
    # is unreachable because of the guard at the top -- fix before enabling.
    knl = lp.make_kernel(
        ctx.devices[0],
        "[K] -> {[i,ip,j,jp,k,kp,m,e]: 0<=i,j,k,m<%d AND 0<=o,ip,jp,kp<%d 0<=e<K}"
        % M % N[

            # interpolate u to integration nodes
            "CSE:  u0[i,jp,kp,e] = sum_float32(@o, I[i,o]*u[o,jp,kp,e])",
            "CSE:  u1[i,j,kp,e]  = sum_float32(@o, I[j,o]*u0[i,o,kp,e])",
            "CSE:  Iu[i,j,k,e]   = sum_float32(@o, I[k,o]*u1[i,j,o,e])",

            # differentiate u on integration nodes
            "CSE:  Iur[i,j,k,e]  = sum_float32(@m, D[i,m]*Iu[m,j,k,e])",
            "CSE:  Ius[i,j,k,e]  = sum_float32(@m, D[j,m]*Iu[i,m,k,e])",
            "CSE:  Iut[i,j,k,e]  = sum_float32(@m, D[k,m]*Iu[i,j,m,e])",

            # interpolate v to integration nodes
            "CSE:  v0[i,jp,kp,e] = sum_float32(@o, I[i,o]*v[o,jp,kp,e])",
            "CSE:  v1[i,j,kp,e]  = sum_float32(@o, I[j,o]*v0[i,o,kp,e])",
            "CSE:  Iv[i,j,k,e]   = sum_float32(@o, I[k,o]*v1[i,j,o,e])",

            # differentiate v on integration nodes
            "CSE:  Ivr[i,j,k,e]  = sum_float32(@m, D[i,m]*Iv[m,j,k,e])",
            "CSE:  Ivs[i,j,k,e]  = sum_float32(@m, D[j,m]*Iv[i,m,k,e])",
            "CSE:  Ivt[i,j,k,e]  = sum_float32(@m, D[k,m]*Iv[i,j,m,e])",

            # interpolate w to integration nodes
            "CSE:  w0[i,jp,kp,e] = sum_float32(@o, I[i,o]*w[o,jp,kp,e])",
            "CSE:  w1[i,j,kp,e]  = sum_float32(@o, I[j,o]*w0[i,o,kp,e])",
            "CSE:  Iw[i,j,k,e]   = sum_float32(@o, I[k,o]*w1[i,j,o,e])",

            # differentiate w on integration nodes
            "CSE:  Iwr[i,j,k,e]  = sum_float32(@m, D[i,m]*Iw[m,j,k,e])",
            "CSE:  Iws[i,j,k,e]  = sum_float32(@m, D[j,m]*Iw[i,m,k,e])",
            "CSE:  Iwt[i,j,k,e]  = sum_float32(@m, D[k,m]*Iw[i,j,m,e])",

            # find velocity in (r,s,t) coordinates
            # QUESTION: should I use CSE here ?
            # NOTE(review): G is referenced here but not declared in the
            # argument list below.
            "CSE: Vr[i,j,k,e] = G[i,j,k,0,e]*Iu[i,j,k,e] + G[i,j,k,1,e]*Iv[i,j,k,e] + G[i,j,k,2,e]*Iw[i,j,k,e]",
            "CSE: Vs[i,j,k,e] = G[i,j,k,3,e]*Iu[i,j,k,e] + G[i,j,k,4,e]*Iv[i,j,k,e] + G[i,j,k,5,e]*Iw[i,j,k,e]",
            "CSE: Vt[i,j,k,e] = G[i,j,k,6,e]*Iu[i,j,k,e] + G[i,j,k,7,e]*Iv[i,j,k,e] + G[i,j,k,8,e]*Iw[i,j,k,e]",

            # form nonlinear term on integration nodes
            # QUESTION: should I use CSE here ?
            # NOTE(review): the "<SE:" prefix differs from the "CSE:" prefix
            # used everywhere else -- possibly a typo; confirm.
            "<SE: Nu[i,j,k,e] = Vr[i,j,k,e]*Iur[i,j,k,e]+Vs[i,j,k,e]*Ius[i,j,k,e]+Vt[i,j,k,e]*Iut[i,j,k,e]",
            "<SE: Nv[i,j,k,e] = Vr[i,j,k,e]*Ivr[i,j,k,e]+Vs[i,j,k,e]*Ivs[i,j,k,e]+Vt[i,j,k,e]*Ivt[i,j,k,e]",
            "<SE: Nw[i,j,k,e] = Vr[i,j,k,e]*Iwr[i,j,k,e]+Vs[i,j,k,e]*Iws[i,j,k,e]+Vt[i,j,k,e]*Iwt[i,j,k,e]",

            # L2 project Nu back to Lagrange basis
            "CSE: Nu2[ip,j,k,e]   = sum_float32(@m, V[ip,m]*Nu[m,j,k,e])",
            "CSE: Nu1[ip,jp,k,e]  = sum_float32(@m, V[jp,m]*Nu2[ip,m,k,e])",
            "INu[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nu1[ip,jp,m,e])",

            # L2 project Nv back to Lagrange basis
            "CSE: Nv2[ip,j,k,e]   = sum_float32(@m, V[ip,m]*Nv[m,j,k,e])",
            "CSE: Nv1[ip,jp,k,e]  = sum_float32(@m, V[jp,m]*Nv2[ip,m,k,e])",
            "INv[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nv1[ip,jp,m,e])",

            # L2 project Nw back to Lagrange basis
            "CSE: Nw2[ip,j,k,e]   = sum_float32(@m, V[ip,m]*Nw[m,j,k,e])",
            "CSE: Nw1[ip,jp,k,e]  = sum_float32(@m, V[jp,m]*Nw2[ip,m,k,e])",
            "INw[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nw1[ip,jp,m,e])", ],
        [
            # Field arrays use field_shape, i.e. (N, N, N, K).
            lp.ArrayArg("u", dtype, shape=field_shape, order=order),
            lp.ArrayArg("v", dtype, shape=field_shape, order=order),
            lp.ArrayArg("w", dtype, shape=field_shape, order=order),
            lp.ArrayArg("INu", dtype, shape=field_shape, order=order),
            lp.ArrayArg("INv", dtype, shape=field_shape, order=order),
            lp.ArrayArg("INw", dtype, shape=field_shape, order=order),
            # Small operator matrices.
            lp.ArrayArg("D", dtype, shape=(M, M), order=order),
            lp.ArrayArg("I", dtype, shape=(M, N), order=order),
            lp.ArrayArg("V", dtype, shape=(N, M), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
        ],
        name="sem_advect",
        assumptions="K>=1")

    print(knl)
    1 / 0  # second deliberate hard stop right after printing the kernel

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    print(knl)
    #1/0

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)

    K = 1000
    # NOTE(review): seq_knl is never assigned in this function, so this call
    # would raise NameError if it were ever reached.
    lp.auto_test_vs_ref(
        seq_knl,
        ctx,
        kernel_gen,
        op_count=0,
        op_label="GFlops",
        parameters={"K": K},
        print_seq_code=True,
    )
Beispiel #34
0
import numpy as np
import loopy as lp
import pyopencl as cl
import pyopencl.array

# Set up an OpenCL context and a command queue bound to it.
ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Input data: n = 15 million float32 values produced by cl.array.arange.
n = 15 * 10**6
a = cl.array.arange(queue, n, dtype=np.float32)

# Elementwise kernel over 0 <= i < n: out[i] = 2*a[i].
knl = lp.make_kernel("{ [i]: 0<=i<n }", "out[i] = 2*a[i]")

# Enable the write_code option (presumably makes loopy emit the generated
# code -- confirm against the loopy options documentation).
knl = lp.set_options(knl, write_code=True)
# Split i into chunks of 4 and tag the inner iname "vec".
knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
# Split axis 0 of both a and out with count=4, then tag the resulting two
# axes as "C,vec".
knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
knl = lp.tag_array_axes(knl, "a,out", "C,vec")

# Run the kernel; a is passed reshaped to rows of 4 entries, matching the
# array-axis split applied above.
knl(queue, a=a.reshape(-1, 4), n=n)
Beispiel #35
0
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
    """Transform the fused GNUMA horizontal strong-volume kernels and test them.

    Parses ``strongVolumeKernels.f90`` from this file's directory, fuses the
    two kernels whose names contain ``KernelR`` / ``KernelS``, applies a fixed
    sequence of loopy transformations, and compares the selected result
    against the untransformed fused kernel with ``lp.auto_test_vs_ref``.

    :arg ctx_factory: callable returning the context passed to
        ``lp.auto_test_vs_ref``.
    :arg ilp_multiple: when ``> 1``, ``k`` is split by 2 with an ``ilp``-tagged
        inner iname, and the ILP inames take part in the precompute, prefetch
        and buffering steps.
    :arg Nq: value the kernel parameter ``Nq`` is fixed to.
    :arg opt_level: selects which intermediate kernel is actually tested:
        ``0`` .. ``7`` pick the snapshot stored in ``tap_hsv`` after the
        corresponding stage, anything ``>= 8`` picks the fully transformed
        kernel.
    """
    ctx = ctx_factory()

    # The Fortran source is expected to sit next to this test file.
    filename = os.path.join(os.path.dirname(__file__),
                            "strongVolumeKernels.f90")
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    # Substitute the "datafloat" placeholder with a concrete Fortran type.
    source = source.replace("datafloat", "real*4")

    # Keep only the R and S kernels; the unpacking requires exactly two
    # matches.
    # NOTE(review): assumes the KernelR kernel is yielded before the KernelS
    # kernel -- confirm against parse_fortran's ordering.
    hsv_r, hsv_s = [
        knl
        for knl in lp.parse_fortran(source, filename, seq_dependencies=False)
        if "KernelR" in knl.name or "KernelS" in knl.name
    ]
    # Tag each kernel's instructions so they stay identifiable after fusion.
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s
    hsv = lp.add_nosync(hsv, "any", "writes:rhsQ", "writes:rhsQ", force=True)

    from gnuma_loopy_transforms import (fix_euler_parameters,
                                        set_q_storage_format,
                                        set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.prioritize_loops(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    # Reference kernel: everything above, none of the optimizations below.
    ref_hsv = hsv

    # tap_hsv snapshots the kernel at the stage selected by opt_level; the
    # snapshot (not the final hsv) is what gets tested at the end.
    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]", default_tag="l.auto")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        # Each local_prep instruction must have exactly one assignee.
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    # NOTE(review): k is split by a fixed factor of 2 whenever
    # ilp_multiple > 1; the actual value of ilp_multiple is not used.
    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner", )
        flux_ilp_inames = ("kk", )
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    # Names of the flux temporaries created per kernel; aliased further down.
    rtmps = []
    stmps = []

    flux_store_idx = 0

    # Process the R and S flux instructions pairwise; for each one, turn the
    # flux assignment into a substitution rule and precompute it into its own
    # temporary.
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
            ("rknl", rflux_insn, (
                "j",
                "n",
            ), rtmps, (
                "jj",
                "ii",
            )),
            ("sknl", sflux_insn, (
                "i",
                "n",
            ), stmps, (
                "ii",
                "jj",
            )),
        ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            # Exactly one instruction of this kernel is expected to read the
            # flux variable.
            reader, = lp.find_instructions(
                hsv,
                "tag:{knl_tag} and reads:{flux_var}".format(knl_tag=knl_tag,
                                                            flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv,
                                flux_var + "_subst",
                                flux_inames + ilp_inames,
                                temporary_name=flux_store_name,
                                precompute_inames=flux_precomp_inames +
                                flux_ilp_inames,
                                default_tag=None)
            # The axis order of the temporary depends on whether the flux
            # belongs to the S or the R kernel.
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            # Give the reader's "n" iname a name derived from the flux
            # variable (kernel suffix and a trailing "_0" stripped).
            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv,
                                  "n",
                                  n_iname,
                                  within="id:" + reader.id,
                                  existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    # Precompute the remaining local_prep substitution rules, skipping the
    # Jinv* ones and anything containing "_s".
    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
                            lp.find_one_rule_matching(
                                hsv, prep_var_name + "_*subst*"),
                            default_tag="l.auto")

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv,
                          "Q[ii,jj,k,:,:,e]",
                          sweep_inames=ilp_inames,
                          default_tag="l.auto")

    if opt_level == 4:
        # Only the snapshot gets the extra "unr" tags; hsv itself continues
        # untagged.
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv, dict(Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv,
                          "rhsQ",
                          ilp_inames,
                          fetch_bounding_box=True,
                          default_tag="for",
                          init_expression="0",
                          store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    # ignore_nonexistent=True: not all of these inames need to exist.
    hsv = lp.tag_inames(hsv, {
        "Q_dim_k": "unr",
        "rhsQ_init_k": "unr",
        "rhsQ_store_k": "unr"
    },
                        ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # Final tagging: inner field inames become "vec", outer ones "unr".
    hsv = lp.tag_inames(
        hsv,
        dict(rhsQ_init_field_inner="vec",
             rhsQ_store_field_inner="vec",
             rhsQ_init_field_outer="unr",
             rhsQ_store_field_outer="unr",
             Q_dim_field_inner="vec",
             Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(
        hsv, "rhsQ_buf", vary_by_axes=(0, ) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # From here on, work with the snapshot chosen by opt_level.
    # NOTE(review): tap_hsv is unbound if opt_level is below 0 -- presumably
    # callers only pass non-negative levels; confirm.
    hsv = tap_hsv

    # Always-on diagnostic output of operation and memory access statistics.
    if 1:
        print("OPS")
        op_map = lp.get_op_map(hsv)
        print(lp.stringify_stats_mapping(op_map))

        print("MEM")
        gmem_map = lp.get_mem_access_map(hsv, subgroup_size=32).to_bytes()
        print(lp.stringify_stats_mapping(gmem_map))

    hsv = lp.set_options(hsv,
                         cl_build_options=[
                             "-cl-denorms-are-zero",
                             "-cl-fast-relaxed-math",
                             "-cl-finite-math-only",
                             "-cl-mad-enable",
                             "-cl-no-signed-zeros",
                         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    # Compare the selected kernel against the reference on 300 elements.
    results = lp.auto_test_vs_ref(ref_hsv,
                                  ctx,
                                  hsv,
                                  parameters=dict(elements=300),
                                  quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
Beispiel #36
0
 def variant_1(knl):
     """Return *knl* with ``i`` and ``j`` each split by 256.

     The ``i`` split is tagged ``g.0`` (outer) / ``l.0`` (inner); the ``j``
     split is left untagged. Both splits use ``slabs=(0, 1)``.
     """
     i_split = lp.split_iname(
         knl, "i", 256,
         outer_tag="g.0", inner_tag="l.0", slabs=(0, 1))
     return lp.split_iname(i_split, "j", 256, slabs=(0, 1))
Beispiel #37
0
def test_red2d(ctx_factory):
    """Test a prefetch/precompute variant of the 2D ``semlap2D`` kernel.

    Builds the kernel for ``n = 16`` with a run-time symbolic element count
    ``K``, applies prefetching, precomputation, iname splitting and tagging,
    and compares the generated schedules against the untransformed kernel
    for ``K = 1000``.

    :arg ctx_factory: callable returning the context to run on.
    """
    f32 = np.float32
    ctx = ctx_factory()
    layout = "C"

    n = 16

    from pymbolic import var

    # K - run-time symbolic
    elem_shape = (var("K"), n, n)

    device = ctx.devices[0]
    domain = (
        "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n)
    instructions = [
        "ue(a,b) := u[e,a,b]",
        "ur(a,b) := sum_float32(@o, D[a,o]*ue(o,b))",
        "us(a,b) := sum_float32(@o, D[b,o]*ue(a,o))",
        "lap[e,i,j]  = "
        "  sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j)+G[1,e,m,j]*us(m,j)))"
        "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m)+G[2,e,i,m]*us(i,m)))",
    ]
    kernel_args = [
        lp.ArrayArg("u", f32, shape=elem_shape, order=layout),
        lp.ArrayArg("lap", f32, shape=elem_shape, order=layout),
        lp.ArrayArg("G", f32, shape=(3, ) + elem_shape, order=layout),
        lp.ArrayArg("D", f32, shape=(n, n), order=layout),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]
    knl = lp.make_kernel(
        device, domain, instructions, kernel_args,
        name="semlap2D",
        assumptions="K>=1")

    unroll = 32  # not referenced below; retained from the original test

    # Untransformed kernel serves as the reference.
    seq_knl = knl

    for array, sweep in (("D", ("m", "j", "i", "o")), ("u", ("i", "j", "o"))):
        knl = lp.add_prefetch(knl, array, list(sweep), default_tag="l.auto")

    for rule, sweep in (
            ("ue", ("a", "b", "m")),
            ("ur", ("a", "b")),
            ("us", ("a", "b"))):
        knl = lp.precompute(
            knl, rule, np.float32, list(sweep), default_tag="l.auto")

    knl = lp.split_iname(knl, "e", 2, outer_tag="g.0")
    for iname, local_tag in (("j", "l.0"), ("i", "l.1")):
        knl = lp.split_iname(knl, iname, n, inner_tag=local_tag)

    for iname in ("o", "m"):
        knl = lp.tag_inames(knl, {iname: "unr"})

    # [2, 3] are axis/argument indices on G
    knl = lp.add_prefetch(knl, "G", [2, 3], default_tag="l.auto")

    schedules = lp.check_kernels(
        lp.generate_loop_schedules(knl), dict(K=1000))

    K = 1000
    gflops = K * ((n**3) * 2 * 2 + n * n * 2 * 3 + (n**3) * 2 * 2) / 1e9
    lp.auto_test_vs_ref(
        seq_knl, ctx, schedules,
        op_count=gflops,
        op_label="GFlops",
        parameters={"K": K})
Beispiel #38
0
 def variant_1(knl):
     """Return *knl* tiled 16x16 with a prefetch of ``a`` over the tile.

     ``i`` maps to ``g.1``/``l.1`` and ``j`` to ``g.0``/``l.0``; the prefetch
     loops ``a_dim_0_outer`` and ``a_dim_1_outer`` are prioritized in that
     order.
     """
     for iname, group_tag, local_tag in (
             ("i", "g.1", "l.1"),
             ("j", "g.0", "l.0")):
         knl = lp.split_iname(
             knl, iname, 16, outer_tag=group_tag, inner_tag=local_tag)

     with_prefetch = lp.add_prefetch(
         knl, "a", ["i_inner", "j_inner"], default_tag="l.auto")
     return lp.prioritize_loops(
         with_prefetch, ["a_dim_0_outer", "a_dim_1_outer"])
Beispiel #39
0
def test_rob_stroud_bernstein_full(ctx_factory):
    """Build the full Bernstein evaluation kernel and print its code.

    The kernel is created, specialized to ``nqp1d=7`` and ``deg=4``, sent
    through a pickle round trip, compiled, and its highlighted code printed.

    NOTE: ``result`` would have to be zero-filled beforehand.

    :arg ctx_factory: callable returning the context to compile for.
    """
    ctx = ctx_factory()

    domain = "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \
                0 <= el < nels and \
                0 <= i2 < nqp1d and \
                0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 and\
                \
                0 <= i1_2 < nqp1d and \
                0 <= alpha1_2 <= deg and \
                0 <= i2_2 < nqp1d \
                }"
    instructions = """
        for el
            for i2
                <> xi = qpts[1, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init}

                for alpha1
                    <> w = s**(deg-alpha1) {id=init_w}

                    <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \
                            {id=write_tmp,dep=init_w:aind_init}
                    for alpha2
                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                            {id=update_w,dep=init_w:write_tmp}
                        aind = aind + 1 \
                            {id=aind_incr,dep=aind_init:write_tmp:update_w}
                    end
                end
            end

            for i1_2
                <> xi2 = qpts[0, i1_2] {dep=aind_incr}
                <> s2 = 1-xi2
                <> r2 = xi2/s2
                <> w2 = s2**deg  {id=w2_init}

                for alpha1_2
                    for i2_2
                        result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \
                                w2 * tmp[alpha1_2, i2_2]  {id=res2,dep=w2_init}
                    end

                    w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2)  \
                            {id=w2_update, dep=res2}
                end
            end
        end
        """
    kernel_args = [
        # coeffs is declared with "no" shape so that loopy does not try to
        # work the shape out automatically.
        lp.GlobalArg("coeffs", None, shape=None),
        "...",
    ]

    knl = lp.make_kernel(
        domain, instructions, kernel_args,
        assumptions="deg>=0 and nels>=1")

    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)

    if 0:
        # Disabled parallelization experiment, kept for reference.
        knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
        knl = lp.split_iname(
            knl, "el_outer", 2,
            outer_tag="g.0", inner_tag="ilp", slabs=(0, 1))
        knl = lp.tag_inames(
            knl, {"i2": "l.1", "alpha1": "unr", "alpha2": "unr"})

    # Round-trip through pickle; the kernel has to survive it.
    import pickle
    knl = pickle.loads(pickle.dumps(knl))

    arg_dtypes = {
        "qpts": np.float32,
        "tmp": np.float32,
        "coeffs": np.float32,
        "result": np.float32,
    }
    knl = lp.CompiledKernel(ctx, knl).get_highlighted_code(arg_dtypes)
    print(knl)
Beispiel #40
0
 def variant_1(knl):
     """Return *knl* with ``im_x`` split by 16 and a fixed loop priority.

     The inner ``im_x`` iname is tagged ``l.0``.
     """
     split = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
     return lp.prioritize_loops(
         split, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
Beispiel #41
0
def test_lbm(ctx_factory):
    """Test a tiled, prefetching variant of a lattice Boltzmann kernel.

    D2Q4Q4Q4 lattice Boltzmann scheme for the shallow water equations
    (example by Loic Gouarin <*****@*****.**>). The kernel is tiled 16x16,
    ``f`` is prefetched with a bounding-box fetch, and the result is compared
    against the untransformed kernel for ``nx = ny = 20``.

    :arg ctx_factory: callable returning the context to run on.
    """
    ctx = ctx_factory()

    domain = "{[ii,jj]:0<=ii<nx-2 and 0<=jj<ny-2}"
    instructions = """  # noqa (silences flake8 line length warning)
        i := ii + 1
        j := jj + 1
        for ii, jj
            with {id_prefix=init_m}
                <> m[0] =   +    f[i-1, j, 0] +    f[i, j-1, 1] + f[i+1, j, 2] +  f[i, j+1, 3]
                m[1] =   + 4.*f[i-1, j, 0] - 4.*f[i+1, j, 2]
                m[2] =   + 4.*f[i, j-1, 1] - 4.*f[i, j+1, 3]
                m[3] =   +    f[i-1, j, 0] -    f[i, j-1, 1] + f[i+1, j, 2] -  f[i, j+1, 3]
                m[4] =   +    f[i-1, j, 4] +    f[i, j-1, 5] + f[i+1, j, 6] +  f[i, j+1, 7]
                m[5] =   + 4.*f[i-1, j, 4] - 4.*f[i+1, j, 6]
                m[6] =   + 4.*f[i, j-1, 5] - 4.*f[i, j+1, 7]
                m[7] =   +    f[i-1, j, 4] -    f[i, j-1, 5] + f[i+1, j, 6] -  f[i, j+1, 7]
                m[8] =   +    f[i-1, j, 8] +    f[i, j-1, 9] + f[i+1, j, 10] + f[i, j+1, 11]
                m[9] =   + 4.*f[i-1, j, 8] - 4.*f[i+1, j, 10]
                m[10] =  + 4.*f[i, j-1, 9] - 4.*f[i, j+1, 11]
                m[11] =  +    f[i-1, j, 8] -    f[i, j-1, 9] + f[i+1, j, 10] - f[i, j+1, 11]
            end

            with {id_prefix=update_m,dep=init_m*}
                m[1] = m[1] + 2.*(m[4] - m[1])
                m[2] = m[2] + 2.*(m[8] - m[2])
                m[3] = m[3]*(1. - 1.5)
                m[5] = m[5] + 1.5*(0.5*(m[0]*m[0]) + (m[4]*m[4])/m[0] - m[5])
                m[6] = m[6] + 1.5*(m[4]*m[8]/m[0] - m[6])
                m[7] = m[7]*(1. - 1.2000000000000000)
                m[9] = m[9] + 1.5*(m[4]*m[8]/m[0] - m[9])
                m[10] = m[10] + 1.5*(0.5*(m[0]*m[0]) + (m[8]*m[8])/m[0] - m[10])
                m[11] = m[11]*(1. - 1.2)
            end

            with {dep=update_m*}
                f_new[i, j, 0] =  + 0.25*m[0] + 0.125*m[1] + 0.25*m[3]
                f_new[i, j, 1] =  + 0.25*m[0] + 0.125*m[2] - 0.25*m[3]
                f_new[i, j, 2] =  + 0.25*m[0] - 0.125*m[1] + 0.25*m[3]
                f_new[i, j, 3] =  + 0.25*m[0] - 0.125*m[2] - 0.25*m[3]
                f_new[i, j, 4] =  + 0.25*m[4] + 0.125*m[5] + 0.25*m[7]
                f_new[i, j, 5] =  + 0.25*m[4] + 0.125*m[6] - 0.25*m[7]
                f_new[i, j, 6] =  + 0.25*m[4] - 0.125*m[5] + 0.25*m[7]
                f_new[i, j, 7] =  + 0.25*m[4] - 0.125*m[6] - 0.25*m[7]
                f_new[i, j, 8] =  + 0.25*m[8] + 0.125*m[9] + 0.25*m[11]
                f_new[i, j, 9] =  + 0.25*m[8] + 0.125*m[10] - 0.25*m[11]
                f_new[i, j, 10] =  + 0.25*m[8] - 0.125*m[9] + 0.25*m[11]
                f_new[i, j, 11] =  + 0.25*m[8] - 0.125*m[10] - 0.25*m[11]
           end
        end
        """

    untyped = lp.make_kernel(domain, instructions)
    reference = lp.add_and_infer_dtypes(untyped, {"f": np.float32})

    # Tile both loops 16 wide onto the group/local axes.
    tiled = reference
    for iname, group_tag, local_tag in (
            ("ii", "g.1", "l.1"),
            ("jj", "g.0", "l.0")):
        tiled = lp.split_iname(
            tiled, iname, 16, outer_tag=group_tag, inner_tag=local_tag)

    # Expand the i/j substitution rules before prefetching f over the tile.
    tiled = lp.expand_subst(tiled)
    tiled = lp.add_prefetch(
        tiled, "f", "ii_inner,jj_inner",
        fetch_bounding_box=True,
        default_tag="l.auto")

    lp.auto_test_vs_ref(
        reference, ctx, tiled, parameters={"nx": 20, "ny": 20})
Beispiel #42
0
def test_laplacian(ctx_factory):
    """Disabled test for the 3D ``semlap`` Laplacian kernel.

    The first statement divides by zero on purpose ("not adapted to new
    language"), so nothing below it ever runs; the body is kept as a record
    of the intended kernel and transformation sequence.

    :arg ctx_factory: callable returning the context whose first device is
        handed to ``lp.make_kernel``.
    :raises ZeroDivisionError: always, from the guard on the first line.
    """
    1 / 0  # not adapted to new language: deliberate hard stop

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # load:     1+6 fields + 1/N D entry
    # store:    1   fields
    # perform:  N*2*6 + 3*5 flops
    # ratio:   (12*N+15)/8  flops per 4 bytes on bus
    #          ~ 14 FLOPS per 4 bytes at N=8
    #          ~ 525 GFLOPS max on a 150GB/s device at N=8 if done perfectly

    # K - run-time symbolic
    knl = lp.make_kernel(
        ctx.devices[0],
        "[K] -> {[i,j,k,e,m,o1,o2,o3,gi]: 0<=i,j,k,m,o1,o2,o3<%d and 0<=e<K and 0<=gi<6}"
        % n,
        [
            # derivatives of u along each of the three axes; the cse(...)
            # names (urf, usf, utf) are realized further down
            "CSE: ur(i,j,k) = sum_float32(o1, D[i,o1]*cse(u[e,o1,j,k], urf))",
            "CSE: us(i,j,k) = sum_float32(o2, D[j,o2]*cse(u[e,i,o2,k], usf))",
            "CSE: ut(i,j,k) = sum_float32(o3, D[k,o3]*cse(u[e,i,j,o3], utf))",

            # define function
            "CSE: Gu(i,j,k) = G[0,e,i,j,k]*ur(i,j,k) + G[1,e,i,j,k]*us(i,j,k) + G[2,e,i,j,k]*ut(i,j,k)",
            "CSE: Gv(i,j,k) = G[1,e,i,j,k]*ur(i,j,k) + G[3,e,i,j,k]*us(i,j,k) + G[4,e,i,j,k]*ut(i,j,k)",
            "CSE: Gw(i,j,k) = G[2,e,i,j,k]*ur(i,j,k) + G[4,e,i,j,k]*us(i,j,k) + G[5,e,i,j,k]*ut(i,j,k)",
            # the three adjacent string literals below form one instruction
            "lap[e,i,j,k]  = "
            "  sum_float32(m, D[m,i]*Gu(m,j,k))"
            "+ sum_float32(m, D[m,j]*Gv(i,m,k))"
            "+ sum_float32(m, D[m,k]*Gw(i,j,m))"
        ],
        [
            lp.ArrayArg("u", dtype, shape=field_shape, order=order),
            lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
            lp.ArrayArg("G", dtype, shape=(6, ) + field_shape, order=order),
            lp.ArrayArg("D", dtype, shape=(n, n), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
        ],
        name="semlap",
        assumptions="K>=1")

    #print(lp.preprocess_kernel(knl, cse_ok=True))
    #1/0
    #
    #print(knl)
    #1/0
    # Realize each named CSE as float32, listing the inames passed for it.
    knl = lp.realize_cse(knl, "urf", np.float32, ["o1"])
    knl = lp.realize_cse(knl, "usf", np.float32, ["o2"])
    knl = lp.realize_cse(knl, "utf", np.float32, ["o3"])

    knl = lp.realize_cse(knl, "Gu", np.float32, ["m", "j", "k"])
    knl = lp.realize_cse(knl, "Gv", np.float32, ["i", "m", "k"])
    knl = lp.realize_cse(knl, "Gw", np.float32, ["i", "j", "m"])

    knl = lp.realize_cse(knl, "ur", np.float32, ["k", "j", "m"])
    knl = lp.realize_cse(knl, "us", np.float32, ["i", "m", "k"])
    knl = lp.realize_cse(knl, "ut", np.float32, ["i", "j", "m"])

    # The `if 0` branch is a disabled experiment; the else branch always
    # runs and takes the kernel as it stands as the reference.
    if 0:
        pass
        #seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]", default_tag="l.auto")
        #seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"], default_tag="l.auto")
        #seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]", default_tag="l.auto")
    else:
        seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))

    knl = lp.add_prefetch(knl,
                          "G", ["gi", "m", "j", "k"],
                          "G[gi,e,m,j,k]",
                          default_tag="l.auto")
    knl = lp.add_prefetch(knl, "D", ["m", "j"], default_tag="l.auto")
    #knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]", default_tag="l.auto")

    #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    #print(seq_knl)
    #print(lp.preprocess_kernel(knl))
    #1/0

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(
        knl, loop_priority=["m_fetch_G", "i_fetch_u"])
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    # Compare against the reference for K = 1000; op_count is the flop
    # estimate scaled to units of 1e9.
    K = 1000
    lp.auto_test_vs_ref(
        seq_knl,
        ctx,
        kernel_gen,
        op_count=K *
        (n * n * n * n * 2 * 3 + n * n * n * 5 * 3 + n**4 * 2 * 3) / 1e9,
        op_label="GFlops",
        parameters={"K": K},
        print_seq_code=True)
Beispiel #43
0
def test_laplacian_lmem(ctx_factory):
    """Test a prefetch/precompute variant of the 3D ``semlap`` kernel.

    Builds the kernel for ``n = 4`` with a run-time symbolic element count
    ``K``, applies one of two transformation variants (the "original" one is
    hard-wired on), and compares the generated schedules against the
    untransformed kernel for ``K = 1000``.

    :arg ctx_factory: callable returning the context to run on.
    """
    f32 = np.float32
    ctx = ctx_factory()
    layout = "C"

    n = 4

    from pymbolic import var

    # K - run-time symbolic
    elem_shape = (var("K"), n, n, n)

    device = ctx.devices[0]
    domain = (
        "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}"
        % n)
    instructions = [
        "ur(a,b,c) := sum_float32(@o, D[a,o]*u[e,o,b,c])",
        "us(a,b,c) := sum_float32(@o, D[b,o]*u[e,a,o,c])",
        "ut(a,b,c) := sum_float32(@o, D[c,o]*u[e,a,b,o])",
        "lap[e,i,j,k]  = "
        "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
        "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
        "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))",
    ]
    kernel_args = [
        lp.ArrayArg("u", f32, shape=elem_shape, order=layout),
        lp.ArrayArg("lap", f32, shape=elem_shape, order=layout),
        lp.ArrayArg("G", f32, shape=(6, ) + elem_shape, order=layout),
        lp.ArrayArg("D", f32, shape=(n, n), order=layout),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]
    knl = lp.make_kernel(
        device, domain, instructions, kernel_args,
        name="semlap",
        assumptions="K>=1")

    # Untransformed kernel serves as the reference.
    seq_knl = knl

    # Each variant is described by the rules it precomputes, the sweep
    # inames used for them, and the group size for the split of e.
    if 1:
        # original
        knl = lp.add_prefetch(
            knl, "u", ["i", "j", "k", "o"], default_tag="l.auto")
        rules = ("ur", "us", "ut")
        sweep = ("a", "b", "c")
        e_group = 16
    else:
        # experiment
        rules = ("eu", "ur", "us", "ut")
        sweep = ("b", "c")
        e_group = 1

    for rule in rules:
        knl = lp.precompute(
            knl, rule, np.float32, list(sweep), default_tag="l.auto")
    knl = lp.split_iname(knl, "e", e_group, outer_tag="g.0")
    knl = lp.add_prefetch(
        knl, "D", ["m", "j", "k", "i"], default_tag="l.auto")

    # Prefetching G, splitting e_inner for ILP, and joining i and j into
    # i_and_j are all left disabled. TW turned the i_and_j tagging off
    # because it produced:
    # ValueError: cannot tag 'i_and_j'--not known

    schedules = lp.check_kernels(
        lp.generate_loop_schedules(knl), dict(K=1000))

    K = 1000
    gflops = K * (
        n * n * n * n * 2 * 3 + n * n * n * 5 * 3 + n**4 * 2 * 3) / 1e9
    lp.auto_test_vs_ref(
        seq_knl, ctx, schedules,
        op_count=gflops,
        op_label="GFlops",
        parameters={"K": K})
Beispiel #44
0
def test_interp_diff(ctx_factory):
    """Draft test for an interpolate / scale / project-back kernel (disabled).

    The unconditional ``1 / 0`` on the first line raises
    :exc:`ZeroDivisionError`, so nothing after it currently executes; the
    remainder is kept as a sketch of the intended test.

    Fixes relative to the earlier draft (all in the unreachable part):

    * the two ``%d`` placeholders of the domain string are now filled from
      the tuple ``(M, N)`` and the instruction list is passed as its own
      argument (previously ``% M % N[...]`` subscripted the integer ``N``),
    * the missing conjunction before ``0<=e<K`` was added,
    * ``seq_knl`` and ``K`` are defined before they are used.

    :arg ctx_factory: callable returning the context whose first device is
        handed to ``lp.make_kernel``.
    """
    1 / 0  # not ready
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8
    M = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (N, N, N, K_sym)
    interim_field_shape = (M, M, M, K_sym)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic
    knl = lp.make_kernel(
        ctx.devices[0],
        # i, j, k index the first axis of I (length M); ip, jp, kp index its
        # second axis (length N).
        "[K] -> {[i,ip,j,jp,k,kp,e]: 0<=i,j,k<%d AND 0<=ip,jp,kp<%d AND 0<=e<K}"
        % (M, N),
        [
            "[|i,jp,kp] <float32>  u1[i ,jp,kp,e] = sum_float32(ip, I[i,ip]*u [ip,jp,kp,e])",
            "[|i,j ,kp] <float32>  u2[i ,j ,kp,e] = sum_float32(jp, I[j,jp]*u1[i ,jp,kp,e])",
            "[|i,j ,k ] <float32>  u3[i ,j ,k ,e] = sum_float32(kp, I[k,kp]*u2[i ,j ,kp,e])",
            "[|i,j ,k ] <float32>  Pu[i ,j ,k ,e] = P[i,j,k,e]*u3[i,j,k,e]",
            "[|i,j ,kp] <float32> Pu3[i ,j ,kp,e] = sum_float32(k, V[kp,k]*Pu[i ,j , k,e])",
            "[|i,jp,kp] <float32> Pu2[i ,jp,kp,e] = sum_float32(j, V[jp,j]*Pu[i ,j ,kp,e])",
            "Pu[ip,jp,kp,e] = sum_float32(i, V[ip,i]*Pu[i ,jp,kp,e])",
        ],
        [
            lp.ArrayArg("u", dtype, shape=field_shape, order=order),
            lp.ArrayArg("P", dtype, shape=interim_field_shape, order=order),
            lp.ArrayArg("I", dtype, shape=(M, N), order=order),
            lp.ArrayArg("V", dtype, shape=(N, M), order=order),
            lp.ArrayArg("Pu", dtype, shape=field_shape, order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
        ],
        name="sem_lap_precon",
        assumptions="K>=1")

    print(knl)
    1 / 0

    # Untransformed kernel, used as the reference in auto_test_vs_ref below.
    seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    print(knl)
    #1/0

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)

    # Same value as the one handed to check_kernels above.
    K = 1000
    lp.auto_test_vs_ref(
        seq_knl,
        ctx,
        kernel_gen,
        op_count=0,
        op_label="GFlops",
        parameters={"K": K},
        print_seq_code=True,
    )
Beispiel #45
0
    def variant_k_ilp(knl):
        """Return *knl* with ``n`` tagged ``l.0``, ``k`` split by 3
        (outer ``g.0``, inner ``ilp``) and ``m`` tagged ``unr``."""
        n_tagged = lp.tag_inames(knl, {"n": "l.0"})
        k_split = lp.split_iname(
            n_tagged, "k", 3, outer_tag="g.0", inner_tag="ilp")
        return lp.tag_inames(k_split, {"m": "unr"})
Beispiel #46
0
    def assign_axis(recursion_axis, iname, axis=None):
        """Tag *iname* with local axis *axis*, then restart the enclosing
        assign_automatic_axes on the resulting kernel.

        With *axis* left at None, an axis is chosen automatically from the
        local axes that are not yet in ``assigned_local_axes``.
        """

        def kernel_with_tag(tag):
            # Copy of the kernel in which *iname* maps to *tag*.
            updated_tags = kernel.iname_to_tag.copy()
            updated_tags[iname] = tag
            return kernel.copy(iname_to_tag=updated_tags)

        try:
            with isl.SuppressedWarnings(kernel.isl_context):
                desired_length = kernel.get_constant_iname_length(iname)
        except isl.Error:
            # No constant length could be determined (likely unbounded):
            # clear the tag instead of assigning an axis automatically.
            return assign_automatic_axes(
                kernel_with_tag(None),
                axis=recursion_axis)

        if axis is None:
            # {{{ find a suitable axis

            # Walk the local axes in order.  The first unassigned axis that
            # is at least desired_length long wins; unassigned axes that are
            # too short are remembered as fallbacks.
            too_short_axes = []
            for candidate in range(len(local_size)):
                if candidate in assigned_local_axes:
                    continue

                if local_size[candidate] < desired_length:
                    too_short_axes.append(candidate)
                else:
                    axis = candidate
                    break

            if axis is None and too_short_axes:
                # Take the fallback with the smallest local size (first one
                # on ties).
                # NOTE(review): the previous comment here read "sort as
                # longest first", yet the code has always picked the
                # smallest -- confirm which one is intended.
                axis = min(too_short_axes, key=lambda ax: local_size[ax])

            # }}}

        if axis is not None:
            new_tag = LocalIndexTag(axis)
            if desired_length > local_size[axis]:
                from loopy import split_iname

                # The iname does not fit on the axis: split it so that the
                # inner part does.  Don't be tempted to switch the outer tag
                # to unroll--this may generate tons of code on some examples.
                split_kernel = split_iname(
                    kernel,
                    iname,
                    inner_length=local_size[axis],
                    outer_tag=None,
                    inner_tag=new_tag,
                    do_tagged_check=False)
                return assign_automatic_axes(
                    split_kernel,
                    axis=recursion_axis,
                    local_size=local_size)
        else:
            new_tag = None

        current_tag = kernel.iname_to_tag.get(iname)
        if not isinstance(current_tag, AutoLocalIndexTagBase):
            raise LoopyError("trying to reassign '%s'" % iname)

        return assign_automatic_axes(
            kernel_with_tag(new_tag),
            axis=recursion_axis,
            local_size=local_size)
Beispiel #47
0
 def variant_cpu(knl):
     """Expand substitution rules, split ``i`` into blocks of 1024 (outer
     ``g.0``, slabs ``(0, 1)``) and prefetch ``x[i,k]`` over ``k``."""
     expanded = lp.expand_subst(knl)
     blocked = lp.split_iname(
         expanded, "i", 1024, outer_tag="g.0", slabs=(0, 1))
     return lp.add_prefetch(blocked, "x[i,k]", ["k"], default_tag=None)
Beispiel #48
0
 def variant_cpu(knl):
     """Split ``i`` into blocks of ``16*4096`` (outer ``g.0``, slabs
     ``(0, 1)``), then split ``i_inner`` by 16 with the inner part unrolled."""
     unroll_factor = 16
     blocked = lp.split_iname(
         knl, "i", unroll_factor*4096, outer_tag="g.0", slabs=(0, 1))
     return lp.split_iname(
         blocked, "i_inner", unroll_factor, inner_tag="unr")
Beispiel #49
0
# Outer-product kernel: c[i, j] = a[i]*b[j] over an n-by-n index space,
# built under the assumption n >= 16.
knl = lp.make_kernel("{[i, j]: 0<=i<n and 0<=j<n}",
                     "c[i, j] = a[i]*b[j]",
                     assumptions="n >= 16")

# Two identical float32 input vectors holding 0..199.
a = np.arange(200, dtype=np.float32)
b = np.arange(200, dtype=np.float32)

# NOTE(review): `queue` is not defined in this fragment; it is presumably
# created earlier in the script -- confirm.
knl = lp.set_options(knl, write_code=True)
evt, (c, ) = knl(queue, a=a, b=b)
# SETUPEND

# Keep the untransformed kernel around under its own name.
orig_knl = knl

# SPLITBEGIN
knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
# SPLITEND

# Run the split kernel on the same inputs.
knl = lp.set_options(knl, write_code=True)
evt, (c, ) = knl(queue, a=a, b=b)

# Keep the split (but not yet prefetched) kernel around as well.
split_knl = knl

# PREFETCH1BEGIN
# Prefetch both inputs; the fetches are placed inside all four split inames.
knl = lp.add_prefetch(knl,
                      "a",
                      fetch_outer_inames="i_outer, i_inner, j_outer, j_inner")
knl = lp.add_prefetch(knl,
                      "b",
                      fetch_outer_inames="i_outer, i_inner, j_outer, j_inner")
Beispiel #50
0
import numpy as np
import loopy as lp
import pyopencl as cl
import pyopencl.array

# Two-instruction kernel over 0<=i<n, 0<=k<3: `c` is filled from `a` read at
# offset i + 1, and `out` is then a copy of `c`.
knl = lp.make_kernel(
    "{ [i,k]: 0<=i<n and 0<=k<3 }", """
        c[k,i] = a[k, i + 1]
        out[k,i] = c[k,i]
        """)

# transform
# Split `i` by 128, mapping the outer part to g.0 and the inner part to l.0.
knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
from loopy.kernel.tools import add_dtypes

# Attach explicit dtypes to the three arrays and to the size parameter `n`.
knl = add_dtypes(knl, {
    "a": np.float32,
    "c": np.float32,
    "out": np.float32,
    "n": np.int32
})

# schedule
from loopy.preprocess import preprocess_kernel

knl = preprocess_kernel(knl)

from loopy.schedule import get_one_scheduled_kernel

# Rebind `knl` to the result of get_one_scheduled_kernel.
knl = get_one_scheduled_kernel(knl)
Beispiel #51
0
def test_tim3d(ctx_factory):
    """Build the ``semlap3D`` kernel, apply prefetch / precompute / split /
    tag transformations to it and compare the result against the
    untransformed kernel with ``lp.auto_test_vs_ref``.

    :arg ctx_factory: callable returning the context to run on.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    # K is symbolic at kernel-build time and supplied at run time.
    field_shape = (K_sym, n, n, n)

    device = ctx.devices[0]
    domain = (
        "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}"
        % n)
    instructions = [
        "ur(a,b,c) := sum_float32(@o, D[a,o]*u[e,o,b,c])",
        "us(a,b,c) := sum_float32(@o, D[b,o]*u[e,a,o,c])",
        "ut(a,b,c) := sum_float32(@o, D[c,o]*u[e,a,b,o])",
        "lap[e,i,j,k]  = "
        "   sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
        " + sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
        " + sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))",
    ]
    # D is passed as a plain array argument here.
    kernel_args = [
        lp.ArrayArg("u", dtype, shape=field_shape, order=order),
        lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
        lp.ArrayArg("G", dtype, shape=(6, ) + field_shape, order=order),
        lp.ArrayArg("D", dtype, shape=(n, n), order=order),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]

    knl = lp.make_kernel(
        device, domain, instructions, kernel_args,
        name="semlap3D",
        assumptions="K>=1")

    # Reference: the kernel before any transformation.
    seq_knl = knl

    knl = lp.add_prefetch(
        knl, "D", ["m", "j", "i", "k", "o"], default_tag="l.auto")
    knl = lp.add_prefetch(
        knl, "u", ["i", "j", "o", "k"], default_tag="l.auto")

    for rule_name in ("ur", "us", "ut"):
        knl = lp.precompute(
            knl, rule_name, np.float32, ["a", "b", "c"],
            default_tag="l.auto")

    knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")
    for iname, local_tag in (("k", "l.2"), ("j", "l.1"), ("i", "l.0")):
        knl = lp.split_iname(knl, iname, n, inner_tag=local_tag)

    for iname in ("o", "m"):
        knl = lp.tag_inames(knl, {iname: "unr"})

    # The list entries are axis/argument indices on G.
    knl = lp.add_prefetch(knl, "G", [2, 3, 4], default_tag="l.auto")

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000})

    K = 4000
    ops_per_element = (n**4) * 3 * 2 + (n**3) * 5 * 3 + (n**4) * 3 * 2
    lp.auto_test_vs_ref(
        seq_knl,
        ctx,
        kernel_gen,
        op_count=K * ops_per_element / 1e9,
        op_label="GFlops",
        parameters={"K": K})
Beispiel #52
0
def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True,
        split_kwargs=None):
    """Split one axis of each given array into an outer axis and an inner
    axis of length *count*, and rewrite all accesses to those arrays
    accordingly.

    :arg arrays_and_axes: a list of tuples *(array, axis_nr)* indicating
        that the index in *axis_nr* should be split. The tuples may
        also be *(array, axis_nr, "F")*, indicating that the index will
        be split as it would be according to Fortran order.

        *array* may name a temporary variable or an argument.

        If *arrays_and_axes* is a :class:`tuple`, it is automatically
        wrapped in a list, to make single splits easier.

    :arg count: The group size to use in the split. If *count* is 1,
        *kernel* is returned unchanged.
    :arg auto_split_inames: Whether to automatically split inames
        encountered in the specified indices. If false, the inner and outer
        indices are formed as ``index % count`` and ``index // count``
        instead.
    :arg split_kwargs: arguments to pass to :func:`loopy.split_iname`

    :returns: the transformed kernel.

    :raises RuntimeError: if an entry of *arrays_and_axes* does not have one
        of the forms above, if the same array is listed more than once, if an
        order other than ``"C"``/``"F"`` is given, if an array's ``dim_tags``
        are unknown or the axis is not tagged fixed-stride, or (with
        *auto_split_inames*) if an access indexes the split axis with
        anything other than a single variable.

    Note that splits on the corresponding inames are carried out implicitly.
    The inames may *not* be split beforehand. (There's no *really* good reason
    for this--this routine is just not smart enough to deal with this.)
    """

    if count == 1:
        return kernel

    if split_kwargs is None:
        split_kwargs = {}

    # {{{ process input into array_to_rest

    # where "rest" is the non-argument-name part of the input tuples
    # in arrays_and_axes
    def normalize_rest(rest):
        if len(rest) == 1:
            return (rest[0], "C")
        elif len(rest) == 2:
            return rest
        else:
            # *rest* is a tuple: wrap it so that '%' formats it as a single
            # value instead of unpacking it as the argument list (which
            # raised TypeError and hid this message).
            raise RuntimeError("split instruction '%s' not understood"
                    % (rest,))

    def outer_position(axis_nr, order):
        # Position at which the new outer entry goes, relative to the
        # (inner) entry that stays at *axis_nr*.
        if order == "F":
            return axis_nr + 1
        elif order == "C":
            return axis_nr
        else:
            raise RuntimeError("order '%s' not understood" % order)

    if isinstance(arrays_and_axes, tuple):
        arrays_and_axes = [arrays_and_axes]

    array_to_rest = {
            tup[0]: normalize_rest(tup[1:]) for tup in arrays_and_axes}

    # A repeated array name collapses in the dict, which shows as a length
    # mismatch.
    if len(arrays_and_axes) != len(array_to_rest):
        raise RuntimeError("cannot split multiple axes of the same variable")

    del arrays_and_axes

    # }}}

    # {{{ adjust arrays

    from loopy.kernel.tools import ArrayChanger
    from loopy.kernel.array import FixedStrideArrayDimTag
    from pytools import div_ceil

    for array_name, (axis, order) in six.iteritems(array_to_rest):
        achng = ArrayChanger(kernel, array_name)
        ary = achng.get()

        # {{{ adjust shape

        new_shape = ary.shape
        if new_shape is not None:
            new_shape = list(new_shape)
            axis_len = new_shape[axis]
            new_shape[axis] = count
            outer_len = div_ceil(axis_len, count)

            new_shape.insert(outer_position(axis, order), outer_len)
            new_shape = tuple(new_shape)

        # }}}

        # {{{ adjust dim tags

        if ary.dim_tags is None:
            raise RuntimeError("dim_tags of '%s' are not known" % array_name)
        new_dim_tags = list(ary.dim_tags)

        old_dim_tag = ary.dim_tags[axis]

        if not isinstance(old_dim_tag, FixedStrideArrayDimTag):
            raise RuntimeError("axis %d of '%s' is not tagged fixed-stride"
                    % (axis, array_name))

        # The inner axis keeps the old tag (and stride); one step along the
        # outer axis skips *count* inner entries.
        old_stride = old_dim_tag.stride
        outer_stride = count*old_stride

        new_dim_tags.insert(
                outer_position(axis, order),
                FixedStrideArrayDimTag(outer_stride))

        new_dim_tags = tuple(new_dim_tags)

        # }}}

        # {{{ adjust dim_names

        new_dim_names = ary.dim_names
        if new_dim_names is not None:
            new_dim_names = list(new_dim_names)
            existing_name = new_dim_names[axis]
            new_dim_names[axis] = existing_name + "_inner"
            outer_name = existing_name + "_outer"

            new_dim_names.insert(outer_position(axis, order), outer_name)
            new_dim_names = tuple(new_dim_names)

        # }}}

        kernel = achng.with_changed_array(ary.copy(
            shape=new_shape, dim_tags=new_dim_tags, dim_names=new_dim_names))

    # }}}

    # iname -> (outer_iname, inner_iname), filled in while rewriting accesses
    split_vars = {}

    var_name_gen = kernel.get_var_name_generator()

    def split_access_axis(expr):
        axis_nr, order = array_to_rest[expr.aggregate.name]

        idx = expr.index
        if not isinstance(idx, tuple):
            idx = (idx,)
        idx = list(idx)

        axis_idx = idx[axis_nr]

        if auto_split_inames:
            from pymbolic.primitives import Variable
            if not isinstance(axis_idx, Variable):
                raise RuntimeError("found access '%s' in which axis %d is not a "
                        "single variable--cannot split "
                        "(Have you tried to do the split yourself, manually, "
                        "beforehand? If so, you shouldn't.)"
                        % (expr, axis_nr))

            iname_to_split = axis_idx.name
            assert iname_to_split in kernel.all_inames()

            try:
                outer_iname, inner_iname = split_vars[iname_to_split]
            except KeyError:
                outer_iname = var_name_gen(iname_to_split+"_outer")
                inner_iname = var_name_gen(iname_to_split+"_inner")
                split_vars[iname_to_split] = outer_iname, inner_iname

            inner_index = Variable(inner_iname)
            outer_index = Variable(outer_iname)

        else:
            from loopy.symbolic import simplify_using_aff
            inner_index = simplify_using_aff(kernel, axis_idx % count)
            outer_index = simplify_using_aff(kernel, axis_idx // count)

        idx[axis_nr] = inner_index

        # Use this array's own axis number here. The array-adjustment loop
        # above leaves its loop variable behind, and that one belongs to
        # whichever array happened to be processed last.
        idx.insert(outer_position(axis_nr, order), outer_index)

        return expr.aggregate.index(tuple(idx))

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, var_name_gen)
    aash = ArrayAxisSplitHelper(rule_mapping_context,
            set(six.iterkeys(array_to_rest)), split_access_axis)
    kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel))

    if auto_split_inames:
        from loopy import split_iname
        for iname, (outer_iname, inner_iname) in six.iteritems(split_vars):
            kernel = split_iname(kernel, iname, count,
                    outer_iname=outer_iname, inner_iname=inner_iname,
                    **split_kwargs)

    return kernel
Beispiel #53
0
 def get_optimized_kernel(self):
     """Return ``self.get_kernel()`` with ``itgt_box`` split by 16 and the
     outer part tagged ``g.0``."""
     # FIXME
     base_knl = self.get_kernel()
     return lp.split_iname(base_knl, "itgt_box", 16, outer_tag="g.0")
Beispiel #54
0
 def variant_1(knl):
     """Split ``i`` and ``j`` by 16 (``i`` -> g.1/l.1, ``j`` -> g.0/l.0),
     prefetch ``a`` over both inner inames and set the loop priority of the
     two ``a_dim_*_outer`` inames."""
     i_split = lp.split_iname(
         knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
     ij_split = lp.split_iname(
         i_split, "j", 16, outer_tag="g.0", inner_tag="l.0")
     prefetched = lp.add_prefetch(ij_split, "a", ["i_inner", "j_inner"])
     return lp.set_loop_priority(
         prefetched, ["a_dim_0_outer", "a_dim_1_outer"])
Beispiel #55
0
def test_mem_access_counter_mixed():
    """Check the global load/store counts reported by
    ``lp.get_mem_access_map`` for a kernel mixing float32 and float64
    arrays, with ``j`` split across 16 work-items."""

    knl = lp.make_kernel("[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
                         [
                             """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]+x[i,k]
            e[i, k] = g[i,k]*(2+h[i,k])
            """
                         ],
                         name="mixed",
                         assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(
        knl,
        {
            "a": np.float32,
            "b": np.float32,
            "g": np.float64,
            "h": np.float64,
            "x": np.float32,
        })
    threads = 16
    knl = lp.split_iname(knl, "j", threads)
    knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})

    mem_map = lp.get_mem_access_map(knl)  # noqa
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'l': ell}

    def global_count(dtype, stride, direction, variable):
        # Evaluate the count stored for one global-memory access descriptor.
        access = lp.MemAccess('global',
                              dtype,
                              stride=stride,
                              direction=direction,
                              variable=variable)
        return mem_map[access].eval_with_dict(params)

    # loads
    f64uniform = global_count(np.float64, 0, 'load', 'g')
    f64uniform += global_count(np.float64, 0, 'load', 'h')
    f32uniform = global_count(np.float32, 0, 'load', 'x')
    f32nonconsec = global_count(
        np.dtype(np.float32), Variable('m'), 'load', 'a')
    f32nonconsec += global_count(
        np.dtype(np.float32), Variable('m'), 'load', 'b')

    assert f64uniform == 2 * n * m
    assert f32uniform == n * m * ell / threads
    assert f32nonconsec == 3 * n * m * ell

    # stores
    f64uniform = global_count(np.float64, 0, 'store', 'e')
    f32nonconsec = global_count(np.float32, Variable('m'), 'store', 'c')

    assert f64uniform == n * m
    assert f32nonconsec == n * m * ell
Beispiel #56
0
 def variant_image_d(knl):
     """Tag ``n`` as ``l.0``, split ``k`` by 3 (outer ``g.0``, inner
     ``l.1``) and turn the ``DrDsDt`` argument into an image."""
     n_tagged = lp.tag_inames(knl, {"n": "l.0"})
     k_split = lp.split_iname(
         n_tagged, "k", 3, outer_tag="g.0", inner_tag="l.1")
     return lp.change_arg_to_image(k_split, "DrDsDt")
Beispiel #57
0
def test_laplacian_lmem_ilp(ctx_factory):
    """Generate (and print) code for the ``semlap`` kernel after ILP
    splitting and aggressive prefetching.

    This does not lead to practical/runnable code (out of lmem), but it's an
    excellent stress test for the code generator. :)

    :arg ctx_factory: callable returning the context whose first device is
        handed to ``lp.make_kernel``.
    """

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    # K is symbolic at kernel-build time and supplied at run time.
    field_shape = (K_sym, n, n, n)

    device = ctx.devices[0]
    domain = "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K }" % n
    instructions = [
        "ur(i,j,k) := sum_float32(@o, D[i,o]*u[e,o,j,k])",
        "us(i,j,k) := sum_float32(@o, D[j,o]*u[e,i,o,k])",
        "ut(i,j,k) := sum_float32(@o, D[k,o]*u[e,i,j,o])",
        "lap[e,i,j,k]  = "
        "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
        "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
        "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))",
    ]
    kernel_args = [
        lp.ArrayArg("u", dtype, shape=field_shape, order=order),
        lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
        lp.ArrayArg("G", dtype, shape=(6, ) + field_shape, order=order),
        lp.ArrayArg("D", dtype, shape=(n, n), order=order),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]

    knl = lp.make_kernel(
        device, domain, instructions, kernel_args,
        name="semlap",
        assumptions="K>=1")

    # Must act on u first, otherwise stencil becomes crooked and
    # footprint becomes non-convex.

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")
    knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    knl = lp.add_prefetch(
        knl, "u", [1, 2, 3, "e_inner_inner"], default_tag="l.auto")

    for rule_name in ("ur", "us", "ut"):
        knl = lp.precompute(
            knl, rule_name, np.float32, [0, 1, 2, "e_inner_inner"],
            default_tag="l.auto")

    knl = lp.add_prefetch(
        knl, "G", ["m", "i", "j", "k", "e_inner_inner"],
        default_tag="l.auto")
    knl = lp.add_prefetch(knl, "D", ["m", "j"], default_tag="l.auto")

    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000})

    # Code is only generated and printed; nothing is executed.
    for scheduled_knl in kernel_gen:
        print(lp.generate_code(scheduled_knl))
Beispiel #58
0
def split(vanilla):
    """Split ``i`` of *vanilla* by 4 with slabs ``(1, 1)`` and prioritize
    ``i_outer`` over ``i_inner``."""
    split_knl = lp.split_iname(vanilla, "i", 4, slabs=(1, 1))
    return lp.prioritize_loops(split_knl, "i_outer,i_inner")
Beispiel #59
0
def test_advect(ctx_factory):
    """Draft test for the ``sem_advect`` kernel (disabled).

    The unconditional ``1 / 0`` on the first line raises
    :exc:`ZeroDivisionError`, so nothing after it currently executes.

    :arg ctx_factory: callable returning the context to run on.
    """
    1 / 0  # not ready

    dtype = np.float32
    ctx = ctx_factory()

    order = "C"

    N = 8

    from pymbolic import var
    K_sym = var("K")

    # K is symbolic at kernel-build time and supplied at run time.
    field_shape = (K_sym, N, N, N)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal
    # 3. transform back (direction-by-direction)

    # A. updated for CSE: notation.
    # B. fixed temp indexing and C ordering
    # load:     3+9 fields + 1/N D entry
    # store:    3   fields
    # perform:  N*2*6 + 3*5 + 3*5 flops
    # ratio:   (12*N+30)/15  flops per 4 bytes on bus
    #          ~ 8.4 FLOPS per 4 bytes at N=8
    #          ~ 300 GFLOPS max on a 150GB/s device at N=8 if done perfectly
    device = ctx.devices[0]
    domain = "[K] -> {[i,j,k,m,e]: 0<=i,j,k,m<%d AND 0<=e<K}" % N

    instructions = [
        # derivatives of u
        "CSE:  ur(i,j,k) = sum_float32(@m, D[i,m]*u[e,m,j,k])",
        "CSE:  us(i,j,k) = sum_float32(@m, D[j,m]*u[e,i,m,k])",
        "CSE:  ut(i,j,k) = sum_float32(@m, D[k,m]*u[e,i,j,m])",

        # derivatives of v
        "CSE:  vr(i,j,k) = sum_float32(@m, D[i,m]*v[e,m,j,k])",
        "CSE:  vs(i,j,k) = sum_float32(@m, D[j,m]*v[e,i,m,k])",
        "CSE:  vt(i,j,k) = sum_float32(@m, D[k,m]*v[e,i,j,m])",

        # derivatives of w
        "CSE:  wr(i,j,k) = sum_float32(@m, D[i,m]*w[e,m,j,k])",
        "CSE:  ws(i,j,k) = sum_float32(@m, D[j,m]*w[e,i,m,k])",
        "CSE:  wt(i,j,k) = sum_float32(@m, D[k,m]*w[e,i,j,m])",

        # velocity in (r,s,t) coordinates
        # CSE?
        "CSE: Vr(i,j,k) = G[0,e,i,j,k]*u[e,i,j,k] + G[1,e,i,j,k]*v[e,i,j,k] + G[2,e,i,j,k]*w[e,i,j,k]",
        "CSE: Vs(i,j,k) = G[3,e,i,j,k]*u[e,i,j,k] + G[4,e,i,j,k]*v[e,i,j,k] + G[5,e,i,j,k]*w[e,i,j,k]",
        "CSE: Vt(i,j,k) = G[6,e,i,j,k]*u[e,i,j,k] + G[7,e,i,j,k]*v[e,i,j,k] + G[8,e,i,j,k]*w[e,i,j,k]",

        # nonlinear term on integration nodes
        "Nu[e,i,j,k] = Vr(i,j,k)*ur(i,j,k)+Vs(i,j,k)*us(i,j,k)+Vt(i,j,k)*ut(i,j,k)",
        "Nv[e,i,j,k] = Vr(i,j,k)*vr(i,j,k)+Vs(i,j,k)*vs(i,j,k)+Vt(i,j,k)*vt(i,j,k)",
        "Nw[e,i,j,k] = Vr(i,j,k)*wr(i,j,k)+Vs(i,j,k)*ws(i,j,k)+Vt(i,j,k)*wt(i,j,k)",
    ]

    kernel_args = [
        lp.ArrayArg("u", dtype, shape=field_shape, order=order),
        lp.ArrayArg("v", dtype, shape=field_shape, order=order),
        lp.ArrayArg("w", dtype, shape=field_shape, order=order),
        lp.ArrayArg("Nu", dtype, shape=field_shape, order=order),
        lp.ArrayArg("Nv", dtype, shape=field_shape, order=order),
        lp.ArrayArg("Nw", dtype, shape=field_shape, order=order),
        lp.ArrayArg("G", dtype, shape=(9, ) + field_shape, order=order),
        lp.ArrayArg("D", dtype, shape=(N, N), order=order),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]

    knl = lp.make_kernel(
        device, domain, instructions, kernel_args,
        name="sem_advect",
        assumptions="K>=1")

    print(knl)
    1 / 0

    # Reference: the kernel before any transformation.
    seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")

    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, {"K": 1000}, kill_level_min=5)

    K = 1000
    lp.auto_test_vs_ref(
        seq_knl,
        ctx,
        kernel_gen,
        op_count=0,
        op_label="GFlops",
        parameters={"K": K},
        print_seq_code=True,
    )
def test_matmul(ctx_factory, buffer_inames):
    """Parse a Fortran ``dgemm`` loop nest, tile / precompute / buffer it
    and compare against the untransformed kernel for 128x128 inputs.

    :arg ctx_factory: callable returning the context to run on.
    :arg buffer_inames: forwarded to ``lp.buffer_array``; when truthy, the
        test is skipped on the "Portable Computing Language" platform.
    """
    ctx = ctx_factory()

    if (buffer_inames
            and ctx.devices[0].platform.name == "Portable Computing Language"):
        pytest.skip("crashes on pocl")

    logging.basicConfig(level=logging.INFO)

    fortran_src = """
        subroutine dgemm(m,n,ell,a,b,c)
          implicit none
          real*8 a(m,ell),b(ell,n),c(m,n)
          integer m,n,k,i,j,ell

          do j = 1,n
            do i = 1,m
              do k = 1,ell
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    # Exactly one kernel is expected from the source above.
    knl, = lp.parse_fortran(fortran_src)

    assert len(knl.domains) == 1

    # Reference: the kernel before any transformation.
    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)

    for assumption in ("n mod 32 = 0", "m mod 32 = 0", "ell mod 16 = 0"):
        knl = lp.assume(knl, assumption)

    # Turn the accesses to a and b into substitution rules ...
    for rule_name, access in (("a_acc", "a[i1,i2]"), ("b_acc", "b[i1,i2]")):
        knl = lp.extract_subst(knl, rule_name, access, parameters="i1, i2")

    # ... and precompute each rule over its pair of inner inames.
    for rule_name, sweep in (
            ("a_acc", "k_inner,i_inner"),
            ("b_acc", "j_inner,k_inner")):
        knl = lp.precompute(
            knl,
            rule_name,
            sweep,
            precompute_outer_inames="i_outer, j_outer, k_outer",
            default_tag="l.auto")

    knl = lp.buffer_array(
        knl,
        "c",
        buffer_inames=buffer_inames,
        init_expression="0",
        store_expression="base+buffer")

    lp.auto_test_vs_ref(
        ref_knl,
        ctx,
        knl,
        parameters={"n": 128, "m": 128, "ell": 128})