Python generate_loop_schedulesの例、loopy.generate_loop_schedules Pythonの例

コード例 #1

0

ファイルを表示

ファイル: test_tim.py プロジェクト: inducer/loopy

def test_tim2d(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
           [
            "ur(a,b) := sum_float32(@o, D[a,o]*u[e,o,b])",
            "us(a,b) := sum_float32(@o, D[b,o]*u[e,a,o])",

            "lap[e,i,j]  = "
            "  sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j) + G[1,e,m,j]*us(m,j)))"
            "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m) + G[2,e,i,m]*us(i,m)))"

            ],
            [
            lp.ArrayArg("u", dtype, shape=field_shape, order=order),
            lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
            lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
#            lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
            lp.ArrayArg("D", dtype, shape=(n, n), order=order),
#            lp.ImageArg("D", dtype, shape=(n, n)),
            lp.ValueArg("K", np.int32, approximately=1000),
            ],
             name="semlap2D", assumptions="K>=1")

    unroll = 32

    seq_knl = knl
    knl = lp.add_prefetch(knl, "D", ["m", "j", "i","o"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j",  "o"], default_tag="l.auto")
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")#, slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))


#    knl = lp.add_prefetch(knl, "G", [2,3], default_tag=None) # axis/argument indices on G
    knl = lp.add_prefetch(knl, "G", [2,3], default_tag="l.auto") # axis/argument indices on G

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})

コード例 #2

0

ファイルを表示

ファイル: test_linalg.py プロジェクト: rckirby/loopy

def test_ilp_race_matmul(ctx_factory):
    dtype = np.float32
    order = "C"

    n = 9

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.ImageArg("a", dtype, shape=(n, n)),
                lp.ImageArg("b", dtype, shape=(n, n)),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                ],
            name="matmul")

    knl = lp.split_iname(knl, "j", 2, outer_tag="ilp", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 2)
    knl = lp.add_prefetch(knl, 'b', ["k_inner"])

    with lp.CacheMode(False):
        from loopy.diagnostic import WriteRaceConditionWarning
        from warnings import catch_warnings
        with catch_warnings(record=True) as warn_list:
            knl = lp.preprocess_kernel(knl)
            list(lp.generate_loop_schedules(knl))

            assert any(isinstance(w.message, WriteRaceConditionWarning)
                    for w in warn_list)

コード例 #3

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_ilp_write_race_detection_global(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "[n] -> {[i,j]: 0<=i,j<n }",
            [
                "a[i] = 5+i+j",
                ],
            [
                lp.GlobalArg("a", np.float32),
                lp.ValueArg("n", np.int32, approximately=1000),
                ],
            assumptions="n>=1")

    knl = lp.tag_inames(knl, dict(j="ilp"))

    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    with lp.CacheMode(False):
        from loopy.diagnostic import WriteRaceConditionWarning
        from warnings import catch_warnings
        with catch_warnings(record=True) as warn_list:
            list(lp.generate_loop_schedules(knl))

            assert any(isinstance(w.message, WriteRaceConditionWarning)
                    for w in warn_list)

コード例 #4

0

ファイルを表示

def test_tim2d(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
           [
            "ur(a,b) := sum_float32(@o, D[a,o]*u[e,o,b])",
            "us(a,b) := sum_float32(@o, D[b,o]*u[e,a,o])",

            "lap[e,i,j]  = "
            "  sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j) + G[1,e,m,j]*us(m,j)))"
            "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m) + G[2,e,i,m]*us(i,m)))"

            ],
            [
            lp.ArrayArg("u", dtype, shape=field_shape, order=order),
            lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
            lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
#            lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
            lp.ArrayArg("D", dtype, shape=(n, n), order=order),
#            lp.ImageArg("D", dtype, shape=(n, n)),
            lp.ValueArg("K", np.int32, approximately=1000),
            ],
             name="semlap2D", assumptions="K>=1")

    unroll = 32

    seq_knl = knl
    knl = lp.add_prefetch(knl, "D", ["m", "j", "i","o"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j",  "o"], default_tag="l.auto")
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")#, slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))


#    knl = lp.add_prefetch(knl, "G", [2,3], default_tag=None) # axis/argument indices on G
    knl = lp.add_prefetch(knl, "G", [2,3], default_tag="l.auto") # axis/argument indices on G

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})

コード例 #5

0

ファイルを表示

ファイル: test_domain.py プロジェクト: yueyedeai/loopy

def test_divisibility_assumption(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "[n] -> {[i]: 0<=i<n}",
            [
                "b[i] = 2*a[i]"
                ],
            [
                lp.GlobalArg("a", np.float32, shape=("n",)),
                lp.GlobalArg("b", np.float32, shape=("n",)),
                lp.ValueArg("n", np.int32),
                ],
            assumptions="n>=1 and (exists zz: n = 16*zz)")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16)

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        code = lp.generate_code(k)
        assert "if" not in code

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters={"n": 16**3})

コード例 #6

0

ファイルを表示

ファイル: test_domain.py プロジェクト: cmsquared/loopy

def test_divisibility_assumption(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "[n] -> {[i]: 0<=i<n}",
            [
                "b[i] = 2*a[i]"
                ],
            [
                lp.GlobalArg("a", np.float32, shape=("n",)),
                lp.GlobalArg("b", np.float32, shape=("n",)),
                lp.ValueArg("n", np.int32),
                ],
            assumptions="n>=1 and (exists zz: n = 16*zz)")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16)

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        code = lp.generate_code(k)
        assert "if" not in code

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters={"n": 16**3})

コード例 #7

0

ファイルを表示

def test_interp_diff(ctx_factory):
    1/0 # not ready
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8
    M = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (N, N, N, K_sym)
    interim_field_shape = (M, M, M, K_sym)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal 
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,ip,j,jp,k,kp,e]: 0<=i,j,k<%d AND 0<=ip,jp,kp<%d 0<=e<K}" %M %N
            [
                "[|i,jp,kp] <float32>  u1[i ,jp,kp,e] = sum_float32(ip, I[i,ip]*u [ip,jp,kp,e])",
                "[|i,j ,kp] <float32>  u2[i ,j ,kp,e] = sum_float32(jp, I[j,jp]*u1[i ,jp,kp,e])",
                "[|i,j ,k ] <float32>  u3[i ,j ,k ,e] = sum_float32(kp, I[k,kp]*u2[i ,j ,kp,e])",
                "[|i,j ,k ] <float32>  Pu[i ,j ,k ,e] = P[i,j,k,e]*u3[i,j,k,e]",
                "[|i,j ,kp] <float32> Pu3[i ,j ,kp,e] = sum_float32(k, V[kp,k]*Pu[i ,j , k,e])",
                "[|i,jp,kp] <float32> Pu2[i ,jp,kp,e] = sum_float32(j, V[jp,j]*Pu[i ,j ,kp,e])",
                "Pu[ip,jp,kp,e] = sum_float32(i, V[ip,i]*Pu[i ,jp,kp,e])",
                ],
            [
            lp.GlobalArg("u",   dtype, shape=field_shape, order=order),
            lp.GlobalArg("P",   dtype, shape=interim_field_shape, order=order),
            lp.GlobalArg("I",   dtype, shape=(M, N), order=order),
            lp.GlobalArg("V",   dtype, shape=(N, M), order=order),
            lp.GlobalArg("Pu",  dtype, shape=field_shape, order=order),
            lp.ValueArg("K",  np.int32, approximately=1000),
            ],
            name="sem_lap_precon", assumptions="K>=1")

    print(knl)
    1/0

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    print(knl)
    #1/0

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)

    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=0,
            op_label="GFlops",
            parameters={"K": K}, print_seq_code=True,)

コード例 #8

0

ファイルを表示

ファイル: test_sem.py プロジェクト: cmsquared/loopy

def test_interp_diff(ctx_factory):
    1/0 # not ready
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8
    M = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (N, N, N, K_sym)
    interim_field_shape = (M, M, M, K_sym)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal 
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,ip,j,jp,k,kp,e]: 0<=i,j,k<%d AND 0<=ip,jp,kp<%d 0<=e<K}" %M %N
            [
                "[|i,jp,kp] <float32>  u1[i ,jp,kp,e] = sum_float32(ip, I[i,ip]*u [ip,jp,kp,e])",
                "[|i,j ,kp] <float32>  u2[i ,j ,kp,e] = sum_float32(jp, I[j,jp]*u1[i ,jp,kp,e])",
                "[|i,j ,k ] <float32>  u3[i ,j ,k ,e] = sum_float32(kp, I[k,kp]*u2[i ,j ,kp,e])",
                "[|i,j ,k ] <float32>  Pu[i ,j ,k ,e] = P[i,j,k,e]*u3[i,j,k,e]",
                "[|i,j ,kp] <float32> Pu3[i ,j ,kp,e] = sum_float32(k, V[kp,k]*Pu[i ,j , k,e])",
                "[|i,jp,kp] <float32> Pu2[i ,jp,kp,e] = sum_float32(j, V[jp,j]*Pu[i ,j ,kp,e])",
                "Pu[ip,jp,kp,e] = sum_float32(i, V[ip,i]*Pu[i ,jp,kp,e])",
                ],
            [
            lp.GlobalArg("u",   dtype, shape=field_shape, order=order),
            lp.GlobalArg("P",   dtype, shape=interim_field_shape, order=order),
            lp.GlobalArg("I",   dtype, shape=(M, N), order=order),
            lp.GlobalArg("V",   dtype, shape=(N, M), order=order),
            lp.GlobalArg("Pu",  dtype, shape=field_shape, order=order),
            lp.ValueArg("K",  np.int32, approximately=1000),
            ],
            name="sem_lap_precon", assumptions="K>=1")

    print(knl)
    1/0

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    print(knl)
    #1/0

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)

    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=0,
            op_label="GFlops",
            parameters={"K": K}, print_seq_code=True,)

コード例 #9

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_ilp_write_race_avoidance_local(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel("{[i,j]: 0<=i<16 and 0<=j<17 }", [
        "<> a[i] = 5+i+j",
    ], [])

    knl = lp.tag_inames(knl, dict(i="l.0", j="ilp"))

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        assert k.temporary_variables["a"].shape == (16, 17)

コード例 #10

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_ilp_write_race_avoidance_private(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel("{[j]: 0<=j<16 }", [
        "<> a = 5+j",
    ], [])

    knl = lp.tag_inames(knl, dict(j="ilp"))

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        assert k.temporary_variables["a"].shape == (16, )

コード例 #11

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_variable_size_temporary():
    knl = lp.make_kernel(''' { [i,j]: 0<=i,j<n } ''',
                         ''' out[i] = sum(j, a[i,j])''')

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    knl = lp.add_prefetch(knl, "a[:,:]", default_tag=None)

    # Make sure that code generation succeeds even if
    # there are variable-length arrays.
    knl = lp.preprocess_kernel(knl)
    for k in lp.generate_loop_schedules(knl):
        lp.generate_code(k)

コード例 #12

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_owed_barriers(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel("{[i]: 0<=i<100}", ["<float32> z[i] = a[i]"],
                         [lp.GlobalArg("a", np.float32, shape=(100, ))])

    knl = lp.tag_inames(knl, dict(i="l.0"))

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    for gen_knl in kernel_gen:
        compiled = lp.CompiledKernel(ctx, gen_knl)
        print(compiled.get_code())

コード例 #13

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_simple_side_effect(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<100}", """
                a[i] = a[i] + 1
                """, [lp.GlobalArg("a", np.float32, shape=(100, ))])

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    for gen_knl in kernel_gen:
        print(gen_knl)
        compiled = lp.CompiledKernel(ctx, gen_knl)
        print(compiled.get_code())

コード例 #14

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_ilp_write_race_avoidance_private(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[j]: 0<=j<16 }",
            [
                "<> a = 5+j",
                ],
            [])

    knl = lp.tag_inames(knl, dict(j="ilp"))

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        assert k.temporary_variables["a"].shape == (16,)

コード例 #15

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_ilp_write_race_avoidance_local(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i,j]: 0<=i<16 and 0<=j<17 }",
            [
                "<> a[i] = 5+i+j",
                ],
            [])

    knl = lp.tag_inames(knl, dict(i="l.0", j="ilp"))

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        assert k.temporary_variables["a"].shape == (16, 17)

コード例 #16

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_variable_size_temporary():
    knl = lp.make_kernel(
         ''' { [i,j]: 0<=i,j<n } ''',
         ''' out[i] = sum(j, a[i,j])''')

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    knl = lp.add_prefetch(
            knl, "a[:,:]", default_tag=None)

    # Make sure that code generation succeeds even if
    # there are variable-length arrays.
    knl = lp.preprocess_kernel(knl)
    for k in lp.generate_loop_schedules(knl):
        lp.generate_code(k)

コード例 #17

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_multiple_writes_to_local_temporary():
    # Loopy would previously only handle barrier insertion correctly if exactly
    # one instruction wrote to each local temporary. This tests that multiple
    # writes are OK.

    knl = lp.make_kernel(
        "{[i,e]: 0<=i<5 and 0<=e<nelements}", """
        <> temp[i, 0] = 17
        temp[i, 1] = 15
        """)
    knl = lp.tag_inames(knl, dict(i="l.0"))

    knl = lp.preprocess_kernel(knl)
    for k in lp.generate_loop_schedules(knl):
        code, _ = lp.generate_code(k)
        print(code)

コード例 #18

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_multiple_writes_to_local_temporary():
    # Loopy would previously only handle barrier insertion correctly if exactly
    # one instruction wrote to each local temporary. This tests that multiple
    # writes are OK.

    knl = lp.make_kernel(
        "{[i,e]: 0<=i<5 and 0<=e<nelements}",
        """
        <> temp[i, 0] = 17
        temp[i, 1] = 15
        """)
    knl = lp.tag_inames(knl, dict(i="l.0"))

    knl = lp.preprocess_kernel(knl)
    for k in lp.generate_loop_schedules(knl):
        code, _ = lp.generate_code(k)
        print(code)

コード例 #19

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_wg_too_small(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel("{[i]: 0<=i<100}",
                         ["<float32> z[i] = a[i] {id=copy}"],
                         [lp.GlobalArg("a", np.float32, shape=(100, ))],
                         local_sizes={0: 16})

    knl = lp.tag_inames(knl, dict(i="l.0"))

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    import pytest
    for gen_knl in kernel_gen:
        with pytest.raises(RuntimeError):
            lp.CompiledKernel(ctx, gen_knl).get_code()

コード例 #20

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_multi_cse(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel("{[i]: 0<=i<100}",
                         ["<float32> z[i] = a[i] + a[i]**2"],
                         [lp.GlobalArg("a", np.float32, shape=(100, ))],
                         local_sizes={0: 16})

    knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
    knl = lp.add_prefetch(knl, "a", [])

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    for gen_knl in kernel_gen:
        compiled = lp.CompiledKernel(ctx, gen_knl)
        print(compiled.get_code())

コード例 #21

0

ファイルを表示

ファイル: test_domain.py プロジェクト: shigh/loopy

def test_eq_constraint(ctx_factory):
    logging.basicConfig(level=logging.INFO)

    ctx = ctx_factory()

    knl = lp.make_kernel("{[i,j]: 0<= i,j < 32}", ["a[i] = b[i]"], [
        lp.GlobalArg("a", np.float32, shape=(1000, )),
        lp.GlobalArg("b", np.float32, shape=(1000, ))
    ])

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
    knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    for knl in kernel_gen:
        print(lp.generate_code(knl))

コード例 #22

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_simple_side_effect(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<100}",
            """
                a[i] = a[i] + 1
                """,
            [lp.GlobalArg("a", np.float32, shape=(100,))]
            )

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    for gen_knl in kernel_gen:
        print(gen_knl)
        compiled = lp.CompiledKernel(ctx, gen_knl)
        print(compiled.get_code())

コード例 #23

0

ファイルを表示

ファイル: test_domain.py プロジェクト: shigh/loopy

def test_assume(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel("{[i]: 0<=i<n}", "a[i] = a[i] + 1",
                         [lp.GlobalArg("a", np.float32, shape="n"), "..."])

    knl = lp.split_iname(knl, "i", 16)
    knl = lp.set_loop_priority(knl, "i_outer,i_inner")
    knl = lp.assume(knl, "n mod 16 = 0")
    knl = lp.assume(knl, "n > 10")
    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    for gen_knl in kernel_gen:
        print(gen_knl)
        compiled = lp.CompiledKernel(ctx, gen_knl)
        print(compiled.get_code())
        assert "if" not in compiled.get_code()

コード例 #24

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_type_inference_no_artificial_doubles(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel("{[i]: 0<=i<n}",
                         """
                <> bb = a[i] - b[i]
                c[i] = bb
                """, [
                             lp.GlobalArg("a", np.float32, shape=("n", )),
                             lp.GlobalArg("b", np.float32, shape=("n", )),
                             lp.GlobalArg("c", np.float32, shape=("n", )),
                             lp.ValueArg("n", np.int32),
                         ],
                         assumptions="n>=1")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        code = lp.generate_code(k)
        assert "double" not in code

コード例 #25

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_owed_barriers(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0<=i<100}",
            [
                "<float32> z[i] = a[i]"
                ],
            [lp.GlobalArg("a", np.float32, shape=(100,))]
            )

    knl = lp.tag_inames(knl, dict(i="l.0"))

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    for gen_knl in kernel_gen:
        compiled = lp.CompiledKernel(ctx, gen_knl)
        print(compiled.get_code())

コード例 #26

0

ファイルを表示

ファイル: test_domain.py プロジェクト: cmsquared/loopy

def test_dependent_loop_bounds_3(ctx_factory):
    # The point of this test is that it shows a dependency between
    # domains that is exclusively mediated by the row_len temporary.
    # It also makes sure that row_len gets read before any
    # conditionals use it.

    dtype = np.dtype(np.float32)
    ctx = ctx_factory()

    knl = lp.make_kernel(
            [
                "{[i]: 0<=i<n}",
                "{[jj]: 0<=jj<row_len}",
                ],
            [
                "<> row_len = a_row_lengths[i]",
                "a[i,jj] = 1",
                ],
            [
                lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
                lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
                lp.ValueArg("n", np.int32),
                ])

    assert knl.parents_per_domain()[1] == 0

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0",
            inner_tag="l.0")

    cknl = lp.CompiledKernel(ctx, knl)
    print("---------------------------------------------------")
    print(cknl.get_highlighted_code())
    print("---------------------------------------------------")

    knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1",
            inner_tag="l.1")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    import pytest
    with pytest.raises(RuntimeError):
        list(lp.generate_loop_schedules(knl_bad))

コード例 #27

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_wg_too_small(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0<=i<100}",
            [
                "<float32> z[i] = a[i] {id=copy}"
                ],
            [lp.GlobalArg("a", np.float32, shape=(100,))],
            local_sizes={0: 16})

    knl = lp.tag_inames(knl, dict(i="l.0"))

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    import pytest
    for gen_knl in kernel_gen:
        with pytest.raises(RuntimeError):
            lp.CompiledKernel(ctx, gen_knl).get_code()

コード例 #28

0

ファイルを表示

ファイル: test_domain.py プロジェクト: cmsquared/loopy

def test_assume(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "a[i] = a[i] + 1",
            [lp.GlobalArg("a", np.float32, shape="n"), "..."])

    knl = lp.split_iname(knl, "i", 16)
    knl = lp.set_loop_priority(knl, "i_outer,i_inner")
    knl = lp.assume(knl, "n mod 16 = 0")
    knl = lp.assume(knl, "n > 10")
    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    for gen_knl in kernel_gen:
        print(gen_knl)
        compiled = lp.CompiledKernel(ctx, gen_knl)
        print(compiled.get_code())
        assert "if" not in compiled.get_code()

コード例 #29

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_multi_cse(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0<=i<100}",
            [
                "<float32> z[i] = a[i] + a[i]**2"
                ],
            [lp.GlobalArg("a", np.float32, shape=(100,))],
            local_sizes={0: 16})

    knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
    knl = lp.add_prefetch(knl, "a", [])

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    for gen_knl in kernel_gen:
        compiled = lp.CompiledKernel(ctx, gen_knl)
        print(compiled.get_code())

コード例 #30

0

ファイルを表示

ファイル: test_domain.py プロジェクト: yueyedeai/loopy

def test_dependent_loop_bounds_3(ctx_factory):
    # The point of this test is that it shows a dependency between
    # domains that is exclusively mediated by the row_len temporary.
    # It also makes sure that row_len gets read before any
    # conditionals use it.

    dtype = np.dtype(np.float32)
    ctx = ctx_factory()

    knl = lp.make_kernel(
            [
                "{[i]: 0<=i<n}",
                "{[jj]: 0<=jj<row_len}",
                ],
            [
                "<> row_len = a_row_lengths[i]",
                "a[i,jj] = 1",
                ],
            [
                lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
                lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
                lp.ValueArg("n", np.int32),
                ])

    assert knl.parents_per_domain()[1] == 0

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0",
            inner_tag="l.0")

    cknl = lp.CompiledKernel(ctx, knl)
    print("---------------------------------------------------")
    print(cknl.get_highlighted_code())
    print("---------------------------------------------------")

    knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1",
            inner_tag="l.1")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    with pytest.raises(RuntimeError):
        list(lp.generate_loop_schedules(knl_bad))

コード例 #31

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_type_inference_no_artificial_doubles(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            """
                <> bb = a[i] - b[i]
                c[i] = bb
                """,
            [
                lp.GlobalArg("a", np.float32, shape=("n",)),
                lp.GlobalArg("b", np.float32, shape=("n",)),
                lp.GlobalArg("c", np.float32, shape=("n",)),
                lp.ValueArg("n", np.int32),
                ],
            assumptions="n>=1")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        code = lp.generate_code(k)
        assert "double" not in code

コード例 #32

0

ファイルを表示

ファイル: test_domain.py プロジェクト: cmsquared/loopy

def test_eq_constraint(ctx_factory):
    logging.basicConfig(level=logging.INFO)

    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i,j]: 0<= i,j < 32}",
            [
                "a[i] = b[i]"
                ],
            [
                lp.GlobalArg("a", np.float32, shape=(1000,)),
                lp.GlobalArg("b", np.float32, shape=(1000,))
                ])

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
    knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    for knl in kernel_gen:
        print(lp.generate_code(knl))

コード例 #33

0

ファイルを表示

ファイル: test_loopy.py プロジェクト: dokempf/loopy

def test_ilp_write_race_detection_global(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel("[n] -> {[i,j]: 0<=i,j<n }", [
        "a[i] = 5+i+j",
    ], [
        lp.GlobalArg("a", np.float32),
        lp.ValueArg("n", np.int32, approximately=1000),
    ],
                         assumptions="n>=1")

    knl = lp.tag_inames(knl, dict(j="ilp"))

    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    with lp.CacheMode(False):
        from loopy.diagnostic import WriteRaceConditionWarning
        from warnings import catch_warnings
        with catch_warnings(record=True) as warn_list:
            list(lp.generate_loop_schedules(knl))

            assert any(
                isinstance(w.message, WriteRaceConditionWarning)
                for w in warn_list)

コード例 #34

0

ファイルを表示

ファイル: test_sem.py プロジェクト: cmsquared/loopy

def test_laplacian(ctx_factory):
    1/0 # not adapted to new language

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # load:     1+6 fields + 1/N D entry
    # store:    1   fields
    # perform:  N*2*6 + 3*5 flops
    # ratio:   (12*N+15)/8  flops per 4 bytes on bus 
    #          ~ 14 FLOPS per 4 bytes at N=8
    #          ~ 525 GFLOPS max on a 150GB/s device at N=8 if done perfectly

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,e,m,o1,o2,o3,gi]: 0<=i,j,k,m,o1,o2,o3<%d and 0<=e<K and 0<=gi<6}" % n,
            [
                "CSE: ur(i,j,k) = sum_float32(o1, D[i,o1]*cse(u[e,o1,j,k], urf))",
                "CSE: us(i,j,k) = sum_float32(o2, D[j,o2]*cse(u[e,i,o2,k], usf))",
                "CSE: ut(i,j,k) = sum_float32(o3, D[k,o3]*cse(u[e,i,j,o3], utf))",

                # define function
                "CSE: Gu(i,j,k) = G[0,e,i,j,k]*ur(i,j,k) + G[1,e,i,j,k]*us(i,j,k) + G[2,e,i,j,k]*ut(i,j,k)",
                "CSE: Gv(i,j,k) = G[1,e,i,j,k]*ur(i,j,k) + G[3,e,i,j,k]*us(i,j,k) + G[4,e,i,j,k]*ut(i,j,k)",
                "CSE: Gw(i,j,k) = G[2,e,i,j,k]*ur(i,j,k) + G[4,e,i,j,k]*us(i,j,k) + G[5,e,i,j,k]*ut(i,j,k)",

                "lap[e,i,j,k]  = "
                "  sum_float32(m, D[m,i]*Gu(m,j,k))"
                "+ sum_float32(m, D[m,j]*Gv(i,m,k))"
                "+ sum_float32(m, D[m,k]*Gw(i,j,m))"
                ],
            [
            lp.GlobalArg("u", dtype, shape=field_shape, order=order),
            lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
            lp.GlobalArg("G", dtype, shape=(6,)+field_shape, order=order),
            lp.GlobalArg("D", dtype, shape=(n, n), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
            ],
            name="semlap", assumptions="K>=1")

    #print lp.preprocess_kernel(knl, cse_ok=True)
    #1/0
    #
    #print knl
    #1/0
    knl = lp.realize_cse(knl, "urf", np.float32, ["o1"])
    knl = lp.realize_cse(knl, "usf", np.float32, ["o2"])
    knl = lp.realize_cse(knl, "utf", np.float32, ["o3"])

    knl = lp.realize_cse(knl, "Gu", np.float32, ["m", "j", "k"])
    knl = lp.realize_cse(knl, "Gv", np.float32, ["i", "m", "k"])
    knl = lp.realize_cse(knl, "Gw", np.float32, ["i", "j", "m"])

    knl = lp.realize_cse(knl, "ur", np.float32, ["k", "j", "m"])
    knl = lp.realize_cse(knl, "us", np.float32, ["i", "m", "k"])
    knl = lp.realize_cse(knl, "ut", np.float32, ["i", "j", "m"])

    if 0:
        pass
        #seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
        #seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"])
        #seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]")
    else:
        seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))

    knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
    knl = lp.add_prefetch(knl, "D", ["m", "j"])
    #knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]")

    #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    #print seq_knl
    #print lp.preprocess_kernel(knl)
    #1/0

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl,
            loop_priority=["m_fetch_G", "i_fetch_u"])
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*n*2*3 + n*n*n*5*3 + n**4 * 2*3)/1e9,
            op_label="GFlops",
            parameters={"K": K}, print_seq_code=True)

コード例 #35

0

ファイルを表示

ファイル: test_sem.py プロジェクト: cmsquared/loopy

def test_laplacian_lmem(ctx_factory):
    1/0 # not adapted to new language

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}" % n,
            [
                "CSE: ur(i,j,k) = sum_float32(@o, D[i,o]*u[e,o,j,k])",
                "CSE: us(i,j,k) = sum_float32(@o, D[j,o]*u[e,i,o,k])",
                "CSE: ut(i,j,k) = sum_float32(@o, D[k,o]*u[e,i,j,o])",

                "lap[e,i,j,k]  = "
                "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
                "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
                ],
            [
            lp.GlobalArg("u", dtype, shape=field_shape, order=order),
            lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
            lp.GlobalArg("G", dtype, shape=(6,)+field_shape, order=order),
            lp.GlobalArg("D", dtype, shape=(n, n), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
            ],
            name="semlap", assumptions="K>=1")


    knl = lp.realize_cse(knl, "ur", np.float32, ["k", "j", "m"])
    knl = lp.realize_cse(knl, "us", np.float32, ["i", "m", "k"])
    knl = lp.realize_cse(knl, "ut", np.float32, ["i", "j", "m"])

    if 0:
        seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
        seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"])
        seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]")
    else:
        seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))

    knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
    knl = lp.add_prefetch(knl, "D", ["m", "j"])
    knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]")
    #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    #print seq_knl
    #print lp.preprocess_kernel(knl)
    #1/0

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*n*2*3 + n*n*n*5*3 + n**4 * 2*3)/1e9,
            op_label="GFlops",
            parameters={"K": K}, print_seq_code=True)

コード例 #36

0

ファイルを表示

def test_advect_dealias(ctx_factory):
    1 / 0  # not ready

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8
    M = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (N, N, N, K_sym)
    interim_field_shape = (M, M, M, K_sym)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic
    knl = lp.make_kernel(
        ctx.devices[0],
        "[K] -> {[i,ip,j,jp,k,kp,m,e]: 0<=i,j,k,m<%d AND 0<=o,ip,jp,kp<%d 0<=e<K}"
        % M % N[

            # interpolate u to integration nodes
            "CSE:  u0[i,jp,kp,e] = sum_float32(@o, I[i,o]*u[o,jp,kp,e])",
            "CSE:  u1[i,j,kp,e]  = sum_float32(@o, I[j,o]*u0[i,o,kp,e])",
            "CSE:  Iu[i,j,k,e]   = sum_float32(@o, I[k,o]*u1[i,j,o,e])",

            # differentiate u on integration nodes
            "CSE:  Iur[i,j,k,e]  = sum_float32(@m, D[i,m]*Iu[m,j,k,e])",
            "CSE:  Ius[i,j,k,e]  = sum_float32(@m, D[j,m]*Iu[i,m,k,e])",
            "CSE:  Iut[i,j,k,e]  = sum_float32(@m, D[k,m]*Iu[i,j,m,e])",

            # interpolate v to integration nodes
            "CSE:  v0[i,jp,kp,e] = sum_float32(@o, I[i,o]*v[o,jp,kp,e])",
            "CSE:  v1[i,j,kp,e]  = sum_float32(@o, I[j,o]*v0[i,o,kp,e])",
            "CSE:  Iv[i,j,k,e]   = sum_float32(@o, I[k,o]*v1[i,j,o,e])",

            # differentiate v on integration nodes
            "CSE:  Ivr[i,j,k,e]  = sum_float32(@m, D[i,m]*Iv[m,j,k,e])",
            "CSE:  Ivs[i,j,k,e]  = sum_float32(@m, D[j,m]*Iv[i,m,k,e])",
            "CSE:  Ivt[i,j,k,e]  = sum_float32(@m, D[k,m]*Iv[i,j,m,e])",

            # interpolate w to integration nodes
            "CSE:  w0[i,jp,kp,e] = sum_float32(@o, I[i,o]*w[o,jp,kp,e])",
            "CSE:  w1[i,j,kp,e]  = sum_float32(@o, I[j,o]*w0[i,o,kp,e])",
            "CSE:  Iw[i,j,k,e]   = sum_float32(@o, I[k,o]*w1[i,j,o,e])",

            # differentiate v on integration nodes
            "CSE:  Iwr[i,j,k,e]  = sum_float32(@m, D[i,m]*Iw[m,j,k,e])",
            "CSE:  Iws[i,j,k,e]  = sum_float32(@m, D[j,m]*Iw[i,m,k,e])",
            "CSE:  Iwt[i,j,k,e]  = sum_float32(@m, D[k,m]*Iw[i,j,m,e])",

            # find velocity in (r,s,t) coordinates
            # QUESTION: should I use CSE here ?
            "CSE: Vr[i,j,k,e] = G[i,j,k,0,e]*Iu[i,j,k,e] + G[i,j,k,1,e]*Iv[i,j,k,e] + G[i,j,k,2,e]*Iw[i,j,k,e]",
            "CSE: Vs[i,j,k,e] = G[i,j,k,3,e]*Iu[i,j,k,e] + G[i,j,k,4,e]*Iv[i,j,k,e] + G[i,j,k,5,e]*Iw[i,j,k,e]",
            "CSE: Vt[i,j,k,e] = G[i,j,k,6,e]*Iu[i,j,k,e] + G[i,j,k,7,e]*Iv[i,j,k,e] + G[i,j,k,8,e]*Iw[i,j,k,e]",

            # form nonlinear term on integration nodes
            # QUESTION: should I use CSE here ?
            "<SE: Nu[i,j,k,e] = Vr[i,j,k,e]*Iur[i,j,k,e]+Vs[i,j,k,e]*Ius[i,j,k,e]+Vt[i,j,k,e]*Iut[i,j,k,e]",
            "<SE: Nv[i,j,k,e] = Vr[i,j,k,e]*Ivr[i,j,k,e]+Vs[i,j,k,e]*Ivs[i,j,k,e]+Vt[i,j,k,e]*Ivt[i,j,k,e]",
            "<SE: Nw[i,j,k,e] = Vr[i,j,k,e]*Iwr[i,j,k,e]+Vs[i,j,k,e]*Iws[i,j,k,e]+Vt[i,j,k,e]*Iwt[i,j,k,e]",

            # L2 project Nu back to Lagrange basis
            "CSE: Nu2[ip,j,k,e]   = sum_float32(@m, V[ip,m]*Nu[m,j,k,e])",
            "CSE: Nu1[ip,jp,k,e]  = sum_float32(@m, V[jp,m]*Nu2[ip,m,k,e])",
            "INu[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nu1[ip,jp,m,e])",

            # L2 project Nv back to Lagrange basis
            "CSE: Nv2[ip,j,k,e]   = sum_float32(@m, V[ip,m]*Nv[m,j,k,e])",
            "CSE: Nv1[ip,jp,k,e]  = sum_float32(@m, V[jp,m]*Nv2[ip,m,k,e])",
            "INv[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nv1[ip,jp,m,e])",

            # L2 project Nw back to Lagrange basis
            "CSE: Nw2[ip,j,k,e]   = sum_float32(@m, V[ip,m]*Nw[m,j,k,e])",
            "CSE: Nw1[ip,jp,k,e]  = sum_float32(@m, V[jp,m]*Nw2[ip,m,k,e])",
            "INw[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nw1[ip,jp,m,e])", ],
        [
            lp.GlobalArg("u", dtype, shape=field_shape, order=order),
            lp.GlobalArg("v", dtype, shape=field_shape, order=order),
            lp.GlobalArg("w", dtype, shape=field_shape, order=order),
            lp.GlobalArg("INu", dtype, shape=field_shape, order=order),
            lp.GlobalArg("INv", dtype, shape=field_shape, order=order),
            lp.GlobalArg("INw", dtype, shape=field_shape, order=order),
            lp.GlobalArg("D", dtype, shape=(M, M), order=order),
            lp.GlobalArg("I", dtype, shape=(M, N), order=order),
            lp.GlobalArg("V", dtype, shape=(N, M), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
        ],
        name="sem_advect",
        assumptions="K>=1")

    print(knl)
    1 / 0

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    print(knl)
    #1/0

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)

    K = 1000
    lp.auto_test_vs_ref(
        seq_knl,
        ctx,
        kernel_gen,
        op_count=0,
        op_label="GFlops",
        parameters={"K": K},
        print_seq_code=True,
    )

コード例 #37

0

ファイルを表示

def test_advect(ctx_factory):
    1 / 0  # not ready

    dtype = np.float32
    ctx = ctx_factory()

    order = "C"

    N = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, N, N, N)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic

    # A. updated for CSE: notation.
    # B. fixed temp indexing and C ordering
    # load:     3+9 fields + 1/N D entry
    # store:    3   fields
    # perform:  N*2*6 + 3*5 + 3*5 flops
    # ratio:   (12*N+30)/15  flops per 4 bytes on bus
    #          ~ 8.4 FLOPS per 4 bytes at N=8
    #          ~ 300 GFLOPS max on a 150GB/s device at N=8 if done perfectly
    knl = lp.make_kernel(
        ctx.devices[0],
        "[K] -> {[i,j,k,m,e]: 0<=i,j,k,m<%d AND 0<=e<K}" % N,
        [
            # differentiate u
            "CSE:  ur(i,j,k) = sum_float32(@m, D[i,m]*u[e,m,j,k])",
            "CSE:  us(i,j,k) = sum_float32(@m, D[j,m]*u[e,i,m,k])",
            "CSE:  ut(i,j,k) = sum_float32(@m, D[k,m]*u[e,i,j,m])",

            # differentiate v
            "CSE:  vr(i,j,k) = sum_float32(@m, D[i,m]*v[e,m,j,k])",
            "CSE:  vs(i,j,k) = sum_float32(@m, D[j,m]*v[e,i,m,k])",
            "CSE:  vt(i,j,k) = sum_float32(@m, D[k,m]*v[e,i,j,m])",

            # differentiate w
            "CSE:  wr(i,j,k) = sum_float32(@m, D[i,m]*w[e,m,j,k])",
            "CSE:  ws(i,j,k) = sum_float32(@m, D[j,m]*w[e,i,m,k])",
            "CSE:  wt(i,j,k) = sum_float32(@m, D[k,m]*w[e,i,j,m])",

            # find velocity in (r,s,t) coordinates
            # CSE?
            "CSE: Vr(i,j,k) = G[0,e,i,j,k]*u[e,i,j,k] + G[1,e,i,j,k]*v[e,i,j,k] + G[2,e,i,j,k]*w[e,i,j,k]",
            "CSE: Vs(i,j,k) = G[3,e,i,j,k]*u[e,i,j,k] + G[4,e,i,j,k]*v[e,i,j,k] + G[5,e,i,j,k]*w[e,i,j,k]",
            "CSE: Vt(i,j,k) = G[6,e,i,j,k]*u[e,i,j,k] + G[7,e,i,j,k]*v[e,i,j,k] + G[8,e,i,j,k]*w[e,i,j,k]",

            # form nonlinear term on integration nodes
            "Nu[e,i,j,k] = Vr(i,j,k)*ur(i,j,k)+Vs(i,j,k)*us(i,j,k)+Vt(i,j,k)*ut(i,j,k)",
            "Nv[e,i,j,k] = Vr(i,j,k)*vr(i,j,k)+Vs(i,j,k)*vs(i,j,k)+Vt(i,j,k)*vt(i,j,k)",
            "Nw[e,i,j,k] = Vr(i,j,k)*wr(i,j,k)+Vs(i,j,k)*ws(i,j,k)+Vt(i,j,k)*wt(i,j,k)",
        ],
        [
            lp.GlobalArg("u", dtype, shape=field_shape, order=order),
            lp.GlobalArg("v", dtype, shape=field_shape, order=order),
            lp.GlobalArg("w", dtype, shape=field_shape, order=order),
            lp.GlobalArg("Nu", dtype, shape=field_shape, order=order),
            lp.GlobalArg("Nv", dtype, shape=field_shape, order=order),
            lp.GlobalArg("Nw", dtype, shape=field_shape, order=order),
            lp.GlobalArg("G", dtype, shape=(9, ) + field_shape, order=order),
            lp.GlobalArg("D", dtype, shape=(N, N), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
        ],
        name="sem_advect",
        assumptions="K>=1")

    print(knl)
    1 / 0

    seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)

    K = 1000
    lp.auto_test_vs_ref(
        seq_knl,
        ctx,
        kernel_gen,
        op_count=0,
        op_label="GFlops",
        parameters={"K": K},
        print_seq_code=True,
    )

コード例 #38

0

ファイルを表示

ファイル: test_sem_tim.py プロジェクト: cmsquared/loopy

def test_laplacian_lmem(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 4

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}" % n,
            [
                "ur(a,b,c) := sum_float32(@o, D[a,o]*u[e,o,b,c])",
                "us(a,b,c) := sum_float32(@o, D[b,o]*u[e,a,o,c])",
                "ut(a,b,c) := sum_float32(@o, D[c,o]*u[e,a,b,o])",

                "lap[e,i,j,k]  = "
                "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
                "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
                ],
            [
            lp.ArrayArg("u", dtype, shape=field_shape, order=order),
            lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
            lp.ArrayArg("G", dtype, shape=(6,)+field_shape, order=order),
            lp.ArrayArg("D", dtype, shape=(n, n), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
            ],
            name="semlap", assumptions="K>=1")

    seq_knl = knl

    if 1:
        # original
        knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"])
        knl = lp.precompute(knl, "ur", np.float32, ["a", "b", "c"])
        knl = lp.precompute(knl, "us", np.float32, ["a", "b", "c"])
        knl = lp.precompute(knl, "ut", np.float32, ["a", "b", "c"])
        knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))
        knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"])
    else:
        # experiment
#        knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"])
        knl = lp.precompute(knl, "eu", np.float32, ["b", "c"])
        knl = lp.precompute(knl, "ur", np.float32, ["b", "c"])
        knl = lp.precompute(knl, "us", np.float32, ["b", "c"])
        knl = lp.precompute(knl, "ut", np.float32, ["b", "c"])
        knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")#, slabs=(0, 1))
        knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"])



    #knl = lp.add_prefetch(knl, "G", [2,3,4]) # axis/argument indices on G
    #knl = lp.add_prefetch(knl, "G", ["i", "j", "m", "k"]) # axis/argument indices on G
    #print(knl)
    #1/0

    #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")
#    knl = lp.join_dimensions(knl, ["i", "j"], "i_and_j")

    #print(seq_knl)
    #print(lp.preprocess_kernel(knl))
    #1/0

# TW: turned this off since it generated:
# ValueError: cannot tag 'i_and_j'--not known
#    knl = lp.tag_inames(knl, dict(i_and_j="l.0", k="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*n*2*3 + n*n*n*5*3 + n**4 * 2*3)/1e9,
            op_label="GFlops",
            parameters={"K": K})

コード例 #39

0

ファイルを表示

ファイル: test_sem.py プロジェクト: cmsquared/loopy

def test_advect_dealias(ctx_factory):
    1/0 # not ready

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8
    M = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (N, N, N, K_sym)
    interim_field_shape = (M, M, M, K_sym)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal 
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,ip,j,jp,k,kp,m,e]: 0<=i,j,k,m<%d AND 0<=o,ip,jp,kp<%d 0<=e<K}" %M %N
            [

                # interpolate u to integration nodes
                "CSE:  u0[i,jp,kp,e] = sum_float32(@o, I[i,o]*u[o,jp,kp,e])",
                "CSE:  u1[i,j,kp,e]  = sum_float32(@o, I[j,o]*u0[i,o,kp,e])",
                "CSE:  Iu[i,j,k,e]   = sum_float32(@o, I[k,o]*u1[i,j,o,e])",

                # differentiate u on integration nodes
                "CSE:  Iur[i,j,k,e]  = sum_float32(@m, D[i,m]*Iu[m,j,k,e])",
                "CSE:  Ius[i,j,k,e]  = sum_float32(@m, D[j,m]*Iu[i,m,k,e])",
                "CSE:  Iut[i,j,k,e]  = sum_float32(@m, D[k,m]*Iu[i,j,m,e])",

                # interpolate v to integration nodes
                "CSE:  v0[i,jp,kp,e] = sum_float32(@o, I[i,o]*v[o,jp,kp,e])",
                "CSE:  v1[i,j,kp,e]  = sum_float32(@o, I[j,o]*v0[i,o,kp,e])",
                "CSE:  Iv[i,j,k,e]   = sum_float32(@o, I[k,o]*v1[i,j,o,e])",

                # differentiate v on integration nodes
                "CSE:  Ivr[i,j,k,e]  = sum_float32(@m, D[i,m]*Iv[m,j,k,e])",
                "CSE:  Ivs[i,j,k,e]  = sum_float32(@m, D[j,m]*Iv[i,m,k,e])",
                "CSE:  Ivt[i,j,k,e]  = sum_float32(@m, D[k,m]*Iv[i,j,m,e])",

                # interpolate w to integration nodes
                "CSE:  w0[i,jp,kp,e] = sum_float32(@o, I[i,o]*w[o,jp,kp,e])",
                "CSE:  w1[i,j,kp,e]  = sum_float32(@o, I[j,o]*w0[i,o,kp,e])",
                "CSE:  Iw[i,j,k,e]   = sum_float32(@o, I[k,o]*w1[i,j,o,e])",

                # differentiate v on integration nodes
                "CSE:  Iwr[i,j,k,e]  = sum_float32(@m, D[i,m]*Iw[m,j,k,e])",
                "CSE:  Iws[i,j,k,e]  = sum_float32(@m, D[j,m]*Iw[i,m,k,e])",
                "CSE:  Iwt[i,j,k,e]  = sum_float32(@m, D[k,m]*Iw[i,j,m,e])",

                # find velocity in (r,s,t) coordinates
                # QUESTION: should I use CSE here ?
                "CSE: Vr[i,j,k,e] = G[i,j,k,0,e]*Iu[i,j,k,e] + G[i,j,k,1,e]*Iv[i,j,k,e] + G[i,j,k,2,e]*Iw[i,j,k,e]",
                "CSE: Vs[i,j,k,e] = G[i,j,k,3,e]*Iu[i,j,k,e] + G[i,j,k,4,e]*Iv[i,j,k,e] + G[i,j,k,5,e]*Iw[i,j,k,e]",
                "CSE: Vt[i,j,k,e] = G[i,j,k,6,e]*Iu[i,j,k,e] + G[i,j,k,7,e]*Iv[i,j,k,e] + G[i,j,k,8,e]*Iw[i,j,k,e]",

                # form nonlinear term on integration nodes
                # QUESTION: should I use CSE here ?
                "<SE: Nu[i,j,k,e] = Vr[i,j,k,e]*Iur[i,j,k,e]+Vs[i,j,k,e]*Ius[i,j,k,e]+Vt[i,j,k,e]*Iut[i,j,k,e]",
                "<SE: Nv[i,j,k,e] = Vr[i,j,k,e]*Ivr[i,j,k,e]+Vs[i,j,k,e]*Ivs[i,j,k,e]+Vt[i,j,k,e]*Ivt[i,j,k,e]",
                "<SE: Nw[i,j,k,e] = Vr[i,j,k,e]*Iwr[i,j,k,e]+Vs[i,j,k,e]*Iws[i,j,k,e]+Vt[i,j,k,e]*Iwt[i,j,k,e]",

                # L2 project Nu back to Lagrange basis
                "CSE: Nu2[ip,j,k,e]   = sum_float32(@m, V[ip,m]*Nu[m,j,k,e])",
                "CSE: Nu1[ip,jp,k,e]  = sum_float32(@m, V[jp,m]*Nu2[ip,m,k,e])",
                "INu[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nu1[ip,jp,m,e])",

                # L2 project Nv back to Lagrange basis
                "CSE: Nv2[ip,j,k,e]   = sum_float32(@m, V[ip,m]*Nv[m,j,k,e])",
                "CSE: Nv1[ip,jp,k,e]  = sum_float32(@m, V[jp,m]*Nv2[ip,m,k,e])",
                "INv[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nv1[ip,jp,m,e])",

                # L2 project Nw back to Lagrange basis
                "CSE: Nw2[ip,j,k,e]   = sum_float32(@m, V[ip,m]*Nw[m,j,k,e])",
                "CSE: Nw1[ip,jp,k,e]  = sum_float32(@m, V[jp,m]*Nw2[ip,m,k,e])",
                "INw[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nw1[ip,jp,m,e])",

                ],
            [
            lp.GlobalArg("u",   dtype, shape=field_shape, order=order),
            lp.GlobalArg("v",   dtype, shape=field_shape, order=order),
            lp.GlobalArg("w",   dtype, shape=field_shape, order=order),
            lp.GlobalArg("INu",   dtype, shape=field_shape, order=order),
            lp.GlobalArg("INv",   dtype, shape=field_shape, order=order),
            lp.GlobalArg("INw",   dtype, shape=field_shape, order=order),
            lp.GlobalArg("D",   dtype, shape=(M,M),  order=order),
            lp.GlobalArg("I",   dtype, shape=(M, N), order=order),
            lp.GlobalArg("V",   dtype, shape=(N, M), order=order),
            lp.ValueArg("K",  np.int32, approximately=1000),
            ],
            name="sem_advect", assumptions="K>=1")

    print(knl)
    1/0

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    print(knl)
    #1/0

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)


    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=0,
            op_label="GFlops",
            parameters={"K": K}, print_seq_code=True,)

コード例 #40

0

ファイルを表示

ファイル: test_sem_tim.py プロジェクト: yueyedeai/loopy

def test_laplacian_lmem(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 4

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(
        ctx.devices[0],
        "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}" %
        n, [
            "ur(a,b,c) := sum_float32(@o, D[a,o]*u[e,o,b,c])",
            "us(a,b,c) := sum_float32(@o, D[b,o]*u[e,a,o,c])",
            "ut(a,b,c) := sum_float32(@o, D[c,o]*u[e,a,b,o])",
            "lap[e,i,j,k]  = "
            "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
            "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
            "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
        ], [
            lp.ArrayArg("u", dtype, shape=field_shape, order=order),
            lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
            lp.ArrayArg("G", dtype, shape=(6, ) + field_shape, order=order),
            lp.ArrayArg("D", dtype, shape=(n, n), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
        ],
        name="semlap",
        assumptions="K>=1")

    seq_knl = knl

    if 1:
        # original
        knl = lp.add_prefetch(knl,
                              "u", ["i", "j", "k", "o"],
                              default_tag="l.auto")
        knl = lp.precompute(knl,
                            "ur",
                            np.float32, ["a", "b", "c"],
                            default_tag="l.auto")
        knl = lp.precompute(knl,
                            "us",
                            np.float32, ["a", "b", "c"],
                            default_tag="l.auto")
        knl = lp.precompute(knl,
                            "ut",
                            np.float32, ["a", "b", "c"],
                            default_tag="l.auto")
        knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))
        knl = lp.add_prefetch(knl,
                              "D", ["m", "j", "k", "i"],
                              default_tag="l.auto")
    else:
        # experiment
        #        knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"], default_tag="l.auto")
        knl = lp.precompute(knl,
                            "eu",
                            np.float32, ["b", "c"],
                            default_tag="l.auto")
        knl = lp.precompute(knl,
                            "ur",
                            np.float32, ["b", "c"],
                            default_tag="l.auto")
        knl = lp.precompute(knl,
                            "us",
                            np.float32, ["b", "c"],
                            default_tag="l.auto")
        knl = lp.precompute(knl,
                            "ut",
                            np.float32, ["b", "c"],
                            default_tag="l.auto")
        knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  #, slabs=(0, 1))
        knl = lp.add_prefetch(knl,
                              "D", ["m", "j", "k", "i"],
                              default_tag="l.auto")

    #knl = lp.add_prefetch(knl, "G", [2,3,4], default_tag="l.auto") # axis/argument indices on G
    #knl = lp.add_prefetch(knl, "G", ["i", "j", "m", "k"], default_tag="l.auto") # axis/argument indices on G
    #print(knl)
    #1/0

    #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")
#    knl = lp.join_dimensions(knl, ["i", "j"], "i_and_j")

#print(seq_knl)
#print(lp.preprocess_kernel(knl))
#1/0

# TW: turned this off since it generated:
# ValueError: cannot tag 'i_and_j'--not known
#    knl = lp.tag_inames(knl, dict(i_and_j="l.0", k="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(
        seq_knl,
        ctx,
        kernel_gen,
        op_count=K *
        (n * n * n * n * 2 * 3 + n * n * n * 5 * 3 + n**4 * 2 * 3) / 1e9,
        op_label="GFlops",
        parameters={"K": K})

コード例 #41

0

ファイルを表示

ファイル: auto_test.py プロジェクト: cmsquared/loopy

def auto_test_vs_ref(
        ref_knl, ctx, test_knl=None, op_count=[], op_label=[], parameters={},
        print_ref_code=False, print_code=True, warmup_rounds=2,
        dump_binary=False,
        fills_entire_output=None, do_check=True, check_result=None,
        max_test_kernel_count=1,
        quiet=False, blacklist_ref_vendors=[]):
    """Compare results of `ref_knl` to the kernels generated by
    scheduling *test_knl*.

    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a a tuple (class:`bool`,
        message) indicating correctness/acceptability of the result
    :arg max_test_kernel_count: Stop testing after this many *test_knl*
    """

    import pyopencl as cl

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i+1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i+1))

    from loopy.compiled import CompiledKernel, get_highlighted_cl_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    from time import time

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated", DeprecationWarning, stacklevel=2)

    # {{{ compile and run reference code

    from loopy.preprocess import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors):
        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(ref_ctx,
                properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        for knl in lp.generate_loop_schedules(pp_ref_knl):
            ref_sched_kernel = knl
            break

        logger.info("%s (ref): trying %s for the reference calculation" % (
            ref_knl.name, dev))

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75*"-")
            print("Reference Code:")
            print(75*"-")
            print(get_highlighted_cl_code(ref_compiled.code))
            print(75*"-")

        ref_cl_kernel_info = ref_compiled.cl_kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_cl_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                import traceback
                ref_errors.append("\n".join([
                    75*"-",
                    "On %s:" % dev,
                    75*"-",
                    traceback.format_exc(),
                    75*"-"]))

                continue
            else:
                raise

        found_ref_device = True

        if not do_check:
            break

        ref_queue.finish()

        logger.info("%s (ref): using %s for the reference calculation" % (
            ref_knl.name, dev))
        logger.info("%s (ref): run" % ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop-ref_start

        logger.info("%s (ref): run done" % ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                "reference computation.\n"
                "These errors were encountered:\n"+"\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    args = None
    from loopy.kernel import kernel_state
    if test_knl.state not in [
            kernel_state.PREPROCESSED,
            kernel_state.SCHEDULED]:
        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    test_kernel_count = 0

    from loopy.preprocess import infer_unknown_types
    for i, kernel in enumerate(test_kernels):
        test_kernel_count += 1
        if test_kernel_count > max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        if args is None:
            cl_kernel_info = compiled.cl_kernel_info(frozenset())

            args = make_args(kernel,
                    cl_kernel_info.implemented_data_info,
                    queue, ref_arg_data, parameters)
        args["out_host"] = False

        if not quiet:
            print(75*"-")
            print("Kernel #%d:" % i)
            print(75*"-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75*"-")
            if dump_binary:
                print(type(compiled.cl_program))
                print(compiled.cl_program.binaries[0])
                print(75*"-")

        logger.info("%s: run warmup" % (knl.name))

        for i in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    from pyopencl.compyte.array import as_strided
                    ref_ary = as_strided(
                            arg_desc.ref_storage_array.get(),
                            shape=arg_desc.ref_shape,
                            strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                            arg_desc.test_storage_array.get(),
                            shape=arg_desc.test_shape,
                            strides=arg_desc.test_numpy_strides).flatten()
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                    need_check = False

        events = []
        queue.finish()

        logger.info("%s: warmup done" % (knl.name))

        logger.info("%s: timing run" % (knl.name))

        timing_rounds = warmup_rounds

        while True:
            from time import time
            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for i in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9*evt_end.profile.START
                            - 1e-9*evt_start.profile.START)
                        / timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time-start_time)/timing_rounds

            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done" % (knl.name))

        rates = ""
        for cnt, lbl in zip(op_count, op_label):
            rates += " %g %s/s" % (cnt/elapsed_wall, lbl)

        if not quiet:
            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                    "(%d rounds)%s" % (
                        format_float_or_none(elapsed_event),
                        format_float_or_none(elapsed_event_marker),
                        format_float_or_none(elapsed_wall), timing_rounds, rates))

        if do_check:
            ref_rates = ""
            for cnt, lbl in zip(op_count, op_label):
                ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl)
            if not quiet:
                print("ref: elapsed: %g s event, %g s wall%s" % (
                        ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict

コード例 #42

0

ファイルを表示

ファイル: test_sem.py プロジェクト: cmsquared/loopy

def test_laplacian_lmem_ilp(ctx_factory):
    # This does not lead to practical/runnable code (out of lmem), but it's an
    # excellent stress test for the code generator. :)

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K }" % n,
            [
                "ur(i,j,k) := sum_float32(@o, D[i,o]*u[e,o,j,k])",
                "us(i,j,k) := sum_float32(@o, D[j,o]*u[e,i,o,k])",
                "ut(i,j,k) := sum_float32(@o, D[k,o]*u[e,i,j,o])",

                "lap[e,i,j,k]  = "
                "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
                "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
                ],
            [
            lp.GlobalArg("u", dtype, shape=field_shape, order=order),
            lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
            lp.GlobalArg("G", dtype, shape=(6,)+field_shape, order=order),
            lp.GlobalArg("D", dtype, shape=(n, n), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
            ],
            name="semlap", assumptions="K>=1")


    # Must act on u first, otherwise stencil becomes crooked and
    # footprint becomes non-convex.

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))
    knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"])

    knl = lp.precompute(knl, "ur", np.float32, [0, 1, 2, "e_inner_inner"])
    knl = lp.precompute(knl, "us", np.float32, [0, 1, 2, "e_inner_inner"])
    knl = lp.precompute(knl, "ut", np.float32, [0, 1, 2, "e_inner_inner"])

    knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", "e_inner_inner"])
    knl = lp.add_prefetch(knl, "D", ["m", "j"])

    #print seq_knl
    #1/0

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    for knl in kernel_gen:
        print(lp.generate_code(knl))

コード例 #43

0

ファイルを表示

def test_laplacian_lmem(ctx_factory):
    1 / 0  # not adapted to new language

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(
        ctx.devices[0],
        "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}" %
        n, [
            "CSE: ur(i,j,k) = sum_float32(@o, D[i,o]*u[e,o,j,k])",
            "CSE: us(i,j,k) = sum_float32(@o, D[j,o]*u[e,i,o,k])",
            "CSE: ut(i,j,k) = sum_float32(@o, D[k,o]*u[e,i,j,o])",
            "lap[e,i,j,k]  = "
            "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
            "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
            "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
        ], [
            lp.GlobalArg("u", dtype, shape=field_shape, order=order),
            lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
            lp.GlobalArg("G", dtype, shape=(6, ) + field_shape, order=order),
            lp.GlobalArg("D", dtype, shape=(n, n), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
        ],
        name="semlap",
        assumptions="K>=1")

    knl = lp.realize_cse(knl, "ur", np.float32, ["k", "j", "m"])
    knl = lp.realize_cse(knl, "us", np.float32, ["i", "m", "k"])
    knl = lp.realize_cse(knl, "ut", np.float32, ["i", "j", "m"])

    if 0:
        seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"],
                                  "G[gi,e,m,j,k]")
        seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"])
        seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]")
    else:
        seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))

    knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
    knl = lp.add_prefetch(knl, "D", ["m", "j"])
    knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]")
    #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    #print seq_knl
    #print lp.preprocess_kernel(knl)
    #1/0

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(
        seq_knl,
        ctx,
        kernel_gen,
        op_count=K *
        (n * n * n * n * 2 * 3 + n * n * n * 5 * 3 + n**4 * 2 * 3) / 1e9,
        op_label="GFlops",
        parameters={"K": K},
        print_seq_code=True)

コード例 #44

0

ファイルを表示

ファイル: test_sem.py プロジェクト: cmsquared/loopy

def test_advect(ctx_factory):
    1/0 # not ready

    dtype = np.float32
    ctx = ctx_factory()

    order = "C"

    N = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, N, N, N)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal 
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic

    # A. updated for CSE: notation. 
    # B. fixed temp indexing and C ordering
    # load:     3+9 fields + 1/N D entry
    # store:    3   fields
    # perform:  N*2*6 + 3*5 + 3*5 flops
    # ratio:   (12*N+30)/15  flops per 4 bytes on bus 
    #          ~ 8.4 FLOPS per 4 bytes at N=8
    #          ~ 300 GFLOPS max on a 150GB/s device at N=8 if done perfectly
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,m,e]: 0<=i,j,k,m<%d AND 0<=e<K}" % N,
            [
                # differentiate u
                "CSE:  ur(i,j,k) = sum_float32(@m, D[i,m]*u[e,m,j,k])",
                "CSE:  us(i,j,k) = sum_float32(@m, D[j,m]*u[e,i,m,k])",
                "CSE:  ut(i,j,k) = sum_float32(@m, D[k,m]*u[e,i,j,m])",

                # differentiate v
                "CSE:  vr(i,j,k) = sum_float32(@m, D[i,m]*v[e,m,j,k])",
                "CSE:  vs(i,j,k) = sum_float32(@m, D[j,m]*v[e,i,m,k])",
                "CSE:  vt(i,j,k) = sum_float32(@m, D[k,m]*v[e,i,j,m])",

                # differentiate w
                "CSE:  wr(i,j,k) = sum_float32(@m, D[i,m]*w[e,m,j,k])",
                "CSE:  ws(i,j,k) = sum_float32(@m, D[j,m]*w[e,i,m,k])",
                "CSE:  wt(i,j,k) = sum_float32(@m, D[k,m]*w[e,i,j,m])",

                # find velocity in (r,s,t) coordinates
                # CSE?
                "CSE: Vr(i,j,k) = G[0,e,i,j,k]*u[e,i,j,k] + G[1,e,i,j,k]*v[e,i,j,k] + G[2,e,i,j,k]*w[e,i,j,k]",
                "CSE: Vs(i,j,k) = G[3,e,i,j,k]*u[e,i,j,k] + G[4,e,i,j,k]*v[e,i,j,k] + G[5,e,i,j,k]*w[e,i,j,k]",
                "CSE: Vt(i,j,k) = G[6,e,i,j,k]*u[e,i,j,k] + G[7,e,i,j,k]*v[e,i,j,k] + G[8,e,i,j,k]*w[e,i,j,k]",

                # form nonlinear term on integration nodes
                "Nu[e,i,j,k] = Vr(i,j,k)*ur(i,j,k)+Vs(i,j,k)*us(i,j,k)+Vt(i,j,k)*ut(i,j,k)",
                "Nv[e,i,j,k] = Vr(i,j,k)*vr(i,j,k)+Vs(i,j,k)*vs(i,j,k)+Vt(i,j,k)*vt(i,j,k)",
                "Nw[e,i,j,k] = Vr(i,j,k)*wr(i,j,k)+Vs(i,j,k)*ws(i,j,k)+Vt(i,j,k)*wt(i,j,k)",
                ],
            [
            lp.GlobalArg("u",   dtype, shape=field_shape, order=order),
            lp.GlobalArg("v",   dtype, shape=field_shape, order=order),
            lp.GlobalArg("w",   dtype, shape=field_shape, order=order),
            lp.GlobalArg("Nu",  dtype, shape=field_shape, order=order),
            lp.GlobalArg("Nv",  dtype, shape=field_shape, order=order),
            lp.GlobalArg("Nw",  dtype, shape=field_shape, order=order),
            lp.GlobalArg("G",   dtype, shape=(9,)+field_shape, order=order),
            lp.GlobalArg("D",   dtype, shape=(N, N),  order=order),
            lp.ValueArg("K",  np.int32, approximately=1000),
            ],
            name="sem_advect", assumptions="K>=1")

    print(knl)
    1/0

    seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))


    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)


    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=0,
            op_label="GFlops",
            parameters={"K": K}, print_seq_code=True,)

コード例 #45

0

ファイルを表示

def test_laplacian(ctx_factory):
    1 / 0  # not adapted to new language

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # load:     1+6 fields + 1/N D entry
    # store:    1   fields
    # perform:  N*2*6 + 3*5 flops
    # ratio:   (12*N+15)/8  flops per 4 bytes on bus
    #          ~ 14 FLOPS per 4 bytes at N=8
    #          ~ 525 GFLOPS max on a 150GB/s device at N=8 if done perfectly

    # K - run-time symbolic
    knl = lp.make_kernel(
        ctx.devices[0],
        "[K] -> {[i,j,k,e,m,o1,o2,o3,gi]: 0<=i,j,k,m,o1,o2,o3<%d and 0<=e<K and 0<=gi<6}"
        % n,
        [
            "CSE: ur(i,j,k) = sum_float32(o1, D[i,o1]*cse(u[e,o1,j,k], urf))",
            "CSE: us(i,j,k) = sum_float32(o2, D[j,o2]*cse(u[e,i,o2,k], usf))",
            "CSE: ut(i,j,k) = sum_float32(o3, D[k,o3]*cse(u[e,i,j,o3], utf))",

            # define function
            "CSE: Gu(i,j,k) = G[0,e,i,j,k]*ur(i,j,k) + G[1,e,i,j,k]*us(i,j,k) + G[2,e,i,j,k]*ut(i,j,k)",
            "CSE: Gv(i,j,k) = G[1,e,i,j,k]*ur(i,j,k) + G[3,e,i,j,k]*us(i,j,k) + G[4,e,i,j,k]*ut(i,j,k)",
            "CSE: Gw(i,j,k) = G[2,e,i,j,k]*ur(i,j,k) + G[4,e,i,j,k]*us(i,j,k) + G[5,e,i,j,k]*ut(i,j,k)",
            "lap[e,i,j,k]  = "
            "  sum_float32(m, D[m,i]*Gu(m,j,k))"
            "+ sum_float32(m, D[m,j]*Gv(i,m,k))"
            "+ sum_float32(m, D[m,k]*Gw(i,j,m))"
        ],
        [
            lp.GlobalArg("u", dtype, shape=field_shape, order=order),
            lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
            lp.GlobalArg("G", dtype, shape=(6, ) + field_shape, order=order),
            lp.GlobalArg("D", dtype, shape=(n, n), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
        ],
        name="semlap",
        assumptions="K>=1")

    #print lp.preprocess_kernel(knl, cse_ok=True)
    #1/0
    #
    #print knl
    #1/0
    knl = lp.realize_cse(knl, "urf", np.float32, ["o1"])
    knl = lp.realize_cse(knl, "usf", np.float32, ["o2"])
    knl = lp.realize_cse(knl, "utf", np.float32, ["o3"])

    knl = lp.realize_cse(knl, "Gu", np.float32, ["m", "j", "k"])
    knl = lp.realize_cse(knl, "Gv", np.float32, ["i", "m", "k"])
    knl = lp.realize_cse(knl, "Gw", np.float32, ["i", "j", "m"])

    knl = lp.realize_cse(knl, "ur", np.float32, ["k", "j", "m"])
    knl = lp.realize_cse(knl, "us", np.float32, ["i", "m", "k"])
    knl = lp.realize_cse(knl, "ut", np.float32, ["i", "j", "m"])

    if 0:
        pass
        #seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
        #seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"])
        #seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]")
    else:
        seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))

    knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
    knl = lp.add_prefetch(knl, "D", ["m", "j"])
    #knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]")

    #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    #print seq_knl
    #print lp.preprocess_kernel(knl)
    #1/0

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(
        knl, loop_priority=["m_fetch_G", "i_fetch_u"])
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(
        seq_knl,
        ctx,
        kernel_gen,
        op_count=K *
        (n * n * n * n * 2 * 3 + n * n * n * 5 * 3 + n**4 * 2 * 3) / 1e9,
        op_label="GFlops",
        parameters={"K": K},
        print_seq_code=True)

コード例 #46

0

ファイルを表示

ファイル: proba2.py プロジェクト: earendilllock/loopycurs


n = 128
r = 3
#a = np.ones((n, n), dtype = np.float32)
#b = np.ones((n, n), dtype = np.float32)
f = np.zeros((n, r), dtype = np.float32)
a = np.random.randn(n, n, n)
v = np.random.randn(n, r)
w = np.random.randn(n,r)

a = a.astype(np.float32)
v = v.astype(np.float32)
w = w.astype(np.float32)
parameters = {"a" : a, "v": v, "w" : w} 
knl_gen = lp.generate_loop_schedules(knl)
ref_compiled = lp.CompiledKernel(ctx,knl_gen,options=[],codegen_kwargs={})

a1 = cl_array.to_device(queue,a)
v1 = cl_array.to_device(queue,v)
w1 = cl_array.to_device(queue,w)
f = cl_array.to_device(queue,f)
qq = {"a":a1,"v":v1,"w":w1,'f':f}

from pymbolic import evaluate
kk = {}
for arg in knl.args:
    kk[arg.name] = qq[arg.name].astype(arg.dtype)

import time

コード例 #47

0

ファイルを表示

def test_laplacian_lmem_ilp(ctx_factory):
    # This does not lead to practical/runnable code (out of lmem), but it's an
    # excellent stress test for the code generator. :)

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(
        ctx.devices[0],
        "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K }" % n, [
            "ur(i,j,k) := sum_float32(@o, D[i,o]*u[e,o,j,k])",
            "us(i,j,k) := sum_float32(@o, D[j,o]*u[e,i,o,k])",
            "ut(i,j,k) := sum_float32(@o, D[k,o]*u[e,i,j,o])",
            "lap[e,i,j,k]  = "
            "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
            "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
            "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
        ], [
            lp.GlobalArg("u", dtype, shape=field_shape, order=order),
            lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
            lp.GlobalArg("G", dtype, shape=(6, ) + field_shape, order=order),
            lp.GlobalArg("D", dtype, shape=(n, n), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
        ],
        name="semlap",
        assumptions="K>=1")

    # Must act on u first, otherwise stencil becomes crooked and
    # footprint becomes non-convex.

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))
    knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"])

    knl = lp.precompute(knl, "ur", np.float32, [0, 1, 2, "e_inner_inner"])
    knl = lp.precompute(knl, "us", np.float32, [0, 1, 2, "e_inner_inner"])
    knl = lp.precompute(knl, "ut", np.float32, [0, 1, 2, "e_inner_inner"])

    knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", "e_inner_inner"])
    knl = lp.add_prefetch(knl, "D", ["m", "j"])

    #print seq_knl
    #1/0

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    for knl in kernel_gen:
        print(lp.generate_code(knl))

コード例 #48

0

ファイルを表示

ファイル: test_fem_assembly.py プロジェクト: simplicialsimulations/loopy

def test_laplacian_stiffness(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    dim = 2  # (baked into code)

    Nq = 40  # num. quadrature points (baked into code)
    Nb = 20  # num. basis functions (baked into code)
    Nc = 100  # num. cells (run-time symbolic)

    from pymbolic import var
    Nc_sym = var("Nc")

    knl = lp.make_kernel(
        ctx.devices[0],
        "[Nc] -> {[K,i,j,q, dx_axis, ax_b]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d "
        "and 0<= dx_axis, ax_b < %(dim)d}" % dict(Nb=Nb, Nq=Nq, dim=dim), [
            "dPsi(ij, dxi) := sum_float32(@ax_b,"
            "  jacInv[ax_b,dxi,K,q] * DPsi[ax_b,ij,q])",
            "A[K, i, j] = sum_float32(q, w[q] * jacDet[K,q] * ("
            "sum_float32(dx_axis, dPsi$one(i,dx_axis)*dPsi$two(j,dx_axis))))"
        ], [
            lp.GlobalArg(
                "jacInv", dtype, shape=(dim, dim, Nc_sym, Nq), order=order),
            lp.ConstantArg("DPsi", dtype, shape=(dim, Nb, Nq), order=order),
            lp.GlobalArg("jacDet", dtype, shape=(Nc_sym, Nq), order=order),
            lp.ConstantArg("w", dtype, shape=(Nq, ), order=order),
            lp.GlobalArg("A", dtype, shape=(Nc_sym, Nb, Nb), order=order),
            lp.ValueArg("Nc", np.int32, approximately=1000),
        ],
        name="lapquad",
        assumptions="Nc>=1")

    knl = lp.tag_inames(knl, dict(ax_b="unr"))
    seq_knl = knl

    def variant_fig31(knl):
        # This (mostly) reproduces Figure 3.1.

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        return knl, ["K", "i", "j", "q", "ax_b_insn"]

    def variant_pg4(knl):
        # This (mostly) reproduces the unlabeled code snippet on pg. 4.

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        Ncloc = 16
        knl = lp.split_iname(knl,
                             "K",
                             Ncloc,
                             outer_iname="Ko",
                             inner_iname="Kloc")
        return knl, ["Ko", "Kloc", "i", "j", "q", "ax_b_insn"]

    def variant_fig32(knl):
        # This (mostly) reproduces Figure 3.2.

        Ncloc = 16
        knl = lp.split_iname(knl,
                             "K",
                             Ncloc,
                             outer_iname="Ko",
                             inner_iname="Kloc")
        knl = lp.precompute(knl,
                            "dPsi",
                            np.float32, ["i", "q", "dx_axis"],
                            default_tag=None)
        knl = lp.tag_inames(knl, {"dx_axis": "unr", "dxi": "unr"})
        return knl, ["Ko", "Kloc", "dPsi_q", "ij", "i", "j", "q", "ax_b_insn"]

    def variant_fig33(knl):
        # This is meant to (mostly) reproduce Figure 3.3.

        Ncloc = 16
        knl = lp.split_iname(knl,
                             "K",
                             Ncloc,
                             outer_iname="Ko",
                             inner_iname="Kloc")
        knl = lp.precompute(knl,
                            "dPsi$one",
                            np.float32, ["dx_axis"],
                            default_tag=None)
        knl = lp.tag_inames(knl, {"j": "ilp.seq"})

        return knl, ["Ko", "Kloc"]

    def variant_simple_gpu(knl):
        # This is a simple GPU-ish variant.

        # It's not the same thing as Matt's code, but I'll need some more time
        # to reverse-engineer what is going on there. Some discussion might
        # help, too. :)

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        Ncloc = 16
        knl = lp.split_iname(knl,
                             "K",
                             Ncloc,
                             outer_iname="Ko",
                             inner_iname="Kloc",
                             outer_tag="g.0")
        knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"})
        return knl, ["K", "i", "j", "q", "ax_b_insn"]

    def variant_simple_gpu_prefetch(knl):
        # This adds prefetching to the GPU variant above.

        # In this variant (on my machine), loopy makes a silly choice
        # for the upper bound of Kloc (it uses Nc). I'll investigate and
        # fix that. (FIXME)

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        Ncloc = 16
        knl = lp.split_iname(knl,
                             "K",
                             Ncloc,
                             outer_iname="Ko",
                             inner_iname="Kloc",
                             outer_tag="g.0")
        knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"})
        knl = lp.add_prefetch(knl, "w", ["q"], default_tag="l.auto")
        knl = lp.add_prefetch(knl, "DPsi", [0, 1, 2], default_tag="l.auto")
        knl = lp.add_prefetch(knl, "jacInv", [0, 1, 3], default_tag="l.auto")
        knl = lp.add_prefetch(knl, "jacDet", [1], default_tag="l.auto")
        return knl, ["K", "i", "j", "q", "ax_b_insn"]

    # Plug in variant name here
    #                        |
    #                        v
    for variant in [variant_fig33]:
        var_knl, loop_prio = variant(knl)
        kernel_gen = lp.generate_loop_schedules(var_knl,
                                                loop_priority=loop_prio)
        kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc))

        #print lp.preprocess_kernel(var_knl)

        lp.auto_test_vs_ref(seq_knl,
                            ctx,
                            kernel_gen,
                            op_count=0,
                            op_label="GFlops",
                            parameters={"Nc": Nc},
                            print_ref_code=True)

コード例 #49

0

ファイルを表示

ファイル: proba2.py プロジェクト: earendilllock/loopycurs

cknl.print_code()

n = 128
r = 3
#a = np.ones((n, n), dtype = np.float32)
#b = np.ones((n, n), dtype = np.float32)
f = np.zeros((n, r), dtype=np.float32)
a = np.random.randn(n, n, n)
v = np.random.randn(n, r)
w = np.random.randn(n, r)

a = a.astype(np.float32)
v = v.astype(np.float32)
w = w.astype(np.float32)
parameters = {"a": a, "v": v, "w": w}
knl_gen = lp.generate_loop_schedules(knl)
ref_compiled = lp.CompiledKernel(ctx, knl_gen, options=[], codegen_kwargs={})

a1 = cl_array.to_device(queue, a)
v1 = cl_array.to_device(queue, v)
w1 = cl_array.to_device(queue, w)
f = cl_array.to_device(queue, f)
qq = {"a": a1, "v": v1, "w": w1, 'f': f}

from pymbolic import evaluate
kk = {}
for arg in knl.args:
    kk[arg.name] = qq[arg.name].astype(arg.dtype)

import time

コード例 #50

0

ファイルを表示

def auto_test_vs_ref(ref_knl,
                     ctx,
                     test_knl=None,
                     op_count=[],
                     op_label=[],
                     parameters={},
                     print_ref_code=False,
                     print_code=True,
                     warmup_rounds=2,
                     dump_binary=False,
                     fills_entire_output=None,
                     do_check=True,
                     check_result=None,
                     max_test_kernel_count=1,
                     quiet=False,
                     blacklist_ref_vendors=[]):
    """Compare results of `ref_knl` to the kernels generated by
    scheduling *test_knl*.

    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a a tuple (class:`bool`,
        message) indicating correctness/acceptability of the result
    :arg max_test_kernel_count: Stop testing after this many *test_knl*
    """

    import pyopencl as cl

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                         "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError(
                "ref_knl and test_knl argument lists disagree at index "
                "%d (1-based)" % (i + 1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError(
                "ref_knl and test_knl argument lists disagree at index "
                "%d (1-based)" % (i + 1))

    from loopy.compiled import CompiledKernel
    from loopy.target.execution import get_highlighted_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    from time import time

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated",
             DeprecationWarning,
             stacklevel=2)

    # {{{ compile and run reference code

    from loopy.type_inference import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    from loopy.kernel.data import ImageArg
    need_ref_image_support = any(
        isinstance(arg, ImageArg) for arg in ref_knl.args)

    for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors,
                                                  need_ref_image_support):

        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(
            ref_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        for knl in lp.generate_loop_schedules(pp_ref_knl):
            ref_sched_kernel = knl
            break

        logger.info("{} (ref): trying {} for the reference calculation".format(
            ref_knl.name, dev))

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75 * "-")
            print("Reference Code:")
            print(75 * "-")
            print(get_highlighted_code(ref_compiled.get_code()))
            print(75 * "-")

        ref_kernel_info = ref_compiled.kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                import traceback
                ref_errors.append("\n".join([
                    75 * "-",
                    "On %s:" % dev, 75 * "-",
                    traceback.format_exc(), 75 * "-"
                ]))

                continue
            else:
                raise

        found_ref_device = True

        if not do_check:
            break

        ref_queue.finish()

        logger.info("{} (ref): using {} for the reference calculation".format(
            ref_knl.name, dev))
        logger.info("%s (ref): run" % ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop - ref_start

        logger.info("%s (ref): run done" % ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9 * (ref_evt.profile.END -
                                    ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                         "reference computation.\n"
                         "These errors were encountered:\n" +
                         "\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(
        ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

    from loopy.kernel import KernelState
    from loopy.target.pyopencl import PyOpenCLTarget
    if test_knl.state not in [
            KernelState.PREPROCESSED, KernelState.LINEARIZED
    ]:
        if isinstance(test_knl.target, PyOpenCLTarget):
            test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0]))

        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    test_kernel_count = 0

    from loopy.type_inference import infer_unknown_types
    for i, kernel in enumerate(test_kernels):
        test_kernel_count += 1
        if test_kernel_count > max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        kernel_info = compiled.kernel_info(frozenset())

        args = make_args(kernel, kernel_info.implemented_data_info, queue,
                         ref_arg_data, parameters)

        args["out_host"] = False

        if not quiet:
            print(75 * "-")
            print("Kernel #%d:" % i)
            print(75 * "-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75 * "-")
            if dump_binary:
                # {{{ find cl program

                for name in dir(kernel_info.cl_kernels):
                    if name.startswith("__"):
                        continue
                    cl_kernel = getattr(kernel_info.cl_kernels, name)
                    cl_program = cl_kernel.get_info(cl.kernel_info.PROGRAM)
                    break
                else:
                    assert False, "could not find cl_program"

                # }}}

                print(type(cl_program))
                if hasattr(cl_program, "binaries"):
                    print(cl_program.binaries[0])

                print(75 * "-")

        logger.info("%s: run warmup" % (knl.name))

        for i in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    from pyopencl.compyte.array import as_strided
                    ref_ary = as_strided(
                        arg_desc.ref_storage_array.get(),
                        shape=arg_desc.ref_shape,
                        strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                        arg_desc.test_storage_array.get(),
                        shape=arg_desc.test_shape,
                        strides=arg_desc.test_numpy_strides).flatten()
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                    need_check = False

        events = []
        queue.finish()

        logger.info("%s: warmup done" % (knl.name))

        logger.info("%s: timing run" % (knl.name))

        timing_rounds = max(warmup_rounds, 1)

        while True:
            from time import time
            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for i in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9 * evt_end.profile.START -
                                         1e-9 * evt_start.profile.START) /
                                        timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time - start_time) / timing_rounds

            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done" % (knl.name))

        rates = ""
        for cnt, lbl in zip(op_count, op_label):
            rates += " {:g} {}/s".format(cnt / elapsed_wall, lbl)

        if not quiet:

            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                  "(%d rounds)%s" %
                  (format_float_or_none(elapsed_event),
                   format_float_or_none(elapsed_event_marker),
                   format_float_or_none(elapsed_wall), timing_rounds, rates))

        if do_check:
            ref_rates = ""
            for cnt, lbl in zip(op_count, op_label):
                ref_rates += " {:g} {}/s".format(cnt / ref_elapsed_event, lbl)
            if not quiet:
                print("ref: elapsed: {:g} s event, {:g} s wall{}".format(
                    ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict

コード例 #51

0

ファイルを表示

ファイル: test_fem_assembly.py プロジェクト: cmsquared/loopy

def test_laplacian_stiffness(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    dim = 2 # (baked into code)

    Nq = 40 # num. quadrature points (baked into code)
    Nb = 20 # num. basis functions (baked into code)
    Nc = 100 # num. cells (run-time symbolic)

    from pymbolic import var
    Nc_sym = var("Nc")

    knl = lp.make_kernel(ctx.devices[0],
            "[Nc] -> {[K,i,j,q, dx_axis, ax_b]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d "
            "and 0<= dx_axis, ax_b < %(dim)d}"
            % dict(Nb=Nb, Nq=Nq, dim=dim),
            [
                "dPsi(ij, dxi) := sum_float32(@ax_b,"
                    "  jacInv[ax_b,dxi,K,q] * DPsi[ax_b,ij,q])",
                "A[K, i, j] = sum_float32(q, w[q] * jacDet[K,q] * ("
                    "sum_float32(dx_axis, dPsi$one(i,dx_axis)*dPsi$two(j,dx_axis))))"
                ],
            [
            lp.GlobalArg("jacInv", dtype, shape=(dim, dim, Nc_sym, Nq), order=order),
            lp.ConstantArg("DPsi", dtype, shape=(dim, Nb, Nq), order=order),
            lp.GlobalArg("jacDet", dtype, shape=(Nc_sym, Nq), order=order),
            lp.ConstantArg("w", dtype, shape=(Nq,), order=order),
            lp.GlobalArg("A", dtype, shape=(Nc_sym, Nb, Nb), order=order),
            lp.ValueArg("Nc",  np.int32, approximately=1000),
            ],
            name="lapquad", assumptions="Nc>=1")

    knl = lp.tag_inames(knl, dict(ax_b="unr"))
    seq_knl = knl

    def variant_fig31(knl):
        # This (mostly) reproduces Figure 3.1.

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        return knl, ["K", "i", "j", "q", "ax_b_insn"]

    def variant_pg4(knl):
        # This (mostly) reproduces the unlabeled code snippet on pg. 4.

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc")
        return knl, ["Ko", "Kloc", "i", "j", "q", "ax_b_insn"]

    def variant_fig32(knl):
        # This (mostly) reproduces Figure 3.2.

        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc")
        knl = lp.precompute(knl, "dPsi", np.float32, ["i", "q", "dx_axis"],
                default_tag=None)
        knl = lp.tag_inames(knl, {"dx_axis": "unr", "dxi": "unr"})
        return knl, ["Ko", "Kloc", "dPsi_q", "ij", "i", "j", "q", "ax_b_insn"]

    def variant_fig33(knl):
        # This is meant to (mostly) reproduce Figure 3.3.

        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc")
        knl = lp.precompute(knl, "dPsi$one", np.float32, ["dx_axis"], default_tag=None)
        knl = lp.tag_inames(knl, {"j": "ilp.seq"})

        return knl, ["Ko", "Kloc"]

    def variant_simple_gpu(knl):
        # This is a simple GPU-ish variant.

        # It's not the same thing as Matt's code, but I'll need some more time
        # to reverse-engineer what is going on there. Some discussion might
        # help, too. :)

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc",
                outer_tag="g.0")
        knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"})
        return knl, ["K", "i", "j", "q", "ax_b_insn"]

    def variant_simple_gpu_prefetch(knl):
        # This adds prefetching to the GPU variant above.

        # In this variant (on my machine), loopy makes a silly choice
        # for the upper bound of Kloc (it uses Nc). I'll investigate and
        # fix that. (FIXME)

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc",
                outer_tag="g.0")
        knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"})
        knl = lp.add_prefetch(knl, "w", ["q"])
        knl = lp.add_prefetch(knl, "DPsi", [0, 1, 2])
        knl = lp.add_prefetch(knl, "jacInv", [0, 1, 3])
        knl = lp.add_prefetch(knl, "jacDet", [1])
        return knl, ["K", "i", "j", "q", "ax_b_insn"]

    # Plug in variant name here
    #                        |
    #                        v
    for variant in [variant_fig33]:
        var_knl, loop_prio = variant(knl)
        kernel_gen = lp.generate_loop_schedules(var_knl,
                loop_priority=loop_prio)
        kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc))

        #print lp.preprocess_kernel(var_knl)

        lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
                op_count=0, op_label="GFlops",
                parameters={"Nc": Nc}, print_ref_code=True)