Code example #1
File: test_transform.py Project: inducer/loopy
def test_to_batched_temp(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
         ''' { [i,j]: 0<=i,j<n } ''',
         ''' cnst = 2.0
         out[i] = sum(j, cnst*a[i,j]*x[j])''',
         [lp.TemporaryVariable(
             "cnst",
             dtype=np.float32,
             shape=(),
             scope=lp.temp_var_scope.PRIVATE), '...'])
    knl = lp.add_and_infer_dtypes(knl, dict(out=np.float32,
                                            x=np.float32,
                                            a=np.float32))
    ref_knl = lp.make_kernel(
         ''' { [i,j]: 0<=i,j<n } ''',
         '''out[i] = sum(j, 2.0*a[i,j]*x[j])''')
    ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(out=np.float32,
                                                    x=np.float32,
                                                    a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")
    bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")

    # checking that cnst is not being batched
    assert bknl.temporary_variables['cnst'].shape == ()

    a = np.random.randn(5, 5)
    x = np.random.randn(7, 5)

    # Checking that the program compiles and the logic is correct
    lp.auto_test_vs_ref(
            bref_knl, ctx, bknl,
            parameters=dict(a=a, x=x, n=5, nbatches=7))
Code example #2
File: test_transform.py Project: connorjward/loopy
def test_to_batched_temp(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        """ { [i,j]: 0<=i,j<n } """, """ cnst = 2.0
         out[i] = sum(j, cnst*a[i,j]*x[j])""", [
            lp.TemporaryVariable("cnst",
                                 dtype=np.float32,
                                 shape=(),
                                 address_space=lp.AddressSpace.PRIVATE), "..."
        ])
    knl = lp.add_and_infer_dtypes(
        knl, dict(out=np.float32, x=np.float32, a=np.float32))
    ref_knl = lp.make_kernel(""" { [i,j]: 0<=i,j<n } """,
                             """out[i] = sum(j, 2.0*a[i,j]*x[j])""")
    ref_knl = lp.add_and_infer_dtypes(
        ref_knl, dict(out=np.float32, x=np.float32, a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")
    bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")

    # checking that cnst is not being batched
    assert bknl["loopy_kernel"].temporary_variables["cnst"].shape == ()

    a = np.random.randn(5, 5)
    x = np.random.randn(7, 5)

    # Checking that the program compiles and the logic is correct
    lp.auto_test_vs_ref(bref_knl,
                        ctx,
                        bknl,
                        parameters=dict(a=a, x=x, n=5, nbatches=7))
Code example #3
File: test_transform.py Project: inducer/loopy
def test_to_batched(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
         ''' { [i,j]: 0<=i,j<n } ''',
         ''' out[i] = sum(j, a[i,j]*x[j])''')
    knl = lp.add_and_infer_dtypes(knl, dict(out=np.float32,
                                            x=np.float32,
                                            a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")

    ref_knl = lp.make_kernel(
         ''' { [i,j,k]: 0<=i,j<n and 0<=k<nbatches} ''',
         '''out[k, i] = sum(j, a[i,j]*x[k, j])''')
    ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(out=np.float32,
                                                    x=np.float32,
                                                    a=np.float32))

    a = np.random.randn(5, 5).astype(np.float32)
    x = np.random.randn(7, 5).astype(np.float32)

    # Running both kernels
    evt, (out1, ) = bknl(queue, a=a, x=x, n=5, nbatches=7)
    evt, (out2, ) = ref_knl(queue, a=a, x=x, n=5, nbatches=7)

    # checking that the outputs are the same
    assert np.linalg.norm(out1-out2) < 1e-15
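
For reference, a minimal NumPy sketch (not from the project above; the names a, x, out_ref are placeholders) of what the batched kernel in this example computes: each of the nbatches rows of x is multiplied by the same matrix a.

import numpy as np

a = np.random.randn(5, 5).astype(np.float32)
x = np.random.randn(7, 5).astype(np.float32)

# out_ref[k, i] = sum(j, a[i, j] * x[k, j]), i.e. one matvec per batch row
out_ref = np.einsum("ij,kj->ki", a, x)
assert out_ref.shape == (7, 5)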
Code example #4
File: test_transform.py Project: sailfish009/loopy
def test_to_batched_temp(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<n } ''', ''' cnst = 2.0
         out[i] = sum(j, cnst*a[i,j]*x[j])''', [
            lp.TemporaryVariable("cnst",
                                 dtype=np.float32,
                                 shape=(),
                                 scope=lp.temp_var_scope.PRIVATE), '...'
        ])
    knl = lp.add_and_infer_dtypes(
        knl, dict(out=np.float32, x=np.float32, a=np.float32))
    ref_knl = lp.make_kernel(''' { [i,j]: 0<=i,j<n } ''',
                             '''out[i] = sum(j, 2.0*a[i,j]*x[j])''')
    ref_knl = lp.add_and_infer_dtypes(
        ref_knl, dict(out=np.float32, x=np.float32, a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")
    bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")

    # checking that cnst is not being batched
    assert bknl.temporary_variables['cnst'].shape == ()

    a = np.random.randn(5, 5)
    x = np.random.randn(7, 5)

    # Checking that the program compiles and the logic is correct
    lp.auto_test_vs_ref(bref_knl,
                        ctx,
                        bknl,
                        parameters=dict(a=a, x=x, n=5, nbatches=7))
Code example #5
File: test_transform.py Project: sailfish009/loopy
def test_to_batched(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(''' { [i,j]: 0<=i,j<n } ''',
                         ''' out[i] = sum(j, a[i,j]*x[j])''')
    knl = lp.add_and_infer_dtypes(
        knl, dict(out=np.float32, x=np.float32, a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")

    ref_knl = lp.make_kernel(''' { [i,j,k]: 0<=i,j<n and 0<=k<nbatches} ''',
                             '''out[k, i] = sum(j, a[i,j]*x[k, j])''')
    ref_knl = lp.add_and_infer_dtypes(
        ref_knl, dict(out=np.float32, x=np.float32, a=np.float32))

    a = np.random.randn(5, 5).astype(np.float32)
    x = np.random.randn(7, 5).astype(np.float32)

    # Running both kernels
    evt, (out1, ) = bknl(queue, a=a, x=x, n=5, nbatches=7)
    evt, (out2, ) = ref_knl(queue, a=a, x=x, n=5, nbatches=7)

    # checking that the outputs are the same
    assert np.linalg.norm(out1 - out2) < 1e-15
Code example #6
File: test_transform.py Project: connorjward/loopy
def test_diamond_tiling(ctx_factory, interactive=False):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    ref_knl = lp.make_kernel(
        "[nx,nt] -> {[ix, it]: 1<=ix<nx-1 and 0<=it<nt}", """
        u[ix, it+2] = (
            2*u[ix, it+1]
            + dt**2/dx**2 * (u[ix+1, it+1] - 2*u[ix, it+1] + u[ix-1, it+1])
            - u[ix, it])
        """)

    knl_for_transform = ref_knl

    ref_knl = lp.prioritize_loops(ref_knl, "it, ix")

    import islpy as isl
    m = isl.BasicMap(
        "[nx,nt] -> {[ix, it] -> [tx, tt, tparity, itt, itx]: "
        "16*(tx - tt) + itx - itt = ix - it and "
        "16*(tx + tt + tparity) + itt + itx = ix + it and "
        "0<=tparity<2 and 0 <= itx - itt < 16 and 0 <= itt+itx < 16}")
    knl = lp.map_domain(knl_for_transform, m)
    knl = lp.prioritize_loops(knl, "tt,tparity,tx,itt,itx")

    if interactive:
        nx = 43
        u = np.zeros((nx, 200))
        x = np.linspace(-1, 1, nx)
        dx = x[1] - x[0]
        u[:, 0] = u[:, 1] = np.exp(-100 * x**2)

        u_dev = cl.array.to_device(queue, u)
        knl(queue, u=u_dev, dx=dx, dt=dx)

        u = u_dev.get()
        import matplotlib.pyplot as plt
        plt.imshow(u.T)
        plt.show()
    else:
        types = {"dt,dx,u": np.float64}
        knl = lp.add_and_infer_dtypes(knl, types)
        ref_knl = lp.add_and_infer_dtypes(ref_knl, types)

        lp.auto_test_vs_ref(ref_knl,
                            ctx,
                            knl,
                            parameters={
                                "nx": 200,
                                "nt": 300,
                                "dx": 1,
                                "dt": 1
                            })
Code example #7
File: test_transform.py Project: cmsquared/loopy
def test_precompute_with_preexisting_inames(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })

    knl = lp.fix_parameters(knl, n=13)

    ref_knl = knl

    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")
    knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
            precompute_inames="ii,jj")

    knl = lp.set_loop_priority(knl, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(E=200))
Code example #8
File: test_transform.py Project: cmsquared/loopy
def test_precompute_nested_subst(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        E:=a[i]
        D:=E*E
        b[i] = D
        """)

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    ref_knl = knl

    knl = lp.tag_inames(knl, dict(j="g.1"))
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
    knl = lp.precompute(knl, "D", "i_inner")

    # There's only one surviving 'E' rule.
    assert len([
        rule_name
        for rule_name in knl.substitutions
        if rule_name.startswith("E")]) == 1

    # That rule should use the newly created prefetch inames,
    # not the prior 'i_inner'
    assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression)

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(n=12345))
Code example #9
File: test_transform.py Project: cmsquared/loopy
def test_vectorize(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
    knl = lp.set_array_dim_names(knl, "a,b", "i")
    knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4,
            split_kwargs=dict(slabs=(0, 1)))

    knl = lp.tag_data_axes(knl, "a,b", "c,vec")
    ref_knl = knl
    ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})

    knl = lp.tag_inames(knl, {"i_inner": "vec"})

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(n=30))
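
A rough NumPy illustration (not from the test; the reshape and the assumption that n is a multiple of 4 are mine) of the data layout produced by splitting array axis 0 by a factor of 4, with the inner axis acting as the vector lane:

import numpy as np

n = 16
b = np.arange(n, dtype=np.float32)
b_split = b.reshape(n // 4, 4)   # b[i] corresponds to b_split[i // 4, i % 4]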
Code example #10
File: test_transform.py Project: cmsquared/loopy
def test_alias_temporaries(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        times2(i) := 2*a[i]
        times3(i) := 3*a[i]
        times4(i) := 4*a[i]

        x[i] = times2(i)
        y[i] = times3(i)
        z[i] = times4(i)
        """)

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")

    knl = lp.precompute(knl, "times2", "i_inner")
    knl = lp.precompute(knl, "times3", "i_inner")
    knl = lp.precompute(knl, "times4", "i_inner")

    knl = lp.alias_temporaries(knl, ["times2_0", "times3_0", "times4_0"])

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(n=30))
Code example #11
File: test_statistics.py Project: rckirby/loopy
def test_op_counter_bitwise():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(
                a=np.int32, b=np.int32,
                g=np.int64, h=np.int64))

    poly = get_op_poly(knl)

    n = 10
    m = 10
    l = 10
    param_values = {'n': n, 'm': m, 'l': l}
    i32 = poly.dict[np.dtype(np.int32)].eval_with_dict(param_values)
    i64 = poly.dict[np.dtype(np.int64)].eval_with_dict(param_values)
    not_there = poly[np.dtype(np.float64)].eval_with_dict(param_values)
    print(poly.dict)
    assert i32 == n*m + n*m*l
    assert i64 == 2*n*m
    assert not_there == 0
Code example #12
def build_loopy_kernel_A_text():
    knl_name = "kernel_tensor_A"

    knl = lp.make_kernel("{ [i,j,k]: 0<=i,j<n and 0<=k<m }",
                         """
            A[i,j] = c*sum(k, B[k,i]*B[k,j])
        """,
                         name=knl_name,
                         assumptions="n >= 1 and m >= 1",
                         lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
                         target=lp.CTarget())

    knl = lp.add_and_infer_dtypes(
        knl, {
            "A": np.dtype(np.double),
            "B": np.dtype(np.double),
            "c": np.dtype(np.double)
        })
    knl = lp.fix_parameters(knl, n=3, m=2)
    knl = lp.prioritize_loops(knl, "i,j")
    #print(knl)

    knl_c, knl_h = lp.generate_code_v2(knl).device_code(), str(
        lp.generate_header(knl)[0])

    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(knl_c, replacements)
    knl_h = utils.replace_strings(knl_h, replacements)

    knl_call = "kernel_tensor_A(A, &B[0][0], 1.0/(2.0*Ae));"

    return knl_name, knl_call, knl_c, knl_h
Code example #13
File: tests.py Project: umarbrowser/tvb-hpc
 def test_sparse_matmul(self):
     "Tests how to do sparse indexing w/ loop."
     target = NumbaTarget()
     knl = lp.make_kernel(
         [
             '{[i]: 0   <= i <   n}',
             # note loop bounded by jlo jhi
             '{[j]: jlo <= j < jhi}'
         ],
         # which are set as instructions
         """
         <> jlo = row[i]
         <> jhi = row[i + 1]
         out[i] = sum(j, dat[j] * vec[col[j]])
         """,
         'n nnz row col dat vec out'.split(),
         target=target)
     knl = lp.add_and_infer_dtypes(knl, {
         'out,dat,vec': np.float32,
         'col,row,n,nnz': np.uintc,
     })
     # col and dat have uninferrable shape
     knl.args[3].shape = pm.var('nnz'),
     knl.args[4].shape = pm.var('nnz'),
     from scipy.sparse import csr_matrix
     n = 64
     mat = csr_matrix(np.ones((64, 64)) * (np.random.rand(64, 64) < 0.1))
     row = mat.indptr.astype(np.uintc)
     col = mat.indices.astype(np.uintc)
     dat = mat.data.astype(np.float32)
     out, vec = np.random.rand(2, n).astype(np.float32)
     nnz = mat.nnz
     knl(n, nnz, row, col, dat, vec, out)
     np.testing.assert_allclose(out, mat * vec, 1e-5, 1e-6)
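
A plain NumPy/SciPy sketch (not part of the test; variable names follow the CSR convention above) of the loop the kernel expresses: row[i] and row[i+1], i.e. the indptr entries, bound the nonzeros of row i.

import numpy as np
from scipy.sparse import csr_matrix

mat = csr_matrix(np.ones((8, 8)) * (np.random.rand(8, 8) < 0.5))
vec = np.random.rand(8).astype(np.float32)
out = np.zeros(8, dtype=np.float32)

for i in range(mat.shape[0]):
    jlo, jhi = mat.indptr[i], mat.indptr[i + 1]   # bounds of row i's nonzeros
    out[i] = sum(mat.data[j] * vec[mat.indices[j]] for j in range(jlo, jhi))

np.testing.assert_allclose(out, mat @ vec, rtol=1e-5, atol=1e-6)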
Code example #14
File: test_statistics.py Project: cmsquared/loopy
def test_barrier_counter_barriers():

    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}",
            [
                """
            c[i,j,k] = 2*a[i,j,k] {id=first}
            e[i,j,k] = c[i,j,k+1]+c[i,j,k-1] {dep=first}
            """
            ], [
                lp.TemporaryVariable("c", lp.auto, shape=(50, 10, 99)),
                "..."
            ],
            name="weird2",
            )
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
    knl = lp.split_iname(knl, "k", 128, outer_tag="g.0", inner_tag="l.0")
    poly = lp.get_synchronization_poly(knl)
    print(poly)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    barrier_count = poly["barrier_local"].eval_with_dict(params)
    assert barrier_count == 50*10*2
Code example #15
def test_precompute_confusing_subst_arguments(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        D(i):=a[i+1]-a[i]
        b[i,j] = D(j)
        """)

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    ref_knl = knl

    knl = lp.tag_inames(knl, dict(j="g.1"))
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
    knl = lp.precompute(knl, "D", sweep_inames="j",
            precompute_outer_inames="j, i_inner, i_outer")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(n=12345))
Code example #16
def test_precompute_with_preexisting_inames(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })

    knl = lp.fix_parameters(knl, n=13)

    ref_knl = knl

    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")
    knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
            precompute_inames="ii,jj")

    knl = lp.prioritize_loops(knl, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(E=200))
Code example #17
File: test_apps.py Project: inducer/loopy
def test_fd_demo():
    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \
                + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
                + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]")
    #assumptions="n mod 16=0")
    knl = lp.split_iname(knl,
            "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl,
            "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.add_prefetch(knl, "u",
            ["i_inner", "j_inner"],
            fetch_bounding_box=True,
            default_tag="l.auto")

    #n = 1000
    #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32)

    knl = lp.set_options(knl, write_cl=True)
    knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32))
    code, inf = lp.generate_code(knl)
    print(code)

    assert "double" not in code
Code example #18
def test_op_counter_triangular_domain():

    knl = lp.make_kernel("{[i,j]: 0<=i<n and 0<=j<m and i<j}",
                         """
            a[i, j] = b[i,j] * 2
            """,
                         name="bitwise",
                         assumptions="n,m >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float64))

    expect_fallback = False
    import islpy as isl
    try:
        isl.BasicSet.card
    except AttributeError:
        expect_fallback = True
    else:
        expect_fallback = False

    op_map = lp.get_op_map(knl, subgroup_size=SGS,
                           count_redundant_work=True)[lp.Op(
                               np.float64, 'mul', CG.SUBGROUP)]
    value_dict = dict(m=13, n=200)
    flops = op_map.eval_with_dict(value_dict)

    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups * subgroups_per_group

    if expect_fallback:
        assert flops == 144 * n_subgroups
    else:
        assert flops == 78 * n_subgroups
Code example #19
File: test_statistics.py Project: shigh/loopy
def test_op_counter_basic():

    knl = lp.make_kernel("[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
                         [
                             """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
                         ],
                         name="basic",
                         assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = lp.get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
    f64mul = poly[(np.dtype(np.float64), 'mul')].eval_with_dict(params)
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    assert f32add == f32mul == f32div == n * m * l
    assert f64mul == n * m
    assert i32add == n * m * 2
Code example #20
File: test_statistics.py Project: shigh/loopy
def test_gmem_access_counter_consec():

    knl = lp.make_kernel("[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
                         [
                             """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
            e[i, k] = g[i,k]*(2+h[i,k])
            """
                         ],
                         name="consec",
                         assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})

    poly = lp.get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    f64consec = poly[(np.dtype(np.float64), 'consecutive',
                      'load')].eval_with_dict(params)
    f32consec = poly[(np.dtype(np.float32), 'consecutive',
                      'load')].eval_with_dict(params)
    assert f64consec == 2 * n * m
    assert f32consec == 3 * n * m * l

    f64consec = poly[(np.dtype(np.float64), 'consecutive',
                      'store')].eval_with_dict(params)
    f32consec = poly[(np.dtype(np.float32), 'consecutive',
                      'store')].eval_with_dict(params)
    assert f64consec == n * m
    assert f32consec == n * m * l
Code example #21
File: test_statistics.py Project: shigh/loopy
def test_gmem_access_counter_logic():

    knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [
        """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
    ],
                         name="logic",
                         assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    poly = lp.get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32 = poly[(np.dtype(np.float32), 'uniform',
                'load')].eval_with_dict(params)
    f64 = poly[(np.dtype(np.float64), 'uniform',
                'load')].eval_with_dict(params)
    assert f32 == 2 * n * m
    assert f64 == n * m

    f64 = poly[(np.dtype(np.float64), 'uniform',
                'store')].eval_with_dict(params)
    assert f64 == n * m
Code example #22
File: test_statistics.py Project: shigh/loopy
def test_gmem_access_counter_specialops():

    knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [
        """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])
                """
    ],
                         name="specialops",
                         assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = lp.get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32 = poly[(np.dtype(np.float32), 'uniform',
                'load')].eval_with_dict(params)
    f64 = poly[(np.dtype(np.float64), 'uniform',
                'load')].eval_with_dict(params)
    assert f32 == 2 * n * m * l
    assert f64 == 2 * n * m

    f32 = poly[(np.dtype(np.float32), 'uniform',
                'store')].eval_with_dict(params)
    f64 = poly[(np.dtype(np.float64), 'uniform',
                'store')].eval_with_dict(params)
    assert f32 == n * m * l
    assert f64 == n * m
Code example #23
File: test_statistics.py Project: shigh/loopy
def test_gmem_access_counter_basic():

    knl = lp.make_kernel("[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
                         [
                             """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k] = g[i,k]*h[i,k+1]
                """
                         ],
                         name="basic",
                         assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = lp.get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32 = poly[(np.dtype(np.float32), 'uniform',
                'load')].eval_with_dict(params)
    f64 = poly[(np.dtype(np.float64), 'uniform',
                'load')].eval_with_dict(params)
    assert f32 == 3 * n * m * l
    assert f64 == 2 * n * m

    f32 = poly[(np.dtype(np.float32), 'uniform',
                'store')].eval_with_dict(params)
    f64 = poly[(np.dtype(np.float64), 'uniform',
                'store')].eval_with_dict(params)
    assert f32 == n * m * l
    assert f64 == n * m
Code example #24
File: test_statistics.py Project: shigh/loopy
def test_op_counter_bitwise():

    knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [
        """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
    ],
                         name="bitwise",
                         assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.int32, b=np.int32, g=np.int64, h=np.int64))

    poly = lp.get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    i32bw = poly[(np.dtype(np.int32), 'bw')].eval_with_dict(params)
    i64bw = poly[(np.dtype(np.int64), 'bw')].eval_with_dict(params)
    i64mul = poly[(np.dtype(np.int64), 'mul')].eval_with_dict(params)
    i64add = poly[(np.dtype(np.int64), 'add')].eval_with_dict(params)
    i64shift = poly[(np.dtype(np.int64), 'shift')].eval_with_dict(params)
    assert i32add == n * m + n * m * l
    assert i32bw == 2 * n * m * l
    assert i64bw == 2 * n * m
    assert i64add == i64mul == n * m
    assert i64shift == 2 * n * m
Code example #25
File: test_statistics.py Project: shigh/loopy
def test_op_counter_specialops():

    knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}", [
        """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])+rsqrt(g[i,k])*sin(g[i,k])
                """
    ],
                         name="specialops",
                         assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = lp.get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
    f64pow = poly[(np.dtype(np.float64), 'pow')].eval_with_dict(params)
    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    f64rsqrt = poly[(np.dtype(np.float64),
                     'func:rsqrt')].eval_with_dict(params)
    f64sin = poly[(np.dtype(np.float64), 'func:sin')].eval_with_dict(params)
    assert f32div == 2 * n * m * l
    assert f32mul == f32add == n * m * l
    assert f64add == 3 * n * m
    assert f64pow == i32add == f64rsqrt == f64sin == n * m
Code example #26
def test_fd_demo():
    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \
                + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
                + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]")
    #assumptions="n mod 16=0")
    knl = lp.split_iname(knl,
            "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl,
            "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.add_prefetch(knl, "u",
            ["i_inner", "j_inner"],
            fetch_bounding_box=True,
            default_tag="l.auto")

    #n = 1000
    #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32)

    knl = lp.set_options(knl, write_cl=True)
    knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32))
    code, inf = lp.generate_code(knl)
    print(code)

    assert "double" not in code
Code example #27
def test_alias_temporaries(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}", """
        times2(i) := 2*a[i]
        times3(i) := 3*a[i]
        times4(i) := 4*a[i]

        x[i] = times2(i)
        y[i] = times3(i)
        z[i] = times4(i)
        """)

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")

    knl = lp.precompute(knl, "times2", "i_inner")
    knl = lp.precompute(knl, "times3", "i_inner")
    knl = lp.precompute(knl, "times4", "i_inner")

    knl = lp.alias_temporaries(knl, ["times2_0", "times3_0", "times4_0"])

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=30))
Code example #28
File: test_loopy.py Project: dokempf/loopy
def test_global_temporary(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n}", """
            <> c[i] = a[i + 1]
            out[i] = c[i]
            """)

    knl = lp.add_and_infer_dtypes(knl, {
        "a": np.float32,
        "c": np.float32,
        "out": np.float32,
        "n": np.int32
    })
    knl = lp.set_temporary_scope(knl, "c", "global")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    #print(cgr.device_code())
    #print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
Code example #29
File: test_statistics.py Project: navjotk/loopy
def test_gmem_access_counter_bitwise():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(
                a=np.int32, b=np.int32,
                g=np.int32, h=np.int32))

    poly = get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    i32 = poly[
                    (np.dtype(np.int32), 'uniform', 'load')
                    ].eval_with_dict(params)
    assert i32 == 4*n*m+2*n*m*l

    i32 = poly[
                    (np.dtype(np.int32), 'uniform', 'store')
                    ].eval_with_dict(params)
    assert i32 == n*m+n*m*l
Code example #30
File: test_statistics.py Project: navjotk/loopy
def test_gmem_access_counter_logic():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    poly = get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32 = poly[
                    (np.dtype(np.float32), 'uniform', 'load')
                    ].eval_with_dict(params)
    f64 = poly[
                    (np.dtype(np.float64), 'uniform', 'load')
                    ].eval_with_dict(params)
    assert f32 == 2*n*m
    assert f64 == n*m

    f64 = poly[
                    (np.dtype(np.float64), 'uniform', 'store')
                    ].eval_with_dict(params)
    assert f64 == n*m
Code example #31
File: test_target.py Project: inducer/loopy
def test_ispc_streaming_stores():
    stream_dtype = np.float32
    index_dtype = np.int32

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "a[i] = b[i] + scalar * c[i]",
            target=lp.ISPCTarget(), index_dtype=index_dtype,
            name="stream_triad")

    vars = ["a", "b", "c", "scalar"]
    knl = lp.assume(knl, "n>0")
    knl = lp.split_iname(
        knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")
    knl = lp.tag_instructions(knl, "!streaming_store")

    knl = lp.add_and_infer_dtypes(knl, {
        var: stream_dtype
        for var in vars
        })

    knl = lp.set_argument_order(knl, vars + ["n"])

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    lp.generate_code_v2(knl).all_code()
Code example #32
File: test_statistics.py Project: mmmika/loopy
def test_op_counter_basic():

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl,
                                  dict(a=np.float32, b=np.float32,
                                       g=np.float64, h=np.float64))
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
    f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params)
    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
                    ].eval_with_dict(params)
    # (count-per-sub-group)*n_subgroups
    assert f32add == f32mul == f32div == n*m*ell*n_subgroups
    assert f64mul == n*m*n_subgroups
    assert i32add == n*m*2*n_subgroups
Code example #33
File: test_statistics.py Project: navjotk/loopy
def test_op_counter_triangular_domain():

    knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<m and i<j}",
            """
            a[i, j] = b[i,j] * 2
            """,
            name="bitwise", assumptions="n,m >= 1")

    knl = lp.add_and_infer_dtypes(knl,
            dict(b=np.float64))

    expect_fallback = False
    import islpy as isl
    try:
        isl.BasicSet.card
    except AttributeError:
        expect_fallback = True
    else:
        expect_fallback = False

    poly = get_op_poly(knl)[(np.dtype(np.float64), 'mul')]
    value_dict = dict(m=13, n=200)
    flops = poly.eval_with_dict(value_dict)

    if expect_fallback:
        assert flops == 144
    else:
        assert flops == 78
Code example #34
File: test_statistics.py Project: shwina/loopy
def test_op_counter_basic():

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl,
                                  dict(a=np.float32, b=np.float32,
                                       g=np.float64, h=np.float64))
    op_map = lp.get_op_map(knl, count_redundant_work=True)
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params)
    f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params)
    f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params)
    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.WORKITEM)
                    ].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM)
                    ].eval_with_dict(params)
    assert f32add == f32mul == f32div == n*m*ell
    assert f64mul == n*m
    assert i32add == n*m*2
Code example #35
 def add_types(knl):
     return lp.add_and_infer_dtypes(knl, dict(
         w=np.float32,
         J=np.float32,
         DPsi=np.float32,
         DFinv=np.float32,
         ))
Code example #36
def test_vectorize(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}", """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
    knl = lp.set_array_dim_names(knl, "a,b", "i")
    knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)],
                             4,
                             split_kwargs=dict(slabs=(0, 1)))

    knl = lp.tag_data_axes(knl, "a,b", "c,vec")
    ref_knl = knl
    ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})

    knl = lp.tag_inames(knl, {"i_inner": "vec"})

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=30))
Code example #37
def test_precompute_nested_subst(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}", """
        E:=a[i]
        D:=E*E
        b[i] = D
        """)

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    ref_knl = knl

    knl = lp.tag_inames(knl, dict(j="g.1"))
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
    knl = lp.precompute(knl, "D", "i_inner")

    # There's only one surviving 'E' rule.
    assert len([
        rule_name for rule_name in knl.substitutions
        if rule_name.startswith("E")
    ]) == 1

    # That rule should use the newly created prefetch inames,
    # not the prior 'i_inner'
    assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=12345))
Code example #38
File: test_loopy.py Project: dokempf/loopy
def test_kernel_splitting_with_loop(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i,k]: 0<=i<n and 0<=k<3 }",
            """
            c[k,i] = a[k, i + 1]
            out[k,i] = c[k,i]
            """)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    knl = preprocess_kernel(knl)

    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(knl)

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
Code example #39
File: test_loopy.py Project: dokempf/loopy
def test_global_temporary(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i]: 0<=i<n}",
            """
            <> c[i] = a[i + 1]
            out[i] = c[i]
            """)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
    knl = lp.set_temporary_scope(knl, "c", "global")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    #print(cgr.device_code())
    #print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
Code example #40
def test_precompute_with_preexisting_inames_fail():
    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j<n and 0<=k<2*n}", """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
    })

    knl = lp.fix_parameters(knl, n=13)

    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    knl = lp.precompute(knl,
                        "D1_subst",
                        "i,j",
                        default_tag="for",
                        precompute_inames="ii,jj")
    with pytest.raises(lp.LoopyError):
        lp.precompute(knl,
                      "D2_subst",
                      "i,k",
                      default_tag="for",
                      precompute_inames="ii,jj")
Code example #41
File: test_reduction.py Project: cmsquared/loopy
 def add_types(knl):
     return lp.add_and_infer_dtypes(knl, dict(
         w=np.float32,
         J=np.float32,
         DPsi=np.float32,
         DFinv=np.float32,
         ))
Code example #42
File: test_statistics.py Project: mmmika/loopy
def test_barrier_counter_barriers():

    knl = lp.make_kernel(
            "[n,m,ell] -> {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}",
            [
                """
            c[i,j,k] = 2*a[i,j,k] {id=first}
            e[i,j,k] = c[i,j,k+1]+c[i,j,k-1] {dep=first}
            """
            ], [
                lp.TemporaryVariable("c", lp.auto, shape=(50, 10, 99)),
                "..."
            ],
            name="weird2",
            )
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.int32))
    knl = lp.split_iname(knl, "k", 128, inner_tag="l.0")
    sync_map = lp.get_synchronization_map(knl)
    print(sync_map)
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    barrier_count = sync_map["barrier_local"].eval_with_dict(params)
    assert barrier_count == 50*10*2
Code example #43
File: test_statistics.py Project: navjotk/loopy
def test_op_counter_logic():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    poly = get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
    f64div = poly[(np.dtype(np.float64), 'div')].eval_with_dict(params)
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    assert f32mul == n*m
    assert f64div == 2*n*m  # TODO why?
    assert f64add == n*m
    assert i32add == n*m
Code example #44
File: test_statistics.py Project: mmmika/loopy
def test_op_counter_reduction():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul_serial", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP)
                    ].eval_with_dict(params)
    # (count-per-sub-group)*n_subgroups
    assert f32add == f32mul == n*m*ell*n_subgroups

    op_map_dtype = op_map.group_by('dtype')
    f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
    assert f32 == f32add + f32mul
Code example #45
File: test_statistics.py Project: shwina/loopy
def test_op_counter_triangular_domain():

    knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<m and i<j}",
            """
            a[i, j] = b[i,j] * 2
            """,
            name="bitwise", assumptions="n,m >= 1")

    knl = lp.add_and_infer_dtypes(knl,
            dict(b=np.float64))

    expect_fallback = False
    import islpy as isl
    try:
        isl.BasicSet.card
    except AttributeError:
        expect_fallback = True
    else:
        expect_fallback = False

    op_map = lp.get_op_map(
                    knl,
                    count_redundant_work=True
                    )[lp.Op(np.float64, 'mul', CG.WORKITEM)]
    value_dict = dict(m=13, n=200)
    flops = op_map.eval_with_dict(value_dict)

    if expect_fallback:
        assert flops == 144
    else:
        assert flops == 78
Code example #46
File: test_statistics.py Project: navjotk/loopy
def test_op_counter_specialops():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])
                """
            ],
            name="specialops", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl,
                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
    f64pow = poly[(np.dtype(np.float64), 'pow')].eval_with_dict(params)
    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    assert f32div == 2*n*m*l
    assert f32mul == f32add == n*m*l
    assert f64add == 2*n*m
    assert f64pow == i32add == n*m
Code example #47
File: test_statistics.py Project: shwina/loopy
def test_op_counter_logic():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                e[i,k] = if(
                        not(k<ell-2) and k>6 or k/2==ell,
                        g[i,k]*2,
                        g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    op_map = lp.get_op_map(knl, count_redundant_work=True)
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params)
    f64add = op_map[lp.Op(np.float64, 'add', CG.WORKITEM)].eval_with_dict(params)
    f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.WORKITEM)
                    ].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM)
                    ].eval_with_dict(params)
    assert f32mul == n*m
    assert f64div == 2*n*m  # TODO why?
    assert f64add == n*m
    assert i32add == n*m
Code example #48
File: test_statistics.py Project: navjotk/loopy
def test_op_counter_bitwise():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(
                a=np.int32, b=np.int32,
                g=np.int64, h=np.int64))

    poly = get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    i32bw = poly[(np.dtype(np.int32), 'bw')].eval_with_dict(params)
    i64bw = poly[(np.dtype(np.int64), 'bw')].eval_with_dict(params)
    i64mul = poly[(np.dtype(np.int64), 'mul')].eval_with_dict(params)
    i64add = poly[(np.dtype(np.int64), 'add')].eval_with_dict(params)
    i64shift = poly[(np.dtype(np.int64), 'shift')].eval_with_dict(params)
    assert i32add == n*m+n*m*l
    assert i32bw == 2*n*m*l
    assert i64bw == 2*n*m
    assert i64add == i64mul == n*m
    assert i64shift == 2*n*m
Code example #49
File: test_statistics.py Project: mmmika/loopy
def test_op_counter_logic():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            [
                """
                e[i,k] = if(
                        not(k<ell-2) and k>6 or k/2==ell,
                        g[i,k]*2,
                        g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups*subgroups_per_group
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
    f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(params)
    f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP)
                    ].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
                    ].eval_with_dict(params)
    # (count-per-sub-group)*n_subgroups
    assert f32mul == n*m*n_subgroups
    assert f64div == 2*n*m*n_subgroups  # TODO why?
    assert f64add == n*m*n_subgroups
    assert i32add == n*m*n_subgroups
Code example #50
File: test_statistics.py Project: navjotk/loopy
def test_op_counter_basic():

    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl,
                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
    f64mul = poly[(np.dtype(np.float64), 'mul')].eval_with_dict(params)
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    assert f32add == f32mul == f32div == n*m*l
    assert f64mul == n*m
    assert i32add == n*m*2
Code example #51
File: test_target.py Project: inducer/loopy
def test_numba_target():
    knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
        "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
        target=lp.NumbaTarget())

    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})

    print(lp.generate_code_v2(knl).device_code())
Code example #52
File: test_statistics.py Project: cmsquared/loopy
def test_all_counters_parallel_matmul():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")

    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    sync_poly = lp.get_synchronization_poly(knl)
    assert len(sync_poly) == 1
    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1

    op_map = lp.get_op_poly(knl)
    f32mul = op_map[
                        (np.dtype(np.float32), 'mul')
                        ].eval_with_dict(params)
    f32add = op_map[
                        (np.dtype(np.float32), 'add')
                        ].eval_with_dict(params)
    i32ops = op_map[
                        (np.dtype(np.int32), 'add')
                        ].eval_with_dict(params)
    i32ops += op_map[
                        (np.dtype(np.int32), 'mul')
                        ].eval_with_dict(params)

    assert f32mul+f32add == n*m*l*2
    assert i32ops == n*m*l*4 + l*n*4

    subscript_map = lp.get_gmem_access_poly(knl)
    f32uncoal = subscript_map[
                        (np.dtype(np.float32), 'nonconsecutive', 'load')
                        ].eval_with_dict(params)
    f32coal = subscript_map[
                        (np.dtype(np.float32), 'consecutive', 'load')
                        ].eval_with_dict(params)

    assert f32uncoal == n*m*l
    assert f32coal == n*m*l

    f32coal = subscript_map[
                        (np.dtype(np.float32), 'consecutive', 'store')
                        ].eval_with_dict(params)

    assert f32coal == n*l
Code example #53
def test_reg_counter_reduction():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
            ],
            name="matmul_serial", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    regs = estimate_regs_per_thread(knl)
    assert regs == 6
Code example #54
File: test_loopy.py Project: dokempf/loopy
def test_auto_test_can_detect_problems(ctx_factory):
    ctx = ctx_factory()

    ref_knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        """
        a[i,j] = 25
        """)

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        a[i,i] = 25
        """)

    ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(a=np.float32))
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    from loopy.diagnostic import AutomaticTestFailure
    with pytest.raises(AutomaticTestFailure):
        lp.auto_test_vs_ref(
                ref_knl, ctx, knl,
                parameters=dict(n=123))
Code example #55
File: test_statistics.py Project: cmsquared/loopy
def test_gather_access_footprint():
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i,j,k<n}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j]) + a[i,j]"
            ],
            name="matmul", assumptions="n >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))

    from loopy.statistics import gather_access_footprints, count
    fp = gather_access_footprints(knl)

    for key, footprint in six.iteritems(fp):
        print(key, count(knl, footprint))
Code example #56
File: test_statistics.py Project: cmsquared/loopy
def test_gather_access_footprint_2():
    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "c[2*i] = a[i]",
            name="matmul", assumptions="n >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    from loopy.statistics import gather_access_footprints, count
    fp = gather_access_footprints(knl)

    params = {"n": 200}
    for key, footprint in six.iteritems(fp):
        assert count(knl, footprint).eval_with_dict(params) == 200
        print(key, count(knl, footprint))
Code example #57
def test_reg_counter_logic():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    regs = estimate_regs_per_thread(knl)
    assert regs == 6
Code example #58
def transform(knl, vars, stream_dtype):
    vars = [v.strip() for v in vars.split(",")]
    knl = lp.assume(knl, "n>0")
    knl = lp.split_iname(
        knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")

    knl = lp.add_and_infer_dtypes(knl, {
        var: stream_dtype
        for var in vars
        })

    knl = lp.set_argument_order(knl, vars + ["n"])

    return knl
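
A possible usage sketch for the helper above (hypothetical, mirroring the stream-triad kernel of code example #31; the kernel body and argument names are assumptions carried over from that example):

knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "a[i] = b[i] + scalar * c[i]",
        name="stream_triad")
knl = transform(knl, "a,b,c,scalar", np.float32)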
Code example #59
File: test_target.py Project: inducer/loopy
def test_cuda_short_vector():
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        target=lp.CudaTarget())

    knl = lp.set_options(knl, write_code=True)
    knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
    knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
    knl = lp.tag_array_axes(knl, "a,out", "C,vec")

    knl = lp.set_options(knl, write_wrapper=True)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    print(lp.generate_code_v2(knl).device_code())
Code example #60
File: test_loopy.py Project: dokempf/loopy
def test_variable_size_temporary():
    knl = lp.make_kernel(
         ''' { [i,j]: 0<=i,j<n } ''',
         ''' out[i] = sum(j, a[i,j])''')

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    knl = lp.add_prefetch(
            knl, "a[:,:]", default_tag=None)

    # Make sure that code generation succeeds even if
    # there are variable-length arrays.
    knl = lp.preprocess_kernel(knl)
    for k in lp.generate_loop_schedules(knl):
        lp.generate_code(k)