Example #1
def test_to_batched_temp(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
         ''' { [i,j]: 0<=i,j<n } ''',
         ''' cnst = 2.0
         out[i] = sum(j, cnst*a[i,j]*x[j])''',
         [lp.TemporaryVariable(
             "cnst",
             dtype=np.float32,
             shape=(),
             scope=lp.temp_var_scope.PRIVATE), '...'])
    knl = lp.add_and_infer_dtypes(knl, dict(out=np.float32,
                                            x=np.float32,
                                            a=np.float32))
    ref_knl = lp.make_kernel(
         ''' { [i,j]: 0<=i,j<n } ''',
         '''out[i] = sum(j, 2.0*a[i,j]*x[j])''')
    ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(out=np.float32,
                                                    x=np.float32,
                                                    a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")
    bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")

    # check that cnst is not being batched
    assert bknl.temporary_variables['cnst'].shape == ()

    a = np.random.randn(5, 5)
    x = np.random.randn(7, 5)

    # Checking that the program compiles and the logic is correct
    lp.auto_test_vs_ref(
            bref_knl, ctx, bknl,
            parameters=dict(a=a, x=x, n=5, nbatches=7))
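Note: the snippets in this collection are shown without their surrounding modules. They assume the usual loopy/PyOpenCL test scaffolding at module level; a minimal sketch of those imports (an assumption about the original files, not part of any one example):

import logging

import numpy as np
import pytest

import pyopencl as cl
import pyopencl.array as cl_array  # provides cl.array and the cl_array.vec types
import pyopencl.clmath  # noqa: F401 -- makes cl.clmath.sin(...) available

import loopy as lp
from loopy.version import MOST_RECENT_LANGUAGE_VERSION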
Example #2
def test_to_batched(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
         ''' { [i,j]: 0<=i,j<n } ''',
         ''' out[i] = sum(j, a[i,j]*x[j])''')
    knl = lp.add_and_infer_dtypes(knl, dict(out=np.float32,
                                            x=np.float32,
                                            a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")

    ref_knl = lp.make_kernel(
         ''' { [i,j,k]: 0<=i,j<n and 0<=k<nbatches} ''',
         '''out[k, i] = sum(j, a[i,j]*x[k, j])''')
    ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(out=np.float32,
                                                    x=np.float32,
                                                    a=np.float32))

    a = np.random.randn(5, 5).astype(np.float32)
    x = np.random.randn(7, 5).astype(np.float32)

    # Run both kernels
    evt, (out1, ) = bknl(queue, a=a, x=x, n=5, nbatches=7)
    evt, (out2, ) = ref_knl(queue, a=a, x=x, n=5, nbatches=7)

    # check that the outputs are the same
    assert np.linalg.norm(out1-out2) < 1e-15
Example #3
def test_unschedulable_kernel_detection():
    knl = lp.make_kernel(["{[i,j]:0<=i,j<n}"],
                         """
                         mat1[i,j] = mat1[i,j] + 1 {inames=i:j, id=i1}
                         mat2[j] = mat2[j] + 1 {inames=j, id=i2}
                         mat3[i] = mat3[i] + 1 {inames=i, id=i3}
                         """)

    knl = lp.preprocess_kernel(knl)

    # Check that loopy can detect the unschedulability of the kernel
    assert lp.needs_iname_duplication(knl)
    assert len(list(lp.get_iname_duplication_options(knl))) == 4

    for inames, insns in lp.get_iname_duplication_options(knl):
        fixed_knl = lp.duplicate_inames(knl, inames, insns)
        assert not lp.needs_iname_duplication(fixed_knl)

    knl = lp.make_kernel(["{[i,j,k,l,m]:0<=i,j,k,l,m<n}"],
                         """
                         mat1[l,m,i,j,k] = mat1[l,m,i,j,k] + 1 {inames=i:j:k:l:m}
                         mat2[l,m,j,k] = mat2[l,m,j,k] + 1 {inames=j:k:l:m}
                         mat3[l,m,k] = mat3[l,m,k] + 11 {inames=k:l:m}
                         mat4[l,m,i] = mat4[l,m,i] + 1 {inames=i:l:m}
                         """)

    assert lp.needs_iname_duplication(knl)
    assert len(list(lp.get_iname_duplication_options(knl))) == 10
Example #4
def test_arg_shape_uses_assumptions(ctx_factory):
    # If arg shape determination does not use assumptions, then it won't find a
    # static shape for out, which is at least 1 x 1 in size, but otherwise of
    # size n x n.

    lp.make_kernel(
            "{ [i,j]: 0<=i,j<n }",
            """
            out[i,j] = 2*a[i,j]
            out[0,0] = 13.0
            """, assumptions="n>=1")
Example #5
def test_fusion():
    exp_kernel = lp.make_kernel(
         ''' { [i]: 0<=i<n } ''',
         ''' exp[i] = pow(E, z[i])''',
         assumptions="n>0")

    sum_kernel = lp.make_kernel(
        '{ [j]: 0<=j<n }',
        'out2 = sum(j, exp[j])',
        assumptions='n>0')

    knl = lp.fuse_kernels([exp_kernel, sum_kernel])

    print(knl)
Example #6
def test_finite_difference_expr_subst(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    grid = np.linspace(0, 2*np.pi, 2048, endpoint=False)
    h = grid[1] - grid[0]
    u = cl.clmath.sin(cl.array.to_device(queue, grid))

    fin_diff_knl = lp.make_kernel(
        "{[i]: 1<=i<=n}",
        "out[i] = -(f[i+1] - f[i-1])/h",
        [lp.GlobalArg("out", shape="n+2"), "..."])

    flux_knl = lp.make_kernel(
        "{[j]: 1<=j<=n}",
        "f[j] = u[j]**2/2",
        [
            lp.GlobalArg("f", shape="n+2"),
            lp.GlobalArg("u", shape="n+2"),
            ])

    fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl],
            data_flow=[
                ("f", 1, 0)
                ])

    fused_knl = lp.set_options(fused_knl, write_cl=True)
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused_knl = lp.assignment_to_subst(fused_knl, "f")

    fused_knl = lp.set_options(fused_knl, write_cl=True)

    # This is the real test here: The automatically generated
    # shape expressions are '2+n' and the ones above are 'n+2'.
    # Is loopy smart enough to understand that these are equal?
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")

    gpu_knl = lp.split_iname(
            fused0_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0")

    precomp_knl = lp.precompute(
            gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True)

    precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
    precomp_knl = lp.set_options(precomp_knl, return_dict=True)
    evt, _ = precomp_knl(queue, u=u, h=h)
Example #7
def test_op_counter_logic():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    poly = get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
    f64div = poly[(np.dtype(np.float64), 'div')].eval_with_dict(params)
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    assert f32mul == n*m
    assert f64div == 2*n*m  # TODO why?
    assert f64add == n*m
    assert i32add == n*m
Example #8
def test_tim2d(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
           [
            "ur(a,b) := sum_float32(@o, D[a,o]*u[e,o,b])",
            "us(a,b) := sum_float32(@o, D[b,o]*u[e,a,o])",

            "lap[e,i,j]  = "
            "  sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j) + G[1,e,m,j]*us(m,j)))"
            "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m) + G[2,e,i,m]*us(i,m)))"

            ],
            [
            lp.ArrayArg("u", dtype, shape=field_shape, order=order),
            lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
            lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
#            lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
            lp.ArrayArg("D", dtype, shape=(n, n), order=order),
#            lp.ImageArg("D", dtype, shape=(n, n)),
            lp.ValueArg("K", np.int32, approximately=1000),
            ],
             name="semlap2D", assumptions="K>=1")

    unroll = 32

    seq_knl = knl
    knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "o"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j", "o"], default_tag="l.auto")
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  # , slabs=(0, 1)

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))

    # knl = lp.add_prefetch(knl, "G", [2, 3], default_tag=None)  # axis/argument indices on G
    knl = lp.add_prefetch(knl, "G", [2, 3], default_tag="l.auto")  # axis/argument indices on G

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
Example #9
        def pick_knl():
            import loopy as lp
            knl = lp.make_kernel(
                """{[k,i,j]:
                    0<=k<nelements and
                    0<=i<n_to_nodes}""",
                "result[to_element_indices[k], i] \
                    = vec[from_element_indices[k], pick_list[i]]",
                [
                    lp.GlobalArg("result", None,
                        shape="nelements_result, n_to_nodes",
                        offset=lp.auto),
                    lp.GlobalArg("vec", None,
                        shape="nelements_vec, n_from_nodes",
                        offset=lp.auto),
                    lp.ValueArg("nelements_result", np.int32),
                    lp.ValueArg("nelements_vec", np.int32),
                    lp.ValueArg("n_from_nodes", np.int32),
                    "...",
                    ],
                name="resample_by_picking",
                lang_version=MOST_RECENT_LANGUAGE_VERSION)

            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
            return lp.tag_inames(knl, dict(k="g.0"))
Example #10
def test_gmem_access_counter_logic():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                e[i,k] = if(not(k<l-2) and k>6 or k/2==l, g[i,k]*2, g[i,k]+h[i,k]/2)
                """
            ],
            name="logic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
    poly = get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32 = poly[
                    (np.dtype(np.float32), 'uniform', 'load')
                    ].eval_with_dict(params)
    f64 = poly[
                    (np.dtype(np.float64), 'uniform', 'load')
                    ].eval_with_dict(params)
    assert f32 == 2*n*m
    assert f64 == n*m

    f64 = poly[
                    (np.dtype(np.float64), 'uniform', 'store')
                    ].eval_with_dict(params)
    assert f64 == n*m
Example #11
def test_op_counter_triangular_domain():

    knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<m and i<j}",
            """
            a[i, j] = b[i,j] * 2
            """,
            name="bitwise", assumptions="n,m >= 1")

    knl = lp.add_and_infer_dtypes(knl,
            dict(b=np.float64))

    expect_fallback = False
    import islpy as isl
    try:
        isl.BasicSet.card
    except AttributeError:
        expect_fallback = True
    else:
        expect_fallback = False

    poly = get_op_poly(knl)[(np.dtype(np.float64), 'mul')]
    value_dict = dict(m=13, n=200)
    flops = poly.eval_with_dict(value_dict)

    if expect_fallback:
        assert flops == 144
    else:
        assert flops == 78
Example #12
def test_op_counter_basic():

    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
            ],
            name="basic", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl,
                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
    f64mul = poly[(np.dtype(np.float64), 'mul')].eval_with_dict(params)
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    assert f32add == f32mul == f32div == n*m*l
    assert f64mul == n*m
    assert i32add == n*m*2
Example #13
def get_tensor(ctx):
    order = 'C'
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0], [
        "{[j,i,alpha,k]: 0<=alpha<r and 0<=i,j,k<n}",
    ], [
        "res[i,j,k]=sum((alpha), u[alpha,i]*v[alpha,j]*w[alpha,k])",
    ], [
        lp.GlobalArg("res", dtype, shape="n, n, n", order=order),
        lp.GlobalArg("v", dtype, shape="r, n", order=order),
        lp.GlobalArg("u", dtype, shape="r, n", order=order),
        lp.GlobalArg("w", dtype, shape="r, n", order=order),
        lp.ValueArg("n", np.int32),
        lp.ValueArg("r", np.int32),
    ],
                         assumptions="n>=1")
    knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "alpha", 2)
    knl = lp.split_iname(knl, "k", 8, outer_tag="g.2", inner_tag="l.2")

    return knl
Example #14
def test_troublesome_premagma_fermi_matrix_mul(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 6*16*2

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                ],
            name="matmul")

    seq_knl = knl

    i_reg = 2
    j_reg = 2
    i_chunks = 16
    j_chunks = 16
    knl = lp.split_iname(knl, "i", i_reg*i_chunks, outer_tag="g.0")
    knl = lp.split_iname(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp")
    knl = lp.split_iname(knl, "j", j_reg*j_chunks, outer_tag="g.1")
    knl = lp.split_iname(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp")
    knl = lp.split_iname(knl, "k", 16)
    knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"])

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={})
Example #15
def LU_solver(ctx):
    order = 'C'
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0], [
        "{[l,k,i,j,m]: 0<=l<r and 0<=k<n-1 and k+1<=i<n and 0<=j<n-1 and 0<=m<n-1-j}",
    ], [
        "bcopy[i,l] = bcopy[i,l]-bcopy[k,l]*LU[i,k] {id=lab1}",
        "bcopy[n-1-j,l] = bcopy[n-j-1,l]/LU[n-j-1,n-1-j] {id=l2, dep=lab1}",
        "bcopy[m,l] = bcopy[m,l]-bcopy[n-j-1,l]*LU[m,n-1-j] {id=l3, dep=l2}",
        "bcopy[0,l] = bcopy[0,l]/LU[0,0] {id=l4, dep=l2}",
    ], [
        lp.GlobalArg("LU", dtype, shape="n, n", order=order),
        lp.GlobalArg("bcopy", dtype, shape="n, r", order=order),
        lp.ValueArg("n", np.int64),
        lp.ValueArg("r", np.int64),
    ],
                         assumptions="n>=1")
    knl = lp.split_iname(knl, "k", 1)
    knl = lp.split_iname(knl, "i", 32)
    knl = lp.split_iname(knl, "j", 32)
    knl = lp.split_iname(knl, "l", 32, outer_tag="g.0", inner_tag="l.0")

    #  print knl
    #  print lp.CompiledKernel(ctx, knl).get_highlighted_code()
    return knl
Example #16
def test_small_batched_matvec(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()

    order = "C"

    K = 9997  # noqa
    Np = 36  # noqa

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=k<K and 0<= i,j < %d}" % Np,
            [
                "result[k, i] = sum(j, d[i, j]*f[k, j])"
                ],
            [
                lp.GlobalArg("d", dtype, shape=(Np, Np), order=order),
                lp.GlobalArg("f", dtype, shape=("K", Np), order=order),
                lp.GlobalArg("result", dtype, shape=("K", Np), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ], name="batched_matvec", assumptions="K>=1")

    seq_knl = knl

    align_bytes = 64
    knl = lp.add_prefetch(knl, 'd[:,:]')
    pad_mult = lp.find_padding_multiple(knl, "f", 0, align_bytes)
    knl = lp.split_array_dim(knl, ("f", 0), pad_mult)
    knl = lp.add_padding(knl, "f", 0, align_bytes)

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[K*2*Np**2/1e9], op_label=["GFlops"],
            parameters=dict(K=K))
Example #17
def test_op_counter_bitwise():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(
                a=np.int32, b=np.int32,
                g=np.int64, h=np.int64))

    poly = get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    i32bw = poly[(np.dtype(np.int32), 'bw')].eval_with_dict(params)
    i64bw = poly[(np.dtype(np.int64), 'bw')].eval_with_dict(params)
    i64mul = poly[(np.dtype(np.int64), 'mul')].eval_with_dict(params)
    i64add = poly[(np.dtype(np.int64), 'add')].eval_with_dict(params)
    i64shift = poly[(np.dtype(np.int64), 'shift')].eval_with_dict(params)
    assert i32add == n*m+n*m*l
    assert i32bw == 2*n*m*l
    assert i64bw == 2*n*m
    assert i64add == i64mul == n*m
    assert i64shift == 2*n*m
Example #18
def LU_decomposition(ctx):
    order = 'C'
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0], [
        "{[k,i]: 0<=k<n-1 and k+1<=i<n}",
        "{[j,l]: 0<=k<n-1 and k+1<=j,l<n}",
    ], [
        "syst[i,k] = syst[i,k]/syst[k,k] {id=lab1}",
        "syst[l,j] = syst[l,j] - syst[l,k]*syst[k,j] {dep=lab1}",
    ], [
        lp.GlobalArg("syst", dtype, shape="n, n", order=order),
        lp.ValueArg("n", np.int32),
    ],
                         assumptions="n>=1")
    # note: "n" below is not defined in this function; it is assumed to come
    # from the enclosing module scope
    knl = lp.split_iname(knl, "k", n)
    knl = lp.split_iname(knl, "i", 32)
    knl = lp.split_iname(knl, "j", 32)
    knl = lp.split_iname(knl, "l", 32)

    #  print knl
    #  print lp.CompiledKernel(ctx, knl).get_highlighted_code()
    return knl
Example #19
def test_fancy_matrix_mul(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()

    order = "C"

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "[n] -> {[i,j,k]: 0<=i,j,k<n }",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.GlobalArg("a", dtype, shape="(n, n)", order=order),
                lp.GlobalArg("b", dtype, shape="(n, n)", order=order),
                lp.GlobalArg("c", dtype, shape="(n, n)", order=order),
                lp.ValueArg("n", np.int32, approximately=1000),
                ], name="fancy_matmul", assumptions="n>=1")

    seq_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 16, slabs=(0, 1))
    knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
    knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"])

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters=dict(n=n))
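Several of the matrix-multiply examples call get_suitable_size(ctx), which is defined elsewhere in the test module. A plausible sketch (an assumption; the actual helper may differ) that picks a smaller problem size on CPU devices:

def get_suitable_size(ctx):
    # hypothetical helper: use a small matrix size on CPUs, a larger one otherwise
    dev, = ctx.devices
    if dev.type & cl.device_type.CPU:
        return 160
    else:
        return 1600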
Example #20
def test_variable_size_matrix_mul(ctx_factory):
    ctx = ctx_factory()

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<n}",
            "c[i, j] = sum(k, a[i, k]*b[k, j])")

    knl = lp.add_dtypes(knl, {
        "a": np.float32,
        "b": np.float32,
        })

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1",
            slabs=(0, 1))
    knl = lp.split_iname(knl, "j", 16,
            outer_tag="g.1", inner_tag="l.0",
            slabs=(0, 1))
    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))

    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={"n": n})
Example #21
def test_transpose(ctx_factory):
    dtype = np.dtype(np.float32)
    ctx = ctx_factory()
    order = "C"

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<%d}" % n,
            [
                "b[i, j] = a[j, i]"
                ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                ],
            name="transpose")

    seq_knl = knl

    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16,
            outer_tag="g.1", inner_tag="l.0")
    knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"])

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[dtype.itemsize*n**2*2/1e9], op_label=["GByte"],
            parameters={})
Example #22
def test_plain_matrix_mul(ctx_factory):
    ctx = ctx_factory()
    order = "C"

    n = get_suitable_size(ctx)

    for dtype, check, vec_size in [
            (cl_array.vec.float4, check_float4, 4),
            (np.float32, None, 1),
            ]:
        knl = lp.make_kernel(
                "{[i,j,k]: 0<=i,j,k<%d}" % n,
                [
                    "c[i, j] = sum(k, a[i, k]*b[k, j])"
                    ],
                [
                    lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                    lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                    lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                    ],
                name="matmul")

        ref_knl = knl

        knl = lp.split_iname(knl, "i", 16,
                outer_tag="g.0", inner_tag="l.1")
        knl = lp.split_iname(knl, "j", 16,
                outer_tag="g.1", inner_tag="l.0")
        knl = lp.split_iname(knl, "k", 16)
        knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
        knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ])

        lp.auto_test_vs_ref(ref_knl, ctx, knl,
                op_count=[vec_size*2*n**3/1e9], op_label=["GFlops"],
                parameters={"n": n}, check_result=check)
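The float4 case above passes check_float4 as check_result; that helper is not shown. A hypothetical sketch (an assumption about its behavior) that compares the two float4-valued results component by component:

def check_float4(result, ref_result):
    # hypothetical checker: both outputs use the float4 struct dtype, so
    # compare each component against the reference with a loose tolerance
    for comp in ["x", "y", "z", "w"]:
        if not np.allclose(ref_result[comp], result[comp], rtol=1e-3, atol=1e-3):
            return False
    return True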
Example #23
def Prav_V(ctx):
    order = 'C'
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0], [
        "{[i,j,k,alpha]: 0<=alpha<r and 0<=i,j,k<n}",
    ], [
        "f[alpha,j]=sum((k,i), a[i,j,k]*w[alpha, k]*u[alpha, i])",
    ], [
        lp.GlobalArg("a", dtype, shape="n, n, n", order=order),
        lp.GlobalArg("u", dtype, shape="r, n", order=order),
        lp.GlobalArg("w", dtype, shape="r, n", order=order),
        lp.GlobalArg("f", dtype, shape="r, n", order=order),
        lp.ValueArg("n", np.int64),
        lp.ValueArg("r", np.int64),
    ],
                         assumptions="n>=1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "i", 16)
    knl = lp.split_iname(knl, "k", 16)

    return knl
Example #24
        def kproj():
            import loopy as lp
            knl = lp.make_kernel([
                "{[k]: 0 <= k < nelements}",
                "{[j]: 0 <= j < n_from_nodes}"
                ],
                """
                for k
                    <> element_dot = \
                            sum(j, vec[from_element_indices[k], j] * \
                                   basis[j] * weights[j])

                    result[to_element_indices[k], ibasis] = \
                            result[to_element_indices[k], ibasis] + element_dot
                end
                """,
                [
                    lp.GlobalArg("vec", None,
                        shape=("n_from_elements", "n_from_nodes")),
                    lp.GlobalArg("result", None,
                        shape=("n_to_elements", "n_to_nodes")),
                    lp.GlobalArg("basis", None,
                        shape="n_from_nodes"),
                    lp.GlobalArg("weights", None,
                        shape="n_from_nodes"),
                    lp.ValueArg("n_from_elements", np.int32),
                    lp.ValueArg("n_to_elements", np.int32),
                    lp.ValueArg("n_to_nodes", np.int32),
                    lp.ValueArg("ibasis", np.int32),
                    '...'
                    ],
                name="conn_projection_knl",
                lang_version=MOST_RECENT_LANGUAGE_VERSION)

            return knl
Example #25
def test_sum_factorization():
    knl = lp.make_kernel(
        "{[i,j,ip,jp,k,l]: "
        "0<=i<I and 0<=j<J and 0<=ip<IP and 0<=jp<JP and 0<=k,l<Q}",
        """
        phi1(i, x) := x**i
        phi2(i, x) := x**i
        psi1(i, x) := x**i
        psi2(i, x) := x**i
        a(x, y) := 1

        A[i,j,ip,jp] = sum(k,sum(l,
            phi1(i,x[0,k]) * phi2(j,x[1,l])
            * psi1(ip, x[0,k]) * psi2(jp, x[1, l])
            * w[0,k] * w[1,l]
            * a(x[0,k], x[1,l])
        ))
        """)

    pytest.xfail("extract_subst is currently too stupid for sum factorization")

    knl = lp.extract_subst(knl, "temp_array",
            "phi1(i,x[0,k]) *psi1(ip, x[0,k]) * w[0,k]")
    knl = lp.extract_subst(knl, "temp_array",
            "sum(k, phi1(i,x[0,k]) *psi1(ip, x[0,k]) * w[0,k])")

    print(knl)
Example #26
def test_op_counter_specialops():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (2*a[i,j,k])%(2+b[i,j,k]/3.0)
                e[i, k] = (1+g[i,k])**(1+h[i,k+1])
                """
            ],
            name="specialops", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl,
                        dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    poly = get_op_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32mul = poly[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    f32div = poly[(np.dtype(np.float32), 'div')].eval_with_dict(params)
    f32add = poly[(np.dtype(np.float32), 'add')].eval_with_dict(params)
    f64pow = poly[(np.dtype(np.float64), 'pow')].eval_with_dict(params)
    f64add = poly[(np.dtype(np.float64), 'add')].eval_with_dict(params)
    i32add = poly[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    assert f32div == 2*n*m*l
    assert f32mul == f32add == n*m*l
    assert f64add == 2*n*m
    assert f64pow == i32add == n*m
Example #27
def test_gmem_access_counter_bitwise():

    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = (a[i,j,k] | 1) + (b[i,j,k] & 1)
                e[i, k] = (g[i,k] ^ k)*(~h[i,k+1]) + (g[i, k] << (h[i,k] >> k))
                """
            ],
            name="bitwise", assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(
            knl, dict(
                a=np.int32, b=np.int32,
                g=np.int32, h=np.int32))

    poly = get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    i32 = poly[
                    (np.dtype(np.int32), 'uniform', 'load')
                    ].eval_with_dict(params)
    assert i32 == 4*n*m+2*n*m*l

    i32 = poly[
                    (np.dtype(np.int32), 'uniform', 'store')
                    ].eval_with_dict(params)
    assert i32 == n*m+n*m*l
Example #28
def left_W(ctx):
    order = 'C'
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0], [
        "{[j,i,alpha,alpha1]: 0<=alpha,alpha1<r and 0<=j,i<n}",
    ], [
        "l[alpha,alpha1]=sum((i), u[alpha,i]*u[alpha1,i])*sum((j),v[alpha,j]*v[alpha1,j])",
    ], [
        lp.GlobalArg("v", dtype, shape="r, n", order=order),
        lp.GlobalArg("u", dtype, shape="r, n", order=order),
        lp.GlobalArg("l", dtype, shape="r, r", order=order),
        lp.ValueArg("n", np.int64),
        lp.ValueArg("r", np.int64),
    ],
                         assumptions="n>=1")
    knl = lp.split_iname(knl, "alpha1", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16)
    knl = lp.split_iname(knl, "i", 16)

    return knl
Example #29
def test_forced_iname_deps_and_reduction():
    # See https://github.com/inducer/loopy/issues/24

    # This is (purposefully) somewhat un-idiomatic, to replicate the conditions
    # under which the above bug was found. If assignees were phi[i], then the
    # iname propagation heuristic would not assume that dependent instructions
    # need to run inside of 'i', and hence the forced_iname_* bits below would not
    # be needed.

    i1 = lp.CInstruction("i",
            "doSomethingToGetPhi();",
            assignees="phi")

    from pymbolic.primitives import Subscript, Variable
    i2 = lp.Assignment("a",
            lp.Reduction("sum", "j", Subscript(Variable("phi"), Variable("j"))),
            forced_iname_deps=frozenset(),
            forced_iname_deps_is_final=True)

    k = lp.make_kernel("{[i,j] : 0<=i,j<n}",
            [i1, i2],
            [
                lp.GlobalArg("a", dtype=np.float32, shape=()),
                lp.ValueArg("n", dtype=np.int32),
                lp.TemporaryVariable("phi", dtype=np.float32, shape=("n",)),
                ],
            target=lp.CTarget(),
            )

    k = lp.preprocess_kernel(k)

    assert 'i' not in k.insn_inames("insn_0_j_update")
    print(k.stringify(with_dependencies=True))
Example #30
def test_atomic(ctx_factory, dtype):
    ctx = ctx_factory()

    if (
            np.dtype(dtype).itemsize == 8
            and "cl_khr_int64_base_atomics" not in ctx.devices[0].extensions):
        pytest.skip("64-bit atomics not supported on device")

    import pyopencl.version  # noqa
    if (
            cl.version.VERSION < (2015, 2)
            and dtype == np.int64):
        pytest.skip("int64 RNG not supported in PyOpenCL < 2015.2")

    knl = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            "out[i%20] = out[i%20] + 2*a[i] {atomic}",
            [
                lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
                lp.GlobalArg("a", dtype, shape=lp.auto),
                "..."
                ],
            assumptions="n>0")

    ref_knl = knl
    knl = lp.split_iname(knl, "i", 512)
    knl = lp.split_iname(knl, "i_inner", 128, outer_tag="unr", inner_tag="g.0")
    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000))
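test_atomic takes a dtype argument in addition to ctx_factory; in the original suite this is presumably supplied by a pytest parametrization along these lines (an assumption -- the same pattern applies to the other tests here that take extra arguments such as inline, start, or iname tags):

@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
def test_atomic(ctx_factory, dtype):
    ...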
Example #31
def test_generate_c_snippet():
    from loopy.target.c import CTarget

    from pymbolic import var
    I = var("I")  # noqa
    f = var("f")
    df = var("df")
    q_v = var("q_v")
    eN = var("eN")  # noqa
    k = var("k")
    u = var("u")

    from functools import partial
    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    Instr = lp.Assignment  # noqa

    knl = lp.make_kernel("{[I, k]: 0<=I<nSpace and 0<=k<nQuad}", [
        Instr(f[I], l_sum(k, q_v[k, I] * u)),
        Instr(df[I], l_sum(k, q_v[k, I])),
    ], [
        lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
        lp.GlobalArg("f,df", np.float64, shape="nSpace"),
        lp.ValueArg("u", np.float64),
        "...",
    ],
                         target=CTarget(),
                         assumptions="nQuad>=1")

    if 0:  # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.set_loop_priority(knl, "I,k_outer,k_inner")

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    print(lp.generate_body(knl))
Example #32
def test_funny_shape_matrix_mul(ctx_factory):
    ctx = ctx_factory()

    n = get_suitable_size(ctx)
    m = n + 12
    ell = m + 12

    knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
                         ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
                         name="matmul",
                         assumptions="n,m,ell >= 1")

    knl = lp.add_dtypes(knl, {
        "a": np.float32,
        "b": np.float32,
    })

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)

    #knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    #knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")
    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", default_tag="l.auto")
    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", default_tag="l.auto")

    lp.auto_test_vs_ref(ref_knl,
                        ctx,
                        knl,
                        op_count=[2 * n**3 / 1e9],
                        op_label=["GFlops"],
                        parameters={
                            "n": n,
                            "m": m,
                            "ell": ell
                        })
Example #33
def test_simplify_indices(ctx_factory):
    ctx = ctx_factory()
    twice = lp.make_function("{[i, j]: 0<=i<10 and 0<=j<4}",
                             """
        y[i,j] = 2*x[i,j]
        """,
                             name="zerozerozeroonezeroify")

    knl = lp.make_kernel(
        "{:}", """
        Y[:,:] = zerozerozeroonezeroify(X[:,:])
        """, [lp.GlobalArg("X,Y", shape=(10, 4), dtype=np.float64)])

    class ContainsFloorDiv(lp.symbolic.CombineMapper):
        def combine(self, values):
            return any(values)

        def map_floor_div(self, expr):
            return True

        def map_variable(self, expr):
            return False

        def map_constant(self, expr):
            return False

    knl = lp.merge([knl, twice])
    knl = lp.inline_callable_kernel(knl, "zerozerozeroonezeroify")
    simplified_knl = lp.simplify_indices(knl)
    contains_floordiv = ContainsFloorDiv()

    assert any(
        contains_floordiv(insn.expression)
        for insn in knl.default_entrypoint.instructions
        if isinstance(insn, lp.MultiAssignmentBase))
    assert all(not contains_floordiv(insn.expression)
               for insn in simplified_knl.default_entrypoint.instructions
               if isinstance(insn, lp.MultiAssignmentBase))

    lp.auto_test_vs_ref(knl, ctx, simplified_knl)
Example #34
def test_op_counter_basic():

    knl = lp.make_kernel(
        "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}", [
            """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
                e[i, k+1] = -g[i,k]*h[i,k+1]
                """
        ],
        name="basic",
        assumptions="n,m,ell >= 1")

    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    op_map = lp.get_op_map(knl,
                           subgroup_size=SGS,
                           count_redundant_work=True,
                           count_within_subscripts=True)
    n_workgroups = 1
    group_size = 1
    subgroups_per_group = div_ceil(group_size, SGS)
    n_subgroups = n_workgroups * subgroups_per_group
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f32add = op_map[lp.Op(np.float32, 'add',
                          CG.SUBGROUP)].eval_with_dict(params)
    f32mul = op_map[lp.Op(np.float32, 'mul',
                          CG.SUBGROUP)].eval_with_dict(params)
    f32div = op_map[lp.Op(np.float32, 'div',
                          CG.SUBGROUP)].eval_with_dict(params)
    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul',
                          CG.SUBGROUP)].eval_with_dict(params)
    i32add = op_map[lp.Op(np.dtype(np.int32), 'add',
                          CG.SUBGROUP)].eval_with_dict(params)
    # (count-per-sub-group)*n_subgroups
    assert f32add == f32mul == f32div == n * m * ell * n_subgroups
    assert f64mul == n * m * n_subgroups
    assert i32add == n * m * 2 * n_subgroups
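This variant of test_op_counter_basic relies on SGS, CG, and div_ceil from the surrounding module. A sketch of the assumed definitions (assumptions; the exact subgroup size and imports may differ):

from pytools import div_ceil
from loopy.statistics import CountGranularity as CG

SGS = 32  # subgroup size assumed by the statistics tests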
Example #35
def test_image_matrix_mul(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = get_suitable_size(ctx)

    if (not ctx.devices[0].image_support
            or ctx.devices[0].platform.name == "Portable Computing Language"):
        pytest.skip("crashes on pocl")

    image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT)
    if image_format not in cl.get_supported_image_formats(
            ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D):
        pytest.skip("image format not supported")

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.ImageArg("a", dtype, shape=(n, n)),
                lp.ImageArg("b", dtype, shape=(n, n)),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                ],
            name="matmul")

    seq_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)
    # conflict-free
    knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
    knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"])

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={}, print_ref_code=True)
Example #36
def test_mem_access_counter_reduction():

    knl = lp.make_kernel("{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
                         ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
                         name="matmul",
                         assumptions="n,m,l >= 1")

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    mem_map = lp.get_mem_access_map(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}
    f32l = mem_map[lp.MemAccess('global',
                                np.float32,
                                stride=0,
                                direction='load',
                                variable='a')].eval_with_dict(params)
    f32l += mem_map[lp.MemAccess('global',
                                 np.float32,
                                 stride=0,
                                 direction='load',
                                 variable='b')].eval_with_dict(params)
    assert f32l == 2 * n * m * l

    f32s = mem_map[lp.MemAccess('global',
                                np.dtype(np.float32),
                                stride=0,
                                direction='store',
                                variable='c')].eval_with_dict(params)
    assert f32s == n * l

    ld_bytes = mem_map.filter_by(mtype=['global'],
                                 direction=['load'
                                            ]).to_bytes().eval_and_sum(params)
    st_bytes = mem_map.filter_by(mtype=['global'],
                                 direction=['store'
                                            ]).to_bytes().eval_and_sum(params)
    assert ld_bytes == 4 * f32l
    assert st_bytes == 4 * f32s
Example #37
def get_tensor(ctx):
    order = 'C'
    dtype = np.float64
    knl = lp.make_kernel(ctx.devices[0], [
        "{[j,i,alpha,k]: 0<=alpha<r and 0<=i,j,k<n}",
    ], [
        "res[i,j,k]=sum((alpha), u[i,alpha]*v[j,alpha]*w[k,alpha])",
    ], [
        lp.GlobalArg("res", dtype, shape="n, n, n", order=order),
        lp.GlobalArg("v", dtype, shape="n, r", order=order),
        lp.GlobalArg("u", dtype, shape="n, r", order=order),
        lp.GlobalArg("w", dtype, shape="n, r", order=order),
        lp.ValueArg("n", np.int32),
        lp.ValueArg("r", np.int32),
    ],
                         assumptions="n>=1")
    knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "alpha", 2)
    knl = lp.split_iname(knl, "k", 8, outer_tag="g.2", inner_tag="l.2")

    return knl
Example #38
def test_fancy_matrix_mul(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()

    order = "C"

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
        "[n] -> {[i,j,k]: 0<=i,j,k<n }", ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
        [
            lp.GlobalArg("a", dtype, shape="(n, n)", order=order),
            lp.GlobalArg("b", dtype, shape="(n, n)", order=order),
            lp.GlobalArg("c", dtype, shape="(n, n)", order=order),
            lp.ValueArg("n", np.int32, approximately=1000),
        ],
        name="fancy_matmul",
        assumptions="n>=1")

    seq_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 16, slabs=(0, 1))
    knl = lp.add_prefetch(knl,
                          "a", ["i_inner", "k_inner"],
                          fetch_outer_inames="i_outer, j_outer, k_outer",
                          default_tag="l.auto")
    knl = lp.add_prefetch(knl,
                          "b", ["k_inner", "j_inner"],
                          fetch_outer_inames="i_outer, j_outer, k_outer",
                          default_tag="l.auto")

    lp.auto_test_vs_ref(seq_knl,
                        ctx,
                        knl,
                        op_count=[2 * n**3 / 1e9],
                        op_label=["GFlops"],
                        parameters=dict(n=n))
Example #39
def LU_decomposition(ctx):
    order = 'C'
    dtype = np.float64
    knl = lp.make_kernel(ctx.devices[0], [
        "{[k,i]: 0<=k<n-1 and k+1<=i<n}",
        "{[j,l]: 0<=k<n-1 and k+1<=j,l<n}",
    ], [
        "syst[i,k] = syst[i,k]/syst[k,k] {id=lab1}",
        "syst[l,j]= syst[l,j] - syst[l,k]*syst[k,j] {dep=lab1}",
    ], [
        lp.GlobalArg("syst", dtype, shape="n, n", order=order),
        lp.ValueArg("n", np.int32),
    ],
                         assumptions="n>=1")
    knl = lp.split_iname(knl, "k", n)  # note: "n" is assumed to come from the enclosing module scope
    knl = lp.split_iname(knl, "i", 32)
    knl = lp.split_iname(knl, "j", 32)
    knl = lp.split_iname(knl, "l", 32)

    #  print knl
    #  print lp.CompiledKernel(ctx, knl).get_highlighted_code()
    return knl
Example #40
def test_callee_with_auto_offset(ctx_factory):

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    arange = lp.make_function("{[i]: 0<=i<7}",
                              """
        y[i] = 2*y[i]
        """, [lp.GlobalArg("y", offset=lp.auto)],
                              name="dosify")

    knl = lp.make_kernel(
        "{[i]: 0<=i<7}", """
        [i]: y[i] = dosify([i]: y[i])
        """, [lp.GlobalArg("y", offset=3, shape=10)])

    knl = lp.merge([knl, arange])

    y = np.arange(10)
    knl(queue, y=y)
    np.testing.assert_allclose(y[:3], np.arange(3))
    np.testing.assert_allclose(y[3:], 2 * np.arange(3, 10))
Example #41
    def get_kernel(self):
        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
        kernel_exprs = self.get_kernel_exprs(result_names)
        arguments = (self.get_default_src_tgt_arguments() + [
            lp.GlobalArg("result_%d" % i, dtype, shape="ntargets,nsources")
            for i, dtype in enumerate(self.value_dtypes)
        ])

        loopy_knl = lp.make_kernel(
            [
                """
            {[itgt, isrc, idim]: \
                0 <= itgt < ntargets and \
                0 <= isrc < nsources and \
                0 <= idim < dim}
            """
            ],
            self.get_kernel_scaling_assignments() + ["for itgt, isrc"] +
            ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"] + [
                "<> is_self = (isrc == target_to_source[itgt])"
                if self.exclude_self else ""
            ] + loopy_insns + kernel_exprs + [
                """
                result_{i}[itgt, isrc] = \
                    knl_{i}_scaling * pair_result_{i} {{inames=isrc:itgt}}
                """.format(i=iknl) for iknl in range(len(self.kernels))
            ] + ["end"],
            arguments,
            assumptions="nsources>=1 and ntargets>=1",
            name=self.name,
            fixed_parameters=dict(dim=self.dim),
            lang_version=MOST_RECENT_LANGUAGE_VERSION)

        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")

        for knl in self.kernels:
            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

        return loopy_knl
Example #42
def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    arr = np.ones(n, dtype=np.float32)
    segment_boundaries = np.zeros(n, dtype=np.int32)
    segment_boundaries[(segment_boundaries_indices, )] = 1

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<=i}",
        "out[i], <>_ = reduce(segmented(sum), j, arr[j], segflag[j])", [
            lp.GlobalArg("arr", np.float32, shape=("n", )),
            lp.GlobalArg("segflag", np.int32, shape=("n", )), "..."
        ])

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.tag_inames(knl, dict(i=iname_tag))
    knl = lp.realize_reduction(knl, force_scan=True)

    (evt, (out, )) = knl(queue, arr=arr, segflag=segment_boundaries)

    check_segmented_scan_output(arr, segment_boundaries_indices, out)
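test_segmented_scan checks its result with check_segmented_scan_output, which is not shown. A hypothetical reference implementation (an assumption about the intended semantics: the running sum restarts at every flagged index):

def check_segmented_scan_output(arr, segment_start_indices, out):
    # hypothetical checker: recompute the segmented inclusive scan in numpy,
    # restarting the running sum at each segment boundary, and compare
    expected = np.empty_like(arr)
    running = 0
    for i, val in enumerate(arr):
        if i in segment_start_indices:
            running = 0
        running += val
        expected[i] = running
    assert np.allclose(out, expected)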
Example #43
def test_kc_with_floor_div_in_expr(ctx_factory, inline):
    # See https://github.com/inducer/loopy/issues/366
    import loopy as lp

    ctx = ctx_factory()
    callee = lp.make_function("{[i]: 0<=i<10}",
                              """
            x[i] = 2*x[i]
            """,
                              name="callee_with_update")

    knl = lp.make_kernel(
        "{[i]: 0<=i<10}", """
            [i]: x[2*(i//2) + (i%2)] = callee_with_update([i]: x[i])
            """)

    knl = lp.merge([knl, callee])

    if inline:
        knl = lp.inline_callable_kernel(knl, "callee_with_update")

    lp.auto_test_vs_ref(knl, ctx, knl)
Example #44
def test_fd_demo():
    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \
                + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
                + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]")
    #assumptions="n mod 16=0")
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.add_prefetch(knl,
                          "u", ["i_inner", "j_inner"],
                          fetch_bounding_box=True)

    #n = 1000
    #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32)

    knl = lp.set_options(knl, write_cl=True)
    knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32))
    code, inf = lp.generate_code(knl)
    print(code)

    assert "double" not in code
Example #45
def test_ispc_target(occa_mode=False):
    from loopy.target.ispc import ISPCTarget

    knl = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            "out[i] = 2*a[i]",
            [
                lp.GlobalArg("out,a", np.float32, shape=lp.auto),
                "..."
                ],
            target=ISPCTarget(occa_mode=occa_mode))

    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
    knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])

    codegen_result = lp.generate_code_v2(
                lp.get_one_scheduled_kernel(
                    lp.preprocess_kernel(knl)))

    print(codegen_result.device_code())
    print(codegen_result.host_code())
Example #46
def test_nested_dependent_reduction(ctx_factory):
    dtype = np.dtype(np.int32)
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(["{[i]: 0<=i<n}", "{[j]: 0<=j<i+sumlen}"], [
        "<> sumlen = ell[i]",
        "a[i] = sum(j, j)",
    ], [
        lp.ValueArg("n", np.int32),
        lp.GlobalArg("a", dtype, ("n", )),
        lp.GlobalArg("ell", np.int32, ("n", )),
    ])

    cknl = lp.CompiledKernel(ctx, knl)

    n = 330
    ell = np.arange(n, dtype=np.int32)
    evt, (a, ) = cknl(queue, ell=ell, n=n, out_host=True)

    tgt_result = (2 * ell - 1) * 2 * ell / 2  # a[i] = sum of j over 0 <= j < i + ell[i] = 2*i
    assert (a == tgt_result).all()
Example #47
def test_ispc_streaming_stores():
    stream_dtype = np.float32
    index_dtype = np.int32

    knl = lp.make_kernel("{[i]: 0<=i<n}",
                         "a[i] = b[i] + scalar * c[i]",
                         target=lp.ISPCTarget(),
                         index_dtype=index_dtype,
                         name="stream_triad")

    vars = ["a", "b", "c", "scalar"]
    knl = lp.assume(knl, "n>0")
    knl = lp.split_iname(knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")
    knl = lp.tag_instructions(knl, "!streaming_store")

    knl = lp.add_and_infer_dtypes(knl, {var: stream_dtype for var in vars})

    knl = lp.set_argument_order(knl, vars + ["n"])

    lp.generate_code_v2(knl).all_code()
    assert "streaming_store(" in lp.generate_code_v2(knl).all_code()
Example #48
def test_scan_with_outer_parallel_iname(ctx_factory, sweep_iname_tag):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        [
            "{[k]: 0<=k<=1}",
            "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}"
        ],
        "out[k,i] = k + sum(j, j**2)"
        )

    knl = lp.tag_inames(knl, dict(k="l.0", i=sweep_iname_tag))
    n = 10
    knl = lp.fix_parameters(knl, n=n)
    knl = lp.realize_reduction(knl, force_scan=True)

    evt, (out,) = knl(queue)

    inner = np.cumsum(np.arange(n)**2)

    assert (out.get() == np.array([inner, 1 + inner])).all()
Example #49
        def knl():
            import loopy as lp
            knl = lp.make_kernel(
                """{[k,i,j]:
                    0<=k<nelements and
                    0<=i<n_to_nodes and
                    0<=j<n_from_nodes}""",
                "result[itgt_base + to_element_indices[k]*n_to_nodes + i, \
                        isrc_base + from_element_indices[k]*n_from_nodes + j] \
                    = resample_mat[i, j]", [
                    lp.GlobalArg("result",
                                 None,
                                 shape="nnodes_tgt, nnodes_src",
                                 offset=lp.auto),
                    lp.ValueArg("itgt_base,isrc_base", np.int32),
                    lp.ValueArg("nnodes_tgt,nnodes_src", np.int32),
                    "...",
                ],
                name="oversample_mat")

            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
            return lp.tag_inames(knl, dict(k="g.0"))
Example #50
def test_c_instruction(ctx_factory):
    #logging.basicConfig(level=logging.DEBUG)
    ctx = ctx_factory()

    knl = lp.make_kernel("{[i,j]: 0<=i,j<n }", [
        lp.CInstruction("i,j",
                        """
                    x = sin((float) i*j);
                    """,
                        assignees="x"),
        "a[i,j] = x",
    ], [
        lp.GlobalArg("a", shape=lp.auto, dtype=np.float32),
        lp.TemporaryVariable("x", np.float32),
        "...",
    ],
                         assumptions="n>=1")

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    print(knl)
    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
Example #51
def Prav_V(ctx):
    order = 'C'
    dtype = np.float64
    knl = lp.make_kernel(ctx.devices[0], [
        "{[i,j,k,alpha]: 0<=alpha<r and 0<=i,j,k<n}",
    ], [
        "f[alpha,j]=sum((k,i), a[i,j,k]*w[k,alpha]*u[i,alpha])",
    ], [
        lp.GlobalArg("a", dtype, shape="n, n, n", order=order),
        lp.GlobalArg("u", dtype, shape="n, r", order=order),
        lp.GlobalArg("w", dtype, shape="n, r", order=order),
        lp.GlobalArg("f", dtype, shape="r, n", order=order),
        lp.ValueArg("n", np.int64),
        lp.ValueArg("r", np.int64),
    ],
                         assumptions="n>=1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "i", 16)
    knl = lp.split_iname(knl, "k", 16)

    return knl
Example #52
def test_double_sum_made_unique(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 20

    knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<n }",
            [
                "a = sum((i,j), i*j)",
                "b = sum(i, sum(j, i*j))",
                ],
            assumptions="n>=1")

    knl = lp.make_reduction_inames_unique(knl)
    print(knl)

    evt, (a, b) = knl(queue, n=n)

    ref = sum(i*j for i in range(n) for j in range(n))
    assert a.get() == ref
    assert b.get() == ref
Example #53
def test_nested_scan(ctx_factory, i_tag, j_tag):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel([
        "[n] -> {[i]: 0 <= i < n}", "[i] -> {[j]: 0 <= j <= i}",
        "[i] -> {[k]: 0 <= k <= i}"
    ], """
        <>tmp[i] = sum(k, 1)
        out[i] = sum(j, tmp[j])
        """)

    knl = lp.fix_parameters(knl, n=10)
    knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag))

    knl = lp.realize_reduction(knl, force_scan=True)

    print(knl)

    evt, (out, ) = knl(queue)

    print(out)
Example #54
def Prav_U(ctx):
    order = 'C'
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0], [
        "{[i,j,k,alpha]: 0<=alpha<r and 0<=i,j,k<n}",
    ], [
        "f[alpha,i]=sum((j,k), a[i,j,k]*v[alpha,j]*w[alpha,k])",
    ], [
        lp.GlobalArg("a", dtype, shape="n, n, n", order=order),
        lp.GlobalArg("v", dtype, shape="r, n", order=order),
        lp.GlobalArg("w", dtype, shape="r, n", order=order),
        lp.GlobalArg("f", dtype, shape="r, n", order=order),
        lp.ValueArg("n", np.int64),
        lp.ValueArg("r", np.int64),
    ],
                         assumptions="n>=1")
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 1, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16)
    knl = lp.split_iname(knl, "k", 16)
    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
    return knl
Example #55
def test_scan_with_different_lower_bound_from_sweep(ctx_factory, sweep_lbound,
                                                    scan_lbound):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n, sweep_lbound, scan_lbound] -> "
        "{[i,j]: sweep_lbound<=i<n+sweep_lbound "
        "and scan_lbound<=j<=2*(i-sweep_lbound)+scan_lbound}", """
        out[i-sweep_lbound] = sum(j, j**2)
        """)

    n = 10

    knl = lp.fix_parameters(knl,
                            sweep_lbound=sweep_lbound,
                            scan_lbound=scan_lbound)
    knl = lp.realize_reduction(knl, force_scan=True)
    evt, (out, ) = knl(queue, n=n)

    assert (out.get() == np.cumsum(
        np.arange(scan_lbound, 2 * n + scan_lbound)**2)[::2]).all()
Example #56
def test_argmax(ctx_factory):
    logging.basicConfig(level=logging.INFO)

    dtype = np.dtype(np.float32)
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 10000

    knl = lp.make_kernel(
        "{[i]: 0<=i<%d}" % n, """
            max_val, max_idx = argmax(i, abs(a[i]), i)
            """)

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    print(lp.preprocess_kernel(knl))
    knl = lp.set_options(knl, write_cl=True, highlight_cl=True)

    a = np.random.randn(10000).astype(dtype)
    evt, (max_idx, max_val) = knl(queue, a=a, out_host=True)
    assert max_val == np.max(np.abs(a))
    assert max_idx == np.where(np.abs(a) == max_val)[-1]
Example #57
def test_non1_step_slices(ctx_factory, start, inline):
    # See https://github.com/inducer/loopy/pull/222#discussion_r645905188

    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    callee = lp.make_function("{[i]: 0<=i<n}",
                              """
            y[i] = i**2
            """, [lp.ValueArg("n"), ...],
                              name="squared_arange")

    t_unit = lp.make_kernel("{[i_init, j_init]: 0<=i_init, j_init<40}",
                            f"""
            X[i_init] = 42
            X[{start}:40:3] = squared_arange({len(range(start, 40, 3))})

            Y[j_init] = 1729
            Y[39:{start}:-3] = squared_arange({len(range(39, start, -3))})
            """, [lp.GlobalArg("X,Y", shape=40)],
                            seq_dependencies=True)

    expected_out1 = 42 * np.ones(40, dtype=np.int64)
    expected_out1[start:40:3] = np.arange(len(range(start, 40, 3)))**2

    expected_out2 = 1729 * np.ones(40, dtype=np.int64)
    expected_out2[39:start:-3] = np.arange(len(range(39, start, -3)))**2

    t_unit = lp.merge([t_unit, callee])

    t_unit = lp.set_options(t_unit, "return_dict")

    if inline:
        t_unit = lp.inline_callable_kernel(t_unit, "squared_arange")

    evt, out_dict = t_unit(cq)

    np.testing.assert_allclose(out_dict["X"].get(), expected_out1)
    np.testing.assert_allclose(out_dict["Y"].get(), expected_out2)
Example #58
def test_rename_argument_with_auto_stride(ctx_factory):
    from loopy.kernel.array import FixedStrideArrayDimTag

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel("{[i]: 0<=i<10}", """
            y[i] = x[i]
            """, [
        lp.GlobalArg("x",
                     dtype=float,
                     shape=lp.auto,
                     dim_tags=[FixedStrideArrayDimTag(lp.auto)]), ...
    ])

    knl = lp.rename_argument(knl, "x", "x_new")

    code_str = lp.generate_code_v2(knl).device_code()
    assert code_str.find("double const *__restrict__ x_new,") != -1
    assert code_str.find("double const *__restrict__ x,") == -1

    evt, (out, ) = knl(queue, x_new=np.random.rand(10))
Example #59
def make_G_S_knl(instructions):
    knl = lp.make_kernel(
        "[Nx, Ny, Nz] -> { [i,j,k]: 0<=i<Nx and 0<=j<Ny and 0<=k<Nz }",
        instructions,
        [
            lp.GlobalArg(
                "subarr", shape=pencil_shape_str, offset=lp.auto),
            lp.GlobalArg("arr", shape="(Nx, Ny, Nz)", offset=lp.auto),
            ...,
        ],
        default_offset=lp.auto,
        lang_version=(2018, 2),
    )
    knl = lp.fix_parameters(knl, **params_to_fix)
    knl = lp.split_iname(knl,
                         "k",
                         32,
                         outer_tag="g.0",
                         inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 2, outer_tag="g.1", inner_tag="unr")
    knl = lp.split_iname(knl, "i", 1, outer_tag="g.2", inner_tag="unr")
    return knl
Example #60
def test_ilp_loop_bound(ctx_factory):
    # The salient bit of this test is that a joint bound on (outer, inner)
    # from a split occurs in a setting where the inner loop has been ilp'ed.
    # In 'normal' parallel loops, the inner index is available for conditionals
    # throughout. In ILP'd loops, not so much.

    ctx = ctx_factory()
    knl = lp.make_kernel("{ [i,j,k]: 0<=i,j,k<n }",
                         """
            out[i,k] = sum(j, a[i,j]*b[j,k])
            """, [
                             lp.GlobalArg("a,b", np.float32, shape=lp.auto),
                             "...",
                         ],
                         assumptions="n>=1")

    ref_knl = knl

    knl = lp.set_loop_priority(knl, "j,i,k")
    knl = lp.split_iname(knl, "k", 4, inner_tag="ilp")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=200))