def test_variable_size_matrix_mul(ctx_factory):
    ctx = ctx_factory()

    if (not ctx.devices[0].image_support
            or ctx.devices[0].platform.name == "Portable Computing Language"):
        pytest.skip("crashes on pocl")

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<n}",
            "c[i, j] = sum(k, a[i, k]*b[k, j])")

    knl = lp.add_dtypes(knl, {
        "a": np.float32,
        "b": np.float32,
        })

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1", slabs=(0, 1))
    knl = lp.split_iname(knl, "j", 16,
            outer_tag="g.1", inner_tag="l.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={"n": n})
def test_troublesome_premagma_fermi_matrix_mul(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 6*16*2

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                ],
            name="matmul")

    seq_knl = knl

    i_reg = 2
    j_reg = 2
    i_chunks = 16
    j_chunks = 16
    knl = lp.split_iname(knl, "i", i_reg*i_chunks, outer_tag="g.0")
    knl = lp.split_iname(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp")
    knl = lp.split_iname(knl, "j", j_reg*j_chunks, outer_tag="g.1")
    knl = lp.split_iname(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp")
    knl = lp.split_iname(knl, "k", 16)
    knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"])

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={})
def test_fancy_matrix_mul(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()

    order = "C"

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "[n] -> {[i,j,k]: 0<=i,j,k<n }",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.GlobalArg("a", dtype, shape="(n, n)", order=order),
                lp.GlobalArg("b", dtype, shape="(n, n)", order=order),
                lp.GlobalArg("c", dtype, shape="(n, n)", order=order),
                lp.ValueArg("n", np.int32, approximately=1000),
                ],
            name="fancy_matmul", assumptions="n>=1")

    seq_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 16, slabs=(0, 1))
    knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
    knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"])

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters=dict(n=n))
def get_tensor(ctx):
    order = "C"
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0],
            [
                "{[j,i,alpha,k]: 0<=alpha<r and 0<=i,j,k<n}",
                ],
            [
                "res[i,j,k]=sum((alpha), u[alpha,i]*v[alpha,j]*w[alpha,k])",
                ],
            [
                lp.GlobalArg("res", dtype, shape="n, n, n", order=order),
                lp.GlobalArg("v", dtype, shape="r, n", order=order),
                lp.GlobalArg("u", dtype, shape="r, n", order=order),
                lp.GlobalArg("w", dtype, shape="r, n", order=order),
                lp.ValueArg("n", np.int32),
                lp.ValueArg("r", np.int32),
                ],
            assumptions="n>=1")

    knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "alpha", 2)
    knl = lp.split_iname(knl, "k", 8, outer_tag="g.2", inner_tag="l.2")

    return knl
def variant_2(knl):
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
            fetch_bounding_box=True)
    knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"])
    return knl
def test_plain_matrix_mul(ctx_factory):
    ctx = ctx_factory()
    order = "C"

    n = get_suitable_size(ctx)

    for dtype, check, vec_size in [
            (cl_array.vec.float4, check_float4, 4),
            (np.float32, None, 1),
            ]:
        knl = lp.make_kernel(
                "{[i,j,k]: 0<=i,j,k<%d}" % n,
                [
                    "c[i, j] = sum(k, a[i, k]*b[k, j])"
                    ],
                [
                    lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                    lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                    lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                    ],
                name="matmul")

        ref_knl = knl

        knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
        knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
        knl = lp.split_iname(knl, "k", 16)
        knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
        knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

        lp.auto_test_vs_ref(ref_knl, ctx, knl,
                op_count=[vec_size*2*n**3/1e9], op_label=["GFlops"],
                parameters={"n": n}, check_result=check)
def test_transpose(ctx_factory):
    dtype = np.dtype(np.float32)
    ctx = ctx_factory()
    order = "C"

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<%d}" % n,
            [
                "b[i, j] = a[j, i]"
                ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                ],
            name="transpose")

    seq_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"])

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[dtype.itemsize*n**2*2/1e9], op_label=["GByte"],
            parameters={})
def test_variable_size_matrix_mul(ctx_factory):
    ctx = ctx_factory()

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<n}",
            "c[i, j] = sum(k, a[i, k]*b[k, j])")

    knl = lp.add_dtypes(knl, {
        "a": np.float32,
        "b": np.float32,
        })

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1", slabs=(0, 1))
    knl = lp.split_iname(knl, "j", 16,
            outer_tag="g.1", inner_tag="l.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={"n": n})
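# A minimal sketch (not part of the test; assumes the same loopy version as
# the surrounding code) of how one might inspect the device code that the
# splits above produce, without running auto_test_vs_ref:
def _show_variable_size_matmul_code():
    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<n}",
            "c[i, j] = sum(k, a[i, k]*b[k, j])")
    knl = lp.add_dtypes(knl, {"a": np.float32, "b": np.float32})
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")

    # generate_code_v2 compiles the transformed kernel to source without
    # needing a device or queue
    print(lp.generate_code_v2(knl).device_code())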
def variant_2(knl):
    knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.tag_inames(knl, dict(ifeat="g.2"))
    knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]")
    knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y")
    return knl
def get_3d_knl(context, dtype):
    knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j,k<n}",
        """
            <> xx = i/(n-1)
            <> yy = j/(n-1)
            <> zz = k/(n-1)

            <float64> phi = 0.3
            <> s1 = sin(phi)
            <> c1 = cos(phi)

            <> xxx = c1*xx + s1*yy
            <> yyy = -s1*xx + c1*yy
            <> zzz = zz

            <float64> theta = 0.7
            <> s2 = sin(theta)
            <> c2 = cos(theta)

            x[i,j,k] = 4 * (c2*xxx + s2*zzz) - 2
            y[i,j,k] = 4 * yyy - 2
            z[i,j,k] = 4 * (-s2*xxx + c2*zzz) - 2
            """,
        [
            lp.GlobalArg("x,y,z", dtype, shape=lp.auto),
            lp.ValueArg("n", np.int32),
            ],
        assumptions="n>0")

    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "k", 16, outer_tag="g.0", inner_tag="l.0")

    return lp.CompiledKernel(context, knl)
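# Usage sketch for get_3d_knl above (hypothetical helper name; assumes a
# context and queue as in the surrounding tests):
def _eval_3d_grid(ctx, queue, n=8):
    import numpy as np

    cknl = get_3d_knl(ctx, np.float64)
    # x, y, z are the kernel's outputs; out_host=True returns numpy arrays
    evt, (x, y, z) = cknl(queue, n=n, out_host=True)
    return x, y, z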
def no_test_global_parallel_reduction(ctx_factory, size):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            <> key = make_uint2(i, 324830944)  {inames=i}
            <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
            """)

    # ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer")
    print(knl)
    1/0  # test intentionally disabled past this point (note the no_test_ prefix)
    knl = lp.realize_reduction(knl)

    evt, (z,) = knl(queue, n=size)
def test_equality_constraints(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()

    order = "C"

    n = 10

    knl = lp.make_kernel([
            "[n] -> {[i,j]: 0<=i,j<n }",
            "{[k]: k =i+5 and k < n}",
            ],
            [
                "a[i,j] = 5 {id=set_all}",
                "b[i,k] = 22 {dep=set_all}",
                ],
            [
                lp.GlobalArg("a,b", dtype, shape="n, n", order=order),
                lp.ValueArg("n", np.int32, approximately=1000),
                ],
            name="equality_constraints", assumptions="n>=1")

    seq_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
    #print(knl)
    #print(knl.domains[0].detect_equalities())

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            parameters=dict(n=n), print_ref_code=True)
def test_fd_demo():
    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \
                + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
                + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]")
    #assumptions="n mod 16=0")
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.add_prefetch(knl, "u",
            ["i_inner", "j_inner"],
            fetch_bounding_box=True,
            default_tag="l.auto")

    #n = 1000
    #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32)

    knl = lp.set_options(knl, write_cl=True)
    knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32))
    code, inf = lp.generate_code(knl)
    print(code)

    assert "double" not in code
def test_ilp_race_matmul(ctx_factory):
    dtype = np.float32
    order = "C"

    n = 9

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.ImageArg("a", dtype, shape=(n, n)),
                lp.ImageArg("b", dtype, shape=(n, n)),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                ],
            name="matmul")

    knl = lp.split_iname(knl, "j", 2, outer_tag="ilp", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 2)
    knl = lp.add_prefetch(knl, 'b', ["k_inner"])

    with lp.CacheMode(False):
        from loopy.diagnostic import WriteRaceConditionWarning
        from warnings import catch_warnings
        with catch_warnings(record=True) as warn_list:
            knl = lp.preprocess_kernel(knl)
            list(lp.generate_loop_schedules(knl))

            assert any(isinstance(w.message, WriteRaceConditionWarning)
                    for w in warn_list)
def right_u(ctx):
    order = "C"
    dtype = np.float32
    n = 128
    r = 3
    knl = lp.make_kernel(ctx.devices[0],
            "{[i,j,k,s]: 0<=i<%d and 0<=j<%d and 0<=k<%d and 0<=s<%d}"
            % (n, n, n, r),
            [
                "f[i,s] = sum((j,k), a[i,j,k]*v[j,s]*w[k,s])",
                ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n, n), order=order),
                lp.GlobalArg("v", dtype, shape=(n, r), order=order),
                lp.GlobalArg("w", dtype, shape=(n, r), order=order),
                lp.GlobalArg("f", dtype, shape=(n, r), order=order),
                ],
            name="pravaya")

    knl = lp.split_iname(knl, "i", 2, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "s", r, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 16)
    knl = lp.split_iname(knl, "k", 16)  # ,16,outer_tag="g.2", inner_tag="l.2")
    # knl = lp.add_prefetch(knl, "a", ["k_inner", "j_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "v", ["s_inner", "j_inner"])
    knl = lp.add_prefetch(knl, "w", ["s_inner", "k_inner"])
    #knl = lp.add_prefetch(knl, "a", ["k_inner", "j_inner"])
    #knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    seq_knl = knl
    return knl
def test_independent_multi_domain(ctx_factory):
    dtype = np.dtype(np.float32)
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            [
                "{[i]: 0<=i<n}",
                "{[j]: 0<=j<n}",
                ],
            [
                "a[i] = 1",
                "b[j] = 2",
                ],
            [
                lp.GlobalArg("a", dtype, shape=("n"), order="C"),
                lp.GlobalArg("b", dtype, shape=("n"), order="C"),
                lp.ValueArg("n", np.int32),
                ])

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    assert knl.parents_per_domain() == 2*[None]

    n = 50
    cknl = lp.CompiledKernel(ctx, knl)
    evt, (a, b) = cknl(queue, n=n, out_host=True)

    assert a.shape == (50,)
    assert b.shape == (50,)
    assert (a == 1).all()
    assert (b == 2).all()
def test_ispc_streaming_stores():
    stream_dtype = np.float32
    index_dtype = np.int32

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "a[i] = b[i] + scalar * c[i]",
            target=lp.ISPCTarget(), index_dtype=index_dtype,
            name="stream_triad")

    vars = ["a", "b", "c", "scalar"]
    knl = lp.assume(knl, "n>0")
    knl = lp.split_iname(
        knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")
    knl = lp.tag_instructions(knl, "!streaming_store")

    knl = lp.add_and_infer_dtypes(knl, {
        var: stream_dtype
        for var in vars
        })

    knl = lp.set_argument_order(knl, vars + ["n"])

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    lp.generate_code_v2(knl).all_code()
def left_W(ctx):
    order = "C"
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0],
            [
                "{[j,i,alpha,alpha1]: 0<=alpha,alpha1<r and 0<=j,i<n}",
                ],
            [
                "l[alpha,alpha1]=sum((i), u[alpha,i]*u[alpha1,i])"
                "*sum((j),v[alpha,j]*v[alpha1,j])",
                ],
            [
                lp.GlobalArg("v", dtype, shape="r, n", order=order),
                lp.GlobalArg("u", dtype, shape="r, n", order=order),
                lp.GlobalArg("l", dtype, shape="r, r", order=order),
                lp.ValueArg("n", np.int64),
                lp.ValueArg("r", np.int64),
                ],
            assumptions="n>=1")

    knl = lp.split_iname(knl, "alpha1", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16)
    knl = lp.split_iname(knl, "i", 16)
    return knl
def test_atomic(ctx_factory, dtype):
    ctx = ctx_factory()

    if (
            np.dtype(dtype).itemsize == 8
            and "cl_khr_int64_base_atomics" not in ctx.devices[0].extensions):
        pytest.skip("64-bit atomics not supported on device")

    import pyopencl.version  # noqa
    if (
            cl.version.VERSION < (2015, 2)
            and dtype == np.int64):
        pytest.skip("int64 RNG not supported in PyOpenCL < 2015.2")

    knl = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            "out[i%20] = out[i%20] + 2*a[i] {atomic}",
            [
                lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
                lp.GlobalArg("a", dtype, shape=lp.auto),
                "..."
                ],
            assumptions="n>0")

    ref_knl = knl
    knl = lp.split_iname(knl, "i", 512)
    knl = lp.split_iname(knl, "i_inner", 128, outer_tag="unr", inner_tag="g.0")
    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000))
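# Reference semantics of the atomic update above, sketched in numpy
# (hypothetical helper, not part of the test): every out[k] accumulates
# 2*a[i] over all i with i % 20 == k.
def _atomic_ref(a):
    import numpy as np

    out = np.zeros(20, dtype=a.dtype)
    # np.add.at performs unbuffered accumulation, so repeated indices all
    # contribute--matching what the {atomic} update guarantees on the device
    np.add.at(out, np.arange(len(a)) % 20, 2*a)
    return out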
def left_V(ctx):
    order = "C"
    dtype = np.float64
    knl = lp.make_kernel(ctx.devices[0],
            [
                "{[i,k,alpha,alpha1]: 0<=alpha,alpha1<r and 0<=i,k<n}",
                ],
            [
                "l[alpha,alpha1]=sum((i), u[i,alpha]*u[i,alpha1])"
                "*sum((k),w[k,alpha]*w[k,alpha1])",
                ],
            [
                lp.GlobalArg("u", dtype, shape="n, r", order=order),
                lp.GlobalArg("w", dtype, shape="n, r", order=order),
                lp.GlobalArg("l", dtype, shape="r, r", order=order),
                lp.ValueArg("n", np.int64),
                lp.ValueArg("r", np.int64),
                ],
            assumptions="n>=1")

    knl = lp.split_iname(knl, "alpha1", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "i", 16)
    knl = lp.split_iname(knl, "k", 16)
    return knl
def LU_solver(ctx):
    order = "C"
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0],
            [
                "{[l,k,i,j,m]: 0<=l<r and 0<=k<n-1 and k+1<=i<n"
                " and 0<=j<n-1 and 0<=m<n-1-j}",
                ],
            [
                "bcopy[i,l] = bcopy[i,l]-bcopy[k,l]*LU[i,k] {id=lab1}",
                "bcopy[n-1-j,l]=bcopy[n-j-1,l]/LU[n-j-1,n-1-j] {id=l2, dep=lab1}",
                "bcopy[m,l]= bcopy[m,l]-bcopy[n-j-1,l]*LU[m,n-1-j] {id=l3, dep=l2}",
                "bcopy[0,l]=bcopy[0,l]/LU[0,0] {id=l4, dep=l2}",
                ],
            [
                lp.GlobalArg("LU", dtype, shape="n, n", order=order),
                lp.GlobalArg("bcopy", dtype, shape="n, r", order=order),
                lp.ValueArg("n", np.int64),
                lp.ValueArg("r", np.int64),
                ],
            assumptions="n>=1")

    knl = lp.split_iname(knl, "k", 1)
    knl = lp.split_iname(knl, "i", 32)
    knl = lp.split_iname(knl, "j", 32)
    knl = lp.split_iname(knl, "l", 32, outer_tag="g.0", inner_tag="l.0")

    # print knl
    # print lp.CompiledKernel(ctx, knl).get_highlighted_code()
    return knl
def LU_decomposition(ctx):
    order = "C"
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0],
            [
                "{[k,i]: 0<=k<n-1 and k+1<=i<n}",
                "{[j,l]: 0<=k<n-1 and k+1<=j,l<n}",
                ],
            [
                "syst[i,k] = syst[i,k]/syst[k,k] {id=lab1}",
                "syst[l,j]= syst[l,j] - syst[l,k]*syst[k,j] {dep=lab1}",
                ],
            [
                lp.GlobalArg("syst", dtype, shape="n, n", order=order),
                lp.ValueArg("n", np.int32),
                ],
            assumptions="n>=1")

    # fixed chunk size: "n" only exists as a runtime parameter of the kernel,
    # not as a Python-level variable
    knl = lp.split_iname(knl, "k", 32)
    knl = lp.split_iname(knl, "i", 32)
    knl = lp.split_iname(knl, "j", 32)
    knl = lp.split_iname(knl, "l", 32)

    # print knl
    # print lp.CompiledKernel(ctx, knl).get_highlighted_code()
    return knl
def Prav_V(ctx):
    order = "C"
    dtype = np.float32
    knl = lp.make_kernel(ctx.devices[0],
            [
                "{[i,j,k,alpha]: 0<=alpha<r and 0<=i,j,k<n}",
                ],
            [
                "f[alpha,j]=sum((k,i), a[i,j,k]*w[alpha, k]*u[alpha, i])",
                ],
            [
                lp.GlobalArg("a", dtype, shape="n, n, n", order=order),
                lp.GlobalArg("u", dtype, shape="r, n", order=order),
                lp.GlobalArg("w", dtype, shape="r, n", order=order),
                lp.GlobalArg("f", dtype, shape="r, n", order=order),
                lp.ValueArg("n", np.int64),
                lp.ValueArg("r", np.int64),
                ],
            assumptions="n>=1")

    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "alpha", 3, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "i", 16)
    knl = lp.split_iname(knl, "k", 16)
    return knl
def variant_gpu(knl):
    unroll = 4
    block_size = 256
    knl = lp.split_iname(knl, "i", unroll*block_size,
            outer_tag="g.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "i_inner", block_size,
            outer_tag="unr", inner_tag="l.0")
    return knl
def test_red2d(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 16

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
            [
                "ue(a,b) := u[e,a,b]",
                "ur(a,b) := sum_float32(@o, D[a,o]*ue(o,b))",
                "us(a,b) := sum_float32(@o, D[b,o]*ue(a,o))",

                "lap[e,i,j] = "
                "  sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j)+G[1,e,m,j]*us(m,j)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m)+G[2,e,i,m]*us(i,m)))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap2D", assumptions="K>=1")

    unroll = 32

    seq_knl = knl

    knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "o"])
    knl = lp.add_prefetch(knl, "u", ["i", "j", "o"])
    knl = lp.precompute(knl, "ue", np.float32, ["a", "b", "m"])
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"])
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"])
    knl = lp.split_iname(knl, "e", 2, outer_tag="g.0")
    knl = lp.split_iname(knl, "j", n, inner_tag="l.0")  # , slabs=(0, 1))
    knl = lp.split_iname(knl, "i", n, inner_tag="l.1")  # , slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))

    knl = lp.add_prefetch(knl, "G", [2, 3])  # axis/argument indices on G

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*((n**3)*2*2 + n*n*2*3 + (n**3)*2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
def variant_3(knl):
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.add_prefetch(knl, "a", ["i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner"])
    return knl
def variant_overfetch(knl):
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1",
            slabs=(1, 1))
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0",
            slabs=(1, 1))
    knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
            fetch_bounding_box=True, default_tag="l.auto")
    knl = lp.prioritize_loops(knl, ["a_dim_0_outer", "a_dim_1_outer"])
    return knl
def variant_2(knl):
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.set_loop_priority(knl, ["i", "j"])
    knl = lp.add_prefetch(knl, "a")
    knl = lp.add_prefetch(knl, "b")
    return knl
def test_all_counters_parallel_matmul():
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            name="matmul", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")

    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    sync_poly = lp.get_synchronization_poly(knl)
    assert len(sync_poly) == 1
    assert sync_poly["kernel_launch"].eval_with_dict(params) == 1

    op_map = lp.get_op_poly(knl)
    f32mul = op_map[(np.dtype(np.float32), 'mul')].eval_with_dict(params)
    f32add = op_map[(np.dtype(np.float32), 'add')].eval_with_dict(params)
    i32ops = op_map[(np.dtype(np.int32), 'add')].eval_with_dict(params)
    i32ops += op_map[(np.dtype(np.int32), 'mul')].eval_with_dict(params)

    assert f32mul+f32add == n*m*l*2
    assert i32ops == n*m*l*4 + l*n*4

    subscript_map = lp.get_gmem_access_poly(knl)
    f32uncoal = subscript_map[(np.dtype(np.float32), 'nonconsecutive', 'load')
                              ].eval_with_dict(params)
    f32coal = subscript_map[(np.dtype(np.float32), 'consecutive', 'load')
                            ].eval_with_dict(params)

    assert f32uncoal == n*m*l
    assert f32coal == n*m*l

    f32coal = subscript_map[(np.dtype(np.float32), 'consecutive', 'store')
                            ].eval_with_dict(params)

    assert f32coal == n*l
def variant_gpu(knl):
    knl = lp.expand_subst(knl)
    knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 256)
    knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"],
            ["x_fetch_j", "x_fetch_k"], default_tag=None)
    knl = lp.tag_inames(knl, dict(x_fetch_k="unr", x_fetch_j="l.0"))
    knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None)
    knl = lp.prioritize_loops(knl, ["j_outer", "j_inner"])
    return knl
def variant_more_per_work_group(knl):
    knl = lp.tag_inames(knl, dict(n="l.0"))
    knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
    return knl
def test_all_counters_parallel_matmul():
    bsize = 16
    knl = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
            name="matmul", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", bsize)
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    sync_map = lp.get_synchronization_map(knl)
    assert len(sync_map) == 2
    assert sync_map["kernel_launch"].eval_with_dict(params) == 1
    assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize

    op_map = lp.get_op_map(knl, count_redundant_work=True)
    f32mul = op_map[lp.Op(np.float32, 'mul')].eval_with_dict(params)
    f32add = op_map[lp.Op(np.float32, 'add')].eval_with_dict(params)
    i32ops = op_map[lp.Op(np.int32, 'add')].eval_with_dict(params)
    i32ops += op_map[lp.Op(np.dtype(np.int32), 'mul')].eval_with_dict(params)

    assert f32mul + f32add == n*m*ell*2

    op_map = lp.get_mem_access_map(knl, count_redundant_work=True)

    f32s1lb = op_map[lp.MemAccess('global', np.float32,
                     stride=1, direction='load', variable='b')
                     ].eval_with_dict(params)
    f32s1la = op_map[lp.MemAccess('global', np.float32,
                     stride=1, direction='load', variable='a')
                     ].eval_with_dict(params)

    assert f32s1lb == n*m*ell/bsize
    assert f32s1la == n*m*ell/bsize

    f32coal = op_map[lp.MemAccess('global', np.float32,
                     stride=1, direction='store', variable='c')
                     ].eval_with_dict(params)

    assert f32coal == n*ell

    local_mem_map = lp.get_mem_access_map(knl,
            count_redundant_work=True).filter_by(mtype=['local'])
    local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                             direction='load')
                                ].eval_with_dict(params)
    assert local_mem_l == n*m*ell*2
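# A possible extension (a sketch using the same stats API as the test above;
# to_bytes and filter_by appear elsewhere in this suite, and eval_and_sum is
# assumed here to total the evaluated counts): overall global memory traffic.
def _global_bytes(knl, params):
    import loopy as lp

    bytes_map = lp.get_mem_access_map(knl, count_redundant_work=True).to_bytes()
    # keep only global-memory accesses, evaluate each count polynomial at
    # `params`, and add the results up
    return bytes_map.filter_by(mtype=['global']).eval_and_sum(params)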
def test_advect_dealias(ctx_factory):
    1/0  # not ready

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8
    M = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (N, N, N, K_sym)
    interim_field_shape = (M, M, M, K_sym)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,ip,j,jp,k,kp,m,o,e]: 0<=i,j,k,m<%d"
            " and 0<=o,ip,jp,kp<%d and 0<=e<K}" % (M, N),
            [
                # interpolate u to integration nodes
                "CSE: u0[i,jp,kp,e] = sum_float32(@o, I[i,o]*u[o,jp,kp,e])",
                "CSE: u1[i,j,kp,e] = sum_float32(@o, I[j,o]*u0[i,o,kp,e])",
                "CSE: Iu[i,j,k,e] = sum_float32(@o, I[k,o]*u1[i,j,o,e])",

                # differentiate u on integration nodes
                "CSE: Iur[i,j,k,e] = sum_float32(@m, D[i,m]*Iu[m,j,k,e])",
                "CSE: Ius[i,j,k,e] = sum_float32(@m, D[j,m]*Iu[i,m,k,e])",
                "CSE: Iut[i,j,k,e] = sum_float32(@m, D[k,m]*Iu[i,j,m,e])",

                # interpolate v to integration nodes
                "CSE: v0[i,jp,kp,e] = sum_float32(@o, I[i,o]*v[o,jp,kp,e])",
                "CSE: v1[i,j,kp,e] = sum_float32(@o, I[j,o]*v0[i,o,kp,e])",
                "CSE: Iv[i,j,k,e] = sum_float32(@o, I[k,o]*v1[i,j,o,e])",

                # differentiate v on integration nodes
                "CSE: Ivr[i,j,k,e] = sum_float32(@m, D[i,m]*Iv[m,j,k,e])",
                "CSE: Ivs[i,j,k,e] = sum_float32(@m, D[j,m]*Iv[i,m,k,e])",
                "CSE: Ivt[i,j,k,e] = sum_float32(@m, D[k,m]*Iv[i,j,m,e])",

                # interpolate w to integration nodes
                "CSE: w0[i,jp,kp,e] = sum_float32(@o, I[i,o]*w[o,jp,kp,e])",
                "CSE: w1[i,j,kp,e] = sum_float32(@o, I[j,o]*w0[i,o,kp,e])",
                "CSE: Iw[i,j,k,e] = sum_float32(@o, I[k,o]*w1[i,j,o,e])",

                # differentiate w on integration nodes
                "CSE: Iwr[i,j,k,e] = sum_float32(@m, D[i,m]*Iw[m,j,k,e])",
                "CSE: Iws[i,j,k,e] = sum_float32(@m, D[j,m]*Iw[i,m,k,e])",
                "CSE: Iwt[i,j,k,e] = sum_float32(@m, D[k,m]*Iw[i,j,m,e])",

                # find velocity in (r,s,t) coordinates
                # QUESTION: should I use CSE here?
                "CSE: Vr[i,j,k,e] = G[i,j,k,0,e]*Iu[i,j,k,e]"
                " + G[i,j,k,1,e]*Iv[i,j,k,e] + G[i,j,k,2,e]*Iw[i,j,k,e]",
                "CSE: Vs[i,j,k,e] = G[i,j,k,3,e]*Iu[i,j,k,e]"
                " + G[i,j,k,4,e]*Iv[i,j,k,e] + G[i,j,k,5,e]*Iw[i,j,k,e]",
                "CSE: Vt[i,j,k,e] = G[i,j,k,6,e]*Iu[i,j,k,e]"
                " + G[i,j,k,7,e]*Iv[i,j,k,e] + G[i,j,k,8,e]*Iw[i,j,k,e]",

                # form nonlinear term on integration nodes
                # QUESTION: should I use CSE here?
                "CSE: Nu[i,j,k,e] = Vr[i,j,k,e]*Iur[i,j,k,e]"
                "+Vs[i,j,k,e]*Ius[i,j,k,e]+Vt[i,j,k,e]*Iut[i,j,k,e]",
                "CSE: Nv[i,j,k,e] = Vr[i,j,k,e]*Ivr[i,j,k,e]"
                "+Vs[i,j,k,e]*Ivs[i,j,k,e]+Vt[i,j,k,e]*Ivt[i,j,k,e]",
                "CSE: Nw[i,j,k,e] = Vr[i,j,k,e]*Iwr[i,j,k,e]"
                "+Vs[i,j,k,e]*Iws[i,j,k,e]+Vt[i,j,k,e]*Iwt[i,j,k,e]",

                # L2 project Nu back to Lagrange basis
                "CSE: Nu2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nu[m,j,k,e])",
                "CSE: Nu1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nu2[ip,m,k,e])",
                "INu[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nu1[ip,jp,m,e])",

                # L2 project Nv back to Lagrange basis
                "CSE: Nv2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nv[m,j,k,e])",
                "CSE: Nv1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nv2[ip,m,k,e])",
                "INv[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nv1[ip,jp,m,e])",

                # L2 project Nw back to Lagrange basis
                "CSE: Nw2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nw[m,j,k,e])",
                "CSE: Nw1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nw2[ip,m,k,e])",
                "INw[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nw1[ip,jp,m,e])",
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("v", dtype, shape=field_shape, order=order),
                lp.ArrayArg("w", dtype, shape=field_shape, order=order),
                lp.ArrayArg("INu", dtype, shape=field_shape, order=order),
                lp.ArrayArg("INv", dtype, shape=field_shape, order=order),
                lp.ArrayArg("INw", dtype, shape=field_shape, order=order),
                lp.ArrayArg("D", dtype, shape=(M, M), order=order),
                lp.ArrayArg("I", dtype, shape=(M, N), order=order),
                lp.ArrayArg("V", dtype, shape=(N, M), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="sem_advect", assumptions="K>=1")

    print(knl)
    1/0

    seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    print(knl)
    #1/0

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000),
            kill_level_min=5)

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=0, op_label="GFlops",
            parameters={"K": K}, print_seq_code=True)
import numpy as np

import loopy as lp
import pyopencl as cl
import pyopencl.array

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

n = 15 * 10**6
a = cl.array.arange(queue, n, dtype=np.float32)

knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]")

knl = lp.set_options(knl, write_code=True)
knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
knl = lp.tag_array_axes(knl, "a,out", "C,vec")

knl(queue, a=a.reshape(-1, 4), n=n)
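# Sanity check (a sketch): capture the output this time. "out" comes back in
# the split (n//4, 4) layout produced by split_array_axis, so flatten it
# before comparing against 2*a.
evt, (out,) = knl(queue, a=a.reshape(-1, 4), n=n)
assert np.allclose(out.get().reshape(-1), 2*a.get())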
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
    ctx = ctx_factory()

    filename = os.path.join(os.path.dirname(__file__), "strongVolumeKernels.f90")
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    source = source.replace("datafloat", "real*4")

    hsv_r, hsv_s = [
            knl for knl in lp.parse_fortran(source, filename,
                seq_dependencies=False)
            if "KernelR" in knl.name or "KernelS" in knl.name
            ]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    hsv = lp.add_nosync(hsv, "any", "writes:rhsQ", "writes:rhsQ", force=True)

    from gnuma_loopy_transforms import (
            fix_euler_parameters,
            set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.prioritize_loops(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]", default_tag="l.auto")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)),
                ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)),
                ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            reader, = lp.find_instructions(hsv,
                    "tag:{knl_tag} and reads:{flux_var}"
                    .format(knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv, flux_var+"_subst",
                    flux_inames + ilp_inames,
                    temporary_name=flux_store_name,
                    precompute_inames=flux_precomp_inames + flux_ilp_inames,
                    default_tag=None)
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            n_iname = "n_"+flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname, within="id:"+reader.id,
                    existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
                lp.find_one_rule_matching(hsv, prep_var_name + "_*subst*"),
                default_tag="l.auto")

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames,
            default_tag="l.auto")

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            Q_dim_field_inner="unr",
            Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
            fetch_bounding_box=True, default_tag="for",
            init_expression="0", store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv,
            {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"},
            ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(hsv, dict(
        rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
        rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
        Q_dim_field_inner="vec", Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(hsv, "rhsQ_buf",
            vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    hsv = tap_hsv

    if 1:
        print("OPS")
        op_map = lp.get_op_map(hsv)
        print(lp.stringify_stats_mapping(op_map))

        print("MEM")
        gmem_map = lp.get_mem_access_map(hsv, subgroup_size=32).to_bytes()
        print(lp.stringify_stats_mapping(gmem_map))

    hsv = lp.set_options(hsv, cl_build_options=[
        "-cl-denorms-are-zero",
        "-cl-fast-relaxed-math",
        "-cl-finite-math-only",
        "-cl-mad-enable",
        "-cl-no-signed-zeros",
        ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv,
            parameters=dict(elements=300), quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
def variant_1(knl):
    knl = lp.split_iname(knl, "i", 256,
            outer_tag="g.0", inner_tag="l.0",
            slabs=(0, 1))
    knl = lp.split_iname(knl, "j", 256, slabs=(0, 1))
    return knl
def test_red2d(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 16

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
            [
                "ue(a,b) := u[e,a,b]",
                "ur(a,b) := sum_float32(@o, D[a,o]*ue(o,b))",
                "us(a,b) := sum_float32(@o, D[b,o]*ue(a,o))",

                "lap[e,i,j] = "
                "  sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j)+G[1,e,m,j]*us(m,j)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m)+G[2,e,i,m]*us(i,m)))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap2D", assumptions="K>=1")

    unroll = 32

    seq_knl = knl

    knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "o"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j", "o"], default_tag="l.auto")
    knl = lp.precompute(knl, "ue", np.float32, ["a", "b", "m"],
            default_tag="l.auto")
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.split_iname(knl, "e", 2, outer_tag="g.0")
    knl = lp.split_iname(knl, "j", n, inner_tag="l.0")  # , slabs=(0, 1))
    knl = lp.split_iname(knl, "i", n, inner_tag="l.1")  # , slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))

    knl = lp.add_prefetch(knl, "G", [2, 3], default_tag="l.auto")
    # axis/argument indices on G

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*((n**3)*2*2 + n*n*2*3 + (n**3)*2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
def variant_1(knl):
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], default_tag="l.auto")
    knl = lp.prioritize_loops(knl, ["a_dim_0_outer", "a_dim_1_outer"])
    return knl
def test_rob_stroud_bernstein_full(ctx_factory):
    #logging.basicConfig(level=logging.DEBUG)
    ctx = ctx_factory()

    # NOTE: result would have to be zero-filled beforehand

    knl = lp.make_kernel(
            "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \
                    0 <= el < nels and \
                    0 <= i2 < nqp1d and \
                    0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 and\
                    \
                    0 <= i1_2 < nqp1d and \
                    0 <= alpha1_2 <= deg and \
                    0 <= i2_2 < nqp1d \
                    }",
            """
            for el
                for i2
                    <> xi = qpts[1, i2]
                    <> s = 1-xi
                    <> r = xi/s
                    <> aind = 0 {id=aind_init}

                    for alpha1
                        <> w = s**(deg-alpha1) {id=init_w}

                        <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \
                                {id=write_tmp,dep=init_w:aind_init}
                        for alpha2
                            w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                                    {id=update_w,dep=init_w:write_tmp}
                            aind = aind + 1 \
                                    {id=aind_incr,dep=aind_init:write_tmp:update_w}
                        end
                    end
                end

                for i1_2
                    <> xi2 = qpts[0, i1_2] {dep=aind_incr}
                    <> s2 = 1-xi2
                    <> r2 = xi2/s2
                    <> w2 = s2**deg {id=w2_init}

                    for alpha1_2
                        for i2_2
                            result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \
                                    w2 * tmp[alpha1_2, i2_2] {id=res2,dep=w2_init}
                        end

                        w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2) \
                                {id=w2_update, dep=res2}
                    end
                end
            end
            """,
            [
                # Must declare coeffs to have "no" shape, to keep loopy
                # from trying to figure out the shape automatically.
                lp.GlobalArg("coeffs", None, shape=None),
                "..."
                ],
            assumptions="deg>=0 and nels>=1"
            )

    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)

    if 0:
        knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
        knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0",
                inner_tag="ilp", slabs=(0, 1))
        knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))

    from pickle import dumps, loads
    knl = loads(dumps(knl))

    knl = lp.CompiledKernel(ctx, knl).get_highlighted_code(
            dict(
                qpts=np.float32,
                tmp=np.float32,
                coeffs=np.float32,
                result=np.float32,
                ))
    print(knl)
def variant_1(knl):
    knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
    knl = lp.prioritize_loops(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
    return knl
def test_lbm(ctx_factory):
    ctx = ctx_factory()

    # D2Q4Q4Q4 lattice Boltzmann scheme for the shallow water equations
    # Example by Loic Gouarin <*****@*****.**>
    knl = lp.make_kernel(
        "{[ii,jj]:0<=ii<nx-2 and 0<=jj<ny-2}",
        """  # noqa (silences flake8 line length warning)
        i := ii + 1
        j := jj + 1
        for ii, jj
            with {id_prefix=init_m}
                <> m[0] = + f[i-1, j, 0] + f[i, j-1, 1] + f[i+1, j, 2] + f[i, j+1, 3]
                m[1] = + 4.*f[i-1, j, 0] - 4.*f[i+1, j, 2]
                m[2] = + 4.*f[i, j-1, 1] - 4.*f[i, j+1, 3]
                m[3] = + f[i-1, j, 0] - f[i, j-1, 1] + f[i+1, j, 2] - f[i, j+1, 3]
                m[4] = + f[i-1, j, 4] + f[i, j-1, 5] + f[i+1, j, 6] + f[i, j+1, 7]
                m[5] = + 4.*f[i-1, j, 4] - 4.*f[i+1, j, 6]
                m[6] = + 4.*f[i, j-1, 5] - 4.*f[i, j+1, 7]
                m[7] = + f[i-1, j, 4] - f[i, j-1, 5] + f[i+1, j, 6] - f[i, j+1, 7]
                m[8] = + f[i-1, j, 8] + f[i, j-1, 9] + f[i+1, j, 10] + f[i, j+1, 11]
                m[9] = + 4.*f[i-1, j, 8] - 4.*f[i+1, j, 10]
                m[10] = + 4.*f[i, j-1, 9] - 4.*f[i, j+1, 11]
                m[11] = + f[i-1, j, 8] - f[i, j-1, 9] + f[i+1, j, 10] - f[i, j+1, 11]
            end

            with {id_prefix=update_m,dep=init_m*}
                m[1] = m[1] + 2.*(m[4] - m[1])
                m[2] = m[2] + 2.*(m[8] - m[2])
                m[3] = m[3]*(1. - 1.5)
                m[5] = m[5] + 1.5*(0.5*(m[0]*m[0]) + (m[4]*m[4])/m[0] - m[5])
                m[6] = m[6] + 1.5*(m[4]*m[8]/m[0] - m[6])
                m[7] = m[7]*(1. - 1.2000000000000000)
                m[9] = m[9] + 1.5*(m[4]*m[8]/m[0] - m[9])
                m[10] = m[10] + 1.5*(0.5*(m[0]*m[0]) + (m[8]*m[8])/m[0] - m[10])
                m[11] = m[11]*(1. - 1.2)
            end

            with {dep=update_m*}
                f_new[i, j, 0] = + 0.25*m[0] + 0.125*m[1] + 0.25*m[3]
                f_new[i, j, 1] = + 0.25*m[0] + 0.125*m[2] - 0.25*m[3]
                f_new[i, j, 2] = + 0.25*m[0] - 0.125*m[1] + 0.25*m[3]
                f_new[i, j, 3] = + 0.25*m[0] - 0.125*m[2] - 0.25*m[3]
                f_new[i, j, 4] = + 0.25*m[4] + 0.125*m[5] + 0.25*m[7]
                f_new[i, j, 5] = + 0.25*m[4] + 0.125*m[6] - 0.25*m[7]
                f_new[i, j, 6] = + 0.25*m[4] - 0.125*m[5] + 0.25*m[7]
                f_new[i, j, 7] = + 0.25*m[4] - 0.125*m[6] - 0.25*m[7]
                f_new[i, j, 8] = + 0.25*m[8] + 0.125*m[9] + 0.25*m[11]
                f_new[i, j, 9] = + 0.25*m[8] + 0.125*m[10] - 0.25*m[11]
                f_new[i, j, 10] = + 0.25*m[8] - 0.125*m[9] + 0.25*m[11]
                f_new[i, j, 11] = + 0.25*m[8] - 0.125*m[10] - 0.25*m[11]
            end
        end
        """)

    knl = lp.add_and_infer_dtypes(knl, {"f": np.float32})

    ref_knl = knl

    knl = lp.split_iname(knl, "ii", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "jj", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.expand_subst(knl)
    knl = lp.add_prefetch(knl, "f", "ii_inner,jj_inner", fetch_bounding_box=True,
            default_tag="l.auto")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nx": 20, "ny": 20})
def test_laplacian(ctx_factory):
    1/0  # not adapted to new language
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # load:  1+6 fields + 1/N D entry
    # store: 1 fields
    # perform: N*2*6 + 3*5 flops
    # ratio: (12*N+15)/8  flops per 4 bytes on bus
    #        ~ 14 FLOPS per 4 bytes at N=8
    #        ~ 525 GFLOPS max on a 150GB/s device at N=8 if done perfectly

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,e,m,o1,o2,o3,gi]: 0<=i,j,k,m,o1,o2,o3<%d"
            " and 0<=e<K and 0<=gi<6}" % n,
            [
                "CSE: ur(i,j,k) = sum_float32(o1, D[i,o1]*cse(u[e,o1,j,k], urf))",
                "CSE: us(i,j,k) = sum_float32(o2, D[j,o2]*cse(u[e,i,o2,k], usf))",
                "CSE: ut(i,j,k) = sum_float32(o3, D[k,o3]*cse(u[e,i,j,o3], utf))",

                # define function
                "CSE: Gu(i,j,k) = G[0,e,i,j,k]*ur(i,j,k)"
                " + G[1,e,i,j,k]*us(i,j,k) + G[2,e,i,j,k]*ut(i,j,k)",
                "CSE: Gv(i,j,k) = G[1,e,i,j,k]*ur(i,j,k)"
                " + G[3,e,i,j,k]*us(i,j,k) + G[4,e,i,j,k]*ut(i,j,k)",
                "CSE: Gw(i,j,k) = G[2,e,i,j,k]*ur(i,j,k)"
                " + G[4,e,i,j,k]*us(i,j,k) + G[5,e,i,j,k]*ut(i,j,k)",

                "lap[e,i,j,k] = "
                "  sum_float32(m, D[m,i]*Gu(m,j,k))"
                "+ sum_float32(m, D[m,j]*Gv(i,m,k))"
                "+ sum_float32(m, D[m,k]*Gw(i,j,m))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(6,)+field_shape, order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap", assumptions="K>=1")

    #print(lp.preprocess_kernel(knl, cse_ok=True))
    #1/0
    #
    #print(knl)
    #1/0

    knl = lp.realize_cse(knl, "urf", np.float32, ["o1"])
    knl = lp.realize_cse(knl, "usf", np.float32, ["o2"])
    knl = lp.realize_cse(knl, "utf", np.float32, ["o3"])

    knl = lp.realize_cse(knl, "Gu", np.float32, ["m", "j", "k"])
    knl = lp.realize_cse(knl, "Gv", np.float32, ["i", "m", "k"])
    knl = lp.realize_cse(knl, "Gw", np.float32, ["i", "j", "m"])

    knl = lp.realize_cse(knl, "ur", np.float32, ["k", "j", "m"])
    knl = lp.realize_cse(knl, "us", np.float32, ["i", "m", "k"])
    knl = lp.realize_cse(knl, "ut", np.float32, ["i", "j", "m"])

    if 0:
        pass
        #seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"],
        #        "G[gi,e,m,j,k]", default_tag="l.auto")
        #seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"], default_tag="l.auto")
        #seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"],
        #        "u[*,i,j,k]", default_tag="l.auto")
    else:
        seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))

    knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]",
            default_tag="l.auto")
    knl = lp.add_prefetch(knl, "D", ["m", "j"], default_tag="l.auto")
    #knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]",
    #        default_tag="l.auto")

    #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    #print(seq_knl)
    #print(lp.preprocess_kernel(knl))
    #1/0

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl,
            loop_priority=["m_fetch_G", "i_fetch_u"])
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*n*2*3 + n*n*n*5*3 + n**4*2*3)/1e9,
            op_label="GFlops",
            parameters={"K": K}, print_seq_code=True)
def test_laplacian_lmem(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 4

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}" % n,
            [
                "ur(a,b,c) := sum_float32(@o, D[a,o]*u[e,o,b,c])",
                "us(a,b,c) := sum_float32(@o, D[b,o]*u[e,a,o,c])",
                "ut(a,b,c) := sum_float32(@o, D[c,o]*u[e,a,b,o])",

                "lap[e,i,j,k] = "
                "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k)"
                " + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k)"
                " + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
                "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m)"
                " + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(6,)+field_shape, order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap", assumptions="K>=1")

    seq_knl = knl

    if 1:
        # original
        knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"],
                default_tag="l.auto")
        knl = lp.precompute(knl, "ur", np.float32, ["a", "b", "c"],
                default_tag="l.auto")
        knl = lp.precompute(knl, "us", np.float32, ["a", "b", "c"],
                default_tag="l.auto")
        knl = lp.precompute(knl, "ut", np.float32, ["a", "b", "c"],
                default_tag="l.auto")
        knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
        knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"],
                default_tag="l.auto")
    else:
        # experiment
        # knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"],
        #         default_tag="l.auto")
        knl = lp.precompute(knl, "eu", np.float32, ["b", "c"],
                default_tag="l.auto")
        knl = lp.precompute(knl, "ur", np.float32, ["b", "c"],
                default_tag="l.auto")
        knl = lp.precompute(knl, "us", np.float32, ["b", "c"],
                default_tag="l.auto")
        knl = lp.precompute(knl, "ut", np.float32, ["b", "c"],
                default_tag="l.auto")
        knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  # , slabs=(0, 1))
        knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"],
                default_tag="l.auto")

    #knl = lp.add_prefetch(knl, "G", [2,3,4], default_tag="l.auto")
    #        # axis/argument indices on G
    #knl = lp.add_prefetch(knl, "G", ["i", "j", "m", "k"], default_tag="l.auto")
    #print(knl)
    #1/0

    #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    # knl = lp.join_dimensions(knl, ["i", "j"], "i_and_j")

    #print(seq_knl)
    #print(lp.preprocess_kernel(knl))
    #1/0

    # TW: turned this off since it generated:
    # ValueError: cannot tag 'i_and_j'--not known
    # knl = lp.tag_inames(knl, dict(i_and_j="l.0", k="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*n*2*3 + n*n*n*5*3 + n**4*2*3)/1e9,
            op_label="GFlops",
            parameters={"K": K})
def test_interp_diff(ctx_factory):
    1/0  # not ready

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8
    M = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (N, N, N, K_sym)
    interim_field_shape = (M, M, M, K_sym)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,ip,j,jp,k,kp,e]: 0<=i,j,k<%d and 0<=ip,jp,kp<%d"
            " and 0<=e<K}" % (M, N),
            [
                "[|i,jp,kp] <float32> u1[i ,jp,kp,e] = sum_float32(ip, I[i,ip]*u [ip,jp,kp,e])",
                "[|i,j ,kp] <float32> u2[i ,j ,kp,e] = sum_float32(jp, I[j,jp]*u1[i ,jp,kp,e])",
                "[|i,j ,k ] <float32> u3[i ,j ,k ,e] = sum_float32(kp, I[k,kp]*u2[i ,j ,kp,e])",
                "[|i,j ,k ] <float32> Pu[i ,j ,k ,e] = P[i,j,k,e]*u3[i,j,k,e]",
                "[|i,j ,kp] <float32> Pu3[i ,j ,kp,e] = sum_float32(k, V[kp,k]*Pu[i ,j , k,e])",
                "[|i,jp,kp] <float32> Pu2[i ,jp,kp,e] = sum_float32(j, V[jp,j]*Pu[i ,j ,kp,e])",
                "Pu[ip,jp,kp,e] = sum_float32(i, V[ip,i]*Pu[i ,jp,kp,e])",
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("P", dtype, shape=interim_field_shape, order=order),
                lp.ArrayArg("I", dtype, shape=(M, N), order=order),
                lp.ArrayArg("V", dtype, shape=(N, M), order=order),
                lp.ArrayArg("Pu", dtype, shape=field_shape, order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="sem_lap_precon", assumptions="K>=1")

    print(knl)
    1/0

    seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    print(knl)
    #1/0

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000),
            kill_level_min=5)

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=0, op_label="GFlops",
            parameters={"K": K}, print_seq_code=True)
def variant_k_ilp(knl):
    knl = lp.tag_inames(knl, dict(n="l.0"))

    knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="ilp")
    knl = lp.tag_inames(knl, dict(m="unr"))
    return knl
def assign_axis(recursion_axis, iname, axis=None):
    """Assign iname to local axis *axis* and start over by calling
    the surrounding function assign_automatic_axes.

    If *axis* is None, find a suitable axis automatically.
    """
    try:
        with isl.SuppressedWarnings(kernel.isl_context):
            desired_length = kernel.get_constant_iname_length(iname)
    except isl.Error:
        # Likely unbounded, automatic assignment is not
        # going to happen for this iname.
        new_iname_to_tag = kernel.iname_to_tag.copy()
        new_iname_to_tag[iname] = None
        return assign_automatic_axes(
                kernel.copy(iname_to_tag=new_iname_to_tag),
                axis=recursion_axis)

    if axis is None:
        # {{{ find a suitable axis

        shorter_possible_axes = []
        test_axis = 0
        while True:
            if test_axis >= len(local_size):
                break
            if test_axis in assigned_local_axes:
                test_axis += 1
                continue

            if local_size[test_axis] < desired_length:
                shorter_possible_axes.append(test_axis)
                test_axis += 1
                continue
            else:
                axis = test_axis
                break

        # The loop above will find an unassigned local axis
        # that has enough 'room' for the iname. In the same traversal,
        # it also finds theoretically assignable axes that are shorter,
        # in the variable shorter_possible_axes.

        if axis is None and shorter_possible_axes:
            # sort as longest first
            shorter_possible_axes.sort(key=lambda ax: local_size[ax])
            axis = shorter_possible_axes[0]

        # }}}

    if axis is None:
        new_tag = None
    else:
        new_tag = LocalIndexTag(axis)
        if desired_length > local_size[axis]:
            from loopy import split_iname

            # Don't be tempted to switch the outer tag to unroll--this may
            # generate tons of code on some examples.

            return assign_automatic_axes(
                    split_iname(kernel, iname, inner_length=local_size[axis],
                        outer_tag=None, inner_tag=new_tag,
                        do_tagged_check=False),
                    axis=recursion_axis, local_size=local_size)

    if not isinstance(kernel.iname_to_tag.get(iname), AutoLocalIndexTagBase):
        raise LoopyError("trying to reassign '%s'" % iname)

    new_iname_to_tag = kernel.iname_to_tag.copy()
    new_iname_to_tag[iname] = new_tag
    return assign_automatic_axes(kernel.copy(iname_to_tag=new_iname_to_tag),
            axis=recursion_axis, local_size=local_size)
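# A sketch of the user-facing behavior this helper implements (assuming the
# same loopy version as the surrounding code): inames tagged "l.auto" receive
# concrete local axes during preprocessing via assign_automatic_axes.
def _demo_auto_local_axes():
    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<16}",
            "out[i,j] = 2*a[i,j]")
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    knl = lp.tag_inames(knl, {"i": "l.auto", "j": "l.auto"})
    # preprocessing resolves the auto tags to actual l.0/l.1 assignments
    return lp.preprocess_kernel(knl)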
def variant_cpu(knl):
    knl = lp.expand_subst(knl)
    knl = lp.split_iname(knl, "i", 1024, outer_tag="g.0", slabs=(0, 1))
    knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None)
    return knl
def variant_cpu(knl):
    unroll = 16
    block_size = unroll*4096
    knl = lp.split_iname(knl, "i", block_size, outer_tag="g.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "i_inner", unroll, inner_tag="unr")
    return knl
knl = lp.make_kernel(
    "{[i, j]: 0<=i<n and 0<=j<n}",
    "c[i, j] = a[i]*b[j]",
    assumptions="n >= 16")

a = np.arange(200, dtype=np.float32)
b = np.arange(200, dtype=np.float32)

knl = lp.set_options(knl, write_code=True)
evt, (c,) = knl(queue, a=a, b=b)
# SETUPEND

orig_knl = knl

# SPLITBEGIN
knl = lp.split_iname(knl, "i", 16,
        outer_tag="g.0", inner_tag="l.0")
knl = lp.split_iname(knl, "j", 16,
        outer_tag="g.1", inner_tag="l.1")
# SPLITEND

knl = lp.set_options(knl, write_code=True)
evt, (c,) = knl(queue, a=a, b=b)

split_knl = knl

# PREFETCH1BEGIN
knl = lp.add_prefetch(knl, "a",
        fetch_outer_inames="i_outer, i_inner, j_outer, j_inner")
knl = lp.add_prefetch(knl, "b",
        fetch_outer_inames="i_outer, i_inner, j_outer, j_inner")
import numpy as np

import loopy as lp
import pyopencl as cl
import pyopencl.array

knl = lp.make_kernel(
    "{ [i,k]: 0<=i<n and 0<=k<3 }",
    """
    c[k,i] = a[k, i + 1]
    out[k,i] = c[k,i]
    """)

# transform
knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

from loopy.kernel.tools import add_dtypes
knl = add_dtypes(knl, {
    "a": np.float32,
    "c": np.float32,
    "out": np.float32,
    "n": np.int32
    })

# schedule
from loopy.preprocess import preprocess_kernel
knl = preprocess_kernel(knl)

from loopy.schedule import get_one_scheduled_kernel
knl = get_one_scheduled_kernel(knl)
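# A possible final step (a sketch; generate_code_v2 accepts the scheduled
# kernel directly): emit and print the generated device code.
code = lp.generate_code_v2(knl).device_code()
print(code)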
def test_tim3d(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}" % n,
            [
                "ur(a,b,c) := sum_float32(@o, D[a,o]*u[e,o,b,c])",
                "us(a,b,c) := sum_float32(@o, D[b,o]*u[e,a,o,c])",
                "ut(a,b,c) := sum_float32(@o, D[c,o]*u[e,a,b,o])",

                "lap[e,i,j,k] = "
                "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k)"
                " + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
                " + sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k)"
                " + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
                " + sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m)"
                " + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(6,)+field_shape, order=order),
                # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                # lp.ImageArg("D", dtype, shape=(n, n)),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap3D", assumptions="K>=1")

    seq_knl = knl

    knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "k", "o"],
            default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j", "o", "k"], default_tag="l.auto")
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b", "c"],
            default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b", "c"],
            default_tag="l.auto")
    knl = lp.precompute(knl, "ut", np.float32, ["a", "b", "c"],
            default_tag="l.auto")

    knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  # , slabs=(0, 1))
    knl = lp.split_iname(knl, "k", n, inner_tag="l.2")  # , slabs=(0, 1))
    knl = lp.split_iname(knl, "j", n, inner_tag="l.1")  # , slabs=(0, 1))
    knl = lp.split_iname(knl, "i", n, inner_tag="l.0")  # , slabs=(0, 1))

    # knl = lp.tag_inames(knl, dict(k_inner="unr"))
    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))
    # knl = lp.tag_inames(knl, dict(i="unr"))

    knl = lp.add_prefetch(knl, "G", [2, 3, 4], default_tag="l.auto")
    # axis/argument indices on G

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 4000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*((n**4)*3*2 + (n**3)*5*3 + (n**4)*3*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True,
        split_kwargs=None):
    """
    :arg arrays_and_axes: a list of tuples *(array, axis_nr)* indicating
        that the index in *axis_nr* should be split. The tuples may
        also be *(array, axis_nr, "F")*, indicating that the index will
        be split as it would be according to Fortran order.

        *array* may name a temporary variable or an argument.

        If *arrays_and_axes* is a :class:`tuple`, it is automatically
        wrapped in a list, to make single splits easier.

    :arg count: The group size to use in the split.
    :arg auto_split_inames: Whether to automatically split inames
        encountered in the specified indices.
    :arg split_kwargs: arguments to pass to :func:`loopy.split_iname`

    Note that splits on the corresponding inames are carried out implicitly.
    The inames may *not* be split beforehand. (There's no *really* good reason
    for this--this routine is just not smart enough to deal with this.)
    """

    if count == 1:
        return kernel

    if split_kwargs is None:
        split_kwargs = {}

    # {{{ process input into array_to_rest

    # where "rest" is the non-argument-name part of the input tuples
    # in arrays_and_axes

    def normalize_rest(rest):
        if len(rest) == 1:
            return (rest[0], "C")
        elif len(rest) == 2:
            return rest
        else:
            raise RuntimeError("split instruction '%s' not understood" % rest)

    if isinstance(arrays_and_axes, tuple):
        arrays_and_axes = [arrays_and_axes]

    array_to_rest = dict(
            (tup[0], normalize_rest(tup[1:])) for tup in arrays_and_axes)

    if len(arrays_and_axes) != len(array_to_rest):
        raise RuntimeError("cannot split multiple axes of the same variable")

    del arrays_and_axes

    # }}}

    # {{{ adjust arrays

    from loopy.kernel.tools import ArrayChanger

    for array_name, (axis, order) in six.iteritems(array_to_rest):
        achng = ArrayChanger(kernel, array_name)
        ary = achng.get()

        from pytools import div_ceil

        # {{{ adjust shape

        new_shape = ary.shape
        if new_shape is not None:
            new_shape = list(new_shape)
            axis_len = new_shape[axis]
            new_shape[axis] = count
            outer_len = div_ceil(axis_len, count)

            if order == "F":
                new_shape.insert(axis+1, outer_len)
            elif order == "C":
                new_shape.insert(axis, outer_len)
            else:
                raise RuntimeError("order '%s' not understood" % order)
            new_shape = tuple(new_shape)

        # }}}

        # {{{ adjust dim tags

        if ary.dim_tags is None:
            raise RuntimeError("dim_tags of '%s' are not known" % array_name)
        new_dim_tags = list(ary.dim_tags)

        old_dim_tag = ary.dim_tags[axis]

        from loopy.kernel.array import FixedStrideArrayDimTag
        if not isinstance(old_dim_tag, FixedStrideArrayDimTag):
            raise RuntimeError("axis %d of '%s' is not tagged fixed-stride"
                    % (axis, array_name))

        old_stride = old_dim_tag.stride
        outer_stride = count*old_stride

        if order == "F":
            new_dim_tags.insert(axis+1, FixedStrideArrayDimTag(outer_stride))
        elif order == "C":
            new_dim_tags.insert(axis, FixedStrideArrayDimTag(outer_stride))
        else:
            raise RuntimeError("order '%s' not understood" % order)

        new_dim_tags = tuple(new_dim_tags)

        # }}}

        # {{{ adjust dim_names

        new_dim_names = ary.dim_names
        if new_dim_names is not None:
            new_dim_names = list(new_dim_names)
            existing_name = new_dim_names[axis]
            new_dim_names[axis] = existing_name + "_inner"

            outer_name = existing_name + "_outer"

            if order == "F":
                new_dim_names.insert(axis+1, outer_name)
            elif order == "C":
                new_dim_names.insert(axis, outer_name)
            else:
                raise RuntimeError("order '%s' not understood" % order)

            new_dim_names = tuple(new_dim_names)

        # }}}

        kernel = achng.with_changed_array(ary.copy(
            shape=new_shape, dim_tags=new_dim_tags, dim_names=new_dim_names))

    # }}}

    split_vars = {}

    var_name_gen = kernel.get_var_name_generator()

    def split_access_axis(expr):
        axis_nr, order = array_to_rest[expr.aggregate.name]

        idx = expr.index
        if not isinstance(idx, tuple):
            idx = (idx,)
        idx = list(idx)

        axis_idx = idx[axis_nr]

        if auto_split_inames:
            from pymbolic.primitives import Variable
            if not isinstance(axis_idx, Variable):
                raise RuntimeError("found access '%s' in which axis %d is not a "
                        "single variable--cannot split "
                        "(Have you tried to do the split yourself, manually, "
                        "beforehand? If so, you shouldn't.)"
                        % (expr, axis_nr))

            split_iname = idx[axis_nr].name
            assert split_iname in kernel.all_inames()

            try:
                outer_iname, inner_iname = split_vars[split_iname]
            except KeyError:
                outer_iname = var_name_gen(split_iname+"_outer")
                inner_iname = var_name_gen(split_iname+"_inner")
                split_vars[split_iname] = outer_iname, inner_iname

            inner_index = Variable(inner_iname)
            outer_index = Variable(outer_iname)

        else:
            from loopy.symbolic import simplify_using_aff
            inner_index = simplify_using_aff(kernel, axis_idx % count)
            outer_index = simplify_using_aff(kernel, axis_idx // count)

        idx[axis_nr] = inner_index

        if order == "F":
            idx.insert(axis_nr+1, outer_index)
        elif order == "C":
            idx.insert(axis_nr, outer_index)
        else:
            raise RuntimeError("order '%s' not understood" % order)

        return expr.aggregate.index(tuple(idx))

    rule_mapping_context = SubstitutionRuleMappingContext(
            kernel.substitutions, var_name_gen)
    aash = ArrayAxisSplitHelper(rule_mapping_context,
            set(six.iterkeys(array_to_rest)), split_access_axis)
    kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel))

    if auto_split_inames:
        from loopy import split_iname
        for iname, (outer_iname, inner_iname) in six.iteritems(split_vars):
            kernel = split_iname(kernel, iname, count,
                    outer_iname=outer_iname, inner_iname=inner_iname,
                    **split_kwargs)

    return kernel

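def _demo_split_array_dim():
    # Hedged usage sketch for split_array_dim above; the kernel and names are
    # illustrative, not from the original suite. Splitting axis 0 of "a" with
    # count=4 gives "a" the C-order shape (ceil(n/4), 4, n), rewrites accesses
    # a[i, j] to a[i_outer, i_inner, j], and--because auto_split_inames is on
    # by default--implicitly splits the iname "i" to match.
    knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<n}",
            "out[i, j] = 2*a[i, j]")
    knl = lp.add_dtypes(knl, {"a": np.float32})
    return split_array_dim(knl, ("a", 0), 4)
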
def get_optimized_kernel(self):
    # FIXME
    knl = self.get_kernel()
    knl = lp.split_iname(knl, "itgt_box", 16, outer_tag="g.0")
    return knl

def variant_1(knl):
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"])
    knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"])
    return knl

def test_mem_access_counter_mixed():
    knl = lp.make_kernel(
            "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
            [
                """
                c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]+x[i,k]
                e[i, k] = g[i,k]*(2+h[i,k])
                """
            ],
            name="mixed", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(knl, dict(
                a=np.float32, b=np.float32,
                g=np.float64, h=np.float64,
                x=np.float32))

    threads = 16
    knl = lp.split_iname(knl, "j", threads)
    knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})

    mem_map = lp.get_mem_access_map(knl)  # noqa

    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    f64uniform = mem_map[lp.MemAccess('global', np.float64,
                            stride=0, direction='load', variable='g')
                         ].eval_with_dict(params)
    f64uniform += mem_map[lp.MemAccess('global', np.float64,
                            stride=0, direction='load', variable='h')
                          ].eval_with_dict(params)
    f32uniform = mem_map[lp.MemAccess('global', np.float32,
                            stride=0, direction='load', variable='x')
                         ].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                            stride=Variable('m'), direction='load',
                            variable='a')
                           ].eval_with_dict(params)
    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                            stride=Variable('m'), direction='load',
                            variable='b')
                            ].eval_with_dict(params)

    assert f64uniform == 2*n*m
    assert f32uniform == n*m*l/threads
    assert f32nonconsec == 3*n*m*l

    f64uniform = mem_map[lp.MemAccess('global', np.float64,
                            stride=0, direction='store', variable='e')
                         ].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
                            stride=Variable('m'), direction='store',
                            variable='c')
                           ].eval_with_dict(params)

    assert f64uniform == n*m
    assert f32nonconsec == n*m*l

def variant_image_d(knl):
    knl = lp.tag_inames(knl, dict(n="l.0"))
    knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
    knl = lp.change_arg_to_image(knl, "DrDsDt")
    return knl

def test_laplacian_lmem_ilp(ctx_factory):
    # This does not lead to practical/runnable code (out of lmem), but it's an
    # excellent stress test for the code generator. :)

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K}" % n,
            [
                "ur(i,j,k) := sum_float32(@o, D[i,o]*u[e,o,j,k])",
                "us(i,j,k) := sum_float32(@o, D[j,o]*u[e,i,o,k])",
                "ut(i,j,k) := sum_float32(@o, D[k,o]*u[e,i,j,o])",

                "lap[e,i,j,k] = "
                "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
                "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(6,) + field_shape, order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap", assumptions="K>=1")

    # Must act on u first, otherwise stencil becomes crooked and
    # footprint becomes non-convex.

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
    knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"],
            default_tag="l.auto")

    knl = lp.precompute(knl, "ur", np.float32, [0, 1, 2, "e_inner_inner"],
            default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, [0, 1, 2, "e_inner_inner"],
            default_tag="l.auto")
    knl = lp.precompute(knl, "ut", np.float32, [0, 1, 2, "e_inner_inner"],
            default_tag="l.auto")

    knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", "e_inner_inner"],
            default_tag="l.auto")
    knl = lp.add_prefetch(knl, "D", ["m", "j"], default_tag="l.auto")

    # print(seq_knl)
    # 1/0

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    for knl in kernel_gen:
        print(lp.generate_code(knl))

def split(vanilla):
    k = lp.split_iname(vanilla, "i", 4, slabs=(1, 1))
    k = lp.prioritize_loops(k, "i_outer,i_inner")
    return k

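def _demo_split():
    # Hedged usage sketch for split() above; the kernel is illustrative and
    # not part of the original suite. slabs=(1, 1) requests separate leading
    # and trailing slabs for i_outer, so the possibly-ragged first and last
    # chunks get peeled off from the unconditional bulk of the loop.
    vanilla = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "out[i] = 2*a[i]")
    vanilla = lp.add_dtypes(vanilla, {"a": np.float32})
    return split(vanilla)
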
def test_advect(ctx_factory):
    1/0  # not ready

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, N, N, N)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic

    # A. updated for CSE: notation.
    # B. fixed temp indexing and C ordering
    # load:    3+9 fields + 1/N D entry
    # store:   3 fields
    # perform: N*2*6 + 3*5 + 3*5 flops
    # ratio:   (12*N+30)/15 flops per 4 bytes on bus
    #          ~ 8.4 FLOPS per 4 bytes at N=8
    #          ~ 300 GFLOPS max on a 150GB/s device at N=8 if done perfectly

    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,m,e]: 0<=i,j,k,m<%d and 0<=e<K}" % N,
            [
                # differentiate u
                "CSE: ur(i,j,k) = sum_float32(@m, D[i,m]*u[e,m,j,k])",
                "CSE: us(i,j,k) = sum_float32(@m, D[j,m]*u[e,i,m,k])",
                "CSE: ut(i,j,k) = sum_float32(@m, D[k,m]*u[e,i,j,m])",

                # differentiate v
                "CSE: vr(i,j,k) = sum_float32(@m, D[i,m]*v[e,m,j,k])",
                "CSE: vs(i,j,k) = sum_float32(@m, D[j,m]*v[e,i,m,k])",
                "CSE: vt(i,j,k) = sum_float32(@m, D[k,m]*v[e,i,j,m])",

                # differentiate w
                "CSE: wr(i,j,k) = sum_float32(@m, D[i,m]*w[e,m,j,k])",
                "CSE: ws(i,j,k) = sum_float32(@m, D[j,m]*w[e,i,m,k])",
                "CSE: wt(i,j,k) = sum_float32(@m, D[k,m]*w[e,i,j,m])",

                # find velocity in (r,s,t) coordinates
                # CSE?
                "CSE: Vr(i,j,k) = G[0,e,i,j,k]*u[e,i,j,k] + G[1,e,i,j,k]*v[e,i,j,k] + G[2,e,i,j,k]*w[e,i,j,k]",
                "CSE: Vs(i,j,k) = G[3,e,i,j,k]*u[e,i,j,k] + G[4,e,i,j,k]*v[e,i,j,k] + G[5,e,i,j,k]*w[e,i,j,k]",
                "CSE: Vt(i,j,k) = G[6,e,i,j,k]*u[e,i,j,k] + G[7,e,i,j,k]*v[e,i,j,k] + G[8,e,i,j,k]*w[e,i,j,k]",

                # form nonlinear term on integration nodes
                "Nu[e,i,j,k] = Vr(i,j,k)*ur(i,j,k)+Vs(i,j,k)*us(i,j,k)+Vt(i,j,k)*ut(i,j,k)",
                "Nv[e,i,j,k] = Vr(i,j,k)*vr(i,j,k)+Vs(i,j,k)*vs(i,j,k)+Vt(i,j,k)*vt(i,j,k)",
                "Nw[e,i,j,k] = Vr(i,j,k)*wr(i,j,k)+Vs(i,j,k)*ws(i,j,k)+Vt(i,j,k)*wt(i,j,k)",
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("v", dtype, shape=field_shape, order=order),
                lp.ArrayArg("w", dtype, shape=field_shape, order=order),
                lp.ArrayArg("Nu", dtype, shape=field_shape, order=order),
                lp.ArrayArg("Nv", dtype, shape=field_shape, order=order),
                lp.ArrayArg("Nw", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(9,) + field_shape, order=order),
                lp.ArrayArg("D", dtype, shape=(N, N), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="sem_advect", assumptions="K>=1")

    print(knl)
    1/0

    seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=0, op_label="GFlops",
            parameters={"K": K}, print_seq_code=True)

def test_matmul(ctx_factory, buffer_inames):
    ctx = ctx_factory()

    if (buffer_inames
            and ctx.devices[0].platform.name == "Portable Computing Language"):
        pytest.skip("crashes on pocl")

    logging.basicConfig(level=logging.INFO)

    fortran_src = """
        subroutine dgemm(m,n,ell,a,b,c)
          implicit none
          real*8 a(m,ell),b(ell,n),c(m,n)
          integer m,n,k,i,j,ell

          do j = 1,n
            do i = 1,m
              do k = 1,ell
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    knl, = lp.parse_fortran(fortran_src)

    assert len(knl.domains) == 1

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)
    knl = lp.assume(knl, "n mod 32 = 0")
    knl = lp.assume(knl, "m mod 32 = 0")
    knl = lp.assume(knl, "ell mod 16 = 0")

    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner",
            precompute_outer_inames="i_outer, j_outer, k_outer",
            default_tag="l.auto")
    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner",
            precompute_outer_inames="i_outer, j_outer, k_outer",
            default_tag="l.auto")

    knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames,
            init_expression="0", store_expression="base+buffer")

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters=dict(n=128, m=128, ell=128))
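
# buffer_inames for test_matmul above arrives as a pytest fixture parameter.
# A plausible parametrization (hypothetical -- the decorator is not part of
# this excerpt) would exercise both the unbuffered and the buffered code path:
#
#   @pytest.mark.parametrize("buffer_inames", [
#       "",
#       "i_inner,j_inner",
#       ])
#   def test_matmul(ctx_factory, buffer_inames):
#       ...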