def test_small_batched_matvec(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    K = 9997  # noqa
    Np = 36  # noqa

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=k<K and 0<= i,j < %d}" % Np,
            [
                "result[k, i] = sum(j, d[i, j]*f[k, j])"
                ],
            [
                lp.GlobalArg("d", dtype, shape=(Np, Np), order=order),
                lp.GlobalArg("f", dtype, shape=("K", Np), order=order),
                lp.GlobalArg("result", dtype, shape=("K", Np), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="batched_matvec", assumptions="K>=1")

    seq_knl = knl

    align_bytes = 64
    knl = lp.add_prefetch(knl, 'd[:,:]')

    pad_mult = lp.find_padding_multiple(knl, "f", 0, align_bytes)
    knl = lp.split_array_dim(knl, ("f", 0), pad_mult)
    knl = lp.add_padding(knl, "f", 0, align_bytes)

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[K*2*Np**2/1e9], op_label=["GFlops"],
            parameters=dict(K=K))
def test_tim2d(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
            [
                "ur(a,b) := sum_float32(@o, D[a,o]*u[e,o,b])",
                "us(a,b) := sum_float32(@o, D[b,o]*u[e,a,o])",

                "lap[e,i,j]  = "
                "  sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j) + G[1,e,m,j]*us(m,j)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m) + G[2,e,i,m]*us(i,m)))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
                # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                # lp.ImageArg("D", dtype, shape=(n, n)),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap2D", assumptions="K>=1")

    unroll = 32

    seq_knl = knl

    knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "o"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j", "o"], default_tag="l.auto")
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  #, slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))

    # knl = lp.add_prefetch(knl, "G", [2,3], default_tag=None)  # axis/argument indices on G
    knl = lp.add_prefetch(knl, "G", [2, 3], default_tag="l.auto")  # axis/argument indices on G

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
def test_troublesome_premagma_fermi_matrix_mul(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 6*16*2

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                ],
            name="matmul")

    seq_knl = knl

    i_reg = 2
    j_reg = 2
    i_chunks = 16
    j_chunks = 16
    knl = lp.split_iname(knl, "i", i_reg*i_chunks, outer_tag="g.0")
    knl = lp.split_iname(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp")
    knl = lp.split_iname(knl, "j", j_reg*j_chunks, outer_tag="g.1")
    knl = lp.split_iname(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp")
    knl = lp.split_iname(knl, "k", 16)
    knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"])

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={})
def test_variable_size_matrix_mul(ctx_factory):
    ctx = ctx_factory()

    if (not ctx.devices[0].image_support
            or ctx.devices[0].platform.name == "Portable Computing Language"):
        pytest.skip("crashes on pocl")

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<n}",
            "c[i, j] = sum(k, a[i, k]*b[k, j])")

    knl = lp.add_dtypes(knl, {
        "a": np.float32,
        "b": np.float32,
        })

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1",
            slabs=(0, 1))
    knl = lp.split_iname(knl, "j", 16,
            outer_tag="g.1", inner_tag="l.0",
            slabs=(0, 1))
    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))

    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={"n": n})
def test_fancy_matrix_mul(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()

    order = "C"

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "[n] -> {[i,j,k]: 0<=i,j,k<n }",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.GlobalArg("a", dtype, shape="(n, n)", order=order),
                lp.GlobalArg("b", dtype, shape="(n, n)", order=order),
                lp.GlobalArg("c", dtype, shape="(n, n)", order=order),
                lp.ValueArg("n", np.int32, approximately=1000),
                ],
            name="fancy_matmul", assumptions="n>=1")

    seq_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 16, slabs=(0, 1))
    knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
    knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"])

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters=dict(n=n))
def test_plain_matrix_mul(ctx_factory):
    ctx = ctx_factory()
    order = "C"

    n = get_suitable_size(ctx)

    for dtype, check, vec_size in [
            (cl_array.vec.float4, check_float4, 4),
            (np.float32, None, 1),
            ]:
        knl = lp.make_kernel(
                "{[i,j,k]: 0<=i,j,k<%d}" % n,
                [
                    "c[i, j] = sum(k, a[i, k]*b[k, j])"
                    ],
                [
                    lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                    lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                    lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                    ],
                name="matmul")

        ref_knl = knl

        knl = lp.split_iname(knl, "i", 16,
                outer_tag="g.0", inner_tag="l.1")
        knl = lp.split_iname(knl, "j", 16,
                outer_tag="g.1", inner_tag="l.0")
        knl = lp.split_iname(knl, "k", 16)
        knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
        knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

        lp.auto_test_vs_ref(ref_knl, ctx, knl,
                op_count=[vec_size*2*n**3/1e9], op_label=["GFlops"],
                parameters={"n": n}, check_result=check)
def test_variable_size_matrix_mul(ctx_factory):
    ctx = ctx_factory()

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<n}",
            "c[i, j] = sum(k, a[i, k]*b[k, j])")

    knl = lp.add_dtypes(knl, {
        "a": np.float32,
        "b": np.float32,
        })

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1",
            slabs=(0, 1))
    knl = lp.split_iname(knl, "j", 16,
            outer_tag="g.1", inner_tag="l.0",
            slabs=(0, 1))
    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))

    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={"n": n})
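
# Illustrative sketch, not part of the original suite: the tiling recipe the
# matmul tests above share, in isolation. The function name is made up; the
# transformations (split_iname into group/local axes, add_prefetch of the
# operand tiles into local memory) are exactly the ones used above.
def _matmul_tiling_sketch():
    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<n}",
            "c[i, j] = sum(k, a[i, k]*b[k, j])")
    knl = lp.add_dtypes(knl, {"a": np.float32, "b": np.float32})

    # each 16x16 work group owns one 16x16 tile of c
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    # march along k in chunks...
    knl = lp.split_iname(knl, "k", 16)
    # ...staging the a- and b-tiles for each chunk in local memory
    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")
    return knl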
def test_atomic(ctx_factory, dtype):
    ctx = ctx_factory()

    if (
            np.dtype(dtype).itemsize == 8
            and "cl_khr_int64_base_atomics" not in ctx.devices[0].extensions):
        pytest.skip("64-bit atomics not supported on device")

    import pyopencl.version  # noqa
    if (
            cl.version.VERSION < (2015, 2)
            and dtype == np.int64):
        pytest.skip("int64 RNG not supported in PyOpenCL < 2015.2")

    knl = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            "out[i%20] = out[i%20] + 2*a[i] {atomic}",
            [
                lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
                lp.GlobalArg("a", dtype, shape=lp.auto),
                "..."
                ],
            assumptions="n>0")

    ref_knl = knl
    knl = lp.split_iname(knl, "i", 512)
    knl = lp.split_iname(knl, "i_inner", 128, outer_tag="unr", inner_tag="g.0")
    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000))
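
# Illustrative sketch, not part of the original suite: the two ingredients of
# an atomic update in loopy, as exercised by test_atomic. The {atomic} flag on
# the instruction and for_atomic=True on the destination arg together make
# loopy emit an atomic read-modify-write. Names and sizes are made up.
def _atomic_update_sketch():
    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "hist[i % 8] = hist[i % 8] + 1 {atomic}",
            [
                lp.GlobalArg("hist", np.int32, shape=(8,), for_atomic=True),
                "...",
                ],
            assumptions="n>0")
    # parallelizing i creates the concurrent updates the atomic protects against
    return lp.split_iname(knl, "i", 64, outer_tag="g.0", inner_tag="l.0")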
def test_precompute_nested_subst(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        E:=a[i]
        D:=E*E
        b[i] = D
        """)

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    ref_knl = knl

    knl = lp.tag_inames(knl, dict(j="g.1"))
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
    knl = lp.precompute(knl, "D", "i_inner")

    # There's only one surviving 'E' rule.
    assert len([
        rule_name
        for rule_name in knl.substitutions
        if rule_name.startswith("E")]) == 1

    # That rule should use the newly created prefetch inames,
    # not the prior 'i_inner'
    assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression)

    lp.auto_test_vs_ref(
        ref_knl, ctx, knl,
        parameters=dict(n=12345))
def test_vectorize(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
    knl = lp.set_array_dim_names(knl, "a,b", "i")
    knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4,
            split_kwargs=dict(slabs=(0, 1)))

    knl = lp.tag_data_axes(knl, "a,b", "c,vec")
    ref_knl = knl
    ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})

    knl = lp.tag_inames(knl, {"i_inner": "vec"})

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)

    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(n=30))
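
# Illustrative sketch, not part of the original suite: test_vectorize's recipe
# in isolation. split_array_dim gives the arrays a length-4 inner axis (and
# splits the indexing iname with it), tag_data_axes marks that axis "vec" so
# it maps to a vector type (e.g. float4), and tagging the matching iname "vec"
# turns the accesses into whole-vector loads/stores. Names are made up.
def _vectorize_sketch():
    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "a[i] = 2*b[i]")
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
    knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4,
            split_kwargs=dict(slabs=(0, 1)))
    knl = lp.tag_data_axes(knl, "a,b", "c,vec")
    return lp.tag_inames(knl, {"i_inner": "vec"})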
def test_precompute_with_preexisting_inames(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })

    knl = lp.fix_parameters(knl, n=13)

    ref_knl = knl

    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")
    knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
            precompute_inames="ii,jj")

    knl = lp.set_loop_priority(knl, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(E=200))
def test_alias_temporaries(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        times2(i) := 2*a[i]
        times3(i) := 3*a[i]
        times4(i) := 4*a[i]

        x[i] = times2(i)
        y[i] = times3(i)
        z[i] = times4(i)
        """)

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")

    knl = lp.precompute(knl, "times2", "i_inner")
    knl = lp.precompute(knl, "times3", "i_inner")
    knl = lp.precompute(knl, "times4", "i_inner")

    knl = lp.alias_temporaries(knl, ["times2_0", "times3_0", "times4_0"])

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(n=30))
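
# Illustrative sketch, not part of the original suite: what alias_temporaries
# buys in test_alias_temporaries. The precomputed stages are live one after
# the other, never at the same time, so their temporaries (named times2_0 etc.
# by loopy, per the test above) can share a single block of storage.
def _alias_temporaries_sketch():
    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            """
            t2(i) := 2*a[i]
            t3(i) := 3*a[i]
            x[i] = t2(i)
            y[i] = t3(i)
            """)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.precompute(knl, "t2", "i_inner")
    knl = lp.precompute(knl, "t3", "i_inner")
    # t2_0 is dead once x is written, so t3_0 may reuse its storage
    return lp.alias_temporaries(knl, ["t2_0", "t3_0"])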
def test_transpose(ctx_factory):
    dtype = np.dtype(np.float32)
    ctx = ctx_factory()
    order = "C"

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<%d}" % n,
            [
                "b[i, j] = a[j, i]"
                ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                ],
            name="transpose")

    seq_knl = knl

    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16,
            outer_tag="g.1", inner_tag="l.0")
    knl = lp.add_prefetch(knl, 'a', ["i_inner", "j_inner"])

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[dtype.itemsize*n**2*2/1e9], op_label=["GByte"],
            parameters={})
def test_conditional(ctx_factory):
    #logging.basicConfig(level=logging.DEBUG)
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i,j]: 0<=i,j<n }",
            """
            <> my_a = a[i,j] {id=read_a}
            <> a_less_than_zero = my_a < 0 {dep=read_a,inames=i:j}
            my_a = 2*my_a {id=twice_a,dep=read_a,if=a_less_than_zero}
            my_a = my_a+1 {id=aplus,dep=twice_a,if=a_less_than_zero}
            out[i,j] = 2*my_a {dep=aplus}
            """,
            [
                lp.GlobalArg("a", np.float32, shape=lp.auto),
                lp.GlobalArg("out", np.float32, shape=lp.auto),
                "..."
                ])

    ref_knl = knl

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(
        n=200
        ))
def test_divisibility_assumption(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "[n] -> {[i]: 0<=i<n}",
            [
                "b[i] = 2*a[i]"
                ],
            [
                lp.GlobalArg("a", np.float32, shape=("n",)),
                lp.GlobalArg("b", np.float32, shape=("n",)),
                lp.ValueArg("n", np.int32),
                ],
            assumptions="n>=1 and (exists zz: n = 16*zz)")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16)

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        code = lp.generate_code(k)
        assert "if" not in code

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters={"n": 16**3})
def test_convolution(ctx_factory):
    ctx = ctx_factory()

    dtype = np.float32

    knl = lp.make_kernel(
        "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
                -f_w <= f_x,f_y <= f_w \
                and 0 <= im_x < im_w and 0 <= im_y < im_h \
                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
                }",
        """
        out[iimg, ifeat, im_x, im_y] = sum((f_x, f_y, icolor), \
                img[iimg, f_w+im_x-f_x, f_w+im_y-f_y, icolor] \
                * f[ifeat, f_w+f_x, f_w+f_y, icolor])
        """,
        [
            lp.GlobalArg("f", dtype, shape=lp.auto),
            lp.GlobalArg("img", dtype, shape=lp.auto),
            lp.GlobalArg("out", dtype, shape=lp.auto),
            "..."
            ],
        assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0",
        options="annotate_inames")

    f_w = 3

    knl = lp.fix_parameters(knl, f_w=f_w, ncolors=3)

    ref_knl = knl

    def variant_0(knl):
        #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
        knl = lp.prioritize_loops(knl, "iimg,im_x,im_y,ifeat,f_x,f_y")
        return knl

    def variant_1(knl):
        knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
        knl = lp.prioritize_loops(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
        return knl

    def variant_2(knl):
        knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0")
        knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1")
        knl = lp.tag_inames(knl, dict(ifeat="g.2"))
        knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]",
                default_tag="l.auto")
        knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y",
                default_tag="l.auto")
        return knl

    for variant in [
            #variant_0,
            #variant_1,
            variant_2
            ]:
        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
                parameters=dict(
                    im_w=128, im_h=128, f_w=f_w,
                    nfeats=3, nimgs=3
                    ))
def test_equality_constraints(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 10

    knl = lp.make_kernel([
            "[n] -> {[i,j]: 0<=i,j<n }",
            "{[k]: k =i+5 and k < n}",
            ],
            [
                "a[i,j] = 5 {id=set_all}",
                "b[i,k] = 22 {dep=set_all}",
                ],
            [
                lp.GlobalArg("a,b", dtype, shape="n, n", order=order),
                lp.ValueArg("n", np.int32, approximately=1000),
                ],
            name="equality_constraints", assumptions="n>=1")

    seq_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
    #print(knl)
    #print(knl.domains[0].detect_equalities())

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            parameters=dict(n=n), print_ref_code=True)
def test_global_parallel_reduction_simpler(ctx_factory, size):
    ctx = ctx_factory()

    pytest.xfail("very sensitive to kernel ordering, fails unused hw-axis check")

    knl = lp.make_kernel(
            "{[l,g,j]: 0 <= l < nl and 0 <= g,j < ng}",
            """
            <> key = make_uint2(l+nl*g, 1234)  {inames=l:g}
            <> ctr = make_uint4(0, 1, 2, 3)  {inames=l:g,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}

            <> tmp[g] = sum(l, vals.s0 + 1j*vals.s1 + vals.s2 + 1j*vals.s3)

            result = sum(j, tmp[j])
            """)

    ng = 50
    knl = lp.fix_parameters(knl, ng=ng)

    knl = lp.set_options(knl, write_cl=True)

    ref_knl = knl

    knl = lp.split_iname(knl, "l", 128, inner_tag="l.0")
    knl = lp.split_reduction_outward(knl, "l_inner")
    knl = lp.tag_inames(knl, "g:g.0,j:l.0")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nl": size})
def test_ilp_loop_bound(ctx_factory):
    # The salient bit of this test is that a joint bound on (outer, inner)
    # from a split occurs in a setting where the inner loop has been ilp'ed.
    # In 'normal' parallel loops, the inner index is available for conditionals
    # throughout. In ILP'd loops, not so much.

    ctx = ctx_factory()
    knl = lp.make_kernel(
            "{ [i,j,k]: 0<=i,j,k<n }",
            """
            out[i,k] = sum(j, a[i,j]*b[j,k])
            """,
            [
                lp.GlobalArg("a,b", np.float32, shape=lp.auto),
                "...",
                ],
            assumptions="n>=1")

    ref_knl = knl

    knl = lp.set_loop_priority(knl, "j,i,k")
    knl = lp.split_iname(knl, "k", 4, inner_tag="ilp")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(
        n=200
        ))
def test_to_batched_temp(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            ''' { [i,j]: 0<=i,j<n } ''',
            ''' cnst = 2.0
            out[i] = sum(j, cnst*a[i,j]*x[j])''',
            [lp.TemporaryVariable(
                "cnst",
                dtype=np.float32,
                shape=(),
                scope=lp.temp_var_scope.PRIVATE), '...'])
    knl = lp.add_and_infer_dtypes(knl, dict(out=np.float32,
                                            x=np.float32,
                                            a=np.float32))

    ref_knl = lp.make_kernel(
            ''' { [i,j]: 0<=i,j<n } ''',
            '''out[i] = sum(j, 2.0*a[i,j]*x[j])''')
    ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(out=np.float32,
                                                    x=np.float32,
                                                    a=np.float32))

    bknl = lp.to_batched(knl, "nbatches", "out,x")
    bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")

    # checking that cnst is not being batched
    assert bknl.temporary_variables['cnst'].shape == ()

    a = np.random.randn(5, 5)
    x = np.random.randn(7, 5)

    # Checking that the program compiles and the logic is correct
    lp.auto_test_vs_ref(
            bref_knl, ctx, bknl,
            parameters=dict(a=a, x=x, n=5, nbatches=7))
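
# Illustrative sketch, not part of the original suite: lp.to_batched prepends
# a batch axis (and a batch iname) to the arguments named in its third
# parameter; everything else -- like the scalar temporary "cnst" above --
# keeps its shape. Names are made up.
def _to_batched_sketch():
    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<n}",
            "out[i] = sum(j, a[i,j]*x[j])")
    knl = lp.add_and_infer_dtypes(knl,
            dict(out=np.float32, x=np.float32, a=np.float32))

    # out: (n,) -> (nbatches, n), x: (n,) -> (nbatches, n); a stays shared
    return lp.to_batched(knl, "nbatches", "out,x")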
def test_assignment_to_subst_indices(ctx_factory):
    fortran_src = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a(n), out(n), out2(n), inp(n)
          integer n, i

          do i = 1, n
            a(i) = 6*inp(i)
          enddo

          do i = 1, n
            out(i) = 5*a(i)
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    knl = lp.fix_parameters(knl, n=5)

    ref_knl = knl

    assert "a" in knl.temporary_variables
    knl = lp.assignment_to_subst(knl, "a")
    assert "a" not in knl.temporary_variables

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl)
def test_assignment_to_subst_two_defs(ctx_factory):
    fortran_src = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a, out(n), out2(n), inp(n)
          integer n, i

          do i = 1, n
            a = inp(i)
            out(i) = 5*a
            a = 3*inp(n)
            out2(i) = 6*a
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    ref_knl = knl

    knl = lp.assignment_to_subst(knl, "a")

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def test_fill(ctx_factory):
    fortran_src = """
        subroutine fill(out, a, n)
          implicit none

          real*8 a, out(n)
          integer n, i

          do i = 1, n
            out(i) = a
          end do
        end

        !$loopy begin
        !
        ! fill, = lp.parse_fortran(SOURCE)
        ! fill = lp.split_iname(fill, "i", split_amount,
        !     outer_tag="g.0", inner_tag="l.0")
        ! RESULT = [fill]
        !
        !$loopy end
        """

    knl, = lp.parse_transformed_fortran(fortran_src,
            pre_transform_code="split_amount = 128")

    assert "i_inner" in knl.all_inames()

    ctx = ctx_factory()

    lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(n=5, a=5))
def test_if(ctx_factory):
    fortran_src = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a, b, out(n), out2(n), inp(n)
          integer n, i, j

          do i = 1, n
            a = inp(i)
            if (a.ge.3) then
                b = 2*a
                do j = 1,3
                    b = 3 * b
                end do
                out(i) = 5*b
            else
                out(i) = 4*a
            endif
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    ref_knl = knl

    knl = lp.assignment_to_subst(knl, "a")

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def test_kernel_splitting_with_loop(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i,k]: 0<=i<n and 0<=k<3 }",
            """
            c[k,i] = a[k, i + 1]
            out[k,i] = c[k,i]
            """)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    # schedule
    from loopy.preprocess import preprocess_kernel
    knl = preprocess_kernel(knl)

    from loopy.schedule import get_one_scheduled_kernel
    knl = get_one_scheduled_kernel(knl)

    # map schedule onto host or device
    print(knl)

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    print(cgr.device_code())
    print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
def test_global_temporary(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i]: 0<=i<n}",
            """
            <> c[i] = a[i + 1]
            out[i] = c[i]
            """)

    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
    knl = lp.set_temporary_scope(knl, "c", "global")

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    cgr = lp.generate_code_v2(knl)

    assert len(cgr.device_programs) == 2

    #print(cgr.device_code())
    #print(cgr.host_code())

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
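
# Illustrative sketch, not part of the original suite: why the two tests above
# assert len(cgr.device_programs) == 2. A global temporary written by one set
# of work-items and read by another needs a global barrier, which OpenCL only
# provides at kernel boundaries -- so loopy splits the schedule into two
# device kernels, with the temporary carried between them. Names are made up.
def _count_device_programs_sketch():
    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            """
            <> c[i] = a[i + 1]
            out[i] = c[i]
            """)
    knl = lp.add_and_infer_dtypes(knl,
            {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
    knl = lp.set_temporary_scope(knl, "c", "global")
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    cgr = lp.generate_code_v2(knl)
    return len(cgr.device_programs)  # expected: 2, one per side of the barrier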
def test_vector_types(ctx_factory, vec_len):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i,j]: 0<=i<n and 0<=j<vec_len }",
            "out[i,j] = 2*a[i,j]",
            [
                lp.GlobalArg("a", np.float32, shape=lp.auto),
                lp.GlobalArg("out", np.float32, shape=lp.auto),
                "..."
                ])

    knl = lp.fix_parameters(knl, vec_len=vec_len)

    ref_knl = knl

    knl = lp.tag_data_axes(knl, "out", "c,vec")
    knl = lp.tag_inames(knl, dict(j="unr"))

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(
        n=20000
        ))
def test_local_parallel_reduction(ctx_factory, size):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i, j]: 0 <= i < n and 0 <= j < 5}",
            """
            z[j] = sum(i, i+j)
            """)

    knl = lp.fix_parameters(knl, n=size)

    ref_knl = knl

    def variant0(knl):
        return lp.tag_inames(knl, "i:l.0")

    def variant1(knl):
        return lp.tag_inames(knl, "i:l.0,j:l.1")

    def variant2(knl):
        return lp.tag_inames(knl, "i:l.0,j:g.0")

    for variant in [
            variant0,
            variant1,
            variant2
            ]:
        knl = variant(ref_knl)

        lp.auto_test_vs_ref(ref_knl, ctx, knl)
def test_domain_dependency_via_existentially_quantified_variable(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 10

    knl = lp.make_kernel([
            "{[i]: 0<=i<n }",
            "{[k]: k=i and (exists l: k = 2*l) }",
            ],
            [
                "a[i] = 5 {id=set}",
                "b[k] = 6 {dep=set}",
                ],
            [
                lp.GlobalArg("a,b", dtype, shape="n", order=order),
                lp.ValueArg("n", np.int32, approximately=1000),
                ],
            assumptions="n>=1")

    seq_knl = knl

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            parameters=dict(n=n))
def test_interp_diff(ctx_factory):
    1/0  # not ready

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8
    M = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (N, N, N, K_sym)
    interim_field_shape = (M, M, M, K_sym)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,ip,j,jp,k,kp,e]: 0<=i,j,k<%d and 0<=ip,jp,kp<%d"
            " and 0<=e<K}" % (M, N),
            [
                "[|i,jp,kp] <float32> u1[i ,jp,kp,e] = sum_float32(ip, I[i,ip]*u [ip,jp,kp,e])",
                "[|i,j ,kp] <float32> u2[i ,j ,kp,e] = sum_float32(jp, I[j,jp]*u1[i ,jp,kp,e])",
                "[|i,j ,k ] <float32> u3[i ,j ,k ,e] = sum_float32(kp, I[k,kp]*u2[i ,j ,kp,e])",
                "[|i,j ,k ] <float32> Pu[i ,j ,k ,e] = P[i,j,k,e]*u3[i,j,k,e]",
                "[|i,j ,kp] <float32> Pu3[i ,j ,kp,e] = sum_float32(k, V[kp,k]*Pu[i ,j , k,e])",
                "[|i,jp,kp] <float32> Pu2[i ,jp,kp,e] = sum_float32(j, V[jp,j]*Pu[i ,j ,kp,e])",
                "Pu[ip,jp,kp,e] = sum_float32(i, V[ip,i]*Pu[i ,jp,kp,e])",
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("P", dtype, shape=interim_field_shape, order=order),
                lp.ArrayArg("I", dtype, shape=(M, N), order=order),
                lp.ArrayArg("V", dtype, shape=(N, M), order=order),
                lp.ArrayArg("Pu", dtype, shape=field_shape, order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="sem_lap_precon", assumptions="K>=1")

    seq_knl = knl

    print(knl)
    1/0

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))
    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    print(knl)
    #1/0

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=0, op_label="GFlops",
            parameters={"K": K}, print_seq_code=True,)
def test_laplacian(ctx_factory):
    1/0  # not adapted to new language

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # load:  1+6 fields + 1/N D entry
    # store: 1 fields
    # perform: N*2*6 + 3*5 flops
    # ratio: (12*N+15)/8  flops per 4 bytes on bus
    #        ~ 14 FLOPS per 4 bytes at N=8
    #        ~ 525 GFLOPS max on a 150GB/s device at N=8 if done perfectly

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,e,m,o1,o2,o3,gi]: 0<=i,j,k,m,o1,o2,o3<%d "
            "and 0<=e<K and 0<=gi<6}" % n,
            [
                "CSE: ur(i,j,k) = sum_float32(o1, D[i,o1]*cse(u[e,o1,j,k], urf))",
                "CSE: us(i,j,k) = sum_float32(o2, D[j,o2]*cse(u[e,i,o2,k], usf))",
                "CSE: ut(i,j,k) = sum_float32(o3, D[k,o3]*cse(u[e,i,j,o3], utf))",

                # define function
                "CSE: Gu(i,j,k) = G[0,e,i,j,k]*ur(i,j,k) + G[1,e,i,j,k]*us(i,j,k) + G[2,e,i,j,k]*ut(i,j,k)",
                "CSE: Gv(i,j,k) = G[1,e,i,j,k]*ur(i,j,k) + G[3,e,i,j,k]*us(i,j,k) + G[4,e,i,j,k]*ut(i,j,k)",
                "CSE: Gw(i,j,k) = G[2,e,i,j,k]*ur(i,j,k) + G[4,e,i,j,k]*us(i,j,k) + G[5,e,i,j,k]*ut(i,j,k)",

                "lap[e,i,j,k]  = "
                "  sum_float32(m, D[m,i]*Gu(m,j,k))"
                "+ sum_float32(m, D[m,j]*Gv(i,m,k))"
                "+ sum_float32(m, D[m,k]*Gw(i,j,m))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(6,) + field_shape, order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap", assumptions="K>=1")

    #print(lp.preprocess_kernel(knl, cse_ok=True))
    #1/0
    #
    #print(knl)
    #1/0

    knl = lp.realize_cse(knl, "urf", np.float32, ["o1"])
    knl = lp.realize_cse(knl, "usf", np.float32, ["o2"])
    knl = lp.realize_cse(knl, "utf", np.float32, ["o3"])

    knl = lp.realize_cse(knl, "Gu", np.float32, ["m", "j", "k"])
    knl = lp.realize_cse(knl, "Gv", np.float32, ["i", "m", "k"])
    knl = lp.realize_cse(knl, "Gw", np.float32, ["i", "j", "m"])

    knl = lp.realize_cse(knl, "ur", np.float32, ["k", "j", "m"])
    knl = lp.realize_cse(knl, "us", np.float32, ["i", "m", "k"])
    knl = lp.realize_cse(knl, "ut", np.float32, ["i", "j", "m"])

    if 0:
        pass
        #seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]", default_tag="l.auto")
        #seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"], default_tag="l.auto")
        #seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]", default_tag="l.auto")
    else:
        seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))

    knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]",
            default_tag="l.auto")
    knl = lp.add_prefetch(knl, "D", ["m", "j"], default_tag="l.auto")
    #knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]", default_tag="l.auto")

    #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    #print(seq_knl)
    #print(lp.preprocess_kernel(knl))
    #1/0

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl,
            loop_priority=["m_fetch_G", "i_fetch_u"])
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*n*2*3 + n*n*n*5*3 + n**4*2*3)/1e9,
            op_label="GFlops",
            parameters={"K": K}, print_seq_code=True)
def test_lbm(ctx_factory):
    ctx = ctx_factory()

    # D2Q4Q4Q4 lattice Boltzmann scheme for the shallow water equations
    # Example by Loic Gouarin <*****@*****.**>
    knl = lp.make_kernel(
        "{[ii,jj]:0<=ii<nx-2 and 0<=jj<ny-2}",
        """  # noqa (silences flake8 line length warning)
        i := ii + 1
        j := jj + 1
        for ii, jj
            with {id_prefix=init_m}
                <> m[0] =   + f[i-1, j, 0] + f[i, j-1, 1] + f[i+1, j, 2] + f[i, j+1, 3]
                m[1] = + 4.*f[i-1, j, 0] - 4.*f[i+1, j, 2]
                m[2] = + 4.*f[i, j-1, 1] - 4.*f[i, j+1, 3]
                m[3] = + f[i-1, j, 0] - f[i, j-1, 1] + f[i+1, j, 2] - f[i, j+1, 3]
                m[4] = + f[i-1, j, 4] + f[i, j-1, 5] + f[i+1, j, 6] + f[i, j+1, 7]
                m[5] = + 4.*f[i-1, j, 4] - 4.*f[i+1, j, 6]
                m[6] = + 4.*f[i, j-1, 5] - 4.*f[i, j+1, 7]
                m[7] = + f[i-1, j, 4] - f[i, j-1, 5] + f[i+1, j, 6] - f[i, j+1, 7]
                m[8] = + f[i-1, j, 8] + f[i, j-1, 9] + f[i+1, j, 10] + f[i, j+1, 11]
                m[9] = + 4.*f[i-1, j, 8] - 4.*f[i+1, j, 10]
                m[10] = + 4.*f[i, j-1, 9] - 4.*f[i, j+1, 11]
                m[11] = + f[i-1, j, 8] - f[i, j-1, 9] + f[i+1, j, 10] - f[i, j+1, 11]
            end

            with {id_prefix=update_m,dep=init_m*}
                m[1] = m[1] + 2.*(m[4] - m[1])
                m[2] = m[2] + 2.*(m[8] - m[2])
                m[3] = m[3]*(1. - 1.5)
                m[5] = m[5] + 1.5*(0.5*(m[0]*m[0]) + (m[4]*m[4])/m[0] - m[5])
                m[6] = m[6] + 1.5*(m[4]*m[8]/m[0] - m[6])
                m[7] = m[7]*(1. - 1.2000000000000000)
                m[9] = m[9] + 1.5*(m[4]*m[8]/m[0] - m[9])
                m[10] = m[10] + 1.5*(0.5*(m[0]*m[0]) + (m[8]*m[8])/m[0] - m[10])
                m[11] = m[11]*(1. - 1.2)
            end

            with {dep=update_m*}
                f_new[i, j, 0] =  + 0.25*m[0] + 0.125*m[1] + 0.25*m[3]
                f_new[i, j, 1] =  + 0.25*m[0] + 0.125*m[2] - 0.25*m[3]
                f_new[i, j, 2] =  + 0.25*m[0] - 0.125*m[1] + 0.25*m[3]
                f_new[i, j, 3] =  + 0.25*m[0] - 0.125*m[2] - 0.25*m[3]
                f_new[i, j, 4] =  + 0.25*m[4] + 0.125*m[5] + 0.25*m[7]
                f_new[i, j, 5] =  + 0.25*m[4] + 0.125*m[6] - 0.25*m[7]
                f_new[i, j, 6] =  + 0.25*m[4] - 0.125*m[5] + 0.25*m[7]
                f_new[i, j, 7] =  + 0.25*m[4] - 0.125*m[6] - 0.25*m[7]
                f_new[i, j, 8] =  + 0.25*m[8] + 0.125*m[9] + 0.25*m[11]
                f_new[i, j, 9] =  + 0.25*m[8] + 0.125*m[10] - 0.25*m[11]
                f_new[i, j, 10] = + 0.25*m[8] - 0.125*m[9] + 0.25*m[11]
                f_new[i, j, 11] = + 0.25*m[8] - 0.125*m[10] - 0.25*m[11]
            end
        end
        """)

    knl = lp.add_and_infer_dtypes(knl, {"f": np.float32})

    ref_knl = knl

    knl = lp.split_iname(knl, "ii", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "jj", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.expand_subst(knl)
    knl = lp.add_prefetch(knl, "f", "ii_inner,jj_inner", fetch_bounding_box=True,
            default_tag="l.auto")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nx": 20, "ny": 20})
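
# Illustrative sketch, not part of the original suite: the role of
# fetch_bounding_box in the prefetch above. The streaming step reads f at the
# four neighbors of (i, j), so the footprint of a 16x16 tile is a
# non-rectangular union of shifted tiles; fetch_bounding_box=True lets loopy
# fetch its rectangular hull into local memory instead of the exact union.
# A minimal five-point stencil showing the same option; names are made up.
def _bounding_box_prefetch_sketch():
    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i,j]: 1<=i,j<n-1}",
            "out[i,j] = f[i-1,j] + f[i+1,j] + f[i,j-1] + f[i,j+1]")
    knl = lp.add_and_infer_dtypes(knl, {"f": np.float32})
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    return lp.add_prefetch(knl, "f", "i_inner,j_inner",
            fetch_bounding_box=True, default_tag="l.auto")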
def test_advect(ctx_factory):
    1/0  # not ready

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, N, N, N)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic

    # A. updated for CSE: notation
    # B. fixed temp indexing and C ordering
    # load:  3+9 fields + 1/N D entry
    # store: 3 fields
    # perform: N*2*6 + 3*5 + 3*5 flops
    # ratio:   (12*N+30)/15  flops per 4 bytes on bus
    #          ~ 8.4 FLOPS per 4 bytes at N=8
    #          ~ 300 GFLOPS max on a 150GB/s device at N=8 if done perfectly
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,m,e]: 0<=i,j,k,m<%d and 0<=e<K}" % N,
            [
                # differentiate u
                "CSE: ur(i,j,k) = sum_float32(@m, D[i,m]*u[e,m,j,k])",
                "CSE: us(i,j,k) = sum_float32(@m, D[j,m]*u[e,i,m,k])",
                "CSE: ut(i,j,k) = sum_float32(@m, D[k,m]*u[e,i,j,m])",

                # differentiate v
                "CSE: vr(i,j,k) = sum_float32(@m, D[i,m]*v[e,m,j,k])",
                "CSE: vs(i,j,k) = sum_float32(@m, D[j,m]*v[e,i,m,k])",
                "CSE: vt(i,j,k) = sum_float32(@m, D[k,m]*v[e,i,j,m])",

                # differentiate w
                "CSE: wr(i,j,k) = sum_float32(@m, D[i,m]*w[e,m,j,k])",
                "CSE: ws(i,j,k) = sum_float32(@m, D[j,m]*w[e,i,m,k])",
                "CSE: wt(i,j,k) = sum_float32(@m, D[k,m]*w[e,i,j,m])",

                # find velocity in (r,s,t) coordinates
                # CSE?
                "CSE: Vr(i,j,k) = G[0,e,i,j,k]*u[e,i,j,k] + G[1,e,i,j,k]*v[e,i,j,k] + G[2,e,i,j,k]*w[e,i,j,k]",
                "CSE: Vs(i,j,k) = G[3,e,i,j,k]*u[e,i,j,k] + G[4,e,i,j,k]*v[e,i,j,k] + G[5,e,i,j,k]*w[e,i,j,k]",
                "CSE: Vt(i,j,k) = G[6,e,i,j,k]*u[e,i,j,k] + G[7,e,i,j,k]*v[e,i,j,k] + G[8,e,i,j,k]*w[e,i,j,k]",

                # form nonlinear term on integration nodes
                "Nu[e,i,j,k] = Vr(i,j,k)*ur(i,j,k)+Vs(i,j,k)*us(i,j,k)+Vt(i,j,k)*ut(i,j,k)",
                "Nv[e,i,j,k] = Vr(i,j,k)*vr(i,j,k)+Vs(i,j,k)*vs(i,j,k)+Vt(i,j,k)*vt(i,j,k)",
                "Nw[e,i,j,k] = Vr(i,j,k)*wr(i,j,k)+Vs(i,j,k)*ws(i,j,k)+Vt(i,j,k)*wt(i,j,k)",
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("v", dtype, shape=field_shape, order=order),
                lp.ArrayArg("w", dtype, shape=field_shape, order=order),
                lp.ArrayArg("Nu", dtype, shape=field_shape, order=order),
                lp.ArrayArg("Nv", dtype, shape=field_shape, order=order),
                lp.ArrayArg("Nw", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(9,) + field_shape, order=order),
                lp.ArrayArg("D", dtype, shape=(N, N), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="sem_advect", assumptions="K>=1")

    print(knl)
    1/0

    seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))
    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=0, op_label="GFlops",
            parameters={"K": K}, print_seq_code=True,)
def test_laplacian_lmem(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 4

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}" % n,
            [
                "ur(a,b,c) := sum_float32(@o, D[a,o]*u[e,o,b,c])",
                "us(a,b,c) := sum_float32(@o, D[b,o]*u[e,a,o,c])",
                "ut(a,b,c) := sum_float32(@o, D[c,o]*u[e,a,b,o])",

                "lap[e,i,j,k]  = "
                "  sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
                "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(6,) + field_shape, order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap", assumptions="K>=1")

    seq_knl = knl

    if 1:
        # original
        knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"], default_tag="l.auto")
        knl = lp.precompute(knl, "ur", np.float32, ["a", "b", "c"],
                default_tag="l.auto")
        knl = lp.precompute(knl, "us", np.float32, ["a", "b", "c"],
                default_tag="l.auto")
        knl = lp.precompute(knl, "ut", np.float32, ["a", "b", "c"],
                default_tag="l.auto")
        knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))
        knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"], default_tag="l.auto")
    else:
        # experiment
        # knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"], default_tag="l.auto")
        knl = lp.precompute(knl, "eu", np.float32, ["b", "c"], default_tag="l.auto")
        knl = lp.precompute(knl, "ur", np.float32, ["b", "c"], default_tag="l.auto")
        knl = lp.precompute(knl, "us", np.float32, ["b", "c"], default_tag="l.auto")
        knl = lp.precompute(knl, "ut", np.float32, ["b", "c"], default_tag="l.auto")
        knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  #, slabs=(0, 1))
        knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"], default_tag="l.auto")

    #knl = lp.add_prefetch(knl, "G", [2,3,4], default_tag="l.auto")  # axis/argument indices on G
    #knl = lp.add_prefetch(knl, "G", ["i", "j", "m", "k"], default_tag="l.auto")  # axis/argument indices on G
    #print(knl)
    #1/0

    #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    # knl = lp.join_dimensions(knl, ["i", "j"], "i_and_j")

    #print(seq_knl)
    #print(lp.preprocess_kernel(knl))
    #1/0

    # TW: turned this off since it generated:
    # ValueError: cannot tag 'i_and_j'--not known
    # knl = lp.tag_inames(knl, dict(i_and_j="l.0", k="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*n*2*3 + n*n*n*5*3 + n**4*2*3)/1e9,
            op_label="GFlops",
            parameters={"K": K})
def test_advect_dealias(ctx_factory):
    1/0  # not ready

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8
    M = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (N, N, N, K_sym)
    interim_field_shape = (M, M, M, K_sym)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,ip,j,jp,k,kp,m,o,e]: 0<=i,j,k,m<%d and 0<=o,ip,jp,kp<%d"
            " and 0<=e<K}" % (M, N),
            [
                # interpolate u to integration nodes
                "CSE: u0[i,jp,kp,e] = sum_float32(@o, I[i,o]*u[o,jp,kp,e])",
                "CSE: u1[i,j,kp,e] = sum_float32(@o, I[j,o]*u0[i,o,kp,e])",
                "CSE: Iu[i,j,k,e] = sum_float32(@o, I[k,o]*u1[i,j,o,e])",

                # differentiate u on integration nodes
                "CSE: Iur[i,j,k,e] = sum_float32(@m, D[i,m]*Iu[m,j,k,e])",
                "CSE: Ius[i,j,k,e] = sum_float32(@m, D[j,m]*Iu[i,m,k,e])",
                "CSE: Iut[i,j,k,e] = sum_float32(@m, D[k,m]*Iu[i,j,m,e])",

                # interpolate v to integration nodes
                "CSE: v0[i,jp,kp,e] = sum_float32(@o, I[i,o]*v[o,jp,kp,e])",
                "CSE: v1[i,j,kp,e] = sum_float32(@o, I[j,o]*v0[i,o,kp,e])",
                "CSE: Iv[i,j,k,e] = sum_float32(@o, I[k,o]*v1[i,j,o,e])",

                # differentiate v on integration nodes
                "CSE: Ivr[i,j,k,e] = sum_float32(@m, D[i,m]*Iv[m,j,k,e])",
                "CSE: Ivs[i,j,k,e] = sum_float32(@m, D[j,m]*Iv[i,m,k,e])",
                "CSE: Ivt[i,j,k,e] = sum_float32(@m, D[k,m]*Iv[i,j,m,e])",

                # interpolate w to integration nodes
                "CSE: w0[i,jp,kp,e] = sum_float32(@o, I[i,o]*w[o,jp,kp,e])",
                "CSE: w1[i,j,kp,e] = sum_float32(@o, I[j,o]*w0[i,o,kp,e])",
                "CSE: Iw[i,j,k,e] = sum_float32(@o, I[k,o]*w1[i,j,o,e])",

                # differentiate v on integration nodes
                "CSE: Iwr[i,j,k,e] = sum_float32(@m, D[i,m]*Iw[m,j,k,e])",
                "CSE: Iws[i,j,k,e] = sum_float32(@m, D[j,m]*Iw[i,m,k,e])",
                "CSE: Iwt[i,j,k,e] = sum_float32(@m, D[k,m]*Iw[i,j,m,e])",

                # find velocity in (r,s,t) coordinates
                # QUESTION: should I use CSE here ?
                "CSE: Vr[i,j,k,e] = G[i,j,k,0,e]*Iu[i,j,k,e] + G[i,j,k,1,e]*Iv[i,j,k,e] + G[i,j,k,2,e]*Iw[i,j,k,e]",
                "CSE: Vs[i,j,k,e] = G[i,j,k,3,e]*Iu[i,j,k,e] + G[i,j,k,4,e]*Iv[i,j,k,e] + G[i,j,k,5,e]*Iw[i,j,k,e]",
                "CSE: Vt[i,j,k,e] = G[i,j,k,6,e]*Iu[i,j,k,e] + G[i,j,k,7,e]*Iv[i,j,k,e] + G[i,j,k,8,e]*Iw[i,j,k,e]",

                # form nonlinear term on integration nodes
                # QUESTION: should I use CSE here ?
                "CSE: Nu[i,j,k,e] = Vr[i,j,k,e]*Iur[i,j,k,e]+Vs[i,j,k,e]*Ius[i,j,k,e]+Vt[i,j,k,e]*Iut[i,j,k,e]",
                "CSE: Nv[i,j,k,e] = Vr[i,j,k,e]*Ivr[i,j,k,e]+Vs[i,j,k,e]*Ivs[i,j,k,e]+Vt[i,j,k,e]*Ivt[i,j,k,e]",
                "CSE: Nw[i,j,k,e] = Vr[i,j,k,e]*Iwr[i,j,k,e]+Vs[i,j,k,e]*Iws[i,j,k,e]+Vt[i,j,k,e]*Iwt[i,j,k,e]",

                # L2 project Nu back to Lagrange basis
                "CSE: Nu2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nu[m,j,k,e])",
                "CSE: Nu1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nu2[ip,m,k,e])",
                "INu[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nu1[ip,jp,m,e])",

                # L2 project Nv back to Lagrange basis
                "CSE: Nv2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nv[m,j,k,e])",
                "CSE: Nv1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nv2[ip,m,k,e])",
                "INv[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nv1[ip,jp,m,e])",

                # L2 project Nw back to Lagrange basis
                "CSE: Nw2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nw[m,j,k,e])",
                "CSE: Nw1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nw2[ip,m,k,e])",
                "INw[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nw1[ip,jp,m,e])",
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("v", dtype, shape=field_shape, order=order),
                lp.ArrayArg("w", dtype, shape=field_shape, order=order),
                lp.ArrayArg("INu", dtype, shape=field_shape, order=order),
                lp.ArrayArg("INv", dtype, shape=field_shape, order=order),
                lp.ArrayArg("INw", dtype, shape=field_shape, order=order),
                lp.ArrayArg("D", dtype, shape=(M, M), order=order),
                lp.ArrayArg("I", dtype, shape=(M, N), order=order),
                lp.ArrayArg("V", dtype, shape=(N, M), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="sem_advect", assumptions="K>=1")

    print(knl)
    1/0

    seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))
    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    print(knl)
    #1/0

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=0, op_label="GFlops",
            parameters={"K": K}, print_seq_code=True,)
def test_rank_one(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "F"

    #n = int(get_suitable_size(ctx)**(2.7/2))
    n = 16**3

    knl = lp.make_kernel(
            "[n] -> {[i,j]: 0<=i,j<n}",
            [
                "c[i, j] = a[i]*b[j] {id=mylabel, priority =5}"
                ],
            [
                lp.GlobalArg("a", dtype, shape=("n",), order=order),
                lp.GlobalArg("b", dtype, shape=("n",), order=order),
                lp.GlobalArg("c", dtype, shape=("n, n"), order=order),
                lp.ValueArg("n", np.int32, approximately=n),
                ],
            name="rank_one",
            assumptions="n >= 16")

    def variant_1(knl):
        knl = lp.add_prefetch(knl, "a")
        knl = lp.add_prefetch(knl, "b")
        knl = lp.set_loop_priority(knl, ["i", "j"])
        return knl

    def variant_2(knl):
        knl = lp.split_iname(knl, "i", 16,
                outer_tag="g.0", inner_tag="l.0")
        knl = lp.split_iname(knl, "j", 16,
                outer_tag="g.1", inner_tag="l.1")

        knl = lp.add_prefetch(knl, "a")
        knl = lp.add_prefetch(knl, "b")
        return knl

    def variant_3(knl):
        knl = lp.split_iname(knl, "i", 16,
                outer_tag="g.0", inner_tag="l.0")
        knl = lp.split_iname(knl, "j", 16,
                outer_tag="g.1", inner_tag="l.1")

        knl = lp.add_prefetch(knl, "a", ["i_inner"])
        knl = lp.add_prefetch(knl, "b", ["j_inner"])
        return knl

    def variant_4(knl):
        knl = lp.split_iname(knl, "i", 256,
                outer_tag="g.0", slabs=(0, 1))
        knl = lp.split_iname(knl, "j", 256,
                outer_tag="g.1", slabs=(0, 1))

        knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None)
        knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None)

        knl = lp.split_iname(knl, "i_inner", 16,
                inner_tag="l.0")
        knl = lp.split_iname(knl, "j_inner", 16,
                inner_tag="l.1")

        knl = lp.split_iname(knl, "a_dim_0", 16,
                outer_tag="l.1", inner_tag="l.0")
        knl = lp.split_iname(knl, "b_dim_0", 16,
                outer_tag="l.1", inner_tag="l.0")
        return knl

    seq_knl = knl

    for variant in [variant_1, variant_2, variant_3, variant_4]:
        lp.auto_test_vs_ref(seq_knl, ctx, variant(knl),
                op_count=[np.dtype(dtype).itemsize*n**2/1e9], op_label=["GBytes"],
                parameters={"n": n})
def test_axpy(ctx_factory):
    logging.basicConfig(level="INFO")
    ctx = ctx_factory()

    n = 3145182

    vec = cl_array.vec

    if ctx.devices[0].platform.vendor.startswith("Advanced Micro"):
        pytest.skip("crashes on AMD 15.12")

    for dtype, check, a, b in [
            (np.complex64, None, 5, 7),
            (vec.float4, check_float4,
                vec.make_float4(1, 2, 3, 4), vec.make_float4(6, 7, 8, 9)),
            (np.float32, None, 5, 7),
            ]:
        knl = lp.make_kernel(
                "[n] -> {[i]: 0<=i<n}",
                [
                    "z[i] = a*x[i]+b*y[i]"
                    ],
                [
                    lp.ValueArg("a", dtype),
                    lp.GlobalArg("x", dtype, shape="n,"),
                    lp.ValueArg("b", dtype),
                    lp.GlobalArg("y", dtype, shape="n,"),
                    lp.GlobalArg("z", dtype, shape="n,"),
                    lp.ValueArg("n", np.int32, approximately=n),
                    ],
                name="axpy", assumptions="n>=1")

        seq_knl = knl

        def variant_cpu(knl):
            unroll = 16
            block_size = unroll*4096
            knl = lp.split_iname(knl, "i", block_size,
                    outer_tag="g.0", slabs=(0, 1))
            knl = lp.split_iname(knl, "i_inner", unroll, inner_tag="unr")
            return knl

        def variant_gpu(knl):
            unroll = 4
            block_size = 256
            knl = lp.split_iname(knl, "i", unroll*block_size,
                    outer_tag="g.0", slabs=(0, 1))
            knl = lp.split_iname(knl, "i_inner", block_size,
                    outer_tag="unr", inner_tag="l.0")
            return knl

        #for variant in [variant_gpu]:
        for variant in [variant_cpu, variant_gpu]:
            lp.auto_test_vs_ref(
                    seq_knl, ctx, variant(knl),
                    op_count=[np.dtype(dtype).itemsize*n*3/1e9],
                    op_label=["GBytes"],
                    parameters={"a": a, "b": b, "n": n}, check_result=check,
                    blacklist_ref_vendors=["Advanced Micro"])
def test_convolution(ctx_factory):
    ctx = ctx_factory()

    dtype = np.float32

    knl = lp.make_kernel(
        "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
                -f_w <= f_x,f_y <= f_w \
                and 0 <= im_x < im_w and 0 <= im_y < im_h \
                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
                }",
        """
        out[iimg, ifeat, im_x, im_y] = sum((f_x, f_y, icolor), \
                img[iimg, f_w+im_x-f_x, f_w+im_y-f_y, icolor] \
                * f[ifeat, f_w+f_x, f_w+f_y, icolor])
        """,
        [
            lp.GlobalArg("f", dtype, shape=lp.auto),
            lp.GlobalArg("img", dtype, shape=lp.auto),
            lp.GlobalArg("out", dtype, shape=lp.auto),
            "..."
            ],
        assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0",
        options="annotate_inames")

    f_w = 3

    knl = lp.fix_parameters(knl, f_w=f_w, ncolors=3)

    ref_knl = knl

    def variant_0(knl):
        #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
        knl = lp.prioritize_loops(knl, "iimg,im_x,im_y,ifeat,f_x,f_y")
        return knl

    def variant_1(knl):
        knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
        knl = lp.prioritize_loops(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
        return knl

    def variant_2(knl):
        knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0")
        knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1")
        knl = lp.tag_inames(knl, dict(ifeat="g.2"))
        knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]",
                fetch_outer_inames="im_x_outer, im_y_outer, ifeat",
                default_tag="l.auto")
        knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y",
                fetch_outer_inames="iimg, im_x_outer, im_y_outer, ifeat, icolor",
                default_tag="l.auto")
        return knl

    for variant in [
            #variant_0,
            #variant_1,
            variant_2
            ]:
        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
                parameters=dict(
                    im_w=128, im_h=128, f_w=f_w,
                    nfeats=3, nimgs=3
                    ))
def test_tim2d(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")  # noqa

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(
            "{[i,j,e,m,o,o2,gi]: 0<=i,j,m,o,o2<n and 0<=e<K and 0<=gi<3}",
            [
                "ur(a,b) := simul_reduce(sum, o, D[a,o]*u[e,o,b])",
                "us(a,b) := simul_reduce(sum, o2, D[b,o2]*u[e,a,o2])",

                #"Gu(mat_entry,a,b) := G[mat_entry,e,m,j]*ur(m,j)",

                "Gux(a,b) := G$x[0,e,a,b]*ur(a,b)+G$x[1,e,a,b]*us(a,b)",
                "Guy(a,b) := G$y[1,e,a,b]*ur(a,b)+G$y[2,e,a,b]*us(a,b)",
                "lap[e,i,j]  = "
                "  simul_reduce(sum, m, D[m,i]*Gux(m,j))"
                "+ simul_reduce(sum, m, D[m,j]*Guy(i,m))"
                ],
            [
                lp.GlobalArg("u", dtype, shape=field_shape, order=order),
                lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
                lp.GlobalArg("G", dtype, shape=(3,)+field_shape, order=order),
                # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
                lp.GlobalArg("D", dtype, shape=(n, n), order=order),
                # lp.ImageArg("D", dtype, shape=(n, n)),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap2D", assumptions="K>=1")

    knl = lp.fix_parameters(knl, n=n)

    knl = lp.duplicate_inames(knl, "o", within="id:ur")
    knl = lp.duplicate_inames(knl, "o", within="id:us")

    seq_knl = knl

    def variant_orig(knl):
        knl = lp.tag_inames(knl, dict(i="l.0", j="l.1", e="g.0"))

        knl = lp.add_prefetch(knl, "D[:,:]", default_tag="l.auto")
        knl = lp.add_prefetch(knl, "u[e, :, :]", default_tag="l.auto")

        knl = lp.precompute(knl, "ur(m,j)", ["m", "j"], default_tag="l.auto")
        knl = lp.precompute(knl, "us(i,m)", ["i", "m"], default_tag="l.auto")

        knl = lp.precompute(knl, "Gux(m,j)", ["m", "j"], default_tag="l.auto")
        knl = lp.precompute(knl, "Guy(i,m)", ["i", "m"], default_tag="l.auto")

        knl = lp.add_prefetch(knl, "G$x[:,e,:,:]", default_tag="l.auto")
        knl = lp.add_prefetch(knl, "G$y[:,e,:,:]", default_tag="l.auto")

        knl = lp.tag_inames(knl, dict(o="unr"))
        knl = lp.tag_inames(knl, dict(m="unr"))

        knl = lp.set_instruction_priority(knl, "id:D_fetch", 5)
        print(knl)

        return knl

    for variant in [variant_orig]:
        K = 1000  # noqa
        lp.auto_test_vs_ref(seq_knl, ctx, variant(knl),
                op_count=[K*(n*n*n*2*2 + n*n*2*3 + n**3*2*2)/1e9],
                op_label=["GFlops"],
                parameters={"K": K})
def test_dg_volume(ctx_factory):
    #logging.basicConfig(level=logging.DEBUG)

    dtype = np.float32
    dtype4 = cl.array.vec.float4
    ctx = ctx_factory()

    order = "F"

    N = 3  # noqa
    Np = (N+1)*(N+2)*(N+3)//6  # noqa

    K = 10000  # noqa

    knl = lp.make_kernel(
            [
                "{[n,m,k]: 0<= n,m < Np and 0<= k < K}",
                ],
            """
            <> du_drst = simul_reduce(sum, m, DrDsDt[n,m]*u[k,m])
            <> dv_drst = simul_reduce(sum, m, DrDsDt[n,m]*v[k,m])
            <> dw_drst = simul_reduce(sum, m, DrDsDt[n,m]*w[k,m])
            <> dp_drst = simul_reduce(sum, m, DrDsDt[n,m]*p[k,m])

            # volume flux
            rhsu[k,n] = dot(drst_dx[k],dp_drst)
            rhsv[k,n] = dot(drst_dy[k],dp_drst)
            rhsw[k,n] = dot(drst_dz[k],dp_drst)
            rhsp[k,n] = dot(drst_dx[k], du_drst) + dot(drst_dy[k], dv_drst) \
                    + dot(drst_dz[k], dw_drst)
            """,
            [
                lp.GlobalArg("u,v,w,p,rhsu,rhsv,rhsw,rhsp",
                    dtype, shape="K, Np", order="C"),
                lp.GlobalArg("DrDsDt", dtype4, shape="Np, Np", order="C"),
                lp.GlobalArg("drst_dx,drst_dy,drst_dz", dtype4, shape="K",
                    order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="dg_volume", assumptions="K>=1")

    knl = lp.fix_parameters(knl, Np=Np)

    seq_knl = knl

    def variant_basic(knl):
        knl = lp.tag_inames(knl, dict(k="g.0", n="l.0"))
        return knl

    def variant_more_per_work_group(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))
        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
        return knl

    def variant_image_d(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))
        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
        knl = lp.change_arg_to_image(knl, "DrDsDt")
        return knl

    def variant_prefetch_d(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))
        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
        knl = lp.add_prefetch(knl, "DrDsDt[:,:]",
                fetch_outer_inames="k_outer", default_tag="l.auto")
        return knl

    def variant_prefetch_fields(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))
        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
        for name in ["u", "v", "w", "p"]:
            knl = lp.add_prefetch(knl, "%s[k,:]" % name, ["k_inner"],
                    default_tag="l.auto")

        return knl

    def variant_k_ilp(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))

        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="ilp")
        knl = lp.tag_inames(knl, dict(m="unr"))
        return knl

    def variant_simple_padding(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))

        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")

        arg_names = [
                prefix+name
                for name in ["u", "v", "w", "p"]
                for prefix in ["", "rhs"]]

        for name in arg_names:
            knl = lp.add_padding(knl, name, axis=0, align_bytes=32)

        knl = lp.tag_inames(knl, dict(m="unr"))
        return knl

    def variant_fancy_padding(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))

        pad_mult = lp.find_padding_multiple(knl, "u", 1, 32)

        arg_names = [
                prefix+name
                for name in ["u", "v", "w", "p"]
                for prefix in ["", "rhs"]]

        knl = lp.split_array_dim(knl, [(nm, 0) for nm in arg_names], pad_mult)

        return knl

    parameters_dict = dict(K=K)

    variants = [
            variant_basic, variant_more_per_work_group,
            variant_prefetch_d, variant_prefetch_fields,
            variant_k_ilp, variant_simple_padding,
            variant_fancy_padding
            ]

    if (ctx.devices[0].image_support
            and ctx.devices[0].platform.name != "Portable Computing Language"):
        variants.append(variant_image_d)

    for variant in variants:
        lp.auto_test_vs_ref(seq_knl, ctx, variant(knl),
                parameters=parameters_dict,
                #codegen_kwargs=dict(with_annotation=True)
                )
def test_tim3d(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}" % n,
            [
                "ur(a,b,c) := sum_float32(@o, D[a,o]*u[e,o,b,c])",
                "us(a,b,c) := sum_float32(@o, D[b,o]*u[e,a,o,c])",
                "ut(a,b,c) := sum_float32(@o, D[c,o]*u[e,a,b,o])",

                "lap[e,i,j,k]  = "
                "   sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
                " + sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
                " + sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(6,) + field_shape, order=order),
                # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                # lp.ImageArg("D", dtype, shape=(n, n)),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap3D", assumptions="K>=1")

    seq_knl = knl

    knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "k", "o"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j", "o", "k"], default_tag="l.auto")

    knl = lp.precompute(knl, "ur", np.float32, ["a", "b", "c"], default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b", "c"], default_tag="l.auto")
    knl = lp.precompute(knl, "ut", np.float32, ["a", "b", "c"], default_tag="l.auto")

    knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  #, slabs=(0, 1))
    knl = lp.split_iname(knl, "k", n, inner_tag="l.2")  #, slabs=(0, 1))
    knl = lp.split_iname(knl, "j", n, inner_tag="l.1")  #, slabs=(0, 1))
    knl = lp.split_iname(knl, "i", n, inner_tag="l.0")  #, slabs=(0, 1))

    # knl = lp.tag_inames(knl, dict(k_nner="unr"))

    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))
    # knl = lp.tag_inames(knl, dict(i="unr"))

    knl = lp.add_prefetch(knl, "G", [2, 3, 4], default_tag="l.auto")  # axis/argument indices on G

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 4000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*((n**4)*3*2 + (n**3)*5*3 + (n**4)*3*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
def test_red2d(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 16

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
            [
                "ue(a,b) := u[e,a,b]",
                "ur(a,b) := sum_float32(@o, D[a,o]*ue(o,b))",
                "us(a,b) := sum_float32(@o, D[b,o]*ue(a,o))",

                "lap[e,i,j]  = "
                "  sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j)+G[1,e,m,j]*us(m,j)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m)+G[2,e,i,m]*us(i,m)))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(3,) + field_shape, order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap2D", assumptions="K>=1")

    unroll = 32

    seq_knl = knl

    knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "o"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j", "o"], default_tag="l.auto")

    knl = lp.precompute(knl, "ue", np.float32, ["a", "b", "m"], default_tag="l.auto")
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto")

    knl = lp.split_iname(knl, "e", 2, outer_tag="g.0")
    knl = lp.split_iname(knl, "j", n, inner_tag="l.0")  #, slabs=(0, 1))
    knl = lp.split_iname(knl, "i", n, inner_tag="l.1")  #, slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))

    knl = lp.add_prefetch(knl, "G", [2, 3], default_tag="l.auto")  # axis/argument indices on G

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*((n**3)*2*2 + n*n*2*3 + (n**3)*2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
# Assumed pytest parametrization (values illustrative): the test body takes
# ilp_multiple, Nq, and opt_level as arguments, so it needs decorators along
# these lines to be collected by pytest.
@pytest.mark.parametrize("ilp_multiple", [1, 2])
@pytest.mark.parametrize("Nq", [7])
@pytest.mark.parametrize("opt_level", [11])
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
    ctx = ctx_factory()

    filename = os.path.join(os.path.dirname(__file__),
            "strongVolumeKernels.f90")
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    source = source.replace("datafloat", "real*4")

    hsv_r, hsv_s = [
            knl for knl in lp.parse_fortran(source, filename,
                seq_dependencies=False)
            if "KernelR" in knl.name or "KernelS" in knl.name
            ]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    hsv = lp.add_nosync(hsv, "any", "writes:rhsQ", "writes:rhsQ", force=True)

    from gnuma_loopy_transforms import (
            fix_euler_parameters, set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.prioritize_loops(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]", default_tag="l.auto")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                ("rknl", rflux_insn, ("j", "n"), rtmps, ("jj", "ii")),
                ("sknl", sflux_insn, ("i", "n"), stmps, ("ii", "jj")),
                ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            reader, = lp.find_instructions(hsv,
                    "tag:{knl_tag} and reads:{flux_var}".format(
                        knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv, flux_var + "_subst",
                    flux_inames + ilp_inames,
                    temporary_name=flux_store_name,
                    precompute_inames=flux_precomp_inames + flux_ilp_inames,
                    default_tag=None)
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname,
                    within="id:" + reader.id, existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
                lp.find_one_rule_matching(hsv, prep_var_name + "_*subst*"),
                default_tag="l.auto")

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]",
            sweep_inames=ilp_inames, default_tag="l.auto")

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv,
                dict(Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
            fetch_bounding_box=True, default_tag="for",
            init_expression="0", store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv,
                dict(rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
                    rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
                    Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv,
            {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"},
            ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv,
                dict(rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
                    rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
                    Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(hsv,
            dict(rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
                rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
                Q_dim_field_inner="vec", Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(hsv, "rhsQ_buf",
            vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    hsv = tap_hsv

    if 1:
        print("OPS")
        op_map = lp.get_op_map(hsv)
        print(lp.stringify_stats_mapping(op_map))

        print("MEM")
        gmem_map = lp.get_mem_access_map(hsv, subgroup_size=32).to_bytes()
        print(lp.stringify_stats_mapping(gmem_map))

    hsv = lp.set_options(hsv, cl_build_options=[
        "-cl-denorms-are-zero",
        "-cl-fast-relaxed-math",
        "-cl-finite-math-only",
        "-cl-mad-enable",
        "-cl-no-signed-zeros",
        ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv,
            parameters=dict(elements=300), quiet=True)

    elapsed = results["elapsed_wall"]
    print("elapsed", elapsed)
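# A hedged sketch (not in the original file) of how the op/memory maps printed
# in test_gnuma_horiz_kernel above can be reduced to single numbers.
# filter_by/eval_and_sum are part of loopy's stats interface; the parameter
# dict passed in is whatever the kernel's size parameters are.
def _summarize_f32_stats(knl, params):
    op_map = lp.get_op_map(knl)
    f32_ops = op_map.filter_by(dtype=[np.float32]).eval_and_sum(params)
    mem_map = lp.get_mem_access_map(knl, subgroup_size=32).to_bytes()
    f32_bytes = mem_map.filter_by(mtype=["global"]).eval_and_sum(params)
    return f32_ops, f32_bytes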
                and qbx_forced_limit == 0)
            or (in_disk
                and qbx_forced_limit != 0
                and qbx_forced_limit * center_side[ictr] > 0)
            )

        <> post_dist_sq = if(matches, dist_sq, HUGE)
        end
        <> min_dist_sq, <> min_ictr = argmin(ictr, ictr, post_dist_sq)

        tgt_to_qbx_center[itgt] = if(min_dist_sq < HUGE, min_ictr, -1)
        end
        """)

    knl = lp.fix_parameters(knl, ambient_dim=2)

    knl = lp.add_and_infer_dtypes(knl, {
        "tgt,center,radius,HUGE": np.float32,
        "center_side,qbx_forced_limit": np.int32,
        })

    lp.auto_test_vs_ref(knl, cl_ctx, knl, parameters={
        "HUGE": 1e20, "ncenters": 200, "ntargets": 300,
        "qbx_forced_limit": 1})
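# The fragment above implements a masked argmin with a sentinel: distances of
# non-matching centers are pushed up to HUGE so the argmin skips them, and -1
# is stored when no center matched at all. A numpy sketch of the same pattern
# (helper not in the original file; names are illustrative, not the kernel's
# actual arguments):
def _masked_argmin_ref(dist_sq, matches, huge):
    # dist_sq, matches: (ntargets, ncenters)
    post_dist_sq = np.where(matches, dist_sq, huge)
    min_ictr = np.argmin(post_dist_sq, axis=1)
    min_dist_sq = post_dist_sq[np.arange(len(min_ictr)), min_ictr]
    return np.where(min_dist_sq < huge, min_ictr, -1)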
# Assumed pytest parametrization (values illustrative): the test body takes
# buffer_inames as an argument.
@pytest.mark.parametrize("buffer_inames", ["", "i_inner,j_inner"])
def test_matmul(ctx_factory, buffer_inames):
    ctx = ctx_factory()

    if (buffer_inames
            and ctx.devices[0].platform.name
                == "Portable Computing Language"):
        pytest.skip("crashes on pocl")

    logging.basicConfig(level=logging.INFO)

    fortran_src = """
        subroutine dgemm(m,n,ell,a,b,c)
          implicit none
          real*8 a(m,ell),b(ell,n),c(m,n)
          integer m,n,k,i,j,ell

          do j = 1,n
            do i = 1,m
              do k = 1,ell
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    prog = lp.parse_fortran(fortran_src)

    assert len(prog["dgemm"].domains) == 1

    ref_prog = prog

    prog = lp.split_iname(prog, "i", 16,
            outer_tag="g.0", inner_tag="l.1")
    prog = lp.split_iname(prog, "j", 8,
            outer_tag="g.1", inner_tag="l.0")
    prog = lp.split_iname(prog, "k", 32)

    prog = lp.assume(prog, "n mod 32 = 0")
    prog = lp.assume(prog, "m mod 32 = 0")
    prog = lp.assume(prog, "ell mod 16 = 0")

    prog = lp.extract_subst(prog, "a_acc", "a[i1,i2]", parameters="i1, i2")
    prog = lp.extract_subst(prog, "b_acc", "b[i1,i2]", parameters="i1, i2")
    prog = lp.precompute(prog, "a_acc", "k_inner,i_inner",
            precompute_outer_inames="i_outer, j_outer, k_outer",
            default_tag="l.auto")
    prog = lp.precompute(prog, "b_acc", "j_inner,k_inner",
            precompute_outer_inames="i_outer, j_outer, k_outer",
            default_tag="l.auto")

    prog = lp.buffer_array(prog, "c", buffer_inames=buffer_inames,
            init_expression="0", store_expression="base+buffer")

    lp.auto_test_vs_ref(ref_prog, ctx, prog,
            parameters=dict(n=128, m=128, ell=128))
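# For reference: the Fortran source in test_matmul accumulates
# c(i,j) += b(k,j)*a(i,k), i.e. a plain matrix-matrix product on top of the
# existing contents of c. A one-line numpy equivalent (helper not in the
# original file):
def _dgemm_ref(a, b, c):
    return c + a @ b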
def test_laplacian_stiffness(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    dim = 2      # (baked into code)
    Nq = 40      # num. quadrature points (baked into code)
    Nb = 20      # num. basis functions (baked into code)
    Nc = 100     # num. cells (run-time symbolic)

    from pymbolic import var
    Nc_sym = var("Nc")

    knl = lp.make_kernel(ctx.devices[0],
            "[Nc] -> {[K,i,j,q, dx_axis, ax_b]: 0<=K<Nc and 0<=i,j<%(Nb)d "
            "and 0<=q<%(Nq)d and 0<= dx_axis, ax_b < %(dim)d}"
            % dict(Nb=Nb, Nq=Nq, dim=dim),
            [
                "dPsi(ij, dxi) := sum_float32(@ax_b,"
                " jacInv[ax_b,dxi,K,q] * DPsi[ax_b,ij,q])",
                "A[K, i, j] = sum_float32(q, w[q] * jacDet[K,q] * ("
                "sum_float32(dx_axis, dPsi$one(i,dx_axis)*dPsi$two(j,dx_axis))))"
                ],
            [
                lp.GlobalArg("jacInv", dtype, shape=(dim, dim, Nc_sym, Nq),
                    order=order),
                lp.ConstantArg("DPsi", dtype, shape=(dim, Nb, Nq), order=order),
                lp.GlobalArg("jacDet", dtype, shape=(Nc_sym, Nq), order=order),
                lp.ConstantArg("w", dtype, shape=(Nq,), order=order),
                lp.GlobalArg("A", dtype, shape=(Nc_sym, Nb, Nb), order=order),
                lp.ValueArg("Nc", np.int32, approximately=1000),
                ],
            name="lapquad", assumptions="Nc>=1")

    knl = lp.tag_inames(knl, dict(ax_b="unr"))
    seq_knl = knl

    def variant_fig31(knl):
        # This (mostly) reproduces Figure 3.1.
        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        return knl, ["K", "i", "j", "q", "ax_b_insn"]

    def variant_pg4(knl):
        # This (mostly) reproduces the unlabeled code snippet on pg. 4.
        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc")
        return knl, ["Ko", "Kloc", "i", "j", "q", "ax_b_insn"]

    def variant_fig32(knl):
        # This (mostly) reproduces Figure 3.2.
        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc")
        knl = lp.precompute(knl, "dPsi", np.float32, ["i", "q", "dx_axis"],
                default_tag=None)
        knl = lp.tag_inames(knl, {"dx_axis": "unr", "dxi": "unr"})
        return knl, ["Ko", "Kloc", "dPsi_q", "ij", "i", "j", "q", "ax_b_insn"]

    def variant_fig33(knl):
        # This is meant to (mostly) reproduce Figure 3.3.
        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc")
        knl = lp.precompute(knl, "dPsi$one", np.float32, ["dx_axis"],
                default_tag=None)
        knl = lp.tag_inames(knl, {"j": "ilp.seq"})
        return knl, ["Ko", "Kloc"]

    def variant_simple_gpu(knl):
        # This is a simple GPU-ish variant. It's not the same thing as
        # Matt's code, but I'll need some more time to reverse-engineer
        # what is going on there. Some discussion might help, too. :)
        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc", outer_tag="g.0")
        knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"})
        return knl, ["K", "i", "j", "q", "ax_b_insn"]

    def variant_simple_gpu_prefetch(knl):
        # This adds prefetching to the GPU variant above. In this variant
        # (on my machine), loopy makes a silly choice for the upper bound
        # of Kloc (it uses Nc). I'll investigate and fix that. (FIXME)
        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc", outer_tag="g.0")
        knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"})
        knl = lp.add_prefetch(knl, "w", ["q"])
        knl = lp.add_prefetch(knl, "DPsi", [0, 1, 2])
        knl = lp.add_prefetch(knl, "jacInv", [0, 1, 3])
        knl = lp.add_prefetch(knl, "jacDet", [1])
        return knl, ["K", "i", "j", "q", "ax_b_insn"]

    # Plug in variant name here
    #                        |
    #                        v
    for variant in [variant_fig33]:
        var_knl, loop_prio = variant(knl)
        kernel_gen = lp.generate_loop_schedules(var_knl,
                loop_priority=loop_prio)
        kernel_gen = lp.check_kernels(kernel_gen, dict(Nc=Nc))

        #print lp.preprocess_kernel(var_knl)

        lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
                op_count=0, op_label="GFlops",
                parameters={"Nc": Nc}, print_ref_code=True)
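# For reference, a numpy sketch of the lapquad assembly above (helper not in
# the original file; shapes follow the kernel arguments
# jacInv:(dim,dim,Nc,Nq), DPsi:(dim,Nb,Nq), jacDet:(Nc,Nq), w:(Nq,)).
def _lapquad_ref(jacInv, DPsi, jacDet, w):
    # dPsi[K,ij,dxi,q] = sum_ax jacInv[ax,dxi,K,q] * DPsi[ax,ij,q]
    dpsi = np.einsum("adkq,abq->kbdq", jacInv, DPsi)
    # A[K,i,j] = sum_q w[q]*jacDet[K,q] * sum_dx dPsi(i,dx)*dPsi(j,dx)
    return np.einsum("q,kq,kidq,kjdq->kij", w, jacDet, dpsi, dpsi)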
def no_test_dg_surface(ctx_factory):
    # tough to test, would need the right index info
    dtype = np.float32
    ctx = ctx_factory()

    order = "F"

    N = 3  # noqa
    Np = (N + 1) * (N + 2) * (N + 3) // 6  # noqa
    Nfp = (N + 1) * (N + 2) // 2  # noqa
    Nfaces = 4  # noqa
    K = 10000  # noqa

    knl = lp.make_kernel(
            ["{[m,n,k]: 0<= m < NfpNfaces and 0<= n < Np and 0<= k < K }"],
            """
            <> idP = vmapP[m,k]
            <> idM = vmapM[m,k]

            <> du = u[[idP]]-u[[idM]]
            <> dv = v[[idP]]-v[[idM]]
            <> dw = w[[idP]]-w[[idM]]
            <> dp = bc[m,k]*p[[idP]] - p[[idM]]

            <> dQ = 0.5*Fscale[m,k]* \
                    (dp - nx[m,k]*du - ny[m,k]*dv - nz[m,k]*dw)

            <> fluxu = -nx[m,k]*dQ
            <> fluxv = -ny[m,k]*dQ
            <> fluxw = -nz[m,k]*dQ
            <> fluxp = dQ

            # reduction here
            rhsu[n,k] = sum(m, LIFT[n,m]*fluxu)
            rhsv[n,k] = sum(m, LIFT[n,m]*fluxv)
            rhsw[n,k] = sum(m, LIFT[n,m]*fluxw)
            rhsp[n,k] = sum(m, LIFT[n,m]*fluxp)
            """,
            [
                lp.GlobalArg("vmapP,vmapM",
                    np.int32, shape="NfpNfaces, K", order=order),
                lp.GlobalArg("u,v,w,p,rhsu,rhsv,rhsw,rhsp",
                    dtype, shape="Np, K", order=order),
                lp.GlobalArg("nx,ny,nz,Fscale,bc",
                    dtype, shape="NfpNfaces, K", order=order),
                lp.GlobalArg("LIFT", dtype, shape="Np, NfpNfaces", order="C"),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="dg_surface", assumptions="K>=1")

    knl = lp.fix_parameters(knl,
            Np=Np, Nfp=Nfp, NfpNfaces=Nfaces * Nfp, nsurf_dofs=K * Nfp)

    seq_knl = knl

    def variant_basic(knl):
        return knl

    parameters_dict = dict(K=K)

    for variant in [variant_basic]:
        lp.auto_test_vs_ref(seq_knl, ctx, variant(knl),
                parameters=parameters_dict)
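# A numpy sketch of the surface-flux update in no_test_dg_surface, reduced to
# the u/p pair for brevity (helper not in the original file; it assumes that
# vmapP/vmapM index the column-major flattening of the (Np, K) fields, which
# is what the order="F" arguments suggest):
def _dg_surface_ref_u(u, p, vmapP, vmapM, nx, Fscale, bc, LIFT):
    uf = u.ravel(order="F")
    pf = p.ravel(order="F")
    du = uf[vmapP] - uf[vmapM]               # (NfpNfaces, K)
    dp = bc*pf[vmapP] - pf[vmapM]
    # the full kernel also subtracts ny*dv and nz*dw inside dQ
    dQ = 0.5*Fscale*(dp - nx*du)
    return LIFT @ (-nx*dQ)                   # rhsu[n,k] = sum(m, LIFT[n,m]*fluxu)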
def test_poisson_fem(ctx_factory):
    # Stolen from Peter Coogan and Rob Kirby for FEM assembly
    ctx = ctx_factory()

    nbf = 5
    nqp = 5
    sdim = 3

    knl = lp.make_kernel(
            "{ [c,i,j,k,ell,ell2]: \
            0 <= c < nels and \
            0 <= i < nbf and \
            0 <= j < nbf and \
            0 <= k < nqp and \
            0 <= ell,ell2 < sdim}",
            """
            dpsi(bf,k0,dir) := \
                    simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
            Ael[c,i,j] = \
                    J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
            """,
            assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0")

    print(knl)

    knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp)

    ref_knl = knl

    knl = lp.prioritize_loops(knl, ["c", "j", "i", "k"])

    def variant_1(knl):
        knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag="for")
        knl = lp.prioritize_loops(knl, "c,i,j")
        return knl

    def variant_2(knl):
        knl = lp.precompute(knl, "dpsi", "i,ell", default_tag="for")
        knl = lp.prioritize_loops(knl, "c,i,j")
        return knl

    def add_types(knl):
        return lp.add_and_infer_dtypes(knl, dict(
            w=np.float32,
            J=np.float32,
            DPsi=np.float32,
            DFinv=np.float32,
            ))

    for variant in [
            #variant_1,
            variant_2,
            ]:
        knl = variant(knl)

        lp.auto_test_vs_ref(
                add_types(ref_knl), ctx, add_types(knl),
                parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7))
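# For reference, a numpy sketch of the dpsi substitution rule used in
# test_poisson_fem (helper not in the original file; shapes follow the
# subscripts in the kernel, DFinv:(nels,sdim,sdim) and DPsi:(nbf,nqp,sdim)):
def _dpsi_ref(DFinv, DPsi):
    # dpsi[c,bf,k0,dir] = sum_ell2 DFinv[c,ell2,dir] * DPsi[bf,k0,ell2]
    return np.einsum("cld,bkl->cbkd", DFinv, DPsi)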