def variant_2(knl):
    """Tile i/j into 16x16 work groups, prefetch *a*, and order its fetch loops."""
    # Map each iname onto a (group, local) axis pair via a 16-wide split.
    for iname, group_tag, local_tag in (("i", "g.1", "l.1"), ("j", "g.0", "l.0")):
        knl = lp.split_iname(knl, iname, 16,
                outer_tag=group_tag, inner_tag=local_tag)
    knl = lp.add_prefetch(
            knl, "a", ["i_inner", "j_inner"], fetch_bounding_box=True)
    return lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"])
def test_precompute_with_preexisting_inames(ctx_factory):
    # Check that lp.precompute can reuse inames that already exist in the
    # kernel (ii/jj, introduced by extract_subst) instead of inventing new
    # ones, and that the transformed kernel still matches the reference.
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })
    knl = lp.fix_parameters(knl, n=13)

    ref_knl = knl

    # Turn the D1/D2 accesses into substitution rules parameterized by ii/jj.
    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    # Both precomputes deliberately target the same preexisting inames ii/jj.
    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")
    knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
            precompute_inames="ii,jj")

    knl = lp.set_loop_priority(knl, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(
        ref_knl, ctx, knl,
        parameters=dict(E=200))
def test_slab_decomposition_does_not_double_execute(ctx_factory):
    # Regression test: with slab decomposition (slabs=(0, 1)), the boundary
    # slab must not re-run iterations already covered by the bulk slab.
    # Since the update is "a = 2*a", double execution would show up as a
    # wrong result in the comparison below.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "a[i] = 2*a[i]",
        assumptions="n>=1")

    ref_knl = knl

    # Exercise both a sequential and a group-parallel outer loop.
    for outer_tag in ["for", "g.0"]:
        knl = ref_knl
        knl = lp.split_iname(knl, "i", 4, slabs=(0, 1),
                inner_tag="unr", outer_tag=outer_tag)
        knl = lp.set_loop_priority(knl, "i_outer")

        a = cl.array.empty(queue, 20, np.float32)
        a.fill(17)
        a_ref = a.copy()
        a_knl = a.copy()

        knl = lp.set_options(knl, write_cl=True)
        print("TEST-----------------------------------------")
        knl(queue, a=a_knl)
        print("REF-----------------------------------------")
        ref_knl(queue, a=a_ref)
        print("DONE-----------------------------------------")

        print("REF", a_ref)
        print("KNL", a_knl)
        assert (a_ref == a_knl).get().all()
        print("_________________________________")
def tune_prims_kernel(knl, shape=None, ng=None, prefetch_args=()):
    """Parameters for 3D.

    Specializes the primitives kernel via spec_prims_kernel, splits the
    three spatial loops onto device axes, fixes the loop nest order, and
    optionally prefetches the arrays named in *prefetch_args*.
    """
    knl = spec_prims_kernel(knl, shape, ng)
    # This assumes linear, see above
    # NOTE(review): lsize, otags and itags are not defined locally or as
    # parameters -- presumably module-level tuning tables; confirm they are
    # in scope at call time.
    knl = _lp.split_iname(knl, "k", lsize[0],
            outer_tag=otags[0], inner_tag=itags[0])
    knl = _lp.split_iname(knl, "j", lsize[1],
            outer_tag=otags[1], inner_tag=itags[1])
    knl = _lp.split_iname(knl, "i", lsize[2],
            outer_tag=otags[2], inner_tag=itags[2])
    # Field index p outermost, then the block loops, then intra-block loops.
    knl = _lp.set_loop_priority(
        knl, "p,i_outer,j_outer,k_outer,i_inner,j_inner,k_inner")
    # Stage the requested arrays over the intra-block footprint.
    for arg in prefetch_args:
        knl = _lp.add_prefetch(knl, arg, "i_inner,j_inner,k_inner",
                default_tag="l.auto")
    #knl = lp.set_options(knl, no_numpy=True)
    return knl
def test_ilp_loop_bound(ctx_factory):
    # The salient bit of this test is that a joint bound on (outer, inner)
    # from a split occurs in a setting where the inner loop has been ilp'ed.
    # In 'normal' parallel loops, the inner index is available for conditionals
    # throughout. In ILP'd loops, not so much.
    ctx = ctx_factory()
    knl = lp.make_kernel(
        "{ [i,j,k]: 0<=i,j,k<n }",
        """
        out[i,k] = sum(j, a[i,j]*b[j,k])
        """,
        [
            lp.GlobalArg("a,b", np.float32, shape=lp.auto),
            "...",
            ],
        assumptions="n>=1")

    ref_knl = knl

    knl = lp.set_loop_priority(knl, "j,i,k")
    # Split k with an ILP inner tag so the joint (k_outer, k_inner) bound
    # must be enforced without a plain conditional on k_inner.
    knl = lp.split_iname(knl, "k", 4, inner_tag="ilp")

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters=dict(
                n=200
                ))
def test_poisson_fem(ctx_factory):
    # Stolen from Peter Coogan and Rob Kirby for FEM assembly
    # Element matrix assembly: Ael[c,i,j] accumulates basis-gradient products
    # via the dpsi substitution rule; variants precompute dpsi differently.
    ctx = ctx_factory()

    nbf = 5
    nqp = 5
    sdim = 3

    knl = lp.make_kernel(
            "{ [c,i,j,k,ell,ell2,ell3]: \
            0 <= c < nels and \
            0 <= i < nbf and \
            0 <= j < nbf and \
            0 <= k < nqp and \
            0 <= ell,ell2 < sdim}",
            """
            dpsi(bf,k0,dir) := \
                simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
            Ael[c,i,j] = \
                J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
            """,
            assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0")

    print(knl)

    knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp)

    ref_knl = knl

    knl = lp.set_loop_priority(knl, ["c", "j", "i", "k"])

    def variant_1(knl):
        # Precompute dpsi over i, k and ell (full quadrature-point table).
        knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for')
        knl = lp.set_loop_priority(knl, "c,i,j")
        return knl

    def variant_2(knl):
        # Precompute dpsi over i and ell only.
        knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for')
        knl = lp.set_loop_priority(knl, "c,i,j")
        return knl

    def add_types(knl):
        # Dtypes are attached late so both ref and variant share one source.
        return lp.add_and_infer_dtypes(knl, dict(
            w=np.float32,
            J=np.float32,
            DPsi=np.float32,
            DFinv=np.float32,
            ))

    for variant in [
            #variant_1,
            variant_2
            ]:
        knl = variant(knl)

        lp.auto_test_vs_ref(
                add_types(ref_knl), ctx,
                add_types(knl),
                parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7))
def transform_surface_kernel(knl):
    #print knl
    # One work group per element k; n and m are placed on local axis 0.
    # NOTE(review): tagging both n and m as "l.0" merges them onto the same
    # local axis -- confirm this is intended rather than l.0/l.1.
    knl = lp.tag_inames(knl, dict(k="g.0", n="l.0", m="l.0"))
    knl = lp.split_iname(knl, "mp", 4, inner_tag="unr")
    # Stage the lift matrix and the per-face surface data via prefetches.
    knl = lp.add_prefetch(knl, "LIFT")
    for name in ["nx", "ny", "nz", "Fscale", "bc"]:
        knl = lp.add_prefetch(knl, name)
    knl = lp.set_loop_priority(knl, "mp_outer,mp_inner")
    return knl
def test_fuse_kernels(ctx_factory):
    # Parse two Fortran derivative kernels, fuse them, and check the fused
    # kernel against a single kernel containing both inner updates.
    fortran_template = """
        subroutine {name}(nelements, ndofs, result, d, q)
          implicit none
          integer e, i, j, k
          integer nelements, ndofs
          real*8 result(nelements, ndofs, ndofs)
          real*8 q(nelements, ndofs, ndofs)
          real*8 d(ndofs, ndofs)
          real*8 prev

          do e = 1,nelements
            do i = 1,ndofs
              do j = 1,ndofs
                do k = 1,ndofs
                  {inner}
                end do
              end do
            end do
          end do
        end subroutine
        """

    xd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,i,k)
        """
    yd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,k,j)
        """

    xderiv, = lp.parse_fortran(
            fortran_template.format(inner=xd_line, name="xderiv"))
    yderiv, = lp.parse_fortran(
            fortran_template.format(inner=yd_line, name="yderiv"))
    # Reference: one kernel that performs both updates in a single loop nest.
    xyderiv, = lp.parse_fortran(
            fortran_template.format(inner=(xd_line + "\n" + yd_line),
                name="xyderiv"))

    knl = lp.fuse_kernels((xderiv, yderiv))
    knl = lp.set_loop_priority(knl, "e,i,j,k")

    # Each fused source kernel contributes its own "prev" temporary.
    assert len(knl.temporary_variables) == 2

    # This is needed for correctness, otherwise ordering could foul things up.
    knl = lp.assignment_to_subst(knl, "prev")
    knl = lp.assignment_to_subst(knl, "prev_0")

    ctx = ctx_factory()
    lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4))
def variant_gpu(knl):
    # GPU variant: 256-wide work groups over i, j blocked by 256, and x
    # staged through explicit fetches.
    knl = lp.expand_subst(knl)
    knl = lp.split_iname(knl, "i", 256,
            outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 256)
    # Cooperative fetch of the x[j,k] tile: the fetch iname over j_inner is
    # mapped to l.0, the one over k unrolled.
    knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"],
            ["x_fetch_j", "x_fetch_k"], default_tag=None)
    knl = lp.tag_inames(knl, dict(x_fetch_k="unr", x_fetch_j="l.0"))
    # Separate fetch of the x[i,k] row, swept over k only.
    knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None)
    knl = lp.set_loop_priority(knl, ["j_outer", "j_inner"])
    return knl
def test_index_cse(ctx_factory):
    """Smoke-test code generation with repeated index subexpressions."""
    kernel = lp.make_kernel(["{[i,j,k,l,m]:0<=i,j,k,l,m<n}"],
            """
            for i
                for j
                    c[i,j,m] = sum((k,l), a[i,j,l]*b[i,j,k,l])
                end
            end
            """)

    # Unroll l, pin the loop nest, then fix dtypes and the problem size
    # before emitting device code.
    kernel = lp.tag_inames(kernel, "l:unr")
    kernel = lp.set_loop_priority(kernel, "i,j,k,l")
    kernel = lp.add_and_infer_dtypes(
            kernel, {"a": np.float32, "b": np.float32})
    kernel = lp.fix_parameters(kernel, n=5)

    print(lp.generate_code_v2(kernel).device_code())
def test_chunk_iname(ctx_factory):
    """Chunk the i loop into 3 contiguous pieces and verify the result."""
    cl_ctx = ctx_factory()

    reference = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            "out[i] = 2*a[i]",
            [lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..."],
            assumptions="n>0")

    # Chunked version: inner piece lives on local axis 0.
    chunked = lp.chunk_iname(reference, "i", 3, inner_tag="l.0")
    chunked = lp.set_loop_priority(chunked, "i_outer, i_inner")

    lp.auto_test_vs_ref(reference, cl_ctx, chunked, parameters=dict(n=130))
def test_fuse_kernels(ctx_factory):
    # Parse two Fortran derivative kernels, fuse them, and check the fused
    # kernel against a single kernel containing both inner updates.
    fortran_template = """
        subroutine {name}(nelements, ndofs, result, d, q)
          implicit none
          integer e, i, j, k
          integer nelements, ndofs
          real*8 result(nelements, ndofs, ndofs)
          real*8 q(nelements, ndofs, ndofs)
          real*8 d(ndofs, ndofs)
          real*8 prev

          do e = 1,nelements
            do i = 1,ndofs
              do j = 1,ndofs
                do k = 1,ndofs
                  {inner}
                end do
              end do
            end do
          end do
        end subroutine
        """

    xd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,i,k)
        """
    yd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,k,j)
        """

    xderiv, = lp.parse_fortran(
            fortran_template.format(inner=xd_line, name="xderiv"))
    yderiv, = lp.parse_fortran(
            fortran_template.format(inner=yd_line, name="yderiv"))
    # Reference: one kernel that performs both updates in a single loop nest.
    xyderiv, = lp.parse_fortran(
            fortran_template.format(
                inner=(xd_line + "\n" + yd_line), name="xyderiv"))

    knl = lp.fuse_kernels((xderiv, yderiv))
    knl = lp.set_loop_priority(knl, "e,i,j,k")

    # Each fused source kernel contributes its own "prev" temporary.
    assert len(knl.temporary_variables) == 2

    # This is needed for correctness, otherwise ordering could foul things up.
    knl = lp.assignment_to_subst(knl, "prev")
    knl = lp.assignment_to_subst(knl, "prev_0")

    ctx = ctx_factory()
    lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4))
def test_chunk_iname(ctx_factory):
    # chunk_iname divides i into 3 contiguous pieces (unlike split_iname's
    # fixed-width strips); verify the chunked kernel against the original.
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        [
            lp.GlobalArg("out,a", np.float32, shape=lp.auto),
            "..."
            ],
        assumptions="n>0")

    ref_knl = knl
    knl = lp.chunk_iname(knl, "i", 3, inner_tag="l.0")
    knl = lp.set_loop_priority(knl, "i_outer, i_inner")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=130))
def test_numba_cuda_target():
    # Generate (without running) Numba-CUDA code for a pairwise-distance
    # kernel, smoke-testing the NumbaCudaTarget code-generation path.
    knl = lp.make_kernel("{[i,j,k]: 0<=i,j<M and 0<=k<N}",
            "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
            target=lp.NumbaCudaTarget())

    knl = lp.assume(knl, "M>0")
    knl = lp.split_iname(knl, "i", 16, outer_tag='g.0')
    knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1))
    knl = lp.add_prefetch(knl, "X[i,:]")
    # N is the (small) coordinate dimension; fix it so k can be unrolled.
    knl = lp.fix_parameters(knl, N=3)
    knl = lp.set_loop_priority(knl, "i_inner,j_outer")
    knl = lp.tag_inames(knl, "k:unr")
    knl = lp.tag_array_axes(knl, "X", "N0,N1")

    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})

    print(lp.generate_code_v2(knl).all_code())
def test_assume(ctx_factory):
    # With "n mod 16 = 0" assumed, the 16-wide split needs no remainder
    # handling, so the generated code must contain no "if".
    ctx = ctx_factory()

    knl = lp.make_kernel("{[i]: 0<=i<n}",
            "a[i] = a[i] + 1",
            [lp.GlobalArg("a", np.float32, shape="n"), "..."])

    knl = lp.split_iname(knl, "i", 16)
    knl = lp.set_loop_priority(knl, "i_outer,i_inner")
    knl = lp.assume(knl, "n mod 16 = 0")
    knl = lp.assume(knl, "n > 10")
    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    for gen_knl in kernel_gen:
        print(gen_knl)
        compiled = lp.CompiledKernel(ctx, gen_knl)
        print(compiled.get_code())
        assert "if" not in compiled.get_code()
def test_numba_cuda_target():
    # Generate (without running) Numba-CUDA code for a pairwise-distance
    # kernel, smoke-testing the NumbaCudaTarget code-generation path.
    knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
        "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
        target=lp.NumbaCudaTarget())

    knl = lp.assume(knl, "M>0")
    knl = lp.split_iname(knl, "i", 16, outer_tag='g.0')
    knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1))
    knl = lp.add_prefetch(knl, "X[i,:]")
    # N is the (small) coordinate dimension; fix it so k can be unrolled.
    knl = lp.fix_parameters(knl, N=3)
    knl = lp.set_loop_priority(knl, "i_inner,j_outer")
    knl = lp.tag_inames(knl, "k:unr")
    knl = lp.tag_array_axes(knl, "X", "N0,N1")

    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})

    print(lp.generate_code_v2(knl).all_code())
def test_generate_c_snippet():
    # Build two small reductions programmatically (via pymbolic expressions)
    # and generate a bare C code body with the CTarget.
    from loopy.target.c import CTarget
    from pymbolic import var

    I = var("I")  # noqa
    f = var("f")
    df = var("df")
    q_v = var("q_v")
    eN = var("eN")  # noqa
    k = var("k")
    u = var("u")

    from functools import partial
    # Simultaneous reduction: both sums may share the k iname.
    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    Instr = lp.Assignment  # noqa

    knl = lp.make_kernel(
        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        [
            Instr(f[I], l_sum(k, q_v[k, I]*u)),
            Instr(df[I], l_sum(k, q_v[k, I])),
            ],
        [
            lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
            lp.GlobalArg("f,df", np.float64, shape="nSpace"),
            lp.ValueArg("u", np.float64),
            "...",
            ],
        target=CTarget(),
        assumptions="nQuad>=1")

    if 0:  # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.set_loop_priority(knl, "I,k_outer,k_inner")

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    print(lp.generate_body(knl))
def test_assume(ctx_factory):
    # With "n mod 16 = 0" assumed, the 16-wide split needs no remainder
    # handling, so the generated code must contain no "if".
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "a[i] = a[i] + 1",
        [lp.GlobalArg("a", np.float32, shape="n"), "..."])

    knl = lp.split_iname(knl, "i", 16)
    knl = lp.set_loop_priority(knl, "i_outer,i_inner")
    knl = lp.assume(knl, "n mod 16 = 0")
    knl = lp.assume(knl, "n > 10")
    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    for gen_knl in kernel_gen:
        print(gen_knl)
        compiled = lp.CompiledKernel(ctx, gen_knl)
        print(compiled.get_code())
        assert "if" not in compiled.get_code()
def test_generate_c_snippet():
    # Build two small reductions programmatically (via pymbolic expressions)
    # and generate a bare C code body with the CTarget.
    from loopy.target.c import CTarget
    from pymbolic import var

    I = var("I")  # noqa
    f = var("f")
    df = var("df")
    q_v = var("q_v")
    eN = var("eN")  # noqa
    k = var("k")
    u = var("u")

    from functools import partial
    # Simultaneous reduction: both sums may share the k iname.
    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    Instr = lp.Assignment  # noqa

    knl = lp.make_kernel("{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
            [
                Instr(f[I], l_sum(k, q_v[k, I] * u)),
                Instr(df[I], l_sum(k, q_v[k, I])),
                ],
            [
                lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
                lp.GlobalArg("f,df", np.float64, shape="nSpace"),
                lp.ValueArg("u", np.float64),
                "...",
                ],
            target=CTarget(),
            assumptions="nQuad>=1")

    if 0:  # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.set_loop_priority(knl, "I,k_outer,k_inner")

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    print(lp.generate_body(knl))
def test_ilp_loop_bound(ctx_factory):
    # The salient bit of this test is that a joint bound on (outer, inner)
    # from a split occurs in a setting where the inner loop has been ilp'ed.
    # In 'normal' parallel loops, the inner index is available for conditionals
    # throughout. In ILP'd loops, not so much.
    ctx = ctx_factory()
    knl = lp.make_kernel("{ [i,j,k]: 0<=i,j,k<n }",
            """
            out[i,k] = sum(j, a[i,j]*b[j,k])
            """,
            [
                lp.GlobalArg("a,b", np.float32, shape=lp.auto),
                "...",
                ],
            assumptions="n>=1")

    ref_knl = knl

    knl = lp.set_loop_priority(knl, "j,i,k")
    # Split k with an ILP inner tag so the joint (k_outer, k_inner) bound
    # must be enforced without a plain conditional on k_inner.
    knl = lp.split_iname(knl, "k", 4, inner_tag="ilp")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=200))
def test_slab_decomposition_does_not_double_execute(ctx_factory):
    # Regression test: with slab decomposition (slabs=(0, 1)), the boundary
    # slab must not re-run iterations already covered by the bulk slab.
    # Since the update is "a = 2*a", double execution would show up as a
    # wrong result in the comparison below.
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel("{ [i]: 0<=i<n }",
            "a[i] = 2*a[i]",
            assumptions="n>=1")

    ref_knl = knl

    # Exercise both a sequential and a group-parallel outer loop.
    for outer_tag in ["for", "g.0"]:
        knl = ref_knl
        knl = lp.split_iname(knl, "i", 4, slabs=(0, 1),
                inner_tag="unr", outer_tag=outer_tag)
        knl = lp.set_loop_priority(knl, "i_outer")

        a = cl.array.empty(queue, 20, np.float32)
        a.fill(17)
        a_ref = a.copy()
        a_knl = a.copy()

        knl = lp.set_options(knl, write_cl=True)
        print("TEST-----------------------------------------")
        knl(queue, a=a_knl)
        print("REF-----------------------------------------")
        ref_knl(queue, a=a_ref)
        print("DONE-----------------------------------------")

        print("REF", a_ref)
        print("KNL", a_knl)
        assert (a_ref == a_knl).get().all()
        print("_________________________________")
def test_precompute_with_preexisting_inames(ctx_factory):
    # Check that lp.precompute can reuse inames that already exist in the
    # kernel (ii/jj, introduced by extract_subst) instead of inventing new
    # ones, and that the transformed kernel still matches the reference.
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })
    knl = lp.fix_parameters(knl, n=13)

    ref_knl = knl

    # Turn the D1/D2 accesses into substitution rules parameterized by ii/jj.
    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    # Both precomputes deliberately target the same preexisting inames ii/jj.
    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")
    knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
            precompute_inames="ii,jj")

    knl = lp.set_loop_priority(knl, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(E=200))
nimg=nimg, f_w=f_w)) ref_knl = knl ############################################################## # Play with things in here to optimize for speed lp.split_reduction_outward(knl, "color") ## Order of loop variables knl = lp.set_loop_priority(knl, [ "im_x_outer", "im_y_outer", "im_x_inner", "im_y_inner", "n", "feat", "f_x", "f_y"]) ## Split loop into innner/outer. 2nd arg is size of inner loop knl = lp.split_iname(knl, "im_x", 16) knl = lp.split_iname(knl, "im_y", 16) #knl = lp.split_iname(knl, "color", 3) ## specifying block/tread ordering of each varible: g.N, l.N, unr, [seq], ilp knl = lp.tag_inames(knl, dict(im_x_inner="l.0", im_x_outer="g.0", im_y_inner="l.1",
def variant_1(knl):
    """Put a 16-wide local axis on im_x and pin the surrounding loop nest."""
    transformed = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
    return lp.set_loop_priority(
            transformed, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
def variant_1(knl):
    """Prefetch both input arrays, then fix the i-over-j loop order."""
    for array_name in ("a", "b"):
        knl = lp.add_prefetch(knl, array_name)
    return lp.set_loop_priority(knl, ["i", "j"])
def set_up_volume_loop(kernel, Nq):
    """Fix Nq, order the volume loops, and map them onto the device grid."""
    knl = lp.fix_parameters(kernel, Nq=Nq)
    knl = lp.set_loop_priority(knl, "e,k,j,i")
    # One work group per element e; j and i become the local axes.
    knl = lp.tag_inames(knl, dict(e="g.0", j="l.1", i="l.0"))
    return lp.assume(knl, "elements >= 1")
def variant_2(knl):
    """Precompute dpsi over (i, ell) and nest the loops as c, i, j."""
    precomputed = lp.precompute(knl, "dpsi", "i,ell", default_tag='for')
    return lp.set_loop_priority(precomputed, "c,i,j")
def variant_1(knl):
    """Prefetch a and b, order i over j, and nest the b fetch inside i."""
    for array_name in ("a", "b"):
        knl = lp.add_prefetch(knl, array_name)
    knl = lp.set_loop_priority(knl, ["i", "j"])
    # Attach iname i to the b-fetch instruction so it runs inside that loop.
    return lp.add_inames_to_insn(knl, "i", "writes:b_fetch")
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    """Fuse the r/s GNUMA strong-volume kernels and apply staged optimizations.

    *opt_level* selects how far down the transformation pipeline to go:
    tap_hsv snapshots the kernel at the matching stage, and the snapshot is
    what ultimately gets tested against the untransformed reference.
    """
    ctx = ctx_factory()

    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()
    source = source.replace("datafloat", "real*4")

    # Pick out the r- and s-direction kernels from the parsed Fortran source.
    hsv_r, hsv_s = [
        knl for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
        if "KernelR" in knl.name or "KernelS" in knl.name
    ]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (fix_euler_parameters,
                                        set_q_storage_format,
                                        set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    # One work group per element e; j and i become the local axes.
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner", )
        flux_ilp_inames = ("kk", )
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    # For each r/s flux pair: turn the flux assignment into a subst rule and
    # precompute it into a named temporary (flux_store_N), laid out so the
    # r- and s-side stores can later be aliased onto the same storage.
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                ("rknl", rflux_insn, (
                    "j",
                    "n",
                ), rtmps, (
                    "jj",
                    "ii",
                )),
                ("sknl", sflux_insn, (
                    "i",
                    "n",
                ), stmps, (
                    "ii",
                    "jj",
                )),
        ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            reader, = lp.find_instructions(
                hsv,
                "tag:{knl_tag} and reads:{flux_var}".format(knl_tag=knl_tag,
                                                            flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1

            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv,
                                flux_var + "_subst",
                                flux_inames + ilp_inames,
                                temporary_name=flux_store_name,
                                precompute_inames=flux_precomp_inames
                                + flux_ilp_inames,
                                default_tag=None)
            # r- and s-side stores get transposed layouts relative to each
            # other so the aliased storage is accessed consistently.
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname,
                                  within="id:" + reader.id,
                                  existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    # All r-side (resp. s-side) flux stores share one temporary.
    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(
            hsv, lp.find_one_rule_matching(hsv, prep_var_name + "_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv, dict(Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv,
                          "rhsQ",
                          ilp_inames,
                          fetch_bounding_box=True,
                          default_tag="for",
                          init_expression="0",
                          store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv, {
        "Q_dim_k": "unr",
        "rhsQ_init_k": "unr",
        "rhsQ_store_k": "unr"
    },
                        ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(
        hsv,
        dict(rhsQ_init_field_inner="vec",
             rhsQ_store_field_inner="vec",
             rhsQ_init_field_outer="unr",
             rhsQ_store_field_outer="unr",
             Q_dim_field_inner="vec",
             Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(
        hsv, "rhsQ_buf", vary_by_axes=(0, ) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # Run the snapshot taken at the requested optimization level.
    hsv = tap_hsv

    if 1:
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv,
                         cl_build_options=[
                             "-cl-denorms-are-zero",
                             "-cl-fast-relaxed-math",
                             "-cl-finite-math-only",
                             "-cl-mad-enable",
                             "-cl-no-signed-zeros",
                         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv,
                                  ctx,
                                  hsv,
                                  parameters=dict(elements=300),
                                  quiet=True)

    elapsed = results["elapsed_wall"]
    print("elapsed", elapsed)
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    """Fuse the r/s GNUMA strong-volume kernels and apply staged optimizations.

    *opt_level* selects how far down the transformation pipeline to go:
    tap_hsv snapshots the kernel at the matching stage, and the snapshot is
    what ultimately gets tested against the untransformed reference.

    NOTE(review): this copy uses lp.tag_data_axes, the older spelling of
    tag_array_axes used by the sibling copy of this test -- confirm the
    loopy version in use still provides it.
    """
    ctx = ctx_factory()

    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()
    source = source.replace("datafloat", "real*4")

    # Pick out the r- and s-direction kernels from the parsed Fortran source.
    hsv_r, hsv_s = [
        knl for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
        if "KernelR" in knl.name or "KernelS" in knl.name
    ]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (
            fix_euler_parameters,
            set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    # One work group per element e; j and i become the local axes.
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    # For each r/s flux pair: turn the flux assignment into a subst rule and
    # precompute it into a named temporary (flux_store_N), laid out so the
    # r- and s-side stores can later be aliased onto the same storage.
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)),
                ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)),
                ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            reader, = lp.find_instructions(hsv,
                    "tag:{knl_tag} and reads:{flux_var}"
                    .format(knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1

            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv, flux_var+"_subst",
                    flux_inames + ilp_inames,
                    temporary_name=flux_store_name,
                    precompute_inames=flux_precomp_inames + flux_ilp_inames,
                    default_tag=None)
            # r- and s-side stores get transposed layouts relative to each
            # other so the aliased storage is accessed consistently.
            if flux_var.endswith("_s"):
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N1,N0,N2?")

            n_iname = "n_"+flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname,
                    within="id:"+reader.id, existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    # All r-side (resp. s-side) flux stores share one temporary.
    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
                lp.find_one_rule_matching(hsv, prep_var_name+"_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            Q_dim_field_inner="unr",
            Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
            fetch_bounding_box=True,
            default_tag="for", init_expression="0",
            store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_data_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv,
            {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"},
            ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(hsv, dict(
        rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
        rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
        Q_dim_field_inner="vec", Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(hsv, "rhsQ_buf",
            vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # Run the snapshot taken at the requested optimization level.
    hsv = tap_hsv

    if 1:
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv, cl_build_options=[
        "-cl-denorms-are-zero",
        "-cl-fast-relaxed-math",
        "-cl-finite-math-only",
        "-cl-mad-enable",
        "-cl-no-signed-zeros",
        ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv,
            parameters=dict(elements=300),
            quiet=True)

    elapsed = results["elapsed_wall"]
    print("elapsed", elapsed)