def test_fp_prec_comparison():
    """Check that literal precision is reflected in the parsed program.

    Two otherwise identical subroutines store a constant into a ``real*8``
    array, once spelled ``1.1d0`` and once spelled ``1.1``. The two parse
    results are expected to compare unequal.
    """
    # FIXME: This test should succeed even when the number is exactly
    # representable in single precision.
    #
    # https://gitlab.tiker.net/inducer/loopy/issues/187

    fortran_src_dp = """
        subroutine assign_scalar(a)
          real*8 a(1)
          a(1) = 1.1d0
        end
        """
    fortran_src_sp = """
        subroutine assign_scalar(a)
          real*8 a(1)
          a(1) = 1.1
        end
        """

    # Parse in the same order as before: double precision first.
    double_prec_prg = lp.parse_fortran(fortran_src_dp)
    single_prec_prg = lp.parse_fortran(fortran_src_sp)

    assert single_prec_prg != double_prec_prg
def test_fuse_kernels(ctx_factory):
    """Fuse two derivative kernels and compare with a hand-fused reference.

    ``xderiv`` and ``yderiv`` share one loop nest and differ only in the
    innermost statement pair. The fused kernel is checked against
    ``xyderiv``, whose loop body contains both statement pairs.
    """
    loop_nest_template = """
        subroutine {name}(nelements, ndofs, result, d, q)
          implicit none
          integer e, i, j, k
          integer nelements, ndofs
          real*8 result(nelements, ndofs, ndofs)
          real*8 q(nelements, ndofs, ndofs)
          real*8 d(ndofs, ndofs)
          real*8 prev

          do e = 1,nelements
            do i = 1,ndofs
              do j = 1,ndofs
                do k = 1,ndofs
                  {inner}
                end do
              end do
            end do
          end do
        end subroutine
        """

    x_update = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,i,k)
        """
    y_update = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,k,j)
        """

    def parse_single(name, inner):
        # Each source holds exactly one subroutine, hence the 1-unpacking.
        (kernel,) = lp.parse_fortran(
            loop_nest_template.format(inner=inner, name=name))
        return kernel

    xderiv = parse_single("xderiv", x_update)
    yderiv = parse_single("yderiv", y_update)
    xyderiv = parse_single("xyderiv", x_update + "\n" + y_update)

    fused = lp.fuse_kernels((xderiv, yderiv))
    fused = lp.prioritize_loops(fused, "e,i,j,k")

    # One 'prev' temporary per input kernel survives the fusion.
    assert len(fused.temporary_variables) == 2

    # This is needed for correctness, otherwise ordering could foul things up.
    for temp_name in ("prev", "prev_0"):
        fused = lp.assignment_to_subst(fused, temp_name)

    lp.auto_test_vs_ref(
        xyderiv, ctx_factory(), fused,
        parameters={"nelements": 20, "ndofs": 4})
def test_fuse_kernels(ctx_factory):
    """Fuse two derivative kernels, passing ``result`` along as data flow.

    The parse results are indexed by kernel name. The fused kernel is looked
    up as ``"xderiv_and_yderiv"`` and compared with ``xyderiv``, whose loop
    body contains both statement pairs.
    """
    loop_nest_template = """
        subroutine {name}(nelements, ndofs, result, d, q)
          implicit none
          integer e, i, j, k
          integer nelements, ndofs
          real*8 result(nelements, ndofs, ndofs)
          real*8 q(nelements, ndofs, ndofs)
          real*8 d(ndofs, ndofs)
          real*8 prev

          do e = 1,nelements
            do i = 1,ndofs
              do j = 1,ndofs
                do k = 1,ndofs
                  {inner}
                end do
              end do
            end do
          end do
        end subroutine
        """

    x_update = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,i,k)
        """
    y_update = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,k,j)
        """

    def parse_variant(name, inner):
        return lp.parse_fortran(
            loop_nest_template.format(inner=inner, name=name))

    xderiv = parse_variant("xderiv", x_update)
    yderiv = parse_variant("yderiv", y_update)
    xyderiv = parse_variant("xyderiv", x_update + "\n" + y_update)

    fused_name = "xderiv_and_yderiv"

    # 'result' written by kernel 0 is consumed by kernel 1.
    fused = lp.fuse_kernels(
        (xderiv["xderiv"], yderiv["yderiv"]),
        data_flow=[("result", 0, 1)])
    fused = fused.with_kernel(
        lp.prioritize_loops(fused[fused_name], "e,i,j,k"))

    assert len(fused[fused_name].temporary_variables) == 2

    lp.auto_test_vs_ref(
        xyderiv, ctx_factory(), fused,
        parameters={"nelements": 20, "ndofs": 4})
def test_fuse_kernels(ctx_factory):
    """Fuse two derivative kernels and compare with a hand-fused reference.

    Same scenario as the other ``test_fuse_kernels`` variants, but the loop
    order is requested through ``lp.set_loop_priority``.
    """
    loop_nest_template = """
        subroutine {name}(nelements, ndofs, result, d, q)
          implicit none
          integer e, i, j, k
          integer nelements, ndofs
          real*8 result(nelements, ndofs, ndofs)
          real*8 q(nelements, ndofs, ndofs)
          real*8 d(ndofs, ndofs)
          real*8 prev

          do e = 1,nelements
            do i = 1,ndofs
              do j = 1,ndofs
                do k = 1,ndofs
                  {inner}
                end do
              end do
            end do
          end do
        end subroutine
        """

    x_update = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,i,k)
        """
    y_update = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,k,j)
        """

    def parse_single(name, inner):
        # Each source holds exactly one subroutine, hence the 1-unpacking.
        [kernel] = lp.parse_fortran(
            loop_nest_template.format(inner=inner, name=name))
        return kernel

    xderiv = parse_single("xderiv", x_update)
    yderiv = parse_single("yderiv", y_update)
    xyderiv = parse_single("xyderiv", x_update + "\n" + y_update)

    fused = lp.fuse_kernels((xderiv, yderiv))
    # NOTE(review): sibling variants of this test call lp.prioritize_loops
    # here -- confirm which name the targeted loopy version expects.
    fused = lp.set_loop_priority(fused, "e,i,j,k")

    assert len(fused.temporary_variables) == 2

    # This is needed for correctness, otherwise ordering could foul things up.
    fused = lp.assignment_to_subst(fused, "prev")
    fused = lp.assignment_to_subst(fused, "prev_0")

    device_ctx = ctx_factory()
    lp.auto_test_vs_ref(
        xyderiv, device_ctx, fused,
        parameters={"nelements": 20, "ndofs": 4})
def test_assignment_to_subst_two_defs(ctx_factory):
    """Turn a scalar that is assigned twice per iteration into subst rules.

    The scalar ``a`` receives two different definitions inside the loop body.
    The kernel after ``assignment_to_subst`` is compared with the parsed,
    untransformed kernel.
    """
    source = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a, out(n), out2(n), inp(n)
          integer n, i

          do i = 1, n
            a = inp(i)
            out(i) = 5*a
            a = 3*inp(n)
            out2(i) = 6*a
          end do
        end
        """

    (reference,) = lp.parse_fortran(source)
    substituted = lp.assignment_to_subst(reference, "a")

    lp.auto_test_vs_ref(
        reference, ctx_factory(), substituted, parameters={"n": 5})
def test_tagged(ctx_factory):
    """Check that ``!$loopy begin/end tagged`` directives tag instructions.

    Two assignments sit between the ``begin tagged: input`` and
    ``end tagged: input`` markers, so a ``tag:input`` query must match
    exactly two instructions. ``ctx_factory`` is accepted but not used.
    """
    source = """
        subroutine rot_norm(out, alpha, out2, inp, inp2, n)
          implicit none
          real*8 a, b, r, out(n), out2(n), inp(n), inp2(n)
          real*8 alpha
          integer n, i

          do i = 1, n
            !$loopy begin tagged: input
            a = cos(alpha)*inp(i) + sin(alpha)*inp2(i)
            b = -sin(alpha)*inp(i) + cos(alpha)*inp2(i)
            !$loopy end tagged: input

            r = sqrt(a**2 + b**2)
            a = a/r
            b = b/r

            out(i) = a
            out2(i) = b
          end do
        end
        """

    (kernel,) = lp.parse_fortran(source)

    tagged_insns = list(lp.find_instructions(kernel, "tag:input"))
    assert len(tagged_insns) == 2
def test_assignment_to_subst_indices(ctx_factory):
    """Turn an indexed temporary into a substitution rule.

    The array ``a`` is filled in one loop and read in a second one. After
    ``assignment_to_subst`` it must no longer be listed among the temporary
    variables, and the results must match the kernel from before that step.
    """
    source = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a(n), out(n), out2(n), inp(n)
          integer n, i

          do i = 1, n
            a(i) = 6*inp(i)
          enddo

          do i = 1, n
            out(i) = 5*a(i)
          end do
        end
        """

    (parsed,) = lp.parse_fortran(source)

    # The reference already has n fixed, so no parameters are passed below.
    reference = lp.fix_parameters(parsed, n=5)
    assert "a" in reference.temporary_variables

    substituted = lp.assignment_to_subst(reference, "a")
    assert "a" not in substituted.temporary_variables

    lp.auto_test_vs_ref(reference, ctx_factory(), substituted)
def test_if(ctx_factory):
    """Exercise an ``if``/``else`` with a nested loop in the Fortran input.

    The condition reads the scalar ``a``, which is then converted into a
    substitution rule. The converted kernel is compared with the parsed,
    untransformed kernel.
    """
    source = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a, b, out(n), out2(n), inp(n)
          integer n, i, j

          do i = 1, n
            a = inp(i)
            if (a.ge.3) then
                b = 2*a
                do j = 1,3
                    b = 3 * b
                end do
                out(i) = 5*b
            else
                out(i) = 4*a
            endif
          end do
        end
        """

    (reference,) = lp.parse_fortran(source)
    substituted = lp.assignment_to_subst(reference, "a")

    lp.auto_test_vs_ref(
        reference, ctx_factory(), substituted, parameters={"n": 5})
def test_fuse_kernels(ctx_factory):
    """Fuse two derivative kernels, passing ``result`` along as data flow.

    The fused kernel is compared with ``xyderiv``, whose loop body contains
    both statement pairs.
    """
    loop_nest_template = """
        subroutine {name}(nelements, ndofs, result, d, q)
          implicit none
          integer e, i, j, k
          integer nelements, ndofs
          real*8 result(nelements, ndofs, ndofs)
          real*8 q(nelements, ndofs, ndofs)
          real*8 d(ndofs, ndofs)
          real*8 prev

          do e = 1,nelements
            do i = 1,ndofs
              do j = 1,ndofs
                do k = 1,ndofs
                  {inner}
                end do
              end do
            end do
          end do
        end subroutine
        """

    x_update = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,i,k)
        """
    y_update = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,k,j)
        """

    def parse_single(name, inner):
        # Each source holds exactly one subroutine, hence the 1-unpacking.
        (kernel,) = lp.parse_fortran(
            loop_nest_template.format(inner=inner, name=name))
        return kernel

    xderiv = parse_single("xderiv", x_update)
    yderiv = parse_single("yderiv", y_update)
    xyderiv = parse_single("xyderiv", x_update + "\n" + y_update)

    # 'result' written by kernel 0 is consumed by kernel 1.
    fused = lp.fuse_kernels((xderiv, yderiv), data_flow=[("result", 0, 1)])
    fused = lp.prioritize_loops(fused, "e,i,j,k")

    assert len(fused.temporary_variables) == 2

    lp.auto_test_vs_ref(
        xyderiv, ctx_factory(), fused,
        parameters={"nelements": 20, "ndofs": 4})
def test_matmul(ctx_factory, buffer_inames):
    """Tile, prefetch and buffer a Fortran dgemm, then compare with the original.

    The reference is the kernel as parsed. The tested kernel has split
    inames, precomputed ``a``/``b`` accesses and a buffered ``c``. The
    buffered configuration is skipped on pocl.
    """
    ctx = ctx_factory()

    if buffer_inames:
        platform_name = ctx.devices[0].platform.name
        if platform_name == "Portable Computing Language":
            pytest.skip("crashes on pocl")

    logging.basicConfig(level=logging.INFO)

    dgemm_src = """
        subroutine dgemm(m,n,ell,a,b,c)
          implicit none
          real*8 a(m,ell),b(ell,n),c(m,n)
          integer m,n,k,i,j,ell

          do j = 1,n
            do i = 1,m
              do k = 1,ell
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    (reference,) = lp.parse_fortran(dgemm_src)
    assert len(reference.domains) == 1

    tiled = lp.split_iname(reference, "i", 16,
                           outer_tag="g.0", inner_tag="l.1")
    tiled = lp.split_iname(tiled, "j", 8,
                           outer_tag="g.1", inner_tag="l.0")
    tiled = lp.split_iname(tiled, "k", 32)

    for divisibility in ("n mod 32 = 0", "m mod 32 = 0", "ell mod 16 = 0"):
        tiled = lp.assume(tiled, divisibility)

    for rule_name, access in (("a_acc", "a[i1,i2]"), ("b_acc", "b[i1,i2]")):
        tiled = lp.extract_subst(tiled, rule_name, access,
                                 parameters="i1, i2")

    outer_inames = "i_outer, j_outer, k_outer"
    for rule_name, sweep in (("a_acc", "k_inner,i_inner"),
                             ("b_acc", "j_inner,k_inner")):
        tiled = lp.precompute(tiled, rule_name, sweep,
                              precompute_outer_inames=outer_inames,
                              default_tag="l.auto")

    tiled = lp.buffer_array(tiled, "c", buffer_inames=buffer_inames,
                            init_expression="0",
                            store_expression="base+buffer")

    lp.auto_test_vs_ref(reference, ctx, tiled,
                        parameters={"n": 128, "m": 128, "ell": 128})
def test_precompute_some_exist(ctx_factory):
    """Precompute two rules whose precompute inames partly already exist.

    The second ``precompute`` call reuses ``itemp``, which the first call
    introduced, alongside the new iname ``k2temp``. The transformed program
    is compared with the program as parsed.

    :arg ctx_factory: zero-argument callable returning the context that is
        handed to ``lp.auto_test_vs_ref``.
    """
    fortran_src = """
        subroutine dgemm(m,n,ell,a,b,c)
          implicit none
          real*8 a(m,ell),b(ell,n),c(m,n)
          integer m,n,k,i,j,ell

          do j = 1,n
            do i = 1,m
              do k = 1,ell
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    knl = lp.parse_fortran(fortran_src)

    assert len(knl["dgemm"].domains) == 1

    # Capture the reference before any transformation. Taking it afterwards
    # would make auto_test_vs_ref compare the kernel with itself, so a wrong
    # precompute could never be detected.
    ref_knl = knl

    knl = lp.split_iname(knl, "i", 8,
            outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8,
            outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 8)
    knl = lp.assume(knl, "n mod 8 = 0")
    knl = lp.assume(knl, "m mod 8 = 0")
    knl = lp.assume(knl, "ell mod 8 = 0")

    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner",
            precompute_inames="ktemp,itemp",
            precompute_outer_inames="i_outer, j_outer, k_outer",
            default_tag="l.auto")
    # 'itemp' already exists at this point; 'k2temp' is new.
    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner",
            precompute_inames="itemp,k2temp",
            precompute_outer_inames="i_outer, j_outer, k_outer",
            default_tag="l.auto")

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters=dict(n=128, m=128, ell=128))
def test_matmul(ctx_factory, buffer_inames):
    """Tile, prefetch and buffer a Fortran dgemm, then compare with the original.

    The reference is the kernel as parsed. The tested kernel has split
    inames, precomputed ``a``/``b`` accesses and a buffered ``c``.
    """
    logging.basicConfig(level=logging.INFO)

    dgemm_src = """
        subroutine dgemm(m,n,l,a,b,c)
          implicit none
          real*8 a(m,l),b(l,n),c(m,n)
          integer m,n,k,i,j,l

          do j = 1,n
            do i = 1,m
              do k = 1,l
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    (reference,) = lp.parse_fortran(dgemm_src)
    assert len(reference.domains) == 1

    tiled = lp.split_iname(reference, "i", 16,
                           outer_tag="g.0", inner_tag="l.1")
    tiled = lp.split_iname(tiled, "j", 8,
                           outer_tag="g.1", inner_tag="l.0")
    tiled = lp.split_iname(tiled, "k", 32)

    for divisibility in ("n mod 32 = 0", "m mod 32 = 0", "l mod 16 = 0"):
        tiled = lp.assume(tiled, divisibility)

    for rule_name, access in (("a_acc", "a[i1,i2]"), ("b_acc", "b[i1,i2]")):
        tiled = lp.extract_subst(tiled, rule_name, access,
                                 parameters="i1, i2")

    tiled = lp.precompute(tiled, "a_acc", "k_inner,i_inner")
    tiled = lp.precompute(tiled, "b_acc", "j_inner,k_inner")

    tiled = lp.buffer_array(tiled, "c", buffer_inames=buffer_inames,
                            init_expression="0",
                            store_expression="base+buffer")

    lp.auto_test_vs_ref(reference, ctx_factory(), tiled,
                        parameters={"n": 128, "m": 128, "l": 128})
def test_batched_sparse():
    """Parse and transform a batched sparse matrix-vector product.

    Only the transformation pipeline is exercised; nothing is executed or
    asserted, so the test passes as long as no step raises.
    """
    source = """
        subroutine sparse(rowstarts, colindices, values, m, n, nvecs, nvals, x, y)
          implicit none

          integer rowstarts(m+1), colindices(nvals)
          real*8 values(nvals)
          real*8 x(n, nvecs), y(n, nvecs), rowsum(nvecs)

          integer m, n, rowstart, rowend, length, nvals, nvecs
          integer i, j, k

          do i = 1, m
            rowstart = rowstarts(i)
            rowend = rowstarts(i+1)
            length = rowend - rowstart

            do k = 1, nvecs
              rowsum(k) = 0
            enddo
            do k = 1, nvecs
              do j = 1, length
                rowsum(k) = rowsum(k) + &
                  x(colindices(rowstart+j-1),k)*values(rowstart+j-1)
              end do
            end do
            do k = 1, nvecs
              y(i,k) = rowsum(k)
            end do
          end do
        end
        """

    (kernel,) = lp.parse_fortran(source)

    kernel = lp.split_iname(kernel, "i", 128)
    for iname, tag in (("i_outer", "g.0"), ("i_inner", "l.0")):
        kernel = lp.tag_inames(kernel, {iname: tag})

    for array_name in ("values", "colindices"):
        kernel = lp.add_prefetch(kernel, array_name, default_tag="l.auto")

    kernel = lp.fix_parameters(kernel, nvecs=4)
def test_precompute_some_exist(ctx_factory):
    """Precompute two rules whose precompute inames partly already exist.

    The second ``precompute`` call reuses ``itemp``, which the first call
    introduced, alongside the new iname ``k2temp``. The transformed kernel
    is compared with the kernel as parsed.

    :arg ctx_factory: zero-argument callable returning the context that is
        handed to ``lp.auto_test_vs_ref``.
    """
    fortran_src = """
        subroutine dgemm(m,n,ell,a,b,c)
          implicit none
          real*8 a(m,ell),b(ell,n),c(m,n)
          integer m,n,k,i,j,ell

          do j = 1,n
            do i = 1,m
              do k = 1,ell
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """

    knl, = lp.parse_fortran(fortran_src)

    assert len(knl.domains) == 1

    # Capture the reference before any transformation. Taking it afterwards
    # would make auto_test_vs_ref compare the kernel with itself, so a wrong
    # precompute could never be detected.
    ref_knl = knl

    knl = lp.split_iname(knl, "i", 8,
            outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8,
            outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 8)
    knl = lp.assume(knl, "n mod 8 = 0")
    knl = lp.assume(knl, "m mod 8 = 0")
    knl = lp.assume(knl, "ell mod 8 = 0")

    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner",
            precompute_inames="ktemp,itemp",
            default_tag="l.auto")
    # 'itemp' already exists at this point; 'k2temp' is new.
    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner",
            precompute_inames="itemp,k2temp",
            default_tag="l.auto")

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters=dict(n=128, m=128, ell=128))
def test_fill_const(ctx_factory):
    """Run a kernel that fills an array with a floating-point constant.

    The same kernel serves as reference and as test subject, so this checks
    that the kernel builds and runs rather than comparing two variants.
    """
    source = """
        subroutine fill(out, a, n)
          implicit none

          real*8 a, out(n)
          integer n, i

          do i = 1, n
            out(i) = 3.45
          end do
        end
        """

    (kernel,) = lp.parse_fortran(source)

    lp.auto_test_vs_ref(
        kernel, ctx_factory(), kernel, parameters={"n": 5, "a": 5})
def test_division_in_shapes(ctx_factory):
    """Run a kernel whose array shape and loop bounds contain ``m/2``.

    The parsed program is printed and then tested against itself, so this
    checks that it builds and runs rather than comparing two variants.
    """
    source = """
        subroutine halve(m, a)
          implicit none
          integer m, i, j
          real*8 a(m/2,m/2)
          do i = 1,m/2
            do j = 1,m/2
              a(i, j) = 2*a(i, j)
            end do
          end do
        end subroutine
        """

    program = lp.parse_fortran(source)
    print(program)

    lp.auto_test_vs_ref(
        program, ctx_factory(), program, parameters={"m": 128})
def test_assign_double_precision_scalar_as_rational(ctx_factory):
    """Compute 1.1 as ``11 / 10`` in a ``real*8`` slot and check the result.

    The kernel stores 11, divides it by 10 in place, and the value read back
    must be within ``1e-15`` of ``1.1``.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    source = """
        subroutine assign_scalar(a)
          real*8 a(1)

          a(1) = 11
          a(1) = a(1) / 10
        end
        """

    program = lp.parse_fortran(source)

    result_dev = cl.array.empty(queue, 1, dtype=np.float64, order="F")
    program(queue, a=result_dev)

    computed = result_dev.get()[0]
    assert abs(computed - 1.1) < 1e-15
def test_domain_fusion_imperfectly_nested():
    """Check that an imperfect loop nest is not collapsed into one domain.

    The outer loop contains a statement of its own next to the inner loop,
    so the parsed kernel must keep more than one domain.
    """
    source = """
        subroutine imperfect(n, m, a, b)
            implicit none
            integer i, j, n, m
            real a(n), b(n,n)

            do i=1, n
                a(i) = i
                do j=1, m
                    b(i,j) = i*j
                end do
            end do
        end subroutine
        """

    program = lp.parse_fortran(source)
    domain_count = len(program["imperfect"].domains)

    # If n > 0 and m == 0, a single domain would be empty,
    # leading (incorrectly) to no assignments to 'a'.
    assert domain_count > 1
def test_assign_double_precision_scalar(ctx_factory):
    """Store the literal ``1.1d0`` and check generated code and result.

    The device code must contain the text ``1.1;``, and the value read back
    after running must be within ``1e-15`` of ``1.1``.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    source = """
        subroutine assign_scalar(a)
          real*8 a(1)

          a(1) = 1.1d0
        end
        """

    program = lp.parse_fortran(source)

    # Code is generated once for display and once more for the check.
    print(lp.generate_code_v2(program).device_code())
    device_code = lp.generate_code_v2(program).device_code()
    assert "1.1;" in device_code

    result_dev = cl.array.empty(queue, 1, dtype=np.float64, order="F")
    program(queue, a=result_dev)

    computed = result_dev.get()[0]
    assert abs(computed - 1.1) < 1e-15
def test_asterisk_in_shape(ctx_factory):
    """Run a kernel whose input array is declared with shape ``inp(*)``.

    Nothing is asserted about the output; the test passes as long as the
    kernel parses and the call with a three-element input succeeds.
    """
    source = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a, out(n), out2(n), inp(*)
          integer n, i

          do i = 1, n
            a = inp(n)
            out(i) = 5*a
            out2(i) = 6*a
          end do
        end
        """

    (kernel,) = lp.parse_fortran(source)

    queue = cl.CommandQueue(ctx_factory())

    host_input = np.array([1, 2, 3.])
    kernel(queue, inp=host_input, n=3)
def test_fortran_subroutines():
    """Generate code for a subroutine that calls another with array slices.

    ``twice_cross`` calls ``twice`` once on a column slice and once on a row
    slice of ``a``. The device code generated with ``twice_cross`` as entry
    point is printed; nothing is asserted about it.
    """
    source = """
        subroutine twice(n, a)
          implicit none
          real*8  a(n)
          integer i,n

          do i=1,n
            a(i) = a(i) * 2
          end do
        end subroutine

        subroutine twice_cross(n, a, i)
          implicit none
          integer i, n
          real*8  a(n,n)

          call twice(n, a(1:n, i))
          call twice(n, a(i, 1:n))
        end subroutine
        """

    parsed = lp.parse_fortran(source)
    program = parsed.with_entrypoints("twice_cross")

    generated = lp.generate_code_v2(program)
    print(generated.device_code())
def fortran_kernel(self, line, cell):
    """Parse *cell* as Fortran and publish each kernel in the user namespace.

    Every kernel produced by ``lp.parse_fortran`` is stored in
    ``self.shell.user_ns`` under its own ``name``. *line* is not used.
    """
    for parsed_kernel in lp.parse_fortran(cell):
        self.shell.user_ns[parsed_kernel.name] = parsed_kernel
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    """Fuse and optimize the gNUMA R/S volume kernels, then test one stage.

    The Fortran source is read from ``strongVolumeKernels.f90`` (a relative
    path), ``datafloat`` is replaced by ``real*4``, and the two kernels whose
    names contain ``KernelR``/``KernelS`` are tagged and fused.

    All transformation stages always run. ``tap_hsv`` snapshots the kernel
    at the stage selected by *opt_level*, and that snapshot is what is
    finally analyzed and compared with ``ref_hsv``.

    :arg ctx_factory: zero-argument callable returning the context passed to
        ``lp.auto_test_vs_ref``.
    :arg ilp_multiple: if greater than 1, iname ``k`` is split with the inner
        part tagged ``ilp``. The split factor is the literal 2, independent
        of the actual value.
    :arg Nq: value fixed for the kernel parameter ``Nq``.
    :arg opt_level: stage whose snapshot is tested: 0..7 pick the matching
        stage, anything ``>= 8`` picks the fully transformed kernel.
        NOTE(review): a negative value never assigns ``tap_hsv``.
    """
    ctx = ctx_factory()

    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    source = source.replace("datafloat", "real*4")

    # The 2-unpacking requires exactly two matching kernels.
    hsv_r, hsv_s = [
        knl
        for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
        if "KernelR" in knl.name or "KernelS" in knl.name
    ]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (fix_euler_parameters,
                                        set_q_storage_format,
                                        set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    # Everything below is an optimization; this is the comparison baseline.
    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner", )
        flux_ilp_inames = ("kk", )
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    # Names of the flux temporaries created per source kernel; each list is
    # handed to alias_temporaries after the loop.
    rtmps = []
    stmps = []

    flux_store_idx = 0

    # Flux instructions of the R and S kernels are processed pairwise.
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
            ("rknl", rflux_insn, ("j", "n", ), rtmps, ("jj", "ii", )),
            ("sknl", sflux_insn, ("i", "n", ), stmps, ("ii", "jj", )),
        ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            # Exactly one instruction of this kernel may read the flux.
            reader, = lp.find_instructions(
                hsv,
                "tag:{knl_tag} and reads:{flux_var}".format(knl_tag=knl_tag,
                                                            flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv,
                                flux_var + "_subst",
                                flux_inames + ilp_inames,
                                temporary_name=flux_store_name,
                                precompute_inames=flux_precomp_inames +
                                flux_ilp_inames,
                                default_tag=None)
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            # Give the reader's "n" iname a flux-specific name: drop the
            # "_r"/"_s" markers and a trailing "_0" from the flux name.
            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv,
                                  "n",
                                  n_iname,
                                  within="id:" + reader.id,
                                  existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        # Variables starting with "Jinv" or containing "_s" are left as
        # substitution rules.
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(
            hsv, lp.find_one_rule_matching(hsv, prep_var_name + "_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv, dict(Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv,
                          "rhsQ",
                          ilp_inames,
                          fetch_bounding_box=True,
                          default_tag="for",
                          init_expression="0",
                          store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv, {
        "Q_dim_k": "unr",
        "rhsQ_init_k": "unr",
        "rhsQ_store_k": "unr"
    },
                        ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(
        hsv,
        dict(rhsQ_init_field_inner="vec",
             rhsQ_store_field_inner="vec",
             rhsQ_init_field_outer="unr",
             rhsQ_store_field_outer="unr",
             Q_dim_field_inner="vec",
             Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(
        hsv, "rhsQ_buf", vary_by_axes=(0, ) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # From here on only the snapshot chosen via opt_level is used.
    hsv = tap_hsv

    if 1:
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv,
                         cl_build_options=[
                             "-cl-denorms-are-zero",
                             "-cl-fast-relaxed-math",
                             "-cl-finite-math-only",
                             "-cl-mad-enable",
                             "-cl-no-signed-zeros",
                         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv,
                                  ctx,
                                  hsv,
                                  parameters=dict(elements=300),
                                  quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
def fortran_kernel(self, line, cell):
    """Parse *cell* as Fortran and publish the result as ``prog``.

    The complete return value of ``lp.parse_fortran`` is stored in
    ``self.shell.user_ns`` under the fixed key ``"prog"``. *line* is not
    used.
    """
    parsed_program = lp.parse_fortran(cell)
    user_namespace = self.shell.user_ns
    user_namespace["prog"] = parsed_program
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    """Fuse and optimize the gNUMA R/S volume kernels, then test one stage.

    The Fortran source is read from ``strongVolumeKernels.f90`` (a relative
    path), ``datafloat`` is replaced by ``real*4``, and the two kernels whose
    names contain ``KernelR``/``KernelS`` are tagged and fused.

    All transformation stages always run. ``tap_hsv`` snapshots the kernel
    at the stage selected by *opt_level*, and that snapshot is what is
    finally analyzed and compared with ``ref_hsv``.

    :arg ctx_factory: zero-argument callable returning the context passed to
        ``lp.auto_test_vs_ref``.
    :arg ilp_multiple: if greater than 1, iname ``k`` is split with the inner
        part tagged ``ilp``. The split factor is the literal 2, independent
        of the actual value.
    :arg Nq: value fixed for the kernel parameter ``Nq``.
    :arg opt_level: stage whose snapshot is tested: 0..7 pick the matching
        stage, anything ``>= 8`` picks the fully transformed kernel.
        NOTE(review): a negative value never assigns ``tap_hsv``.
    """
    ctx = ctx_factory()

    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    source = source.replace("datafloat", "real*4")

    # The 2-unpacking requires exactly two matching kernels.
    hsv_r, hsv_s = [
           knl for knl in lp.parse_fortran(source, filename,
               auto_dependencies=False)
           if "KernelR" in knl.name or "KernelS" in knl.name
           ]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (
          fix_euler_parameters,
          set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    # Everything below is an optimization; this is the comparison baseline.
    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    # Names of the flux temporaries created per source kernel; each list is
    # handed to alias_temporaries after the loop.
    rtmps = []
    stmps = []

    flux_store_idx = 0

    # Flux instructions of the R and S kernels are processed pairwise.
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                  ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)),
                  ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)),
                  ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            # Exactly one instruction of this kernel may read the flux.
            reader, = lp.find_instructions(hsv,
                  "tag:{knl_tag} and reads:{flux_var}"
                  .format(knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv, flux_var+"_subst", flux_inames + ilp_inames,
                temporary_name=flux_store_name,
                precompute_inames=flux_precomp_inames + flux_ilp_inames,
                default_tag=None)
            if flux_var.endswith("_s"):
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N1,N0,N2?")

            # Give the reader's "n" iname a flux-specific name: drop the
            # "_r"/"_s" markers and a trailing "_0" from the flux name.
            n_iname = "n_"+flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname, within="id:"+reader.id,
                    existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        # Variables starting with "Jinv" or containing "_s" are left as
        # substitution rules.
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
            lp.find_one_rule_matching(hsv, prep_var_name+"_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            Q_dim_field_inner="unr",
            Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
          fetch_bounding_box=True, default_tag="for",
          init_expression="0", store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr",
            Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_data_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv,
            {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"},
            ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr",
            Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(hsv, dict(
        rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
        rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
        Q_dim_field_inner="vec",
        Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(hsv, "rhsQ_buf",
          vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # From here on only the snapshot chosen via opt_level is used.
    hsv = tap_hsv

    if 1:
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv, cl_build_options=[
         "-cl-denorms-are-zero",
         "-cl-fast-relaxed-math",
         "-cl-finite-math-only",
         "-cl-mad-enable",
         "-cl-no-signed-zeros",
         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv, parameters=dict(elements=300),
            quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)