def test_alias_temporaries(ctx_factory):
    """Check that three precomputed temporaries can share storage
    via ``lp.alias_temporaries``."""
    ctx = ctx_factory()

    kernel = lp.make_kernel(
            "{[i]: 0<=i<n}",
            """
            times2(i) := 2*a[i]
            times3(i) := 3*a[i]
            times4(i) := 4*a[i]

            x[i] = times2(i)
            y[i] = times3(i)
            z[i] = times4(i)
            """)

    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.float32})

    ref_kernel = kernel

    kernel = lp.split_iname(kernel, "i", 16, outer_tag="g.0", inner_tag="l.0")

    # Each subst rule is materialized over the inner iname, then all three
    # resulting temporaries are aliased onto one storage slot.
    for subst_name in ["times2", "times3", "times4"]:
        kernel = lp.precompute(kernel, subst_name, "i_inner")
    kernel = lp.alias_temporaries(kernel, ["times2_0", "times3_0", "times4_0"])

    lp.auto_test_vs_ref(ref_kernel, ctx, kernel, parameters={"n": 30})
def test_precompute_with_preexisting_inames(ctx_factory):
    """Precompute two substitution rules into the same preexisting
    inames ``ii``/``jj`` (set_loop_priority flavor)."""
    ctx = ctx_factory()

    kernel = lp.make_kernel(
            "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
            """
            result[e,i] = sum(j, D1[i,j]*u[e,j])
            result2[e,i] = sum(k, D2[i,k]*u[e,k])
            """)
    kernel = lp.add_and_infer_dtypes(kernel, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })
    kernel = lp.fix_parameters(kernel, n=13)

    ref_kernel = kernel

    kernel = lp.extract_subst(kernel, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    kernel = lp.extract_subst(kernel, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    # Both precomputes reuse the same iname pair ii/jj.
    kernel = lp.precompute(kernel, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")
    kernel = lp.precompute(kernel, "D2_subst", "i,k", default_tag="for",
            precompute_inames="ii,jj")

    kernel = lp.set_loop_priority(kernel, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(ref_kernel, ctx, kernel, parameters={"E": 200})
def test_alias_temporaries(ctx_factory):
    """Alias the storage of three independently precomputed temporaries."""
    ctx = ctx_factory()

    t = lp.make_kernel(
            "{[i]: 0<=i<n}",
            """
            times2(i) := 2*a[i]
            times3(i) := 3*a[i]
            times4(i) := 4*a[i]

            x[i] = times2(i)
            y[i] = times3(i)
            z[i] = times4(i)
            """)

    t = lp.add_and_infer_dtypes(t, {"a": np.float32})

    ref = t

    t = lp.split_iname(t, "i", 16, outer_tag="g.0", inner_tag="l.0")

    t = lp.precompute(t, "times2", "i_inner")
    t = lp.precompute(t, "times3", "i_inner")
    t = lp.precompute(t, "times4", "i_inner")

    # Collapse the three generated temporaries onto shared storage.
    t = lp.alias_temporaries(t, ["times2_0", "times3_0", "times4_0"])

    lp.auto_test_vs_ref(ref, ctx, t, parameters={"n": 30})
def test_precompute_with_preexisting_inames_fail():
    """Reusing the precompute inames ``ii,jj`` for a sweep with mismatched
    extents (``k`` runs to ``2*n``) must raise ``LoopyError``."""
    kernel = lp.make_kernel(
            "{[e,i,j,k]: 0<=e<E and 0<=i,j<n and 0<=k<2*n}",
            """
            result[e,i] = sum(j, D1[i,j]*u[e,j])
            result2[e,i] = sum(k, D2[i,k]*u[e,k])
            """)
    kernel = lp.add_and_infer_dtypes(kernel, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })
    kernel = lp.fix_parameters(kernel, n=13)

    kernel = lp.extract_subst(kernel, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    kernel = lp.extract_subst(kernel, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    kernel = lp.precompute(kernel, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")

    # The second precompute sweeps i,k but k's range differs from j's, so
    # the existing ii/jj inames cannot be reused.
    with pytest.raises(lp.LoopyError):
        lp.precompute(kernel, "D2_subst", "i,k", default_tag="for",
                precompute_inames="ii,jj")
def test_tim2d(ctx_factory):
    """2D spectral-element Laplacian-style kernel ("tim2d").

    Builds a kernel computing lap[e,i,j] from derivative matrix D,
    geometric factors G and field u, applies prefetch/precompute/split
    transformations, and auto-tests against the untransformed kernel.
    Uses the old loopy API (positional dtype in precompute,
    generate_loop_schedules/check_kernels).
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)  # K - run-time symbolic

    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
            [
                "ur(a,b) := sum_float32(@o, D[a,o]*u[e,o,b])",
                "us(a,b) := sum_float32(@o, D[b,o]*u[e,a,o])",
                "lap[e,i,j] = "
                " sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j) + G[1,e,m,j]*us(m,j)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m) + G[2,e,i,m]*us(i,m)))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
                # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                # lp.ImageArg("D", dtype, shape=(n, n)),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap2D", assumptions="K>=1")

    # NOTE(review): `unroll` is assigned but never used below — looks like a
    # leftover from an earlier tuning experiment.
    unroll = 32

    seq_knl = knl

    # Prefetch inputs and materialize the ur/us derivative substitutions.
    knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "o"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j", "o"], default_tag="l.auto")
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  #, slabs=(0, 1))
    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))

    # knl = lp.add_prefetch(knl, "G", [2,3], default_tag=None)  # axis/argument indices on G
    knl = lp.add_prefetch(knl, "G", [2,3], default_tag="l.auto")  # axis/argument indices on G

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
def test_precompute_with_preexisting_inames(ctx_factory):
    """Precompute two subst rules into the shared preexisting inames
    ``ii``/``jj`` (prioritize_loops flavor)."""
    ctx = ctx_factory()

    t = lp.make_kernel(
            "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
            """
            result[e,i] = sum(j, D1[i,j]*u[e,j])
            result2[e,i] = sum(k, D2[i,k]*u[e,k])
            """)
    t = lp.add_and_infer_dtypes(t, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })
    t = lp.fix_parameters(t, n=13)

    ref = t

    t = lp.extract_subst(t, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    t = lp.extract_subst(t, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    t = lp.precompute(t, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")
    t = lp.precompute(t, "D2_subst", "i,k", default_tag="for",
            precompute_inames="ii,jj")

    t = lp.prioritize_loops(t, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(ref, ctx, t, parameters={"E": 200})
def variant_orig(knl):
    """Hand-tuned variant: prefetch D/u/G, precompute ur/us/Gux/Guy,
    and tag inames for a workgroup layout of (i, j) with e as the group axis."""
    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1", "e": "g.0"})

    knl = lp.add_prefetch(knl, "D[:,:]", fetch_outer_inames='e',
            default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u[e, :, :]", default_tag="l.auto")

    knl = lp.precompute(knl, "ur(m,j)", ["m", "j"], default_tag="l.auto")
    knl = lp.precompute(knl, "us(i,m)", ["i", "m"], default_tag="l.auto")

    # TODO this adds `a` and `b` to domains, which leads to unused inames
    knl = lp.precompute(knl, "Gux(m,j)", ["m", "j"], default_tag="l.auto")
    knl = lp.precompute(knl, "Guy(i,m)", ["i", "m"], default_tag="l.auto")

    knl = lp.add_prefetch(knl, "G$x[:,e,:,:]", default_tag="l.auto")
    knl = lp.add_prefetch(knl, "G$y[:,e,:,:]", default_tag="l.auto")

    knl = lp.tag_inames(knl, {"o": "unr"})
    knl = lp.tag_inames(knl, {"m": "unr"})

    knl = lp.set_instruction_priority(knl, "id:D_fetch", 5)
    print(knl)

    return knl
def test_matmul(ctx_factory, buffer_inames):
    """Fortran dgemm: tile i/j/k, precompute the a/b accesses with explicit
    precompute_outer_inames, and buffer the c output array."""
    ctx = ctx_factory()

    if (buffer_inames
            and ctx.devices[0].platform.name == "Portable Computing Language"):
        pytest.skip("crashes on pocl")

    logging.basicConfig(level=logging.INFO)

    fortran_src = """
        subroutine dgemm(m,n,ell,a,b,c)
          implicit none
          real*8 a(m,ell),b(ell,n),c(m,n)
          integer m,n,k,i,j,ell

          do j = 1,n
            do i = 1,m
              do k = 1,ell
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """
    knl, = lp.parse_fortran(fortran_src)

    assert len(knl.domains) == 1

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)
    for cond in ["n mod 32 = 0", "m mod 32 = 0", "ell mod 16 = 0"]:
        knl = lp.assume(knl, cond)

    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner",
            precompute_outer_inames='i_outer, j_outer, k_outer',
            default_tag="l.auto")
    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner",
            precompute_outer_inames='i_outer, j_outer, k_outer',
            default_tag="l.auto")

    knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames,
            init_expression="0", store_expression="base+buffer")

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters={"n": 128, "m": 128, "ell": 128})
def test_precompute_some_exist(ctx_factory):
    """Precompute where one requested precompute iname ("itemp") already
    exists from an earlier precompute (translation-unit API flavor)."""
    fortran_src = """
        subroutine dgemm(m,n,ell,a,b,c)
          implicit none
          real*8 a(m,ell),b(ell,n),c(m,n)
          integer m,n,k,i,j,ell

          do j = 1,n
            do i = 1,m
              do k = 1,ell
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """
    prog = lp.parse_fortran(fortran_src)

    assert len(prog["dgemm"].domains) == 1

    prog = lp.split_iname(prog, "i", 8, outer_tag="g.0", inner_tag="l.1")
    prog = lp.split_iname(prog, "j", 8, outer_tag="g.1", inner_tag="l.0")
    prog = lp.split_iname(prog, "k", 8)
    for cond in ["n mod 8 = 0", "m mod 8 = 0", "ell mod 8 = 0"]:
        prog = lp.assume(prog, cond)

    prog = lp.extract_subst(prog, "a_acc", "a[i1,i2]", parameters="i1, i2")
    prog = lp.extract_subst(prog, "b_acc", "b[i1,i2]", parameters="i1, i2")

    # "itemp" appears in both precompute_inames lists; "k2temp" is new
    # in the second precompute.
    prog = lp.precompute(prog, "a_acc", "k_inner,i_inner",
            precompute_inames="ktemp,itemp",
            precompute_outer_inames="i_outer, j_outer, k_outer",
            default_tag="l.auto")
    prog = lp.precompute(prog, "b_acc", "j_inner,k_inner",
            precompute_inames="itemp,k2temp",
            precompute_outer_inames="i_outer, j_outer, k_outer",
            default_tag="l.auto")

    ref_prog = prog

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_prog, ctx, prog,
            parameters={"n": 128, "m": 128, "ell": 128})
def test_funny_shape_matrix_mul(ctx_factory):
    """Matrix multiply with mutually offset sizes (m = n+12, ell = m+12),
    using precompute with explicit precompute_outer_inames."""
    ctx = ctx_factory()

    n = get_suitable_size(ctx)
    m = n + 12
    ell = m + 12

    mm = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
            name="matmul", assumptions="n,m,ell >= 1")

    mm = lp.add_dtypes(mm, {
        "a": np.float32,
        "b": np.float32,
        })

    ref_mm = mm

    mm = lp.split_iname(mm, "i", 16, outer_tag="g.0", inner_tag="l.1")
    mm = lp.split_iname(mm, "j", 8, outer_tag="g.1", inner_tag="l.0")
    mm = lp.split_iname(mm, "k", 32)

    mm = lp.extract_subst(mm, "a_acc", "a[i1,i2]", parameters="i1, i2")
    mm = lp.extract_subst(mm, "b_acc", "b[i1,i2]", parameters="i1, i2")
    mm = lp.precompute(mm, "a_acc", "k_inner,i_inner",
            precompute_outer_inames="i_outer, j_outer, k_outer",
            default_tag="l.auto")
    mm = lp.precompute(mm, "b_acc", "j_inner,k_inner",
            precompute_outer_inames="i_outer, j_outer, k_outer",
            default_tag="l.auto")

    lp.auto_test_vs_ref(ref_mm, ctx, mm,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={"n": n, "m": m, "ell": ell})
def test_matmul(ctx_factory, buffer_inames):
    """Fortran dgemm (size parameter named ``l``): tile, precompute the
    a/b accesses, and buffer the c output array."""
    logging.basicConfig(level=logging.INFO)

    fortran_src = """
        subroutine dgemm(m,n,l,a,b,c)
          implicit none
          real*8 a(m,l),b(l,n),c(m,n)
          integer m,n,k,i,j,l

          do j = 1,n
            do i = 1,m
              do k = 1,l
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """
    knl, = lp.parse_fortran(fortran_src)

    assert len(knl.domains) == 1

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 32)
    for cond in ["n mod 32 = 0", "m mod 32 = 0", "l mod 16 = 0"]:
        knl = lp.assume(knl, cond)

    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")
    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner")
    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner")

    knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames,
            init_expression="0", store_expression="base+buffer")

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters={"n": 128, "m": 128, "l": 128})
def test_global_parallel_reduction(ctx_factory, size):
    """Two-level reduction of sum(i, i/13) staged through a GLOBAL-scope
    precomputed temporary."""
    ctx = ctx_factory()

    t = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            # Using z[0] instead of z works around a bug in ancient PyOpenCL.
            z[0] = sum(i, i/13)
            """)

    ref = t

    group_size = 128
    t = lp.split_iname(t, "i", group_size * 20)
    t = lp.split_iname(t, "i_inner", group_size, outer_tag="l.0")
    t = lp.split_reduction_inward(t, "i_inner_inner")
    t = lp.split_reduction_inward(t, "i_inner_outer")

    from loopy.transform.data import reduction_arg_to_subst_rule
    t = reduction_arg_to_subst_rule(t, "i_outer")
    t = lp.precompute(t, "red_i_outer_arg", "i_outer",
            temporary_scope=lp.temp_var_scope.GLOBAL,
            default_tag="l.auto")
    t = lp.realize_reduction(t)
    t = lp.add_dependency(t, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(ref, ctx, t, parameters={"n": size},
            print_ref_code=True)
def test_precompute_confusing_subst_arguments(ctx_factory):
    """Precompute a subst whose argument name shadows a kernel iname,
    sweeping j with explicit precompute_outer_inames."""
    ctx = ctx_factory()

    t = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<5}",
            """
            D(i):=a[i+1]-a[i]
            b[i,j] = D(j)
            """)

    t = lp.add_and_infer_dtypes(t, {"a": np.float32})

    ref = t

    t = lp.tag_inames(t, {"j": "g.1"})
    t = lp.split_iname(t, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    # The rule body depends on its argument, not on the split iname.
    assert "i_inner" not in get_dependencies(t.substitutions["D"].expression)
    t = lp.precompute(t, "D", sweep_inames="j",
            precompute_outer_inames="j, i_inner, i_outer")

    lp.auto_test_vs_ref(ref, ctx, t, parameters={"n": 12345})
def test_precompute_nested_subst(ctx_factory):
    """Precompute D (which references subst E); the surviving E rule must
    be rebased onto the newly created precompute inames."""
    ctx = ctx_factory()

    t = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<5}",
            """
            E:=a[i]
            D:=E*E
            b[i] = D
            """)

    t = lp.add_and_infer_dtypes(t, {"a": np.float32})

    ref = t

    t = lp.tag_inames(t, {"j": "g.1"})
    t = lp.split_iname(t, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    assert "i_inner" not in get_dependencies(t.substitutions["D"].expression)
    t = lp.precompute(t, "D", "i_inner")

    # There's only one surviving 'E' rule.
    surviving = [name for name in t.substitutions if name.startswith("E")]
    assert len(surviving) == 1

    # That rule should use the newly created prefetch inames,
    # not the prior 'i_inner'.
    assert "i_inner" not in get_dependencies(t.substitutions["E"].expression)

    lp.auto_test_vs_ref(ref, ctx, t, parameters={"n": 12345})
def test_global_mc_parallel_reduction(ctx_factory, size):
    """Reduction over philox4x32-generated values, staged through a
    GLOBAL-scope precomputed temporary (no explicit default_tag)."""
    ctx = ctx_factory()

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    t = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            for i
                <> key = make_uint2(i, 324830944)  {inames=i}
                <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
                <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
            end
            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
            """)

    ref = t

    group_size = 128
    t = lp.split_iname(t, "i", group_size * 20)
    t = lp.split_iname(t, "i_inner", group_size, outer_tag="l.0")
    t = lp.split_reduction_inward(t, "i_inner_inner")
    t = lp.split_reduction_inward(t, "i_inner_outer")

    from loopy.transform.data import reduction_arg_to_subst_rule
    t = reduction_arg_to_subst_rule(t, "i_outer")
    t = lp.precompute(t, "red_i_outer_arg", "i_outer",
            temporary_scope=lp.temp_var_scope.GLOBAL)
    t = lp.realize_reduction(t)
    t = lp.add_dependency(t, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(ref, ctx, t, parameters={"n": size})
def test_precompute_nested_subst(ctx_factory):
    """Precompute of D must rewrite nested subst E onto the new inames."""
    ctx = ctx_factory()

    kernel = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<5}",
            """
            E:=a[i]
            D:=E*E
            b[i] = D
            """)

    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.float32})

    ref_kernel = kernel

    kernel = lp.tag_inames(kernel, {"j": "g.1"})
    kernel = lp.split_iname(kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    assert "i_inner" not in get_dependencies(
            kernel.substitutions["D"].expression)
    kernel = lp.precompute(kernel, "D", "i_inner")

    # There's only one surviving 'E' rule.
    e_rules = [rule_name for rule_name in kernel.substitutions
            if rule_name.startswith("E")]
    assert len(e_rules) == 1

    # That rule should use the newly created prefetch inames,
    # not the prior 'i_inner'.
    assert "i_inner" not in get_dependencies(
            kernel.substitutions["E"].expression)

    lp.auto_test_vs_ref(ref_kernel, ctx, kernel, parameters={"n": 12345})
def no_test_global_parallel_reduction(ctx_factory, size):
    """Disabled test ("no_test" prefix keeps pytest from collecting it).

    Sketch of a global parallel reduction over philox4x32 samples;
    intentionally halts with a ZeroDivisionError after printing the
    transformed kernel (debugging aid left in place).
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            <> key = make_uint2(i, 324830944)  {inames=i}
            <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
            """)

    # ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer")
    print(knl)
    # Deliberate halt: everything below is unreachable while this stays in.
    1/0
    knl = lp.realize_reduction(knl)

    evt, (z,) = knl(queue, n=size)
def variant_fig33(knl):
    """Variant meant to (mostly) reproduce Figure 3.3: split K into blocks
    of 16 and precompute the dPsi$one rule over dx_axis."""
    block = 16
    knl = lp.split_iname(knl, "K", block,
            outer_iname="Ko", inner_iname="Kloc")
    knl = lp.precompute(knl, "dPsi$one", np.float32,
            ["dx_axis"], default_tag=None)
    knl = lp.tag_inames(knl, {"j": "ilp.seq"})

    return knl, ["Ko", "Kloc"]
def test_precompute_some_exist(ctx_factory):
    """Precompute where the iname "itemp" is shared between two precompute
    calls (single-kernel parse_fortran flavor)."""
    fortran_src = """
        subroutine dgemm(m,n,ell,a,b,c)
          implicit none
          real*8 a(m,ell),b(ell,n),c(m,n)
          integer m,n,k,i,j,ell

          do j = 1,n
            do i = 1,m
              do k = 1,ell
                c(i,j) = c(i,j) + b(k,j)*a(i,k)
              end do
            end do
          end do
        end subroutine
        """
    knl, = lp.parse_fortran(fortran_src)

    assert len(knl.domains) == 1

    knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 8, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 8)
    for cond in ["n mod 8 = 0", "m mod 8 = 0", "ell mod 8 = 0"]:
        knl = lp.assume(knl, cond)

    knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2")
    knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2")

    knl = lp.precompute(knl, "a_acc", "k_inner,i_inner",
            precompute_inames="ktemp,itemp",
            default_tag="l.auto")
    knl = lp.precompute(knl, "b_acc", "j_inner,k_inner",
            precompute_inames="itemp,k2temp",
            default_tag="l.auto")

    ref_knl = knl

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters={"n": 128, "m": 128, "ell": 128})
def variant_fig32(knl):
    """Variant that (mostly) reproduces Figure 3.2: split K into blocks of
    16 and precompute dPsi over i, q and dx_axis."""
    block = 16
    knl = lp.split_iname(knl, "K", block,
            outer_iname="Ko", inner_iname="Kloc")
    knl = lp.precompute(knl, "dPsi", np.float32,
            ["i", "q", "dx_axis"], default_tag=None)
    knl = lp.tag_inames(knl, {"dx_axis": "unr", "dxi": "unr"})

    return knl, ["Ko", "Kloc", "dPsi_q", "ij", "i", "j", "q", "ax_b_insn"]
def test_funny_shape_matrix_mul(ctx_factory):
    """Matrix multiply with offset sizes, precompute without explicit
    precompute_outer_inames."""
    ctx = ctx_factory()

    n = get_suitable_size(ctx)
    m = n + 12
    ell = m + 12

    kernel = lp.make_kernel(
            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
            ["c[i, j] = sum(k, a[i, k]*b[k, j])"],
            name="matmul", assumptions="n,m,ell >= 1")

    kernel = lp.add_dtypes(kernel, {
        "a": np.float32,
        "b": np.float32,
        })

    ref_kernel = kernel

    kernel = lp.split_iname(kernel, "i", 16, outer_tag="g.0", inner_tag="l.1")
    kernel = lp.split_iname(kernel, "j", 8, outer_tag="g.1", inner_tag="l.0")
    kernel = lp.split_iname(kernel, "k", 32)

    kernel = lp.extract_subst(kernel, "a_acc", "a[i1,i2]", parameters="i1, i2")
    kernel = lp.extract_subst(kernel, "b_acc", "b[i1,i2]", parameters="i1, i2")
    kernel = lp.precompute(kernel, "a_acc", "k_inner,i_inner",
            default_tag="l.auto")
    kernel = lp.precompute(kernel, "b_acc", "j_inner,k_inner",
            default_tag="l.auto")

    lp.auto_test_vs_ref(ref_kernel, ctx, kernel,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={"n": n, "m": m, "ell": ell})
def variant_orig(knl):
    """Hand-tuned variant (older API, no explicit default_tag): prefetch
    D/u/G, precompute ur/us/Gux/Guy, and tag inames for an (i, j) workgroup
    with e as the group axis."""
    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1", "e": "g.0"})

    knl = lp.add_prefetch(knl, "D[:,:]")
    knl = lp.add_prefetch(knl, "u[e, :, :]")

    knl = lp.precompute(knl, "ur(m,j)", ["m", "j"])
    knl = lp.precompute(knl, "us(i,m)", ["i", "m"])

    knl = lp.precompute(knl, "Gux(m,j)", ["m", "j"])
    knl = lp.precompute(knl, "Guy(i,m)", ["i", "m"])

    knl = lp.add_prefetch(knl, "G$x[:,e,:,:]")
    knl = lp.add_prefetch(knl, "G$y[:,e,:,:]")

    knl = lp.tag_inames(knl, {"o": "unr"})
    knl = lp.tag_inames(knl, {"m": "unr"})

    knl = lp.set_instruction_priority(knl, "id:D_fetch", 5)
    print(knl)

    return knl
def test_fd_1d(ctx_factory):
    """1D forward difference with the u accesses precomputed per
    16-wide block of i."""
    ctx = ctx_factory()

    t = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "result[i] = u[i+1]-u[i]")

    t = lp.add_and_infer_dtypes(t, {"u": np.float32})

    ref = t

    t = lp.split_iname(t, "i", 16)
    t = lp.extract_subst(t, "u_acc", "u[j]", parameters="j")
    t = lp.precompute(t, "u_acc", "i_inner", default_tag="for")
    t = lp.assume(t, "n mod 16 = 0")

    lp.auto_test_vs_ref(ref, ctx, t, parameters={"n": 2048})
def test_finite_difference_expr_subst(ctx_factory):
    """Fuse a flux kernel into a finite-difference kernel, turn the f
    assignment into a subst rule, and precompute it on the device."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    grid = np.linspace(0, 2 * np.pi, 2048, endpoint=False)
    h = grid[1] - grid[0]
    u = cl.clmath.sin(cl.array.to_device(queue, grid))

    fin_diff_knl = lp.make_kernel(
            "{[i]: 1<=i<=n}",
            "out[i] = -(f[i+1] - f[i-1])/h",
            [lp.GlobalArg("out", shape="n+2"), "..."])

    flux_knl = lp.make_kernel(
            "{[j]: 1<=j<=n}",
            "f[j] = u[j]**2/2",
            [
                lp.GlobalArg("f", shape="n+2"),
                lp.GlobalArg("u", shape="n+2"),
                ])

    fused_knl = lp.fuse_kernels(
            [fin_diff_knl, flux_knl],
            data_flow=[("f", 1, 0)])

    fused_knl = lp.set_options(fused_knl, write_cl=True)
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused_knl = lp.assignment_to_subst(fused_knl, "f")
    fused_knl = lp.set_options(fused_knl, write_cl=True)

    # This is the real test here: The automatically generated
    # shape expressions are '2+n' and the ones above are 'n+2'.
    # Is loopy smart enough to understand that these are equal?
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")

    gpu_knl = lp.split_iname(fused0_knl, "inew", 128,
            outer_tag="g.0", inner_tag="l.0")

    precomp_knl = lp.precompute(gpu_knl, "f_subst", "inew_inner",
            fetch_bounding_box=True)
    precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
    precomp_knl = lp.set_options(precomp_knl, return_dict=True)

    evt, _ = precomp_knl(queue, u=u, h=h)
def test_finite_difference_expr_subst(ctx_factory):
    """Fused flux/finite-difference kernels: convert the f assignment to a
    subst rule and precompute it with a bounding-box fetch."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    grid = np.linspace(0, 2*np.pi, 2048, endpoint=False)
    h = grid[1] - grid[0]
    u = cl.clmath.sin(cl.array.to_device(queue, grid))

    fin_diff_knl = lp.make_kernel(
            "{[i]: 1<=i<=n}",
            "out[i] = -(f[i+1] - f[i-1])/h",
            [lp.GlobalArg("out", shape="n+2"), "..."])

    flux_knl = lp.make_kernel(
            "{[j]: 1<=j<=n}",
            "f[j] = u[j]**2/2",
            [
                lp.GlobalArg("f", shape="n+2"),
                lp.GlobalArg("u", shape="n+2"),
                ])

    fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl],
            data_flow=[("f", 1, 0)])

    fused_knl = lp.set_options(fused_knl, write_cl=True)
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused_knl = lp.assignment_to_subst(fused_knl, "f")
    fused_knl = lp.set_options(fused_knl, write_cl=True)

    # This is the real test here: The automatically generated
    # shape expressions are '2+n' and the ones above are 'n+2'.
    # Is loopy smart enough to understand that these are equal?
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")

    gpu_knl = lp.split_iname(fused0_knl, "inew", 128,
            outer_tag="g.0", inner_tag="l.0")

    precomp_knl = lp.precompute(gpu_knl, "f_subst", "inew_inner",
            fetch_bounding_box=True)
    precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
    precomp_knl = lp.set_options(precomp_knl, return_dict=True)

    evt, _ = precomp_knl(queue, u=u, h=h)
def test_precompute_does_not_lead_to_dep_cycle(ctx_factory):
    """Precomputing a subst derived from a chained temporary must not
    introduce a dependency cycle among the generated instructions.

    See https://github.com/inducer/loopy/issues/498
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<10}",
        """
        <> tmp0[i] = 2 * i
        <> tmp1[i] = 2 * tmp0[i]
        <> tmp2[i] = 3 * tmp1[i]
        out[i] = 2*tmp1[i] + 3*tmp2[i]
        """)

    ref_knl = knl

    knl = lp.assignment_to_subst(knl, "tmp1")
    knl = lp.precompute(knl, "tmp1_subst")

    # Fix: pass the untransformed kernel as the reference (first argument),
    # matching the auto_test_vs_ref(ref_knl, ctx, test_knl) convention used
    # by every other test in this file; the original call had the two
    # kernels swapped.
    lp.auto_test_vs_ref(ref_knl, ctx, knl)
def test_fd_1d(ctx_factory):
    """Forward difference in 1D; the u reads are extracted into a subst
    rule and precomputed over each 16-wide i block."""
    ctx = ctx_factory()

    kernel = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "result[i] = u[i+1]-u[i]")

    kernel = lp.add_and_infer_dtypes(kernel, {"u": np.float32})

    ref_kernel = kernel

    kernel = lp.split_iname(kernel, "i", 16)
    kernel = lp.extract_subst(kernel, "u_acc", "u[j]", parameters="j")
    kernel = lp.precompute(kernel, "u_acc", "i_inner", default_tag="for")
    kernel = lp.assume(kernel, "n mod 16 = 0")

    lp.auto_test_vs_ref(ref_kernel, ctx, kernel, parameters={"n": 2048})
def test_global_parallel_reduction(ctx_factory, size):
    """Two-level global reduction of sum(i, a[i]), staged through a
    GLOBAL address-space precomputed temporary (AddressSpace API)."""
    ctx = ctx_factory()

    t = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            # Using z[0] instead of z works around a bug in ancient PyOpenCL.
            z[0] = sum(i, a[i])
            """)
    t = lp.add_and_infer_dtypes(t, {"a": np.float32})

    ref = t

    group_size = 128
    t = lp.split_iname(t, "i", group_size * 20)
    t = lp.split_iname(t, "i_inner", group_size, inner_tag="l.0")
    t = lp.split_reduction_outward(t, "i_outer")
    t = lp.split_reduction_inward(t, "i_inner_outer")

    from loopy.transform.data import reduction_arg_to_subst_rule
    t = reduction_arg_to_subst_rule(t, "i_outer")

    t = lp.precompute(t, "red_i_outer_arg", "i_outer",
            temporary_address_space=lp.AddressSpace.GLOBAL,
            default_tag="l.auto")
    t = lp.realize_reduction(t)
    t = lp.tag_inames(t, "i_outer_0:g.0")

    # Keep the i_outer accumulator on the correct (lower) side of the barrier,
    # otherwise there will be useless save/reload code generated.
    t = lp.add_dependency(t, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(ref, ctx, t, parameters={"n": size},
            print_ref_code=True)
def test_global_mc_parallel_reduction(ctx_factory, size):
    """Reduction over philox4x32-generated values with a GLOBAL-scope
    staging temporary and default_tag="l.auto"."""
    ctx = ctx_factory()

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    kernel = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            for i
                <> key = make_uint2(i, 324830944)  {inames=i}
                <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
                <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
            end
            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
            """)

    ref_kernel = kernel

    gsize = 128
    kernel = lp.split_iname(kernel, "i", gsize * 20)
    kernel = lp.split_iname(kernel, "i_inner", gsize, outer_tag="l.0")
    kernel = lp.split_reduction_inward(kernel, "i_inner_inner")
    kernel = lp.split_reduction_inward(kernel, "i_inner_outer")

    from loopy.transform.data import reduction_arg_to_subst_rule
    kernel = reduction_arg_to_subst_rule(kernel, "i_outer")
    kernel = lp.precompute(kernel, "red_i_outer_arg", "i_outer",
            temporary_scope=lp.temp_var_scope.GLOBAL,
            default_tag="l.auto")
    kernel = lp.realize_reduction(kernel)
    kernel = lp.add_dependency(kernel,
            "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(ref_kernel, ctx, kernel, parameters={"n": size})
def test_precompute_confusing_subst_arguments(ctx_factory):
    """Precompute a subst whose formal argument shadows a kernel iname,
    with no sweep/outer inames given explicitly."""
    ctx = ctx_factory()

    kernel = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<5}",
            """
            D(i):=a[i+1]-a[i]
            b[i,j] = D(j)
            """)

    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.float32})

    ref_kernel = kernel

    kernel = lp.tag_inames(kernel, {"j": "g.1"})
    kernel = lp.split_iname(kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    # The rule body depends on its argument, not the split iname.
    assert "i_inner" not in get_dependencies(
            kernel.substitutions["D"].expression)
    kernel = lp.precompute(kernel, "D")

    lp.auto_test_vs_ref(ref_kernel, ctx, kernel, parameters={"n": 12345})
def test_global_parallel_reduction(ctx_factory, size):
    """Two-level global reduction of sum(i, a[i]) staged through a
    GLOBAL-scope precomputed temporary (temp_var_scope API)."""
    ctx = ctx_factory()

    kernel = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            # Using z[0] instead of z works around a bug in ancient PyOpenCL.
            z[0] = sum(i, a[i])
            """)
    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.float32})

    ref_kernel = kernel

    gsize = 128
    kernel = lp.split_iname(kernel, "i", gsize * 20)
    kernel = lp.split_iname(kernel, "i_inner", gsize, inner_tag="l.0")
    kernel = lp.split_reduction_outward(kernel, "i_outer")
    kernel = lp.split_reduction_inward(kernel, "i_inner_outer")

    from loopy.transform.data import reduction_arg_to_subst_rule
    kernel = reduction_arg_to_subst_rule(kernel, "i_outer")

    kernel = lp.precompute(kernel, "red_i_outer_arg", "i_outer",
            temporary_scope=lp.temp_var_scope.GLOBAL,
            default_tag="l.auto")
    kernel = lp.realize_reduction(kernel)
    kernel = lp.tag_inames(kernel, "i_outer_0:g.0")

    # Keep the i_outer accumulator on the correct (lower) side of the barrier,
    # otherwise there will be useless save/reload code generated.
    kernel = lp.add_dependency(kernel,
            "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(ref_kernel, ctx, kernel, parameters={"n": size},
            print_ref_code=True)
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    """Fuse the R/S strong-volume kernels from strongVolumeKernels.f90 and
    apply a staged sequence of optimizations.

    ``opt_level`` selects how far the optimization pipeline runs: at each
    stage the current kernel is snapshotted into ``tap_hsv``, and at the end
    ``hsv = tap_hsv`` rolls back to the selected stage before auto-testing
    against the untransformed reference.  ``ilp_multiple`` > 1 additionally
    splits k with an ILP tag.
    """
    ctx = ctx_factory()

    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()
    source = source.replace("datafloat", "real*4")

    # Pick out the R- and S-direction kernels and tag their instructions
    # so they can be distinguished after fusion.
    hsv_r, hsv_s = [
            knl for knl in lp.parse_fortran(source, filename,
                auto_dependencies=False)
            if "KernelR" in knl.name or "KernelS" in knl.name]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (
            fix_euler_parameters, set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)
    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    # For each matched pair of R/S flux instructions: turn the flux into a
    # subst rule, precompute it into a named temporary, fix the temporary's
    # axis order, and rename the reader's "n" iname to a per-flux name.
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)),
                ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)),
                ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            reader, = lp.find_instructions(hsv,
                    "tag:{knl_tag} and reads:{flux_var}".format(
                        knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv, flux_var + "_subst",
                    flux_inames + ilp_inames,
                    temporary_name=flux_store_name,
                    precompute_inames=flux_precomp_inames + flux_ilp_inames,
                    default_tag=None)
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname,
                    within="id:" + reader.id, existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    # R-side and S-side flux stores each share one storage slot.
    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
                lp.find_one_rule_matching(hsv, prep_var_name + "_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            Q_dim_field_inner="unr",
            Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
            fetch_bounding_box=True, default_tag="for",
            init_expression="0", store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv,
            {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"},
            ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
            rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
            rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
            Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(hsv, dict(
        rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
        rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
        Q_dim_field_inner="vec", Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(hsv, "rhsQ_buf",
            vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # Roll back to the snapshot selected by opt_level.
    hsv = tap_hsv

    if 1:
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv, cl_build_options=[
        "-cl-denorms-are-zero",
        "-cl-fast-relaxed-math",
        "-cl-finite-math-only",
        "-cl-mad-enable",
        "-cl-no-signed-zeros",
        ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv,
            parameters=dict(elements=300), quiet=True)

    elapsed = results["elapsed_wall"]
    print("elapsed", elapsed)
def test_laplacian_lmem(ctx_factory):
    """Build a spectral-element Laplacian kernel ("semlap") and check a
    locally-transformed (prefetch/precompute/split) version against the
    untransformed reference with ``lp.auto_test_vs_ref``.

    NOTE(review): another function with this exact name appears later in
    this file; the later definition shadows this one at import time --
    confirm which version is intended to run.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 4  # points per dimension of each element

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)  # K - run-time symbolic

    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}" % n,
            [
                # substitution rules for the three reference-direction
                # derivatives of u
                "ur(a,b,c) := sum_float32(@o, D[a,o]*u[e,o,b,c])",
                "us(a,b,c) := sum_float32(@o, D[b,o]*u[e,a,o,c])",
                "ut(a,b,c) := sum_float32(@o, D[c,o]*u[e,a,b,o])",
                "lap[e,i,j,k] = "
                " sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
                "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(6,)+field_shape, order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap", assumptions="K>=1")

    # keep an untransformed copy as the reference for auto_test_vs_ref
    seq_knl = knl

    if 1:
        # original
        knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"])
        knl = lp.precompute(knl, "ur", np.float32, ["a", "b", "c"])
        knl = lp.precompute(knl, "us", np.float32, ["a", "b", "c"])
        knl = lp.precompute(knl, "ut", np.float32, ["a", "b", "c"])
        knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))
        knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"])
    else:
        # experiment
        # knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"])
        knl = lp.precompute(knl, "eu", np.float32, ["b", "c"])
        knl = lp.precompute(knl, "ur", np.float32, ["b", "c"])
        knl = lp.precompute(knl, "us", np.float32, ["b", "c"])
        knl = lp.precompute(knl, "ut", np.float32, ["b", "c"])
        knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  #, slabs=(0, 1))
        knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"])

    #knl = lp.add_prefetch(knl, "G", [2,3,4]) # axis/argument indices on G
    #knl = lp.add_prefetch(knl, "G", ["i", "j", "m", "k"]) # axis/argument indices on G
    #print(knl)
    #1/0
    #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    # knl = lp.join_dimensions(knl, ["i", "j"], "i_and_j")

    #print(seq_knl)
    #print(lp.preprocess_kernel(knl))
    #1/0

    # TW: turned this off since it generated:
    # ValueError: cannot tag 'i_and_j'--not known
    # knl = lp.tag_inames(knl, dict(i_and_j="l.0", k="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*n*2*3 + n*n*n*5*3 + n**4 * 2*3)/1e9,
            op_label="GFlops",
            parameters={"K": K})
def test_laplacian_lmem(ctx_factory):
    """Build a spectral-element Laplacian kernel ("semlap") and check a
    locally-transformed (prefetch/precompute/split, all with
    ``default_tag="l.auto"``) version against the untransformed reference
    with ``lp.auto_test_vs_ref``.

    NOTE(review): this file also contains an earlier function with this
    exact name; this later definition shadows it at import time --
    confirm the duplication is intentional.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 4  # points per dimension of each element

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)  # K - run-time symbolic

    knl = lp.make_kernel(
        ctx.devices[0],
        "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}" % n,
        [
            # substitution rules for the three reference-direction
            # derivatives of u
            "ur(a,b,c) := sum_float32(@o, D[a,o]*u[e,o,b,c])",
            "us(a,b,c) := sum_float32(@o, D[b,o]*u[e,a,o,c])",
            "ut(a,b,c) := sum_float32(@o, D[c,o]*u[e,a,b,o])",
            "lap[e,i,j,k] = "
            " sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
            "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
            "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
        ],
        [
            lp.ArrayArg("u", dtype, shape=field_shape, order=order),
            lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
            lp.ArrayArg("G", dtype, shape=(6, ) + field_shape, order=order),
            lp.ArrayArg("D", dtype, shape=(n, n), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
        ],
        name="semlap",
        assumptions="K>=1")

    # keep an untransformed copy as the reference for auto_test_vs_ref
    seq_knl = knl

    if 1:
        # original
        knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"],
                              default_tag="l.auto")
        knl = lp.precompute(knl, "ur", np.float32, ["a", "b", "c"],
                            default_tag="l.auto")
        knl = lp.precompute(knl, "us", np.float32, ["a", "b", "c"],
                            default_tag="l.auto")
        knl = lp.precompute(knl, "ut", np.float32, ["a", "b", "c"],
                            default_tag="l.auto")
        knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))
        knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"],
                              default_tag="l.auto")
    else:
        # experiment
        # knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"], default_tag="l.auto")
        knl = lp.precompute(knl, "eu", np.float32, ["b", "c"],
                            default_tag="l.auto")
        knl = lp.precompute(knl, "ur", np.float32, ["b", "c"],
                            default_tag="l.auto")
        knl = lp.precompute(knl, "us", np.float32, ["b", "c"],
                            default_tag="l.auto")
        knl = lp.precompute(knl, "ut", np.float32, ["b", "c"],
                            default_tag="l.auto")
        knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  #, slabs=(0, 1))
        knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"],
                              default_tag="l.auto")

    #knl = lp.add_prefetch(knl, "G", [2,3,4], default_tag="l.auto") # axis/argument indices on G
    #knl = lp.add_prefetch(knl, "G", ["i", "j", "m", "k"], default_tag="l.auto") # axis/argument indices on G
    #print(knl)
    #1/0
    #knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    # knl = lp.join_dimensions(knl, ["i", "j"], "i_and_j")

    #print(seq_knl)
    #print(lp.preprocess_kernel(knl))
    #1/0

    # TW: turned this off since it generated:
    # ValueError: cannot tag 'i_and_j'--not known
    # knl = lp.tag_inames(knl, dict(i_and_j="l.0", k="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(
        seq_knl,
        ctx,
        kernel_gen,
        op_count=K * (n * n * n * n * 2 * 3 + n * n * n * 5 * 3 +
                      n**4 * 2 * 3) / 1e9,
        op_label="GFlops",
        parameters={"K": K})
def variant_1(knl): knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for') knl = lp.prioritize_loops(knl, "c,i,j") return knl
def test_laplacian_lmem_ilp(ctx_factory):
    """Stress-test code generation: spectral-element Laplacian with an ILP
    split on the element loop, printing generated code for each schedule.

    NOTE(review): another function with this exact name appears later in
    this file; the later definition shadows this one at import time.
    """
    # This does not lead to practical/runnable code (out of lmem), but it's an
    # excellent stress test for the code generator. :)

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8  # points per dimension of each element

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)  # K - run-time symbolic

    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K }" % n,
            [
                # substitution rules for the three reference-direction
                # derivatives of u
                "ur(i,j,k) := sum_float32(@o, D[i,o]*u[e,o,j,k])",
                "us(i,j,k) := sum_float32(@o, D[j,o]*u[e,i,o,k])",
                "ut(i,j,k) := sum_float32(@o, D[k,o]*u[e,i,j,o])",
                "lap[e,i,j,k] = "
                " sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
                "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
                ],
            [
                lp.GlobalArg("u", dtype, shape=field_shape, order=order),
                lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
                lp.GlobalArg("G", dtype, shape=(6,)+field_shape, order=order),
                lp.GlobalArg("D", dtype, shape=(n, n), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap", assumptions="K>=1")

    # Must act on u first, otherwise stencil becomes crooked and
    # footprint becomes non-convex.

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))
    knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"])
    knl = lp.precompute(knl, "ur", np.float32, [0, 1, 2, "e_inner_inner"])
    knl = lp.precompute(knl, "us", np.float32, [0, 1, 2, "e_inner_inner"])
    knl = lp.precompute(knl, "ut", np.float32, [0, 1, 2, "e_inner_inner"])
    knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", "e_inner_inner"])
    knl = lp.add_prefetch(knl, "D", ["m", "j"])

    #print seq_knl
    #1/0

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    # no reference comparison here -- just exercise code generation
    for knl in kernel_gen:
        print(lp.generate_code(knl))
def test_laplacian_lmem_ilp(ctx_factory):
    """Stress-test code generation: spectral-element Laplacian with an ILP
    split on the element loop, printing generated code for each schedule.

    NOTE(review): this file also contains an earlier function with this
    exact name; this later definition shadows it at import time.
    """
    # This does not lead to practical/runnable code (out of lmem), but it's an
    # excellent stress test for the code generator. :)

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8  # points per dimension of each element

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)  # K - run-time symbolic

    knl = lp.make_kernel(
        ctx.devices[0],
        "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K }" % n,
        [
            # substitution rules for the three reference-direction
            # derivatives of u
            "ur(i,j,k) := sum_float32(@o, D[i,o]*u[e,o,j,k])",
            "us(i,j,k) := sum_float32(@o, D[j,o]*u[e,i,o,k])",
            "ut(i,j,k) := sum_float32(@o, D[k,o]*u[e,i,j,o])",
            "lap[e,i,j,k] = "
            " sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
            "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
            "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
        ],
        [
            lp.GlobalArg("u", dtype, shape=field_shape, order=order),
            lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
            lp.GlobalArg("G", dtype, shape=(6, ) + field_shape, order=order),
            lp.GlobalArg("D", dtype, shape=(n, n), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
        ],
        name="semlap",
        assumptions="K>=1")

    # Must act on u first, otherwise stencil becomes crooked and
    # footprint becomes non-convex.

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  #, slabs=(0, 1))
    knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"])
    knl = lp.precompute(knl, "ur", np.float32, [0, 1, 2, "e_inner_inner"])
    knl = lp.precompute(knl, "us", np.float32, [0, 1, 2, "e_inner_inner"])
    knl = lp.precompute(knl, "ut", np.float32, [0, 1, 2, "e_inner_inner"])
    knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", "e_inner_inner"])
    knl = lp.add_prefetch(knl, "D", ["m", "j"])

    #print seq_knl
    #1/0

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    # no reference comparison here -- just exercise code generation
    for knl in kernel_gen:
        print(lp.generate_code(knl))
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    """Parse the Fortran strong-volume kernels, fuse the R- and S-direction
    kernels, apply a staged sequence of loopy optimizations, and auto-test
    the result (truncated at *opt_level*) against the untransformed fused
    kernel.

    The *tap_hsv* variable records a checkpoint of the kernel at each
    optimization stage; at the end, ``hsv = tap_hsv`` rolls back to the
    checkpoint matching the requested *opt_level*, so lower levels test
    partially-optimized kernels.

    Requires "strongVolumeKernels.f90" and the ``gnuma_loopy_transforms``
    helper module alongside this test.
    """
    ctx = ctx_factory()

    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    # the Fortran source parametrizes its float type via this macro-ish name
    source = source.replace("datafloat", "real*4")

    hsv_r, hsv_s = [
           knl for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
           if "KernelR" in knl.name or "KernelS" in knl.name
           ]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (
          fix_euler_parameters,
          set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    # untransformed fused kernel is the reference for auto_test_vs_ref
    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    # pair up matching R/S flux instructions and precompute each flux into
    # a named temporary; rtmps/stmps collect the temporaries for aliasing
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                  ("rknl", rflux_insn, ("j", "n",),
                      rtmps, ("jj", "ii",)),
                  ("sknl", sflux_insn, ("i", "n",),
                      stmps, ("ii", "jj",)),
                  ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            reader, = lp.find_instructions(hsv,
                    "tag:{knl_tag} and reads:{flux_var}"
                    .format(knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv, flux_var+"_subst",
                    flux_inames + ilp_inames,
                    temporary_name=flux_store_name,
                    precompute_inames=flux_precomp_inames + flux_ilp_inames,
                    default_tag=None)
            # storage layout differs between the S- and R-direction fluxes
            if flux_var.endswith("_s"):
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N1,N0,N2?")

            # give each flux its own "n" iname so the loops stay separate
            n_iname = "n_"+flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname, within="id:"+reader.id,
                    existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
            lp.find_one_rule_matching(hsv, prep_var_name+"_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
              Q_dim_field_inner="unr",
              Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
          fetch_bounding_box=True, default_tag="for",
          init_expression="0", store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
              rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
              rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
              Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_data_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv,
            {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"},
            ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
              rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
              rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
              Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(hsv, dict(
          rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
          rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
          Q_dim_field_inner="vec", Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(hsv, "rhsQ_buf",
          vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # roll back to the checkpoint for the requested optimization level
    hsv = tap_hsv

    if 1:
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv, cl_build_options=[
         "-cl-denorms-are-zero",
         "-cl-fast-relaxed-math",
         "-cl-finite-math-only",
         "-cl-mad-enable",
         "-cl-no-signed-zeros",
         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv, parameters=dict(elements=300),
            quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
def variant_1(knl): knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for') knl = lp.set_loop_priority(knl, "c,i,j") return knl
def variant_2(knl): knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for') knl = lp.set_loop_priority(knl, "c,i,j") return knl
def variant_2(knl): knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for') knl = lp.prioritize_loops(knl, "c,i,j") return knl