def test_tim2d(ctx_factory):
    """Compare a transformed "semlap2D" kernel against its untransformed form.

    The kernel is built for the first device of the context with ``n = 8``
    and a run-time symbolic element count ``K``.  After prefetch, precompute,
    split and tagging transformations, the generated schedules are passed to
    ``lp.auto_test_vs_ref`` together with the untransformed kernel.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
            [
                "ur(a,b) := sum_float32(@o, D[a,o]*u[e,o,b])",
                "us(a,b) := sum_float32(@o, D[b,o]*u[e,a,o])",

                "lap[e,i,j] = "
                " sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j) + G[1,e,m,j]*us(m,j)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m) + G[2,e,i,m]*us(i,m)))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
                # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                # lp.ImageArg("D", dtype, shape=(n, n)),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap2D", assumptions="K>=1")

    # Keep the untransformed kernel as the reference for the comparison.
    seq_knl = knl

    knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "o"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j", "o"], default_tag="l.auto")
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  # , slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))

    # knl = lp.add_prefetch(knl, "G", [2,3], default_tag=None)
    # axis/argument indices on G
    knl = lp.add_prefetch(knl, "G", [2, 3], default_tag="l.auto")

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
def test_ilp_race_matmul(ctx_factory):
    """Expect a write-race warning when scheduling an ILP-tagged matmul.

    A 9x9 "matmul" kernel has ``j`` split by 2 (outer tag ``"ilp"``, inner tag
    ``"l.0"``), ``k`` split by 2, and ``b`` prefetched over ``k_inner``.
    Preprocessing and scheduling it must record at least one
    ``WriteRaceConditionWarning``.

    :arg ctx_factory: context factory fixture; not used by this test.
    """
    dtype = np.float32
    order = "C"

    n = 9

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.ImageArg("a", dtype, shape=(n, n)),
                lp.ImageArg("b", dtype, shape=(n, n)),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                ],
            name="matmul")

    knl = lp.split_iname(knl, "j", 2, outer_tag="ilp", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 2)
    knl = lp.add_prefetch(knl, 'b', ["k_inner"])

    with lp.CacheMode(False):
        from loopy.diagnostic import WriteRaceConditionWarning
        from warnings import catch_warnings, simplefilter
        with catch_warnings(record=True) as warn_list:
            # Record every warning regardless of the ambient filter
            # configuration, so that an "ignore"/"once" filter installed
            # elsewhere cannot hide the warning this test looks for.
            simplefilter("always")

            knl = lp.preprocess_kernel(knl)
            list(lp.generate_loop_schedules(knl))

            assert any(isinstance(w.message, WriteRaceConditionWarning)
                    for w in warn_list)
def test_ilp_write_race_detection_global(ctx_factory):
    """Expect a write-race warning for an ILP-tagged write to a global array.

    The instruction ``a[i] = 5+i+j`` writes to ``a[i]`` while ``j`` is tagged
    ``"ilp"``; scheduling the preprocessed kernel must record at least one
    ``WriteRaceConditionWarning``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "[n] -> {[i,j]: 0<=i,j<n }",
            [
                "a[i] = 5+i+j",
                ],
            [
                lp.GlobalArg("a", np.float32),
                lp.ValueArg("n", np.int32, approximately=1000),
                ],
            assumptions="n>=1")

    knl = lp.tag_inames(knl, dict(j="ilp"))

    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    with lp.CacheMode(False):
        from loopy.diagnostic import WriteRaceConditionWarning
        from warnings import catch_warnings, simplefilter
        with catch_warnings(record=True) as warn_list:
            # Record every warning regardless of the ambient filter
            # configuration, so the expected warning cannot be suppressed.
            simplefilter("always")

            list(lp.generate_loop_schedules(knl))

            assert any(isinstance(w.message, WriteRaceConditionWarning)
                    for w in warn_list)
def test_divisibility_assumption(ctx_factory):
    """Check that no ``if`` appears in code generated under a divisibility assumption.

    The kernel assumes ``n`` is a positive multiple of 16 and has ``i`` split
    by 16.  Code for every schedule must not contain the substring ``"if"``,
    and the split kernel is then compared with the unsplit one for
    ``n = 16**3``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    domain = "[n] -> {[i]: 0<=i<n}"
    instructions = ["b[i] = 2*a[i]"]
    kernel_args = [
        lp.GlobalArg("a", np.float32, shape=("n",)),
        lp.GlobalArg("b", np.float32, shape=("n",)),
        lp.ValueArg("n", np.int32),
    ]
    reference = lp.make_kernel(
        domain, instructions, kernel_args,
        assumptions="n>=1 and (exists zz: n = 16*zz)")

    split = lp.split_iname(reference, "i", 16)
    split = lp.preprocess_kernel(split, ctx.devices[0])

    for scheduled in lp.generate_loop_schedules(split):
        generated = lp.generate_code(scheduled)
        assert "if" not in generated

    lp.auto_test_vs_ref(reference, ctx, split, parameters={"n": 16**3})
def test_interp_diff(ctx_factory):
    """Unfinished "sem_lap_precon" kernel test; aborts on its first statement.

    The leading ``1/0`` deliberately raises ``ZeroDivisionError`` because the
    test is marked "not ready"; nothing below it currently executes.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    1/0  # not ready

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8
    M = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (N, N, N, K_sym)
    interim_field_shape = (M, M, M, K_sym)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic
    # NOTE(review): the domain string uses upper-case "AND" and has no
    # connective before "0<=e<K" -- confirm it parses before enabling the test.
    knl = lp.make_kernel(ctx.devices[0],
            # Both %d placeholders are filled from one tuple; the original
            # "% M % N [...]" formatted twice and subscripted N with the
            # instruction list because the separating comma was missing.
            "[K] -> {[i,ip,j,jp,k,kp,e]: 0<=i,j,k<%d AND 0<=ip,jp,kp<%d 0<=e<K}"
            % (M, N),
            [
                "[|i,jp,kp] <float32> u1[i ,jp,kp,e] = sum_float32(ip, I[i,ip]*u [ip,jp,kp,e])",
                "[|i,j ,kp] <float32> u2[i ,j ,kp,e] = sum_float32(jp, I[j,jp]*u1[i ,jp,kp,e])",
                "[|i,j ,k ] <float32> u3[i ,j ,k ,e] = sum_float32(kp, I[k,kp]*u2[i ,j ,kp,e])",
                "[|i,j ,k ] <float32> Pu[i ,j ,k ,e] = P[i,j,k,e]*u3[i,j,k,e]",
                "[|i,j ,kp] <float32> Pu3[i ,j ,kp,e] = sum_float32(k, V[kp,k]*Pu[i ,j , k,e])",
                "[|i,jp,kp] <float32> Pu2[i ,jp,kp,e] = sum_float32(j, V[jp,j]*Pu[i ,j ,kp,e])",
                "Pu[ip,jp,kp,e] = sum_float32(i, V[ip,i]*Pu[i ,jp,kp,e])",
                ],
            [
                lp.GlobalArg("u", dtype, shape=field_shape, order=order),
                lp.GlobalArg("P", dtype, shape=interim_field_shape, order=order),
                lp.GlobalArg("I", dtype, shape=(M, N), order=order),
                lp.GlobalArg("V", dtype, shape=(N, M), order=order),
                lp.GlobalArg("Pu", dtype, shape=field_shape, order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="sem_lap_precon", assumptions="K>=1")

    print(knl)
    1/0

    # Reference kernel for the comparison below (was previously undefined).
    seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    print(knl)
    # 1/0

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)

    # Run-time value of K, matching the value given to check_kernels
    # (was previously undefined).
    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=0,
            op_label="GFlops",
            parameters={"K": K}, print_seq_code=True,)
def test_ilp_write_race_avoidance_local(ctx_factory):
    """Check the shape of temporary ``a`` when ``i`` is ``l.0`` and ``j`` is ``ilp``.

    With ``0<=i<16`` and ``0<=j<17``, every generated schedule must report
    ``temporary_variables["a"].shape == (16, 17)``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[i,j]: 0<=i<16 and 0<=j<17 }",
        ["<> a[i] = 5+i+j"],
        [])
    tagged = lp.tag_inames(kernel, dict(i="l.0", j="ilp"))
    preprocessed = lp.preprocess_kernel(tagged, ctx.devices[0])

    expected_shape = (16, 17)
    for scheduled in lp.generate_loop_schedules(preprocessed):
        assert scheduled.temporary_variables["a"].shape == expected_shape
def test_ilp_write_race_avoidance_private(ctx_factory):
    """Check the shape of scalar temporary ``a`` when ``j`` is tagged ``ilp``.

    With ``0<=j<16``, every generated schedule must report
    ``temporary_variables["a"].shape == (16,)``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[j]: 0<=j<16 }",
        ["<> a = 5+j"],
        [])
    tagged = lp.tag_inames(kernel, dict(j="ilp"))
    preprocessed = lp.preprocess_kernel(tagged, ctx.devices[0])

    expected_shape = (16,)
    for scheduled in lp.generate_loop_schedules(preprocessed):
        assert scheduled.temporary_variables["a"].shape == expected_shape
def test_variable_size_temporary():
    """Generate code for a kernel whose prefetch of ``a[:,:]`` depends on ``n``.

    The test passes if preprocessing, scheduling and code generation all
    complete without raising.
    """
    kernel = lp.make_kernel(
        ''' { [i,j]: 0<=i,j<n } ''',
        ''' out[i] = sum(j, a[i,j])''')

    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.float32})
    kernel = lp.add_prefetch(kernel, "a[:,:]", default_tag=None)

    # Make sure that code generation succeeds even if
    # there are variable-length arrays.
    kernel = lp.preprocess_kernel(kernel)
    for scheduled in lp.generate_loop_schedules(kernel):
        lp.generate_code(scheduled)
def test_owed_barriers(ctx_factory):
    """Compile and print code for a copy into temporary ``z`` with ``i`` as ``l.0``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[i]: 0<=i<100}",
        ["<float32> z[i] = a[i]"],
        [lp.GlobalArg("a", np.float32, shape=(100,))])
    kernel = lp.tag_inames(kernel, dict(i="l.0"))
    kernel = lp.preprocess_kernel(kernel, ctx.devices[0])

    for scheduled in lp.generate_loop_schedules(kernel):
        compiled_kernel = lp.CompiledKernel(ctx, scheduled)
        print(compiled_kernel.get_code())
def test_simple_side_effect(ctx_factory):
    """Compile and print code for the in-place update ``a[i] = a[i] + 1``.

    Each scheduled kernel is printed, compiled, and its code printed.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    kernel_args = [lp.GlobalArg("a", np.float32, shape=(100,))]
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i,j<100}",
        """ a[i] = a[i] + 1 """,
        kernel_args)
    kernel = lp.preprocess_kernel(kernel, ctx.devices[0])

    for scheduled in lp.generate_loop_schedules(kernel):
        print(scheduled)
        compiled_kernel = lp.CompiledKernel(ctx, scheduled)
        print(compiled_kernel.get_code())
def test_ilp_write_race_avoidance_private(ctx_factory):
    """Assert that temporary ``a`` has shape ``(16,)`` once ``j`` is tagged ``ilp``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    instructions = ["<> a = 5+j"]
    base = lp.make_kernel("{[j]: 0<=j<16 }", instructions, [])

    ready = lp.preprocess_kernel(
        lp.tag_inames(base, dict(j="ilp")),
        ctx.devices[0])

    for candidate in lp.generate_loop_schedules(ready):
        temp_a = candidate.temporary_variables["a"]
        assert temp_a.shape == (16,)
def test_ilp_write_race_avoidance_local(ctx_factory):
    """Assert that temporary ``a`` has shape ``(16, 17)`` with ``i``=l.0, ``j``=ilp.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    instructions = ["<> a[i] = 5+i+j"]
    base = lp.make_kernel(
        "{[i,j]: 0<=i<16 and 0<=j<17 }", instructions, [])

    ready = lp.preprocess_kernel(
        lp.tag_inames(base, dict(i="l.0", j="ilp")),
        ctx.devices[0])

    for candidate in lp.generate_loop_schedules(ready):
        temp_a = candidate.temporary_variables["a"]
        assert temp_a.shape == (16, 17)
def test_variable_size_temporary():
    """Run code generation on a kernel with an ``n``-sized prefetch of ``a``.

    No result is checked; the test only requires that no step raises.
    """
    domain = ''' { [i,j]: 0<=i,j<n } '''
    instruction = ''' out[i] = sum(j, a[i,j])'''
    base = lp.make_kernel(domain, instruction)

    typed = lp.add_and_infer_dtypes(base, {"a": np.float32})
    prefetched = lp.add_prefetch(typed, "a[:,:]", default_tag=None)

    # Make sure that code generation succeeds even if
    # there are variable-length arrays.
    ready = lp.preprocess_kernel(prefetched)
    for candidate in lp.generate_loop_schedules(ready):
        lp.generate_code(candidate)
def test_multiple_writes_to_local_temporary():
    """Generate code for a kernel in which two instructions write to ``temp``.

    The generated code for every schedule is printed; the test passes if
    preprocessing, scheduling and code generation complete without raising.
    """
    # Loopy would previously only handle barrier insertion correctly if exactly
    # one instruction wrote to each local temporary. This tests that multiple
    # writes are OK.

    knl = lp.make_kernel(
        "{[i,e]: 0<=i<5 and 0<=e<nelements}",
        # One instruction per line: the two writes must be separate
        # instructions for this test to exercise multiple writers.
        """
        <> temp[i, 0] = 17
        temp[i, 1] = 15
        """)
    knl = lp.tag_inames(knl, dict(i="l.0"))

    knl = lp.preprocess_kernel(knl)
    for k in lp.generate_loop_schedules(knl):
        code, _ = lp.generate_code(k)
        print(code)
def test_wg_too_small(ctx_factory):
    """Expect ``RuntimeError`` from code generation with ``local_sizes={0: 16}``.

    The iname ``i`` ranges over 100 values and is tagged ``l.0`` while the
    kernel is created with a local size of 16 on axis 0; getting the code of
    each compiled schedule must raise ``RuntimeError``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    import pytest

    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[i]: 0<=i<100}",
        ["<float32> z[i] = a[i] {id=copy}"],
        [lp.GlobalArg("a", np.float32, shape=(100,))],
        local_sizes={0: 16})
    kernel = lp.tag_inames(kernel, dict(i="l.0"))
    kernel = lp.preprocess_kernel(kernel, ctx.devices[0])

    for scheduled in lp.generate_loop_schedules(kernel):
        with pytest.raises(RuntimeError):
            lp.CompiledKernel(ctx, scheduled).get_code()
def test_multi_cse(ctx_factory):
    """Compile and print code for ``z[i] = a[i] + a[i]**2`` with ``a`` prefetched.

    ``i`` is split by 16 with the inner part tagged ``l.0`` and ``a`` is
    prefetched with an empty sweep-iname list.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[i]: 0<=i<100}",
        ["<float32> z[i] = a[i] + a[i]**2"],
        [lp.GlobalArg("a", np.float32, shape=(100,))],
        local_sizes={0: 16})

    kernel = lp.split_iname(kernel, "i", 16, inner_tag="l.0")
    kernel = lp.add_prefetch(kernel, "a", [])
    kernel = lp.preprocess_kernel(kernel, ctx.devices[0])

    for scheduled in lp.generate_loop_schedules(kernel):
        compiled_kernel = lp.CompiledKernel(ctx, scheduled)
        print(compiled_kernel.get_code())
def test_eq_constraint(ctx_factory):
    """Print generated code for a copy kernel whose iname ``i`` is split twice by 16.

    ``i`` (range 0..31) is split by 16 with the outer part tagged ``g.0``;
    ``i_inner`` is split by 16 again with the inner part tagged ``l.0``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    logging.basicConfig(level=logging.INFO)

    ctx = ctx_factory()

    kernel_args = [
        lp.GlobalArg("a", np.float32, shape=(1000,)),
        lp.GlobalArg("b", np.float32, shape=(1000,)),
    ]
    kernel = lp.make_kernel(
        "{[i,j]: 0<= i,j < 32}",
        ["a[i] = b[i]"],
        kernel_args)

    kernel = lp.split_iname(kernel, "i", 16, outer_tag="g.0")
    kernel = lp.split_iname(
        kernel, "i_inner", 16, outer_tag=None, inner_tag="l.0")
    kernel = lp.preprocess_kernel(kernel, ctx.devices[0])

    for scheduled in lp.generate_loop_schedules(kernel):
        print(lp.generate_code(scheduled))
def test_simple_side_effect(ctx_factory):
    """Print each schedule and its compiled code for ``a[i] = a[i] + 1``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    domain = "{[i,j]: 0<=i,j<100}"
    instruction = """ a[i] = a[i] + 1 """
    base = lp.make_kernel(
        domain,
        instruction,
        [lp.GlobalArg("a", np.float32, shape=(100,))])

    ready = lp.preprocess_kernel(base, ctx.devices[0])

    for candidate in lp.generate_loop_schedules(ready):
        print(candidate)
        print(lp.CompiledKernel(ctx, candidate).get_code())
def test_assume(ctx_factory):
    """Check that no ``if`` appears in code generated after ``lp.assume`` calls.

    ``i`` is split by 16, loop priority is set to ``i_outer,i_inner``, and the
    assumptions ``n mod 16 = 0`` and ``n > 10`` are added.  The compiled code
    of every schedule must not contain the substring ``"if"``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    kernel = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "a[i] = a[i] + 1",
        [lp.GlobalArg("a", np.float32, shape="n"), "..."])

    kernel = lp.split_iname(kernel, "i", 16)
    kernel = lp.set_loop_priority(kernel, "i_outer,i_inner")
    for assumption in ("n mod 16 = 0", "n > 10"):
        kernel = lp.assume(kernel, assumption)
    kernel = lp.preprocess_kernel(kernel, ctx.devices[0])

    for scheduled in lp.generate_loop_schedules(kernel):
        print(scheduled)
        compiled_kernel = lp.CompiledKernel(ctx, scheduled)
        print(compiled_kernel.get_code())
        assert "if" not in compiled_kernel.get_code()
def test_type_inference_no_artificial_doubles(ctx_factory):
    """Check that generated code for an all-float32 kernel never mentions ``double``.

    The temporary ``bb`` is assigned ``a[i] - b[i]`` without an explicit type
    and then stored to ``c[i]``; ``a``, ``b`` and ``c`` are all float32.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            # One instruction per line: the assignment to bb and the store to
            # c[i] are separate instructions.
            """
                <> bb = a[i] - b[i]
                c[i] = bb
                """,
            [
                lp.GlobalArg("a", np.float32, shape=("n",)),
                lp.GlobalArg("b", np.float32, shape=("n",)),
                lp.GlobalArg("c", np.float32, shape=("n",)),
                lp.ValueArg("n", np.int32),
                ],
            assumptions="n>=1")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        code = lp.generate_code(k)
        assert "double" not in code
def test_owed_barriers(ctx_factory):
    """Print compiled code for ``<float32> z[i] = a[i]`` with ``i`` tagged ``l.0``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    instructions = ["<float32> z[i] = a[i]"]
    base = lp.make_kernel(
        "{[i]: 0<=i<100}",
        instructions,
        [lp.GlobalArg("a", np.float32, shape=(100,))])

    ready = lp.preprocess_kernel(
        lp.tag_inames(base, dict(i="l.0")),
        ctx.devices[0])

    for candidate in lp.generate_loop_schedules(ready):
        print(lp.CompiledKernel(ctx, candidate).get_code())
def test_dependent_loop_bounds_3(ctx_factory):
    """Exercise a loop bound (``row_len``) that is read from an array per ``i``.

    Asserts that the ``jj`` domain's parent is domain 0, prints the code of
    the kernel with ``i`` split across ``g.0``/``l.0``, and expects
    ``RuntimeError`` when scheduling a variant that additionally splits
    ``jj`` across ``g.1``/``l.1``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    # The point of this test is that it shows a dependency between
    # domains that is exclusively mediated by the row_len temporary.
    # It also makes sure that row_len gets read before any
    # conditionals use it.
    import pytest

    dtype = np.dtype(np.float32)
    ctx = ctx_factory()

    domains = [
        "{[i]: 0<=i<n}",
        "{[jj]: 0<=jj<row_len}",
    ]
    instructions = [
        "<> row_len = a_row_lengths[i]",
        "a[i,jj] = 1",
    ]
    kernel_args = [
        lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
        lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
        lp.ValueArg("n", np.int32),
    ]
    kernel = lp.make_kernel(domains, instructions, kernel_args)

    assert kernel.parents_per_domain()[1] == 0

    kernel = lp.split_iname(
        kernel, "i", 128, outer_tag="g.0", inner_tag="l.0")

    compiled_kernel = lp.CompiledKernel(ctx, kernel)
    separator = "---------------------------------------------------"
    print(separator)
    print(compiled_kernel.get_highlighted_code())
    print(separator)

    bad_kernel = lp.split_iname(
        kernel, "jj", 128, outer_tag="g.1", inner_tag="l.1")

    # NOTE(review): only the good kernel is preprocessed here, while the
    # un-preprocessed bad kernel is the one scheduled below -- confirm intent.
    kernel = lp.preprocess_kernel(kernel, ctx.devices[0])

    with pytest.raises(RuntimeError):
        list(lp.generate_loop_schedules(bad_kernel))
def test_wg_too_small(ctx_factory):
    """Require ``RuntimeError`` from ``get_code`` for every generated schedule.

    The kernel is built with ``local_sizes={0: 16}`` while ``i`` (100 values)
    is tagged ``l.0``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    instructions = ["<float32> z[i] = a[i] {id=copy}"]
    base = lp.make_kernel(
        "{[i]: 0<=i<100}",
        instructions,
        [lp.GlobalArg("a", np.float32, shape=(100,))],
        local_sizes={0: 16})

    ready = lp.preprocess_kernel(
        lp.tag_inames(base, dict(i="l.0")),
        ctx.devices[0])
    schedules = lp.generate_loop_schedules(ready)

    import pytest
    for candidate in schedules:
        with pytest.raises(RuntimeError):
            lp.CompiledKernel(ctx, candidate).get_code()
def test_assume(ctx_factory):
    """Assert that compiled code contains no ``"if"`` once assumptions are added.

    The assumptions ``n mod 16 = 0`` and ``n > 10`` are added after splitting
    ``i`` by 16 and setting the loop priority to ``i_outer,i_inner``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    kernel_args = [lp.GlobalArg("a", np.float32, shape="n"), "..."]
    base = lp.make_kernel("{[i]: 0<=i<n}", "a[i] = a[i] + 1", kernel_args)

    split = lp.split_iname(base, "i", 16)
    prioritized = lp.set_loop_priority(split, "i_outer,i_inner")
    constrained = lp.assume(prioritized, "n mod 16 = 0")
    constrained = lp.assume(constrained, "n > 10")
    ready = lp.preprocess_kernel(constrained, ctx.devices[0])

    for candidate in lp.generate_loop_schedules(ready):
        print(candidate)
        built = lp.CompiledKernel(ctx, candidate)
        print(built.get_code())
        assert "if" not in built.get_code()
def test_multi_cse(ctx_factory):
    """Print compiled code for a kernel that reads ``a[i]`` twice, with ``a`` prefetched.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    instructions = ["<float32> z[i] = a[i] + a[i]**2"]
    base = lp.make_kernel(
        "{[i]: 0<=i<100}",
        instructions,
        [lp.GlobalArg("a", np.float32, shape=(100,))],
        local_sizes={0: 16})

    split = lp.split_iname(base, "i", 16, inner_tag="l.0")
    prefetched = lp.add_prefetch(split, "a", [])
    ready = lp.preprocess_kernel(prefetched, ctx.devices[0])

    for candidate in lp.generate_loop_schedules(ready):
        print(lp.CompiledKernel(ctx, candidate).get_code())
def test_dependent_loop_bounds_3(ctx_factory):
    """Exercise a loop bound (``row_len``) that is read from an array per ``i``.

    Asserts that the ``jj`` domain's parent is domain 0, prints the code of
    the kernel with ``i`` split across ``g.0``/``l.0``, and expects
    ``RuntimeError`` when scheduling ``knl_bad``, which additionally splits
    ``jj`` across ``g.1``/``l.1``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    # The point of this test is that it shows a dependency between
    # domains that is exclusively mediated by the row_len temporary.
    # It also makes sure that row_len gets read before any
    # conditionals use it.

    dtype = np.dtype(np.float32)
    ctx = ctx_factory()

    knl = lp.make_kernel(
            [
                "{[i]: 0<=i<n}",
                "{[jj]: 0<=jj<row_len}",
                ],
            [
                "<> row_len = a_row_lengths[i]",
                "a[i,jj] = 1",
                ],
            [
                lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
                lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
                lp.ValueArg("n", np.int32),
                ])

    assert knl.parents_per_domain()[1] == 0

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0",
            inner_tag="l.0")

    cknl = lp.CompiledKernel(ctx, knl)
    print("---------------------------------------------------")
    print(cknl.get_highlighted_code())
    print("---------------------------------------------------")

    knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1",
            inner_tag="l.1")

    # NOTE(review): only knl is preprocessed here, while the un-preprocessed
    # knl_bad is the kernel scheduled below -- confirm intent.
    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    # Imported locally so this test does not depend on a module-level pytest
    # import.
    import pytest
    with pytest.raises(RuntimeError):
        list(lp.generate_loop_schedules(knl_bad))
def test_type_inference_no_artificial_doubles(ctx_factory):
    """Check that generated code for an all-float32 kernel never mentions ``double``.

    The temporary ``bb`` is assigned ``a[i] - b[i]`` without an explicit type
    and then stored to ``c[i]``; ``a``, ``b`` and ``c`` are all float32.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        # One instruction per line: the assignment to bb and the store to
        # c[i] are separate instructions.
        """
            <> bb = a[i] - b[i]
            c[i] = bb
            """,
        [
            lp.GlobalArg("a", np.float32, shape=("n",)),
            lp.GlobalArg("b", np.float32, shape=("n",)),
            lp.GlobalArg("c", np.float32, shape=("n",)),
            lp.ValueArg("n", np.int32),
            ],
        assumptions="n>=1")

    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    for k in lp.generate_loop_schedules(knl):
        code = lp.generate_code(k)
        assert "double" not in code
def test_eq_constraint(ctx_factory):
    """Print generated code for ``a[i] = b[i]`` after two nested splits of ``i`` by 16.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    logging.basicConfig(level=logging.INFO)

    ctx = ctx_factory()

    instructions = ["a[i] = b[i]"]
    base = lp.make_kernel(
        "{[i,j]: 0<= i,j < 32}",
        instructions,
        [
            lp.GlobalArg("a", np.float32, shape=(1000,)),
            lp.GlobalArg("b", np.float32, shape=(1000,)),
        ])

    outer_split = lp.split_iname(base, "i", 16, outer_tag="g.0")
    inner_split = lp.split_iname(
        outer_split, "i_inner", 16, outer_tag=None, inner_tag="l.0")
    ready = lp.preprocess_kernel(inner_split, ctx.devices[0])

    for candidate in lp.generate_loop_schedules(ready):
        print(lp.generate_code(candidate))
def test_ilp_write_race_detection_global(ctx_factory):
    """Expect a write-race warning for an ILP-tagged write to a global array.

    The instruction ``a[i] = 5+i+j`` writes to ``a[i]`` while ``j`` is tagged
    ``"ilp"``; scheduling the preprocessed kernel must record at least one
    ``WriteRaceConditionWarning``.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "[n] -> {[i,j]: 0<=i,j<n }",
        [
            "a[i] = 5+i+j",
        ],
        [
            lp.GlobalArg("a", np.float32),
            lp.ValueArg("n", np.int32, approximately=1000),
        ],
        assumptions="n>=1")

    knl = lp.tag_inames(knl, dict(j="ilp"))

    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    with lp.CacheMode(False):
        from loopy.diagnostic import WriteRaceConditionWarning
        from warnings import catch_warnings, simplefilter
        with catch_warnings(record=True) as warn_list:
            # Record every warning regardless of the ambient filter
            # configuration, so the expected warning cannot be suppressed.
            simplefilter("always")

            list(lp.generate_loop_schedules(knl))

            assert any(
                isinstance(w.message, WriteRaceConditionWarning)
                for w in warn_list)
def test_laplacian(ctx_factory):
    """Disabled "semlap" kernel test; aborts on its first statement.

    The leading ``1/0`` deliberately raises ``ZeroDivisionError`` ("not
    adapted to new language"); nothing below it currently executes.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    1/0  # not adapted to new language

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    # load: 1+6 fields + 1/N D entry
    # store: 1 fields
    # perform: N*2*6 + 3*5 flops
    # ratio: (12*N+15)/8 flops per 4 bytes on bus
    # ~ 14 FLOPS per 4 bytes at N=8
    # ~ 525 GFLOPS max on a 150GB/s device at N=8 if done perfectly

    domain = (
        "[K] -> {[i,j,k,e,m,o1,o2,o3,gi]: 0<=i,j,k,m,o1,o2,o3<%d and 0<=e<K and 0<=gi<6}"
        % n)
    instructions = [
        "CSE: ur(i,j,k) = sum_float32(o1, D[i,o1]*cse(u[e,o1,j,k], urf))",
        "CSE: us(i,j,k) = sum_float32(o2, D[j,o2]*cse(u[e,i,o2,k], usf))",
        "CSE: ut(i,j,k) = sum_float32(o3, D[k,o3]*cse(u[e,i,j,o3], utf))",

        # define function
        "CSE: Gu(i,j,k) = G[0,e,i,j,k]*ur(i,j,k) + G[1,e,i,j,k]*us(i,j,k) + G[2,e,i,j,k]*ut(i,j,k)",
        "CSE: Gv(i,j,k) = G[1,e,i,j,k]*ur(i,j,k) + G[3,e,i,j,k]*us(i,j,k) + G[4,e,i,j,k]*ut(i,j,k)",
        "CSE: Gw(i,j,k) = G[2,e,i,j,k]*ur(i,j,k) + G[4,e,i,j,k]*us(i,j,k) + G[5,e,i,j,k]*ut(i,j,k)",

        "lap[e,i,j,k] = "
        " sum_float32(m, D[m,i]*Gu(m,j,k))"
        "+ sum_float32(m, D[m,j]*Gv(i,m,k))"
        "+ sum_float32(m, D[m,k]*Gw(i,j,m))"
    ]
    kernel_args = [
        lp.GlobalArg("u", dtype, shape=field_shape, order=order),
        lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
        lp.GlobalArg("G", dtype, shape=(6,)+field_shape, order=order),
        lp.GlobalArg("D", dtype, shape=(n, n), order=order),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]

    # K - run-time symbolic
    kernel = lp.make_kernel(
        ctx.devices[0], domain, instructions, kernel_args,
        name="semlap", assumptions="K>=1")

    # Realize each CSE in turn, in this fixed order.
    cse_realizations = [
        ("urf", ["o1"]),
        ("usf", ["o2"]),
        ("utf", ["o3"]),
        ("Gu", ["m", "j", "k"]),
        ("Gv", ["i", "m", "k"]),
        ("Gw", ["i", "j", "m"]),
        ("ur", ["k", "j", "m"]),
        ("us", ["i", "m", "k"]),
        ("ut", ["i", "j", "m"]),
    ]
    for cse_tag, cse_inames in cse_realizations:
        kernel = lp.realize_cse(kernel, cse_tag, np.float32, cse_inames)

    # The reference is the kernel without prefetches.  A prefetching
    # reference was tried and left disabled:
    # seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
    # seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"])
    # seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]")
    reference = kernel

    kernel = lp.split_iname(kernel, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))

    kernel = lp.add_prefetch(
        kernel, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
    kernel = lp.add_prefetch(kernel, "D", ["m", "j"])
    # knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]")

    # knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    kernel = lp.tag_inames(kernel, dict(i="l.0", j="l.1"))

    schedules = lp.generate_loop_schedules(
        kernel, loop_priority=["m_fetch_G", "i_fetch_u"])
    schedules = lp.check_kernels(schedules, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(
        reference, ctx, schedules,
        op_count=K*(n*n*n*n*2*3 + n*n*n*5*3 + n**4 * 2*3)/1e9,
        op_label="GFlops",
        parameters={"K": K}, print_seq_code=True)
def test_laplacian_lmem(ctx_factory):
    """Disabled "semlap" kernel test (CSE notation); aborts on its first statement.

    The leading ``1/0`` deliberately raises ``ZeroDivisionError`` ("not
    adapted to new language"); nothing below it currently executes.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    1/0  # not adapted to new language

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n, n)

    domain = (
        "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}"
        % n)
    instructions = [
        "CSE: ur(i,j,k) = sum_float32(@o, D[i,o]*u[e,o,j,k])",
        "CSE: us(i,j,k) = sum_float32(@o, D[j,o]*u[e,i,o,k])",
        "CSE: ut(i,j,k) = sum_float32(@o, D[k,o]*u[e,i,j,o])",

        "lap[e,i,j,k] = "
        " sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
        "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
        "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
    ]
    kernel_args = [
        lp.GlobalArg("u", dtype, shape=field_shape, order=order),
        lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
        lp.GlobalArg("G", dtype, shape=(6,)+field_shape, order=order),
        lp.GlobalArg("D", dtype, shape=(n, n), order=order),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]

    # K - run-time symbolic
    kernel = lp.make_kernel(
        ctx.devices[0], domain, instructions, kernel_args,
        name="semlap", assumptions="K>=1")

    for cse_tag, cse_inames in [
            ("ur", ["k", "j", "m"]),
            ("us", ["i", "m", "k"]),
            ("ut", ["i", "j", "m"])]:
        kernel = lp.realize_cse(kernel, cse_tag, np.float32, cse_inames)

    # Switch for using a prefetching reference kernel; currently off.
    prefetch_in_reference = 0
    if prefetch_in_reference:
        reference = lp.add_prefetch(
            kernel, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
        reference = lp.add_prefetch(reference, "D", ["m", "j"])
        reference = lp.add_prefetch(
            reference, "u", ["i", "j", "k"], "u[*,i,j,k]")
    else:
        reference = kernel

    kernel = lp.split_iname(kernel, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))

    kernel = lp.add_prefetch(
        kernel, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
    kernel = lp.add_prefetch(kernel, "D", ["m", "j"])
    kernel = lp.add_prefetch(kernel, "u", ["i", "j", "k"], "u[*,i,j,k]")

    # knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    kernel = lp.tag_inames(kernel, dict(i="l.0", j="l.1"))

    schedules = lp.generate_loop_schedules(kernel)
    schedules = lp.check_kernels(schedules, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(
        reference, ctx, schedules,
        op_count=K*(n*n*n*n*2*3 + n*n*n*5*3 + n**4 * 2*3)/1e9,
        op_label="GFlops",
        parameters={"K": K}, print_seq_code=True)
def test_advect_dealias(ctx_factory):
    """Unfinished "sem_advect" (dealiased) kernel test; aborts on its first statement.

    The leading ``1 / 0`` deliberately raises ``ZeroDivisionError`` because
    the test is marked "not ready"; nothing below it currently executes.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    1 / 0  # not ready

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8
    M = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (N, N, N, K_sym)
    interim_field_shape = (M, M, M, K_sym)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic
    # NOTE(review): the domain string uses upper-case "AND", has no connective
    # before "0<=e<K", and bounds "o" without listing it among the inames --
    # confirm it parses before enabling the test.
    # NOTE(review): three instructions below start with "<SE:" where their
    # siblings use "CSE:" -- possibly a typo; left unchanged.
    knl = lp.make_kernel(
        ctx.devices[0],
        # Both %d placeholders are filled from one tuple; the original
        # "% M % N[...]" formatted twice and subscripted N with the
        # instruction list because the separating comma was missing.
        "[K] -> {[i,ip,j,jp,k,kp,m,e]: 0<=i,j,k,m<%d AND 0<=o,ip,jp,kp<%d 0<=e<K}"
        % (M, N),
        [

            # interpolate u to integration nodes
            "CSE: u0[i,jp,kp,e] = sum_float32(@o, I[i,o]*u[o,jp,kp,e])",
            "CSE: u1[i,j,kp,e] = sum_float32(@o, I[j,o]*u0[i,o,kp,e])",
            "CSE: Iu[i,j,k,e] = sum_float32(@o, I[k,o]*u1[i,j,o,e])",

            # differentiate u on integration nodes
            "CSE: Iur[i,j,k,e] = sum_float32(@m, D[i,m]*Iu[m,j,k,e])",
            "CSE: Ius[i,j,k,e] = sum_float32(@m, D[j,m]*Iu[i,m,k,e])",
            "CSE: Iut[i,j,k,e] = sum_float32(@m, D[k,m]*Iu[i,j,m,e])",

            # interpolate v to integration nodes
            "CSE: v0[i,jp,kp,e] = sum_float32(@o, I[i,o]*v[o,jp,kp,e])",
            "CSE: v1[i,j,kp,e] = sum_float32(@o, I[j,o]*v0[i,o,kp,e])",
            "CSE: Iv[i,j,k,e] = sum_float32(@o, I[k,o]*v1[i,j,o,e])",

            # differentiate v on integration nodes
            "CSE: Ivr[i,j,k,e] = sum_float32(@m, D[i,m]*Iv[m,j,k,e])",
            "CSE: Ivs[i,j,k,e] = sum_float32(@m, D[j,m]*Iv[i,m,k,e])",
            "CSE: Ivt[i,j,k,e] = sum_float32(@m, D[k,m]*Iv[i,j,m,e])",

            # interpolate w to integration nodes
            "CSE: w0[i,jp,kp,e] = sum_float32(@o, I[i,o]*w[o,jp,kp,e])",
            "CSE: w1[i,j,kp,e] = sum_float32(@o, I[j,o]*w0[i,o,kp,e])",
            "CSE: Iw[i,j,k,e] = sum_float32(@o, I[k,o]*w1[i,j,o,e])",

            # differentiate v on integration nodes
            "CSE: Iwr[i,j,k,e] = sum_float32(@m, D[i,m]*Iw[m,j,k,e])",
            "CSE: Iws[i,j,k,e] = sum_float32(@m, D[j,m]*Iw[i,m,k,e])",
            "CSE: Iwt[i,j,k,e] = sum_float32(@m, D[k,m]*Iw[i,j,m,e])",

            # find velocity in (r,s,t) coordinates
            # QUESTION: should I use CSE here ?
            "CSE: Vr[i,j,k,e] = G[i,j,k,0,e]*Iu[i,j,k,e] + G[i,j,k,1,e]*Iv[i,j,k,e] + G[i,j,k,2,e]*Iw[i,j,k,e]",
            "CSE: Vs[i,j,k,e] = G[i,j,k,3,e]*Iu[i,j,k,e] + G[i,j,k,4,e]*Iv[i,j,k,e] + G[i,j,k,5,e]*Iw[i,j,k,e]",
            "CSE: Vt[i,j,k,e] = G[i,j,k,6,e]*Iu[i,j,k,e] + G[i,j,k,7,e]*Iv[i,j,k,e] + G[i,j,k,8,e]*Iw[i,j,k,e]",

            # form nonlinear term on integration nodes
            # QUESTION: should I use CSE here ?
            "<SE: Nu[i,j,k,e] = Vr[i,j,k,e]*Iur[i,j,k,e]+Vs[i,j,k,e]*Ius[i,j,k,e]+Vt[i,j,k,e]*Iut[i,j,k,e]",
            "<SE: Nv[i,j,k,e] = Vr[i,j,k,e]*Ivr[i,j,k,e]+Vs[i,j,k,e]*Ivs[i,j,k,e]+Vt[i,j,k,e]*Ivt[i,j,k,e]",
            "<SE: Nw[i,j,k,e] = Vr[i,j,k,e]*Iwr[i,j,k,e]+Vs[i,j,k,e]*Iws[i,j,k,e]+Vt[i,j,k,e]*Iwt[i,j,k,e]",

            # L2 project Nu back to Lagrange basis
            "CSE: Nu2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nu[m,j,k,e])",
            "CSE: Nu1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nu2[ip,m,k,e])",
            "INu[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nu1[ip,jp,m,e])",

            # L2 project Nv back to Lagrange basis
            "CSE: Nv2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nv[m,j,k,e])",
            "CSE: Nv1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nv2[ip,m,k,e])",
            "INv[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nv1[ip,jp,m,e])",

            # L2 project Nw back to Lagrange basis
            "CSE: Nw2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nw[m,j,k,e])",
            "CSE: Nw1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nw2[ip,m,k,e])",
            "INw[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nw1[ip,jp,m,e])",
        ],
        [
            lp.GlobalArg("u", dtype, shape=field_shape, order=order),
            lp.GlobalArg("v", dtype, shape=field_shape, order=order),
            lp.GlobalArg("w", dtype, shape=field_shape, order=order),
            lp.GlobalArg("INu", dtype, shape=field_shape, order=order),
            lp.GlobalArg("INv", dtype, shape=field_shape, order=order),
            lp.GlobalArg("INw", dtype, shape=field_shape, order=order),
            lp.GlobalArg("D", dtype, shape=(M, M), order=order),
            lp.GlobalArg("I", dtype, shape=(M, N), order=order),
            lp.GlobalArg("V", dtype, shape=(N, M), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
        ],
        name="sem_advect",
        assumptions="K>=1")

    print(knl)
    1 / 0

    # Reference kernel for the comparison below (was previously undefined).
    seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    print(knl)
    # 1/0

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)

    K = 1000
    lp.auto_test_vs_ref(
        seq_knl,
        ctx,
        kernel_gen,
        op_count=0,
        op_label="GFlops",
        parameters={"K": K},
        print_seq_code=True,
    )
def test_advect(ctx_factory):
    """Unfinished "sem_advect" kernel test; aborts on its first statement.

    The leading ``1 / 0`` deliberately raises ``ZeroDivisionError`` because
    the test is marked "not ready"; nothing below it currently executes.

    :arg ctx_factory: zero-argument callable returning the context to use.
    """
    1 / 0  # not ready

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, N, N, N)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic

    # A. updated for CSE: notation.
    # B. fixed temp indexing and C ordering
    # load: 3+9 fields + 1/N D entry
    # store: 3 fields
    # perform: N*2*6 + 3*5 + 3*5 flops
    # ratio: (12*N+30)/15 flops per 4 bytes on bus
    # ~ 8.4 FLOPS per 4 bytes at N=8
    # ~ 300 GFLOPS max on a 150GB/s device at N=8 if done perfectly

    domain = "[K] -> {[i,j,k,m,e]: 0<=i,j,k,m<%d AND 0<=e<K}" % N

    differentiate_u = [
        "CSE: ur(i,j,k) = sum_float32(@m, D[i,m]*u[e,m,j,k])",
        "CSE: us(i,j,k) = sum_float32(@m, D[j,m]*u[e,i,m,k])",
        "CSE: ut(i,j,k) = sum_float32(@m, D[k,m]*u[e,i,j,m])",
    ]
    differentiate_v = [
        "CSE: vr(i,j,k) = sum_float32(@m, D[i,m]*v[e,m,j,k])",
        "CSE: vs(i,j,k) = sum_float32(@m, D[j,m]*v[e,i,m,k])",
        "CSE: vt(i,j,k) = sum_float32(@m, D[k,m]*v[e,i,j,m])",
    ]
    differentiate_w = [
        "CSE: wr(i,j,k) = sum_float32(@m, D[i,m]*w[e,m,j,k])",
        "CSE: ws(i,j,k) = sum_float32(@m, D[j,m]*w[e,i,m,k])",
        "CSE: wt(i,j,k) = sum_float32(@m, D[k,m]*w[e,i,j,m])",
    ]
    # find velocity in (r,s,t) coordinates
    # CSE?
    velocity_rst = [
        "CSE: Vr(i,j,k) = G[0,e,i,j,k]*u[e,i,j,k] + G[1,e,i,j,k]*v[e,i,j,k] + G[2,e,i,j,k]*w[e,i,j,k]",
        "CSE: Vs(i,j,k) = G[3,e,i,j,k]*u[e,i,j,k] + G[4,e,i,j,k]*v[e,i,j,k] + G[5,e,i,j,k]*w[e,i,j,k]",
        "CSE: Vt(i,j,k) = G[6,e,i,j,k]*u[e,i,j,k] + G[7,e,i,j,k]*v[e,i,j,k] + G[8,e,i,j,k]*w[e,i,j,k]",
    ]
    # form nonlinear term on integration nodes
    nonlinear_terms = [
        "Nu[e,i,j,k] = Vr(i,j,k)*ur(i,j,k)+Vs(i,j,k)*us(i,j,k)+Vt(i,j,k)*ut(i,j,k)",
        "Nv[e,i,j,k] = Vr(i,j,k)*vr(i,j,k)+Vs(i,j,k)*vs(i,j,k)+Vt(i,j,k)*vt(i,j,k)",
        "Nw[e,i,j,k] = Vr(i,j,k)*wr(i,j,k)+Vs(i,j,k)*ws(i,j,k)+Vt(i,j,k)*wt(i,j,k)",
    ]
    instructions = (
        differentiate_u + differentiate_v + differentiate_w
        + velocity_rst + nonlinear_terms)

    kernel_args = [
        lp.GlobalArg("u", dtype, shape=field_shape, order=order),
        lp.GlobalArg("v", dtype, shape=field_shape, order=order),
        lp.GlobalArg("w", dtype, shape=field_shape, order=order),
        lp.GlobalArg("Nu", dtype, shape=field_shape, order=order),
        lp.GlobalArg("Nv", dtype, shape=field_shape, order=order),
        lp.GlobalArg("Nw", dtype, shape=field_shape, order=order),
        lp.GlobalArg("G", dtype, shape=(9,) + field_shape, order=order),
        lp.GlobalArg("D", dtype, shape=(N, N), order=order),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]

    kernel = lp.make_kernel(
        ctx.devices[0], domain, instructions, kernel_args,
        name="sem_advect", assumptions="K>=1")

    print(kernel)
    1 / 0

    reference = kernel

    kernel = lp.split_iname(kernel, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))

    kernel = lp.tag_inames(kernel, dict(i="l.0", j="l.1"))

    schedules = lp.generate_loop_schedules(kernel)
    schedules = lp.check_kernels(schedules, dict(K=1000), kill_level_min=5)

    K = 1000
    lp.auto_test_vs_ref(
        reference, ctx, schedules,
        op_count=0,
        op_label="GFlops",
        parameters={"K": K},
        print_seq_code=True)
def test_laplacian_lmem(ctx_factory):
    """Check a prefetch/precompute variant of the 3D SEM Laplacian kernel.

    The untransformed kernel is kept as the reference handed to
    ``lp.auto_test_vs_ref``; the transformed kernel is scheduled, passed
    through ``lp.check_kernels`` and compared against it with ``K = 1000``.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"
    n = 4

    from pymbolic import var
    # K - run-time symbolic
    field_shape = (var("K"), n, n, n)

    device = ctx.devices[0]
    domain = (
        "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}"
        % n)
    instructions = [
        "ur(a,b,c) := sum_float32(@o, D[a,o]*u[e,o,b,c])",
        "us(a,b,c) := sum_float32(@o, D[b,o]*u[e,a,o,c])",
        "ut(a,b,c) := sum_float32(@o, D[c,o]*u[e,a,b,o])",

        "lap[e,i,j,k] = "
        " sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
        "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
        "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
    ]
    kernel_args = [
        lp.ArrayArg("u", dtype, shape=field_shape, order=order),
        lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
        lp.ArrayArg("G", dtype, shape=(6,)+field_shape, order=order),
        lp.ArrayArg("D", dtype, shape=(n, n), order=order),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]
    knl = lp.make_kernel(
        device, domain, instructions, kernel_args,
        name="semlap", assumptions="K>=1")

    # Reference kernel: captured before any transformation is applied.
    seq_knl = knl

    use_original_variant = True
    if use_original_variant:
        # original
        knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"])
        for rule_name in ("ur", "us", "ut"):
            knl = lp.precompute(knl, rule_name, np.float32, ["a", "b", "c"])
        knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
    else:
        # experiment
        # knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"])
        for rule_name in ("eu", "ur", "us", "ut"):
            knl = lp.precompute(knl, rule_name, np.float32, ["b", "c"])
        knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  # , slabs=(0, 1))
    # Both variants finish by prefetching D over the same inames.
    knl = lp.add_prefetch(knl, "D", ["m", "j", "k", "i"])

    # Disabled experiments:
    # knl = lp.add_prefetch(knl, "G", [2,3,4])  # axis/argument indices on G
    # knl = lp.add_prefetch(knl, "G", ["i", "j", "m", "k"])
    # knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")
    # knl = lp.join_dimensions(knl, ["i", "j"], "i_and_j")

    # TW: turned this off since it generated:
    # ValueError: cannot tag 'i_and_j'--not known
    # knl = lp.tag_inames(knl, dict(i_and_j="l.0", k="l.1"))

    K = 1000
    kernel_gen = lp.check_kernels(lp.generate_loop_schedules(knl), {"K": K})

    flop_terms = n*n*n*n*2*3 + n*n*n*5*3 + n**4 * 2*3
    lp.auto_test_vs_ref(
        seq_knl, ctx, kernel_gen,
        op_count=K*flop_terms/1e9,
        op_label="GFlops",
        parameters={"K": K})
def test_advect_dealias(ctx_factory):
    """Build the dealiased SEM advection kernel (currently disabled).

    The test is marked "not ready": the ``1/0`` on the first line raises
    :class:`ZeroDivisionError` before anything else runs, and a second one
    follows the ``print`` of the freshly built kernel.

    The code behind those guards has been repaired so that it is at least
    well-formed once the guards are removed: the domain string is formatted
    with a tuple, the iname ``o`` is declared, the ``<SE:`` prefixes are
    spelled ``CSE:``, and ``seq_knl`` is defined before it is used.
    """
    1/0  # not ready

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8
    M = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (N, N, N, K_sym)
    # NOTE(review): unused below; presumably the shape of the intermediate
    # fields on the M integration nodes -- confirm before removing.
    interim_field_shape = (M, M, M, K_sym)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            # i,j,k,m take the first bound (M), o,ip,jp,kp the second (N).
            # NOTE(review): the enabled tests in this file spell the
            # conjunction "and"; confirm that "AND" is accepted here.
            "[K] -> {[i,ip,j,jp,k,kp,m,o,e]: 0<=i,j,k,m<%d AND "
            "0<=o,ip,jp,kp<%d AND 0<=e<K}" % (M, N),
            [

                # interpolate u to integration nodes
                "CSE: u0[i,jp,kp,e] = sum_float32(@o, I[i,o]*u[o,jp,kp,e])",
                "CSE: u1[i,j,kp,e] = sum_float32(@o, I[j,o]*u0[i,o,kp,e])",
                "CSE: Iu[i,j,k,e] = sum_float32(@o, I[k,o]*u1[i,j,o,e])",

                # differentiate u on integration nodes
                "CSE: Iur[i,j,k,e] = sum_float32(@m, D[i,m]*Iu[m,j,k,e])",
                "CSE: Ius[i,j,k,e] = sum_float32(@m, D[j,m]*Iu[i,m,k,e])",
                "CSE: Iut[i,j,k,e] = sum_float32(@m, D[k,m]*Iu[i,j,m,e])",

                # interpolate v to integration nodes
                "CSE: v0[i,jp,kp,e] = sum_float32(@o, I[i,o]*v[o,jp,kp,e])",
                "CSE: v1[i,j,kp,e] = sum_float32(@o, I[j,o]*v0[i,o,kp,e])",
                "CSE: Iv[i,j,k,e] = sum_float32(@o, I[k,o]*v1[i,j,o,e])",

                # differentiate v on integration nodes
                "CSE: Ivr[i,j,k,e] = sum_float32(@m, D[i,m]*Iv[m,j,k,e])",
                "CSE: Ivs[i,j,k,e] = sum_float32(@m, D[j,m]*Iv[i,m,k,e])",
                "CSE: Ivt[i,j,k,e] = sum_float32(@m, D[k,m]*Iv[i,j,m,e])",

                # interpolate w to integration nodes
                "CSE: w0[i,jp,kp,e] = sum_float32(@o, I[i,o]*w[o,jp,kp,e])",
                "CSE: w1[i,j,kp,e] = sum_float32(@o, I[j,o]*w0[i,o,kp,e])",
                "CSE: Iw[i,j,k,e] = sum_float32(@o, I[k,o]*w1[i,j,o,e])",

                # differentiate w on integration nodes
                "CSE: Iwr[i,j,k,e] = sum_float32(@m, D[i,m]*Iw[m,j,k,e])",
                "CSE: Iws[i,j,k,e] = sum_float32(@m, D[j,m]*Iw[i,m,k,e])",
                "CSE: Iwt[i,j,k,e] = sum_float32(@m, D[k,m]*Iw[i,j,m,e])",

                # find velocity in (r,s,t) coordinates
                # QUESTION: should I use CSE here ?
                "CSE: Vr[i,j,k,e] = G[i,j,k,0,e]*Iu[i,j,k,e] + G[i,j,k,1,e]*Iv[i,j,k,e] + G[i,j,k,2,e]*Iw[i,j,k,e]",
                "CSE: Vs[i,j,k,e] = G[i,j,k,3,e]*Iu[i,j,k,e] + G[i,j,k,4,e]*Iv[i,j,k,e] + G[i,j,k,5,e]*Iw[i,j,k,e]",
                "CSE: Vt[i,j,k,e] = G[i,j,k,6,e]*Iu[i,j,k,e] + G[i,j,k,7,e]*Iv[i,j,k,e] + G[i,j,k,8,e]*Iw[i,j,k,e]",

                # form nonlinear term on integration nodes
                # QUESTION: should I use CSE here ?
                # (These three lines used to start with "<SE:", a typo; Nu,
                # Nv and Nw are consumed by the projections below.)
                "CSE: Nu[i,j,k,e] = Vr[i,j,k,e]*Iur[i,j,k,e]+Vs[i,j,k,e]*Ius[i,j,k,e]+Vt[i,j,k,e]*Iut[i,j,k,e]",
                "CSE: Nv[i,j,k,e] = Vr[i,j,k,e]*Ivr[i,j,k,e]+Vs[i,j,k,e]*Ivs[i,j,k,e]+Vt[i,j,k,e]*Ivt[i,j,k,e]",
                "CSE: Nw[i,j,k,e] = Vr[i,j,k,e]*Iwr[i,j,k,e]+Vs[i,j,k,e]*Iws[i,j,k,e]+Vt[i,j,k,e]*Iwt[i,j,k,e]",

                # L2 project Nu back to Lagrange basis
                "CSE: Nu2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nu[m,j,k,e])",
                "CSE: Nu1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nu2[ip,m,k,e])",
                "INu[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nu1[ip,jp,m,e])",

                # L2 project Nv back to Lagrange basis
                "CSE: Nv2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nv[m,j,k,e])",
                "CSE: Nv1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nv2[ip,m,k,e])",
                "INv[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nv1[ip,jp,m,e])",

                # L2 project Nw back to Lagrange basis
                "CSE: Nw2[ip,j,k,e] = sum_float32(@m, V[ip,m]*Nw[m,j,k,e])",
                "CSE: Nw1[ip,jp,k,e] = sum_float32(@m, V[jp,m]*Nw2[ip,m,k,e])",
                "INw[ip,jp,kp,e] = sum_float32(@m, V[kp,m]*Nw1[ip,jp,m,e])",

                ],
            [
            lp.GlobalArg("u", dtype, shape=field_shape, order=order),
            lp.GlobalArg("v", dtype, shape=field_shape, order=order),
            lp.GlobalArg("w", dtype, shape=field_shape, order=order),
            lp.GlobalArg("INu", dtype, shape=field_shape, order=order),
            lp.GlobalArg("INv", dtype, shape=field_shape, order=order),
            lp.GlobalArg("INw", dtype, shape=field_shape, order=order),
            # NOTE(review): the instructions read G[i,j,k,0..8,e], but no
            # "G" argument is declared here -- add one when enabling this.
            lp.GlobalArg("D", dtype, shape=(M, M), order=order),
            lp.GlobalArg("I", dtype, shape=(M, N), order=order),
            lp.GlobalArg("V", dtype, shape=(N, M), order=order),
            lp.ValueArg("K", np.int32, approximately=1000),
            ],
            name="sem_advect", assumptions="K>=1")

    print(knl)
    1/0

    # Reference kernel for the comparison; it was referenced below without
    # ever being assigned.
    seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    print(knl)
    #1/0

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=0,
            op_label="GFlops",
            parameters={"K": K}, print_seq_code=True,)
def test_laplacian_lmem(ctx_factory):
    """Check a prefetch/precompute variant of the 3D SEM Laplacian kernel.

    Every prefetch and precompute is requested with
    ``default_tag="l.auto"``.  The untransformed kernel is the reference
    handed to ``lp.auto_test_vs_ref``; the transformed kernel is scheduled,
    passed through ``lp.check_kernels`` and compared with ``K = 1000``.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"
    n = 4

    from pymbolic import var
    # K - run-time symbolic
    field_shape = (var("K"), n, n, n)

    device = ctx.devices[0]
    domain = (
        "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}"
        % n)
    instructions = [
        "ur(a,b,c) := sum_float32(@o, D[a,o]*u[e,o,b,c])",
        "us(a,b,c) := sum_float32(@o, D[b,o]*u[e,a,o,c])",
        "ut(a,b,c) := sum_float32(@o, D[c,o]*u[e,a,b,o])",

        "lap[e,i,j,k] = "
        " sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
        "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
        "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
    ]
    kernel_args = [
        lp.ArrayArg("u", dtype, shape=field_shape, order=order),
        lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
        lp.ArrayArg("G", dtype, shape=(6, ) + field_shape, order=order),
        lp.ArrayArg("D", dtype, shape=(n, n), order=order),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]
    knl = lp.make_kernel(
        device, domain, instructions, kernel_args,
        name="semlap", assumptions="K>=1")

    # Reference kernel: captured before any transformation is applied.
    seq_knl = knl

    use_original_variant = True
    if use_original_variant:
        # original
        knl = lp.add_prefetch(
            knl, "u", ["i", "j", "k", "o"], default_tag="l.auto")
        for rule_name in ("ur", "us", "ut"):
            knl = lp.precompute(
                knl, rule_name, np.float32, ["a", "b", "c"],
                default_tag="l.auto")
        knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
    else:
        # experiment
        # knl = lp.add_prefetch(knl, "u", ["i", "j", "k", "o"],
        #         default_tag="l.auto")
        for rule_name in ("eu", "ur", "us", "ut"):
            knl = lp.precompute(
                knl, rule_name, np.float32, ["b", "c"],
                default_tag="l.auto")
        knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")  # , slabs=(0, 1))
    # Both variants finish by prefetching D over the same inames.
    knl = lp.add_prefetch(
        knl, "D", ["m", "j", "k", "i"], default_tag="l.auto")

    # Disabled experiments:
    # knl = lp.add_prefetch(knl, "G", [2,3,4], default_tag="l.auto")
    #         (axis/argument indices on G)
    # knl = lp.add_prefetch(knl, "G", ["i", "j", "m", "k"],
    #         default_tag="l.auto")
    # knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")
    # knl = lp.join_dimensions(knl, ["i", "j"], "i_and_j")

    # TW: turned this off since it generated:
    # ValueError: cannot tag 'i_and_j'--not known
    # knl = lp.tag_inames(knl, dict(i_and_j="l.0", k="l.1"))

    K = 1000
    kernel_gen = lp.check_kernels(lp.generate_loop_schedules(knl), {"K": K})

    flop_terms = n * n * n * n * 2 * 3 + n * n * n * 5 * 3 + n**4 * 2 * 3
    lp.auto_test_vs_ref(
        seq_knl, ctx, kernel_gen,
        op_count=K * flop_terms / 1e9,
        op_label="GFlops",
        parameters={"K": K})
def auto_test_vs_ref(
        ref_knl, ctx, test_knl=None, op_count=None, op_label=None,
        parameters=None,
        print_ref_code=False, print_code=True, warmup_rounds=2,
        dump_binary=False,
        fills_entire_output=None, do_check=True, check_result=None,
        max_test_kernel_count=1,
        quiet=False, blacklist_ref_vendors=None):
    """Compare results of *ref_knl* to the kernels generated by
    scheduling *test_knl*.

    :arg ref_knl: the reference kernel. It is preprocessed, scheduled and
        run on the first device from ``_enumerate_cl_devices_for_ref_test``
        for which the reference arguments can be created.
    :arg ctx: the context in which the test kernels are compiled and run.
    :arg test_knl: the kernel to test. If *None*, *ref_knl* is used and
        result checking is switched off.
    :arg op_count: a list of operation counts, paired element-wise with
        *op_label* to report rates. A single number is accepted with a
        warning.
    :arg op_label: a list of labels matching *op_count*. A single string
        is accepted with a warning.
    :arg parameters: passed on to ``make_ref_args`` and ``make_args``.
    :arg print_ref_code: print the reference code (unless *quiet*).
    :arg print_code: print the code of each test kernel (unless *quiet*).
    :arg warmup_rounds: number of untimed runs per test kernel. The timing
        loop starts with the same number of rounds (at least one) and
        quadruples it until one attempt takes at least 0.3 s of wall time.
    :arg dump_binary: print the type of the compiled program and its first
        binary (unless *quiet*).
    :arg fills_entire_output: deprecated; passing anything other than
        *None* only emits a :class:`DeprecationWarning`.
    :arg do_check: whether to run the reference kernel and compare results.
    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a tuple (:class:`bool`,
        message) indicating correctness/acceptability of the result
    :arg max_test_kernel_count: Stop testing after this many *test_knl*
    :arg quiet: suppress all printed output.
    :arg blacklist_ref_vendors: passed on to
        ``_enumerate_cl_devices_for_ref_test``.

    :returns: a :class:`dict` with the keys ``elapsed_event``,
        ``elapsed_event_marker``, ``elapsed_wall`` and ``timing_rounds``
        (from the last test kernel that was run) and, if checking is
        enabled, ``ref_elapsed_event`` and ``ref_elapsed_wall``.

    :raises LoopyError: if the argument lists of *ref_knl* and *test_knl*
        disagree, or if no device could run the reference computation.
    :raises AutomaticTestFailure: if *check_result* rejects a result.
    """

    import traceback
    from time import time

    import pyopencl as cl

    # Fresh containers per call instead of shared mutable defaults.
    if op_count is None:
        op_count = []
    if op_label is None:
        op_label = []
    if parameters is None:
        parameters = {}
    if blacklist_ref_vendors is None:
        blacklist_ref_vendors = []

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                "of arguments")

    for arg_idx, (ref_arg, test_arg) in enumerate(
            zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name or ref_arg.dtype != test_arg.dtype:
            raise LoopyError(
                    "ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (arg_idx+1))

    from loopy.compiled import CompiledKernel, get_highlighted_cl_code
    from loopy.kernel import kernel_state
    from loopy.preprocess import infer_unknown_types

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated", DeprecationWarning,
                stacklevel=2)

    # {{{ compile and run reference code

    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False
    ref_errors = []

    for dev in _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors):
        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(ref_ctx,
                properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        # Only the first generated schedule is used as the reference.
        for ref_sched_kernel in lp.generate_loop_schedules(pp_ref_knl):
            break

        logger.info("%s (ref): trying %s for the reference calculation",
                ref_knl.name, dev)

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75*"-")
            print("Reference Code:")
            print(75*"-")
            print(get_highlighted_cl_code(ref_compiled.code))
            print(75*"-")

        ref_cl_kernel_info = ref_compiled.cl_kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = make_ref_args(
                    ref_sched_kernel,
                    ref_cl_kernel_info.implemented_data_info,
                    ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code != cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                raise

            # This device cannot hold the arguments; remember why and move
            # on to the next candidate.
            ref_errors.append("\n".join([
                75*"-",
                "On %s:" % dev,
                75*"-",
                traceback.format_exc(),
                75*"-"]))
            continue

        found_ref_device = True

        if not do_check:
            break

        ref_queue.finish()

        logger.info("%s (ref): using %s for the reference calculation",
                ref_knl.name, dev)
        logger.info("%s (ref): run", ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop-ref_start

        logger.info("%s (ref): run done", ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                "reference computation.\n"
                "These errors were encountered:\n"+"\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    args = None
    if test_knl.state not in [
            kernel_state.PREPROCESSED,
            kernel_state.SCHEDULED]:
        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    for kernel_idx, kernel in enumerate(test_kernels):
        if kernel_idx >= max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        # The argument set is built once and shared by all test kernels.
        if args is None:
            cl_kernel_info = compiled.cl_kernel_info(frozenset())

            args = make_args(kernel,
                    cl_kernel_info.implemented_data_info,
                    queue, ref_arg_data, parameters)
        args["out_host"] = False

        if not quiet:
            print(75*"-")
            print("Kernel #%d:" % kernel_idx)
            print(75*"-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75*"-")
            if dump_binary:
                print(type(compiled.cl_program))
                print(compiled.cl_program.binaries[0])
                print(75*"-")

        # Log the kernel under test rather than the loop variable left
        # over from the reference section.
        logger.info("%s: run warmup", kernel.name)

        for _ in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                from pyopencl.compyte.array import as_strided

                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    ref_ary = as_strided(
                            arg_desc.ref_storage_array.get(),
                            shape=arg_desc.ref_shape,
                            strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                            arg_desc.test_storage_array.get(),
                            shape=arg_desc.test_shape,
                            strides=arg_desc.test_numpy_strides).flatten()
                    # Compare only the part both flattened arrays share.
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                # One comparison pass is enough.
                need_check = False

        queue.finish()

        logger.info("%s: warmup done", kernel.name)
        logger.info("%s: timing run", kernel.name)

        # At least one round, so warmup_rounds=0 still yields a timing.
        timing_rounds = max(warmup_rounds, 1)

        while True:
            # Start each attempt with an empty event list; otherwise the
            # event-based time would span earlier, shorter attempts while
            # being divided by the current round count only.
            events = []

            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for _ in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _ = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9*evt_end.profile.START
                            - 1e-9*evt_start.profile.START)
                        / timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time-start_time)/timing_rounds

            # Repeat with more rounds until an attempt lasts >= 0.3 s.
            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done", kernel.name)

        rates = "".join(
                " %g %s/s" % (cnt/elapsed_wall, lbl)
                for cnt, lbl in zip(op_count, op_label))

        if not quiet:
            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                    "(%d rounds)%s" % (
                        format_float_or_none(elapsed_event),
                        format_float_or_none(elapsed_event_marker),
                        format_float_or_none(elapsed_wall), timing_rounds,
                        rates))

        if do_check:
            ref_rates = "".join(
                    " %g %s/s" % (cnt/ref_elapsed_event, lbl)
                    for cnt, lbl in zip(op_count, op_label))
            if not quiet:
                print("ref: elapsed: %g s event, %g s wall%s" % (
                        ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict
def test_laplacian_lmem_ilp(ctx_factory):
    """Generate code for an ILP variant of the 3D SEM Laplacian kernel.

    This does not lead to practical/runnable code (out of lmem), but it's
    an excellent stress test for the code generator. :)  The generated code
    of every checked schedule is printed; nothing is executed.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"
    n = 8

    from pymbolic import var
    # K - run-time symbolic
    field_shape = (var("K"), n, n, n)

    device = ctx.devices[0]
    domain = "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K }" % n
    instructions = [
        "ur(i,j,k) := sum_float32(@o, D[i,o]*u[e,o,j,k])",
        "us(i,j,k) := sum_float32(@o, D[j,o]*u[e,i,o,k])",
        "ut(i,j,k) := sum_float32(@o, D[k,o]*u[e,i,j,o])",

        "lap[e,i,j,k] = "
        " sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
        "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
        "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
    ]
    kernel_args = [
        lp.GlobalArg("u", dtype, shape=field_shape, order=order),
        lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
        lp.GlobalArg("G", dtype, shape=(6,)+field_shape, order=order),
        lp.GlobalArg("D", dtype, shape=(n, n), order=order),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]
    knl = lp.make_kernel(
        device, domain, instructions, kernel_args,
        name="semlap", assumptions="K>=1")

    # Must act on u first, otherwise stencil becomes crooked and
    # footprint becomes non-convex.

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
    knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"])

    for rule_name in ("ur", "us", "ut"):
        # A fresh sweep list for every call, as in three separate calls.
        knl = lp.precompute(
            knl, rule_name, np.float32, [0, 1, 2, "e_inner_inner"])

    knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", "e_inner_inner"])
    knl = lp.add_prefetch(knl, "D", ["m", "j"])

    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})

    checked_kernels = lp.check_kernels(
        lp.generate_loop_schedules(knl), {"K": 1000})

    for scheduled_knl in checked_kernels:
        print(lp.generate_code(scheduled_knl))
def test_laplacian_lmem(ctx_factory):
    """Set up the CSE-notation SEM Laplacian kernel (currently disabled).

    The ``1 / 0`` on the first line raises :class:`ZeroDivisionError`
    right away; the code was "not adapted to new language".
    """
    1 / 0  # not adapted to new language

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"
    n = 8

    from pymbolic import var
    # K - run-time symbolic
    field_shape = (var("K"), n, n, n)

    device = ctx.devices[0]
    domain = (
        "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K and 0<=gi<6}"
        % n)
    instructions = [
        "CSE: ur(i,j,k) = sum_float32(@o, D[i,o]*u[e,o,j,k])",
        "CSE: us(i,j,k) = sum_float32(@o, D[j,o]*u[e,i,o,k])",
        "CSE: ut(i,j,k) = sum_float32(@o, D[k,o]*u[e,i,j,o])",

        "lap[e,i,j,k] = "
        " sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
        "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
        "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
    ]
    kernel_args = [
        lp.GlobalArg("u", dtype, shape=field_shape, order=order),
        lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
        lp.GlobalArg("G", dtype, shape=(6, ) + field_shape, order=order),
        lp.GlobalArg("D", dtype, shape=(n, n), order=order),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]
    knl = lp.make_kernel(
        device, domain, instructions, kernel_args,
        name="semlap", assumptions="K>=1")

    # (CSE tag, inames) pairs, realized in this order.
    cse_realizations = (
        ("ur", ["k", "j", "m"]),
        ("us", ["i", "m", "k"]),
        ("ut", ["i", "j", "m"]),
    )
    for cse_tag, cse_inames in cse_realizations:
        knl = lp.realize_cse(knl, cse_tag, np.float32, cse_inames)

    prefetch_in_reference = False
    if prefetch_in_reference:
        seq_knl = lp.add_prefetch(
            knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
        seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"])
        seq_knl = lp.add_prefetch(
            seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]")
    else:
        seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
    knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
    knl = lp.add_prefetch(knl, "D", ["m", "j"])
    knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]")

    # knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})

    K = 1000
    kernel_gen = lp.check_kernels(lp.generate_loop_schedules(knl), {"K": K})

    flop_terms = n * n * n * n * 2 * 3 + n * n * n * 5 * 3 + n**4 * 2 * 3
    lp.auto_test_vs_ref(
        seq_knl, ctx, kernel_gen,
        op_count=K * flop_terms / 1e9,
        op_label="GFlops",
        parameters={"K": K},
        print_seq_code=True)
def test_advect(ctx_factory):
    """Set up the SEM advection kernel and compare it with a sequential run.

    The test is still disabled: the ``1/0`` on the first line raises
    :class:`ZeroDivisionError` right away, and a second one follows the
    ``print`` of the freshly built kernel.
    """
    1/0  # not ready

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"
    N = 8

    from pymbolic import var
    # K - run-time symbolic
    field_shape = (var("K"), N, N, N)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal
    # 3. transform back (direction-by-direction)

    # A. updated for CSE: notation.
    # B. fixed temp indexing and C ordering
    # load:     3+9 fields + 1/N D entry
    # store:    3 fields
    # perform:  N*2*6 + 3*5 + 3*5 flops
    # ratio:    (12*N+30)/15 flops per 4 bytes on bus
    #           ~ 8.4 FLOPS per 4 bytes at N=8
    #           ~ 300 GFLOPS max on a 150GB/s device at N=8 if done perfectly

    device = ctx.devices[0]
    domain = "[K] -> {[i,j,k,m,e]: 0<=i,j,k,m<%d AND 0<=e<K}" % N

    instructions = [
        # differentiate u
        "CSE: ur(i,j,k) = sum_float32(@m, D[i,m]*u[e,m,j,k])",
        "CSE: us(i,j,k) = sum_float32(@m, D[j,m]*u[e,i,m,k])",
        "CSE: ut(i,j,k) = sum_float32(@m, D[k,m]*u[e,i,j,m])",

        # differentiate v
        "CSE: vr(i,j,k) = sum_float32(@m, D[i,m]*v[e,m,j,k])",
        "CSE: vs(i,j,k) = sum_float32(@m, D[j,m]*v[e,i,m,k])",
        "CSE: vt(i,j,k) = sum_float32(@m, D[k,m]*v[e,i,j,m])",

        # differentiate w
        "CSE: wr(i,j,k) = sum_float32(@m, D[i,m]*w[e,m,j,k])",
        "CSE: ws(i,j,k) = sum_float32(@m, D[j,m]*w[e,i,m,k])",
        "CSE: wt(i,j,k) = sum_float32(@m, D[k,m]*w[e,i,j,m])",

        # find velocity in (r,s,t) coordinates
        # CSE?
        "CSE: Vr(i,j,k) = G[0,e,i,j,k]*u[e,i,j,k] + G[1,e,i,j,k]*v[e,i,j,k] + G[2,e,i,j,k]*w[e,i,j,k]",
        "CSE: Vs(i,j,k) = G[3,e,i,j,k]*u[e,i,j,k] + G[4,e,i,j,k]*v[e,i,j,k] + G[5,e,i,j,k]*w[e,i,j,k]",
        "CSE: Vt(i,j,k) = G[6,e,i,j,k]*u[e,i,j,k] + G[7,e,i,j,k]*v[e,i,j,k] + G[8,e,i,j,k]*w[e,i,j,k]",

        # form nonlinear term on integration nodes
        "Nu[e,i,j,k] = Vr(i,j,k)*ur(i,j,k)+Vs(i,j,k)*us(i,j,k)+Vt(i,j,k)*ut(i,j,k)",
        "Nv[e,i,j,k] = Vr(i,j,k)*vr(i,j,k)+Vs(i,j,k)*vs(i,j,k)+Vt(i,j,k)*vt(i,j,k)",
        "Nw[e,i,j,k] = Vr(i,j,k)*wr(i,j,k)+Vs(i,j,k)*ws(i,j,k)+Vt(i,j,k)*wt(i,j,k)",
    ]

    # The six field arrays share dtype, shape and ordering.
    kernel_args = [
        lp.GlobalArg(field_name, dtype, shape=field_shape, order=order)
        for field_name in ("u", "v", "w", "Nu", "Nv", "Nw")
    ] + [
        lp.GlobalArg("G", dtype, shape=(9,)+field_shape, order=order),
        lp.GlobalArg("D", dtype, shape=(N, N), order=order),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]

    knl = lp.make_kernel(
        device, domain, instructions, kernel_args,
        name="sem_advect", assumptions="K>=1")

    print(knl)
    1/0

    # Keep the untransformed kernel as the reference for the comparison.
    seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})

    K = 1000
    kernel_gen = lp.check_kernels(
        lp.generate_loop_schedules(knl), {"K": K}, kill_level_min=5)

    lp.auto_test_vs_ref(
        seq_knl, ctx, kernel_gen,
        op_count=0,
        op_label="GFlops",
        parameters={"K": K},
        print_seq_code=True)
def test_laplacian(ctx_factory):
    """Set up the CSE-based SEM Laplacian kernel (currently disabled).

    The ``1 / 0`` on the first line raises :class:`ZeroDivisionError`
    right away; the code was "not adapted to new language".
    """
    1 / 0  # not adapted to new language

    dtype = np.float32
    ctx = ctx_factory()
    order = "C"
    n = 8

    from pymbolic import var
    # K - run-time symbolic
    field_shape = (var("K"), n, n, n)

    # load:     1+6 fields + 1/N D entry
    # store:    1 fields
    # perform:  N*2*6 + 3*5 flops
    # ratio:    (12*N+15)/8 flops per 4 bytes on bus
    #           ~ 14 FLOPS per 4 bytes at N=8
    #           ~ 525 GFLOPS max on a 150GB/s device at N=8 if done perfectly

    device = ctx.devices[0]
    domain = (
        "[K] -> {[i,j,k,e,m,o1,o2,o3,gi]: 0<=i,j,k,m,o1,o2,o3<%d and 0<=e<K and 0<=gi<6}"
        % n)
    instructions = [
        "CSE: ur(i,j,k) = sum_float32(o1, D[i,o1]*cse(u[e,o1,j,k], urf))",
        "CSE: us(i,j,k) = sum_float32(o2, D[j,o2]*cse(u[e,i,o2,k], usf))",
        "CSE: ut(i,j,k) = sum_float32(o3, D[k,o3]*cse(u[e,i,j,o3], utf))",

        # define function
        "CSE: Gu(i,j,k) = G[0,e,i,j,k]*ur(i,j,k) + G[1,e,i,j,k]*us(i,j,k) + G[2,e,i,j,k]*ut(i,j,k)",
        "CSE: Gv(i,j,k) = G[1,e,i,j,k]*ur(i,j,k) + G[3,e,i,j,k]*us(i,j,k) + G[4,e,i,j,k]*ut(i,j,k)",
        "CSE: Gw(i,j,k) = G[2,e,i,j,k]*ur(i,j,k) + G[4,e,i,j,k]*us(i,j,k) + G[5,e,i,j,k]*ut(i,j,k)",

        "lap[e,i,j,k] = "
        " sum_float32(m, D[m,i]*Gu(m,j,k))"
        "+ sum_float32(m, D[m,j]*Gv(i,m,k))"
        "+ sum_float32(m, D[m,k]*Gw(i,j,m))"
    ]
    kernel_args = [
        lp.GlobalArg("u", dtype, shape=field_shape, order=order),
        lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
        lp.GlobalArg("G", dtype, shape=(6, ) + field_shape, order=order),
        lp.GlobalArg("D", dtype, shape=(n, n), order=order),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]
    knl = lp.make_kernel(
        device, domain, instructions, kernel_args,
        name="semlap", assumptions="K>=1")

    # (CSE tag, inames) pairs, realized in this order.
    cse_realizations = (
        ("urf", ["o1"]),
        ("usf", ["o2"]),
        ("utf", ["o3"]),

        ("Gu", ["m", "j", "k"]),
        ("Gv", ["i", "m", "k"]),
        ("Gw", ["i", "j", "m"]),

        ("ur", ["k", "j", "m"]),
        ("us", ["i", "m", "k"]),
        ("ut", ["i", "j", "m"]),
    )
    for cse_tag, cse_inames in cse_realizations:
        knl = lp.realize_cse(knl, cse_tag, np.float32, cse_inames)

    # The reference is the kernel as it stands here.  A variant that added
    # prefetches of G, D and u to the reference is switched off:
    # seq_knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"],
    #         "G[gi,e,m,j,k]")
    # seq_knl = lp.add_prefetch(seq_knl, "D", ["m", "j"])
    # seq_knl = lp.add_prefetch(seq_knl, "u", ["i", "j", "k"], "u[*,i,j,k]")
    seq_knl = knl

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
    knl = lp.add_prefetch(knl, "G", ["gi", "m", "j", "k"], "G[gi,e,m,j,k]")
    knl = lp.add_prefetch(knl, "D", ["m", "j"])
    # knl = lp.add_prefetch(knl, "u", ["i", "j", "k"], "u[*,i,j,k]")

    # knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})

    K = 1000
    schedules = lp.generate_loop_schedules(
        knl, loop_priority=["m_fetch_G", "i_fetch_u"])
    kernel_gen = lp.check_kernels(schedules, {"K": K})

    flop_terms = n * n * n * n * 2 * 3 + n * n * n * 5 * 3 + n**4 * 2 * 3
    lp.auto_test_vs_ref(
        seq_knl, ctx, kernel_gen,
        op_count=K * flop_terms / 1e9,
        op_label="GFlops",
        parameters={"K": K},
        print_seq_code=True)
# Problem sizes used for the host-side test data below.
n = 128
r = 3

#a = np.ones((n, n), dtype = np.float32)
#b = np.ones((n, n), dtype = np.float32)
# f starts out zero-filled; a, v and w are random and converted to float32.
f = np.zeros((n, r), dtype = np.float32)
a = np.random.randn(n, n, n)
v = np.random.randn(n, r)
w = np.random.randn(n,r)
a = a.astype(np.float32)
v = v.astype(np.float32)
w = w.astype(np.float32)
# NOTE(review): `parameters` is not referenced again in the visible part of
# this fragment -- confirm whether later code uses it.
parameters = {"a" : a, "v": v, "w" : w}
# NOTE(review): CompiledKernel receives the result of
# generate_loop_schedules as-is -- confirm that is what it expects.
knl_gen = lp.generate_loop_schedules(knl)
ref_compiled = lp.CompiledKernel(ctx,knl_gen,options=[],codegen_kwargs={})
# Copy the host arrays to the device.
a1 = cl_array.to_device(queue,a)
v1 = cl_array.to_device(queue,v)
w1 = cl_array.to_device(queue,w)
f = cl_array.to_device(queue,f)
# Device arrays keyed by kernel argument name.
qq = {"a":a1,"v":v1,"w":w1,'f':f}
# NOTE(review): `evaluate` is not used in the visible part of this fragment.
from pymbolic import evaluate
# For every kernel argument, the matching device array cast to the dtype
# the kernel declares for it.
kk = {}
for arg in knl.args:
    kk[arg.name] = qq[arg.name].astype(arg.dtype)
import time
def test_laplacian_lmem_ilp(ctx_factory):
    """Generate code for an ILP variant of the 3D SEM Laplacian kernel.

    This does not lead to practical/runnable code (out of lmem), but it's
    an excellent stress test for the code generator. :)  The generated code
    of every checked schedule is printed; nothing is executed.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"
    n = 8

    from pymbolic import var
    # K - run-time symbolic
    field_shape = (var("K"), n, n, n)

    device = ctx.devices[0]
    domain = "[K] -> {[i,j,k,e,m,o,gi]: 0<=i,j,k,m,o<%d and 0<=e<K }" % n
    instructions = [
        "ur(i,j,k) := sum_float32(@o, D[i,o]*u[e,o,j,k])",
        "us(i,j,k) := sum_float32(@o, D[j,o]*u[e,i,o,k])",
        "ut(i,j,k) := sum_float32(@o, D[k,o]*u[e,i,j,o])",

        "lap[e,i,j,k] = "
        " sum_float32(m, D[m,i]*(G[0,e,m,j,k]*ur(m,j,k) + G[1,e,m,j,k]*us(m,j,k) + G[2,e,m,j,k]*ut(m,j,k)))"
        "+ sum_float32(m, D[m,j]*(G[1,e,i,m,k]*ur(i,m,k) + G[3,e,i,m,k]*us(i,m,k) + G[4,e,i,m,k]*ut(i,m,k)))"
        "+ sum_float32(m, D[m,k]*(G[2,e,i,j,m]*ur(i,j,m) + G[4,e,i,j,m]*us(i,j,m) + G[5,e,i,j,m]*ut(i,j,m)))"
    ]
    kernel_args = [
        lp.GlobalArg("u", dtype, shape=field_shape, order=order),
        lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
        lp.GlobalArg("G", dtype, shape=(6, ) + field_shape, order=order),
        lp.GlobalArg("D", dtype, shape=(n, n), order=order),
        lp.ValueArg("K", np.int32, approximately=1000),
    ]
    knl = lp.make_kernel(
        device, domain, instructions, kernel_args,
        name="semlap", assumptions="K>=1")

    # Must act on u first, otherwise stencil becomes crooked and
    # footprint becomes non-convex.

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")  # , slabs=(0, 1))
    knl = lp.split_iname(knl, "e_inner", 4, inner_tag="ilp")

    knl = lp.add_prefetch(knl, "u", [1, 2, 3, "e_inner_inner"])

    for rule_name in ("ur", "us", "ut"):
        # A fresh sweep list for every call, as in three separate calls.
        knl = lp.precompute(
            knl, rule_name, np.float32, [0, 1, 2, "e_inner_inner"])

    knl = lp.add_prefetch(knl, "G", ["m", "i", "j", "k", "e_inner_inner"])
    knl = lp.add_prefetch(knl, "D", ["m", "j"])

    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1"})

    checked_kernels = lp.check_kernels(
        lp.generate_loop_schedules(knl), {"K": 1000})

    for scheduled_knl in checked_kernels:
        print(lp.generate_code(scheduled_knl))
def test_laplacian_stiffness(ctx_factory):
    """Build the "lapquad" kernel and test one transformed variant of it.

    Several transformation recipes are defined as local ``variant_*``
    functions. Each takes the base kernel and returns the transformed kernel
    together with a loop priority list. The variants listed in the final
    loop are scheduled, checked with ``Nc`` set to the local value, and
    compared against the untransformed kernel by ``lp.auto_test_vs_ref``.
    """
    from pymbolic import var

    dtype = np.float32
    order = "C"
    ctx = ctx_factory()

    dim = 2  # (baked into code)
    Nq = 40  # num. quadrature points (baked into code)
    Nb = 20  # num. basis functions (baked into code)
    Nc = 100  # num. cells (run-time symbolic)

    Nc_sym = var("Nc")

    domain = (
        "[Nc] -> {[K,i,j,q, dx_axis, ax_b]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d "
        "and 0<= dx_axis, ax_b < %(dim)d}"
    ) % {"Nb": Nb, "Nq": Nq, "dim": dim}

    instructions = [
        "dPsi(ij, dxi) := sum_float32(@ax_b,"
        " jacInv[ax_b,dxi,K,q] * DPsi[ax_b,ij,q])",
        "A[K, i, j] = sum_float32(q, w[q] * jacDet[K,q] * ("
        "sum_float32(dx_axis, dPsi$one(i,dx_axis)*dPsi$two(j,dx_axis))))"
    ]

    kernel_args = [
        lp.GlobalArg(
            "jacInv", dtype, shape=(dim, dim, Nc_sym, Nq), order=order),
        lp.ConstantArg("DPsi", dtype, shape=(dim, Nb, Nq), order=order),
        lp.GlobalArg("jacDet", dtype, shape=(Nc_sym, Nq), order=order),
        lp.ConstantArg("w", dtype, shape=(Nq, ), order=order),
        lp.GlobalArg("A", dtype, shape=(Nc_sym, Nb, Nb), order=order),
        lp.ValueArg("Nc", np.int32, approximately=1000),
    ]

    knl = lp.make_kernel(
        ctx.devices[0], domain, instructions, kernel_args,
        name="lapquad", assumptions="Nc>=1")

    knl = lp.tag_inames(knl, {"ax_b": "unr"})

    # Kept untouched by the variants; used as the reference kernel below.
    seq_knl = knl

    cells_per_group = 16

    def split_cells(kernel, **tag_kwargs):
        # Split the cell loop K into Ko (outer) and Kloc (inner).
        return lp.split_iname(
            kernel, "K", cells_per_group,
            outer_iname="Ko", inner_iname="Kloc", **tag_kwargs)

    def variant_fig31(kernel):
        # This (mostly) reproduces Figure 3.1.
        kernel = lp.tag_inames(kernel, {"dx_axis": "unr"})
        return kernel, ["K", "i", "j", "q", "ax_b_insn"]

    def variant_pg4(kernel):
        # This (mostly) reproduces the unlabeled code snippet on pg. 4.
        kernel = lp.tag_inames(kernel, {"dx_axis": "unr"})
        kernel = split_cells(kernel)
        return kernel, ["Ko", "Kloc", "i", "j", "q", "ax_b_insn"]

    def variant_fig32(kernel):
        # This (mostly) reproduces Figure 3.2.
        kernel = split_cells(kernel)
        kernel = lp.precompute(
            kernel, "dPsi", np.float32, ["i", "q", "dx_axis"],
            default_tag=None)
        kernel = lp.tag_inames(kernel, {"dx_axis": "unr", "dxi": "unr"})
        return kernel, [
            "Ko", "Kloc", "dPsi_q", "ij", "i", "j", "q", "ax_b_insn"]

    def variant_fig33(kernel):
        # This is meant to (mostly) reproduce Figure 3.3.
        kernel = split_cells(kernel)
        kernel = lp.precompute(
            kernel, "dPsi$one", np.float32, ["dx_axis"], default_tag=None)
        kernel = lp.tag_inames(kernel, {"j": "ilp.seq"})
        return kernel, ["Ko", "Kloc"]

    def variant_simple_gpu(kernel):
        # A simple GPU-ish variant. It is not the same as the code it was
        # modeled on; more reverse-engineering (and discussion) is needed to
        # work out what that code does.
        kernel = lp.tag_inames(kernel, {"dx_axis": "unr"})
        kernel = split_cells(kernel, outer_tag="g.0")
        kernel = lp.tag_inames(kernel, {"i": "l.1", "j": "l.0"})
        return kernel, ["K", "i", "j", "q", "ax_b_insn"]

    def variant_simple_gpu_prefetch(kernel):
        # Adds prefetching to the simple GPU variant.
        #
        # FIXME: with this variant loopy was observed (on the original
        # author's machine) to pick a silly upper bound for Kloc (it uses
        # Nc). To be investigated and fixed.
        kernel = lp.tag_inames(kernel, {"dx_axis": "unr"})
        kernel = split_cells(kernel, outer_tag="g.0")
        kernel = lp.tag_inames(kernel, {"i": "l.1", "j": "l.0"})
        kernel = lp.add_prefetch(kernel, "w", ["q"], default_tag="l.auto")
        kernel = lp.add_prefetch(
            kernel, "DPsi", [0, 1, 2], default_tag="l.auto")
        kernel = lp.add_prefetch(
            kernel, "jacInv", [0, 1, 3], default_tag="l.auto")
        kernel = lp.add_prefetch(kernel, "jacDet", [1], default_tag="l.auto")
        return kernel, ["K", "i", "j", "q", "ax_b_insn"]

    # Plug in variant name here
    #                        |
    #                        v
    for variant in [variant_fig33]:
        transformed_knl, priority = variant(knl)

        schedules = lp.generate_loop_schedules(
            transformed_knl, loop_priority=priority)
        schedules = lp.check_kernels(schedules, {"Nc": Nc})

        lp.auto_test_vs_ref(
            seq_knl, ctx, schedules,
            op_count=0, op_label="GFlops",
            parameters={"Nc": Nc}, print_ref_code=True)
cknl.print_code() n = 128 r = 3 #a = np.ones((n, n), dtype = np.float32) #b = np.ones((n, n), dtype = np.float32) f = np.zeros((n, r), dtype=np.float32) a = np.random.randn(n, n, n) v = np.random.randn(n, r) w = np.random.randn(n, r) a = a.astype(np.float32) v = v.astype(np.float32) w = w.astype(np.float32) parameters = {"a": a, "v": v, "w": w} knl_gen = lp.generate_loop_schedules(knl) ref_compiled = lp.CompiledKernel(ctx, knl_gen, options=[], codegen_kwargs={}) a1 = cl_array.to_device(queue, a) v1 = cl_array.to_device(queue, v) w1 = cl_array.to_device(queue, w) f = cl_array.to_device(queue, f) qq = {"a": a1, "v": v1, "w": w1, 'f': f} from pymbolic import evaluate kk = {} for arg in knl.args: kk[arg.name] = qq[arg.name].astype(arg.dtype) import time
def auto_test_vs_ref(ref_knl, ctx, test_knl=None, op_count=None, op_label=None,
        parameters=None, print_ref_code=False, print_code=True,
        warmup_rounds=2, dump_binary=False, fills_entire_output=None,
        do_check=True, check_result=None, max_test_kernel_count=1,
        quiet=False, blacklist_ref_vendors=None):
    """Compare results of `ref_knl` to the kernels generated by
    scheduling *test_knl*.

    :arg ref_knl: the kernel used for the reference computation.
    :arg ctx: the OpenCL context on which the test kernels are run.
    :arg test_knl: the kernel under test. If *None*, *ref_knl* is used and
        result checking is disabled.
    :arg op_count: a list of operation counts. Each is divided by the
        measured wall time to report a rate. A bare number is accepted
        with a warning.
    :arg op_label: a list of labels, paired with *op_count*. A bare string
        is accepted with a warning.
    :arg parameters: a dictionary forwarded to the argument-construction
        helpers ``make_ref_args`` and ``make_args``. Defaults to an empty
        dictionary.
    :arg print_ref_code: print the reference code (unless *quiet*).
    :arg print_code: print the code of each test kernel (unless *quiet*).
    :arg warmup_rounds: number of untimed runs of each test kernel. Also
        the initial number of timing rounds (at least one).
    :arg dump_binary: print the first binary of the CL program, if the
        program object exposes ``binaries`` (unless *quiet*).
    :arg fills_entire_output: deprecated; only triggers a warning.
    :arg do_check: whether to run the reference kernel and compare results.
    :arg check_result: a callable with :class:`numpy.ndarray` arguments
        *(result, reference_result)* returning a tuple
        (:class:`bool`, message) indicating correctness/acceptability
        of the result. Defaults to ``_default_check_result``.
    :arg max_test_kernel_count: Stop testing after this many *test_knl*
    :arg quiet: suppress printed output.
    :arg blacklist_ref_vendors: forwarded to the reference device
        enumeration. Defaults to an empty list.

    :returns: a dictionary with the keys ``elapsed_event``,
        ``elapsed_event_marker``, ``elapsed_wall`` and ``timing_rounds``
        for the last test kernel that was run, plus ``ref_elapsed_event``
        and ``ref_elapsed_wall`` if *do_check* is true.

    :raises LoopyError: if the argument lists of *ref_knl* and *test_knl*
        disagree, or if no device could be used for the reference
        computation.
    :raises AutomaticTestFailure: if *check_result* rejects a result.
    """

    import pyopencl as cl

    # None stands in for "empty" so that no mutable object is shared
    # between calls through the default arguments.
    if op_count is None:
        op_count = []
    if op_label is None:
        op_label = []
    if parameters is None:
        parameters = {}
    if blacklist_ref_vendors is None:
        blacklist_ref_vendors = []

    if test_knl is None:
        test_knl = ref_knl
        do_check = False

    if len(ref_knl.args) != len(test_knl.args):
        raise LoopyError("ref_knl and test_knl do not have the same number "
                "of arguments")

    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
        if ref_arg.name != test_arg.name:
            raise LoopyError(
                    "ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i + 1))

        if ref_arg.dtype != test_arg.dtype:
            raise LoopyError(
                    "ref_knl and test_knl argument lists disagree at index "
                    "%d (1-based)" % (i + 1))

    from loopy.compiled import CompiledKernel
    from loopy.target.execution import get_highlighted_code

    if isinstance(op_count, (int, float)):
        warn("op_count should be a list", stacklevel=2)
        op_count = [op_count]
    if isinstance(op_label, str):
        warn("op_label should be a list", stacklevel=2)
        op_label = [op_label]

    from time import time

    if check_result is None:
        check_result = _default_check_result

    if fills_entire_output is not None:
        warn("fills_entire_output is deprecated", DeprecationWarning,
                stacklevel=2)

    # {{{ compile and run reference code

    from loopy.type_inference import infer_unknown_types
    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)

    found_ref_device = False

    ref_errors = []

    from loopy.kernel.data import ImageArg
    need_ref_image_support = any(
            isinstance(arg, ImageArg) for arg in ref_knl.args)

    for dev in _enumerate_cl_devices_for_ref_test(
            blacklist_ref_vendors, need_ref_image_support):
        ref_ctx = cl.Context([dev])
        ref_queue = cl.CommandQueue(
                ref_ctx,
                properties=cl.command_queue_properties.PROFILING_ENABLE)

        pp_ref_knl = lp.preprocess_kernel(ref_knl)

        # Only the first generated schedule is used as the reference.
        for ref_sched_kernel in lp.generate_loop_schedules(pp_ref_knl):
            break

        logger.info("{} (ref): trying {} for the reference calculation".format(
            ref_knl.name, dev))

        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
        if not quiet and print_ref_code:
            print(75 * "-")
            print("Reference Code:")
            print(75 * "-")
            print(get_highlighted_code(ref_compiled.get_code()))
            print(75 * "-")

        ref_kernel_info = ref_compiled.kernel_info(frozenset())

        try:
            ref_args, ref_arg_data = \
                    make_ref_args(ref_sched_kernel,
                            ref_kernel_info.implemented_data_info,
                            ref_queue, parameters)
            ref_args["out_host"] = False
        except cl.RuntimeError as e:
            if e.code == cl.status_code.IMAGE_FORMAT_NOT_SUPPORTED:
                # This device cannot hold the arguments; remember why and
                # move on to the next candidate device.
                import traceback
                ref_errors.append("\n".join([
                    75 * "-",
                    "On %s:" % dev,
                    75 * "-",
                    traceback.format_exc(),
                    75 * "-"
                    ]))

                continue
            else:
                raise

        found_ref_device = True

        if not do_check:
            # Argument data is all that is needed; skip the reference run.
            break

        ref_queue.finish()

        logger.info("{} (ref): using {} for the reference calculation".format(
            ref_knl.name, dev))
        logger.info("%s (ref): run" % ref_knl.name)

        ref_start = time()

        if not AUTO_TEST_SKIP_RUN:
            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
        else:
            ref_evt = cl.enqueue_marker(ref_queue)

        ref_queue.finish()
        ref_stop = time()
        ref_elapsed_wall = ref_stop - ref_start

        logger.info("%s (ref): run done" % ref_knl.name)

        ref_evt.wait()
        ref_elapsed_event = 1e-9 * (ref_evt.profile.END - ref_evt.profile.START)

        break

    if not found_ref_device:
        raise LoopyError("could not find a suitable device for the "
                "reference computation.\n"
                "These errors were encountered:\n" + "\n".join(ref_errors))

    # }}}

    # {{{ compile and run parallel code

    need_check = do_check

    queue = cl.CommandQueue(
            ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)

    from loopy.kernel import KernelState
    from loopy.target.pyopencl import PyOpenCLTarget
    if test_knl.state not in [
            KernelState.PREPROCESSED,
            KernelState.LINEARIZED
            ]:
        if isinstance(test_knl.target, PyOpenCLTarget):
            test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0]))

        test_knl = lp.preprocess_kernel(test_knl)

    if not test_knl.schedule:
        test_kernels = lp.generate_loop_schedules(test_knl)
    else:
        test_kernels = [test_knl]

    test_kernel_count = 0

    from loopy.type_inference import infer_unknown_types
    for kernel_idx, kernel in enumerate(test_kernels):
        test_kernel_count += 1
        if test_kernel_count > max_test_kernel_count:
            break

        kernel = infer_unknown_types(kernel, expect_completion=True)

        compiled = CompiledKernel(ctx, kernel)

        kernel_info = compiled.kernel_info(frozenset())

        args = make_args(kernel,
                kernel_info.implemented_data_info,
                queue, ref_arg_data, parameters)
        args["out_host"] = False

        if not quiet:
            print(75 * "-")
            print("Kernel #%d:" % kernel_idx)
            print(75 * "-")
            if print_code:
                print(compiled.get_highlighted_code())
                print(75 * "-")
            if dump_binary:
                # {{{ find cl program

                for name in dir(kernel_info.cl_kernels):
                    if name.startswith("__"):
                        continue
                    cl_kernel = getattr(kernel_info.cl_kernels, name)
                    cl_program = cl_kernel.get_info(cl.kernel_info.PROGRAM)
                    break
                else:
                    # Raised explicitly so the check survives "python -O".
                    raise AssertionError("could not find cl_program")

                # }}}

                print(type(cl_program))
                if hasattr(cl_program, "binaries"):
                    print(cl_program.binaries[0])

                print(75 * "-")

        logger.info("%s: run warmup" % (kernel.name))

        for _ in range(warmup_rounds):
            if not AUTO_TEST_SKIP_RUN:
                compiled(queue, **args)

            if need_check and not AUTO_TEST_SKIP_RUN:
                for arg_desc in ref_arg_data:
                    if arg_desc is None:
                        continue
                    if not arg_desc.needs_checking:
                        continue

                    from pyopencl.compyte.array import as_strided
                    ref_ary = as_strided(
                            arg_desc.ref_storage_array.get(),
                            shape=arg_desc.ref_shape,
                            strides=arg_desc.ref_numpy_strides).flatten()
                    test_ary = as_strided(
                            arg_desc.test_storage_array.get(),
                            shape=arg_desc.test_shape,
                            strides=arg_desc.test_numpy_strides).flatten()

                    # Compare only the part both flattened arrays share.
                    common_len = min(len(ref_ary), len(test_ary))
                    ref_ary = ref_ary[:common_len]
                    test_ary = test_ary[:common_len]

                    error_is_small, error = check_result(test_ary, ref_ary)
                    if not error_is_small:
                        raise AutomaticTestFailure(error)

                # Results are checked once, not on every warm-up round.
                need_check = False

        queue.finish()

        logger.info("%s: warmup done" % (kernel.name))

        logger.info("%s: timing run" % (kernel.name))

        timing_rounds = max(warmup_rounds, 1)

        while True:
            # Start from a fresh event list on every attempt: the
            # event-based time below spans events[0] to events[-1] and is
            # divided by the round count of this attempt only.
            events = []

            start_time = time()

            evt_start = cl.enqueue_marker(queue)

            for _ in range(timing_rounds):
                if not AUTO_TEST_SKIP_RUN:
                    evt, _unused = compiled(queue, **args)
                    events.append(evt)
                else:
                    events.append(cl.enqueue_marker(queue))

            evt_end = cl.enqueue_marker(queue)

            queue.finish()
            stop_time = time()

            for evt in events:
                evt.wait()
            evt_start.wait()
            evt_end.wait()

            elapsed_event = (1e-9*events[-1].profile.END
                    - 1e-9*events[0].profile.START) \
                    / timing_rounds
            try:
                elapsed_event_marker = ((1e-9 * evt_end.profile.START
                    - 1e-9 * evt_start.profile.START)
                    / timing_rounds)
            except cl.RuntimeError:
                elapsed_event_marker = None

            elapsed_wall = (stop_time - start_time) / timing_rounds

            # Repeat with four times as many rounds until the whole
            # attempt takes at least 0.3 s of wall time.
            if elapsed_wall * timing_rounds < 0.3:
                timing_rounds *= 4
            else:
                break

        logger.info("%s: timing run done" % (kernel.name))

        rates = "".join(
                " {:g} {}/s".format(cnt / elapsed_wall, lbl)
                for cnt, lbl in zip(op_count, op_label))

        if not quiet:
            def format_float_or_none(v):
                if v is None:
                    return "<unavailable>"
                else:
                    return "%g" % v

            print("elapsed: %s s event, %s s marker-event %s s wall "
                    "(%d rounds)%s" % (
                        format_float_or_none(elapsed_event),
                        format_float_or_none(elapsed_event_marker),
                        format_float_or_none(elapsed_wall),
                        timing_rounds, rates))

        if do_check:
            ref_rates = "".join(
                    " {:g} {}/s".format(cnt / ref_elapsed_event, lbl)
                    for cnt, lbl in zip(op_count, op_label))
            if not quiet:
                print("ref: elapsed: {:g} s event, {:g} s wall{}".format(
                        ref_elapsed_event, ref_elapsed_wall, ref_rates))

    # }}}

    result_dict = {}
    result_dict["elapsed_event"] = elapsed_event
    result_dict["elapsed_event_marker"] = elapsed_event_marker
    result_dict["elapsed_wall"] = elapsed_wall
    result_dict["timing_rounds"] = timing_rounds

    if do_check:
        result_dict["ref_elapsed_event"] = ref_elapsed_event
        result_dict["ref_elapsed_wall"] = ref_elapsed_wall

    return result_dict
def test_laplacian_stiffness(ctx_factory):
    """Test a transformed variant of the "lapquad" kernel against the original.

    The local ``variant_*`` functions each apply one transformation recipe
    to the base kernel and return ``(kernel, loop_priority)``. Every variant
    named in the final loop is scheduled with that priority, checked for
    ``Nc`` equal to the local value, and handed to ``lp.auto_test_vs_ref``
    with the untransformed kernel as the reference.
    """
    from pymbolic import var

    ctx = ctx_factory()
    dtype = np.float32
    order = "C"

    dim = 2  # (baked into code)
    Nq = 40  # num. quadrature points (baked into code)
    Nb = 20  # num. basis functions (baked into code)
    Nc = 100  # num. cells (run-time symbolic)

    Nc_sym = var("Nc")

    loop_domain = (
        "[Nc] -> {[K,i,j,q, dx_axis, ax_b]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d "
        "and 0<= dx_axis, ax_b < %(dim)d}"
    ) % {"Nb": Nb, "Nq": Nq, "dim": dim}

    insns = [
        "dPsi(ij, dxi) := sum_float32(@ax_b,"
        " jacInv[ax_b,dxi,K,q] * DPsi[ax_b,ij,q])",
        "A[K, i, j] = sum_float32(q, w[q] * jacDet[K,q] * ("
        "sum_float32(dx_axis, dPsi$one(i,dx_axis)*dPsi$two(j,dx_axis))))"
    ]

    arg_decls = [
        lp.GlobalArg("jacInv", dtype, shape=(dim, dim, Nc_sym, Nq), order=order),
        lp.ConstantArg("DPsi", dtype, shape=(dim, Nb, Nq), order=order),
        lp.GlobalArg("jacDet", dtype, shape=(Nc_sym, Nq), order=order),
        lp.ConstantArg("w", dtype, shape=(Nq,), order=order),
        lp.GlobalArg("A", dtype, shape=(Nc_sym, Nb, Nb), order=order),
        lp.ValueArg("Nc", np.int32, approximately=1000),
    ]

    knl = lp.make_kernel(
        ctx.devices[0], loop_domain, insns, arg_decls,
        name="lapquad", assumptions="Nc>=1")

    knl = lp.tag_inames(knl, {"ax_b": "unr"})

    # The variants never modify this kernel; it is the reference below.
    seq_knl = knl

    group_size = 16

    def split_K(kernel, **extra):
        # Split the cell loop K into Ko (outer) and Kloc (inner).
        return lp.split_iname(
            kernel, "K", group_size,
            outer_iname="Ko", inner_iname="Kloc", **extra)

    def unroll_dx_axis(kernel):
        # Tag the dx_axis loop for unrolling.
        return lp.tag_inames(kernel, {"dx_axis": "unr"})

    def variant_fig31(kernel):
        # This (mostly) reproduces Figure 3.1.
        return unroll_dx_axis(kernel), ["K", "i", "j", "q", "ax_b_insn"]

    def variant_pg4(kernel):
        # This (mostly) reproduces the unlabeled code snippet on pg. 4.
        kernel = split_K(unroll_dx_axis(kernel))
        return kernel, ["Ko", "Kloc", "i", "j", "q", "ax_b_insn"]

    def variant_fig32(kernel):
        # This (mostly) reproduces Figure 3.2.
        kernel = split_K(kernel)
        kernel = lp.precompute(
            kernel, "dPsi", np.float32, ["i", "q", "dx_axis"],
            default_tag=None)
        kernel = lp.tag_inames(kernel, {"dx_axis": "unr", "dxi": "unr"})
        priority = ["Ko", "Kloc", "dPsi_q", "ij", "i", "j", "q", "ax_b_insn"]
        return kernel, priority

    def variant_fig33(kernel):
        # This is meant to (mostly) reproduce Figure 3.3.
        kernel = split_K(kernel)
        kernel = lp.precompute(
            kernel, "dPsi$one", np.float32, ["dx_axis"], default_tag=None)
        kernel = lp.tag_inames(kernel, {"j": "ilp.seq"})
        return kernel, ["Ko", "Kloc"]

    def variant_simple_gpu(kernel):
        # A simple GPU-ish variant. It does not match the code it was
        # modeled on; working out what that code does needs more
        # reverse-engineering (and probably some discussion).
        kernel = split_K(unroll_dx_axis(kernel), outer_tag="g.0")
        kernel = lp.tag_inames(kernel, {"i": "l.1", "j": "l.0"})
        return kernel, ["K", "i", "j", "q", "ax_b_insn"]

    def variant_simple_gpu_prefetch(kernel):
        # Adds prefetching to the simple GPU variant.
        #
        # FIXME: with this variant loopy was observed (on the original
        # author's machine) to choose a silly upper bound for Kloc (it uses
        # Nc). To be investigated and fixed.
        kernel = split_K(unroll_dx_axis(kernel), outer_tag="g.0")
        kernel = lp.tag_inames(kernel, {"i": "l.1", "j": "l.0"})
        kernel = lp.add_prefetch(kernel, "w", ["q"])
        kernel = lp.add_prefetch(kernel, "DPsi", [0, 1, 2])
        kernel = lp.add_prefetch(kernel, "jacInv", [0, 1, 3])
        kernel = lp.add_prefetch(kernel, "jacDet", [1])
        return kernel, ["K", "i", "j", "q", "ax_b_insn"]

    # Plug in variant name here
    #                        |
    #                        v
    for variant in [variant_fig33]:
        variant_knl, priority = variant(knl)

        candidates = lp.check_kernels(
            lp.generate_loop_schedules(variant_knl, loop_priority=priority),
            {"Nc": Nc})

        lp.auto_test_vs_ref(
            seq_knl, ctx, candidates,
            op_count=0, op_label="GFlops",
            parameters={"Nc": Nc}, print_ref_code=True)