def test_plain_matrix_mul(ctx_factory):
    """Fixed-size square matmul c = a@b, run once with a float4-vectorized
    dtype (result checked by check_float4) and once with scalar float32,
    each verified against the untransformed reference kernel."""
    ctx = ctx_factory()
    order = "C"
    n = get_suitable_size(ctx)

    for dtype, check, vec_size in [
            (cl_array.vec.float4, check_float4, 4),
            (np.float32, None, 1),
            ]:
        knl = lp.make_kernel(
                "{[i,j,k]: 0<=i,j,k<%d}" % n,
                [
                    "c[i, j] = sum(k, a[i, k]*b[k, j])"
                    ],
                [
                    lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                    lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                    lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                    ],
                name="matmul")

        ref_knl = knl

        # 16x16 work-group tiling of the output; block k and stage the
        # corresponding a/b tiles in local memory.
        knl = lp.split_iname(knl, "i", 16,
                outer_tag="g.0", inner_tag="l.1")
        knl = lp.split_iname(knl, "j", 16,
                outer_tag="g.1", inner_tag="l.0")
        knl = lp.split_iname(knl, "k", 16)
        knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
        knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner", ])

        lp.auto_test_vs_ref(ref_knl, ctx, knl,
                op_count=[vec_size*2*n**3/1e9], op_label=["GFlops"],
                parameters={"n": n}, check_result=check)
def variant_2(knl):
    """Map the image plane onto 16x16 work-groups, one group axis per
    feature, and stage the filter slab and the image footprint locally."""
    # Split each image axis into a (group, local) pair.
    for axis, grp_tag, loc_tag in (("im_x", "g.0", "l.0"),
                                   ("im_y", "g.1", "l.1")):
        knl = lp.split_iname(knl, axis, 16,
                outer_tag=grp_tag, inner_tag=loc_tag)
    knl = lp.tag_inames(knl, {"ifeat": "g.2"})
    # One filter's weights, plus the image region this tile touches.
    knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]")
    knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y")
    return knl
def test_variable_size_matrix_mul(ctx_factory):
    """Matmul with runtime-symbolic size n: slabbed 16x16 output tiling with
    an 8-wide k block and local-memory tiles, checked against the plain
    kernel."""
    ctx = ctx_factory()
    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<n}",
            "c[i, j] = sum(k, a[i, k]*b[k, j])")
    knl = lp.add_dtypes(knl, dict(a=np.float32, b=np.float32))

    ref_knl = knl

    # Slabs peel the boundary iterations so the bulk tile is full-size.
    for iname, grp_tag, loc_tag in (("i", "g.0", "l.1"), ("j", "g.1", "l.0")):
        knl = lp.split_iname(knl, iname, 16,
                outer_tag=grp_tag, inner_tag=loc_tag, slabs=(0, 1))
    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))

    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={"n": n})
def right_u(ctx):
    """Build a tuned kernel for f[i,s] = sum_{j,k} a[i,j,k]*v[j,s]*w[k,s]
    (a right-hand-side contraction of a rank-3 tensor with two factor
    matrices; "pravaya" is Russian for "right").

    :arg ctx: a :class:`pyopencl.Context`; the kernel targets its first device.
    :returns: the transformed loopy kernel.

    Changes vs. the original: removed the unused ``seq_knl`` local and the
    dead commented-out prefetch experiments; normalized spacing. The
    transform sequence is unchanged.
    """
    order = "C"
    dtype = np.float32
    n = 128
    r = 3

    knl = lp.make_kernel(ctx.devices[0],
            "{[i,j,k,s]: 0<=i<%d and 0<=j<%d and 0<=k<%d and 0<=s<%d}"
            % (n, n, n, r),
            [
                "f[i,s] = sum((j,k), a[i,j,k]*v[j,s]*w[k,s])",
                ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n, n), order=order),
                lp.GlobalArg("v", dtype, shape=(n, r), order=order),
                lp.GlobalArg("w", dtype, shape=(n, r), order=order),
                lp.GlobalArg("f", dtype, shape=(n, r), order=order),
                ],
            name="pravaya")

    # Map the output axes (i, s) onto the GPU grid; block the reductions.
    knl = lp.split_iname(knl, "i", 2, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "s", r, outer_tag="g.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 16)
    knl = lp.split_iname(knl, "k", 16)

    # Stage the small factor matrices in local memory; "a" is streamed.
    knl = lp.add_prefetch(knl, "v", ["s_inner", "j_inner"])
    knl = lp.add_prefetch(knl, "w", ["s_inner", "k_inner"])

    return knl
def test_fancy_matrix_mul(ctx_factory):
    """Matmul with a symbolic size parameter n (ValueArg) and string-valued
    shapes, tiled 16x16 with a slabbed k loop and verified against the
    untransformed kernel."""
    dtype = np.float32
    ctx = ctx_factory()

    order = "C"

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "[n] -> {[i,j,k]: 0<=i,j,k<n }",
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.GlobalArg("a", dtype, shape="(n, n)", order=order),
                lp.GlobalArg("b", dtype, shape="(n, n)", order=order),
                lp.GlobalArg("c", dtype, shape="(n, n)", order=order),
                lp.ValueArg("n", np.int32, approximately=1000),
                ],
            name="fancy_matmul", assumptions="n>=1")

    seq_knl = knl

    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0")
    # slab the k loop so remainder iterations are peeled off
    knl = lp.split_iname(knl, "k", 16, slabs=(0, 1))
    knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"])
    knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"])

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters=dict(n=n))
def test_variable_size_matrix_mul(ctx_factory):
    """Runtime-size matmul with slabbed 16x16 tiling and explicitly
    "l.auto"-tagged prefetches, checked against the untransformed kernel."""
    ctx = ctx_factory()

    # NOTE(review): this test does not use images, yet it checks
    # image_support -- presumably copied from a sibling test; confirm.
    if (not ctx.devices[0].image_support
            or ctx.devices[0].platform.name == "Portable Computing Language"):
        pytest.skip("crashes on pocl")

    n = get_suitable_size(ctx)
    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<n}",
            "c[i, j] = sum(k, a[i, k]*b[k, j])")

    knl = lp.add_dtypes(knl, {
        "a": np.float32,
        "b": np.float32,
        })

    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1", slabs=(0, 1))
    knl = lp.split_iname(knl, "j", 16,
            outer_tag="g.1", inner_tag="l.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))

    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={"n": n})
def test_tim2d(ctx_factory):
    """2D spectral-element Laplacian ("tim2d"): per-element tensor-product
    derivative application with geometric factors G, tuned with prefetches
    and precomputed derivative substitution rules.

    NOTE(review): uses older loopy APIs (``lp.ArrayArg``, device argument to
    ``make_kernel``, ``generate_loop_schedules``/``check_kernels``) -- may
    need porting before it runs on current loopy; confirm.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)  # K - run-time symbolic element count

    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
            [
                # ur/us: substitution rules for the r- and s-derivatives of u
                "ur(a,b) := sum_float32(@o, D[a,o]*u[e,o,b])",
                "us(a,b) := sum_float32(@o, D[b,o]*u[e,a,o])",
                "lap[e,i,j] = "
                " sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j) + G[1,e,m,j]*us(m,j)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m) + G[2,e,i,m]*us(i,m)))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap2D", assumptions="K>=1")

    unroll = 32  # noqa: F841 -- unused; apparently left from tuning experiments

    seq_knl = knl

    # Stage the derivative matrix, the element's field values, and the
    # derivative substitutions in local memory.
    knl = lp.add_prefetch(knl, "D", ["m", "j", "i", "o"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j", "o"], default_tag="l.auto")
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto")

    # one element per work-group; i/j on the work-group, m/o unrolled
    knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")
    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))

    # [2, 3] are axis/argument indices on G
    knl = lp.add_prefetch(knl, "G", [2, 3], default_tag="l.auto")

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
def transform_surface_kernel(knl):
    """Surface-flux kernel tuning: elements on the group axis, nodes on
    l.0, 4-way unrolled mp blocks, with LIFT and the per-face data staged
    in local memory."""
    # NOTE(review): n and m are both tagged "l.0" -- presumably they index
    # disjoint loop nests of the same extent; confirm against the kernel.
    knl = lp.tag_inames(knl, dict(k="g.0", n="l.0", m="l.0"))
    knl = lp.split_iname(knl, "mp", 4, inner_tag="unr")
    knl = lp.add_prefetch(knl, "LIFT")
    for name in ["nx", "ny", "nz", "Fscale", "bc"]:
        knl = lp.add_prefetch(knl, name)
    knl = lp.set_loop_priority(knl, "mp_outer,mp_inner")
    return knl
def variant_3(knl):
    """Tile i and j onto 16x16 work-groups and keep the row slice of ``a``
    and the column slice of ``b`` in local memory."""
    for iname, grp_tag, loc_tag in (("i", "g.0", "l.0"), ("j", "g.1", "l.1")):
        knl = lp.split_iname(knl, iname, 16,
                outer_tag=grp_tag, inner_tag=loc_tag)
    knl = lp.add_prefetch(knl, "a", ["i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner"])
    return knl
def variant_2(knl):
    """Tile i and j onto 16x16 work-groups, set the loop priority, and
    prefetch ``a`` and ``b`` whole.

    Bug fix: the original called ``knl.set_loop_priority(knl, ...)`` --
    invoking the module-level transform as a method on the kernel while
    also passing the kernel as an argument. ``set_loop_priority`` is a
    function in the ``lp`` namespace, as used elsewhere in this file.
    """
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.1")
    # was: knl = knl.set_loop_priority(knl, ["i", "j"])
    knl = lp.set_loop_priority(knl, ["i", "j"])
    knl = lp.add_prefetch(knl, "a")
    knl = lp.add_prefetch(knl, "b")
    return knl
def transform_vol(knl):
    """Volume-kernel tuning: nodes on l.0, elements on the group axis,
    with the field rows, geometric factors and derivative operator staged
    in local memory."""
    knl = lp.tag_inames(knl, {"n": "l.0", "k": "g.0"})
    # One row of each conserved field for the current element.
    for field in ("u", "v", "w", "p"):
        knl = lp.add_prefetch(knl, "%s[k,:]" % field)
    # The per-element geometric factors.
    for factor in ("drst_dx", "drst_dy", "drst_dz"):
        knl = lp.add_prefetch(knl, "%s" % factor)
    knl = lp.add_prefetch(knl, "DrDsDt")
    return knl
def variant_gpu(knl):
    """GPU variant: 256-wide work-groups over i, a 256-blocked j loop, with
    x[j] staged through explicitly named/tagged fetch inames and x[i]
    fetched per thread."""
    knl = lp.expand_subst(knl)
    knl = lp.split_iname(knl, "i", 256,
            outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 256)
    # default_tag=None: the fetch inames are named here and tagged below
    knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"],
            ["x_fetch_j", "x_fetch_k"], default_tag=None)
    knl = lp.tag_inames(knl, dict(x_fetch_k="unr", x_fetch_j="l.0"))
    knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None)
    knl = lp.prioritize_loops(knl, ["j_outer", "j_inner"])
    return knl
def test_intel_matrix_mul(ctx_factory):
    """Register-blocked matmul: each work-item computes a 4x4 block via
    ILP-tagged inner splits (16x16 chunks per group), with both input tiles
    staged in local memory; verified against the untransformed kernel."""
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"
    n = 128+32

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                ],
            name="matmul")

    seq_knl = knl

    i_reg = 4
    j_reg = 4
    i_chunks = 16
    j_chunks = 16
    # two-level split: group tile, then an i_reg/j_reg register block per item
    knl = lp.split_iname(knl, "i", i_reg*i_chunks, outer_tag="g.0")
    knl = lp.split_iname(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp")
    knl = lp.split_iname(knl, "j", j_reg*j_chunks, outer_tag="g.1")
    knl = lp.split_iname(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp")
    knl = lp.split_iname(knl, "k", 16)

    knl = lp.add_prefetch(knl, 'a', ["i_inner_inner", "k_inner", "i_inner_outer"],
            default_tag="l.auto")
    knl = lp.add_prefetch(knl, 'b', ["j_inner_inner", "k_inner", "j_inner_outer"],
            default_tag="l.auto")

    # FIXME: Grouped prefetch
    #knl = lp.add_prefetch(knl, 'a', ["k_inner", ("i_inner_inner", "i_inner_outer")],
    #        default_tag="l.auto")
    #knl = lp.add_prefetch(knl, 'b',
    #        ["k_inner", ("j_inner_inner", "j_inner_outer"),], default_tag="l.auto")

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={})
def test_fd_demo(): knl = lp.make_kernel( "{[i,j]: 0<=i,j<n}", "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \ + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \ + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]") #assumptions="n mod 16=0") knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.add_prefetch(knl, "u", ["i_inner", "j_inner"], fetch_bounding_box=True, default_tag="l.auto") #n = 1000 #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32) knl = lp.set_options(knl, write_cl=True) knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32)) code, inf = lp.generate_code(knl) print(code) assert "double" not in code
def variant_prefetch_fields(knl):
    """Nodes on l.0, elements in blocks of 3 across the grid, with each
    field's element slab staged in local memory."""
    knl = lp.tag_inames(knl, {"n": "l.0"})
    knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
    for field in ("u", "v", "w", "p"):
        knl = lp.add_prefetch(knl, "%s[k,:]" % field, ["k_inner"])
    return knl
def variant_2(knl):
    """16x16 work-group tiling with a bounding-box fetch of ``a``; the
    fetch loops are forced into row-major order."""
    for iname, grp_tag, loc_tag in (("i", "g.1", "l.1"), ("j", "g.0", "l.0")):
        knl = lp.split_iname(knl, iname, 16,
                outer_tag=grp_tag, inner_tag=loc_tag)
    knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
            fetch_bounding_box=True)
    knl = lp.set_loop_priority(knl, ["a_dim_0_outer", "a_dim_1_outer"])
    return knl
def test_small_batched_matvec(ctx_factory):
    """Batched matvec result[k,:] = d @ f[k,:]: the small operator d is
    cached whole, and f is split/padded so its rows are align_bytes-aligned;
    verified against the untransformed kernel."""
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    K = 9997  # noqa
    Np = 36  # noqa

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=k<K and 0<= i,j < %d}" % Np,
            [
                "result[k, i] = sum(j, d[i, j]*f[k, j])"
                ],
            [
                lp.GlobalArg("d", dtype, shape=(Np, Np), order=order),
                lp.GlobalArg("f", dtype, shape=("K", Np), order=order),
                lp.GlobalArg("result", dtype, shape=("K", Np), order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="batched_matvec", assumptions="K>=1")

    seq_knl = knl

    align_bytes = 64
    knl = lp.add_prefetch(knl, 'd[:,:]')
    # pad the leading axis of f up to a multiple that makes rows aligned
    pad_mult = lp.find_padding_multiple(knl, "f", 0, align_bytes)
    knl = lp.split_array_dim(knl, ("f", 0), pad_mult)
    knl = lp.add_padding(knl, "f", 0, align_bytes)

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[K*2*Np**2/1e9], op_label=["GFlops"],
            parameters=dict(K=K))
def test_ilp_race_matmul(ctx_factory):
    """An ILP-tagged split of the j loop makes the write to c racy; verify
    that preprocessing/scheduling emits a WriteRaceConditionWarning."""
    dtype = np.float32
    order = "C"

    n = 9

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.ImageArg("a", dtype, shape=(n, n)),
                lp.ImageArg("b", dtype, shape=(n, n)),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                ],
            name="matmul")

    knl = lp.split_iname(knl, "j", 2, outer_tag="ilp", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 2)
    knl = lp.add_prefetch(knl, 'b', ["k_inner"])

    # disable the cache so the warning is actually raised during this run
    with lp.CacheMode(False):
        from loopy.diagnostic import WriteRaceConditionWarning
        from warnings import catch_warnings
        with catch_warnings(record=True) as warn_list:
            knl = lp.preprocess_kernel(knl)
            list(lp.generate_loop_schedules(knl))

        assert any(isinstance(w.message, WriteRaceConditionWarning)
                for w in warn_list)
def test_troublesome_premagma_fermi_matrix_mul(ctx_factory):
    """Pre-MAGMA Fermi-style matmul: 2x2 register blocks per work-item via
    ILP-tagged splits, with an ``a`` prefetch whose footprint spans both
    levels of the i split; verified against the untransformed kernel."""
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"
    n = 6*16*2

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                ],
            name="matmul")

    seq_knl = knl

    i_reg = 2
    j_reg = 2
    i_chunks = 16
    j_chunks = 16
    knl = lp.split_iname(knl, "i", i_reg*i_chunks, outer_tag="g.0")
    knl = lp.split_iname(knl, "i_inner", i_reg, outer_tag="l.0", inner_tag="ilp")
    knl = lp.split_iname(knl, "j", j_reg*j_chunks, outer_tag="g.1")
    knl = lp.split_iname(knl, "j_inner", j_reg, outer_tag="l.1", inner_tag="ilp")
    knl = lp.split_iname(knl, "k", 16)
    knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner_inner", "i_inner_outer"])

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={})
def test_transpose(ctx_factory):
    """Out-of-place transpose b = a^T, 16x16-tiled with the input tile
    staged in local memory; checked against the naive kernel."""
    dtype = np.dtype(np.float32)
    ctx = ctx_factory()
    order = "C"
    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j]: 0<=i,j<%d}" % n,
            ["b[i, j] = a[j, i]"],
            [
                lp.GlobalArg("a", dtype, shape=(n, n), order=order),
                lp.GlobalArg("b", dtype, shape=(n, n), order=order),
                ],
            name="transpose")

    seq_knl = knl

    for iname, grp_tag, loc_tag in (("i", "g.0", "l.1"), ("j", "g.1", "l.0")):
        knl = lp.split_iname(knl, iname, 16,
                outer_tag=grp_tag, inner_tag=loc_tag)
    knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"])

    # bandwidth-bound: count one read plus one write of n^2 elements
    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[dtype.itemsize*n**2*2/1e9], op_label=["GByte"],
            parameters={})
def no_test_image_matrix_mul_ilp(ctx_factory):
    """(disabled: "no_test" prefix) Image-sourced matmul with a 4-way
    ILP split on j; skipped unless the device supports single-channel
    float 2D images."""
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    if (not ctx.devices[0].image_support
            or ctx.devices[0].platform.name == "Portable Computing Language"):
        pytest.skip("crashes on pocl")

    image_format = cl.ImageFormat(cl.channel_order.R, cl.channel_type.FLOAT)
    if image_format not in cl.get_supported_image_formats(
            ctx, cl.mem_flags.READ_ONLY, cl.mem_object_type.IMAGE2D):
        pytest.skip("image format not supported")

    n = get_suitable_size(ctx)

    knl = lp.make_kernel(
            "{[i,j,k]: 0<=i,j,k<%d}" % n,
            [
                "c[i, j] = sum(k, a[i, k]*b[k, j])"
                ],
            [
                lp.ImageArg("a", dtype, shape=(n, n)),
                lp.ImageArg("b", dtype, shape=(n, n)),
                lp.GlobalArg("c", dtype, shape=(n, n), order=order),
                ],
            name="matmul")

    seq_knl = knl

    ilp = 4
    knl = lp.split_iname(knl, "i", 2, outer_tag="g.0", inner_tag="l.1")
    j_inner_split = 4
    knl = lp.split_iname(knl, "j", ilp*j_inner_split, outer_tag="g.1")
    knl = lp.split_iname(knl, "j_inner", j_inner_split,
            outer_tag="ilp", inner_tag="l.0")
    knl = lp.split_iname(knl, "k", 2)

    # conflict-free?
    knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, 'b',
            ["j_inner_outer", "j_inner_inner", "k_inner"], default_tag="l.auto")

    lp.auto_test_vs_ref(seq_knl, ctx, knl,
            op_count=[2*n**3/1e9], op_label=["GFlops"],
            parameters={})
def variant_simple_gpu_prefetch(knl):
    """GPU variant with local-memory staging of the quadrature data.

    Returns the transformed kernel and the loop nest order to use with it.
    """
    # This adds prefetching to the GPU variant above.

    # In this variant (on my machine), loopy makes a silly choice
    # for the upper bound of Kloc (it uses Nc). I'll investigate and
    # fix that. (FIXME)

    knl = lp.tag_inames(knl, {"dx_axis": "unr"})
    Ncloc = 16
    knl = lp.split_iname(knl, "K", Ncloc,
            outer_iname="Ko", inner_iname="Kloc",
            outer_tag="g.0")
    knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"})
    # stage quadrature weights, shape-function gradients and geometry
    knl = lp.add_prefetch(knl, "w", ["q"])
    knl = lp.add_prefetch(knl, "DPsi", [0, 1, 2])
    knl = lp.add_prefetch(knl, "jacInv", [0, 1, 3])
    knl = lp.add_prefetch(knl, "jacDet", [1])
    return knl, ["K", "i", "j", "q", "ax_b_insn"]
def variant_overfetch(knl):
    """16x16 tiling with slabs on both ends of i and j and a bounding-box
    fetch of ``a`` that deliberately over-fetches the tile's halo."""
    for iname, grp_tag, loc_tag in (("i", "g.1", "l.1"), ("j", "g.0", "l.0")):
        knl = lp.split_iname(knl, iname, 16,
                outer_tag=grp_tag, inner_tag=loc_tag, slabs=(1, 1))
    knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
            fetch_bounding_box=True, default_tag="l.auto")
    knl = lp.prioritize_loops(knl, ["a_dim_0_outer", "a_dim_1_outer"])
    return knl
def test_batched_sparse():
    """CSR sparse-matrix product with a small batch of vectors, parsed
    from Fortran; the row loop is split 128-wide across the grid and the
    values/colindices arrays are cached."""
    fortran_src = """
        subroutine sparse(rowstarts, colindices, values, m, n, nvecs, nvals, x, y)
          implicit none

          integer rowstarts(m+1), colindices(nvals)
          real*8 values(nvals)
          real*8 x(n, nvecs), y(n, nvecs), rowsum(nvecs)

          integer m, n, rowstart, rowend, length, nvals, nvecs
          integer i, j, k

          do i = 1, m
            rowstart = rowstarts(i)
            rowend = rowstarts(i+1)
            length = rowend - rowstart

            do k = 1, nvecs
              rowsum(k) = 0
            enddo
            do k = 1, nvecs
              do j = 1, length
                rowsum(k) = rowsum(k) + &
                    x(colindices(rowstart+j-1),k)*values(rowstart+j-1)
              end do
            end do
            do k = 1, nvecs
              y(i,k) = rowsum(k)
            end do
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    knl = lp.split_iname(knl, "i", 128)
    knl = lp.tag_inames(knl, {"i_outer": "g.0"})
    knl = lp.tag_inames(knl, {"i_inner": "l.0"})
    knl = lp.add_prefetch(knl, "values", default_tag="l.auto")
    knl = lp.add_prefetch(knl, "colindices", default_tag="l.auto")
    knl = lp.fix_parameters(knl, nvecs=4)
def variant_4(knl):
    """Block i and j by 256 with slabs, prefetch untagged, then lay out
    both the compute loops and the fetch loops on a 16x16 work-group."""
    knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "j", 256, outer_tag="g.1", slabs=(0, 1))

    # default_tag=None: the fetch inames (a_dim_0, b_dim_0) are split and
    # tagged by hand below
    knl = lp.add_prefetch(knl, "a", ["i_inner"], default_tag=None)
    knl = lp.add_prefetch(knl, "b", ["j_inner"], default_tag=None)

    knl = lp.split_iname(knl, "i_inner", 16, inner_tag="l.0")
    knl = lp.split_iname(knl, "j_inner", 16, inner_tag="l.1")
    knl = lp.split_iname(knl, "a_dim_0", 16, outer_tag="l.1", inner_tag="l.0")
    knl = lp.split_iname(knl, "b_dim_0", 16, outer_tag="l.1", inner_tag="l.0")
    return knl
def variant_orig(knl):
    """Reference tuning: one element per work-group with i/j on the local
    axes; D, u and both G components are staged, and the derivative
    substitutions are precomputed."""
    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1", e="g.0"))

    knl = lp.add_prefetch(knl, "D[:,:]")
    knl = lp.add_prefetch(knl, "u[e, :, :]")

    knl = lp.precompute(knl, "ur(m,j)", ["m", "j"])
    knl = lp.precompute(knl, "us(i,m)", ["i", "m"])

    knl = lp.precompute(knl, "Gux(m,j)", ["m", "j"])
    knl = lp.precompute(knl, "Guy(i,m)", ["i", "m"])

    knl = lp.add_prefetch(knl, "G$x[:,e,:,:]")
    knl = lp.add_prefetch(knl, "G$y[:,e,:,:]")

    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))

    # schedule the D fetch early -- everything downstream uses it
    knl = lp.set_instruction_priority(knl, "id:D_fetch", 5)

    print(knl)

    return knl
def test_variable_size_matrix_mul(ctx_factory):
    """Runtime-size matmul with slabbed 16x16x8 tiling and default-tagged
    prefetches of the a and b tiles, verified against the plain kernel."""
    ctx = ctx_factory()

    n = get_suitable_size(ctx)
    knl = lp.make_kernel("{[i,j,k]: 0<=i,j,k<n}",
            "c[i, j] = sum(k, a[i, k]*b[k, j])")

    knl = lp.add_dtypes(knl, {
        "a": np.float32,
        "b": np.float32,
        })
    ref_knl = knl

    knl = lp.split_iname(knl, "i", 16,
            outer_tag="g.0", inner_tag="l.1", slabs=(0, 1))
    knl = lp.split_iname(knl, "j", 16,
            outer_tag="g.1", inner_tag="l.0", slabs=(0, 1))
    knl = lp.split_iname(knl, "k", 8, slabs=(0, 1))

    knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"])
    knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"])

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            op_count=[2 * n**3 / 1e9], op_label=["GFlops"],
            parameters={"n": n})
def test_variable_size_temporary():
    """Prefetching all of ``a`` under a runtime-sized domain yields a
    variable-length temporary; check that scheduling and code generation
    still succeed."""
    knl = lp.make_kernel(
         ''' { [i,j]: 0<=i,j<n } ''',
         ''' out[i] = sum(j, a[i,j])''')

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    knl = lp.add_prefetch(knl, "a[:,:]", default_tag=None)

    # Code generation must cope with the variable-length array.
    knl = lp.preprocess_kernel(knl)
    for scheduled_knl in lp.generate_loop_schedules(knl):
        lp.generate_code(scheduled_knl)
def test_prefetch_through_indirect_access():
    """Prefetching ``map1[:, j]`` would require the footprint of an access
    routed through the data-dependent index ``indirect[i]``; loopy cannot
    compute that and must raise LoopyError."""
    knl = lp.make_kernel("{[i, j, k]: 0 <= i,k < 10 and 0<=j<2}",
            """
            for i, j, k
                a[map1[indirect[i], j], k] = 2
            end
            """,
            [
                lp.GlobalArg("a", strides=(2, 1), dtype=int),
                lp.GlobalArg("map1", shape=(10, 10), dtype=int),
                "..."
                ],
            target=lp.CTarget())

    knl = lp.prioritize_loops(knl, "i,j,k")

    with pytest.raises(LoopyError):
        knl = lp.add_prefetch(knl, "map1[:, j]")
def test_numba_cuda_target():
    """Pairwise-distance kernel generated for the Numba CUDA target;
    exercises code generation (printed, not executed)."""
    knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
        "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
        target=lp.NumbaCudaTarget())

    knl = lp.assume(knl, "M>0")
    knl = lp.split_iname(knl, "i", 16, outer_tag='g.0')
    knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1))
    knl = lp.add_prefetch(knl, "X[i,:]", default_tag="l.auto")
    knl = lp.fix_parameters(knl, N=3)
    knl = lp.prioritize_loops(knl, "i_inner,j_outer")
    knl = lp.tag_inames(knl, "k:unr")
    knl = lp.tag_array_axes(knl, "X", "N0,N1")

    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})

    print(lp.generate_code_v2(knl).all_code())
def test_generate_c_snippet():
    """Generate a plain-C body (CTarget) for two small quadrature sums;
    the disabled branch shows how prefetching would be applied once the
    sizes are fixed."""
    from loopy.target.c import CTarget
    from pymbolic import var

    I = var("I")  # noqa
    f = var("f")
    df = var("df")
    q_v = var("q_v")
    eN = var("eN")  # noqa
    k = var("k")
    u = var("u")

    from functools import partial
    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    Instr = lp.Assignment  # noqa

    knl = lp.make_kernel(
            "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
            [
                Instr(f[I], l_sum(k, q_v[k, I]*u)),
                Instr(df[I], l_sum(k, q_v[k, I])),
                ],
            [
                lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
                lp.GlobalArg("f,df", np.float64, shape="nSpace"),
                lp.ValueArg("u", np.float64),
                "...",
                ],
            target=CTarget(),
            assumptions="nQuad>=1")

    if 0:  # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.prioritize_loops(knl, "I,k_outer,k_inner")

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    print(lp.generate_body(knl))
def test_generate_c_snippet():
    """Generate a plain-C body (CTarget) for two small quadrature sums;
    the disabled branch shows how prefetching would be applied once the
    sizes are fixed.

    NOTE(review): near-duplicate of the sibling test of the same name,
    differing only in using ``set_loop_priority`` instead of
    ``prioritize_loops`` -- presumably kept across an API rename; confirm.
    """
    from loopy.target.c import CTarget
    from pymbolic import var

    I = var("I")  # noqa
    f = var("f")
    df = var("df")
    q_v = var("q_v")
    eN = var("eN")  # noqa
    k = var("k")
    u = var("u")

    from functools import partial
    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    Instr = lp.Assignment  # noqa

    knl = lp.make_kernel(
            "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
            [
                Instr(f[I], l_sum(k, q_v[k, I]*u)),
                Instr(df[I], l_sum(k, q_v[k, I])),
                ],
            [
                lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
                lp.GlobalArg("f,df", np.float64, shape="nSpace"),
                lp.ValueArg("u", np.float64),
                "...",
                ],
            target=CTarget(),
            assumptions="nQuad>=1")

    if 0:  # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.set_loop_priority(knl, "I,k_outer,k_inner")

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    print(lp.generate_body(knl))
def tune_grid_kernel(knl, shape=None, ng=None, prefetch_args=()):
    """Fix grid-shape parameters on *knl* and apply the standard 3D tiling.

    :arg shape: optional sequence of grid extents; 3 entries fix
        (n1, n2, n3), 4 entries fix (ndim, n1, n2, n3)
    :arg ng: optional value for the ``ng`` parameter (presumably a
        ghost-zone count -- confirm against the kernels this is used with)
    :arg prefetch_args: array names to prefetch over the inner 3D tile

    NOTE(review): relies on module-level ``lsize``, ``otags`` and ``itags``
    for the split sizes and iname tags -- defined elsewhere in this module.
    """
    if shape is not None:
        if len(shape) > 3:
            knl = _lp.fix_parameters(knl, ndim=int(shape[0]), n1=int(shape[1]),
                                     n2=int(shape[2]), n3=int(shape[3]))
        else:
            knl = _lp.fix_parameters(knl, n1=int(shape[0]), n2=int(shape[1]),
                                     n3=int(shape[2]))
    if ng is not None:
        knl = _lp.fix_parameters(knl, ng=ng)

    # k is the slowest axis, i the fastest; tags come from module config
    knl = _lp.split_iname(knl, "k", lsize[0],
                          outer_tag=otags[0], inner_tag=itags[0])
    knl = _lp.split_iname(knl, "j", lsize[1],
                          outer_tag=otags[1], inner_tag=itags[1])
    knl = _lp.split_iname(knl, "i", lsize[2],
                          outer_tag=otags[2], inner_tag=itags[2])
    for arg in prefetch_args:
        knl = _lp.add_prefetch(knl, arg, "i_inner,j_inner,k_inner",
                               default_tag="l.auto")
    return knl