def test_vectorize(ctx_factory):
    """Check that a vectorized kernel matches an unrolled reference.

    NOTE: a second ``test_vectorize`` later in this file shadows this one.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    knl = lp.add_and_infer_dtypes(knl, {"b": np.float32})
    knl = lp.set_array_dim_names(knl, "a,b", "i")

    # split the single array axis by 4 so the inner part can be vector-tagged
    knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4,
            split_kwargs={"slabs": (0, 1)})
    knl = lp.tag_data_axes(knl, "a,b", "c,vec")

    ref_knl = knl
    ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})

    knl = lp.tag_inames(knl, {"i_inner": "vec"})
    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 30})
def variant_orig(knl):
    """Original transform variant: hardware tags plus staged prefetches."""
    knl = lp.tag_inames(knl, {"i": "l.0", "j": "l.1", "e": "g.0"})

    knl = lp.add_prefetch(knl, "D[:,:]", fetch_outer_inames='e',
            default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u[e, :, :]", default_tag="l.auto")

    knl = lp.precompute(knl, "ur(m,j)", ["m", "j"], default_tag="l.auto")
    knl = lp.precompute(knl, "us(i,m)", ["i", "m"], default_tag="l.auto")

    # TODO this adds `a` and `b` to domains, which leads to unused inames
    knl = lp.precompute(knl, "Gux(m,j)", ["m", "j"], default_tag="l.auto")
    knl = lp.precompute(knl, "Guy(i,m)", ["i", "m"], default_tag="l.auto")

    knl = lp.add_prefetch(knl, "G$x[:,e,:,:]", default_tag="l.auto")
    knl = lp.add_prefetch(knl, "G$y[:,e,:,:]", default_tag="l.auto")

    knl = lp.tag_inames(knl, {"o": "unr"})
    knl = lp.tag_inames(knl, {"m": "unr"})

    knl = lp.set_instruction_priority(knl, "id:D_fetch", 5)
    print(knl)

    return knl
def test_tim2d(ctx_factory):
    """2D spectral-element Laplacian (tensor-product) test kernel.

    NOTE(review): uses the legacy loopy API (``generate_loop_schedules``,
    ``check_kernels``, device passed to ``make_kernel``); also ``unroll``
    is assigned but never used.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
            [
                # substitution rules for the r/s-direction derivatives
                "ur(a,b) := sum_float32(@o, D[a,o]*u[e,o,b])",
                "us(a,b) := sum_float32(@o, D[b,o]*u[e,a,o])",
                "lap[e,i,j] = "
                " sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j) + G[1,e,m,j]*us(m,j)))"
                "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m) + G[2,e,i,m]*us(i,m)))"
                ],
            [
                lp.ArrayArg("u", dtype, shape=field_shape, order=order),
                lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
                lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
                # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
                lp.ArrayArg("D", dtype, shape=(n, n), order=order),
                # lp.ImageArg("D", dtype, shape=(n, n)),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap2D", assumptions="K>=1")

    unroll = 32

    # keep the untransformed kernel as the reference
    seq_knl = knl

    knl = lp.add_prefetch(knl, "D", ["m", "j", "i","o"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j", "o"], default_tag="l.auto")
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")#, slabs=(0, 1))
    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))

    # knl = lp.add_prefetch(knl, "G", [2,3], default_tag=None) # axis/argument indices on G
    knl = lp.add_prefetch(knl, "G", [2,3], default_tag="l.auto") # axis/argument indices on G

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
def test_vectorize(ctx_factory):
    """Vectorized ("vec") inner iname vs an unrolled reference kernel.

    NOTE: this shadows an identically-named test earlier in the file.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    knl = lp.add_and_infer_dtypes(knl, {"b": np.float32})
    knl = lp.set_array_dim_names(knl, "a,b", "i")
    knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4,
            split_kwargs={"slabs": (0, 1)})
    knl = lp.tag_data_axes(knl, "a,b", "c,vec")

    # reference unrolls where the test kernel vectorizes
    ref_knl = lp.tag_inames(knl, {"i_inner": "unr"})
    knl = lp.tag_inames(knl, {"i_inner": "vec"})

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 30})
def test_unused_hw_axes_in_callee(ctx_factory, inline):
    """A callee not using all of the caller's hardware axes must still work."""
    ctx = ctx_factory()

    twice = lp.make_function(
        "{[i]: 0<=i<10}",
        """
        y[i] = 2*x[i]
        """, name="twice")

    knl = lp.make_kernel(
        "{[i]: 0<=i<10}",
        """
        y[:, i] = twice(x[:, i])
        """,
        [lp.GlobalArg("x", shape=(10, 10), dtype=float),
         lp.GlobalArg("y", shape=(10, 10))],
        name="outer")

    # caller uses l.0, callee uses l.1 only
    twice = lp.tag_inames(twice, {"i": "l.1"})
    knl = lp.tag_inames(knl, {"i": "l.0"})

    knl = lp.merge([knl, twice])
    if inline:
        knl = lp.inline_callable_kernel(knl, "twice")

    lp.auto_test_vs_ref(knl, ctx, knl)
def test_double_hw_axes_used_in_knl_call(inline):
    """Caller and callee both claiming l.0 must raise a LoopyError."""
    from loopy.diagnostic import LoopyError

    twice = lp.make_function(
        "{[i]: 0<=i<10}",
        """
        y[i] = 2*x[i]
        """, name="twice")

    knl = lp.make_kernel(
        "{[i]: 0<=i<10}",
        """
        y[:, i] = twice(x[:, i])
        """,
        [lp.GlobalArg("x", shape=(10, 10), dtype=float),
         lp.GlobalArg("y", shape=(10, 10))],
        name="outer")

    # both levels claim the same local axis -- this conflict must be caught
    twice = lp.tag_inames(twice, {"i": "l.0"})
    knl = lp.tag_inames(knl, {"i": "l.0"})
    knl = lp.merge([knl, twice])

    if inline:
        knl = lp.inline_callable_kernel(knl, "twice")

    with pytest.raises(LoopyError):
        lp.generate_code_v2(knl)
def variant_simple_gpu(knl):
    """Simple GPU-ish variant: block K into groups of 16, map i/j to l.1/l.0.

    It's not the same thing as Matt's code, but I'll need some more time
    to reverse-engineer what is going on there. Some discussion might
    help, too. :)
    """
    knl = lp.tag_inames(knl, {"dx_axis": "unr"})

    n_cloc = 16
    knl = lp.split_iname(knl, "K", n_cloc,
            outer_iname="Ko", inner_iname="Kloc", outer_tag="g.0")
    knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"})

    return knl, ["K", "i", "j", "q", "ax_b_insn"]
def transform_loopy_program(self, program):
    """Apply a simple parallelization to *program*: g.0 on the element-ish
    outer iname and l.0 (split by 16) on the dof-ish inner iname, when
    those inames can be identified; otherwise return *program* unchanged.
    """
    # FIXME: This could be much smarter.
    import loopy as lp

    # accommodate loopy with and without kernel callables
    try:
        inames = program.all_inames()
    except AttributeError:
        inames = program.root_kernel.all_inames()

    if "iel" in inames:
        outer_iname = "iel"
        inner_iname = "idof" if "idof" in inames else None
    elif "i0" in inames:
        outer_iname = "i0"
        inner_iname = "i1" if "i1" in inames else None
    else:
        # cannot "fit" the optimization strategy for the provided kernel
        # => bail
        return program

    if inner_iname is not None:
        program = lp.split_iname(program, inner_iname, 16, inner_tag="l.0")

    return lp.tag_inames(program, {outer_iname: "g.0"})
def test_precompute_nested_subst(ctx_factory):
    """Precompute of a rule that uses another rule must retarget inames."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        E:=a[i]
        D:=E*E
        b[i] = D
        """)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    ref_knl = knl

    knl = lp.tag_inames(knl, {"j": "g.1"})
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
    knl = lp.precompute(knl, "D", "i_inner")

    # There's only one surviving 'E' rule.
    surviving_e_rules = [name for name in knl.substitutions
            if name.startswith("E")]
    assert len(surviving_e_rules) == 1

    # That rule should use the newly created prefetch inames,
    # not the prior 'i_inner'
    assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 12345})
def element_centers_of_mass(discr):
    """Return per-axis arrays of element "centers of mass" for *discr*.

    Averages the unit nodes of every element; returns a tuple of
    ``ambient_dim`` arrays, one per coordinate axis, each of length
    ``nelements``.
    """
    knl = lp.make_kernel(
        """{[dim,k,i]: 0<=dim<ndims and 0<=k<nelements and 0<=i<nunit_nodes}""",
        """
        panels[dim, k] = sum(i, nodes[dim, k, i])/nunit_nodes
        """,
        default_offset=lp.auto, name="find_panel_centers_of_mass",
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    knl = lp.fix_parameters(knl, ndims=discr.ambient_dim)

    knl = lp.split_iname(knl, "k", 128, inner_tag="l.0", outer_tag="g.0")
    knl = lp.tag_inames(knl, dict(dim="ilp"))

    with cl.CommandQueue(discr.cl_context) as queue:
        mesh = discr.mesh
        panels = cl.array.empty(queue, (mesh.ambient_dim, mesh.nelements),
                dtype=discr.real_dtype)

        # run once per element group, writing into a sub-view of `panels`
        # (el_view presumably selects the group's slice -- TODO confirm)
        for group_nr, group in enumerate(discr.groups):
            _, (result,) = knl(queue,
                    nelements=group.nelements,
                    nunit_nodes=group.nunit_nodes,
                    nodes=group.view(discr.nodes()),
                    panels=el_view(discr, group_nr, panels))

        # wait for all kernel launches to land before detaching the queue
        panels.finish()
        panels = panels.with_queue(None)

    return tuple(panels[d, :] for d in range(mesh.ambient_dim))
def test_global_parallel_reduction_simpler(ctx_factory, size):
    """Two-stage global reduction over philox-generated values (xfailed)."""
    ctx = ctx_factory()

    pytest.xfail("very sensitive to kernel ordering, fails unused hw-axis check")

    knl = lp.make_kernel(
        "{[l,g,j]: 0 <= l < nl and 0 <= g,j < ng}",
        """
        <> key = make_uint2(l+nl*g, 1234) {inames=l:g}
        <> ctr = make_uint4(0, 1, 2, 3) {inames=l:g,id=init_ctr}
        <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr}
        <> tmp[g] = sum(l, vals.s0 + 1j*vals.s1 + vals.s2 + 1j*vals.s3)
        result = sum(j, tmp[j])
        """)

    ng = 50
    knl = lp.fix_parameters(knl, ng=ng)
    knl = lp.set_options(knl, write_cl=True)

    ref_knl = knl

    knl = lp.split_iname(knl, "l", 128, inner_tag="l.0")
    knl = lp.split_reduction_outward(knl, "l_inner")
    knl = lp.tag_inames(knl, "g:g.0,j:l.0")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nl": size})
def get_kernel(self):
    """Build the loopy kernel that computes proxy centers and radii.

    For each source range, the kernel computes the range's center of
    mass, the radius ``rblk`` of the ball enclosing the sources, and the
    radius ``rqbx`` enclosing the interior/exterior QBX centers (plus
    their expansion radii), then blends them per ``self.radius_factor``.
    """
    if self.radius_factor < 1.0:
        # blend between the source-ball radius and the QBX-center radius
        radius_expr = "(1.0 - {f}) * rblk + {f} * rqbx"
    else:
        radius_expr = "{f} * rqbx"
    radius_expr = radius_expr.format(f=self.radius_factor)

    # NOTE: centers of mass are computed using a second-order approximation
    knl = lp.make_kernel([
        "{[irange]: 0 <= irange < nranges}",
        "{[i]: 0 <= i < npoints}",
        "{[idim]: 0 <= idim < dim}"
        ],
        ["""
        for irange
            <> ioffset = srcranges[irange]
            <> npoints = srcranges[irange + 1] - srcranges[irange]

            proxy_center[idim, irange] = 1.0 / npoints * \
                    reduce(sum, i, sources[idim, srcindices[i + ioffset]]) \
                    {{dup=idim:i}}

            <> rblk = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
                    (proxy_center[idim, irange] -
                     sources[idim, srcindices[i + ioffset]]) ** 2)))

            <> rqbx_int = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
                    (proxy_center[idim, irange] -
                     center_int[idim, srcindices[i + ioffset]]) ** 2)) + \
                    expansion_radii[srcindices[i + ioffset]])
            <> rqbx_ext = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
                    (proxy_center[idim, irange] -
                     center_ext[idim, srcindices[i + ioffset]]) ** 2)) + \
                    expansion_radii[srcindices[i + ioffset]])
            <> rqbx = rqbx_int if rqbx_ext < rqbx_int else rqbx_ext

            proxy_radius[irange] = {radius_expr}
        end
        """.format(radius_expr=radius_expr)],
        [
            lp.GlobalArg("sources", None,
                shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
            lp.GlobalArg("center_int", None,
                shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
            lp.GlobalArg("center_ext", None,
                shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
            lp.GlobalArg("proxy_center", None,
                shape=(self.ambient_dim, "nranges")),
            lp.GlobalArg("proxy_radius", None, shape="nranges"),
            # `np.int` was removed in NumPy 1.24 (it was just an alias for
            # the builtin `int`); use an explicit fixed-width type instead.
            lp.ValueArg("nsources", np.int64),
            "..."
        ],
        name="find_proxy_radii_knl",
        assumptions="dim>=1 and nranges>=1",
        fixed_parameters=dict(dim=self.ambient_dim),
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    knl = lp.tag_inames(knl, "idim*:unr")

    return knl
def test_ilp_write_race_detection_global(ctx_factory):
    """An ILP-tagged iname writing the same global entry must warn."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "[n] -> {[i,j]: 0<=i,j<n }",
        ["a[i] = 5+i+j"],
        [
            lp.GlobalArg("a", np.float32),
            lp.ValueArg("n", np.int32, approximately=1000),
        ],
        assumptions="n>=1")

    # j is ILP-tagged but a[i] does not depend on j => write race
    knl = lp.tag_inames(knl, {"j": "ilp"})
    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    with lp.CacheMode(False):
        from loopy.diagnostic import WriteRaceConditionWarning
        from warnings import catch_warnings
        with catch_warnings(record=True) as warn_list:
            list(lp.generate_loop_schedules(knl))

        assert any(isinstance(w.message, WriteRaceConditionWarning)
                for w in warn_list)
def variant_simple_padding(knl):
    """Pad axis 0 of every field and RHS argument to 32-byte alignment."""
    knl = lp.tag_inames(knl, {"n": "l.0"})
    knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")

    # pad u, rhsu, v, rhsv, ... in that order
    for base in ["u", "v", "w", "p"]:
        for prefix in ["", "rhs"]:
            knl = lp.add_padding(knl, prefix + base, axis=0, align_bytes=32)

    knl = lp.tag_inames(knl, {"m": "unr"})
    return knl
def variant_prefetch_fields(knl):
    """Prefetch one k-row of each field array."""
    knl = lp.tag_inames(knl, {"n": "l.0"})
    knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
    for field in ["u", "v", "w", "p"]:
        knl = lp.add_prefetch(knl, "%s[k,:]" % field, ["k_inner"])
    return knl
def knl():
    """Build the resampling-by-matrix ("oversample") kernel."""
    import loopy as lp

    oversample_knl = lp.make_kernel(
        """{[k,i,j]: 0<=k<nelements and 0<=i<n_to_nodes and 0<=j<n_from_nodes}""",
        "result[target_element_indices[k], i] \
                = sum(j, resample_mat[i, j] \
                * vec[source_element_indices[k], j])",
        [
            lp.GlobalArg("result", None,
                shape="nelements_result, n_to_nodes",
                offset=lp.auto),
            lp.GlobalArg("vec", None,
                shape="nelements_vec, n_from_nodes",
                offset=lp.auto),
            lp.ValueArg("nelements_result", np.int32),
            lp.ValueArg("nelements_vec", np.int32),
            "...",
        ],
        name="oversample")

    oversample_knl = lp.split_iname(oversample_knl, "i", 16, inner_tag="l.0")
    return lp.tag_inames(oversample_knl, {"k": "g.0"})
def variant_prefetch_d(knl):
    """Prefetch the whole differentiation matrix once per outer k block."""
    knl = lp.tag_inames(knl, {"n": "l.0"})
    knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
    knl = lp.add_prefetch(knl, "DrDsDt[:,:]",
            fetch_outer_inames="k_outer", default_tag="l.auto")
    return knl
def variant_2(knl):
    """Tile the image 16x16 across work groups, one group axis per feature."""
    knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.tag_inames(knl, {"ifeat": "g.2"})

    # stage the filter and the image tile in local memory
    knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]")
    knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y")
    return knl
def test_nested_scan(ctx_factory, i_tag, j_tag):
    """Realize a scan nested inside another scan and run it."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        [
            "[n] -> {[i]: 0 <= i < n}",
            "[i] -> {[j]: 0 <= j <= i}",
            "[i] -> {[k]: 0 <= k <= i}"
        ],
        """
        <>tmp[i] = sum(k, 1)
        out[i] = sum(j, tmp[j])
        """)

    knl = lp.fix_parameters(knl, n=10)
    knl = lp.tag_inames(knl, {"i": i_tag, "j": j_tag})
    knl = lp.realize_reduction(knl, force_scan=True)

    print(knl)

    evt, (out,) = knl(queue)
    print(out)
def pick_knl():
    """Build the resample-by-picking kernel."""
    import loopy as lp

    pick = lp.make_kernel(
        """{[k,i,j]: 0<=k<nelements and 0<=i<n_to_nodes}""",
        "result[to_element_indices[k], i] \
                = vec[from_element_indices[k], pick_list[i]]",
        [
            lp.GlobalArg("result", None,
                shape="nelements_result, n_to_nodes",
                offset=lp.auto),
            lp.GlobalArg("vec", None,
                shape="nelements_vec, n_from_nodes",
                offset=lp.auto),
            lp.ValueArg("nelements_result", np.int32),
            lp.ValueArg("nelements_vec", np.int32),
            lp.ValueArg("n_from_nodes", np.int32),
            "...",
        ],
        name="resample_by_picking")

    pick = lp.split_iname(pick, "i", 16, inner_tag="l.0")
    return lp.tag_inames(pick, {"k": "g.0"})
def test_vector_types(ctx_factory, vec_len):
    """Vector-tagged output axis of length *vec_len* vs plain reference."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{ [i,j]: 0<=i<n and 0<=j<vec_len }",
        "out[i,j] = 2*a[i,j]",
        [
            lp.GlobalArg("a", np.float32, shape=lp.auto),
            lp.GlobalArg("out", np.float32, shape=lp.auto),
            "..."
        ])

    knl = lp.fix_parameters(knl, vec_len=vec_len)

    ref_knl = knl

    knl = lp.tag_data_axes(knl, "out", "c,vec")
    knl = lp.tag_inames(knl, {"j": "unr"})
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 20000})
def parallelize(self, knl, lsize):
    """Map the histogramming kernel's loops onto the device work grid."""
    block = lsize[0]

    # (iname, split kwargs); `within` restricts each split to the
    # instructions it applies to.
    splits = [
        # global hist is zeroed in its own kernel
        ("bb", dict(outer_tag="g.0", inner_tag="l.0",
            within="id:zero_hist*")),
        # outer_tag of loops writing/reading temp_hist cannot be a global index
        ("bb", dict(inner_tag="l.0", within="id:zero_temp*")),
        ("k", dict(inner_tag="l.0", within="not id:zero* and not id:glb*")),
        ("b", dict(inner_tag="l.0", within="id:glb*")),
    ]
    for iname, kwargs in splits:
        knl = lp.split_iname(knl, iname, block, **kwargs)

    return lp.tag_inames(knl, "j:g.0")
def pick_knl():
    """Build the resample-by-picking kernel (pinned language version)."""
    import loopy as lp

    kernel_args = [
        lp.GlobalArg("result", None,
            shape="nelements_result, n_to_nodes", offset=lp.auto),
        lp.GlobalArg("vec", None,
            shape="nelements_vec, n_from_nodes", offset=lp.auto),
        lp.ValueArg("nelements_result", np.int32),
        lp.ValueArg("nelements_vec", np.int32),
        lp.ValueArg("n_from_nodes", np.int32),
        "...",
    ]

    pick = lp.make_kernel(
        """{[k,i,j]: 0<=k<nelements and 0<=i<n_to_nodes}""",
        "result[to_element_indices[k], i] \
                = vec[from_element_indices[k], pick_list[i]]",
        kernel_args,
        name="resample_by_picking",
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    pick = lp.split_iname(pick, "i", 16, inner_tag="l.0")
    return lp.tag_inames(pick, {"k": "g.0"})
def mat_knl():
    """Build the resample-by-matrix kernel (pinned language version)."""
    import loopy as lp

    mat = lp.make_kernel(
        """{[k,i,j]: 0<=k<nelements and 0<=i<n_to_nodes and 0<=j<n_from_nodes}""",
        "result[to_element_indices[k], i] \
                = sum(j, resample_mat[i, j] \
                * vec[from_element_indices[k], j])",
        [
            lp.GlobalArg("result", None,
                shape="nelements_result, n_to_nodes",
                offset=lp.auto),
            lp.GlobalArg("vec", None,
                shape="nelements_vec, n_from_nodes",
                offset=lp.auto),
            lp.ValueArg("nelements_result", np.int32),
            lp.ValueArg("nelements_vec", np.int32),
            "...",
        ],
        name="resample_by_mat",
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    mat = lp.split_iname(mat, "i", 16, inner_tag="l.0")
    return lp.tag_inames(mat, {"k": "g.0"})
def test_precompute_nested_subst(ctx_factory):
    """Precompute of a nested substitution rule must rewrite inner rules."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        E:=a[i]
        D:=E*E
        b[i] = D
        """)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    ref_knl = knl

    knl = lp.tag_inames(knl, {"j": "g.1"})
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
    knl = lp.precompute(knl, "D", "i_inner")

    # There's only one surviving 'E' rule.
    n_e_rules = sum(1 for rule_name in knl.substitutions
            if rule_name.startswith("E"))
    assert n_e_rules == 1

    # That rule should use the newly created prefetch inames,
    # not the prior 'i_inner'
    assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 12345})
def test_precompute_confusing_subst_arguments(ctx_factory):
    """Precompute a rule whose argument name clashes with a kernel iname."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        D(i):=a[i+1]-a[i]
        b[i,j] = D(j)
        """)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    ref_knl = knl

    knl = lp.tag_inames(knl, {"j": "g.1"})
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
    knl = lp.precompute(knl, "D", sweep_inames="j",
            precompute_outer_inames="j, i_inner, i_outer")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": 12345})
def test_gmem_access_counter_consec():
    """Count consecutive global-memory accesses per dtype and direction."""
    knl = lp.make_kernel(
        "[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
        [
            """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
            e[i, k] = g[i,k]*(2+h[i,k])
            """
        ],
        name="consec", assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    # k (the last index of every access) is the local axis => consecutive
    knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})

    poly = lp.get_gmem_access_poly(knl)
    n = 512
    m = 256
    # renamed from `l`: single-letter `l` is easily confused with `1` (E741);
    # the kernel *parameter* is still named "l", so the dict key stays "l".
    ell = 128
    params = {'n': n, 'm': m, 'l': ell}

    f64consec = poly[
        (np.dtype(np.float64), 'consecutive', 'load')
        ].eval_with_dict(params)
    f32consec = poly[
        (np.dtype(np.float32), 'consecutive', 'load')
        ].eval_with_dict(params)
    assert f64consec == 2 * n * m
    assert f32consec == 3 * n * m * ell

    f64consec = poly[
        (np.dtype(np.float64), 'consecutive', 'store')
        ].eval_with_dict(params)
    f32consec = poly[
        (np.dtype(np.float32), 'consecutive', 'store')
        ].eval_with_dict(params)
    assert f64consec == n * m
    assert f32consec == n * m * ell
def test_split_iname2(self):
    "Split useful for omp simd inner loop"
    # split i by 8, then mark the inner part for ILP-style unrolling
    result = lp.split_iname(self.knl, "i", 8)
    result = lp.tag_inames(result, {"i_inner": "ilp.unr"})
    print(self._dtype_and_code(result))
def variant_pg4(knl):
    """(Mostly) reproduce the unlabeled code snippet on pg. 4."""
    knl = lp.tag_inames(knl, {"dx_axis": "unr"})
    block = 16
    knl = lp.split_iname(knl, "K", block,
            outer_iname="Ko", inner_iname="Kloc")
    return knl, ["Ko", "Kloc", "i", "j", "q", "ax_b_insn"]
def test_parallel_multi_output_reduction():
    """argmax over a parallel (l.0) iname with two outputs must realize."""
    knl = lp.make_kernel(
        "{[i]: 0<=i<128}",
        """
        max_val, max_indices = argmax(i, fabs(a[i]))
        """)
    knl = lp.tag_inames(knl, {"i": "l.0"})
    knl = lp.realize_reduction(knl)
    print(knl)
def knl():
    """Build the quadrature-weights broadcast kernel."""
    weights_knl = lp.make_kernel(
        "{[k,i]: 0<=k<nelements and 0<=i<ndiscr_nodes}",
        "result[k,i] = weights[i]",
        name="quad_weights")

    weights_knl = lp.split_iname(weights_knl, "i", 16, inner_tag="l.0")
    return lp.tag_inames(weights_knl, {"k": "g.0"})
def test_interp_diff(ctx_factory): 1/0 # not ready dtype = np.float32 ctx = ctx_factory() order = "C" N = 8 M = 8 from pymbolic import var K_sym = var("K") field_shape = (N, N, N, K_sym) interim_field_shape = (M, M, M, K_sym) # 1. direction-by-direction similarity transform on u # 2. invert diagonal # 3. transform back (direction-by-direction) # K - run-time symbolic knl = lp.make_kernel(ctx.devices[0], "[K] -> {[i,ip,j,jp,k,kp,e]: 0<=i,j,k<%d AND 0<=ip,jp,kp<%d 0<=e<K}" %M %N [ "[|i,jp,kp] <float32> u1[i ,jp,kp,e] = sum_float32(ip, I[i,ip]*u [ip,jp,kp,e])", "[|i,j ,kp] <float32> u2[i ,j ,kp,e] = sum_float32(jp, I[j,jp]*u1[i ,jp,kp,e])", "[|i,j ,k ] <float32> u3[i ,j ,k ,e] = sum_float32(kp, I[k,kp]*u2[i ,j ,kp,e])", "[|i,j ,k ] <float32> Pu[i ,j ,k ,e] = P[i,j,k,e]*u3[i,j,k,e]", "[|i,j ,kp] <float32> Pu3[i ,j ,kp,e] = sum_float32(k, V[kp,k]*Pu[i ,j , k,e])", "[|i,jp,kp] <float32> Pu2[i ,jp,kp,e] = sum_float32(j, V[jp,j]*Pu[i ,j ,kp,e])", "Pu[ip,jp,kp,e] = sum_float32(i, V[ip,i]*Pu[i ,jp,kp,e])", ], [ lp.GlobalArg("u", dtype, shape=field_shape, order=order), lp.GlobalArg("P", dtype, shape=interim_field_shape, order=order), lp.GlobalArg("I", dtype, shape=(M, N), order=order), lp.GlobalArg("V", dtype, shape=(N, M), order=order), lp.GlobalArg("Pu", dtype, shape=field_shape, order=order), lp.ValueArg("K", np.int32, approximately=1000), ], name="sem_lap_precon", assumptions="K>=1") print(knl) 1/0 knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1)) knl = lp.tag_inames(knl, dict(i="l.0", j="l.1")) print(knl) #1/0 kernel_gen = lp.generate_loop_schedules(knl) kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5) lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, op_count=0, op_label="GFlops", parameters={"K": K}, print_seq_code=True,)
def transform_surface_kernel(knl):
    """Parallelize and add prefetches to the surface-flux kernel."""
    #print knl
    # NOTE(review): both `n` and `m` are tagged "l.0" -- presumably loops
    # of equal length meant to share the local axis; confirm intent.
    knl = lp.tag_inames(knl, dict(k="g.0", n="l.0", m="l.0"))
    knl = lp.split_iname(knl, "mp", 4, inner_tag="unr")
    knl = lp.add_prefetch(knl, "LIFT")
    # stage the geometry/boundary inputs in local memory
    for name in ["nx", "ny", "nz", "Fscale", "bc"]:
        knl = lp.add_prefetch(knl, name)
    # run mp_outer before mp_inner, keeping the unrolled part innermost
    knl = lp.set_loop_priority(knl, "mp_outer,mp_inner")
    return knl
def variant_simple_gpu_prefetch(knl):
    """GPU variant plus local-memory prefetches of all read-only data.

    In this variant (on my machine), loopy makes a silly choice
    for the upper bound of Kloc (it uses Nc). I'll investigate and
    fix that. (FIXME)
    """
    knl = lp.tag_inames(knl, {"dx_axis": "unr"})

    n_cloc = 16
    knl = lp.split_iname(knl, "K", n_cloc,
            outer_iname="Ko", inner_iname="Kloc", outer_tag="g.0")
    knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"})

    for arg, axes in [
            ("w", ["q"]),
            ("DPsi", [0, 1, 2]),
            ("jacInv", [0, 1, 3]),
            ("jacDet", [1])]:
        knl = lp.add_prefetch(knl, arg, axes, default_tag="l.auto")

    return knl, ["K", "i", "j", "q", "ax_b_insn"]
def knl():
    """Build the quadrature-weights broadcast kernel (auto offsets)."""
    weights_knl = lp.make_kernel(
        "{[k,i]: 0<=k<nelements and 0<=i<ndiscr_nodes}",
        "result[k,i] = weights[i]",
        name="quad_weights",
        default_offset=lp.auto)

    weights_knl = lp.split_iname(weights_knl, "i", 16, inner_tag="l.0")
    return lp.tag_inames(weights_knl, {"k": "g.0"})
def variant_simple_gpu_prefetch(knl):
    """GPU variant plus prefetches of all read-only data (default tags).

    In this variant (on my machine), loopy makes a silly choice
    for the upper bound of Kloc (it uses Nc). I'll investigate and
    fix that. (FIXME)
    """
    knl = lp.tag_inames(knl, {"dx_axis": "unr"})

    n_cloc = 16
    knl = lp.split_iname(knl, "K", n_cloc,
            outer_iname="Ko", inner_iname="Kloc", outer_tag="g.0")
    knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"})

    for arg, axes in [
            ("w", ["q"]),
            ("DPsi", [0, 1, 2]),
            ("jacInv", [0, 1, 3]),
            ("jacDet", [1])]:
        knl = lp.add_prefetch(knl, arg, axes)

    return knl, ["K", "i", "j", "q", "ax_b_insn"]
def knl():
    """Build the per-element reduction kernel for ``reduction_name``
    (taken from the enclosing scope)."""
    import loopy as lp
    red_knl = lp.make_kernel(
        "{[el, idof, jdof]: 0<=el<nelements and 0<=idof,jdof<ndofs}",
        "result[el, idof] = %s(jdof, input[el, jdof])" % reduction_name,
        default_offset=lp.auto,
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    return lp.tag_inames(red_knl, "el:g.0,idof:l.0")
def test_mem_access_counter_nonconsec():
    """Count non-consecutive (strided) global accesses per dtype/direction.

    With i_inner on l.0, the accesses stride by the extents of the
    trailing axes: m for the 2D arrays, m*ell for the 3D ones.
    """
    knl = lp.make_kernel(
        "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
        [
            """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
            e[i, k] = g[i,k]*(2+h[i,k])
            """
        ],
        name="nonconsec", assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    knl = lp.split_iname(knl, "i", 16)
    knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)  # noqa
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}

    # float64 loads (g, h): stride m
    f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
                           stride=Variable('m'),
                           direction='load', variable='g')
                           ].eval_with_dict(params)
    f64nonconsec += mem_map[lp.MemAccess('global', np.float64,
                            stride=Variable('m'),
                            direction='load', variable='h')
                            ].eval_with_dict(params)
    # float32 loads (a twice, b once): stride m*ell
    f32nonconsec = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                           stride=Variable('m') * Variable('ell'),
                           direction='load', variable='a')
                           ].eval_with_dict(params)
    f32nonconsec += mem_map[lp.MemAccess('global', np.dtype(np.float32),
                            stride=Variable('m') * Variable('ell'),
                            direction='load', variable='b')
                            ].eval_with_dict(params)
    assert f64nonconsec == 2 * n * m
    assert f32nonconsec == 3 * n * m * ell

    # stores: one per (i, k) for e, one per (i, j, k) for c
    f64nonconsec = mem_map[lp.MemAccess('global', np.float64,
                           stride=Variable('m'),
                           direction='store', variable='e')
                           ].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess('global', np.float32,
                           stride=Variable('m') * Variable('ell'),
                           direction='store', variable='c')
                           ].eval_with_dict(params)
    assert f64nonconsec == n * m
    assert f32nonconsec == n * m * ell
def test_batched_sparse():
    """Parse a batched CSR SpMV from Fortran and apply GPU-style transforms.

    NOTE(review): the kernel is transformed but never scheduled or run here;
    this only exercises parsing and the transform chain.
    """
    fortran_src = """
        subroutine sparse(rowstarts, colindices, values, m, n, nvecs, nvals, x, y)
          implicit none

          integer rowstarts(m+1), colindices(nvals)
          real*8 values(nvals)
          real*8 x(n, nvecs), y(n, nvecs), rowsum(nvecs)

          integer m, n, rowstart, rowend, length, nvals, nvecs
          integer i, j, k

          do i = 1, m
            rowstart = rowstarts(i)
            rowend = rowstarts(i+1)
            length = rowend - rowstart

            do k = 1, nvecs
              rowsum(k) = 0
            enddo
            do k = 1, nvecs
              do j = 1, length
                rowsum(k) = rowsum(k) + &
                  x(colindices(rowstart+j-1),k)*values(rowstart+j-1)
              end do
            end do
            do k = 1, nvecs
              y(i,k) = rowsum(k)
            end do
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    # one work group per 128-row block, one row per work item
    knl = lp.split_iname(knl, "i", 128)
    knl = lp.tag_inames(knl, {"i_outer": "g.0"})
    knl = lp.tag_inames(knl, {"i_inner": "l.0"})
    # stage the CSR payload in local memory
    knl = lp.add_prefetch(knl, "values", default_tag="l.auto")
    knl = lp.add_prefetch(knl, "colindices", default_tag="l.auto")
    knl = lp.fix_parameters(knl, nvecs=4)
def test_rob_stroud_bernstein(ctx_factory): ctx = ctx_factory() # NOTE: tmp would have to be zero-filled beforehand knl = lp.make_kernel( "{[el, i2, alpha1,alpha2]: \ 0 <= el < nels and \ 0 <= i2 < nqp1d and \ 0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }", """ for el,i2 <> xi = qpts[1, i2] <> s = 1-xi <> r = xi/s <> aind = 0 {id=aind_init} for alpha1 <> w = s**(deg-alpha1) {id=init_w} for alpha2 tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \ {id=write_tmp} w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \ {id=update_w,dep=init_w:write_tmp} aind = aind + 1 \ {id=aind_incr,dep=aind_init:write_tmp:update_w} end end end """, [ # Must declare coeffs to have "no" shape, to keep loopy # from trying to figure it out the shape automatically. lp.GlobalArg("coeffs", None, shape=None), "..." ], assumptions="deg>=0 and nels>=1") knl = lp.fix_parameters(knl, nqp1d=7, deg=4) knl = lp.split_iname(knl, "el", 16, inner_tag="l.0") knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", slabs=(0, 1)) knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) print( lp.CompiledKernel(ctx, knl).get_highlighted_code( dict( qpts=np.float32, coeffs=np.float32, tmp=np.float32, )))
def knl():
    """Build the quadrature-weights broadcast kernel (pinned lang version)."""
    weights_knl = lp.make_kernel(
        "{[k,i]: 0<=k<nelements and 0<=i<ndiscr_nodes}",
        "result[k,i] = weights[i]",
        name="quad_weights",
        default_offset=lp.auto,
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    weights_knl = lp.split_iname(weights_knl, "i", 16, inner_tag="l.0")
    return lp.tag_inames(weights_knl, {"k": "g.0"})
def knl():
    """Build the elementwise differentiation (matrix-vector) kernel."""
    diff_knl = lp.make_kernel(
        """{[k,i,j]: 0<=k<nelements and 0<=i,j<ndiscr_nodes}""",
        "result[k,i] = sum(j, diff_mat[i, j] * vec[k, j])",
        default_offset=lp.auto,
        name="diff")

    diff_knl = lp.split_iname(diff_knl, "i", 16, inner_tag="l.0")
    return lp.tag_inames(diff_knl, {"k": "g.0"})
def variant_fig32(knl):
    """(Mostly) reproduce Figure 3.2: blocked K plus precomputed dPsi."""
    block = 16
    knl = lp.split_iname(knl, "K", block,
            outer_iname="Ko", inner_iname="Kloc")
    knl = lp.precompute(knl, "dPsi", np.float32,
            ["i", "q", "dx_axis"], default_tag=None)
    knl = lp.tag_inames(knl, {"dx_axis": "unr", "dxi": "unr"})
    return knl, ["Ko", "Kloc", "dPsi_q", "ij", "i", "j", "q", "ax_b_insn"]
def variant_fig33(knl):
    """Meant to (mostly) reproduce Figure 3.3: precompute dPsi$one, ILP over j."""
    block = 16
    knl = lp.split_iname(knl, "K", block,
            outer_iname="Ko", inner_iname="Kloc")
    knl = lp.precompute(knl, "dPsi$one", np.float32,
            ["dx_axis"], default_tag=None)
    knl = lp.tag_inames(knl, {"j": "ilp.seq"})
    return knl, ["Ko", "Kloc"]
def get_kernel(self):
    """Build the P2E kernel: form expansions from per-box sources.

    For every source box, accumulates strength-weighted expansion
    coefficients over the box's sources and writes them (shifted by
    ``tgt_base_ibox``) into ``tgt_expansions``.
    """
    ncoeffs = len(self.expansion)

    from sumpy.tools import gather_loopy_source_arguments
    loopy_knl = lp.make_kernel(
        [
            "{[isrc_box]: 0<=isrc_box<nsrc_boxes}",
            "{[isrc,idim]: isrc_start<=isrc<isrc_end and 0<=idim<dim}",
        ],
        # instruction list assembled from: per-box setup, the
        # expansion-specific coefficient instructions, and the writeback
        ["""
        for isrc_box
            <> src_ibox = source_boxes[isrc_box]
            <> isrc_start = box_source_starts[src_ibox]
            <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]

            <> center[idim] = centers[idim, src_ibox] {id=fetch_center}

            for isrc
                <> a[idim] = center[idim] - sources[idim, isrc] {dup=idim}

                <> strength = strengths[isrc]
        """] + self.get_loopy_instructions() + ["""
            end
        """] + ["""
            tgt_expansions[src_ibox-tgt_base_ibox, {coeffidx}] = \
                    simul_reduce(sum, isrc, strength*coeff{coeffidx}) \
                    {{id_prefix=write_expn}}
        """.format(coeffidx=i) for i in range(ncoeffs)] + ["""
        end
        """],
        [
            lp.GlobalArg("sources", None,
                shape=(self.dim, "nsources"), dim_tags="sep,c"),
            lp.GlobalArg("strengths", None, shape="nsources"),
            lp.GlobalArg("box_source_starts,box_source_counts_nonchild",
                None, shape=None),
            lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"),
            lp.ValueArg("rscale", None),
            lp.GlobalArg("tgt_expansions", None,
                shape=("nboxes", ncoeffs), offset=lp.auto),
            lp.ValueArg("nboxes,aligned_nboxes,tgt_base_ibox", np.int32),
            lp.ValueArg("nsources", np.int32),
            "..."
        ] + gather_loopy_source_arguments([self.expansion]),
        name=self.name,
        assumptions="nsrc_boxes>=1",
        silenced_warnings="write_race(write_expn*)",
        default_offset=lp.auto,
        fixed_parameters=dict(dim=self.dim),
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    # let the expansion register anything it needs (dtypes, preambles, ...)
    loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl)
    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")

    return loopy_knl
def transform_vol(knl):
    """Volume kernel: k over groups, nodes on l.0, prefetch all inputs."""
    knl = lp.tag_inames(knl, {"n": "l.0", "k": "g.0"})
    #knl = lp.change_arg_to_image(knl, "DrDsDt")
    # knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")

    # stage the fields and the geometric factors in local memory
    for field in ["u", "v", "w", "p"]:
        knl = lp.add_prefetch(knl, "%s[k,:]" % field)
    for geo_factor in ["drst_dx", "drst_dy", "drst_dz"]:
        knl = lp.add_prefetch(knl, "%s" % geo_factor)

    knl = lp.add_prefetch(knl, "DrDsDt")
    return knl
def variant_gpu(knl):
    """GPU variant: i on groups/work items, blocked j with x prefetches."""
    group_size = 256

    knl = lp.expand_subst(knl)
    knl = lp.split_iname(knl, "i", group_size,
            outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", group_size)

    knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"],
            ["x_fetch_j", "x_fetch_k"], default_tag=None)
    knl = lp.tag_inames(knl, {"x_fetch_k": "unr", "x_fetch_j": "l.0"})
    knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None)

    knl = lp.prioritize_loops(knl, ["j_outer", "j_inner"])
    return knl
def test_rob_stroud_bernstein(ctx_factory): ctx = ctx_factory() # NOTE: tmp would have to be zero-filled beforehand knl = lp.make_kernel( "{[el, i2, alpha1,alpha2]: \ 0 <= el < nels and \ 0 <= i2 < nqp1d and \ 0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }", """ for el,i2 <> xi = qpts[1, i2] <> s = 1-xi <> r = xi/s <> aind = 0 {id=aind_init} for alpha1 <> w = s**(deg-alpha1) {id=init_w} for alpha2 tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \ {id=write_tmp} w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \ {id=update_w,dep=init_w:write_tmp} aind = aind + 1 \ {id=aind_incr,dep=aind_init:write_tmp:update_w} end end end """, [ # Must declare coeffs to have "no" shape, to keep loopy # from trying to figure it out the shape automatically. lp.GlobalArg("coeffs", None, shape=None), "..." ], assumptions="deg>=0 and nels>=1" ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) knl = lp.split_iname(knl, "el", 16, inner_tag="l.0") knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", slabs=(0, 1)) knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) print(lp.CompiledKernel(ctx, knl).get_highlighted_code( dict( qpts=np.float32, coeffs=np.float32, tmp=np.float32, )))
def get_kernel(self):
    """Build the index-list P2P kernel: one (target, source) pair per imat.

    Evaluates every kernel in ``self.kernels`` for each pair given by
    ``tgtindices``/``srcindices`` and writes one result array per kernel.
    """
    loopy_insns, result_names = self.get_loopy_insns_and_result_names()
    kernel_exprs = self.get_kernel_exprs(result_names)
    arguments = (
        self.get_default_src_tgt_arguments()
        + [
            lp.GlobalArg("srcindices", None, shape="nresult"),
            lp.GlobalArg("tgtindices", None, shape="nresult"),
            lp.ValueArg("nresult", None)
        ]
        + [lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
            for i, dtype in enumerate(self.value_dtypes)])

    loopy_knl = lp.make_kernel(
        "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}",
        self.get_kernel_scaling_assignments()
        # NOTE: itgt, isrc need to always be defined in case a statement
        # in loopy_insns or kernel_exprs needs them (e.g. hardcoded in
        # places like get_kernel_exprs)
        + ["""
            for imat
                <> itgt = tgtindices[imat]
                <> isrc = srcindices[imat]

                <> d[idim] = targets[idim, itgt] - sources[idim, isrc]
        """]
        + ["""
                <> is_self = (isrc == target_to_source[itgt])
            """ if self.exclude_self else ""]
        + loopy_insns + kernel_exprs
        + ["""
                result_{i}[imat] = \
                    knl_{i}_scaling * pair_result_{i} \
                        {{id_prefix=write_p2p}}
        """.format(i=iknl)
            for iknl in range(len(self.kernels))]
        + ["end"],
        arguments,
        assumptions="nresult>=1",
        silenced_warnings="write_race(write_p2p*)",
        name=self.name,
        fixed_parameters=dict(dim=self.dim),
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
    loopy_knl = lp.add_dtypes(loopy_knl,
        dict(nsources=np.int32, ntargets=np.int32))

    # let each kernel register anything it needs (dtypes, preambles, ...)
    for knl in self.kernels:
        loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

    return loopy_knl