def __get_knl():
    """Build the ``cache_test`` kernel that assigns ``b[i]`` to ``a[i]``.

    The loop domain is ``0 <= i < 10`` and the kernel is created for an
    ``ExecutableCTarget`` instance.

    Returns:
        Whatever ``lp.make_kernel`` returns for this description.
    """
    domain = '{[i]: 0 <= i < 10}'
    instructions = """ a[i] = b[i] """
    kernel_args = [
        lp.GlobalArg('a', shape=(10,), dtype=np.int32),
        # NOTE(review): ``(10)`` is the plain int 10, not a 1-tuple like the
        # shape given for ``a`` -- confirm that this is intended.
        lp.ConstantArg('b', shape=(10)),
    ]
    return lp.make_kernel(
        domain,
        instructions,
        kernel_args,
        target=ExecutableCTarget(),
        name='cache_test',
    )
def __get_knl():
    """Return a kernel named ``cache_test`` computing ``a[i] = b[i]``.

    ``a`` is declared as a global int32 array and ``b`` as a constant
    argument; ``i`` ranges over ``0 <= i < 10``. The kernel is built for an
    ``ExecutableCTarget`` instance.
    """
    arg_a = lp.GlobalArg("a", shape=(10,), dtype=np.int32)
    # NOTE(review): ``shape=(10)`` passes the int 10 rather than the tuple
    # ``(10,)`` used for ``a`` -- confirm that this is intended.
    arg_b = lp.ConstantArg("b", shape=(10))
    c_target = ExecutableCTarget()
    return lp.make_kernel(
        "{[i]: 0 <= i < 10}",
        """ a[i] = b[i] """,
        [arg_a, arg_b],
        target=c_target,
        name="cache_test",
    )
def test_function_decl_extractor():
    """Run ``a[i] = b[i] + v`` with array, constant and by-value arguments.

    The kernel is built for ``ExecutableCTarget`` and called with
    ``b = arange(10)`` and ``v = -1``; element 1 of the call's return value
    (presumably the kernel outputs -- confirm) must equal ``arange(10) - 1``.
    """
    # ensure that we can tell the difference between pointers, constants, etc.
    # in execution
    from loopy.target.c import ExecutableCTarget

    kernel_args = [
        lp.GlobalArg('a', shape=(10,), dtype=np.int32),
        # NOTE(review): ``(10)`` is the int 10, not a 1-tuple -- confirm.
        lp.ConstantArg('b', shape=(10)),
        lp.ValueArg('v', dtype=np.int32),
    ]
    knl = lp.make_kernel(
        '{[i]: 0 <= i < 10}',
        """ a[i] = b[i] + v """,
        kernel_args,
        target=ExecutableCTarget(),
    )

    result = knl(b=np.arange(10), v=-1)
    assert np.allclose(result[1], np.arange(10) - 1)
def get_vars(self, cname):
    """Collect the loopy argument/variable declarations for template *cname*.

    The returned list is built in three passes over ``self.templates[cname]``:

    1. every entry of ``comp.parameters`` becomes an ``lp.ConstantArg``;
    2. every entry of ``comp.input``, ``comp.output`` and ``comp.state``
       becomes an ``lp.ValueArg`` when its dims are empty and an
       ``lp.GlobalArg`` otherwise; each dim entry not seen before in this
       pass is first emitted as an int32 ``lp.ValueArg``;
    3. every output of a statement whose ``op_type`` and ``op_cat`` are both
       ``'declaration'`` becomes an ``lp.TemporaryVariable``.

    Element types are read from the ``'type'`` attribute of the matching
    entry in ``comp.edge_info``; shapes come from ``self.get_edge_dims``.

    Args:
        cname: key into ``self.templates``.

    Returns:
        list of the loopy objects described above, in creation order.
    """
    comp = self.templates[cname]
    declared = []

    # Parameters are constant arguments
    for param_name in list(comp.parameters):
        param_edge = comp.edge_info[param_name]
        param_dtype = hu.get_attribute_value(param_edge.attributes['type'])
        param_shape = self.get_edge_dims(param_edge)
        declared.append(
            lp.ConstantArg(param_name, param_dtype, shape=param_shape))

    # TODO: Need to add read only option for inputs
    # Inputs, outputs, and states are passed as kernel arguments.
    arg_names = list(comp.input) + list(comp.output) + list(comp.state)
    seen_dims = []
    for arg_name in arg_names:
        arg_edge = comp.edge_info[arg_name]
        arg_dtype = hu.get_attribute_value(arg_edge.attributes['type'])
        arg_shape = self.get_edge_dims(arg_edge)

        # Emit each dim entry once, ahead of the first array that uses it.
        # NOTE(review): only dims of inputs/outputs/states are registered
        # here, and every dim entry is used as a ValueArg name -- confirm
        # get_edge_dims never yields integer literals.
        for dim in arg_shape:
            if dim in seen_dims:
                continue
            seen_dims.append(dim)
            declared.append(lp.ValueArg(dim, dtype=np.int32))

        if len(arg_shape) > 0:
            arg = lp.GlobalArg(arg_name, dtype=arg_dtype, shape=arg_shape)
        else:
            arg = lp.ValueArg(arg_name, dtype=arg_dtype)
        declared.append(arg)

    # Each flow declaration is a temporary variable declared in the comp scope
    for stmt in comp.statements:
        if not (stmt.op_type == 'declaration'
                and stmt.op_cat == 'declaration'):
            continue
        for decl_name in list(stmt.output):
            decl_edge = comp.edge_info[decl_name]
            decl_shape = self.get_edge_dims(decl_edge)
            decl_dtype = hu.get_attribute_value(decl_edge.attributes['type'])
            declared.append(
                lp.TemporaryVariable(
                    decl_name, dtype=decl_dtype, shape=decl_shape))

    return declared
def test_laplacian_stiffness(ctx_factory):
    """Build the ``lapquad`` kernel and auto-test one transformed variant.

    The kernel is created once, its ``ax_b`` iname is tagged ``"unr"``, and
    that kernel is kept as the reference handed to ``lp.auto_test_vs_ref``.
    Several transformation variants are defined as local functions; each
    returns ``(kernel, loop_priority)``. Only the variants listed in the
    final ``for`` loop (currently ``variant_fig33``) are actually run; the
    others are kept so they can be plugged in by hand.

    Args:
        ctx_factory: callable returning the context whose ``devices[0]`` is
            passed to ``lp.make_kernel`` and which is passed on to
            ``lp.auto_test_vs_ref``.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    dim = 2  # (baked into code)

    Nq = 40  # num. quadrature points (baked into code)
    Nb = 20  # num. basis functions (baked into code)
    Nc = 100  # num. cells (run-time symbolic)

    from pymbolic import var
    Nc_sym = var("Nc")

    device = ctx.devices[0]

    domain = (
        "[Nc] -> {[K,i,j,q, dx_axis, ax_b]: 0<=K<Nc and 0<=i,j<%(Nb)d and 0<=q<%(Nq)d "
        "and 0<= dx_axis, ax_b < %(dim)d}"
    ) % {"Nb": Nb, "Nq": Nq, "dim": dim}

    instructions = [
        "dPsi(ij, dxi) := sum_float32(@ax_b,"
        " jacInv[ax_b,dxi,K,q] * DPsi[ax_b,ij,q])",
        "A[K, i, j] = sum_float32(q, w[q] * jacDet[K,q] * ("
        "sum_float32(dx_axis, dPsi$one(i,dx_axis)*dPsi$two(j,dx_axis))))",
    ]

    kernel_args = [
        lp.GlobalArg(
            "jacInv", dtype, shape=(dim, dim, Nc_sym, Nq), order=order),
        lp.ConstantArg("DPsi", dtype, shape=(dim, Nb, Nq), order=order),
        lp.GlobalArg("jacDet", dtype, shape=(Nc_sym, Nq), order=order),
        lp.ConstantArg("w", dtype, shape=(Nq, ), order=order),
        lp.GlobalArg("A", dtype, shape=(Nc_sym, Nb, Nb), order=order),
        lp.ValueArg("Nc", np.int32, approximately=1000),
    ]

    knl = lp.make_kernel(
        device, domain, instructions, kernel_args,
        name="lapquad", assumptions="Nc>=1")

    knl = lp.tag_inames(knl, {"ax_b": "unr"})

    # Reference kernel: saved before any variant-specific transformation.
    seq_knl = knl

    def variant_fig31(knl):
        # This (mostly) reproduces Figure 3.1.

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        return knl, ["K", "i", "j", "q", "ax_b_insn"]

    def variant_pg4(knl):
        # This (mostly) reproduces the unlabeled code snippet on pg. 4.

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        cell_block = 16
        knl = lp.split_iname(
            knl, "K", cell_block, outer_iname="Ko", inner_iname="Kloc")
        return knl, ["Ko", "Kloc", "i", "j", "q", "ax_b_insn"]

    def variant_fig32(knl):
        # This (mostly) reproduces Figure 3.2.

        cell_block = 16
        knl = lp.split_iname(
            knl, "K", cell_block, outer_iname="Ko", inner_iname="Kloc")
        knl = lp.precompute(
            knl, "dPsi", np.float32, ["i", "q", "dx_axis"],
            default_tag=None)
        knl = lp.tag_inames(knl, {"dx_axis": "unr", "dxi": "unr"})
        return knl, [
            "Ko", "Kloc", "dPsi_q", "ij", "i", "j", "q", "ax_b_insn"]

    def variant_fig33(knl):
        # This is meant to (mostly) reproduce Figure 3.3.

        cell_block = 16
        knl = lp.split_iname(
            knl, "K", cell_block, outer_iname="Ko", inner_iname="Kloc")
        knl = lp.precompute(
            knl, "dPsi$one", np.float32, ["dx_axis"], default_tag=None)
        knl = lp.tag_inames(knl, {"j": "ilp.seq"})
        return knl, ["Ko", "Kloc"]

    def variant_simple_gpu(knl):
        # This is a simple GPU-ish variant.

        # It's not the same thing as Matt's code, but I'll need some more time
        # to reverse-engineer what is going on there. Some discussion might
        # help, too. :)

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        cell_block = 16
        knl = lp.split_iname(
            knl, "K", cell_block,
            outer_iname="Ko", inner_iname="Kloc", outer_tag="g.0")
        knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"})
        return knl, ["K", "i", "j", "q", "ax_b_insn"]

    def variant_simple_gpu_prefetch(knl):
        # This adds prefetching to the GPU variant above.

        # In this variant (on my machine), loopy makes a silly choice
        # for the upper bound of Kloc (it uses Nc). I'll investigate and
        # fix that. (FIXME)

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        cell_block = 16
        knl = lp.split_iname(
            knl, "K", cell_block,
            outer_iname="Ko", inner_iname="Kloc", outer_tag="g.0")
        knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"})
        knl = lp.add_prefetch(knl, "w", ["q"], default_tag="l.auto")
        knl = lp.add_prefetch(knl, "DPsi", [0, 1, 2], default_tag="l.auto")
        knl = lp.add_prefetch(knl, "jacInv", [0, 1, 3], default_tag="l.auto")
        knl = lp.add_prefetch(knl, "jacDet", [1], default_tag="l.auto")
        return knl, ["K", "i", "j", "q", "ax_b_insn"]

    # Plug in variant name here
    #                |
    #                v
    for variant in [variant_fig33]:
        transformed_knl, priority = variant(knl)
        kernel_gen = lp.generate_loop_schedules(
            transformed_knl, loop_priority=priority)
        kernel_gen = lp.check_kernels(kernel_gen, {"Nc": Nc})

        # Debugging aid: print(lp.preprocess_kernel(transformed_knl))

        lp.auto_test_vs_ref(
            seq_knl, ctx, kernel_gen,
            op_count=0, op_label="GFlops",
            parameters={"Nc": Nc}, print_ref_code=True)