def test_diamond_tiling(ctx_factory, interactive=False):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    ref_knl = lp.make_kernel(
        "[nx,nt] -> {[ix, it]: 1<=ix<nx-1 and 0<=it<nt}",
        """
        u[ix, it+2] = (
            2*u[ix, it+1]
            + dt**2/dx**2 * (u[ix+1, it+1] - 2*u[ix, it+1] + u[ix-1, it+1])
            - u[ix, it])
        """)

    knl_for_transform = ref_knl

    ref_knl = lp.prioritize_loops(ref_knl, "it, ix")

    import islpy as isl
    m = isl.BasicMap(
        "[nx,nt] -> {[ix, it] -> [tx, tt, tparity, itt, itx]: "
        "16*(tx - tt) + itx - itt = ix - it and "
        "16*(tx + tt + tparity) + itt + itx = ix + it and "
        "0<=tparity<2 and 0 <= itx - itt < 16 and 0 <= itt+itx < 16}")
    knl = lp.map_domain(knl_for_transform, m)
    knl = lp.prioritize_loops(knl, "tt,tparity,tx,itt,itx")

    if interactive:
        nx = 43
        u = np.zeros((nx, 200))
        x = np.linspace(-1, 1, nx)
        dx = x[1] - x[0]
        u[:, 0] = u[:, 1] = np.exp(-100 * x**2)

        u_dev = cl.array.to_device(queue, u)
        knl(queue, u=u_dev, dx=dx, dt=dx)

        u = u_dev.get()
        import matplotlib.pyplot as plt
        plt.imshow(u.T)
        plt.show()
    else:
        types = {"dt,dx,u": np.float64}
        knl = lp.add_and_infer_dtypes(knl, types)
        ref_knl = lp.add_and_infer_dtypes(ref_knl, types)

        lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={
            "nx": 200, "nt": 300, "dx": 1, "dt": 1
            })
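# A minimal, self-contained sketch (not part of the original tests; the kernel
# and names below are illustrative assumptions) of the transform this whole
# collection revolves around: lp.prioritize_loops constrains the nesting order
# the loopy scheduler may choose for the named inames.
def _prioritize_loops_sketch():
    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
        "{[i, j]: 0<=i<n and 0<=j<m}",
        "out[i, j] = 2*a[i, j]",
        target=lp.CTarget())
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    # Request that the j loop end up outside the i loop.
    knl = lp.prioritize_loops(knl, "j,i")
    print(lp.generate_code_v2(knl).device_code())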
def build_loopy_kernel_A_text():
    knl_name = "kernel_tensor_A"
    knl = lp.make_kernel(
        "{ [i,j,k]: 0<=i,j<n and 0<=k<m }",
        """
        A[i,j] = c*sum(k, B[k,i]*B[k,j])
        """,
        name=knl_name,
        assumptions="n >= 1 and m >= 1",
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION,
        target=lp.CTarget())
    knl = lp.add_and_infer_dtypes(knl, {
        "A": np.dtype(np.double),
        "B": np.dtype(np.double),
        "c": np.dtype(np.double)
        })
    knl = lp.fix_parameters(knl, n=3, m=2)
    knl = lp.prioritize_loops(knl, "i,j")
    #print(knl)

    knl_c, knl_h = lp.generate_code_v2(knl).device_code(), str(
        lp.generate_header(knl)[0])

    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(knl_c, replacements)
    knl_h = utils.replace_strings(knl_h, replacements)

    knl_call = "kernel_tensor_A(A, &B[0][0], 1.0/(2.0*Ae));"

    return knl_name, knl_call, knl_c, knl_h
def variant_2(knl):
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
            fetch_bounding_box=True, default_tag="l.auto")
    knl = lp.prioritize_loops(knl, ["a_dim_0_outer", "a_dim_1_outer"])
    return knl
def test_precompute_with_preexisting_inames(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })

    knl = lp.fix_parameters(knl, n=13)

    ref_knl = knl

    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")
    knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
            precompute_inames="ii,jj")

    knl = lp.prioritize_loops(knl, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(
        ref_knl, ctx, knl,
        parameters=dict(E=200))
def test_prefetch_local_into_private():
    # https://gitlab.tiker.net/inducer/loopy/-/issues/210
    n = 32
    m = 32
    n_vecs = 32

    knl = lp.make_kernel(
        """{[k,i,j]: 0<=k<n_vecs and 0<=i<m and 0<=j<n}""",
        """
        result[i,k] = sum(j, mat[i, j] * vec[j, k])
        """,
        kernel_data=[
            lp.GlobalArg("result", np.float32, shape=(m, n_vecs), order="C"),
            lp.GlobalArg("mat", np.float32, shape=(m, n), order="C"),
            lp.GlobalArg("vec", np.float32, shape=(n, n_vecs), order="C")
        ],
        assumptions="n > 0 and m > 0 and n_vecs > 0",
        name="mxm")

    knl = lp.fix_parameters(knl, m=m, n=n, n_vecs=n_vecs)
    knl = lp.prioritize_loops(knl, "i,k,j")

    knl = lp.add_prefetch(
        knl, "mat", "i, j", temporary_name="s_mat", default_tag="for")
    knl = lp.add_prefetch(
        knl, "s_mat", "j", temporary_name="p_mat", default_tag="for")
def test_generate_c_snippet():
    from pymbolic import var

    I = var("I")  # noqa
    f = var("f")
    df = var("df")
    q_v = var("q_v")
    eN = var("eN")  # noqa
    k = var("k")
    u = var("u")

    from functools import partial
    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    Instr = lp.Assignment  # noqa

    knl = lp.make_kernel(
        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        [
            Instr(f[I], l_sum(k, q_v[k, I]*u)),
            Instr(df[I], l_sum(k, q_v[k, I])),
        ],
        [
            lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
            lp.GlobalArg("f,df", np.float64, shape="nSpace"),
            lp.ValueArg("u", np.float64),
            "...",
        ],
        target=CTarget(),
        assumptions="nQuad>=1")

    if 0:  # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.prioritize_loops(knl, "I,k_outer,k_inner")

    print(lp.generate_code_v2(knl))
def variant_2(knl):
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
            fetch_bounding_box=True)
    knl = lp.prioritize_loops(knl, ["a_dim_0_outer", "a_dim_1_outer"])
    return knl
def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array,
        centers_is_obj_array):
    # FIXME specialize/tune for GPU/CPU
    loopy_knl = self.get_kernel()

    if targets_is_obj_array:
        loopy_knl = lp.tag_array_axes(loopy_knl, "tgt", "sep,C")
    if sources_is_obj_array:
        loopy_knl = lp.tag_array_axes(loopy_knl, "src", "sep,C")
    if centers_is_obj_array:
        loopy_knl = lp.tag_array_axes(loopy_knl, "center", "sep,C")

    import pyopencl as cl
    dev = self.context.devices[0]
    if dev.type & cl.device_type.CPU:
        loopy_knl = lp.split_iname(loopy_knl, "itgt", 16,
                outer_tag="g.0", inner_tag="l.0")
        loopy_knl = lp.split_iname(loopy_knl, "isrc", 256)
        loopy_knl = lp.prioritize_loops(loopy_knl,
                ["isrc_outer", "itgt_inner"])
    else:
        from warnings import warn
        warn("don't know how to tune layer potential computation for '%s'"
                % dev)
        loopy_knl = lp.split_iname(loopy_knl, "itgt", 128, outer_tag="g.0")

    return loopy_knl
def variant_1(knl):
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
            default_tag="l.auto")
    knl = lp.prioritize_loops(knl, ["a_dim_0_outer", "a_dim_1_outer"])
    return knl
def test_poisson_fem(ctx_factory):
    # Stolen from Peter Coogan and Rob Kirby for FEM assembly
    ctx = ctx_factory()

    nbf = 5
    nqp = 5
    sdim = 3

    knl = lp.make_kernel(
        "{ [c,i,j,k,ell,ell2,ell3]: "
        "0 <= c < nels and "
        "0 <= i < nbf and "
        "0 <= j < nbf and "
        "0 <= k < nqp and "
        "0 <= ell,ell2 < sdim}",
        """
        dpsi(bf,k0,dir) := \
            simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
        Ael[c,i,j] = \
            J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
        """,
        assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0")

    print(knl)

    knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp)

    ref_knl = knl

    knl = lp.prioritize_loops(knl, ["c", "j", "i", "k"])

    def variant_1(knl):
        knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for')
        knl = lp.prioritize_loops(knl, "c,i,j")
        return knl

    def variant_2(knl):
        knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for')
        knl = lp.prioritize_loops(knl, "c,i,j")
        return knl

    def add_types(knl):
        return lp.add_and_infer_dtypes(knl, dict(
            w=np.float32,
            J=np.float32,
            DPsi=np.float32,
            DFinv=np.float32,
            ))

    for variant in [
            #variant_1,
            variant_2
            ]:
        knl = variant(knl)

        lp.auto_test_vs_ref(
            add_types(ref_knl), ctx, add_types(knl),
            parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7))
def variant_overfetch(knl):
    knl = lp.split_iname(knl, "i", 16, outer_tag="g.1", inner_tag="l.1",
            slabs=(1, 1))
    knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0",
            slabs=(1, 1))
    knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"],
            fetch_bounding_box=True, default_tag="l.auto")
    knl = lp.prioritize_loops(knl, ["a_dim_0_outer", "a_dim_1_outer"])
    return knl
def test_split_iname3(self):
    "Split one of two inames."
    from loopy.target.ispc import ISPCTarget as CTarget
    knl = lp.make_kernel("{[i,j]:0<=i,j<n}",
                         "out[i, j] = in[i, j]",
                         target=CTarget())
    knl = lp.split_iname(knl, 'i', 8)
    knl = lp.prioritize_loops(knl, ['i_outer', 'j', 'i_inner'])
    print(self._dtype_and_code(knl))
def generate(impero_c, args, precision, scalar_type,
             kernel_name="loopy_kernel", index_names=[]):
    """Generates loopy code.

    :arg impero_c: ImperoC tuple with Impero AST and other data
    :arg args: list of loopy.GlobalArgs
    :arg precision: floating-point precision for printing
    :arg scalar_type: type of scalars as C typename string
    :arg kernel_name: function name of the kernel
    :arg index_names: pre-assigned index names
    :returns: loopy kernel
    """
    ctx = LoopyContext()
    ctx.indices = impero_c.indices
    ctx.index_names = defaultdict(lambda: "i", index_names)
    ctx.precision = precision
    ctx.scalar_type = scalar_type
    ctx.epsilon = 10.0 ** (-precision)

    # Create arguments
    data = list(args)
    for i, temp in enumerate(impero_c.temporaries):
        name = "t%d" % i
        if isinstance(temp, gem.Constant):
            data.append(lp.TemporaryVariable(
                name, shape=temp.shape, dtype=temp.array.dtype,
                initializer=temp.array,
                address_space=lp.AddressSpace.LOCAL, read_only=True))
        else:
            shape = tuple([i.extent for i in ctx.indices[temp]]) + temp.shape
            data.append(lp.TemporaryVariable(
                name, shape=shape, dtype=numpy.float64, initializer=None,
                address_space=lp.AddressSpace.LOCAL, read_only=False))
        ctx.gem_to_pymbolic[temp] = p.Variable(name)

    # Create instructions
    instructions = statement(impero_c.tree, ctx)

    # Create domains
    domains = []
    for idx, extent in ctx.index_extent.items():
        inames = isl.make_zero_and_vars([idx])
        domains.append(((inames[0].le_set(inames[idx]))
                        & (inames[idx].lt_set(inames[0] + extent))))

    if not domains:
        domains = [isl.BasicSet("[] -> {[]}")]

    # Create loopy kernel
    knl = lp.make_function(domains, instructions, data, name=kernel_name,
                           target=lp.CTarget(), seq_dependencies=True,
                           silenced_warnings=["summing_if_branches_ops"])

    # Prevent loop interchange by loopy
    knl = lp.prioritize_loops(knl, ",".join(ctx.index_extent.keys()))

    # Help loopy in scheduling by assigning priority to instructions
    insn_new = []
    for i, insn in enumerate(knl.instructions):
        insn_new.append(insn.copy(priority=len(knl.instructions) - i))
    knl = knl.copy(instructions=insn_new)

    return knl
def test_fuse_kernels(ctx_factory):
    fortran_template = """
        subroutine {name}(nelements, ndofs, result, d, q)
          implicit none
          integer e, i, j, k
          integer nelements, ndofs
          real*8 result(nelements, ndofs, ndofs)
          real*8 q(nelements, ndofs, ndofs)
          real*8 d(ndofs, ndofs)
          real*8 prev

          do e = 1,nelements
            do i = 1,ndofs
              do j = 1,ndofs
                do k = 1,ndofs
                  {inner}
                end do
              end do
            end do
          end do
        end subroutine
        """

    xd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,i,k)
        """
    yd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,k,j)
        """

    xderiv, = lp.parse_fortran(
        fortran_template.format(inner=xd_line, name="xderiv"))
    yderiv, = lp.parse_fortran(
        fortran_template.format(inner=yd_line, name="yderiv"))
    xyderiv, = lp.parse_fortran(
        fortran_template.format(inner=(xd_line + "\n" + yd_line),
                                name="xyderiv"))

    knl = lp.fuse_kernels((xderiv, yderiv))
    knl = lp.prioritize_loops(knl, "e,i,j,k")

    assert len(knl.temporary_variables) == 2

    # This is needed for correctness, otherwise ordering could foul things up.
    knl = lp.assignment_to_subst(knl, "prev")
    knl = lp.assignment_to_subst(knl, "prev_0")

    ctx = ctx_factory()
    lp.auto_test_vs_ref(xyderiv, ctx, knl,
                        parameters=dict(nelements=20, ndofs=4))
def variant_gpu(knl):
    knl = lp.expand_subst(knl)
    knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0")
    knl = lp.split_iname(knl, "j", 256)
    knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"],
            ["x_fetch_j", "x_fetch_k"], default_tag=None)
    knl = lp.tag_inames(knl, dict(x_fetch_k="unr", x_fetch_j="l.0"))
    knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None)
    knl = lp.prioritize_loops(knl, ["j_outer", "j_inner"])
    return knl
def test_fuse_kernels(ctx_factory):
    fortran_template = """
        subroutine {name}(nelements, ndofs, result, d, q)
          implicit none
          integer e, i, j, k
          integer nelements, ndofs
          real*8 result(nelements, ndofs, ndofs)
          real*8 q(nelements, ndofs, ndofs)
          real*8 d(ndofs, ndofs)
          real*8 prev

          do e = 1,nelements
            do i = 1,ndofs
              do j = 1,ndofs
                do k = 1,ndofs
                  {inner}
                end do
              end do
            end do
          end do
        end subroutine
        """

    xd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,i,k)
        """
    yd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,k,j)
        """

    xderiv = lp.parse_fortran(
        fortran_template.format(inner=xd_line, name="xderiv"))
    yderiv = lp.parse_fortran(
        fortran_template.format(inner=yd_line, name="yderiv"))
    xyderiv = lp.parse_fortran(
        fortran_template.format(inner=(xd_line + "\n" + yd_line),
                                name="xyderiv"))

    knl = lp.fuse_kernels((xderiv["xderiv"], yderiv["yderiv"]),
                          data_flow=[("result", 0, 1)])
    knl = knl.with_kernel(
        lp.prioritize_loops(knl["xderiv_and_yderiv"], "e,i,j,k"))

    assert len(knl["xderiv_and_yderiv"].temporary_variables) == 2

    ctx = ctx_factory()
    lp.auto_test_vs_ref(xyderiv, ctx, knl,
                        parameters=dict(nelements=20, ndofs=4))
def test_chunk_iname(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        [lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..."],
        assumptions="n>0")

    ref_knl = knl
    knl = lp.chunk_iname(knl, "i", 3, inner_tag="l.0")
    knl = lp.prioritize_loops(knl, "i_outer, i_inner")
    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=130))
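# Companion sketch (illustrative, not from the test suite) contrasting the two
# iname-splitting transforms used throughout this collection: lp.split_iname
# fixes the *inner* extent (i = 16*i_outer + i_inner), while lp.chunk_iname
# fixes the *number* of contiguous chunks the range of i is divided into.
def _split_vs_chunk_sketch():
    import numpy as np
    import loopy as lp

    base = lp.make_kernel("{[i]: 0<=i<n}", "out[i] = 2*a[i]",
                          target=lp.CTarget())
    base = lp.add_and_infer_dtypes(base, {"a": np.float32})

    split = lp.split_iname(base, "i", 16)  # inner extent of 16
    chunk = lp.chunk_iname(base, "i", 3)   # 3 contiguous chunks
    print(lp.generate_code_v2(split).device_code())
    print(lp.generate_code_v2(chunk).device_code())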
def test_assume(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel("{[i]: 0<=i<n}",
                         "a[i] = a[i] + 1",
                         [lp.GlobalArg("a", np.float32, shape="n"), "..."],
                         target=lp.PyOpenCLTarget(ctx.devices[0]))

    knl = lp.split_iname(knl, "i", 16)
    knl = lp.prioritize_loops(knl, "i_outer,i_inner")
    knl = lp.assume(knl, "n mod 16 = 0")
    knl = lp.assume(knl, "n > 10")

    code = lp.generate_code_v2(knl).device_code()
    assert "if" not in code
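# Counterpoint sketch to test_assume above (illustrative, not from the test
# suite): without the divisibility assumption on n, the split is expected to
# leave a guarded remainder block, so an "if" should appear in the output.
def _assume_counterpoint_sketch():
    import numpy as np
    import loopy as lp

    knl = lp.make_kernel("{[i]: 0<=i<n}",
                         "a[i] = a[i] + 1",
                         [lp.GlobalArg("a", np.float32, shape="n"), "..."],
                         target=lp.CTarget())
    knl = lp.split_iname(knl, "i", 16)
    knl = lp.prioritize_loops(knl, "i_outer,i_inner")

    code = lp.generate_code_v2(knl).device_code()
    assert "if" in code  # remainder guard survives without lp.assume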
def tune_geom_kernel(knl, shape=None, ng=None):
    """Parameters specific to kernels where one array is 2D and one is 3D"""
    # These kernels get run on lots of grid sizes
    if shape is not None:
        pass
        # Deal with various vectors/tensors
        #knl = lp.fix_parameters(knl, n1=int(shape[0]), n2=int(shape[1]), n3=int(shape[2]))

    knl = _lp.split_iname(knl, "k", lsize[0], outer_tag="for", inner_tag="unr")
    # Worthwhile to cache so explicitly?
    #knl = lp.split_iname(knl, "k", lsize[0], outer_tag=otags[0], inner_tag=itags[0])
    knl = _lp.split_iname(knl, "j", lsize[1], outer_tag=otags[0], inner_tag=itags[0])
    knl = _lp.split_iname(knl, "i", lsize[2], outer_tag=otags[1], inner_tag=itags[1])

    # TODO caching geom values is especially important. Try things here...

    if shape is not None:
        if len(shape) > 4:
            knl = _lp.prioritize_loops(
                knl, "mu,nu,i_outer,i_inner,j_outer,j_inner,k_outer,k_inner")
        elif len(shape) > 3:
            knl = _lp.prioritize_loops(
                knl, "mu,i_outer,i_inner,j_outer,j_inner,k_outer,k_inner")
        else:
            knl = _lp.prioritize_loops(
                knl, "i_outer,i_inner,j_outer,j_inner,k_outer,k_inner")

    # Currently these functions get used on frontend, too
    #knl = lp.set_options(knl, no_numpy=True)
    return knl
def test_fuse_kernels(ctx_factory):
    fortran_template = """
        subroutine {name}(nelements, ndofs, result, d, q)
          implicit none
          integer e, i, j, k
          integer nelements, ndofs
          real*8 result(nelements, ndofs, ndofs)
          real*8 q(nelements, ndofs, ndofs)
          real*8 d(ndofs, ndofs)
          real*8 prev

          do e = 1,nelements
            do i = 1,ndofs
              do j = 1,ndofs
                do k = 1,ndofs
                  {inner}
                end do
              end do
            end do
          end do
        end subroutine
        """

    xd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,i,k)
        """
    yd_line = """
        prev = result(e,i,j)
        result(e,i,j) = prev + d(i,k)*q(e,k,j)
        """

    xderiv, = lp.parse_fortran(
        fortran_template.format(inner=xd_line, name="xderiv"))
    yderiv, = lp.parse_fortran(
        fortran_template.format(inner=yd_line, name="yderiv"))
    xyderiv, = lp.parse_fortran(
        fortran_template.format(inner=(xd_line + "\n" + yd_line),
                                name="xyderiv"))

    knl = lp.fuse_kernels((xderiv, yderiv), data_flow=[("result", 0, 1)])
    knl = lp.prioritize_loops(knl, "e,i,j,k")

    assert len(knl.temporary_variables) == 2

    ctx = ctx_factory()
    lp.auto_test_vs_ref(xyderiv, ctx, knl,
                        parameters=dict(nelements=20, ndofs=4))
def test_numba_cuda_target():
    knl = lp.make_kernel("{[i,j,k]: 0<=i,j<M and 0<=k<N}",
                         "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
                         target=lp.NumbaCudaTarget())

    knl = lp.assume(knl, "M>0")
    knl = lp.split_iname(knl, "i", 16, outer_tag='g.0')
    knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1))
    knl = lp.add_prefetch(knl, "X[i,:]")
    knl = lp.fix_parameters(knl, N=3)
    knl = lp.prioritize_loops(knl, "i_inner,j_outer")
    knl = lp.tag_inames(knl, "k:unr")
    knl = lp.tag_array_axes(knl, "X", "N0,N1")

    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})

    print(lp.generate_code_v2(knl).all_code())
def test_prefetch_through_indirect_access():
    knl = lp.make_kernel(
        "{[i, j, k]: 0 <= i,k < 10 and 0<=j<2}",
        """
        for i, j, k
            a[map1[indirect[i], j], k] = 2
        end
        """,
        [
            lp.GlobalArg("a", strides=(2, 1), dtype=int),
            lp.GlobalArg("map1", shape=(10, 10), dtype=int),
            "..."
        ],
        target=lp.CTarget())

    knl = lp.prioritize_loops(knl, "i,j,k")

    with pytest.raises(LoopyError):
        knl = lp.add_prefetch(knl, "map1[:, j]")
def test_numba_cuda_target():
    knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
        "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
        target=lp.NumbaCudaTarget())

    knl = lp.assume(knl, "M>0")
    knl = lp.split_iname(knl, "i", 16, outer_tag='g.0')
    knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1))
    knl = lp.add_prefetch(knl, "X[i,:]", default_tag="l.auto")
    knl = lp.fix_parameters(knl, N=3)
    knl = lp.prioritize_loops(knl, "i_inner,j_outer")
    knl = lp.tag_inames(knl, "k:unr")
    knl = lp.tag_array_axes(knl, "X", "N0,N1")

    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})

    print(lp.generate_code_v2(knl).all_code())
def get_optimized_kernel(self):
    # FIXME specialize/tune for GPU/CPU
    loopy_knl = self.get_kernel()

    import pyopencl as cl
    dev = self.context.devices[0]
    if dev.type & cl.device_type.CPU:
        loopy_knl = lp.split_iname(loopy_knl, "itgt", 16,
                outer_tag="g.0", inner_tag="l.0")
        loopy_knl = lp.split_iname(loopy_knl, "isrc", 256)
        loopy_knl = lp.prioritize_loops(loopy_knl,
                ["isrc_outer", "itgt_inner"])
    else:
        from warnings import warn
        warn("don't know how to tune layer potential computation for '%s'"
                % dev)
        loopy_knl = lp.split_iname(loopy_knl, "itgt", 128, outer_tag="g.0")

    return loopy_knl
def test_assume(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "a[i] = a[i] + 1",
        [lp.GlobalArg("a", np.float32, shape="n"), "..."])

    knl = lp.split_iname(knl, "i", 16)
    knl = lp.prioritize_loops(knl, "i_outer,i_inner")
    knl = lp.assume(knl, "n mod 16 = 0")
    knl = lp.assume(knl, "n > 10")
    knl = lp.preprocess_kernel(knl, ctx.devices[0])
    kernel_gen = lp.generate_loop_schedules(knl)

    for gen_knl in kernel_gen:
        print(gen_knl)
        compiled = lp.CompiledKernel(ctx, gen_knl)
        print(compiled.get_code())
        assert "if" not in compiled.get_code()
def test_generate_c_snippet():
    from pymbolic import var

    I = var("I")  # noqa
    f = var("f")
    df = var("df")
    q_v = var("q_v")
    eN = var("eN")  # noqa
    k = var("k")
    u = var("u")

    from functools import partial
    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    Instr = lp.Assignment  # noqa

    knl = lp.make_kernel(
        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        [
            Instr(f[I], l_sum(k, q_v[k, I]*u)),
            Instr(df[I], l_sum(k, q_v[k, I])),
        ],
        [
            lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
            lp.GlobalArg("f,df", np.float64, shape="nSpace"),
            lp.ValueArg("u", np.float64),
            "...",
        ],
        target=CTarget(),
        assumptions="nQuad>=1")

    if 0:  # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.prioritize_loops(knl, "I,k_outer,k_inner")

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    print(lp.generate_body(knl))
def variant_2(knl):
    knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for')
    knl = lp.prioritize_loops(knl, "c,i,j")
    return knl
def variant_1(knl):
    knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
    knl = lp.prioritize_loops(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
    return knl
def generate(builder, wrapper_name=None):
    if builder.layer_index is not None:
        outer_inames = frozenset([builder._loop_index.name,
                                  builder.layer_index.name])
    else:
        outer_inames = frozenset([builder._loop_index.name])

    instructions = list(builder.emit_instructions())

    parameters = Bag()
    parameters.domains = OrderedDict()
    parameters.assumptions = OrderedDict()
    parameters.wrapper_arguments = builder.wrapper_args
    parameters.layer_start = builder.layer_extents[0].name
    parameters.layer_end = builder.layer_extents[1].name
    parameters.conditions = []
    parameters.kernel_data = list(None for _ in parameters.wrapper_arguments)
    parameters.temporaries = OrderedDict()
    parameters.kernel_name = builder.kernel.name

    # replace Materialise
    mapper = Memoizer(replace_materialise)
    mapper.initialisers = []
    instructions = list(mapper(i) for i in instructions)

    # merge indices
    merger = index_merger(instructions)
    instructions = list(merger(i) for i in instructions)
    initialiser = list(itertools.chain(*mapper.initialisers))
    merger = index_merger(initialiser)
    initialiser = list(merger(i) for i in initialiser)
    instructions = instructions + initialiser
    mapper.initialisers = [tuple(merger(i) for i in inits)
                           for inits in mapper.initialisers]

    # rename indices and nodes (so that the counters start from zero)
    pattern = re.compile(r"^([a-zA-Z_]+)([0-9]+)(_offset)?$")
    replacements = {}
    counter = defaultdict(itertools.count)
    for node in traversal(instructions):
        if isinstance(node, (Index, RuntimeIndex, Variable, Argument,
                             NamedLiteral)):
            match = pattern.match(node.name)
            if match is None:
                continue
            prefix, _, postfix = match.groups()
            if postfix is None:
                postfix = ""
            replacements[node] = "%s%d%s" % (
                prefix, next(counter[(prefix, postfix)]), postfix)

    instructions = rename_nodes(instructions, replacements)
    mapper.initialisers = [rename_nodes(inits, replacements)
                           for inits in mapper.initialisers]
    parameters.wrapper_arguments = rename_nodes(parameters.wrapper_arguments,
                                                replacements)
    s, e = rename_nodes([mapper(e) for e in builder.layer_extents],
                        replacements)
    parameters.layer_start = s.name
    parameters.layer_end = e.name

    # scheduling and loop nesting
    deps = instruction_dependencies(instructions, mapper.initialisers)
    within_inames = loop_nesting(instructions, deps, outer_inames,
                                 parameters.kernel_name)

    # generate loopy
    context = Bag()
    context.parameters = parameters
    context.within_inames = within_inames
    context.conditions = []
    context.index_ordering = []
    context.instruction_dependencies = deps

    statements = list(statement(insn, context) for insn in instructions)
    # remove the dummy instructions (they were only used to ensure
    # that the kernel knows about the outer inames).
    statements = list(s for s in statements
                      if not isinstance(s, DummyInstruction))

    domains = list(parameters.domains.values())
    if builder.single_cell:
        new_domains = []
        for d in domains:
            if d.get_dim_name(isl.dim_type.set, 0) == builder._loop_index.name:
                # n = start
                new_domains.append(d.add_constraint(
                    isl.Constraint.eq_from_names(d.space,
                                                 {"n": 1, "start": -1})))
            else:
                new_domains.append(d)
        domains = new_domains

    if builder.extruded:
        new_domains = []
        for d in domains:
            if d.get_dim_name(isl.dim_type.set, 0) == builder.layer_index.name:
                # layer = t1 - 1
                t1 = parameters.layer_end
                new_domains.append(d.add_constraint(
                    isl.Constraint.eq_from_names(d.space,
                                                 {"layer": 1, t1: -1, 1: 1})))
            else:
                new_domains.append(d)
        domains = new_domains

    assumptions, = reduce(
        operator.and_,
        parameters.assumptions.values()).params().get_basic_sets()
    options = loopy.Options(check_dep_resolution=True,
                            ignore_boostable_into=True)

    # sometimes masks are not used, but we still need to create the
    # function arguments
    for i, arg in enumerate(parameters.wrapper_arguments):
        if parameters.kernel_data[i] is None:
            arg = loopy.GlobalArg(arg.name, dtype=arg.dtype, shape=arg.shape)
            parameters.kernel_data[i] = arg

    if wrapper_name is None:
        wrapper_name = "wrap_%s" % builder.kernel.name

    pwaffd = isl.affs_from_space(assumptions.get_space())
    assumptions = assumptions & pwaffd["start"].ge_set(pwaffd[0])
    if builder.single_cell:
        assumptions = assumptions & pwaffd["start"].lt_set(pwaffd["end"])
    else:
        assumptions = assumptions & pwaffd["start"].le_set(pwaffd["end"])
    if builder.extruded:
        assumptions = assumptions & pwaffd[parameters.layer_start].le_set(
            pwaffd[parameters.layer_end])
    assumptions = reduce(operator.and_, assumptions.get_basic_sets())

    wrapper = loopy.make_kernel(domains,
                                statements,
                                kernel_data=parameters.kernel_data,
                                target=loopy.CTarget(),
                                temporary_variables=parameters.temporaries,
                                symbol_manglers=[symbol_mangler],
                                options=options,
                                assumptions=assumptions,
                                lang_version=(2018, 2),
                                name=wrapper_name)

    # prioritize loops
    for indices in context.index_ordering:
        wrapper = loopy.prioritize_loops(wrapper, indices)

    # register kernel
    kernel = builder.kernel
    headers = set(kernel._headers)
    headers = headers | set(["#include <math.h>", "#include <complex.h>",
                             "#include <petsc.h>"])
    preamble = "\n".join(sorted(headers))

    from coffee.base import Node

    if isinstance(kernel._code, loopy.LoopKernel):
        knl = kernel._code
        wrapper = loopy.register_callable_kernel(wrapper, knl)
        from loopy.transform.callable import (
            _match_caller_callee_argument_dimension_)
        wrapper = _match_caller_callee_argument_dimension_(wrapper, knl.name)
        wrapper = loopy.inline_callable_kernel(wrapper, knl.name)
    else:
        # kernel is a string, add it to preamble
        if isinstance(kernel._code, Node):
            code = kernel._code.gencode()
        else:
            code = kernel._code
        wrapper = loopy.register_function_id_to_in_knl_callable_mapper(
            wrapper,
            PyOP2KernelLookup(kernel.name, code,
                              tuple(builder.argument_accesses)))
        preamble = preamble + "\n" + code

    wrapper = loopy.register_preamble_generators(wrapper,
                                                 [_PreambleGen(preamble)])

    # register petsc functions
    wrapper = loopy.register_function_id_to_in_knl_callable_mapper(
        wrapper, petsc_function_lookup)

    return wrapper
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
    pytest.importorskip("fparser")
    ctx = ctx_factory()

    filename = os.path.join(os.path.dirname(__file__),
                            "strongVolumeKernels.f90")
    with open(filename) as sourcef:
        source = sourcef.read()

    source = source.replace("datafloat", "real*4")

    program = lp.parse_fortran(source, filename, seq_dependencies=False)

    hsv_r, hsv_s = program["strongVolumeKernelR"], \
        program["strongVolumeKernelS"]

    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")

    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s
    hsv = lp.add_nosync(hsv, "any", "writes:rhsQ", "writes:rhsQ", force=True)

    from gnuma_loopy_transforms import (
        fix_euler_parameters, set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.prioritize_loops(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    from loopy.frontend.fortran.translator import specialize_fortran_division
    hsv = specialize_fortran_division(hsv)

    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]", fetch_outer_inames="e",
                          default_tag="l.auto")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)),
                ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)),
                ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            reader, = lp.find_instructions(
                hsv,
                "tag:{knl_tag} and reads:{flux_var}".format(
                    knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv, flux_var + "_subst",
                                flux_inames + ilp_inames,
                                temporary_name=flux_store_name,
                                precompute_inames=(flux_precomp_inames
                                                   + flux_ilp_inames),
                                default_tag=None)
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname,
                                  within="id:" + reader.id, existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(
            hsv,
            lp.find_one_rule_matching(hsv, prep_var_name + "_*subst*"),
            default_tag="l.auto")

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames,
                          default_tag="l.auto")

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv,
                                dict(Q_dim_field_inner="unr",
                                     Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
                          fetch_bounding_box=True, default_tag="for",
                          init_expression="0",
                          store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv,
                                dict(rhsQ_init_field_inner="unr",
                                     rhsQ_store_field_inner="unr",
                                     rhsQ_init_field_outer="unr",
                                     rhsQ_store_field_outer="unr",
                                     Q_dim_field_inner="unr",
                                     Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv,
                        {"Q_dim_k": "unr", "rhsQ_init_k": "unr",
                         "rhsQ_store_k": "unr"},
                        ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv,
                                dict(rhsQ_init_field_inner="unr",
                                     rhsQ_store_field_inner="unr",
                                     rhsQ_init_field_outer="unr",
                                     rhsQ_store_field_outer="unr",
                                     Q_dim_field_inner="unr",
                                     Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(hsv,
                        dict(rhsQ_init_field_inner="vec",
                             rhsQ_store_field_inner="vec",
                             rhsQ_init_field_outer="unr",
                             rhsQ_store_field_outer="unr",
                             Q_dim_field_inner="vec",
                             Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(
        hsv, "rhsQ_buf", vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    hsv = tap_hsv

    hsv = lp.set_options(hsv,
                         cl_build_options=[
                             "-cl-denorms-are-zero",
                             "-cl-fast-relaxed-math",
                             "-cl-finite-math-only",
                             "-cl-mad-enable",
                             "-cl-no-signed-zeros"])

    if 1:
        print("OPS")
        op_map = lp.get_op_map(hsv, subgroup_size=32)
        print(lp.stringify_stats_mapping(op_map))

        print("MEM")
        gmem_map = lp.get_mem_access_map(hsv, subgroup_size=32).to_bytes()
        print(lp.stringify_stats_mapping(gmem_map))

    # FIXME: renaming's a bit tricky in this program model.
    # add a simple transformation for it
    # hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv,
                                  parameters=dict(elements=300),
                                  quiet=True)

    elapsed = results["elapsed_wall"]
    print("elapsed", elapsed)
def variant_1(knl):
    knl = lp.add_prefetch(knl, "a", default_tag="l.auto")
    knl = lp.add_prefetch(knl, "b", default_tag="l.auto")
    knl = lp.prioritize_loops(knl, ["i", "j"])
    knl = lp.add_inames_to_insn(knl, "i", "writes:b_fetch")
    return knl
def set_up_volume_loop(kernel, Nq):  # noqa
    kernel = lp.fix_parameters(kernel, Nq=Nq)
    kernel = lp.prioritize_loops(kernel, "e,k,j,i")
    kernel = lp.tag_inames(kernel, dict(e="g.0", j="l.1", i="l.0"))
    kernel = lp.assume(kernel, "elements >= 1")
    return kernel
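# Minimal usage sketch for set_up_volume_loop above (the toy kernel and its
# iname/parameter names are illustrative assumptions chosen to match what the
# helper expects: inames e, k, j, i and a parameter "elements").
def _set_up_volume_loop_demo():
    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
        "{[e,k,j,i]: 0<=e<elements and 0<=k,j,i<Nq}",
        "out[e,k,j,i] = 2*a[e,k,j,i]")
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    knl = set_up_volume_loop(knl, Nq=7)
    print(knl)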
def get_kernel(self):
    ncoeffs = len(self.expansion)

    loopy_insns, result_names = self.get_loopy_insns_and_result_names()

    loopy_knl = lp.make_kernel(
        [
            "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
            "{[itgt]: itgt_start<=itgt<itgt_end}",
            "{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_end }",
            "{[idim]: 0<=idim<dim}",
        ],
        self.get_kernel_scaling_assignment()
        + ["""
            for itgt_box
                <> tgt_ibox = target_boxes[itgt_box]
                <> itgt_start = box_target_starts[tgt_ibox]
                <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox]

                for itgt
                    <> tgt[idim] = targets[idim,itgt]

                    <> isrc_box_start = source_box_starts[itgt_box]
                    <> isrc_box_end = source_box_starts[itgt_box+1]

                    for isrc_box
                        <> src_ibox = source_box_lists[isrc_box]
            """]
        + ["""
                        <> coeff{coeffidx} = \
                            src_expansions[src_ibox - src_base_ibox, {coeffidx}]
            """.format(coeffidx=i) for i in range(ncoeffs)]
        + ["""
                        <> center[idim] = centers[idim, src_ibox] {dup=idim}
                        <> b[idim] = tgt[idim] - center[idim] {dup=idim}
            """]
        + loopy_insns
        + ["""
                    end
            """]
        + ["""
                    result[{resultidx}, itgt] = result[{resultidx}, itgt] + \
                        kernel_scaling * simul_reduce(sum, isrc_box,
                            result_{resultidx}_p) {{id_prefix=write_result}}
            """.format(resultidx=i) for i in range(len(result_names))]
        + ["""
                end
            end
            """],
        [
            lp.GlobalArg("targets", None, shape=(self.dim, "ntargets"),
                dim_tags="sep,C"),
            lp.GlobalArg("box_target_starts,box_target_counts_nonchild",
                None, shape=None),
            lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"),
            lp.GlobalArg("src_expansions", None,
                shape=("nsrc_level_boxes", ncoeffs), offset=lp.auto),
            lp.ValueArg("src_base_ibox", np.int32),
            lp.ValueArg("nsrc_level_boxes,aligned_nboxes", np.int32),
            lp.ValueArg("ntargets", np.int32),
            lp.GlobalArg("result", None, shape="nresults,ntargets",
                dim_tags="sep,C"),
            lp.GlobalArg("source_box_starts, source_box_lists,", None,
                shape=None, offset=lp.auto),
            "..."
        ] + [arg.loopy_arg for arg in self.expansion.get_args()],
        name=self.name,
        assumptions="ntgt_boxes>=1",
        silenced_warnings="write_race(write_result*)",
        default_offset=lp.auto,
        fixed_parameters=dict(dim=self.dim, nresults=len(result_names)))

    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
    loopy_knl = lp.prioritize_loops(loopy_knl, "itgt_box,itgt,isrc_box")
    loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl)

    return loopy_knl
osc = model.HMJE()
osc.dt = 0.1
osc_knl = osc.kernel(target)
osc_knl = batch_knl(osc_knl)
osc_fn = compiler.CompiledKernel(osc_knl, comp())

cfun = coupling.Diff(osc)
net = network.Network(osc, cfun)
net_knl = net.kernel(target)
net_knl = batch_knl(net_knl)
net_fn = compiler.CompiledKernel(net_knl, comp())

scm = scheme.EulerStep(osc.dt)
scm_knl = scm.kernel(target)
scm_knl = batch_knl(scm_knl)
scm_knl = lp.prioritize_loops(scm_knl, ['i_subj', 'i', 'j'])
scm_knl = lp.fix_parameters(scm_knl, nsvar=len(osc.state_sym))
scm_knl = lp.tag_inames(scm_knl, [('j', 'ilp')])
scm_fn = compiler.CompiledKernel(scm_knl, comp())

# build other data arrays
next, state, drift = np.zeros((3, nsubj, nnode, osc.state_sym.size), np.float32)
input, param, diffs = np.zeros((3, nsubj, nnode, 6), np.float32)
obsrv = np.zeros((Dmax + 3, nsubj, nnode, 2), np.float32)
LOG.info('obsrv %r %.3f MB', obsrv.shape, obsrv.nbytes / 2**20)

# step function
def step(n_step=1):
    for _ in range(n_step):
def generate(builder, wrapper_name=None):
    if builder.layer_index is not None:
        outer_inames = frozenset([builder._loop_index.name,
                                  builder.layer_index.name])
    else:
        outer_inames = frozenset([builder._loop_index.name])

    instructions = list(builder.emit_instructions())

    parameters = Bag()
    parameters.domains = OrderedDict()
    parameters.assumptions = OrderedDict()
    parameters.wrapper_arguments = builder.wrapper_args
    parameters.layer_start = builder.layer_extents[0].name
    parameters.layer_end = builder.layer_extents[1].name
    parameters.conditions = []
    parameters.kernel_data = list(None for _ in parameters.wrapper_arguments)
    parameters.temporaries = OrderedDict()
    parameters.kernel_name = builder.kernel.name

    # replace Materialise
    mapper = Memoizer(replace_materialise)
    mapper.initialisers = []
    instructions = list(mapper(i) for i in instructions)

    # merge indices
    merger = index_merger(instructions)
    instructions = list(merger(i) for i in instructions)
    initialiser = list(itertools.chain(*mapper.initialisers))
    merger = index_merger(initialiser)
    initialiser = list(merger(i) for i in initialiser)
    instructions = instructions + initialiser
    mapper.initialisers = [tuple(merger(i) for i in inits)
                           for inits in mapper.initialisers]

    # rename indices and nodes (so that the counters start from zero)
    pattern = re.compile(r"^([a-zA-Z_]+)([0-9]+)(_offset)?$")
    replacements = {}
    counter = defaultdict(itertools.count)
    for node in traversal(instructions):
        if isinstance(node, (Index, RuntimeIndex, Variable, Argument,
                             NamedLiteral)):
            match = pattern.match(node.name)
            if match is None:
                continue
            prefix, _, postfix = match.groups()
            if postfix is None:
                postfix = ""
            replacements[node] = "%s%d%s" % (
                prefix, next(counter[(prefix, postfix)]), postfix)

    instructions = rename_nodes(instructions, replacements)
    mapper.initialisers = [rename_nodes(inits, replacements)
                           for inits in mapper.initialisers]
    parameters.wrapper_arguments = rename_nodes(parameters.wrapper_arguments,
                                                replacements)
    s, e = rename_nodes([mapper(e) for e in builder.layer_extents],
                        replacements)
    parameters.layer_start = s.name
    parameters.layer_end = e.name

    # scheduling and loop nesting
    deps = instruction_dependencies(instructions, mapper.initialisers)
    within_inames = loop_nesting(instructions, deps, outer_inames,
                                 parameters.kernel_name)

    # generate loopy
    context = Bag()
    context.parameters = parameters
    context.within_inames = within_inames
    context.conditions = []
    context.index_ordering = []
    context.instruction_dependencies = deps

    statements = list(statement(insn, context) for insn in instructions)
    # remove the dummy instructions (they were only used to ensure
    # that the kernel knows about the outer inames).
    statements = list(s for s in statements
                      if not isinstance(s, DummyInstruction))

    domains = list(parameters.domains.values())
    if builder.single_cell:
        new_domains = []
        for d in domains:
            if d.get_dim_name(isl.dim_type.set, 0) == builder._loop_index.name:
                # n = start
                new_domains.append(d.add_constraint(
                    isl.Constraint.eq_from_names(d.space,
                                                 {"n": 1, "start": -1})))
            else:
                new_domains.append(d)
        domains = new_domains

    if builder.extruded:
        new_domains = []
        for d in domains:
            if d.get_dim_name(isl.dim_type.set, 0) == builder.layer_index.name:
                # layer = t1 - 1
                t1 = parameters.layer_end
                new_domains.append(d.add_constraint(
                    isl.Constraint.eq_from_names(d.space,
                                                 {"layer": 1, t1: -1, 1: 1})))
            else:
                new_domains.append(d)
        domains = new_domains

    assumptions, = reduce(
        operator.and_,
        parameters.assumptions.values()).params().get_basic_sets()
    options = loopy.Options(check_dep_resolution=True,
                            ignore_boostable_into=True)

    # sometimes masks are not used, but we still need to create the
    # function arguments
    for i, arg in enumerate(parameters.wrapper_arguments):
        if parameters.kernel_data[i] is None:
            arg = loopy.GlobalArg(arg.name, dtype=arg.dtype, shape=arg.shape)
            parameters.kernel_data[i] = arg

    if wrapper_name is None:
        wrapper_name = "wrap_%s" % builder.kernel.name

    pwaffd = isl.affs_from_space(assumptions.get_space())
    assumptions = assumptions & pwaffd["start"].ge_set(pwaffd[0])
    if builder.single_cell:
        assumptions = assumptions & pwaffd["start"].lt_set(pwaffd["end"])
    else:
        assumptions = assumptions & pwaffd["start"].le_set(pwaffd["end"])
    if builder.extruded:
        assumptions = assumptions & pwaffd[parameters.layer_start].le_set(
            pwaffd[parameters.layer_end])
    assumptions = reduce(operator.and_, assumptions.get_basic_sets())

    wrapper = loopy.make_kernel(domains,
                                statements,
                                kernel_data=parameters.kernel_data,
                                target=loopy.CTarget(),
                                temporary_variables=parameters.temporaries,
                                symbol_manglers=[symbol_mangler],
                                options=options,
                                assumptions=assumptions,
                                lang_version=(2018, 2),
                                name=wrapper_name)

    # prioritize loops
    for indices in context.index_ordering:
        wrapper = loopy.prioritize_loops(wrapper, indices)

    # register kernel
    kernel = builder.kernel
    headers = set(kernel._headers)
    headers = headers | set(["#include <math.h>"])
    preamble = "\n".join(sorted(headers))

    from coffee.base import Node

    if isinstance(kernel._code, loopy.LoopKernel):
        knl = kernel._code
        wrapper = loopy.register_callable_kernel(wrapper, knl)
        from loopy.transform.callable import (
            _match_caller_callee_argument_dimension_)
        wrapper = _match_caller_callee_argument_dimension_(wrapper, knl.name)
        wrapper = loopy.inline_callable_kernel(wrapper, knl.name)
    else:
        # kernel is a string, add it to preamble
        if isinstance(kernel._code, Node):
            code = kernel._code.gencode()
        else:
            code = kernel._code
        wrapper = loopy.register_function_id_to_in_knl_callable_mapper(
            wrapper,
            PyOP2KernelLookup(kernel.name, code,
                              tuple(builder.argument_accesses)))
        preamble = preamble + "\n" + code

    wrapper = loopy.register_preamble_generators(wrapper,
                                                 [_PreambleGen(preamble)])

    # register petsc functions
    wrapper = loopy.register_function_id_to_in_knl_callable_mapper(
        wrapper, petsc_function_lookup)

    return wrapper
def get_kernel(self):
    ncoeffs = len(self.expansion)

    loopy_insns, result_names = self.get_loopy_insns_and_result_names()

    loopy_knl = lp.make_kernel(
        [
            "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
            "{[itgt]: itgt_start<=itgt<itgt_end}",
            "{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_end }",
            "{[idim]: 0<=idim<dim}",
        ],
        self.get_kernel_scaling_assignment()
        + ["""
            for itgt_box
                <> tgt_ibox = target_boxes[itgt_box]
                <> itgt_start = box_target_starts[tgt_ibox]
                <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox]

                for itgt
                    <> tgt[idim] = targets[idim,itgt]

                    <> isrc_box_start = source_box_starts[itgt_box]
                    <> isrc_box_end = source_box_starts[itgt_box+1]

                    for isrc_box
                        <> src_ibox = source_box_lists[isrc_box]
            """]
        + ["""
                        <> coeff{coeffidx} = \
                            src_expansions[src_ibox - src_base_ibox, {coeffidx}]
            """.format(coeffidx=i) for i in range(ncoeffs)]
        + ["""
                        <> center[idim] = centers[idim, src_ibox] {dup=idim}
                        <> b[idim] = tgt[idim] - center[idim] {dup=idim}
            """]
        + loopy_insns
        + ["""
                    end
            """]
        + ["""
                    result[{resultidx}, itgt] = result[{resultidx}, itgt] + \
                        kernel_scaling * simul_reduce(sum, isrc_box,
                            result_{resultidx}_p) {{id_prefix=write_result}}
            """.format(resultidx=i) for i in range(len(result_names))]
        + ["""
                end
            end
            """],
        [
            lp.GlobalArg("targets", None, shape=(self.dim, "ntargets"),
                dim_tags="sep,C"),
            lp.GlobalArg("box_target_starts,box_target_counts_nonchild",
                None, shape=None),
            lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"),
            lp.GlobalArg("src_expansions", None,
                shape=("nsrc_level_boxes", ncoeffs), offset=lp.auto),
            lp.ValueArg("src_base_ibox", np.int32),
            lp.ValueArg("nsrc_level_boxes,aligned_nboxes", np.int32),
            lp.ValueArg("ntargets", np.int32),
            lp.GlobalArg("result", None, shape="nresults,ntargets",
                dim_tags="sep,C"),
            lp.GlobalArg("source_box_starts, source_box_lists,", None,
                shape=None, offset=lp.auto),
            "..."
        ] + [arg.loopy_arg for arg in self.expansion.get_args()],
        name=self.name,
        assumptions="ntgt_boxes>=1",
        silenced_warnings="write_race(write_result*)",
        default_offset=lp.auto,
        fixed_parameters=dict(dim=self.dim, nresults=len(result_names)),
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
    loopy_knl = lp.prioritize_loops(loopy_knl, "itgt_box,itgt,isrc_box")
    loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl)

    return loopy_knl
def generate(impero_c, args, scalar_type, kernel_name="loopy_kernel",
             index_names=[], return_increments=True):
    """Generates loopy code.

    :arg impero_c: ImperoC tuple with Impero AST and other data
    :arg args: list of loopy.GlobalArgs
    :arg scalar_type: type of scalars as C typename string
    :arg kernel_name: function name of the kernel
    :arg index_names: pre-assigned index names
    :arg return_increments: Does codegen for Return nodes increment the
        lvalue, or assign?
    :returns: loopy kernel
    """
    ctx = LoopyContext()
    ctx.indices = impero_c.indices
    ctx.index_names = defaultdict(lambda: "i", index_names)
    ctx.epsilon = numpy.finfo(scalar_type).resolution
    ctx.scalar_type = scalar_type
    ctx.return_increments = return_increments

    # Create arguments
    data = list(args)
    for i, (temp, dtype) in enumerate(
            assign_dtypes(impero_c.temporaries, scalar_type)):
        name = "t%d" % i
        if isinstance(temp, gem.Constant):
            data.append(lp.TemporaryVariable(
                name, shape=temp.shape, dtype=dtype,
                initializer=temp.array,
                address_space=lp.AddressSpace.LOCAL, read_only=True))
        else:
            shape = tuple([i.extent for i in ctx.indices[temp]]) + temp.shape
            data.append(lp.TemporaryVariable(
                name, shape=shape, dtype=dtype, initializer=None,
                address_space=lp.AddressSpace.LOCAL, read_only=False))
        ctx.gem_to_pymbolic[temp] = p.Variable(name)

    # Create instructions
    instructions = statement(impero_c.tree, ctx)

    # Create domains
    domains = create_domains(ctx.index_extent.items())

    # Create loopy kernel
    knl = lp.make_function(domains, instructions, data, name=kernel_name,
                           target=lp.CTarget(), seq_dependencies=True,
                           silenced_warnings=["summing_if_branches_ops"],
                           lang_version=(2018, 2))

    # Prevent loop interchange by loopy
    knl = lp.prioritize_loops(knl, ",".join(ctx.index_extent.keys()))

    return knl