def test_nested_scan(ctx_factory, i_tag, j_tag):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        [
            "[n] -> {[i]: 0 <= i < n}",
            "[i] -> {[j]: 0 <= j <= i}",
            "[i] -> {[k]: 0 <= k <= i}"
        ],
        """
        <>tmp[i] = sum(k, 1)
        out[i] = sum(j, tmp[j])
        """)

    knl = lp.fix_parameters(knl, n=10)
    knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag))

    knl = lp.realize_reduction(knl, force_scan=True)

    print(knl)

    evt, (out,) = knl(queue)

    print(out)

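# A plain-numpy illustration (not part of the test) of what test_nested_scan
# computes for n = 10: sum(k, 1) over 0 <= k <= i is i + 1, so out is the
# inclusive prefix sum of tmp.
def _nested_scan_reference(n=10):
    tmp = np.arange(1, n + 1)   # tmp[i] = i + 1
    return np.cumsum(tmp)       # out[i] = sum of tmp[0..i]
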
def test_global_parallel_reduction_simpler(ctx_factory, size):
    ctx = ctx_factory()

    pytest.xfail("very sensitive to kernel ordering, fails unused hw-axis check")

    knl = lp.make_kernel(
        "{[l,g,j]: 0 <= l < nl and 0 <= g,j < ng}",
        """
        <> key = make_uint2(l+nl*g, 1234)  {inames=l:g}
        <> ctr = make_uint4(0, 1, 2, 3)  {inames=l:g,id=init_ctr}
        <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}

        <> tmp[g] = sum(l, vals.s0 + 1j*vals.s1 + vals.s2 + 1j*vals.s3)
        result = sum(j, tmp[j])
        """)

    ng = 50
    knl = lp.fix_parameters(knl, ng=ng)
    knl = lp.set_options(knl, write_cl=True)

    ref_knl = knl

    knl = lp.split_iname(knl, "l", 128, inner_tag="l.0")
    knl = lp.split_reduction_outward(knl, "l_inner")
    knl = lp.tag_inames(knl, "g:g.0,j:l.0")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nl": size})

def test_local_parallel_reduction(ctx_factory, size):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i, j]: 0 <= i < n and 0 <= j < 5}",
        """
        z[j] = sum(i, i+j)
        """)

    knl = lp.fix_parameters(knl, n=size)

    ref_knl = knl

    def variant0(knl):
        return lp.tag_inames(knl, "i:l.0")

    def variant1(knl):
        return lp.tag_inames(knl, "i:l.0,j:l.1")

    def variant2(knl):
        return lp.tag_inames(knl, "i:l.0,j:g.0")

    for variant in [variant0, variant1, variant2]:
        knl = variant(ref_knl)
        lp.auto_test_vs_ref(ref_knl, ctx, knl)

def test_assignment_to_subst_indices(ctx_factory):
    fortran_src = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a(n), out(n), out2(n), inp(n)
          integer n, i

          do i = 1, n
            a(i) = 6*inp(i)
          enddo

          do i = 1, n
            out(i) = 5*a(i)
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    knl = lp.fix_parameters(knl, n=5)

    ref_knl = knl

    assert "a" in knl.temporary_variables
    knl = lp.assignment_to_subst(knl, "a")
    assert "a" not in knl.temporary_variables

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl)

def test_precompute_with_preexisting_inames(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })

    knl = lp.fix_parameters(knl, n=13)

    ref_knl = knl

    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")
    knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
            precompute_inames="ii,jj")

    knl = lp.set_loop_priority(knl, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(E=200))

def test_vector_types(ctx_factory, vec_len):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{ [i,j]: 0<=i<n and 0<=j<vec_len }",
        "out[i,j] = 2*a[i,j]",
        [
            lp.GlobalArg("a", np.float32, shape=lp.auto),
            lp.GlobalArg("out", np.float32, shape=lp.auto),
            "..."
            ])

    knl = lp.fix_parameters(knl, vec_len=vec_len)

    ref_knl = knl

    knl = lp.tag_data_axes(knl, "out", "c,vec")
    knl = lp.tag_inames(knl, dict(j="unr"))

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters=dict(n=20000))

def element_centers_of_mass(discr):
    knl = lp.make_kernel(
        """{[dim,k,i]: 0<=dim<ndims and 0<=k<nelements and 0<=i<nunit_nodes}""",
        """
        panels[dim, k] = sum(i, nodes[dim, k, i])/nunit_nodes
        """,
        default_offset=lp.auto, name="find_panel_centers_of_mass",
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    knl = lp.fix_parameters(knl, ndims=discr.ambient_dim)

    knl = lp.split_iname(knl, "k", 128, inner_tag="l.0", outer_tag="g.0")
    knl = lp.tag_inames(knl, dict(dim="ilp"))

    with cl.CommandQueue(discr.cl_context) as queue:
        mesh = discr.mesh
        panels = cl.array.empty(queue, (mesh.ambient_dim, mesh.nelements),
                dtype=discr.real_dtype)

        for group_nr, group in enumerate(discr.groups):
            _, (result,) = knl(queue,
                    nelements=group.nelements,
                    nunit_nodes=group.nunit_nodes,
                    nodes=group.view(discr.nodes()),
                    panels=el_view(discr, group_nr, panels))

        panels.finish()
        panels = panels.with_queue(None)
        return tuple(panels[d, :] for d in range(mesh.ambient_dim))

def test_convolution(ctx_factory):
    ctx = ctx_factory()

    dtype = np.float32

    knl = lp.make_kernel(
        "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
                -f_w <= f_x,f_y <= f_w \
                and 0 <= im_x < im_w and 0 <= im_y < im_h \
                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
                }",
        """
        out[iimg, ifeat, im_x, im_y] = sum((f_x, f_y, icolor), \
                img[iimg, f_w+im_x-f_x, f_w+im_y-f_y, icolor] \
                * f[ifeat, f_w+f_x, f_w+f_y, icolor])
        """,
        [
            lp.GlobalArg("f", dtype, shape=lp.auto),
            lp.GlobalArg("img", dtype, shape=lp.auto),
            lp.GlobalArg("out", dtype, shape=lp.auto),
            "..."
            ],
        assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0",
        options="annotate_inames")

    f_w = 3
    knl = lp.fix_parameters(knl, f_w=f_w, ncolors=3)

    ref_knl = knl

    def variant_0(knl):
        #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
        knl = lp.prioritize_loops(knl, "iimg,im_x,im_y,ifeat,f_x,f_y")
        return knl

    def variant_1(knl):
        knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
        knl = lp.prioritize_loops(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
        return knl

    def variant_2(knl):
        knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0")
        knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1")
        knl = lp.tag_inames(knl, dict(ifeat="g.2"))
        knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]", default_tag="l.auto")
        knl = lp.add_prefetch(knl, "img",
                "im_x_inner, im_y_inner, f_x, f_y",
                default_tag="l.auto")
        return knl

    for variant in [
            #variant_0,
            #variant_1,
            variant_2
            ]:
        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
                parameters=dict(
                    im_w=128, im_h=128, f_w=f_w,
                    nfeats=3, nimgs=3))

def test_dependent_loop_bounds_4():
    # https://gitlab.tiker.net/inducer/loopy/issues/23
    import loopy as lp

    loopy_knl = lp.make_kernel(
        [
            "{[a]: 0<=a<10}",
            "{[b]: b_start<=b<b_end}",
            "{[c,idim]: c_start<=c<c_end and 0<=idim<dim}",
        ],
        """
        for a
            <> b_start = 1
            <> b_end = 2
            for b
                <> c_start = 1
                <> c_end = 2

                for c
                    ... nop
                end

                <>t[idim] = 1
            end
        end
        """,
        "...",
        seq_dependencies=True)

    loopy_knl = lp.fix_parameters(loopy_knl, dim=3)

    with lp.CacheMode(False):
        lp.generate_code_v2(loopy_knl)

def test_poisson_fem(ctx_factory):
    # Stolen from Peter Coogan and Rob Kirby for FEM assembly
    ctx = ctx_factory()

    nbf = 5
    nqp = 5
    sdim = 3

    knl = lp.make_kernel(
            "{ [c,i,j,k,ell,ell2,ell3]: \
            0 <= c < nels and \
            0 <= i < nbf and \
            0 <= j < nbf and \
            0 <= k < nqp and \
            0 <= ell,ell2 < sdim}",
            """
            dpsi(bf,k0,dir) := \
                    simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
            Ael[c,i,j] = \
                    J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
            """,
            assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0")

    print(knl)

    knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp)

    ref_knl = knl

    knl = lp.set_loop_priority(knl, ["c", "j", "i", "k"])

    def variant_1(knl):
        knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for')
        knl = lp.set_loop_priority(knl, "c,i,j")
        return knl

    def variant_2(knl):
        knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for')
        knl = lp.set_loop_priority(knl, "c,i,j")
        return knl

    def add_types(knl):
        return lp.add_and_infer_dtypes(knl, dict(
            w=np.float32,
            J=np.float32,
            DPsi=np.float32,
            DFinv=np.float32,
            ))

    for variant in [
            #variant_1,
            variant_2
            ]:
        knl = variant(knl)

        lp.auto_test_vs_ref(
                add_types(ref_knl), ctx, add_types(knl),
                parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7))

def test_convolution_with_nonzero_base(ctx_factory):
    # This is kept alive as a test for domains that don't start at zero.
    # These are a bad idea for split_iname, which places its origin at zero
    # and therefore produces a first block that is odd-sized.
    #
    # Therefore, for real tests, check test_convolution further up.

    ctx = ctx_factory()

    dtype = np.float32

    knl = lp.make_kernel(
        "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
                -f_w <= f_x,f_y <= f_w \
                and f_w <= im_x < im_w-f_w and f_w <= im_y < im_h-f_w \
                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
                }",
        """
        out[iimg, ifeat, im_x-f_w, im_y-f_w] = sum((f_x, f_y, icolor), \
                img[iimg, im_x-f_x, im_y-f_y, icolor] \
                * f[ifeat, f_w+f_x, f_w+f_y, icolor])
        """,
        [
            lp.GlobalArg("f", dtype, shape=lp.auto),
            lp.GlobalArg("img", dtype, shape=lp.auto),
            lp.GlobalArg("out", dtype, shape=lp.auto),
            "..."
            ],
        assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0",
        options="annotate_inames")

    knl = lp.fix_parameters(knl, ncolors=3)

    ref_knl = knl

    f_w = 3

    def variant_0(knl):
        #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
        knl = lp.set_loop_priority(knl, "iimg,im_x,im_y,ifeat,f_x,f_y")
        return knl

    def variant_1(knl):
        knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
        knl = lp.set_loop_priority(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
        return knl

    for variant in [
            variant_0,
            variant_1,
            ]:
        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
                parameters=dict(
                    im_w=128, im_h=128, f_w=f_w,
                    nfeats=12, nimgs=17))

def get_kernel(self):
    ncoeffs = len(self.expansion)

    from sumpy.tools import gather_loopy_source_arguments
    loopy_knl = lp.make_kernel(
            [
                "{[isrc_box]: 0<=isrc_box<nsrc_boxes}",
                "{[isrc,idim]: isrc_start<=isrc<isrc_end and 0<=idim<dim}",
            ],
            ["""
                for isrc_box
                    <> src_ibox = source_boxes[isrc_box]
                    <> isrc_start = box_source_starts[src_ibox]
                    <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]

                    <> center[idim] = centers[idim, src_ibox] {id=fetch_center}

                    for isrc
                        <> a[idim] = center[idim] - sources[idim, isrc] {dup=idim}

                        <> strength = strengths[isrc]
            """] + self.get_loopy_instructions() + ["""
                    end
            """] + ["""
                    tgt_expansions[src_ibox-tgt_base_ibox, {coeffidx}] = \
                            simul_reduce(sum, isrc, strength*coeff{coeffidx}) \
                            {{id_prefix=write_expn}}
            """.format(coeffidx=i) for i in range(ncoeffs)] + ["""
                end
            """],
            [
                lp.GlobalArg("sources", None,
                    shape=(self.dim, "nsources"), dim_tags="sep,c"),
                lp.GlobalArg("strengths", None, shape="nsources"),
                lp.GlobalArg("box_source_starts,box_source_counts_nonchild",
                    None, shape=None),
                lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"),
                lp.GlobalArg("tgt_expansions", None,
                    shape=("nboxes", ncoeffs), offset=lp.auto),
                lp.ValueArg("nboxes,aligned_nboxes,tgt_base_ibox", np.int32),
                lp.ValueArg("nsources", np.int32),
                "..."
            ] + gather_loopy_source_arguments([self.expansion]),
            name=self.name,
            assumptions="nsrc_boxes>=1",
            silenced_warnings="write_race(write_expn*)",
            default_offset=lp.auto)

    loopy_knl = lp.fix_parameters(loopy_knl, dim=self.dim)
    loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl)
    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")

    return loopy_knl

def test_make_copy_kernel(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    intermediate_format = "f,f,sep"

    a1 = np.random.randn(1024, 4, 3)

    cknl1 = lp.make_copy_kernel(intermediate_format)
    cknl1 = lp.fix_parameters(cknl1, n2=3)

    cknl1 = lp.set_options(cknl1, write_cl=True)
    evt, a2 = cknl1(queue, input=a1)

    cknl2 = lp.make_copy_kernel("c,c,c", intermediate_format)
    cknl2 = lp.fix_parameters(cknl2, n2=3)

    evt, a3 = cknl2(queue, input=a2)

    assert (a1 == a3).all()

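# The round trip above is lossless because "f,f,sep" only changes storage
# layout, not values. A rough numpy analogue of the same idea (illustration
# only; numpy has no "sep" axis, so plain order changes stand in for it):
def _layout_round_trip_reference(a1):
    a2 = np.asfortranarray(a1)       # re-lay-out
    a3 = np.ascontiguousarray(a2)    # and back
    return (a1 == a3).all()
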
def test_rob_stroud_bernstein(ctx_factory):
    ctx = ctx_factory()

    # NOTE: tmp would have to be zero-filled beforehand

    knl = lp.make_kernel(
        "{[el, i2, alpha1,alpha2]: \
                0 <= el < nels and \
                0 <= i2 < nqp1d and \
                0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }",
        """
        for el,i2
            <> xi = qpts[1, i2]
            <> s = 1-xi
            <> r = xi/s
            <> aind = 0 {id=aind_init}

            for alpha1
                <> w = s**(deg-alpha1) {id=init_w}

                for alpha2
                    tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \
                            {id=write_tmp}
                    w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                            {id=update_w,dep=init_w:write_tmp}
                    aind = aind + 1 \
                            {id=aind_incr,dep=aind_init:write_tmp:update_w}
                end
            end
        end
        """,
        [
            # Must declare coeffs to have "no" shape, to keep loopy
            # from trying to figure out the shape automatically.
            lp.GlobalArg("coeffs", None, shape=None),
            "..."
            ],
        assumptions="deg>=0 and nels>=1"
        )

    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)
    knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
    knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0",
            inner_tag="ilp", slabs=(0, 1))
    knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))

    print(lp.CompiledKernel(ctx, knl).get_highlighted_code(
            dict(
                qpts=np.float32,
                coeffs=np.float32,
                tmp=np.float32,
                )))

def __test(evalfunc, target, **targetargs):
    n = 10

    knl = lp.make_kernel("{[i]: 0 <= i < n}",
        """
            a[i] = b[i]
        """,
        [lp.GlobalArg("a", shape=(n,), dtype=np.int32),
         lp.GlobalArg("b", shape=(n,), dtype=np.int32)],
        target=target(**targetargs))

    knl = lp.fix_parameters(knl, n=n)
    return evalfunc(knl)

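# A hypothetical use of the __test helper above (not part of the original):
# render device code for the fixed-size copy kernel with the plain C target.
def _demo_c_codegen():
    from loopy.target.c import CTarget
    return __test(lambda k: lp.generate_code_v2(k).device_code(), CTarget)
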
def test_diff(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        """{ [i,j]: 0<=i,j<n }""",
        """
        <> a = 1/(1+sinh(x[i] + y[j])**2)
        z[i] = sum(j, exp(a * x[j]))
        """)

    knl = lp.fix_parameters(knl, n=50)

    from loopy.transform.diff import diff_kernel
    dknl, diff_map = diff_kernel(knl, "z", "x")
    dknl = lp.remove_unused_arguments(dknl)

    dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a")

    print(dknl)

    n = 50
    x = np.random.randn(n)
    y = np.random.randn(n)

    dx = np.random.randn(n)

    fac = 1e-1
    h1 = 1e-4
    h2 = h1 * fac

    evt, (z0,) = knl(queue, x=x, y=y)
    evt, (z1,) = knl(queue, x=(x + h1*dx), y=y)
    evt, (z2,) = knl(queue, x=(x + h2*dx), y=y)

    dknl = lp.set_options(dknl, write_cl=True)
    evt, (df,) = dknl(queue, x=x, y=y)

    diff1 = (z1-z0)
    diff2 = (z2-z0)

    diff1_predicted = df.dot(h1*dx)
    diff2_predicted = df.dot(h2*dx)

    err1 = la.norm(diff1 - diff1_predicted) / la.norm(diff1)
    err2 = la.norm(diff2 - diff2_predicted) / la.norm(diff2)
    print(err1, err2)

    assert (err2 < err1 * fac * 1.1).all()

def test_sequential_scan(ctx_factory, n, stride):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n] -> {[i,j]: 0<=i<n and 0<=j<=%d*i}" % stride,
        """
        a[i] = sum(j, j**2)
        """)

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.realize_reduction(knl, force_scan=True)

    evt, (a,) = knl(queue)

    assert (a.get() == np.cumsum(np.arange(stride*n)**2)[::stride]).all()

def test_numba_cuda_target():
    knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
        "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
        target=lp.NumbaCudaTarget())

    knl = lp.assume(knl, "M>0")
    knl = lp.split_iname(knl, "i", 16, outer_tag='g.0')
    knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1))
    knl = lp.add_prefetch(knl, "X[i,:]", default_tag="l.auto")
    knl = lp.fix_parameters(knl, N=3)
    knl = lp.prioritize_loops(knl, "i_inner,j_outer")
    knl = lp.tag_inames(knl, "k:unr")
    knl = lp.tag_array_axes(knl, "X", "N0,N1")

    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})

    print(lp.generate_code_v2(knl).all_code())

def copy_targets_kernel(self):
    knl = lp.make_kernel(
        """{[dim,i]: 0<=dim<ndims and 0<=i<npoints}""",
        """
        targets[dim, i] = points[dim, i]
        """,
        default_offset=lp.auto, name="copy_targets",
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    knl = lp.fix_parameters(knl, ndims=self.ambient_dim)
    knl = lp.split_iname(knl, "i", 128, inner_tag="l.0", outer_tag="g.0")
    knl = lp.tag_array_axes(knl, "points", "sep, C")
    knl = lp.tag_array_axes(knl, "targets", "stride:auto, stride:1")
    return lp.tag_inames(knl, dict(dim="ilp"))

def test_batched_sparse():
    fortran_src = """
        subroutine sparse(rowstarts, colindices, values, m, n, nvecs, nvals, x, y)
          implicit none

          integer rowstarts(m+1), colindices(nvals)
          real*8 values(nvals)
          real*8 x(n, nvecs), y(n, nvecs), rowsum(nvecs)

          integer m, n, rowstart, rowend, length, nvals, nvecs
          integer i, j, k

          do i = 1, m
            rowstart = rowstarts(i)
            rowend = rowstarts(i+1)
            length = rowend - rowstart

            do k = 1, nvecs
              rowsum(k) = 0
            enddo
            do k = 1, nvecs
              do j = 1, length
                rowsum(k) = rowsum(k) + &
                    x(colindices(rowstart+j-1),k)*values(rowstart+j-1)
              end do
            end do
            do k = 1, nvecs
              y(i,k) = rowsum(k)
            end do
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    knl = lp.split_iname(knl, "i", 128)
    knl = lp.tag_inames(knl, {"i_outer": "g.0"})
    knl = lp.tag_inames(knl, {"i_inner": "l.0"})
    knl = lp.add_prefetch(knl, "values", default_tag="l.auto")
    knl = lp.add_prefetch(knl, "colindices", default_tag="l.auto")
    knl = lp.fix_parameters(knl, nvecs=4)

def test_generate_c_snippet():
    from loopy.target.c import CTarget
    from pymbolic import var

    I = var("I")  # noqa
    f = var("f")
    df = var("df")
    q_v = var("q_v")
    eN = var("eN")  # noqa
    k = var("k")
    u = var("u")

    from functools import partial
    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    Instr = lp.Assignment  # noqa

    knl = lp.make_kernel(
        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        [
            Instr(f[I], l_sum(k, q_v[k, I]*u)),
            Instr(df[I], l_sum(k, q_v[k, I])),
            ],
        [
            lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
            lp.GlobalArg("f,df", np.float64, shape="nSpace"),
            lp.ValueArg("u", np.float64),
            "...",
            ],
        target=CTarget(),
        assumptions="nQuad>=1")

    if 0:  # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.set_loop_priority(knl, "I,k_outer,k_inner")

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    print(lp.generate_body(knl))

def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n] -> {[i,j]: 1<=i<n+1 and 0<=j<=i-1}",
        """
        out[i-1] = sum(j, a[j]**2)
        """,
        "..."
        )

    knl = lp.fix_parameters(knl, n=16)
    knl = lp.tag_inames(knl, dict(i="l.0"))
    knl = lp.realize_reduction(knl, force_scan=True)
    knl = lp.realize_reduction(knl)

    knl = lp.add_dtypes(knl, dict(a=int))

    evt, (out,) = knl(queue, a=np.arange(1, 17))

    assert (out == np.cumsum(np.arange(1, 17)**2)).all()

def test_scan_with_outer_parallel_iname(ctx_factory, sweep_iname_tag):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        [
            "{[k]: 0<=k<=1}",
            "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}"
        ],
        "out[k,i] = k + sum(j, j**2)"
        )

    knl = lp.tag_inames(knl, dict(k="l.0", i=sweep_iname_tag))

    n = 10
    knl = lp.fix_parameters(knl, n=n)
    knl = lp.realize_reduction(knl, force_scan=True)

    evt, (out,) = knl(queue)

    inner = np.cumsum(np.arange(n)**2)

    assert (out.get() == np.array([inner, 1 + inner])).all()

def test_scan_with_different_lower_bound_from_sweep(
        ctx_factory, sweep_lbound, scan_lbound):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n, sweep_lbound, scan_lbound] -> "
        "{[i,j]: sweep_lbound<=i<n+sweep_lbound "
        "and scan_lbound<=j<=2*(i-sweep_lbound)+scan_lbound}",
        """
        out[i-sweep_lbound] = sum(j, j**2)
        """
        )

    n = 10

    knl = lp.fix_parameters(knl, sweep_lbound=sweep_lbound,
            scan_lbound=scan_lbound)
    knl = lp.realize_reduction(knl, force_scan=True)

    evt, (out,) = knl(queue, n=n)

    assert (out.get()
            == np.cumsum(np.arange(scan_lbound, 2*n+scan_lbound)**2)[::2]).all()

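# Why the assert above holds, as a standalone numpy sketch: for sweep index i,
# the scan runs over j = scan_lbound, ..., 2*(i-sweep_lbound) + scan_lbound,
# gaining two terms per sweep step, so the result is every other entry of the
# inclusive prefix sum of j**2.
def _scan_lbound_reference(n, scan_lbound):
    return np.cumsum(np.arange(scan_lbound, 2*n + scan_lbound)**2)[::2]
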
def test_c_execution_with_global_temporaries():
    # ensure that the "host" code of a bare ExecutableCTarget with
    # global constant temporaries is None

    from loopy.target.c import ExecutableCTarget
    from loopy.kernel.data import temp_var_scope as scopes
    n = 10

    knl = lp.make_kernel("{[i]: 0 <= i < n}",
        """
            a[i] = b[i]
        """,
        [lp.GlobalArg("a", shape=(n,), dtype=np.int32),
         lp.TemporaryVariable("b", shape=(n,),
                              initializer=np.arange(n, dtype=np.int32),
                              dtype=np.int32,
                              read_only=True,
                              scope=scopes.GLOBAL)],
        target=ExecutableCTarget())

    knl = lp.fix_parameters(knl, n=n)

    assert ('int b[%d]' % n) not in lp.generate_code_v2(knl).host_code()
    assert np.allclose(knl(a=np.zeros(10, dtype=np.int32))[1], np.arange(10))

def test_precompute_with_preexisting_inames_fail():
    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j<n and 0<=k<2*n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })

    knl = lp.fix_parameters(knl, n=13)

    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")

    with pytest.raises(lp.LoopyError):
        lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
                precompute_inames="ii,jj")

def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    arr = np.ones(n, dtype=np.float32)
    segment_boundaries = np.zeros(n, dtype=np.int32)
    segment_boundaries[(segment_boundaries_indices,)] = 1

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<=i}",
        "out[i], <>_ = reduce(segmented(sum), j, arr[j], segflag[j])",
        [
            lp.GlobalArg("arr", np.float32, shape=("n",)),
            lp.GlobalArg("segflag", np.int32, shape=("n",)),
            "..."
        ])

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.tag_inames(knl, dict(i=iname_tag))

    knl = lp.realize_reduction(knl, force_scan=True)

    (evt, (out,)) = knl(queue, arr=arr, segflag=segment_boundaries)

    check_segmented_scan_output(arr, segment_boundaries_indices, out)

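# check_segmented_scan_output is defined elsewhere in the test suite; this is
# a minimal numpy sketch of the property it checks (assuming the usual
# segmented-scan convention that a set flag restarts the running sum):
def _segmented_scan_reference(arr, segment_boundaries_indices):
    out = np.empty_like(arr)
    running = 0
    for i, x in enumerate(arr):
        if i in segment_boundaries_indices:
            running = 0    # a flagged entry starts a new segment
        running += x
        out[i] = running
    return out
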
def test_local_parallel_scan(ctx_factory, n):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}",
        """
        out[i] = sum(j, a[j]**2)
        """,
        "..."
        )

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.tag_inames(knl, dict(i="l.0"))
    knl = lp.realize_reduction(knl, force_scan=True)
    knl = lp.realize_reduction(knl)

    knl = lp.add_dtypes(knl, dict(a=int))

    print(knl)

    evt, (a,) = knl(queue, a=np.arange(n))

    assert (a == np.cumsum(np.arange(n)**2)).all()

def build_loopy_kernel_A_auto():
    knl_name = "kernel_tensor_A"

    # Inputs to the kernel
    arg_names = ["A", "B", "c"]
    # Kernel parameters that will be fixed later
    param_names = ["n", "m"]
    # Tuples of inames and extents of their loops
    loops = [("i", "n"), ("j", "n"), ("k", "m")]

    # Generate the domains for the loops
    isl_domains = []
    for idx, extent in loops:
        # Create dict of loop variables (inames) and parameters
        vs = isl.make_zero_and_vars([idx], [extent])
        # Create the loop domain using '<=' and '<' restrictions
        isl_domains.append(
            ((vs[0].le_set(vs[idx])) & (vs[idx].lt_set(vs[0] + vs[extent]))))

    print("ISL loop domains:")
    print(isl_domains)
    print("")

    # Generate pymbolic variables for all used symbols
    args = {arg: pb.Variable(arg) for arg in arg_names}
    params = {param: pb.Variable(param) for param in param_names}
    inames = {iname: pb.Variable(iname) for iname, extent in loops}

    # Input arguments for the loopy kernel
    lp_args = {
        "A": lp.GlobalArg("A", dtype=np.double,
                          shape=(params["n"], params["n"])),
        "B": lp.GlobalArg("B", dtype=np.double,
                          shape=(params["m"], params["n"])),
        "c": lp.ValueArg("c", dtype=np.double)
    }

    # Generate the list of arguments & parameters that will be passed to loopy
    data = []
    data += [arg for arg in lp_args.values()]
    data += [lp.ValueArg(param) for param in ["n", "m"]]

    # Build the kernel instruction:
    # computation and assignment of the element matrix
    def build_ass():
        """
        A[i,j] = c*sum(k, B[k,i]*B[k,j])
        """
        # The target of the assignment
        target = pb.Subscript(args["A"], (inames["i"], inames["j"]))

        # The rhs expression: a reduce operation over the matrix columns
        # Maybe replace with manual increment?
        reduce_op = lp.library.reduction.SumReductionOperation()
        reduce_expr = pb.Subscript(args["B"], (inames["k"], inames["i"])) \
            * pb.Subscript(args["B"], (inames["k"], inames["j"]))
        expr = args["c"] * lp.Reduction(reduce_op, inames["k"], reduce_expr)

        return lp.Assignment(target, expr)

    ass = build_ass()
    print("Assignment expression:")
    print(ass)
    print("")

    instructions = [ass]

    # Construct the kernel
    knl = lp.make_kernel(
        isl_domains,
        instructions,
        data,
        name=knl_name,
        target=lp.CTarget(),
        lang_version=lp.MOST_RECENT_LANGUAGE_VERSION)

    knl = lp.fix_parameters(knl, n=3, m=2)
    knl = lp.prioritize_loops(knl, "i,j")

    print(knl)
    print("")

    # Generate kernel code
    knl_c, knl_h = lp.generate_code_v2(knl).device_code(), str(
        lp.generate_header(knl)[0])

    print(knl_c)
    print("")

    # Postprocess kernel code
    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(knl_c, replacements)
    knl_h = utils.replace_strings(knl_h, replacements)

    knl_call = "kernel_tensor_A(A, &B[0][0], 1.0/(2.0*Ae));"

    return knl_name, knl_call, knl_c, knl_h

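# A hypothetical driver for build_loopy_kernel_A_auto: write the generated C
# source and header to disk and hand the call snippet back to the host code.
def _demo_build_kernel_A():
    knl_name, knl_call, knl_c, knl_h = build_loopy_kernel_A_auto()
    with open(knl_name + ".c", "w") as f:
        f.write(knl_c)
    with open(knl_name + ".h", "w") as f:
        f.write(knl_h)
    return knl_call
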
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
    ctx = ctx_factory()

    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    source = source.replace("datafloat", "real*4")

    hsv_r, hsv_s = [
        knl for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
        if "KernelR" in knl.name or "KernelS" in knl.name
        ]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (
            fix_euler_parameters,
            set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.prioritize_loops(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)),
                ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)),
                ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            reader, = lp.find_instructions(hsv,
                    "tag:{knl_tag} and reads:{flux_var}"
                    .format(knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv, flux_var+"_subst",
                    flux_inames + ilp_inames,
                    temporary_name=flux_store_name,
                    precompute_inames=flux_precomp_inames + flux_ilp_inames,
                    default_tag=None)
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            n_iname = "n_"+flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname,
                    within="id:"+reader.id, existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
            lp.find_one_rule_matching(hsv, prep_var_name + "_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv,
                dict(Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
            fetch_bounding_box=True, default_tag="for",
            init_expression="0", store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv,
                dict(rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
                     rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
                     Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv,
            {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"},
            ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv,
                dict(rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
                     rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
                     Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(hsv,
            dict(rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
                 rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="vec", Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(hsv, "rhsQ_buf",
            vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    hsv = tap_hsv

    if 1:
        print("OPS")
        op_map = lp.get_op_map(hsv)
        print(lp.stringify_stats_mapping(op_map))

        print("MEM")
        gmem_map = lp.get_mem_access_map(hsv).to_bytes()
        print(lp.stringify_stats_mapping(gmem_map))

    hsv = lp.set_options(hsv, cl_build_options=[
         "-cl-denorms-are-zero",
         "-cl-fast-relaxed-math",
         "-cl-finite-math-only",
         "-cl-mad-enable",
         "-cl-no-signed-zeros",
         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv,
            parameters=dict(elements=300), quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)

def mhd_vchar(params, G, P, D, loc, dir, vmax_out=None, vmin_out=None):
    """Calculate components of magnetosonic velocity from primitive variables"""
    sh = G.shapes

    # TODO these can almost certainly be calculated on the fly.
    # Just need geometry.
    # Careful: Acov and *con change with the dir/loc arguments
    global Acov, Acon, Bcov, Bcon
    if Acov is None:
        Acov = cl_array.empty(params['queue'], sh.grid_vector, dtype=np.float64)
        Bcov = cl_array.zeros_like(Acov)
        Bcov[0] = 1
        Acon = cl_array.empty_like(Acov)
        Bcon = cl_array.empty_like(Acov)

    # Acov needs to change with dir in call
    Acov.fill(0)
    Acov[dir].fill(1)
    G.raise_grid(Acov, loc, out=Acon)
    G.raise_grid(Bcov, loc, out=Bcon)

    # Find fast magnetosonic speed
    global knl_mhd_vchar
    if knl_mhd_vchar is None:
        code = replace_prim_names("""
            <> bsq = simul_reduce(sum, mu, bcon[mu,i,j,k] * bcov[mu,i,j,k])
            <> Asq = simul_reduce(sum, mu, Acon[mu,i,j,k] * Acov[mu,i,j,k])
            <> Bsq = simul_reduce(sum, mu, Bcon[mu,i,j,k] * Bcov[mu,i,j,k])
            <> AB = simul_reduce(sum, mu, Acon[mu,i,j,k] * Bcov[mu,i,j,k])
            <> Au = simul_reduce(sum, mu, Acov[mu,i,j,k] * ucon[mu,i,j,k])
            <> Bu = simul_reduce(sum, mu, Bcov[mu,i,j,k] * ucon[mu,i,j,k])
            ef := fabs(P[RHO,i,j,k]) + gam * fabs(P[UU,i,j,k])
            ee := (bsq + ef)
            <> va2 = bsq / ee
            <> cs2 = gam * (gam - 1.) * fabs(P[UU,i,j,k]) / ef

            # Keep two temps to keep loopy from having to compile complete spaghetti
            cms21 := cs2 + va2 - cs2 * va2
            cms22 := if(cms21 > 0, cms21, 0)
            <> cms2 = if(cms22 > 1, 1, cms22)

            A := Bu**2 - (Bsq + Bu**2) * cms2
            B := 2. * (Au * Bu - (AB + Au * Bu) * cms2)
            C := Au**2 - (Asq + Au**2) * cms2
            <> discr = sqrt(if(B**2 - 4.*A*C > 0, B**2 - 4.*A*C, 0))

            vp := -(-B + discr) / (2. * A)
            vm := -(-B - discr) / (2. * A)
            vmax[i,j,k] = if(vp > vm, vp, vm)
            vmin[i,j,k] = if(vp > vm, vm, vp)
            """)
        knl_mhd_vchar = lp.make_kernel(sh.isl_grid_vector, code,
                                       [*primsArrayArgs("P", ghosts=False), ...],
                                       assumptions=sh.assume_grid,
                                       default_offset=lp.auto)
        knl_mhd_vchar = lp.fix_parameters(knl_mhd_vchar, gam=params['gam'],
                                          nprim=params['n_prim'])
        knl_mhd_vchar = tune_grid_kernel(knl_mhd_vchar, sh.grid_vector)
        knl_mhd_vchar = lp.tag_inames(knl_mhd_vchar, "mu:unr")
        print("Compiled mhd_vchar")

    if vmax_out is None:
        vmax_out = cl_array.empty(params['queue'], sh.grid_scalar,
                                  dtype=np.float64)
    if vmin_out is None:
        vmin_out = cl_array.empty(params['queue'], sh.grid_scalar,
                                  dtype=np.float64)

    evt, _ = knl_mhd_vchar(params['queue'], P=P, Acon=Acon, Acov=Acov,
                           Bcon=Bcon, Bcov=Bcov,
                           ucon=D['ucon'], bcon=D['bcon'], bcov=D['bcov'],
                           vmax=vmax_out, vmin=vmin_out)
    if 'profile' in params and params['profile']:
        evt.wait()

    return vmax_out, vmin_out

def U_to_P(params, G, U, P, Pout=None, iter_max=None):
    s = G.slices
    sh = G.shapes

    # Set some default parameters, but allow overrides
    if iter_max is None:
        if 'invert_iter_max' in params:
            iter_max = params['invert_iter_max']
        else:
            iter_max = 8
    if 'invert_err_tol' in params:
        err_tol = params['invert_err_tol']
    else:
        err_tol = 1.e-8
    if 'invert_iter_delta' in params:
        delta = params['invert_iter_delta']
    else:
        delta = 1.e-5
    if 'gamma_max' not in params:
        params['gamma_max'] = 25

    # Don't overwrite old memory by default, just allow using old values.
    # The caller can pass new or old memory, but is responsible for making
    # sure it contains sensible values if we don't update it.
    if Pout is None:
        Pout = P.copy()

    # Update the primitive B-fields
    G.vecdivbygeom(params['queue'], u=U[s.B3VEC],
                   g=G.gdet_d[Loci.CENT.value], out=Pout[s.B3VEC])

    # Cached constant quantities
    global ncov, ncon, lgdet
    if ncov is None:
        # For later on
        ncov = cl_array.zeros(params['queue'], sh.grid_vector, dtype=np.float64)
        G.timesgeom(params['queue'], u=cl_array.empty_like(ncov[0]).fill(1.0),
                    g=-G.lapse_d[Loci.CENT.value], out=ncov[0])
        ncon = G.raise_grid(ncov)
        lgdet = G.lapse_d[Loci.CENT.value] / G.gdet_d[Loci.CENT.value]

    # Eflag will indicate inversion failures.
    # Define a generic kernel so we can split out flagging in the future.
    # Use it to catch negative density early on.
    eflag = cl_array.zeros(params['queue'], sh.grid_scalar, dtype=np.int32)
    global knl_set_eflag
    if knl_set_eflag is None:
        code = add_ghosts(
            """eflag[i,j,k] = if(var[i,j,k] < 0, flag, eflag[i,j,k])""")
        knl_set_eflag = lp.make_kernel(sh.isl_grid_scalar, code,
                                       [*scalarArrayArgs("eflag", dtype=np.int32),
                                        *scalarArrayArgs("var"),
                                        lp.ValueArg("flag", dtype=np.int32),
                                        ...],
                                       default_offset=lp.auto)
        knl_set_eflag = tune_grid_kernel(knl_set_eflag, sh.bulk_scalar, ng=G.NG)

    evt, _ = knl_set_eflag(params['queue'], var=U[s.RHO], eflag=eflag, flag=-100)

    # Convert from conserved variables to four-vectors
    Bcon = cl_array.zeros(params['queue'], sh.grid_vector, dtype=np.float64)
    G.vectimesgeom(params['queue'], u=U[s.B3VEC], g=lgdet, out=Bcon[1:])

    Qcov = cl_array.empty_like(Bcon)
    G.timesgeom(params['queue'], u=(U[s.UU] - U[s.RHO]), g=lgdet, out=Qcov[0])
    G.vectimesgeom(params['queue'], u=U[s.U3VEC], g=lgdet, out=Qcov[1:])

    Bcov = G.lower_grid(Bcon)
    Qcon = G.raise_grid(Qcov)  # This will have fringes of zeros still!

    Bsq = G.dot(Bcon, Bcov)
    QdB = G.dot(Bcon, Qcov)
    Qdotn = G.dot(Qcon, ncov)
    Qsq = G.dot(Qcon, Qcov)

    Qtsq = Qsq + Qdotn**2

    Qtcon = cl_array.empty_like(Qcon)
    for i in range(4):
        Qtcon[i] = Qcon[i] + ncon[i] * Qdotn

    # Set up eqn for W', the energy density
    D = cl_array.zeros_like(Qsq)
    G.timesgeom(params['queue'], u=U[s.RHO], g=lgdet, out=D)
    Ep = -Qdotn - D

    del Bcov, Qcon, Qcov

    # Numerical rootfinding
    # Take guesses from primitives
    Wp = Wp_func(params, G, P, Loci.CENT, eflag)

    # Trap on any failures so far if debugging. They're very rare.
    if 'debug' in params and params['debug']:
        if np.any(eflag.get()[s.bulk] != 0):
            raise ValueError("Unexpected flag set!")

    # Step around the guess & evaluate errors
    h = delta * Wp  # TODO stable enough? Need fancy subtraction from iharm3d?
    errp = err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp + h, eflag)
    err = err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp, eflag)
    errm = err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp - h, eflag)

    # Preserve Wp/err before updating them below
    Wp1 = Wp.copy()
    err1 = err.copy()

    global knl_utop_prep
    if knl_utop_prep is None:
        # TODO keep an accumulator here to avoid that costly any() call?
        code = add_ghosts("""
            # TODO put error/prep in here, not calling below
            # Attempt a Halley/Muller/Bailey/Press step
            dedW := (errp[i,j,k] - errm[i,j,k]) / (2 * h[i,j,k])
            dedW2 := (errp[i,j,k] - 2.*err[i,j,k] + errm[i,j,k]) / (h[i,j,k]**2)
            # Limit size of 2nd derivative correction.
            # Loopy trick common in HARM: define intermediate variables
            # (xt, xt2, xt3...) to impose clipping.  This allows assignments
            # or substitutions while keeping dependencies straight.
            ft := 0.5*err[i,j,k]*dedW2/(dedW**2)
            ft2 := if(ft > 0.3, 0.3, ft)
            f := if(ft2 < -0.3, -0.3, ft2)
            # Limit size of step
            dWt := -err[i,j,k] / dedW / (1. - f)
            dWt2 := if(dWt < -0.5*Wp[i,j,k], -0.5*Wp[i,j,k], dWt)
            dW := if(dWt2 > 2.0*Wp[i,j,k], 2.0*Wp[i,j,k], dWt2)

            Wp[i,j,k] = Wp[i,j,k] + dW {id=wp}

            # Guarantee we take one step in every bulk zone
            stop_flag[i,j,k] = 0
            # This would avoid the step where there's convergence, but would
            # require taking 2*dW above or similar
            #stop_flag[i,j,k] = (fabs(dW / Wp[i,j,k]) < err_tol) + \
            #        (fabs(err[i,j,k] / Wp[i,j,k]) < err_tol) {dep=wp,nosync=wp}
            """)
        knl_utop_prep = lp.make_kernel(sh.isl_grid_scalar, code,
                                       [*scalarArrayArgs("err", "errp", "errm",
                                                         "h", "Wp"),
                                        *scalarArrayArgs("stop_flag",
                                                         dtype=np.int8),
                                        ...],
                                       assumptions=sh.assume_grid,
                                       seq_dependencies=True)
        knl_utop_prep = lp.fix_parameters(knl_utop_prep, err_tol=err_tol)
        knl_utop_prep = tune_grid_kernel(knl_utop_prep, sh.bulk_scalar, ng=G.NG)
        print("Compiled utop_prep")

    # Fill stop_flag with 1 so we don't have to worry about ghost zones
    # taking steps
    stop_flag = cl_array.empty(params['queue'], sh.grid_scalar,
                               dtype=np.int8).fill(1)

    evt, _ = knl_utop_prep(params['queue'], err=err, errp=errp, errm=errm,
                           h=h, Wp=Wp, stop_flag=stop_flag)
    evt.wait()

    err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp, eflag, out=err)

    # Iteration kernel for 1Dw solver
    global knl_utop_iter
    if knl_utop_iter is None:
        code = add_ghosts("""
            # Evaluate whether we need to do any of this
            <> go = not(stop_flag[i,j,k]) {id=insn_go}

            # Normal secant increment is dW. Limit guess to between 0.5 and
            # 2 times current value
            dWt := (Wp1[i,j,k] - Wp[i,j,k]) * err[i,j,k] / (err[i,j,k] - err1[i,j,k])
            dWt2 := if(dWt < -0.5*Wp[i,j,k], -0.5*Wp[i,j,k], dWt)
            <> dW = if(dWt2 > 2.0*Wp[i,j,k], 2.0*Wp[i,j,k], dWt2) {id=dw,if=go}

            # Preserve last values, after use but before any changes
            Wp1[i,j,k] = Wp[i,j,k] {id=wp1,nosync=dw,if=go}
            err1[i,j,k] = err[i,j,k] {nosync=dw,if=go}

            # Update Wp. Err will be updated outside kernel
            Wp[i,j,k] = Wp[i,j,k] + dW {id=wp,nosync=dw:wp1,if=go}

            # Set flag not to continue in zones that have converged
            stop_flag[i,j,k] = if(fabs(dW / Wp[i,j,k]) < err_tol, 1, \
                    stop_flag[i,j,k]) {nosync=dw:wp:insn_go,if=go}

            # For the future, when we've defined err_eqn for loopy kernels
            # err = err_eqn(Bsq, D, Ep, QdB, Qtsq, Wp, gam, gamma_max, eflag) {if=go}
            # stop_flag[i,j,k] = stop_flag[i,j,k] + (fabs(err[] / Wp[]) < err_tol) {if=go}
            """)
        knl_utop_iter = lp.make_kernel(sh.isl_grid_scalar, code,
                                       [*scalarArrayArgs("Wp", "Wp1",
                                                         "err", "err1"),
                                        *scalarArrayArgs("stop_flag",
                                                         dtype=np.int8),
                                        ...],
                                       assumptions=sh.assume_grid,
                                       default_offset=lp.auto,
                                       seq_dependencies=True)
        knl_utop_iter = lp.fix_parameters(knl_utop_iter, err_tol=err_tol)
        knl_utop_iter = tune_grid_kernel(knl_utop_iter, sh.bulk_scalar, ng=G.NG)
        print("Compiled utop_iter")

    # Iterate at least once to set new values from first step
    # TODO Needed now we set Wp, err1 right?
    for niter in range(iter_max):
        #print("U_to_P iter")
        evt, _ = knl_utop_iter(params['queue'], Wp=Wp, Wp1=Wp1,
                               err=err, err1=err1, stop_flag=stop_flag)
        err = err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp, eflag)
        # TODO there may be better/faster if/reduction statements here...
        stop_flag |= (clm.fabs(err / Wp) < err_tol)
        if cl_array.min(stop_flag) >= 1:
            break

    # If secant method failed to converge, do not set primitives other than B
    eflag += (stop_flag == 0)

    del Wp1, err, err1, stop_flag

    # Find utsq, gamma, rho0 from Wp
    gamma = gamma_func(params, G, Bsq, D, QdB, Qtsq, Wp, eflag)

    if 'debug' in params and params['debug']:
        if np.any(gamma.get()[s.bulk] < 1.):
            raise ValueError("gamma < 1 failure!")

    # Find the scalars
    global knl_utop_set
    if knl_utop_set is None:
        code = add_ghosts(replace_prim_names("""
            rho0 := D[i,j,k] / gamma[i,j,k]
            W := Wp[i,j,k] + D[i,j,k]
            w := W / (gamma[i,j,k]**2)
            pres := (w - rho0) * (gam - 1.) / gam
            u := w - (rho0 + pres)

            # Set flag if prims are < 0
            eflag[i,j,k] = if((u < 0)*(rho0 < 0), 8, \
                    if(u < 0, 7, if(rho0 < 0, 6, eflag[i,j,k]))) {id=ef}

            # Don't update flagged primitives
            # (necessary? Could skip the branch if fixup does ok)
            <> set = not(eflag[i,j,k]) {dep=ef,nosync=ef}
            P[RHO,i,j,k] = rho0 {if=set}
            P[UU,i,j,k] = u {if=set}
            P[U1,i,j,k] = (gamma[i,j,k] / (W + Bsq[i,j,k])) * \
                    (Qtcon[1,i,j,k] + QdB[i,j,k] * Bcon[1,i,j,k] / W) {if=set}
            P[U2,i,j,k] = (gamma[i,j,k] / (W + Bsq[i,j,k])) * \
                    (Qtcon[2,i,j,k] + QdB[i,j,k] * Bcon[2,i,j,k] / W) {if=set}
            P[U3,i,j,k] = (gamma[i,j,k] / (W + Bsq[i,j,k])) * \
                    (Qtcon[3,i,j,k] + QdB[i,j,k] * Bcon[3,i,j,k] / W) {if=set}
            """))
        if 'electrons' in params and params['electrons']:
            code += add_ghosts("""
                P[KEL,i,j,k] = U[KEL,i,j,k]/U[RHO,i,j,k]
                P[KTOT,i,j,k] = U[KTOT,i,j,k]/U[RHO,i,j,k]
                """)
        knl_utop_set = lp.make_kernel(sh.isl_grid_scalar, code,
                                      [*primsArrayArgs("P"),
                                       *vecArrayArgs("Qtcon", "Bcon"),
                                       *scalarArrayArgs("D", "gamma", "Wp",
                                                        "Bsq", "QdB"),
                                       *scalarArrayArgs("eflag",
                                                        dtype=np.int32),
                                       ...],
                                      assumptions=sh.assume_grid,
                                      default_offset=lp.auto)
        knl_utop_set = lp.fix_parameters(knl_utop_set, gam=params['gam'],
                                         nprim=params['n_prim'], ndim=4)
        knl_utop_set = tune_grid_kernel(knl_utop_set, sh.bulk_scalar, ng=G.NG)
        print("Compiled utop_set")

    evt, _ = knl_utop_set(params['queue'], P=Pout, Qtcon=Qtcon, Bcon=Bcon,
                          D=D, gamma=gamma, Wp=Wp, Bsq=Bsq, QdB=QdB,
                          eflag=eflag)
    evt.wait()

    del Qtcon, Bcon, D, gamma, Wp, Bsq, QdB

    # Trap on flags early in test problems
    if 'debug' in params and params['debug']:
        n_nonzero = np.count_nonzero(eflag.get())
        if n_nonzero > 0:
            print("Nonzero eflag in bulk: {}\nFlags: {}".format(
                n_nonzero, np.argwhere(eflag.get() != 0)))

    return Pout, eflag

def get_kernel(self):
    ncoeffs = len(self.expansion)

    loopy_insns, result_names = self.get_loopy_insns_and_result_names()

    loopy_knl = lp.make_kernel(
            [
                "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
                "{[itgt]: itgt_start<=itgt<itgt_end}",
                "{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_end }",
                "{[idim]: 0<=idim<dim}",
            ],
            ["""
                for itgt_box
                    <> tgt_ibox = target_boxes[itgt_box]
                    <> itgt_start = box_target_starts[tgt_ibox]
                    <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox]

                    for itgt
                        <> tgt[idim] = targets[idim,itgt]

                        <> isrc_box_start = source_box_starts[itgt_box]
                        <> isrc_box_end = source_box_starts[itgt_box+1]

                        for isrc_box
                            <> src_ibox = source_box_lists[isrc_box]
            """] + ["""
                            <> coeff{coeffidx} = \
                                src_expansions[src_ibox - src_base_ibox, {coeffidx}]
            """.format(coeffidx=i) for i in range(ncoeffs)] + ["""
                            <> center[idim] = centers[idim, src_ibox] {dup=idim}
                            <> b[idim] = tgt[idim] - center[idim] {dup=idim}
            """] + loopy_insns + ["""
                        end
            """] + ["""
                        result[{resultidx}, itgt] = result[{resultidx}, itgt] + \
                            kernel_scaling * simul_reduce(sum, isrc_box, \
                            result_{resultidx}_p) {{id_prefix=write_result}}
            """.format(resultidx=i) for i in range(len(result_names))] + ["""
                    end
                end
            """],
            [
                lp.GlobalArg("targets", None,
                    shape=(self.dim, "ntargets"), dim_tags="sep,C"),
                lp.GlobalArg("box_target_starts,box_target_counts_nonchild",
                    None, shape=None),
                lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"),
                lp.GlobalArg("src_expansions", None,
                    shape=("nsrc_level_boxes", ncoeffs), offset=lp.auto),
                lp.ValueArg("src_base_ibox", np.int32),
                lp.ValueArg("nsrc_level_boxes,aligned_nboxes", np.int32),
                lp.ValueArg("ntargets", np.int32),
                lp.GlobalArg("result", None,
                    shape="nresults,ntargets", dim_tags="sep,C"),
                lp.GlobalArg("source_box_starts, source_box_lists,",
                    None, shape=None, offset=lp.auto),
                "..."
            ] + [arg.loopy_arg for arg in self.expansion.get_args()],
            name=self.name,
            assumptions="ntgt_boxes>=1",
            silenced_warnings="write_race(write_result*)",
            default_offset=lp.auto)

    loopy_knl = lp.fix_parameters(loopy_knl, dim=self.dim,
            nresults=len(result_names))

    loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
    loopy_knl = lp.prioritize_loops(loopy_knl, "itgt_box,itgt,isrc_box")
    loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl)

    return loopy_knl

# (fragment: the opening of this kernel is not included here)
                        (in_disk and qbx_forced_limit == 0)
                        or (in_disk
                            and qbx_forced_limit != 0
                            and qbx_forced_limit * center_side[ictr] > 0)
                        )

                <> post_dist_sq = if(matches, dist_sq, HUGE)
            end

            <> min_dist_sq, <> min_ictr = argmin(ictr, ictr, post_dist_sq)

            tgt_to_qbx_center[itgt] = if(min_dist_sq < HUGE, min_ictr, -1)
        end
        """)

    knl = lp.fix_parameters(knl, ambient_dim=2)

    knl = lp.add_and_infer_dtypes(knl, {
        "tgt,center,radius,HUGE": np.float32,
        "center_side,qbx_forced_limit": np.int32,
        })

    lp.auto_test_vs_ref(knl, cl_ctx, knl, parameters={
        "HUGE": 1e20, "ncenters": 200, "ntargets": 300,
        "qbx_forced_limit": 1})

def test_dg_volume(ctx_factory):
    #logging.basicConfig(level=logging.DEBUG)
    dtype = np.float32
    dtype4 = cl.array.vec.float4
    ctx = ctx_factory()

    order = "F"

    N = 3  # noqa
    Np = (N+1)*(N+2)*(N+3)//6  # noqa

    K = 10000  # noqa

    knl = lp.make_kernel([
            "{[n,m,k]: 0<= n,m < Np and 0<= k < K}",
            ],
            """
            <> du_drst = simul_reduce(sum, m, DrDsDt[n,m]*u[k,m])
            <> dv_drst = simul_reduce(sum, m, DrDsDt[n,m]*v[k,m])
            <> dw_drst = simul_reduce(sum, m, DrDsDt[n,m]*w[k,m])
            <> dp_drst = simul_reduce(sum, m, DrDsDt[n,m]*p[k,m])

            # volume flux
            rhsu[k,n] = dot(drst_dx[k],dp_drst)
            rhsv[k,n] = dot(drst_dy[k],dp_drst)
            rhsw[k,n] = dot(drst_dz[k],dp_drst)
            rhsp[k,n] = dot(drst_dx[k], du_drst) + dot(drst_dy[k], dv_drst) \
                    + dot(drst_dz[k], dw_drst)
            """,
            [
                lp.GlobalArg("u,v,w,p,rhsu,rhsv,rhsw,rhsp",
                    dtype, shape="K, Np", order="C"),
                lp.GlobalArg("DrDsDt", dtype4, shape="Np, Np", order="C"),
                lp.GlobalArg("drst_dx,drst_dy,drst_dz", dtype4, shape="K",
                    order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="dg_volume", assumptions="K>=1")

    knl = lp.fix_parameters(knl, Np=Np)

    seq_knl = knl

    def variant_basic(knl):
        knl = lp.tag_inames(knl, dict(k="g.0", n="l.0"))
        return knl

    def variant_more_per_work_group(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))
        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
        return knl

    def variant_image_d(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))
        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
        knl = lp.change_arg_to_image(knl, "DrDsDt")
        return knl

    def variant_prefetch_d(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))
        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
        knl = lp.add_prefetch(knl, "DrDsDt[:,:]",
                fetch_outer_inames='k_outer', default_tag="l.auto")
        return knl

    def variant_prefetch_fields(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))
        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
        for name in ["u", "v", "w", "p"]:
            knl = lp.add_prefetch(knl, "%s[k,:]" % name, ["k_inner"],
                    default_tag="l.auto")

        return knl

    def variant_k_ilp(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))

        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="ilp")
        knl = lp.tag_inames(knl, dict(m="unr"))
        return knl

    def variant_simple_padding(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))

        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")

        arg_names = [
                prefix+name
                for name in ["u", "v", "w", "p"]
                for prefix in ["", "rhs"]]

        for name in arg_names:
            knl = lp.add_padding(knl, name, axis=0, align_bytes=32)

        knl = lp.tag_inames(knl, dict(m="unr"))

        return knl

    def variant_fancy_padding(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))

        pad_mult = lp.find_padding_multiple(knl, "u", 1, 32)

        arg_names = [
                prefix+name
                for name in ["u", "v", "w", "p"]
                for prefix in ["", "rhs"]]

        knl = lp.split_array_dim(knl, [(nm, 0) for nm in arg_names], pad_mult)

        return knl

    parameters_dict = dict(K=K)

    variants = [
            variant_basic,
            variant_more_per_work_group,
            variant_prefetch_d,
            variant_prefetch_fields,
            variant_k_ilp,
            variant_simple_padding,
            variant_fancy_padding
            ]

    if (ctx.devices[0].image_support
            and ctx.devices[0].platform.name != "Portable Computing Language"):
        variants.append(variant_image_d)

    for variant in variants:
        lp.auto_test_vs_ref(
                seq_knl, ctx, variant(knl), parameters=parameters_dict,
                #codegen_kwargs=dict(with_annotation=True)
                )

def advance_fluid(params, G, Pi, Ps, dt):
    s = G.slices
    sh = G.shapes

    global Ui, dU, Uf, Ds
    if Ui is None:
        Ui = cl_array.zeros(params['queue'], sh.grid_primitives,
                            dtype=np.float64)
        dU = cl_array.zeros(params['queue'], sh.grid_primitives,
                            dtype=np.float64)
        Uf = cl_array.zeros(params['queue'], sh.grid_primitives,
                            dtype=np.float64)
        Ds = get_state(params, G, Ps, Loci.CENT)

    global ihc
    if ihc is None and 'use_ctypes' in params and params['use_ctypes']:
        ihc = Iharmc()

    F, ndt = get_flux(params, G, Ps)

    # plot_all_prims(G, F[1], "F1")
    # plot_all_prims(G, F[2], "F2")
    # plot_all_prims(G, F[3], "F3")

    # TODO RESTORE following
    # if params['metric'][-3:] == "mks":
    #     fix_flux(F)

    # Constrained transport for B
    flux_ct(params, G, F)

    # Flux diagnostic globals
    # diag_flux(F)

    # Get conserved variables & source term
    get_state(params, G, Ps, Loci.CENT, out=Ds)
    get_fluid_source(params, G, Ps, Ds, out=dU)

    if Pi is not Ps:
        Di = get_state(params, G, Pi, Loci.CENT)
    else:
        Di = Ds

    prim_to_flux(params, G, Pi, Di, 0, Loci.CENT, out=Ui)

    if 'P_U_P_test' in params and params['P_U_P_test']:
        # Test U_to_P by recovering a prims array we know (Ps),
        # starting from a close but not identical array (Pi)
        if Pi is not Ps:
            Us = prim_to_flux(params, G, Ps, Ds, 0, Loci.CENT)
            Ps_fromU, _ = ihc.U_to_P(params, G, Us, Pi)
            print("Pi vs Ps change: ", la.norm((Ps - Pi).get()[s.bulk]))
            print("Ps->U->Ps Absolute Error: ",
                  la.norm((Ps - Ps_fromU).get()[s.bulk]))
        else:
            print("Pi is Ps")

    global knl_finite_diff
    if knl_finite_diff is None:
        code = add_ghosts("""
            Uf[p,i,j,k] = Ui[p,i,j,k] + \
                dt * ((F1[p,i,j,k] - F1[p,i+1,j,k]) / dx1 + \
                      (F2[p,i,j,k] - F2[p,i,j+1,k]) / dx2 + \
                      (F3[p,i,j,k] - F3[p,i,j,k+1]) / dx3 + dU[p,i,j,k])
            """)
        knl_finite_diff = lp.make_kernel(sh.isl_grid_primitives, code,
                                         [lp.ValueArg("dt", dtype=np.float64),
                                          *primsArrayArgs("Ui", "F1", "F2",
                                                          "F3", "dU", "Uf"),
                                          ...],
                                         assumptions=sh.assume_grid)
        knl_finite_diff = lp.fix_parameters(knl_finite_diff, dx1=G.dx[1],
                                            dx2=G.dx[2], dx3=G.dx[3])
        knl_finite_diff = tune_prims_kernel(knl_finite_diff,
                                            sh.bulk_primitives, ng=G.NG)
        print("Compiled finite_diff")

    if isinstance(dt, cl_array.Array):
        dt = dt.get()

    evt, _ = knl_finite_diff(params['queue'], dt=float(dt), Ui=Ui,
                             F1=F[1], F2=F[2], F3=F[3], dU=dU, Uf=Uf)

    # Newton-Raphson
    if ihc is not None:
        # If we're using iharm3d's C U_to_P fn, negotiate memory.
        # Note eflag is not preserved!
        Pf, pflag = ihc.U_to_P(params, G, Uf.get(), Pi.get())
        Pf = cl_array.to_device(params['queue'], Pf)
        pflag = cl_array.to_device(params['queue'], pflag)
    else:
        # Loopy version
        Pf, pflag = U_to_P(params, G, Uf, Pi)

    if params['debug']:
        print("Uf - Ui: ", la.norm((Uf - Ui).get()))
        print("Pf - Pi: ", la.norm((Pf - Pi).get()))

    return Pf, pflag, ndt

def test_tim2d(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")  # noqa

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(
            "{[i,j,e,m,o,o2,gi]: 0<=i,j,m,o,o2<n and 0<=e<K and 0<=gi<3}",
            [
                "ur(a,b) := simul_reduce(sum, o, D[a,o]*u[e,o,b])",
                "us(a,b) := simul_reduce(sum, o2, D[b,o2]*u[e,a,o2])",

                #"Gu(mat_entry,a,b) := G[mat_entry,e,m,j]*ur(m,j)",

                "Gux(a,b) := G$x[0,e,a,b]*ur(a,b)+G$x[1,e,a,b]*us(a,b)",
                "Guy(a,b) := G$y[1,e,a,b]*ur(a,b)+G$y[2,e,a,b]*us(a,b)",
                "lap[e,i,j] = "
                "  simul_reduce(sum, m, D[m,i]*Gux(m,j))"
                "+ simul_reduce(sum, m, D[m,j]*Guy(i,m))"
                ],
            [
                lp.GlobalArg("u", dtype, shape=field_shape, order=order),
                lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
                lp.GlobalArg("G", dtype, shape=(3,)+field_shape, order=order),
                # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
                lp.GlobalArg("D", dtype, shape=(n, n), order=order),
                # lp.ImageArg("D", dtype, shape=(n, n)),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap2D", assumptions="K>=1")

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.duplicate_inames(knl, "o", within="id:ur")
    knl = lp.duplicate_inames(knl, "o", within="id:us")

    seq_knl = knl

    def variant_orig(knl):
        knl = lp.tag_inames(knl, dict(i="l.0", j="l.1", e="g.0"))

        knl = lp.add_prefetch(knl, "D[:,:]")
        knl = lp.add_prefetch(knl, "u[e, :, :]")

        knl = lp.precompute(knl, "ur(m,j)", ["m", "j"])
        knl = lp.precompute(knl, "us(i,m)", ["i", "m"])

        knl = lp.precompute(knl, "Gux(m,j)", ["m", "j"])
        knl = lp.precompute(knl, "Guy(i,m)", ["i", "m"])

        knl = lp.add_prefetch(knl, "G$x[:,e,:,:]")
        knl = lp.add_prefetch(knl, "G$y[:,e,:,:]")

        knl = lp.tag_inames(knl, dict(o="unr"))
        knl = lp.tag_inames(knl, dict(m="unr"))

        knl = lp.set_instruction_priority(knl, "id:D_fetch", 5)
        print(knl)

        return knl

    for variant in [variant_orig]:
        K = 1000  # noqa
        lp.auto_test_vs_ref(seq_knl, ctx, variant(knl),
                op_count=[K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9],
                op_label=["GFlops"],
                parameters={"K": K})

def no_test_dg_surface(ctx_factory):
    # tough to test, would need the right index info
    dtype = np.float32
    ctx = ctx_factory()

    order = "F"

    N = 3  # noqa
    Np = (N+1)*(N+2)*(N+3)//6  # noqa
    Nfp = (N+1)*(N+2)//2  # noqa
    Nfaces = 4  # noqa

    K = 10000  # noqa

    knl = lp.make_kernel(
            [
                "{[m,n,k]: 0<= m < NfpNfaces and 0<= n < Np and 0<= k < K }"
                ],
            """
            <> idP = vmapP[m,k]
            <> idM = vmapM[m,k]

            <> du = u[[idP]]-u[[idM]]
            <> dv = v[[idP]]-v[[idM]]
            <> dw = w[[idP]]-w[[idM]]
            <> dp = bc[m,k]*p[[idP]] - p[[idM]]

            <> dQ = 0.5*Fscale[m,k]* \
                    (dp - nx[m,k]*du - ny[m,k]*dv - nz[m,k]*dw)

            <> fluxu = -nx[m,k]*dQ
            <> fluxv = -ny[m,k]*dQ
            <> fluxw = -nz[m,k]*dQ
            <> fluxp =          dQ

            # reduction here
            rhsu[n,k] = sum(m, LIFT[n,m]*fluxu)
            rhsv[n,k] = sum(m, LIFT[n,m]*fluxv)
            rhsw[n,k] = sum(m, LIFT[n,m]*fluxw)
            rhsp[n,k] = sum(m, LIFT[n,m]*fluxp)
            """,
            [
                lp.GlobalArg("vmapP,vmapM",
                    np.int32, shape="NfpNfaces, K", order=order),
                lp.GlobalArg("u,v,w,p,rhsu,rhsv,rhsw,rhsp",
                    dtype, shape="Np, K", order=order),
                lp.GlobalArg("nx,ny,nz,Fscale,bc",
                    dtype, shape="NfpNfaces, K", order=order),
                lp.GlobalArg("LIFT", dtype, shape="Np, NfpNfaces", order="C"),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="dg_surface", assumptions="K>=1")

    knl = lp.fix_parameters(knl,
            Np=Np, Nfp=Nfp,
            NfpNfaces=Nfaces*Nfp,
            nsurf_dofs=K*Nfp)

    seq_knl = knl

    def variant_basic(knl):
        return knl

    parameters_dict = dict(K=K)

    for variant in [
            variant_basic,
            ]:
        lp.auto_test_vs_ref(seq_knl, ctx, variant(knl),
                parameters=parameters_dict)

def test_rob_stroud_bernstein_full(ctx_factory):
    #logging.basicConfig(level=logging.DEBUG)
    ctx = ctx_factory()

    # NOTE: result would have to be zero-filled beforehand

    knl = lp.make_kernel(
        "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \
                0 <= el < nels and \
                0 <= i2 < nqp1d and \
                0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 and\
                \
                0 <= i1_2 < nqp1d and \
                0 <= alpha1_2 <= deg and \
                0 <= i2_2 < nqp1d \
                }",
        """
        for el
            for i2
                <> xi = qpts[1, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init}

                for alpha1
                    <> w = s**(deg-alpha1) {id=init_w}

                    <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \
                            {id=write_tmp}
                    for alpha2
                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                                {id=update_w,dep=init_w:write_tmp}
                        aind = aind + 1 \
                                {id=aind_incr,dep=aind_init:write_tmp:update_w}
                    end
                end
            end

            for i1_2
                <> xi2 = qpts[0, i1_2] {dep=aind_incr}
                <> s2 = 1-xi2
                <> r2 = xi2/s2
                <> w2 = s2**deg

                for alpha1_2
                    for i2_2
                        result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \
                                w2 * tmp[alpha1_2, i2_2]
                    end

                    w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2)
                end
            end
        end
        """,
        [
            # Must declare coeffs to have "no" shape, to keep loopy
            # from trying to figure out the shape automatically.
            lp.GlobalArg("coeffs", None, shape=None),
            "..."
            ],
        assumptions="deg>=0 and nels>=1"
        )

    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)

    if 0:
        knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
        knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0",
                inner_tag="ilp", slabs=(0, 1))
        knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))

    from pickle import dumps, loads
    knl = loads(dumps(knl))

    knl = lp.CompiledKernel(ctx, knl).get_highlighted_code(
            dict(
                qpts=np.float32,
                tmp=np.float32,
                coeffs=np.float32,
                result=np.float32,
                ))
    print(knl)

def test_rob_stroud_bernstein_full(ctx_factory):
    #logging.basicConfig(level=logging.DEBUG)
    ctx = ctx_factory()

    # NOTE: result would have to be zero-filled beforehand

    knl = lp.make_kernel(
        "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \
                0 <= el < nels and \
                0 <= i2 < nqp1d and \
                0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 and\
                \
                0 <= i1_2 < nqp1d and \
                0 <= alpha1_2 <= deg and \
                0 <= i2_2 < nqp1d \
                }",
        """
        for el
            for i2
                <> xi = qpts[1, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init}

                for alpha1
                    <> w = s**(deg-alpha1) {id=init_w}

                    <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \
                            {id=write_tmp,dep=init_w:aind_init}
                    for alpha2
                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                                {id=update_w,dep=init_w:write_tmp}
                        aind = aind + 1 \
                                {id=aind_incr,dep=aind_init:write_tmp:update_w}
                    end
                end
            end

            for i1_2
                <> xi2 = qpts[0, i1_2] {dep=aind_incr}
                <> s2 = 1-xi2
                <> r2 = xi2/s2
                <> w2 = s2**deg {id=w2_init}

                for alpha1_2
                    for i2_2
                        result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \
                                w2 * tmp[alpha1_2, i2_2] {id=res2,dep=w2_init}
                    end

                    w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2) \
                            {id=w2_update, dep=res2}
                end
            end
        end
        """,
        [
            # Must declare coeffs to have "no" shape, to keep loopy
            # from trying to figure out the shape automatically.
            lp.GlobalArg("coeffs", None, shape=None),
            "..."
            ],
        assumptions="deg>=0 and nels>=1")

    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)

    if 0:
        knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
        knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0",
                inner_tag="ilp", slabs=(0, 1))
        knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))

    from pickle import dumps, loads
    knl = loads(dumps(knl))

    knl = lp.CompiledKernel(ctx, knl).get_highlighted_code(
            dict(
                qpts=np.float32,
                tmp=np.float32,
                coeffs=np.float32,
                result=np.float32,
                ))
    print(knl)

def set_up_volume_loop(kernel, Nq):
    kernel = lp.fix_parameters(kernel, Nq=Nq)
    kernel = lp.set_loop_priority(kernel, "e,k,j,i")
    kernel = lp.tag_inames(kernel, dict(e="g.0", j="l.1", i="l.0"))
    kernel = lp.assume(kernel, "elements >= 1")
    return kernel

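# set_up_volume_loop above bundles essentially the same sequence that
# test_gnuma_horiz_kernel applies inline (fix Nq, set the loop priority,
# tag hardware inames, assume at least one element). Hypothetical call:
#
#     hsv = set_up_volume_loop(hsv, Nq=8)
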
def fix_euler_parameters(kernel, p_p0, p_Gamma, p_R):
    return lp.fix_parameters(
            kernel,
            p_p0=pick_apart_float_cast(p_p0),
            p_Gamma=pick_apart_float_cast(p_Gamma),
            p_R=pick_apart_float_cast(p_R))

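# Usage of fix_euler_parameters as it appears in test_gnuma_horiz_kernel above:
#
#     hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
#
# pick_apart_float_cast is defined alongside this helper (not shown here);
# going by its name, it presumably unwraps an explicit float cast around each
# value before fix_parameters substitutes it.
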