Ejemplo n.º 1
0
def test_nested_scan(ctx_factory, i_tag, j_tag):
    """Check a scan whose input array is itself produced by a scan.

    ``tmp[i] = sum(k, 1)`` counts ones over ``k`` (realized as a scan) and
    ``out[i]`` then prefix-sums ``tmp`` over ``j``.  ``realize_reduction``
    with ``force_scan=True`` must handle both nested reductions.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        [
            "[n] -> {[i]: 0 <= i < n}",
            "[i] -> {[j]: 0 <= j <= i}",
            "[i] -> {[k]: 0 <= k <= i}"
        ],
        """
        <>tmp[i] = sum(k, 1)
        out[i] = sum(j, tmp[j])
        """)

    knl = lp.fix_parameters(knl, n=10)
    # i_tag/j_tag come from test parametrization and control how the sweep
    # and scan inames are realized.
    knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag))

    knl = lp.realize_reduction(knl, force_scan=True)

    print(knl)

    evt, (out,) = knl(queue)

    print(out)
Ejemplo n.º 2
0
def test_global_parallel_reduction_simpler(ctx_factory, size):
    """Two-stage global parallel reduction: per-group partial sums into
    ``tmp``, then a final sum over groups, with philox4x32-generated input.

    NOTE(review): xfail-ed unconditionally below -- the kernel is very
    sensitive to instruction ordering and trips the unused hw-axis check.
    """
    ctx = ctx_factory()

    pytest.xfail("very sensitive to kernel ordering, fails unused hw-axis check")

    knl = lp.make_kernel(
            "{[l,g,j]: 0 <= l < nl and 0 <= g,j < ng}",
            """
            <> key = make_uint2(l+nl*g, 1234)  {inames=l:g}
            <> ctr = make_uint4(0, 1, 2, 3)  {inames=l:g,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}

            <> tmp[g] = sum(l, vals.s0 + 1j*vals.s1 + vals.s2 + 1j*vals.s3)

            result = sum(j, tmp[j])
            """)

    ng = 50
    knl = lp.fix_parameters(knl, ng=ng)

    knl = lp.set_options(knl, write_cl=True)

    ref_knl = knl

    # Parallelize the inner reduction and move the remainder outward so the
    # partial sums can be computed per work-group.
    knl = lp.split_iname(knl, "l", 128, inner_tag="l.0")
    knl = lp.split_reduction_outward(knl, "l_inner")
    knl = lp.tag_inames(knl, "g:g.0,j:l.0")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nl": size})
Ejemplo n.º 3
0
def test_local_parallel_reduction(ctx_factory, size):
    """Run a small reduction under several local/group parallel tagging
    schemes and check each against the untransformed reference kernel."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i, j]: 0 <= i < n and 0 <= j < 5}",
            """
            z[j] = sum(i, i+j)
            """)

    knl = lp.fix_parameters(knl, n=size)

    ref_knl = knl

    # Each entry parallelizes the reduction iname i (and optionally j)
    # differently; every variant must agree with the reference.
    tag_variants = [
            "i:l.0",
            "i:l.0,j:l.1",
            "i:l.0,j:g.0",
            ]

    for tags in tag_variants:
        transformed = lp.tag_inames(ref_knl, tags)
        lp.auto_test_vs_ref(ref_knl, ctx, transformed)
Ejemplo n.º 4
0
def test_assignment_to_subst_indices(ctx_factory):
    """Turn the assignment to temporary ``a`` in a parsed Fortran kernel
    into a substitution rule and verify ``a`` leaves the temporaries.

    NOTE(review): ``out2`` and ``inp`` are declared in the Fortran argument
    list but ``out2`` is never written -- presumably intentional for this
    test; confirm against the original suite.
    """
    fortran_src = """
        subroutine fill(out, out2, inp, n)
          implicit none

          real*8 a(n), out(n), out2(n), inp(n)
          integer n, i

          do i = 1, n
            a(i) = 6*inp(i)
          enddo

          do i = 1, n
            out(i) = 5*a(i)
          end do
        end
        """

    knl, = lp.parse_fortran(fortran_src)

    knl = lp.fix_parameters(knl, n=5)

    ref_knl = knl

    # assignment_to_subst should inline a's defining expression and remove
    # the temporary entirely.
    assert "a" in knl.temporary_variables
    knl = lp.assignment_to_subst(knl, "a")
    assert "a" not in knl.temporary_variables

    ctx = ctx_factory()
    lp.auto_test_vs_ref(ref_knl, ctx, knl)
Ejemplo n.º 5
0
def test_precompute_with_preexisting_inames(ctx_factory):
    """Precompute two substitution rules into the *same* preexisting inames
    ``ii,jj`` (legal here because both sweeps have matching extents)."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j,k<n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })

    knl = lp.fix_parameters(knl, n=13)

    ref_knl = knl

    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    # Both precomputes reuse the same precompute_inames; i/j and i/k have
    # identical bounds (both 0..n), so sharing ii,jj is allowed.
    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")
    knl = lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
            precompute_inames="ii,jj")

    knl = lp.set_loop_priority(knl, "ii,jj,e,j,k")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(E=200))
Ejemplo n.º 6
0
def test_vector_types(ctx_factory, vec_len):
    """Tag the trailing axis of ``out`` as a hardware vector of width
    *vec_len* and check results against the unvectorized reference."""
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i,j]: 0<=i<n and 0<=j<vec_len }",
            "out[i,j] = 2*a[i,j]",
            [
                lp.GlobalArg("a", np.float32, shape=lp.auto),
                lp.GlobalArg("out", np.float32, shape=lp.auto),
                "..."
                ])

    knl = lp.fix_parameters(knl, vec_len=vec_len)

    ref_knl = knl

    # "vec" on the last axis requires j to be fully unrolled.
    knl = lp.tag_data_axes(knl, "out", "c,vec")
    knl = lp.tag_inames(knl, dict(j="unr"))

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters=dict(
                n=20000
                ))
Ejemplo n.º 7
0
def element_centers_of_mass(discr):
    """Return a tuple of per-axis arrays holding each element's center of
    mass (node average) for discretization *discr*.

    A loopy kernel averages the unit nodes of every element; it is run once
    per element group, writing into a per-group view of one ``panels``
    array via ``el_view``.
    """
    knl = lp.make_kernel(
        """{[dim,k,i]:
            0<=dim<ndims and
            0<=k<nelements and
            0<=i<nunit_nodes}""",
        """
            panels[dim, k] = sum(i, nodes[dim, k, i])/nunit_nodes
            """,
        default_offset=lp.auto, name="find_panel_centers_of_mass",
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    knl = lp.fix_parameters(knl, ndims=discr.ambient_dim)

    knl = lp.split_iname(knl, "k", 128, inner_tag="l.0", outer_tag="g.0")
    knl = lp.tag_inames(knl, dict(dim="ilp"))

    with cl.CommandQueue(discr.cl_context) as queue:
        mesh = discr.mesh
        panels = cl.array.empty(queue, (mesh.ambient_dim, mesh.nelements),
                                dtype=discr.real_dtype)
        for group_nr, group in enumerate(discr.groups):
            # Output lands in ``panels`` through the el_view slice passed as
            # the panels= argument; the returned ``result`` is unused here.
            _, (result,) = knl(queue,
                nelements=group.nelements,
                nunit_nodes=group.nunit_nodes,
                nodes=group.view(discr.nodes()),
                panels=el_view(discr, group_nr, panels))
        panels.finish()
        panels = panels.with_queue(None)
        return tuple(panels[d, :] for d in range(mesh.ambient_dim))
Ejemplo n.º 8
0
def test_convolution(ctx_factory):
    """2D multi-image, multi-feature, multi-color convolution; compare a
    tiled/prefetched variant against the plain reference kernel.

    NOTE(review): the iimg bound is inclusive (``0<=iimg<=nimgs``) while
    ifeat/icolor use strict ``<`` -- confirm this off-by-one is intended.
    """
    ctx = ctx_factory()

    dtype = np.float32

    knl = lp.make_kernel(
        "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
                -f_w <= f_x,f_y <= f_w \
                and 0 <= im_x < im_w and 0 <= im_y < im_h \
                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
                }",
        """
        out[iimg, ifeat, im_x, im_y] = sum((f_x, f_y, icolor), \
            img[iimg, f_w+im_x-f_x, f_w+im_y-f_y, icolor] \
            * f[ifeat, f_w+f_x, f_w+f_y, icolor])
        """,
        [
            # Shapes are inferred (lp.auto); img is assumed padded by f_w on
            # each side so the shifted indexing stays in bounds.
            lp.GlobalArg("f", dtype, shape=lp.auto),
            lp.GlobalArg("img", dtype, shape=lp.auto),
            lp.GlobalArg("out", dtype, shape=lp.auto),
            "..."
            ],
        assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0",
        options="annotate_inames")

    f_w = 3

    knl = lp.fix_parameters(knl, f_w=f_w, ncolors=3)

    ref_knl = knl

    def variant_0(knl):
        # Plain loop ordering, no parallelization.
        #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
        knl = lp.prioritize_loops(knl, "iimg,im_x,im_y,ifeat,f_x,f_y")
        return knl

    def variant_1(knl):
        # Local parallelization of im_x only.
        knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
        knl = lp.prioritize_loops(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
        return knl

    def variant_2(knl):
        # Full 2D tiling with filter and image-tile prefetch.
        knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0")
        knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1")
        knl = lp.tag_inames(knl, dict(ifeat="g.2"))
        knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]", default_tag="l.auto")
        knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y",
                default_tag="l.auto")
        return knl

    for variant in [
            #variant_0,
            #variant_1,
            variant_2
            ]:
        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
                parameters=dict(
                    im_w=128, im_h=128, f_w=f_w,
                    nfeats=3, nimgs=3
                    ))
Ejemplo n.º 9
0
def test_dependent_loop_bounds_4():
    """Regression test: code generation with loop bounds that are computed
    by instructions inside enclosing loops (nested data-dependent bounds).
    """
    # https://gitlab.tiker.net/inducer/loopy/issues/23
    import loopy as lp

    loopy_knl = lp.make_kernel(
        [
            "{[a]: 0<=a<10}",
            "{[b]: b_start<=b<b_end}",
            "{[c,idim]: c_start<=c<c_end and 0<=idim<dim}",
        ],
        """
        for a
         <> b_start = 1
         <> b_end = 2
         for b
          <> c_start = 1
          <> c_end = 2

          for c
           ... nop
          end

          <>t[idim] = 1
         end
        end
        """,
        "...",
        seq_dependencies=True)

    loopy_knl = lp.fix_parameters(loopy_knl, dim=3)

    # Only code generation is checked here; no execution or comparison.
    with lp.CacheMode(False):
        lp.generate_code_v2(loopy_knl)
Ejemplo n.º 10
0
def test_poisson_fem(ctx_factory):
    """FEM element-matrix assembly for a Poisson operator; precompute the
    ``dpsi`` substitution rule and compare against the reference kernel.

    NOTE(review): the domain declares ell3 but the instructions only use
    ell and ell2 -- presumably leftover; confirm.
    """
    # Stolen from Peter Coogan and Rob Kirby for FEM assembly
    ctx = ctx_factory()

    nbf = 5
    nqp = 5
    sdim = 3

    knl = lp.make_kernel(
            "{ [c,i,j,k,ell,ell2,ell3]: \
            0 <= c < nels and \
            0 <= i < nbf and \
            0 <= j < nbf and \
            0 <= k < nqp and \
            0 <= ell,ell2 < sdim}",
            """
            dpsi(bf,k0,dir) := \
                    simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
            Ael[c,i,j] = \
                    J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
            """,
            assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0")

    print(knl)

    knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp)

    ref_knl = knl

    knl = lp.set_loop_priority(knl, ["c", "j", "i", "k"])

    def variant_1(knl):
        # Precompute dpsi over i, k and ell (larger footprint).
        knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for')
        knl = lp.set_loop_priority(knl, "c,i,j")
        return knl

    def variant_2(knl):
        # Precompute dpsi over i and ell only.
        knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for')
        knl = lp.set_loop_priority(knl, "c,i,j")
        return knl

    def add_types(knl):
        # Types are added late so both ref and transformed kernels get them.
        return lp.add_and_infer_dtypes(knl, dict(
            w=np.float32,
            J=np.float32,
            DPsi=np.float32,
            DFinv=np.float32,
            ))

    for variant in [
            #variant_1,
            variant_2
            ]:
        knl = variant(knl)

        lp.auto_test_vs_ref(
                add_types(ref_knl), ctx, add_types(knl),
                parameters=dict(n=5, nels=15, nbf=5, sdim=2, nqp=7))
Ejemplo n.º 11
0
def test_convolution_with_nonzero_base(ctx_factory):
    """Convolution over image domains that do not start at zero.

    NOTE(review): as in test_convolution, the iimg bound is inclusive
    (``0<=iimg<=nimgs``) -- confirm intentional.
    """
    # This is kept alive as a test for domains that don't start at zero.
    # These are a bad idea for split_iname, which places its origin at zero
    # and therefore produces a first block that is odd-sized.
    #
    # Therefore, for real tests, check test_convolution further up.

    ctx = ctx_factory()

    dtype = np.float32

    knl = lp.make_kernel(
        "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
                -f_w <= f_x,f_y <= f_w \
                and f_w <= im_x < im_w-f_w and f_w <= im_y < im_h-f_w \
                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
                }",
        """
        out[iimg, ifeat, im_x-f_w, im_y-f_w] = sum((f_x, f_y, icolor), \
            img[iimg, im_x-f_x, im_y-f_y, icolor] \
            * f[ifeat, f_w+f_x, f_w+f_y, icolor])
        """,
        [
            lp.GlobalArg("f", dtype, shape=lp.auto),
            lp.GlobalArg("img", dtype, shape=lp.auto),
            lp.GlobalArg("out", dtype, shape=lp.auto),
            "..."
            ],
        assumptions="f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0",
        options="annotate_inames")

    knl = lp.fix_parameters(knl, ncolors=3)

    ref_knl = knl

    # f_w stays a runtime parameter here (unlike test_convolution) and is
    # only supplied at auto-test time.
    f_w = 3

    def variant_0(knl):
        # Plain loop ordering, no parallelization.
        #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
        knl = lp.set_loop_priority(knl, "iimg,im_x,im_y,ifeat,f_x,f_y")
        return knl

    def variant_1(knl):
        # Local parallelization of im_x; the nonzero base makes the first
        # block odd-sized (see note above).
        knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
        knl = lp.set_loop_priority(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
        return knl

    for variant in [
            variant_0,
            variant_1,
            ]:
        lp.auto_test_vs_ref(ref_knl, ctx, variant(knl),
                parameters=dict(
                    im_w=128, im_h=128, f_w=f_w,
                    nfeats=12, nimgs=17
                    ))
Ejemplo n.º 12
0
    def get_kernel(self):
        """Build the loopy kernel that accumulates source contributions into
        per-box target expansion coefficients.

        For each source box, centers and per-source offsets are fetched,
        instructions from ``self.get_loopy_instructions()`` compute the
        ``coeff{i}`` values, and one ``simul_reduce`` per coefficient writes
        into ``tgt_expansions``.

        NOTE(review): assumes ``self.expansion``, ``self.dim`` and
        ``self.name`` are set by the enclosing class (not visible here).
        """
        ncoeffs = len(self.expansion)

        from sumpy.tools import gather_loopy_source_arguments
        loopy_knl = lp.make_kernel(
                [
                    "{[isrc_box]: 0<=isrc_box<nsrc_boxes}",
                    "{[isrc,idim]: isrc_start<=isrc<isrc_end and 0<=idim<dim}",
                    ],
                ["""
                for isrc_box
                    <> src_ibox = source_boxes[isrc_box]
                    <> isrc_start = box_source_starts[src_ibox]
                    <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]

                    <> center[idim] = centers[idim, src_ibox] {id=fetch_center}

                    for isrc
                        <> a[idim] = center[idim] - sources[idim, isrc] {dup=idim}

                        <> strength = strengths[isrc]
                        """] + self.get_loopy_instructions() + ["""
                    end
                    """] + ["""
                    tgt_expansions[src_ibox-tgt_base_ibox, {coeffidx}] = \
                            simul_reduce(sum, isrc, strength*coeff{coeffidx}) \
                            {{id_prefix=write_expn}}
                    """.format(coeffidx=i) for i in range(ncoeffs)] + ["""
                end
                """],
                [
                    lp.GlobalArg("sources", None, shape=(self.dim, "nsources"),
                        dim_tags="sep,c"),
                    lp.GlobalArg("strengths", None, shape="nsources"),
                    lp.GlobalArg("box_source_starts,box_source_counts_nonchild",
                        None, shape=None),
                    lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"),
                    lp.GlobalArg("tgt_expansions", None,
                        shape=("nboxes", ncoeffs), offset=lp.auto),
                    lp.ValueArg("nboxes,aligned_nboxes,tgt_base_ibox", np.int32),
                    lp.ValueArg("nsources", np.int32),
                    "..."
                ] + gather_loopy_source_arguments([self.expansion]),
                name=self.name,
                assumptions="nsrc_boxes>=1",
                silenced_warnings="write_race(write_expn*)",
                default_offset=lp.auto)

        loopy_knl = lp.fix_parameters(loopy_knl, dim=self.dim)

        loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl)
        # Unroll every idim-like iname (idim, idim_0, ...).
        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")

        return loopy_knl
Ejemplo n.º 13
0
def test_make_copy_kernel(ctx_factory):
    """Round-trip an array through an intermediate storage format and check
    that the data survives unchanged."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    intermediate_format = "f,f,sep"

    original = np.random.randn(1024, 4, 3)

    # Copy into the intermediate (Fortran-ordered, separated) layout ...
    to_intermediate = lp.make_copy_kernel(intermediate_format)
    to_intermediate = lp.fix_parameters(to_intermediate, n2=3)
    to_intermediate = lp.set_options(to_intermediate, write_cl=True)
    evt, intermediate = to_intermediate(queue, input=original)

    # ... and back into plain C order.
    from_intermediate = lp.make_copy_kernel("c,c,c", intermediate_format)
    from_intermediate = lp.fix_parameters(from_intermediate, n2=3)
    evt, restored = from_intermediate(queue, input=intermediate)

    assert (original == restored).all()
Ejemplo n.º 14
0
def test_rob_stroud_bernstein(ctx_factory):
    """Bernstein-basis evaluation kernel with sequentially-dependent updates
    of ``w`` and ``aind``; only code generation/printing is exercised."""
    ctx = ctx_factory()

    # NOTE: tmp would have to be zero-filled beforehand

    knl = lp.make_kernel(
            "{[el, i2, alpha1,alpha2]: \
                    0 <= el < nels and \
                    0 <= i2 < nqp1d and \
                    0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }",
            """
            for el,i2
                <> xi = qpts[1, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init}

                for alpha1
                    <> w = s**(deg-alpha1) {id=init_w}

                    for alpha2
                        tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \
                                {id=write_tmp}
                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                                {id=update_w,dep=init_w:write_tmp}
                        aind = aind + 1 \
                                {id=aind_incr,dep=aind_init:write_tmp:update_w}
                    end
                end
            end
            """,
            [
                # Must declare coeffs to have "no" shape, to keep loopy
                # from trying to figure it out the shape automatically.

                lp.GlobalArg("coeffs", None, shape=None),
                "..."
                ],
            assumptions="deg>=0 and nels>=1"
            )

    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)
    # Two-level split of el: l.0 inside, then ilp on the middle level with a
    # trailing slab for the remainder.
    knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
    knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
            slabs=(0, 1))
    knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))

    print(lp.CompiledKernel(ctx, knl).get_highlighted_code(
            dict(
                qpts=np.float32,
                coeffs=np.float32,
                tmp=np.float32,
                )))
Ejemplo n.º 15
0
    def __test(evalfunc, target, **targetargs):
        """Build a 10-element copy kernel for *target* (constructed with
        *targetargs*) and return the result of applying *evalfunc* to it."""
        size = 10

        knl = lp.make_kernel('{[i]: 0 <= i < n}',
            """
                a[i] = b[i]
            """,
            [lp.GlobalArg('a', shape=(size,), dtype=np.int32),
             lp.GlobalArg('b', shape=(size,), dtype=np.int32)],
            target=target(**targetargs))

        return evalfunc(lp.fix_parameters(knl, n=size))
Ejemplo n.º 16
0
def test_diff(ctx_factory):
    """Differentiate kernel output ``z`` w.r.t. input ``x`` via
    ``diff_kernel`` and validate the Jacobian with finite differences:
    halving the step by ``fac`` must shrink the error by about ``fac``."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
         """{ [i,j]: 0<=i,j<n }""",
         """
         <> a = 1/(1+sinh(x[i] + y[j])**2)
         z[i] = sum(j, exp(a * x[j]))
         """)

    knl = lp.fix_parameters(knl, n=50)

    from loopy.transform.diff import diff_kernel
    dknl, diff_map = diff_kernel(knl, "z", "x")
    dknl = lp.remove_unused_arguments(dknl)

    dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a")

    print(dknl)

    n = 50
    x = np.random.randn(n)
    y = np.random.randn(n)

    # Random direction for the directional-derivative check.
    dx = np.random.randn(n)

    fac = 1e-1
    h1 = 1e-4
    h2 = h1 * fac

    evt, (z0,) = knl(queue, x=x, y=y)
    evt, (z1,) = knl(queue, x=(x + h1*dx), y=y)
    evt, (z2,) = knl(queue, x=(x + h2*dx), y=y)

    dknl = lp.set_options(dknl, write_cl=True)
    evt, (df,) = dknl(queue, x=x, y=y)

    diff1 = (z1-z0)
    diff2 = (z2-z0)

    # First-order prediction: f(x+h*dx) - f(x) ~= J @ (h*dx).
    diff1_predicted = df.dot(h1*dx)
    diff2_predicted = df.dot(h2*dx)

    err1 = la.norm(diff1 - diff1_predicted) / la.norm(diff1)
    err2 = la.norm(diff2 - diff2_predicted) / la.norm(diff2)
    print(err1, err2)

    # The smaller step's relative error must scale (roughly) with fac.
    assert (err2 < err1 * fac * 1.1).all()
Ejemplo n.º 17
0
def test_sequential_scan(ctx_factory, n, stride):
    """Sequential scan of ``j**2`` over a strided triangular domain,
    validated against a strided NumPy cumulative sum."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    kernel = lp.make_kernel(
        "[n] -> {[i,j]: 0<=i<n and 0<=j<=%d*i}" % stride,
        """
        a[i] = sum(j, j**2)
        """
        )
    kernel = lp.fix_parameters(kernel, n=n)
    kernel = lp.realize_reduction(kernel, force_scan=True)

    _evt, (result,) = kernel(queue)

    # Entry i of the scan equals cumsum(0..stride*i of j**2).
    expected = np.cumsum(np.arange(stride*n)**2)[::stride]
    assert (result.get() == expected).all()
Ejemplo n.º 18
0
def test_numba_cuda_target():
    """Generate (but do not run) code for a pairwise-distance kernel on the
    Numba CUDA target, exercising a stack of common transforms."""
    knl = lp.make_kernel(
        "{[i,j,k]: 0<=i,j<M and 0<=k<N}",
        "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))",
        target=lp.NumbaCudaTarget())

    knl = lp.assume(knl, "M>0")
    knl = lp.split_iname(knl, "i", 16, outer_tag='g.0')
    # slabs=(0, 1) generates a separate remainder block for the last chunk.
    knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1))
    knl = lp.add_prefetch(knl, "X[i,:]", default_tag="l.auto")
    knl = lp.fix_parameters(knl, N=3)
    knl = lp.prioritize_loops(knl, "i_inner,j_outer")
    knl = lp.tag_inames(knl, "k:unr")
    knl = lp.tag_array_axes(knl, "X", "N0,N1")

    knl = lp.add_and_infer_dtypes(knl, {"X": np.float32})

    print(lp.generate_code_v2(knl).all_code())
Ejemplo n.º 19
0
    def copy_targets_kernel(self):
        """Return a loopy kernel copying ``points`` into ``targets``.

        ``points`` uses a separated per-axis layout ("sep, C") while
        ``targets`` gets auto/unit strides; the point index is split for
        GPU execution and the dim axis is tagged ilp.
        """
        knl = lp.make_kernel(
            """{[dim,i]:
                0<=dim<ndims and
                0<=i<npoints}""",
            """
                targets[dim, i] = points[dim, i]
                """,
            default_offset=lp.auto, name="copy_targets",
            lang_version=MOST_RECENT_LANGUAGE_VERSION)

        # NOTE(review): assumes self.ambient_dim is provided by the
        # enclosing class (not visible in this chunk).
        knl = lp.fix_parameters(knl, ndims=self.ambient_dim)

        knl = lp.split_iname(knl, "i", 128, inner_tag="l.0", outer_tag="g.0")
        knl = lp.tag_array_axes(knl, "points", "sep, C")

        knl = lp.tag_array_axes(knl, "targets", "stride:auto, stride:1")
        return lp.tag_inames(knl, dict(dim="ilp"))
Ejemplo n.º 20
0
def test_batched_sparse():
    """Parse a batched CSR sparse matrix-vector product from Fortran and
    apply splitting/prefetch transforms.

    NOTE(review): no assertion or auto-test at the end -- this only checks
    that parsing and transformation succeed without raising.
    """
    fortran_src = """
        subroutine sparse(rowstarts, colindices, values, m, n, nvecs, nvals, x, y)
          implicit none

          integer rowstarts(m+1), colindices(nvals)
          real*8 values(nvals)
          real*8 x(n, nvecs), y(n, nvecs), rowsum(nvecs)

          integer m, n, rowstart, rowend, length, nvals, nvecs
          integer i, j, k

          do i = 1, m
            rowstart = rowstarts(i)
            rowend = rowstarts(i+1)
            length = rowend - rowstart

            do k = 1, nvecs
              rowsum(k) = 0
            enddo
            do k = 1, nvecs
              do j = 1, length
                rowsum(k) = rowsum(k) + &
                  x(colindices(rowstart+j-1),k)*values(rowstart+j-1)
              end do
            end do
            do k = 1, nvecs
              y(i,k) = rowsum(k)
            end do
          end do
        end

        """

    knl, = lp.parse_fortran(fortran_src)

    knl = lp.split_iname(knl, "i", 128)
    knl = lp.tag_inames(knl, {"i_outer": "g.0"})
    knl = lp.tag_inames(knl, {"i_inner": "l.0"})
    knl = lp.add_prefetch(knl, "values",
            default_tag="l.auto")
    knl = lp.add_prefetch(knl, "colindices",
            default_tag="l.auto")
    knl = lp.fix_parameters(knl, nvecs=4)
Ejemplo n.º 21
0
def test_generate_c_snippet():
    """Build a kernel from pymbolic expression objects (rather than kernel
    text) on the plain C target and print only the generated body."""
    from loopy.target.c import CTarget

    from pymbolic import var
    I = var("I")  # noqa
    f = var("f")
    df = var("df")
    q_v = var("q_v")
    eN = var("eN")  # noqa
    k = var("k")
    u = var("u")

    from functools import partial
    # allow_simultaneous lets both instructions reduce over the same iname k.
    l_sum = partial(lp.Reduction, "sum", allow_simultaneous=True)

    Instr = lp.Assignment  # noqa

    knl = lp.make_kernel(
        "{[I, k]: 0<=I<nSpace and 0<=k<nQuad}",
        [
            Instr(f[I], l_sum(k, q_v[k, I]*u)),
            Instr(df[I], l_sum(k, q_v[k, I])),
            ],
        [
            lp.GlobalArg("q_v", np.float64, shape="nQuad, nSpace"),
            lp.GlobalArg("f,df", np.float64, shape="nSpace"),
            lp.ValueArg("u", np.float64),
            "...",
            ],
        target=CTarget(),
        assumptions="nQuad>=1")

    if 0:  # enable to play with prefetching
        # (prefetch currently requires constant sizes)
        knl = lp.fix_parameters(knl, nQuad=5, nSpace=3)
        knl = lp.add_prefetch(knl, "q_v", "k,I", default_tag=None)

    knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
    knl = lp.set_loop_priority(knl, "I,k_outer,k_inner")

    # Preprocess + schedule manually so generate_body can be called.
    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    print(lp.generate_body(knl))
Ejemplo n.º 22
0
def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory):
    """Local-parallel scan whose sweep iname starts at 1 rather than 0."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n] -> {[i,j]: 1<=i<n+1 and 0<=j<=i-1}",
        """
        out[i-1] = sum(j, a[j]**2)
        """,
        "..."
        )

    knl = lp.fix_parameters(knl, n=16)
    knl = lp.tag_inames(knl, dict(i="l.0"))
    # First pass realizes the scan itself; second pass realizes the
    # reductions the scan realization introduced.
    knl = lp.realize_reduction(knl, force_scan=True)
    knl = lp.realize_reduction(knl)

    knl = lp.add_dtypes(knl, dict(a=int))

    data = np.arange(1, 17)
    _evt, (result,) = knl(queue, a=data)

    assert (result == np.cumsum(data**2)).all()
Ejemplo n.º 23
0
def test_scan_with_outer_parallel_iname(ctx_factory, sweep_iname_tag):
    """Scan nested inside a parallel (l.0) outer iname ``k``; the sweep
    iname tag comes from test parametrization."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        [
            "{[k]: 0<=k<=1}",
            "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}"
        ],
        "out[k,i] = k + sum(j, j**2)"
        )

    knl = lp.tag_inames(knl, dict(k="l.0", i=sweep_iname_tag))
    n = 10
    knl = lp.fix_parameters(knl, n=n)
    knl = lp.realize_reduction(knl, force_scan=True)

    evt, (out,) = knl(queue)

    # Row k of out is the prefix sum of squares, shifted by k.
    inner = np.cumsum(np.arange(n)**2)

    assert (out.get() == np.array([inner, 1 + inner])).all()
Ejemplo n.º 24
0
def test_scan_with_different_lower_bound_from_sweep(
        ctx_factory, sweep_lbound, scan_lbound):
    """Scan whose sweep and scan inames have independent (parametrized)
    lower bounds, with the scan advancing two steps per sweep step."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n, sweep_lbound, scan_lbound] -> "
        "{[i,j]: sweep_lbound<=i<n+sweep_lbound "
        "and scan_lbound<=j<=2*(i-sweep_lbound)+scan_lbound}",
        """
        out[i-sweep_lbound] = sum(j, j**2)
        """
        )

    n = 10

    knl = lp.fix_parameters(knl, sweep_lbound=sweep_lbound, scan_lbound=scan_lbound)
    knl = lp.realize_reduction(knl, force_scan=True)
    evt, (out,) = knl(queue, n=n)

    # Every other entry of the cumulative sum, since j advances by 2 per i.
    assert (out.get()
            == np.cumsum(np.arange(scan_lbound, 2*n+scan_lbound)**2)[::2]).all()
Ejemplo n.º 25
0
def test_c_execution_with_global_temporaries():
    """Check that a global, read-only, initialized temporary is emitted in
    device code only (not duplicated in the generated host code) for the
    ExecutableCTarget, and that execution still produces correct output."""
    # ensure that the "host" code of a bare ExecutableCTarget with
    # global constant temporaries is None

    from loopy.target.c import ExecutableCTarget
    from loopy.kernel.data import temp_var_scope as scopes
    n = 10

    knl = lp.make_kernel('{[i]: 0 <= i < n}',
        """
            a[i] = b[i]
        """,
        [lp.GlobalArg('a', shape=(n,), dtype=np.int32),
         lp.TemporaryVariable('b', shape=(n,),
                              initializer=np.arange(n, dtype=np.int32),
                              dtype=np.int32,
                              read_only=True,
                              scope=scopes.GLOBAL)],
        target=ExecutableCTarget())

    knl = lp.fix_parameters(knl, n=n)
    # The array declaration must live in device code, not the host wrapper.
    assert ('int b[%d]' % n) not in lp.generate_code_v2(knl).host_code()
    assert np.allclose(knl(a=np.zeros(10, dtype=np.int32))[1], np.arange(10))
Ejemplo n.º 26
0
def test_precompute_with_preexisting_inames_fail():
    """Negative counterpart of test_precompute_with_preexisting_inames:
    reusing precompute inames ``ii,jj`` must fail when the second sweep has
    a different extent (k ranges over 2*n, not n)."""
    knl = lp.make_kernel(
        "{[e,i,j,k]: 0<=e<E and 0<=i,j<n and 0<=k<2*n}",
        """
        result[e,i] = sum(j, D1[i,j]*u[e,j])
        result2[e,i] = sum(k, D2[i,k]*u[e,k])
        """)

    knl = lp.add_and_infer_dtypes(knl, {
        "u": np.float32,
        "D1": np.float32,
        "D2": np.float32,
        })

    knl = lp.fix_parameters(knl, n=13)

    knl = lp.extract_subst(knl, "D1_subst", "D1[ii,jj]", parameters="ii,jj")
    knl = lp.extract_subst(knl, "D2_subst", "D2[ii,jj]", parameters="ii,jj")

    knl = lp.precompute(knl, "D1_subst", "i,j", default_tag="for",
            precompute_inames="ii,jj")
    # Mismatched extent for jj (k spans 2*n) must raise.
    with pytest.raises(lp.LoopyError):
        lp.precompute(knl, "D2_subst", "i,k", default_tag="for",
                precompute_inames="ii,jj")
Ejemplo n.º 27
0
def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag):
    """Segmented prefix sum: the scan restarts wherever ``segflag`` is 1.

    Input is all-ones with segment boundaries set from the parametrized
    index list; output is validated by check_segmented_scan_output.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    arr = np.ones(n, dtype=np.float32)
    segment_boundaries = np.zeros(n, dtype=np.int32)
    segment_boundaries[(segment_boundaries_indices,)] = 1

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<=i}",
        "out[i], <>_ = reduce(segmented(sum), j, arr[j], segflag[j])",
        [
            lp.GlobalArg("arr", np.float32, shape=("n",)),
            lp.GlobalArg("segflag", np.int32, shape=("n",)),
            "..."
        ])

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.tag_inames(knl, dict(i=iname_tag))
    knl = lp.realize_reduction(knl, force_scan=True)

    (evt, (out,)) = knl(queue, arr=arr, segflag=segment_boundaries)

    check_segmented_scan_output(arr, segment_boundaries_indices, out)
Ejemplo n.º 28
0
def test_local_parallel_scan(ctx_factory, n):
    """Prefix sum of squares with the sweep iname parallelized over l.0,
    checked against a NumPy cumulative sum."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}",
        """
        out[i] = sum(j, a[j]**2)
        """,
        "..."
        )

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.tag_inames(knl, dict(i="l.0"))
    # First pass realizes the scan; the second realizes any reductions that
    # the scan realization introduced.
    knl = lp.realize_reduction(knl, force_scan=True)
    knl = lp.realize_reduction(knl)

    knl = lp.add_dtypes(knl, dict(a=int))

    print(knl)

    data = np.arange(n)
    _evt, (result,) = knl(queue, a=data)
    assert (result == np.cumsum(data**2)).all()
Ejemplo n.º 29
0
def build_loopy_kernel_A_auto():
    """Build the ``kernel_tensor_A`` element-matrix kernel programmatically
    (ISL domains + pymbolic expressions instead of kernel text), generate C
    source and header, and return (name, call snippet, C source, header).
    """
    knl_name = "kernel_tensor_A"

    # Inputs to the kernel
    arg_names = ["A", "B", "c"]
    # Kernel parameters that will be fixed later
    param_names = ["n", "m"]
    # Tuples of inames and extents of their loops
    loops = [("i", "n"), ("j", "n"), ("k", "m")]

    # Generate the domains for the loops
    isl_domains = []
    for idx, extent in loops:
        # Create dict of loop variables (inames) and parameters
        vs = isl.make_zero_and_vars([idx], [extent])
        # Create the loop domain using '<=' and '>' restrictions
        isl_domains.append(
            ((vs[0].le_set(vs[idx])) & (vs[idx].lt_set(vs[0] + vs[extent]))))

    print("ISL loop domains:")
    print(isl_domains)
    print("")

    # Generate pymbolic variables for all used symbols
    args = {arg: pb.Variable(arg) for arg in arg_names}
    params = {param: pb.Variable(param) for param in param_names}
    inames = {iname: pb.Variable(iname) for iname, extent in loops}

    # Input arguments for the loopy kernel
    lp_args = {
        "A": lp.GlobalArg("A",
                          dtype=np.double,
                          shape=(params["n"], params["n"])),
        "B": lp.GlobalArg("B",
                          dtype=np.double,
                          shape=(params["m"], params["n"])),
        "c": lp.ValueArg("c", dtype=np.double)
    }

    # Generate the list of arguments & parameters that will be passed to loopy
    data = []
    data += [arg for arg in lp_args.values()]
    data += [lp.ValueArg(param) for param in ["n", "m"]]

    # Build the kernel instruction: computation and assignment of the element matrix
    def build_ass():
        """
        A[i,j] = c*sum(k, B[k,i]*B[k,j])
        """

        # The target of the assignment
        target = pb.Subscript(args["A"], (inames["i"], inames["j"]))

        # The rhs expression: A reduce operation of the matrix columns
        # Maybe replace with manual increment?
        reduce_op = lp.library.reduction.SumReductionOperation()
        reduce_expr = pb.Subscript(args["B"],
                                   (inames["k"], inames["i"])) * pb.Subscript(
                                       args["B"], (inames["k"], inames["j"]))
        expr = args["c"] * lp.Reduction(reduce_op, inames["k"], reduce_expr)

        return lp.Assignment(target, expr)

    ass = build_ass()
    print("Assignment expression:")
    print(ass)
    print("")

    instructions = [ass]

    # Construct the kernel
    knl = lp.make_kernel(isl_domains,
                         instructions,
                         data,
                         name=knl_name,
                         target=lp.CTarget(),
                         lang_version=lp.MOST_RECENT_LANGUAGE_VERSION)

    knl = lp.fix_parameters(knl, n=3, m=2)
    knl = lp.prioritize_loops(knl, "i,j")
    print(knl)
    print("")

    # Generate kernel code
    knl_c, knl_h = lp.generate_code_v2(knl).device_code(), str(
        lp.generate_header(knl)[0])
    print(knl_c)
    print("")

    # Postprocess kernel code
    # (the C target emits __restrict__; downstream build expects C99 restrict)
    replacements = [("__restrict__", "restrict")]
    knl_c = utils.replace_strings(knl_c, replacements)
    knl_h = utils.replace_strings(knl_h, replacements)

    knl_call = "kernel_tensor_A(A, &B[0][0], 1.0/(2.0*Ae));"

    return knl_name, knl_call, knl_c, knl_h
Ejemplo n.º 30
0
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
    """Fuse the gnuma r-/s-direction strong-volume kernels parsed from
    Fortran, apply a ladder of loopy optimizations selected by ``opt_level``,
    and auto-test the resulting kernel against the unoptimized reference.

    ``ilp_multiple`` > 1 additionally ILP-splits the ``k`` loop; ``Nq`` is
    fixed into the kernel as the per-direction node count.
    """
    ctx = ctx_factory()

    # Read the Fortran source and force single precision.
    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    source = source.replace("datafloat", "real*4")

    # Pick out the r- and s-direction kernels, tag their instructions so they
    # remain distinguishable after fusion, then fuse them into one kernel.
    hsv_r, hsv_s = [
        knl
        for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
        if "KernelR" in knl.name or "KernelS" in knl.name
    ]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (fix_euler_parameters,
                                        set_q_storage_format,
                                        set_D_storage_format)

    # Base parallelization: elements on group axis 0, (i, j) on local axes.
    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.prioritize_loops(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    # Keep the unoptimized kernel as the reference for auto_test_vs_ref.
    ref_hsv = hsv

    # tap_hsv snapshots the kernel at the requested optimization level; the
    # snapshot is what actually gets tested at the end (hsv = tap_hsv below).
    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    # Optional ILP split of the element-internal k loop.
    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner", )
        flux_ilp_inames = ("kk", )
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    # For each matching r-/s-direction flux instruction pair: turn the flux
    # assignment into a substitution rule and precompute it into a temporary
    # with a direction-appropriate iname/axis ordering.
    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
            ("rknl", rflux_insn, (
                "j",
                "n",
            ), rtmps, (
                "jj",
                "ii",
            )),
            ("sknl", sflux_insn, (
                "i",
                "n",
            ), stmps, (
                "ii",
                "jj",
            )),
        ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            # The single instruction (within the same sub-kernel) that reads
            # this flux variable.
            reader, = lp.find_instructions(
                hsv,
                "tag:{knl_tag} and reads:{flux_var}".format(knl_tag=knl_tag,
                                                            flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv,
                                flux_var + "_subst",
                                flux_inames + ilp_inames,
                                temporary_name=flux_store_name,
                                precompute_inames=flux_precomp_inames +
                                flux_ilp_inames,
                                default_tag=None)
            # Axis order of the flux temporary differs between r/s variants.
            if flux_var.endswith("_s"):
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_array_axes(hsv, flux_store_name, "N1,N0,N2?")

            # Give the reduction iname a flux-specific name in the reader
            # instruction (strip direction suffix and any trailing "_0").
            n_iname = "n_" + flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv,
                                  "n",
                                  n_iname,
                                  within="id:" + reader.id,
                                  existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    # r- and s-side flux temporaries can share storage.
    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    # Precompute the remaining local-prep substitution rules (skipping the
    # Jinv* rules and the s-side variants).
    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(
            hsv, lp.find_one_rule_matching(hsv, prep_var_name + "_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv, dict(Q_dim_field_inner="unr", Q_dim_field_outer="unr"))

    # Accumulate rhsQ in a buffer; final store writes base + buffer.
    hsv = lp.buffer_array(hsv,
                          "rhsQ",
                          ilp_inames,
                          fetch_bounding_box=True,
                          default_tag="for",
                          init_expression="0",
                          store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_array_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_array_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv, {
        "Q_dim_k": "unr",
        "rhsQ_init_k": "unr",
        "rhsQ_store_k": "unr"
    },
                        ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(
            tap_hsv,
            dict(rhsQ_init_field_inner="unr",
                 rhsQ_store_field_inner="unr",
                 rhsQ_init_field_outer="unr",
                 rhsQ_store_field_outer="unr",
                 Q_dim_field_inner="unr",
                 Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(
        hsv,
        dict(rhsQ_init_field_inner="vec",
             rhsQ_store_field_inner="vec",
             rhsQ_init_field_outer="unr",
             rhsQ_store_field_outer="unr",
             Q_dim_field_inner="vec",
             Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(
        hsv, "rhsQ_buf", vary_by_axes=(0, ) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    # Test the kernel as of the requested optimization level.
    hsv = tap_hsv

    if 1:
        # Print operation-count and memory-access statistics.
        print("OPS")
        op_map = lp.get_op_map(hsv)
        print(lp.stringify_stats_mapping(op_map))

        print("MEM")
        gmem_map = lp.get_mem_access_map(hsv).to_bytes()
        print(lp.stringify_stats_mapping(gmem_map))

    hsv = lp.set_options(hsv,
                         cl_build_options=[
                             "-cl-denorms-are-zero",
                             "-cl-fast-relaxed-math",
                             "-cl-finite-math-only",
                             "-cl-mad-enable",
                             "-cl-no-signed-zeros",
                         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv,
                                  ctx,
                                  hsv,
                                  parameters=dict(elements=300),
                                  quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
Ejemplo n.º 31
0
def mhd_vchar(params, G, P, D, loc, dir, vmax_out=None, vmin_out=None):
    """Calculate components of magnetosonic velocity from primitive variables.

    Solves the quadratic dispersion relation per zone along direction ``dir``
    at grid location ``loc`` and returns ``(vmax_out, vmin_out)``, the
    per-zone maximum/minimum characteristic speeds.  Output arrays are
    allocated if not supplied.  The trial vectors and the compiled kernel are
    cached in module-level globals across calls.
    """
    sh = G.shapes

    # TODO these can almost certainly be calculated on the fly. Just need geometry
    # Careful Acov and *con change with dir/loc arguments
    global Acov, Acon, Bcov, Bcon
    if Acov is None:
        # One-time allocation of the covariant trial vectors and their
        # raised (contravariant) counterparts.  Bcov is the time direction.
        Acov = cl_array.empty(params['queue'],
                              sh.grid_vector,
                              dtype=np.float64)
        Bcov = cl_array.zeros_like(Acov)
        Bcov[0] = 1
        Acon = cl_array.empty_like(Acov)
        Bcon = cl_array.empty_like(Acov)

    # Acov needs to change with dir in call
    Acov.fill(0)
    Acov[dir].fill(1)

    # Raise indices at grid location loc.
    G.raise_grid(Acov, loc, out=Acon)
    G.raise_grid(Bcov, loc, out=Bcon)

    # Find fast magnetosonic speed
    global knl_mhd_vchar
    if knl_mhd_vchar is None:
        # Compile the kernel once and cache it at module level.
        code = replace_prim_names("""
        <> bsq = simul_reduce(sum, mu, bcon[mu,i,j,k] * bcov[mu,i,j,k])
        <> Asq = simul_reduce(sum, mu, Acon[mu,i,j,k] * Acov[mu,i,j,k])
        <> Bsq = simul_reduce(sum, mu, Bcon[mu,i,j,k] * Bcov[mu,i,j,k])
        <> AB = simul_reduce(sum, mu, Acon[mu,i,j,k] * Bcov[mu,i,j,k])
        <> Au = simul_reduce(sum, mu, Acov[mu,i,j,k] * ucon[mu,i,j,k])
        <> Bu = simul_reduce(sum, mu, Bcov[mu,i,j,k] * ucon[mu,i,j,k])

        ef := fabs(P[RHO,i,j,k]) + gam * fabs(P[UU,i,j,k])
        ee := (bsq + ef)
        <> va2 = bsq / ee
        <> cs2 = gam * (gam - 1.) * fabs(P[UU,i,j,k]) / ef

        # Keep two temps to keep loopy from having to compile complete spaghetti
        cms21 := cs2 + va2 - cs2 * va2
        cms22 := if(cms21 > 0, cms21, 0)
        <> cms2 = if(cms22 > 1, 1, cms22)

        A := Bu**2 - (Bsq + Bu**2) * cms2
        B := 2. * (Au * Bu - (AB + Au * Bu) * cms2)
        C := Au**2 - (Asq + Au**2) * cms2

        <> discr = sqrt(if(B**2 - 4.*A*C > 0, B**2 - 4.*A*C, 0))

        vp := -(-B + discr) / (2. * A)
        vm := -(-B - discr) / (2. * A)

        vmax[i,j,k] = if(vp > vm, vp, vm)
        vmin[i,j,k] = if(vp > vm, vm, vp)
        """)
        knl_mhd_vchar = lp.make_kernel(
            sh.isl_grid_vector,
            code, [*primsArrayArgs("P", ghosts=False), ...],
            assumptions=sh.assume_grid,
            default_offset=lp.auto)

        # Bake in the adiabatic index and primitive count.
        knl_mhd_vchar = lp.fix_parameters(knl_mhd_vchar,
                                          gam=params['gam'],
                                          nprim=params['n_prim'])
        knl_mhd_vchar = tune_grid_kernel(knl_mhd_vchar, sh.grid_vector)
        # Unroll the small 4-vector index mu.
        knl_mhd_vchar = lp.tag_inames(knl_mhd_vchar, "mu:unr")
        print("Compiled mhd_vchar")

    # Allocate outputs if the caller did not supply them.
    if vmax_out is None:
        vmax_out = cl_array.empty(params['queue'],
                                  sh.grid_scalar,
                                  dtype=np.float64)
    if vmin_out is None:
        vmin_out = cl_array.empty(params['queue'],
                                  sh.grid_scalar,
                                  dtype=np.float64)

    evt, _ = knl_mhd_vchar(params['queue'],
                           P=P,
                           Acon=Acon,
                           Acov=Acov,
                           Bcon=Bcon,
                           Bcov=Bcov,
                           ucon=D['ucon'],
                           bcon=D['bcon'],
                           bcov=D['bcov'],
                           vmax=vmax_out,
                           vmin=vmin_out)
    # When profiling, block until the kernel finishes so timings are real.
    if 'profile' in params and params['profile']:
        evt.wait()

    return vmax_out, vmin_out
Ejemplo n.º 32
0
def U_to_P(params, G, U, P, Pout=None, iter_max=None):
    """Recover primitive variables from conserved variables.

    Per-zone inversion: the B-field primitives are obtained directly, then
    the energy variable Wp is found by one damped Halley-style step followed
    by secant iteration (up to ``iter_max`` steps, tolerance taken from
    ``params`` or 1e-8), and the remaining primitives are set from Wp.

    Returns ``(Pout, eflag)`` where ``eflag`` is an int32 grid array whose
    nonzero entries mark zones where the inversion failed.
    """

    s = G.slices
    sh = G.shapes

    # Set some default parameters, but allow overrides
    if iter_max is None:
        if 'invert_iter_max' in params:
            iter_max = params['invert_iter_max']
        else:
            iter_max = 8

    if 'invert_err_tol' in params:
        err_tol = params['invert_err_tol']
    else:
        err_tol = 1.e-8

    # delta sets the step size h used for the numerical derivatives below.
    if 'invert_iter_delta' in params:
        delta = params['invert_iter_delta']
    else:
        delta = 1.e-5

    if 'gamma_max' not in params:
        params['gamma_max'] = 25

    # Don't overwrite old memory by default, just allow using old values
    # Caller can pass new/old memory but is responsible that it contain logical values if we don't update it
    if Pout is None:
        Pout = P.copy()

    # Update the primitive B-fields
    G.vecdivbygeom(params['queue'],
                   u=U[s.B3VEC],
                   g=G.gdet_d[Loci.CENT.value],
                   out=Pout[s.B3VEC])

    # Cached constant quantities
    global ncov, ncon, lgdet
    if ncov is None:
        # For later on
        ncov = cl_array.zeros(params['queue'],
                              sh.grid_vector,
                              dtype=np.float64)
        # ncov[0] = -lapse at zone centers; other components stay zero.
        G.timesgeom(params['queue'],
                    u=cl_array.empty_like(ncov[0]).fill(1.0),
                    g=-G.lapse_d[Loci.CENT.value],
                    out=ncov[0])
        ncon = G.raise_grid(ncov)
        lgdet = G.lapse_d[Loci.CENT.value] / G.gdet_d[Loci.CENT.value]

    # Eflag will indicate inversion failures
    # Define a generic kernel so we can split out flagging in the future
    # Use it to catch negative density early on
    eflag = cl_array.zeros(params['queue'], sh.grid_scalar, dtype=np.int32)
    global knl_set_eflag
    if knl_set_eflag is None:
        # Generic flag-setting kernel: eflag = flag where var < 0.
        code = add_ghosts(
            """eflag[i,j,k] = if(var[i,j,k] < 0, flag, eflag[i,j,k])""")
        knl_set_eflag = lp.make_kernel(
            sh.isl_grid_scalar,
            code, [
                *scalarArrayArgs("eflag", dtype=np.int32),
                *scalarArrayArgs("var"),
                lp.ValueArg("flag", dtype=np.int32), ...
            ],
            default_offset=lp.auto)
        knl_set_eflag = tune_grid_kernel(knl_set_eflag,
                                         sh.bulk_scalar,
                                         ng=G.NG)

    # Flag zones with negative conserved density.
    evt, _ = knl_set_eflag(params['queue'],
                           var=U[s.RHO],
                           eflag=eflag,
                           flag=-100)

    # Convert from conserved variables to four-vectors
    Bcon = cl_array.zeros(params['queue'], sh.grid_vector, dtype=np.float64)
    G.vectimesgeom(params['queue'], u=U[s.B3VEC], g=lgdet, out=Bcon[1:])

    Qcov = cl_array.empty_like(Bcon)
    G.timesgeom(params['queue'], u=(U[s.UU] - U[s.RHO]), g=lgdet, out=Qcov[0])
    G.vectimesgeom(params['queue'], u=U[s.U3VEC], g=lgdet, out=Qcov[1:])

    Bcov = G.lower_grid(Bcon)
    Qcon = G.raise_grid(Qcov)

    # This will have fringes of zeros still!
    Bsq = G.dot(Bcon, Bcov)
    QdB = G.dot(Bcon, Qcov)
    Qdotn = G.dot(Qcon, ncov)
    Qsq = G.dot(Qcon, Qcov)

    Qtsq = Qsq + Qdotn**2

    # Qtcon: Q projected orthogonal to the normal observer.
    Qtcon = cl_array.empty_like(Qcon)
    for i in range(4):
        Qtcon[i] = Qcon[i] + ncon[i] * Qdotn

    # Set up eqn for W', the energy density
    D = cl_array.zeros_like(Qsq)
    G.timesgeom(params['queue'], u=U[s.RHO], g=lgdet, out=D)
    Ep = -Qdotn - D

    # Free intermediates no longer needed below.
    del Bcov, Qcon, Qcov

    # Numerical rootfinding
    # Take guesses from primitives
    Wp = Wp_func(params, G, P, Loci.CENT, eflag)
    # Trap on any failures so far if debugging.  They're very rare.
    if 'debug' in params and params['debug']:
        if np.any(eflag.get()[s.bulk] != 0):
            raise ValueError("Unexpected flag set!")

    # Step around the guess & evaluate errors
    h = delta * Wp  # TODO stable enough?  Need fancy subtraction from iharm3d?
    errp = err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp + h, eflag)
    err = err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp, eflag)
    errm = err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp - h, eflag)

    # Preserve Wp/err before updating them below
    Wp1 = Wp.copy()
    err1 = err.copy()

    global knl_utop_prep
    if knl_utop_prep is None:
        # TODO keep an accumulator here to avoid that costly any() call?
        code = add_ghosts("""
        # TODO put error/prep in here, not calling below

        # Attempt a Halley/Muller/Bailey/Press step
        dedW := (errp[i,j,k] - errm[i,j,k]) / (2 * h[i,j,k])
        dedW2 := (errp[i,j,k] - 2.*err[i,j,k] + errm[i,j,k]) / (h[i,j,k]**2)

        # Limit size of 2nd derivative correction
        # Loopy trick common in HARM: define intermediate variables (xt, xt2, xt3...) to impose clipping
        # This allows assignments or substitutions while keeping dependencies straight
        ft := 0.5*err[i,j,k]*dedW2/(dedW**2)
        ft2 := if(ft > 0.3, 0.3, ft)
        f := if(ft2 < -0.3, -0.3, ft2)

        # Limit size of step
        dWt := -err[i,j,k] / dedW / (1. - f)
        dWt2 := if(dWt < -0.5*Wp[i,j,k], -0.5*Wp[i,j,k], dWt)
        dW := if(dWt2 > 2.0*Wp[i,j,k], 2.0*Wp[i,j,k], dWt2)

        Wp[i,j,k] = Wp[i,j,k] + dW {id=wp}

        # Guarantee we take one step in every bulk zone
        stop_flag[i,j,k] = 0
        # This would avoid the step where there's convergence, but would require taking 2*dW above or similar
        #stop_flag[i,j,k] = (fabs(dW / Wp[i,j,k]) < err_tol) + \
        #                   (fabs(err[i,j,k] / Wp[i,j,k]) < err_tol) {dep=wp,nosync=wp}
        """)
        knl_utop_prep = lp.make_kernel(
            sh.isl_grid_scalar,
            code, [
                *scalarArrayArgs("err", "errp", "errm", "h", "Wp"),
                *scalarArrayArgs("stop_flag", dtype=np.int8), ...
            ],
            assumptions=sh.assume_grid,
            seq_dependencies=True)
        knl_utop_prep = lp.fix_parameters(knl_utop_prep, err_tol=err_tol)
        knl_utop_prep = tune_grid_kernel(knl_utop_prep,
                                         sh.bulk_scalar,
                                         ng=G.NG)
        print("Compiled utop_prep")

    # Fill stop_flag with 1 so we don't have to worry about ghost zones taking steps
    stop_flag = cl_array.empty(params['queue'], sh.grid_scalar,
                               dtype=np.int8).fill(1)

    # Take the initial damped step in every bulk zone.
    evt, _ = knl_utop_prep(params['queue'],
                           err=err,
                           errp=errp,
                           errm=errm,
                           h=h,
                           Wp=Wp,
                           stop_flag=stop_flag)
    evt.wait()

    # Refresh err for the updated Wp.
    err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp, eflag, out=err)

    # Iteration kernel for 1Dw solver
    global knl_utop_iter
    if knl_utop_iter is None:
        code = add_ghosts("""
        # Evaluate whether we need to do any of this
        <> go = not(stop_flag[i,j,k]) {id=insn_go}

        # Normal secant increment is dW. Limit guess to between 0.5 and 2 times current value
        dWt := (Wp1[i,j,k] - Wp[i,j,k]) * err[i,j,k] / (err[i,j,k] - err1[i,j,k])
        dWt2 := if(dWt < -0.5*Wp[i,j,k], -0.5*Wp[i,j,k], dWt)
        <> dW = if(dWt2 > 2.0*Wp[i,j,k], 2.0*Wp[i,j,k], dWt2) {id=dw,if=go}

        # Preserve last values, after use but before any changes
        Wp1[i,j,k] = Wp[i,j,k] {id=wp1,nosync=dw,if=go}
        err1[i,j,k] = err[i,j,k] {nosync=dw,if=go}

        # Update Wp.  Err will be updated outside kernel
        Wp[i,j,k] = Wp[i,j,k] + dW {id=wp,nosync=dw:wp1,if=go}

        # Set flag not to continue in zones that have converged
        stop_flag[i,j,k] = if(fabs(dW / Wp[i,j,k]) < err_tol, 1, stop_flag[i,j,k]) {nosync=dw:wp:insn_go,if=go}

        # For the future, when we've defined err_eqn for loopy kernels
        # err = err_eqn(Bsq, D, Ep, QdB, Qtsq, Wp, gam, gamma_max, eflag) {if=go}
        # stop_flag[i,j,k] = stop_flag[i,j,k] + (fabs(err[] / Wp[]) < err_tol) {if=go}
        """)
        knl_utop_iter = lp.make_kernel(
            sh.isl_grid_scalar,
            code, [
                *scalarArrayArgs("Wp", "Wp1", "err", "err1"),
                *scalarArrayArgs("stop_flag", dtype=np.int8), ...
            ],
            assumptions=sh.assume_grid,
            default_offset=lp.auto,
            seq_dependencies=True)
        knl_utop_iter = lp.fix_parameters(knl_utop_iter, err_tol=err_tol)
        knl_utop_iter = tune_grid_kernel(knl_utop_iter,
                                         sh.bulk_scalar,
                                         ng=G.NG)
        print("Compiled utop_iter")

    # Iterate at least once to set new values from first step
    # TODO Needed now we set Wp, err1 right?
    for niter in range(iter_max):
        #print("U_to_P iter")
        evt, _ = knl_utop_iter(params['queue'],
                               Wp=Wp,
                               Wp1=Wp1,
                               err=err,
                               err1=err1,
                               stop_flag=stop_flag)
        err = err_eqn(params, G, Bsq, D, Ep, QdB, Qtsq, Wp, eflag)
        # TODO there may be better/faster if/reduction statements here...
        stop_flag |= (clm.fabs(err / Wp) < err_tol)
        # Early exit once every zone has converged.
        if cl_array.min(stop_flag) >= 1:
            break

    # If secant method failed to converge, do not set primitives other than B
    eflag += (stop_flag == 0)

    del Wp1, err, err1, stop_flag

    # Find utsq, gamma, rho0 from Wp
    gamma = gamma_func(params, G, Bsq, D, QdB, Qtsq, Wp, eflag)

    if 'debug' in params and params['debug']:
        if np.any(gamma.get()[s.bulk] < 1.):
            raise ValueError("gamma < 1 failure!")

    # Find the scalars
    global knl_utop_set
    if knl_utop_set is None:
        code = add_ghosts(
            replace_prim_names("""
        rho0 := D[i,j,k] / gamma[i,j,k]
        W := Wp[i,j,k] + D[i,j,k]
        w := W / (gamma[i,j,k]**2)
        pres := (w - rho0) * (gam - 1.) / gam
        u := w - (rho0 + pres)

        # Set flag if prims are < 0
        eflag[i,j,k] = if((u < 0)*(rho0 < 0), 8, if(u < 0, 7, if(rho0 < 0, 6, eflag[i,j,k]))) {id=ef}

        # Don't update flagged primitives (necessary?  Could skip the branch if fixup does ok)
        <> set = not(eflag[i,j,k]) {dep=ef,nosync=ef}

        P[RHO,i,j,k] = rho0 {if=set}
        P[UU,i,j,k] = u     {if=set}
        P[U1,i,j,k] = (gamma[i,j,k] / (W + Bsq[i,j,k])) * (Qtcon[1,i,j,k] + QdB[i,j,k] * Bcon[1,i,j,k] / W) {if=set}
        P[U2,i,j,k] = (gamma[i,j,k] / (W + Bsq[i,j,k])) * (Qtcon[2,i,j,k] + QdB[i,j,k] * Bcon[2,i,j,k] / W) {if=set}
        P[U3,i,j,k] = (gamma[i,j,k] / (W + Bsq[i,j,k])) * (Qtcon[3,i,j,k] + QdB[i,j,k] * Bcon[3,i,j,k] / W) {if=set}
        """))
        # Electron-entropy primitives only when enabled in params.
        if 'electrons' in params and params['electrons']:
            code += add_ghosts("""
            P[KEL,i,j,k] = U[KEL,i,j,k]/U[RHO,i,j,k]
            P[KTOT,i,j,k] = U[KTOT,i,j,k]/U[RHO,i,j,k]
            """)
        knl_utop_set = lp.make_kernel(
            sh.isl_grid_scalar,
            code, [
                *primsArrayArgs("P"), *vecArrayArgs("Qtcon", "Bcon"),
                *scalarArrayArgs("D", "gamma", "Wp", "Bsq", "QdB"),
                *scalarArrayArgs("eflag", dtype=np.int32), ...
            ],
            assumptions=sh.assume_grid,
            default_offset=lp.auto)
        knl_utop_set = lp.fix_parameters(knl_utop_set,
                                         gam=params['gam'],
                                         nprim=params['n_prim'],
                                         ndim=4)
        knl_utop_set = tune_grid_kernel(knl_utop_set, sh.bulk_scalar, ng=G.NG)
        print("Compiled utop_set")

    evt, _ = knl_utop_set(params['queue'],
                          P=Pout,
                          Qtcon=Qtcon,
                          Bcon=Bcon,
                          D=D,
                          gamma=gamma,
                          Wp=Wp,
                          Bsq=Bsq,
                          QdB=QdB,
                          eflag=eflag)
    evt.wait()
    del Qtcon, Bcon, D, gamma, Wp, Bsq, QdB

    # Trap on flags early in test problems
    if 'debug' in params and params['debug']:
        n_nonzero = np.count_nonzero(eflag.get())
        if n_nonzero > 0:
            print("Nonzero eflag in bulk: {}\nFlags: {}".format(
                n_nonzero, np.argwhere(eflag.get() != 0)))

    return Pout, eflag
Ejemplo n.º 33
0
    def get_kernel(self):
        """Build the loopy kernel that evaluates source expansions at targets.

        For each target in each target box, the kernel reads the expansion
        coefficients of every source box in that target box's source-box
        list, runs the expansion-specific evaluation instructions, and
        accumulates the kernel-scaled sums into ``result``.
        """
        # Number of expansion coefficients per source box.
        ncoeffs = len(self.expansion)

        # Expansion-specific evaluation instructions and the names of the
        # per-source-box partial results they produce.
        loopy_insns, result_names = self.get_loopy_insns_and_result_names()

        # The kernel source is assembled from fixed pieces plus generated
        # per-coefficient loads and per-result accumulation statements.
        loopy_knl = lp.make_kernel(
                [
                    "{[itgt_box]: 0<=itgt_box<ntgt_boxes}",
                    "{[itgt]: itgt_start<=itgt<itgt_end}",
                    "{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_end }",
                    "{[idim]: 0<=idim<dim}",
                    ],
                ["""
                for itgt_box
                    <> tgt_ibox = target_boxes[itgt_box]
                    <> itgt_start = box_target_starts[tgt_ibox]
                    <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox]

                    for itgt
                        <> tgt[idim] = targets[idim,itgt]

                        <> isrc_box_start = source_box_starts[itgt_box]
                        <> isrc_box_end = source_box_starts[itgt_box+1]

                        for isrc_box
                            <> src_ibox = source_box_lists[isrc_box]
                            """] + ["""
                            <> coeff{coeffidx} = \
                                src_expansions[src_ibox - src_base_ibox, {coeffidx}]
                            """.format(coeffidx=i) for i in range(ncoeffs)] + ["""

                            <> center[idim] = centers[idim, src_ibox] {dup=idim}
                            <> b[idim] = tgt[idim] - center[idim] {dup=idim}

                            """] + loopy_insns + ["""
                        end
                        """] + ["""
                        result[{resultidx}, itgt] = result[{resultidx}, itgt] + \
                                kernel_scaling * simul_reduce(sum, isrc_box,
                                result_{resultidx}_p) {{id_prefix=write_result}}
                        """.format(resultidx=i) for i in range(len(result_names))] + ["""
                    end
                end
                """],
                [
                    lp.GlobalArg("targets", None, shape=(self.dim, "ntargets"),
                        dim_tags="sep,C"),
                    lp.GlobalArg("box_target_starts,box_target_counts_nonchild",
                        None, shape=None),
                    lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"),
                    lp.GlobalArg("src_expansions", None,
                        shape=("nsrc_level_boxes", ncoeffs), offset=lp.auto),
                    lp.ValueArg("src_base_ibox", np.int32),
                    lp.ValueArg("nsrc_level_boxes,aligned_nboxes", np.int32),
                    lp.ValueArg("ntargets", np.int32),
                    lp.GlobalArg("result", None, shape="nresults,ntargets",
                        dim_tags="sep,C"),
                    lp.GlobalArg("source_box_starts, source_box_lists,",
                        None, shape=None, offset=lp.auto),
                    "..."
                ] + [arg.loopy_arg for arg in self.expansion.get_args()],
                name=self.name,
                assumptions="ntgt_boxes>=1",
                silenced_warnings="write_race(write_result*)",
                default_offset=lp.auto)

        loopy_knl = lp.fix_parameters(loopy_knl,
                dim=self.dim,
                nresults=len(result_names))

        # Unroll the small spatial-dimension loops; fix the loop nest order.
        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
        loopy_knl = lp.prioritize_loops(loopy_knl, "itgt_box,itgt,isrc_box")
        # Give the expansion a chance to apply its own transformations.
        loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl)

        return loopy_knl
Ejemplo n.º 34
0
                    (in_disk
                        and qbx_forced_limit == 0)
                    or (in_disk
                            and qbx_forced_limit != 0
                            and qbx_forced_limit * center_side[ictr] > 0)
                    )

            <> post_dist_sq = if(matches, dist_sq, HUGE)
        end
        <> min_dist_sq, <> min_ictr = argmin(ictr, ictr, post_dist_sq)

        tgt_to_qbx_center[itgt] = if(min_dist_sq < HUGE, min_ictr, -1)
    end
    """)

knl = lp.fix_parameters(knl, ambient_dim=2)
knl = lp.add_and_infer_dtypes(
    knl, {
        "tgt,center,radius,HUGE": np.float32,
        "center_side,qbx_forced_limit": np.int32,
    })

lp.auto_test_vs_ref(knl,
                    cl_ctx,
                    knl,
                    parameters={
                        "HUGE": 1e20,
                        "ncenters": 200,
                        "ntargets": 300,
                        "qbx_forced_limit": 1
                    })
Ejemplo n.º 35
0
def test_poisson_fem(ctx_factory):
    """FEM Poisson element-matrix assembly test.

    Builds the assembly kernel for Ael[c,i,j], fixes the small dimensions,
    applies a dpsi-precompute variant, and auto-tests it against the
    unmodified reference kernel.

    Stolen from Peter Coogan and Rob Kirby for FEM assembly.
    """
    ctx = ctx_factory()

    # Basis-function count, quadrature-point count, spatial dimension.
    nbf = 5
    nqp = 5
    sdim = 3

    knl = lp.make_kernel("{ [c,i,j,k,ell,ell2,ell3]: \
            0 <= c < nels and \
            0 <= i < nbf and \
            0 <= j < nbf and \
            0 <= k < nqp and \
            0 <= ell,ell2 < sdim}",
                         """
            dpsi(bf,k0,dir) := \
                    simul_reduce(sum, ell2, DFinv[c,ell2,dir] * DPsi[bf,k0,ell2] )
            Ael[c,i,j] = \
                    J[c] * w[k] * sum(ell, dpsi(i,k,ell) * dpsi(j,k,ell))
            """,
                         assumptions="nels>=1 and nbf >= 1 and nels mod 4 = 0")

    print(knl)

    knl = lp.fix_parameters(knl, nbf=nbf, sdim=sdim, nqp=nqp)

    ref_knl = knl

    # Use prioritize_loops for consistency with the rest of this file;
    # lp.set_loop_priority is the deprecated name for the same transform.
    knl = lp.prioritize_loops(knl, ["c", "j", "i", "k"])

    def variant_1(knl):
        # Precompute the dpsi substitution rule over (i, k, ell).
        knl = lp.precompute(knl, "dpsi", "i,k,ell", default_tag='for')
        knl = lp.prioritize_loops(knl, "c,i,j")
        return knl

    def variant_2(knl):
        # Precompute the dpsi substitution rule over (i, ell) only.
        knl = lp.precompute(knl, "dpsi", "i,ell", default_tag='for')
        knl = lp.prioritize_loops(knl, "c,i,j")
        return knl

    def add_types(knl):
        # All array inputs are single precision.
        return lp.add_and_infer_dtypes(
            knl,
            dict(
                w=np.float32,
                J=np.float32,
                DPsi=np.float32,
                DFinv=np.float32,
            ))

    for variant in [
            #variant_1,
            variant_2
    ]:
        knl = variant(knl)

        lp.auto_test_vs_ref(add_types(ref_knl),
                            ctx,
                            add_types(knl),
                            parameters=dict(n=5, nels=15, nbf=5, sdim=2,
                                            nqp=7))
Ejemplo n.º 36
0
def test_dg_volume(ctx_factory):
    """DG volume-term kernel over fields u, v, w, p, under several transforms.

    The kernel applies the float4-packed differentiation matrix ``DrDsDt``
    to each element's nodal fields and dots the resulting reference-space
    derivatives with per-element geometric factors (``drst_dx/dy/dz``) to
    form the right-hand sides.  Each ``variant_*`` applies a different
    parallelization/memory transform; all are checked against the
    untransformed ``seq_knl`` via ``lp.auto_test_vs_ref``.
    """
    #logging.basicConfig(level=logging.DEBUG)

    dtype = np.float32
    dtype4 = cl.array.vec.float4  # packs the r/s/t derivative components
    ctx = ctx_factory()

    order = "F"

    # Nodal-DG sizes: N is the polynomial order, Np the number of volume
    # nodes per element, K the number of elements.
    N = 3  # noqa
    Np = (N+1)*(N+2)*(N+3)//6  # noqa

    K = 10000  # noqa

    knl = lp.make_kernel([
            "{[n,m,k]: 0<= n,m < Np and 0<= k < K}",
            ],
            """
                <> du_drst = simul_reduce(sum, m, DrDsDt[n,m]*u[k,m])
                <> dv_drst = simul_reduce(sum, m, DrDsDt[n,m]*v[k,m])
                <> dw_drst = simul_reduce(sum, m, DrDsDt[n,m]*w[k,m])
                <> dp_drst = simul_reduce(sum, m, DrDsDt[n,m]*p[k,m])

                # volume flux
                rhsu[k,n] = dot(drst_dx[k],dp_drst)
                rhsv[k,n] = dot(drst_dy[k],dp_drst)
                rhsw[k,n] = dot(drst_dz[k],dp_drst)
                rhsp[k,n] = dot(drst_dx[k], du_drst) + dot(drst_dy[k], dv_drst) \
                    + dot(drst_dz[k], dw_drst)
                """,
            [
                lp.GlobalArg("u,v,w,p,rhsu,rhsv,rhsw,rhsp",
                    dtype, shape="K, Np", order="C"),
                lp.GlobalArg("DrDsDt", dtype4, shape="Np, Np", order="C"),
                lp.GlobalArg("drst_dx,drst_dy,drst_dz", dtype4, shape="K",
                    order=order),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="dg_volume", assumptions="K>=1")

    knl = lp.fix_parameters(knl, Np=Np)

    seq_knl = knl  # untransformed reference for auto_test_vs_ref

    def variant_basic(knl):
        # One work-group per element, one work-item per node.
        knl = lp.tag_inames(knl, dict(k="g.0", n="l.0"))
        return knl

    def variant_more_per_work_group(knl):
        # Process 3 elements per work-group along the second local axis.
        knl = lp.tag_inames(knl, dict(n="l.0"))
        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
        return knl

    def variant_image_d(knl):
        # Like variant_more_per_work_group, but reads DrDsDt via an image.
        knl = lp.tag_inames(knl, dict(n="l.0"))
        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
        knl = lp.change_arg_to_image(knl, "DrDsDt")
        return knl

    def variant_prefetch_d(knl):
        # Prefetch the whole differentiation matrix per work-group.
        knl = lp.tag_inames(knl, dict(n="l.0"))
        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
        knl = lp.add_prefetch(knl, "DrDsDt[:,:]",
                fetch_outer_inames='k_outer',
                default_tag="l.auto")
        return knl

    def variant_prefetch_fields(knl):
        # Prefetch each field's per-element slice instead of the matrix.
        knl = lp.tag_inames(knl, dict(n="l.0"))
        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
        for name in ["u", "v", "w", "p"]:
            knl = lp.add_prefetch(knl, "%s[k,:]" % name, ["k_inner"],
                    default_tag="l.auto")

        return knl

    def variant_k_ilp(knl):
        # ILP across elements; unroll the reduction loop m.
        knl = lp.tag_inames(knl, dict(n="l.0"))

        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="ilp")
        knl = lp.tag_inames(knl, dict(m="unr"))
        return knl

    def variant_simple_padding(knl):
        # Pad axis 0 of all field arrays to a 32-byte boundary.
        knl = lp.tag_inames(knl, dict(n="l.0"))

        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")

        arg_names = [
                prefix+name
                for name in ["u", "v", "w", "p"]
                for prefix in ["", "rhs"]]

        for name in arg_names:
            knl = lp.add_padding(knl, name, axis=0, align_bytes=32)

        knl = lp.tag_inames(knl, dict(m="unr"))

        return knl

    def variant_fancy_padding(knl):
        # Split the leading array axis by the padding multiple instead of
        # inserting dead space.
        knl = lp.tag_inames(knl, dict(n="l.0"))

        pad_mult = lp.find_padding_multiple(knl, "u", 1, 32)

        arg_names = [
                prefix+name
                for name in ["u", "v", "w", "p"]
                for prefix in ["", "rhs"]]

        knl = lp.split_array_dim(knl, [(nm, 0) for nm in arg_names], pad_mult)

        return knl

    parameters_dict = dict(K=K)

    variants = [
            variant_basic,
            variant_more_per_work_group,
            variant_prefetch_d,
            variant_prefetch_fields,
            variant_k_ilp,
            variant_simple_padding,
            variant_fancy_padding
            ]

    # variant_image_d requires image support; the pocl platform is
    # excluded even when it reports support.
    if (ctx.devices[0].image_support
            and ctx.devices[0].platform.name != "Portable Computing Language"):
        variants.append(variant_image_d)

    for variant in variants:
        lp.auto_test_vs_ref(
                seq_knl, ctx, variant(knl), parameters=parameters_dict,
                #codegen_kwargs=dict(with_annotation=True)
                )
Ejemplo n.º 37
0
def test_convolution(ctx_factory):
    """Batched 2D convolution over images, features and color channels.

    ``out[iimg, ifeat, im_x, im_y]`` reduces the filter bank ``f`` against
    a ``(2*f_w+1)``-square window of ``img`` over the filter offsets
    ``(f_x, f_y)`` and the color channel.  Tiled/prefetching variants are
    auto-tested against the untransformed reference kernel.
    """
    ctx = ctx_factory()

    dtype = np.float32

    # NOTE(review): iimg uses an inclusive upper bound (<= nimgs) while all
    # other inames use strict bounds -- confirm this is intentional.
    knl = lp.make_kernel(
        "{ [iimg, ifeat, icolor, im_x, im_y, f_x, f_y]: \
                -f_w <= f_x,f_y <= f_w \
                and 0 <= im_x < im_w and 0 <= im_y < im_h \
                and 0<=iimg<=nimgs and 0<=ifeat<nfeats and 0<=icolor<ncolors \
                }",
        """
        out[iimg, ifeat, im_x, im_y] = sum((f_x, f_y, icolor), \
            img[iimg, f_w+im_x-f_x, f_w+im_y-f_y, icolor] \
            * f[ifeat, f_w+f_x, f_w+f_y, icolor])
        """, [
            lp.GlobalArg("f", dtype, shape=lp.auto),
            lp.GlobalArg("img", dtype, shape=lp.auto),
            lp.GlobalArg("out", dtype, shape=lp.auto), "..."
        ],
        assumptions=
        "f_w>=1 and im_w, im_h >= 2*f_w+1 and nfeats>=1 and nimgs>=0",
        options="annotate_inames")

    # Filter half-width: the window spans [-f_w, f_w] in each direction.
    f_w = 3

    knl = lp.fix_parameters(knl, f_w=f_w, ncolors=3)

    ref_knl = knl

    def variant_0(knl):
        # Serial: only pins an explicit loop order.
        #knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
        knl = lp.prioritize_loops(knl, "iimg,im_x,im_y,ifeat,f_x,f_y")
        return knl

    def variant_1(knl):
        # Parallelize image columns over the local axis.
        knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0")
        knl = lp.prioritize_loops(knl, "iimg,im_x_outer,im_y,ifeat,f_x,f_y")
        return knl

    def variant_2(knl):
        # 16x16 work-group tiles with one group axis per feature; prefetch
        # the current filter and the image window into local memory.
        knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0")
        knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1")
        knl = lp.tag_inames(knl, dict(ifeat="g.2"))
        knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]", default_tag="l.auto")
        knl = lp.add_prefetch(knl,
                              "img",
                              "im_x_inner, im_y_inner, f_x, f_y",
                              default_tag="l.auto")
        return knl

    for variant in [
            #variant_0,
            #variant_1,
            variant_2
    ]:
        lp.auto_test_vs_ref(ref_knl,
                            ctx,
                            variant(knl),
                            parameters=dict(im_w=128,
                                            im_h=128,
                                            f_w=f_w,
                                            nfeats=3,
                                            nimgs=3))
Ejemplo n.º 38
0
def advance_fluid(params, G, Pi, Ps, dt):
    """Advance the fluid state by one step of size *dt*.

    Computes fluxes from the primitives *Ps*, applies constrained transport
    to the field fluxes, forms the conserved-variable update
    ``Uf = Ui + dt * (flux difference + source)`` via a cached loopy
    finite-difference kernel, and recovers new primitives with ``U_to_P``.

    Returns ``(Pf, pflag, ndt)``: new primitives, the inversion flag array,
    and the timestep estimate produced by ``get_flux``.

    NOTE(review): relies on the module-level scratch arrays ``Ui``, ``dU``,
    ``Uf``, ``Ds`` and the ``ihc``/``knl_finite_diff`` caches, so it is not
    reentrant -- confirm single-threaded use.
    """
    s = G.slices
    sh = G.shapes
    # Lazily allocate module-level scratch arrays on first call.
    global Ui, dU, Uf, Ds
    if Ui is None:
        Ui = cl_array.zeros(params['queue'],
                            sh.grid_primitives,
                            dtype=np.float64)
        dU = cl_array.zeros(params['queue'],
                            sh.grid_primitives,
                            dtype=np.float64)
        Uf = cl_array.zeros(params['queue'],
                            sh.grid_primitives,
                            dtype=np.float64)
        Ds = get_state(params, G, Ps, Loci.CENT)

    # Optional ctypes-backed iharm3d helper for the U_to_P inversion.
    global ihc
    if ihc is None and 'use_ctypes' in params and params['use_ctypes']:
        ihc = Iharmc()

    F, ndt = get_flux(params, G, Ps)

    # plot_all_prims(G, F[1], "F1")
    # plot_all_prims(G, F[2], "F2")
    # plot_all_prims(G, F[3], "F3")

    # TODO RESTORE following
    # if params['metric'][-3:] == "mks":
    #     fix_flux(F)

    # Constrained transport for B
    flux_ct(params, G, F)

    # Flux diagnostic globals
    # diag_flux(F)

    # Get conserved variables & source term
    get_state(params, G, Ps, Loci.CENT, out=Ds)
    get_fluid_source(params, G, Ps, Ds, out=dU)

    # Reuse Ds when Pi and Ps are the same object to skip a get_state call.
    if Pi is not Ps:
        Di = get_state(params, G, Pi, Loci.CENT)
    else:
        Di = Ds

    prim_to_flux(params, G, Pi, Di, 0, Loci.CENT, out=Ui)
    if 'P_U_P_test' in params and params['P_U_P_test']:
        # Test U_to_P by recovering a prims array we know (Ps),
        # starting from a close but not identical array (Pi)
        if Pi is not Ps:
            Us = prim_to_flux(params, G, Ps, Ds, 0, Loci.CENT)
            Ps_fromU, _ = ihc.U_to_P(params, G, Us, Pi)
            print("Pi vs Ps change: ", la.norm((Ps - Pi).get()[s.bulk]))
            print("Ps->U->Ps Absolute Error: ",
                  la.norm((Ps - Ps_fromU).get()[s.bulk]))
        else:
            print("Pi is Ps")

    # Build and cache the flux-difference/source update kernel on first use.
    global knl_finite_diff
    if knl_finite_diff is None:
        code = add_ghosts("""
        Uf[p,i,j,k] = Ui[p,i,j,k] + \
                      dt * ((F1[p,i,j,k] - F1[p,i+1,j,k]) / dx1 + \
                            (F2[p,i,j,k] - F2[p,i,j+1,k]) / dx2 + \
                            (F3[p,i,j,k] - F3[p,i,j,k+1]) / dx3 + dU[p,i,j,k])
        """)
        knl_finite_diff = lp.make_kernel(
            sh.isl_grid_primitives,
            code, [
                lp.ValueArg("dt", dtype=np.float64),
                *primsArrayArgs("Ui", "F1", "F2", "F3", "dU", "Uf"), ...
            ],
            assumptions=sh.assume_grid)
        # Grid spacings are compile-time constants for this G.
        knl_finite_diff = lp.fix_parameters(knl_finite_diff,
                                            dx1=G.dx[1],
                                            dx2=G.dx[2],
                                            dx3=G.dx[3])
        knl_finite_diff = tune_prims_kernel(knl_finite_diff,
                                            sh.bulk_primitives,
                                            ng=G.NG)
        print("Compiled finite_diff")

    # dt may arrive as a device scalar; the kernel needs a host float.
    if isinstance(dt, cl_array.Array):
        dt = dt.get()
    evt, _ = knl_finite_diff(params['queue'],
                             dt=float(dt),
                             Ui=Ui,
                             F1=F[1],
                             F2=F[2],
                             F3=F[3],
                             dU=dU,
                             Uf=Uf)

    # Newton-Raphson
    if ihc is not None:
        # If we're using iharm3d's C U_to_P fn, negotiate memory.  Note eflag is not preserved!
        Pf, pflag = ihc.U_to_P(params, G, Uf.get(), Pi.get())
        Pf = cl_array.to_device(params['queue'], Pf)
        pflag = cl_array.to_device(params['queue'], pflag)
    else:
        # Loopy version
        Pf, pflag = U_to_P(params, G, Uf, Pi)

    if params['debug']:
        print("Uf - Ui: ", la.norm((Uf - Ui).get()))
        print("Pf - Pi: ", la.norm((Pf - Pi).get()))

    return Pf, pflag, ndt
Ejemplo n.º 39
0
def test_tim2d(ctx_factory):
    """2D spectral-element Laplacian ("semlap2D") kernel test.

    ``ur``/``us`` are reference-space derivatives of ``u`` formed with the
    differentiation matrix ``D``; ``Gux``/``Guy`` weight them with entries
    of the geometric-factor array ``G`` (the ``$x``/``$y`` tags let the two
    uses be handled separately); ``lap`` then applies ``D`` transposed to
    the weighted gradients.  The transformed variant is auto-tested against
    the sequential kernel with an operation count for GFlops reporting.
    """
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    # Nodes per element edge.
    n = 8

    from pymbolic import var
    K_sym = var("K")  # noqa

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(
            "{[i,j,e,m,o,o2,gi]: 0<=i,j,m,o,o2<n and 0<=e<K and 0<=gi<3}",
            [
                "ur(a,b) := simul_reduce(sum, o, D[a,o]*u[e,o,b])",
                "us(a,b) := simul_reduce(sum, o2, D[b,o2]*u[e,a,o2])",

                #"Gu(mat_entry,a,b) := G[mat_entry,e,m,j]*ur(m,j)",

                "Gux(a,b) := G$x[0,e,a,b]*ur(a,b)+G$x[1,e,a,b]*us(a,b)",
                "Guy(a,b) := G$y[1,e,a,b]*ur(a,b)+G$y[2,e,a,b]*us(a,b)",
                "lap[e,i,j]  = "
                "  simul_reduce(sum, m, D[m,i]*Gux(m,j))"
                "+ simul_reduce(sum, m, D[m,j]*Guy(i,m))"

            ],
            [
                lp.GlobalArg("u", dtype, shape=field_shape, order=order),
                lp.GlobalArg("lap", dtype, shape=field_shape, order=order),
                lp.GlobalArg("G", dtype, shape=(3,)+field_shape, order=order),
                # lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
                lp.GlobalArg("D", dtype, shape=(n, n), order=order),
                # lp.ImageArg("D", dtype, shape=(n, n)),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="semlap2D", assumptions="K>=1")

    knl = lp.fix_parameters(knl, n=n)
    # Duplicate the reduction iname "o" within the ur and us instructions.
    knl = lp.duplicate_inames(knl, "o", within="id:ur")
    knl = lp.duplicate_inames(knl, "o", within="id:us")

    seq_knl = knl

    def variant_orig(knl):
        # One work-group per element; i/j span the local axes.
        knl = lp.tag_inames(knl, dict(i="l.0", j="l.1", e="g.0"))

        # Stage the differentiation matrix and the element's data.
        knl = lp.add_prefetch(knl, "D[:,:]")
        knl = lp.add_prefetch(knl, "u[e, :, :]")

        # Materialize the derivative and weighted-derivative rules.
        knl = lp.precompute(knl, "ur(m,j)", ["m", "j"])
        knl = lp.precompute(knl, "us(i,m)", ["i", "m"])

        knl = lp.precompute(knl, "Gux(m,j)", ["m", "j"])
        knl = lp.precompute(knl, "Guy(i,m)", ["i", "m"])

        knl = lp.add_prefetch(knl, "G$x[:,e,:,:]")
        knl = lp.add_prefetch(knl, "G$y[:,e,:,:]")

        knl = lp.tag_inames(knl, dict(o="unr"))
        knl = lp.tag_inames(knl, dict(m="unr"))

        # Schedule the D fetch ahead of other instructions.
        knl = lp.set_instruction_priority(knl, "id:D_fetch", 5)
        print(knl)

        return knl

    for variant in [variant_orig]:
        K = 1000  # noqa
        lp.auto_test_vs_ref(seq_knl, ctx, variant(knl),
                op_count=[K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9],
                op_label=["GFlops"],
                parameters={"K": K})
Ejemplo n.º 40
0
def no_test_dg_surface(ctx_factory):
    """DG surface-flux kernel (disabled: would need the right index info
    in vmapP/vmapM to be testable).

    Gathers face-node values via the index maps, forms upwind-style jump
    terms, and lifts them back to volume nodes with the LIFT matrix.
    """
    # tough to test, would need the right index info
    dtype = np.float32
    ctx = ctx_factory()

    order = "F"

    # Nodal-DG sizes for polynomial order N on a tetrahedron.
    N = 3  # noqa
    Np = (N+1)*(N+2)*(N+3)//6  # noqa
    Nfp = (N+1)*(N+2)//2  # noqa
    Nfaces = 4  # noqa

    K = 10000  # noqa

    knl = lp.make_kernel(
            [
                "{[m,n,k]: 0<= m < NfpNfaces and 0<= n < Np and 0<= k < K }"
                ],
            """
                <> idP = vmapP[m,k]
                <> idM = vmapM[m,k]

                <> du = u[[idP]]-u[[idM]]
                <> dv = v[[idP]]-v[[idM]]
                <> dw = w[[idP]]-w[[idM]]
                <> dp = bc[m,k]*p[[idP]] - p[[idM]]

                <> dQ = 0.5*Fscale[m,k]* \
                        (dp - nx[m,k]*du - ny[m,k]*dv - nz[m,k]*dw)

                <> fluxu = -nx[m,k]*dQ
                <> fluxv = -ny[m,k]*dQ
                <> fluxw = -nz[m,k]*dQ
                <> fluxp =          dQ

                # reduction here
                rhsu[n,k] = sum(m, LIFT[n,m]*fluxu)
                rhsv[n,k] = sum(m, LIFT[n,m]*fluxv)
                rhsw[n,k] = sum(m, LIFT[n,m]*fluxw)
                rhsp[n,k] = sum(m, LIFT[n,m]*fluxp)
                """,
            [
                lp.GlobalArg("vmapP,vmapM",
                    np.int32, shape="NfpNfaces, K", order=order),
                lp.GlobalArg("u,v,w,p,rhsu,rhsv,rhsw,rhsp",
                    dtype, shape="Np, K", order=order),
                lp.GlobalArg("nx,ny,nz,Fscale,bc",
                    dtype, shape="NfpNfaces, K", order=order),
                lp.GlobalArg("LIFT", dtype, shape="Np, NfpNfaces", order="C"),
                lp.ValueArg("K", np.int32, approximately=1000),
                ],
            name="dg_surface", assumptions="K>=1")

    knl = lp.fix_parameters(knl,
            Np=Np, Nfp=Nfp, NfpNfaces=Nfaces*Nfp, nsurf_dofs=K*Nfp)

    ref_knl = knl

    def variant_basic(kernel):
        # Untransformed baseline.
        return kernel

    for make_variant in (variant_basic,):
        lp.auto_test_vs_ref(ref_knl, ctx, make_variant(knl),
                            parameters=dict(K=K))
Ejemplo n.º 41
0
def test_rob_stroud_bernstein_full(ctx_factory):
    """Bernstein-polynomial evaluation kernel (code generation only).

    Builds the two-pass kernel, round-trips it through pickle, and prints
    the generated (highlighted) source; nothing is executed on a device.

    NOTE(review): a second function with this exact name is defined later
    in this file and shadows this one at import time, so this version is
    never collected by pytest.
    """
    #logging.basicConfig(level=logging.DEBUG)
    ctx = ctx_factory()

    # NOTE: result would have to be zero-filled beforehand

    knl = lp.make_kernel(
        "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \
                0 <= el < nels and \
                0 <= i2 < nqp1d and \
                0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 and\
                \
                0 <= i1_2 < nqp1d and \
                0 <= alpha1_2 <= deg and \
                0 <= i2_2 < nqp1d \
                }",
        """
        for el
            for i2
                <> xi = qpts[1, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init}

                for alpha1
                    <> w = s**(deg-alpha1) {id=init_w}

                    <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \
                            {id=write_tmp}
                    for alpha2
                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                            {id=update_w,dep=init_w:write_tmp}
                        aind = aind + 1 \
                            {id=aind_incr,dep=aind_init:write_tmp:update_w}
                    end
                end
            end

            for i1_2
                <> xi2 = qpts[0, i1_2] {dep=aind_incr}
                <> s2 = 1-xi2
                <> r2 = xi2/s2
                <> w2 = s2**deg

                for alpha1_2
                    for i2_2
                        result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \
                                w2 * tmp[alpha1_2, i2_2]
                    end

                    w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2)
                end
            end
        end
        """,
        [
            # Must declare coeffs to have "no" shape, to keep loopy
            # from trying to figure it out the shape automatically.

            lp.GlobalArg("coeffs", None, shape=None),
            "..."
            ],
        assumptions="deg>=0 and nels>=1"
        )

    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)

    # Disabled transform stack (kept for reference).
    if 0:
        knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
        knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
                slabs=(0, 1))
        knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))

    # Exercise picklability of the kernel.
    from pickle import dumps, loads
    knl = loads(dumps(knl))

    # Rebinds `knl` to the generated source string for printing.
    knl = lp.CompiledKernel(ctx, knl).get_highlighted_code(
            dict(
                qpts=np.float32,
                tmp=np.float32,
                coeffs=np.float32,
                result=np.float32,
                ))
    print(knl)
Ejemplo n.º 42
0
def test_rob_stroud_bernstein_full(ctx_factory):
    """Bernstein-polynomial evaluation kernel (code generation only).

    Variant of the identically-named function defined earlier in this file
    (which it shadows), with additional explicit dependency annotations
    (``write_tmp`` on ``init_w``/``aind_init``; ``w2_init``/``res2``/
    ``w2_update`` in the second pass).  Builds the kernel, round-trips it
    through pickle, and prints the generated source; nothing is executed
    on a device.
    """
    #logging.basicConfig(level=logging.DEBUG)
    ctx = ctx_factory()

    # NOTE: result would have to be zero-filled beforehand

    knl = lp.make_kernel(
        "{[el, i2, alpha1,alpha2, i1_2, alpha1_2, i2_2]: \
                0 <= el < nels and \
                0 <= i2 < nqp1d and \
                0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 and\
                \
                0 <= i1_2 < nqp1d and \
                0 <= alpha1_2 <= deg and \
                0 <= i2_2 < nqp1d \
                }",
        """
        for el
            for i2
                <> xi = qpts[1, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init}

                for alpha1
                    <> w = s**(deg-alpha1) {id=init_w}

                    <> tmp[alpha1,i2] = tmp[alpha1,i2] + w * coeffs[aind] \
                            {id=write_tmp,dep=init_w:aind_init}
                    for alpha2
                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                            {id=update_w,dep=init_w:write_tmp}
                        aind = aind + 1 \
                            {id=aind_incr,dep=aind_init:write_tmp:update_w}
                    end
                end
            end

            for i1_2
                <> xi2 = qpts[0, i1_2] {dep=aind_incr}
                <> s2 = 1-xi2
                <> r2 = xi2/s2
                <> w2 = s2**deg  {id=w2_init}

                for alpha1_2
                    for i2_2
                        result[el, i1_2, i2_2] = result[el, i1_2, i2_2] + \
                                w2 * tmp[alpha1_2, i2_2]  {id=res2,dep=w2_init}
                    end

                    w2 = w2 * r2 * (deg-alpha1_2) / (1+alpha1_2)  \
                            {id=w2_update, dep=res2}
                end
            end
        end
        """,
        [
            # Must declare coeffs to have "no" shape, to keep loopy
            # from trying to figure it out the shape automatically.
            lp.GlobalArg("coeffs", None, shape=None),
            "..."
        ],
        assumptions="deg>=0 and nels>=1")

    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)

    # Disabled transform stack (kept for reference).
    if 0:
        knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
        knl = lp.split_iname(knl,
                             "el_outer",
                             2,
                             outer_tag="g.0",
                             inner_tag="ilp",
                             slabs=(0, 1))
        knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))

    # Exercise picklability of the kernel.
    from pickle import dumps, loads
    knl = loads(dumps(knl))

    # Rebinds `knl` to the generated source string for printing.
    knl = lp.CompiledKernel(ctx, knl).get_highlighted_code(
        dict(
            qpts=np.float32,
            tmp=np.float32,
            coeffs=np.float32,
            result=np.float32,
        ))
    print(knl)
Ejemplo n.º 43
0
def set_up_volume_loop(kernel, Nq):
    """Apply the standard volume-loop parallelization to *kernel*.

    Fixes the per-axis node count ``Nq``, orders the loops e,k,j,i, maps
    elements to work-groups (``e`` -> g.0) and the ``j``/``i`` node loops
    to the local work-group axes, and records the ``elements >= 1``
    assumption for code generation.
    """
    kernel = lp.fix_parameters(kernel, Nq=Nq)
    # prioritize_loops is the current name of the deprecated
    # set_loop_priority, matching its use elsewhere in this file.
    kernel = lp.prioritize_loops(kernel, "e,k,j,i")
    kernel = lp.tag_inames(kernel, dict(e="g.0", j="l.1", i="l.0"))
    kernel = lp.assume(kernel, "elements >= 1")
    return kernel
Ejemplo n.º 44
0
def fix_euler_parameters(kernel, p_p0, p_Gamma, p_R):
    """Bake the Euler constants p_p0, p_Gamma and p_R into *kernel*.

    Each value is passed through ``pick_apart_float_cast`` before being
    fixed as a kernel parameter.
    """
    fixed_values = {
        name: pick_apart_float_cast(value)
        for name, value in (
            ("p_p0", p_p0),
            ("p_Gamma", p_Gamma),
            ("p_R", p_R),
        )
    }
    return lp.fix_parameters(kernel, **fixed_values)