Example #1
def test_vectorize(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
    knl = lp.set_array_dim_names(knl, "a,b", "i")
    knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4,
            split_kwargs=dict(slabs=(0, 1)))

    knl = lp.tag_data_axes(knl, "a,b", "c,vec")
    ref_knl = knl
    ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})

    knl = lp.tag_inames(knl, {"i_inner": "vec"})

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(n=30))
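
The examples in this listing all follow the same basic pattern: build a kernel with lp.make_kernel, optionally split an iname with lp.split_iname, and then map inames onto hardware axes with lp.tag_inames ("g.N" for group/grid axes, "l.N" for local/work-item axes, plus tags such as "unr", "ilp" and "vec"). A minimal sketch of that pattern, using a made-up vector-doubling kernel purely for illustration:

    import numpy as np
    import loopy as lp

    # Trivial kernel: double every entry of a vector.
    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        "out[i] = 2*a[i]")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    # Split i into blocks of 128 and map the pieces onto GPU axes:
    # i_outer -> group axis 0, i_inner -> local (work-item) axis 0.
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0")
    knl = lp.tag_inames(knl, {"i_inner": "l.0"})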
Example #2
    def variant_orig(knl):
        knl = lp.tag_inames(knl, dict(i="l.0", j="l.1", e="g.0"))

        knl = lp.add_prefetch(knl,
                              "D[:,:]",
                              fetch_outer_inames='e',
                              default_tag="l.auto")
        knl = lp.add_prefetch(knl, "u[e, :, :]", default_tag="l.auto")

        knl = lp.precompute(knl, "ur(m,j)", ["m", "j"], default_tag="l.auto")
        knl = lp.precompute(knl, "us(i,m)", ["i", "m"], default_tag="l.auto")
        # TODO this adds `a` and `b` to domains, which leads to unused inames

        knl = lp.precompute(knl, "Gux(m,j)", ["m", "j"], default_tag="l.auto")
        knl = lp.precompute(knl, "Guy(i,m)", ["i", "m"], default_tag="l.auto")

        knl = lp.add_prefetch(knl, "G$x[:,e,:,:]", default_tag="l.auto")
        knl = lp.add_prefetch(knl, "G$y[:,e,:,:]", default_tag="l.auto")

        knl = lp.tag_inames(knl, dict(o="unr"))
        knl = lp.tag_inames(knl, dict(m="unr"))

        knl = lp.set_instruction_priority(knl, "id:D_fetch", 5)
        print(knl)

        return knl
Example #3
def test_tim2d(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
           [
            "ur(a,b) := sum_float32(@o, D[a,o]*u[e,o,b])",
            "us(a,b) := sum_float32(@o, D[b,o]*u[e,a,o])",

            "lap[e,i,j]  = "
            "  sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j) + G[1,e,m,j]*us(m,j)))"
            "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m) + G[2,e,i,m]*us(i,m)))"

            ],
            [
            lp.ArrayArg("u", dtype, shape=field_shape, order=order),
            lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
            lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
#            lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
            lp.ArrayArg("D", dtype, shape=(n, n), order=order),
#            lp.ImageArg("D", dtype, shape=(n, n)),
            lp.ValueArg("K", np.int32, approximately=1000),
            ],
             name="semlap2D", assumptions="K>=1")

    unroll = 32

    seq_knl = knl
    knl = lp.add_prefetch(knl, "D", ["m", "j", "i","o"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j",  "o"], default_tag="l.auto")
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")#, slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))


#    knl = lp.add_prefetch(knl, "G", [2,3], default_tag=None) # axis/argument indices on G
    knl = lp.add_prefetch(knl, "G", [2,3], default_tag="l.auto") # axis/argument indices on G

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
Example #4
def test_vectorize(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0<=i<n}", """
        <> temp = 2*b[i]
        a[i] = temp
        """)
    knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
    knl = lp.set_array_dim_names(knl, "a,b", "i")
    knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)],
                             4,
                             split_kwargs=dict(slabs=(0, 1)))

    knl = lp.tag_data_axes(knl, "a,b", "c,vec")
    ref_knl = knl
    ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})

    knl = lp.tag_inames(knl, {"i_inner": "vec"})

    knl = lp.preprocess_kernel(knl)
    knl = lp.get_one_scheduled_kernel(knl)
    code, inf = lp.generate_code(knl)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=30))
Example #5
def test_unused_hw_axes_in_callee(ctx_factory, inline):
    ctx = ctx_factory()

    twice = lp.make_function("{[i]: 0<=i<10}",
                             """
            y[i] = 2*x[i]
            """,
                             name="twice")

    knl = lp.make_kernel("{[i]: 0<=i<10}",
                         """
            y[:, i] = twice(x[:, i])
            """, [
                             lp.GlobalArg("x", shape=(10, 10), dtype=float),
                             lp.GlobalArg("y", shape=(10, 10))
                         ],
                         name="outer")

    twice = lp.tag_inames(twice, {"i": "l.1"})
    knl = lp.tag_inames(knl, {"i": "l.0"})
    knl = lp.merge([knl, twice])

    if inline:
        knl = lp.inline_callable_kernel(knl, "twice")

    lp.auto_test_vs_ref(knl, ctx, knl)
Example #6
def test_tim2d(ctx_factory):
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    n = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (K_sym, n, n)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,j,e,m,o,gi]: 0<=i,j,m,o<%d and 0<=e<K and 0<=gi<3}" % n,
           [
            "ur(a,b) := sum_float32(@o, D[a,o]*u[e,o,b])",
            "us(a,b) := sum_float32(@o, D[b,o]*u[e,a,o])",

            "lap[e,i,j]  = "
            "  sum_float32(m, D[m,i]*(G[0,e,m,j]*ur(m,j) + G[1,e,m,j]*us(m,j)))"
            "+ sum_float32(m, D[m,j]*(G[1,e,i,m]*ur(i,m) + G[2,e,i,m]*us(i,m)))"

            ],
            [
            lp.ArrayArg("u", dtype, shape=field_shape, order=order),
            lp.ArrayArg("lap", dtype, shape=field_shape, order=order),
            lp.ArrayArg("G", dtype, shape=(3,)+field_shape, order=order),
#            lp.ConstantArrayArg("D", dtype, shape=(n, n), order=order),
            lp.ArrayArg("D", dtype, shape=(n, n), order=order),
#            lp.ImageArg("D", dtype, shape=(n, n)),
            lp.ValueArg("K", np.int32, approximately=1000),
            ],
             name="semlap2D", assumptions="K>=1")

    unroll = 32

    seq_knl = knl
    knl = lp.add_prefetch(knl, "D", ["m", "j", "i","o"], default_tag="l.auto")
    knl = lp.add_prefetch(knl, "u", ["i", "j",  "o"], default_tag="l.auto")
    knl = lp.precompute(knl, "ur", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.precompute(knl, "us", np.float32, ["a", "b"], default_tag="l.auto")
    knl = lp.split_iname(knl, "e", 1, outer_tag="g.0")#, slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))
    knl = lp.tag_inames(knl, dict(o="unr"))
    knl = lp.tag_inames(knl, dict(m="unr"))


#    knl = lp.add_prefetch(knl, "G", [2,3], default_tag=None) # axis/argument indices on G
    knl = lp.add_prefetch(knl, "G", [2,3], default_tag="l.auto") # axis/argument indices on G

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000))

    K = 1000
    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=K*(n*n*n*2*2 + n*n*2*3 + n**3 * 2*2)/1e9,
            op_label="GFlops",
            parameters={"K": K})
Example #7
def test_double_hw_axes_used_in_knl_call(inline):
    from loopy.diagnostic import LoopyError

    twice = lp.make_function("{[i]: 0<=i<10}",
                             """
            y[i] = 2*x[i]
            """,
                             name="twice")

    knl = lp.make_kernel("{[i]: 0<=i<10}",
                         """
            y[:, i] = twice(x[:, i])
            """, [
                             lp.GlobalArg("x", shape=(10, 10), dtype=float),
                             lp.GlobalArg("y", shape=(10, 10))
                         ],
                         name="outer")

    twice = lp.tag_inames(twice, {"i": "l.0"})
    knl = lp.tag_inames(knl, {"i": "l.0"})
    knl = lp.merge([knl, twice])

    if inline:
        knl = lp.inline_callable_kernel(knl, "twice")

    with pytest.raises(LoopyError):
        lp.generate_code_v2(knl)
Example #8
    def variant_simple_gpu(knl):
        # This is a simple GPU-ish variant.

        # It's not the same thing as Matt's code, but I'll need some more time
        # to reverse-engineer what is going on there. Some discussion might
        # help, too. :)

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc",
                outer_tag="g.0")
        knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"})
        return knl, ["K", "i", "j", "q", "ax_b_insn"]
Example #9
    def variant_simple_gpu(knl):
        # This is a simple GPU-ish variant.

        # It's not the same thing as Matt's code, but I'll need some more time
        # to reverse-engineer what is going on there. Some discussion might
        # help, too. :)

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc",
                outer_tag="g.0")
        knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"})
        return knl, ["K", "i", "j", "q", "ax_b_insn"]
Example #10
    def transform_loopy_program(self, program):
        # FIXME: This could be much smarter.
        import loopy as lp
        # accommodate loopy with and without kernel callables
        try:
            all_inames = program.all_inames()
        except AttributeError:
            all_inames = program.root_kernel.all_inames()

        inner_iname = None
        if "iel" not in all_inames and "i0" in all_inames:
            outer_iname = "i0"

            if "i1" in all_inames:
                inner_iname = "i1"
        elif "iel" in all_inames:
            outer_iname = "iel"

            if "idof" in all_inames:
                inner_iname = "idof"
        else:
            # cannot "fit" the optimization strategy for the provided kernel
            # => bail
            return program

        if inner_iname is not None:
            program = lp.split_iname(program, inner_iname, 16, inner_tag="l.0")
        return lp.tag_inames(program, {outer_iname: "g.0"})
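
Applied to a kernel that uses the iel/idof names this transform looks for, the strategy above boils down to the following sketch (nelements, ndofs and the array names here are made up for illustration):

    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
        "{[iel, idof]: 0<=iel<nelements and 0<=idof<ndofs}",
        "out[iel, idof] = 2*u[iel, idof]")
    knl = lp.add_and_infer_dtypes(knl, dict(u=np.float64))

    # Same strategy as transform_loopy_program: the element loop goes on the
    # grid axis, the dof loop is split into chunks of 16 on the local axis.
    knl = lp.split_iname(knl, "idof", 16, inner_tag="l.0")
    knl = lp.tag_inames(knl, {"iel": "g.0"})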
Example #11
def test_precompute_nested_subst(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        E:=a[i]
        D:=E*E
        b[i] = D
        """)

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    ref_knl = knl

    knl = lp.tag_inames(knl, dict(j="g.1"))
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
    knl = lp.precompute(knl, "D", "i_inner")

    # There's only one surviving 'E' rule.
    assert len([
        rule_name
        for rule_name in knl.substitutions
        if rule_name.startswith("E")]) == 1

    # That rule should use the newly created prefetch inames,
    # not the prior 'i_inner'
    assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression)

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(n=12345))
Example #12
def element_centers_of_mass(discr):
    knl = lp.make_kernel(
        """{[dim,k,i]:
            0<=dim<ndims and
            0<=k<nelements and
            0<=i<nunit_nodes}""",
        """
            panels[dim, k] = sum(i, nodes[dim, k, i])/nunit_nodes
            """,
        default_offset=lp.auto, name="find_panel_centers_of_mass",
        lang_version=MOST_RECENT_LANGUAGE_VERSION)

    knl = lp.fix_parameters(knl, ndims=discr.ambient_dim)

    knl = lp.split_iname(knl, "k", 128, inner_tag="l.0", outer_tag="g.0")
    knl = lp.tag_inames(knl, dict(dim="ilp"))

    with cl.CommandQueue(discr.cl_context) as queue:
        mesh = discr.mesh
        panels = cl.array.empty(queue, (mesh.ambient_dim, mesh.nelements),
                                dtype=discr.real_dtype)
        for group_nr, group in enumerate(discr.groups):
            _, (result,) = knl(queue,
                nelements=group.nelements,
                nunit_nodes=group.nunit_nodes,
                nodes=group.view(discr.nodes()),
                panels=el_view(discr, group_nr, panels))
        panels.finish()
        panels = panels.with_queue(None)
        return tuple(panels[d, :] for d in range(mesh.ambient_dim))
Example #13
def test_global_parallel_reduction_simpler(ctx_factory, size):
    ctx = ctx_factory()

    pytest.xfail("very sensitive to kernel ordering, fails unused hw-axis check")

    knl = lp.make_kernel(
            "{[l,g,j]: 0 <= l < nl and 0 <= g,j < ng}",
            """
            <> key = make_uint2(l+nl*g, 1234)  {inames=l:g}
            <> ctr = make_uint4(0, 1, 2, 3)  {inames=l:g,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}

            <> tmp[g] = sum(l, vals.s0 + 1j*vals.s1 + vals.s2 + 1j*vals.s3)

            result = sum(j, tmp[j])
            """)

    ng = 50
    knl = lp.fix_parameters(knl, ng=ng)

    knl = lp.set_options(knl, write_cl=True)

    ref_knl = knl

    knl = lp.split_iname(knl, "l", 128, inner_tag="l.0")
    knl = lp.split_reduction_outward(knl, "l_inner")
    knl = lp.tag_inames(knl, "g:g.0,j:l.0")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nl": size})
Example #14
    def get_kernel(self):
        if self.radius_factor < 1.0:
            radius_expr = "(1.0 - {f}) * rblk + {f} * rqbx"
        else:
            radius_expr = "{f} * rqbx"
        radius_expr = radius_expr.format(f=self.radius_factor)

        # NOTE: centers of mass are computed using a second-order approximation
        knl = lp.make_kernel([
            "{[irange]: 0 <= irange < nranges}",
            "{[i]: 0 <= i < npoints}",
            "{[idim]: 0 <= idim < dim}"
            ],
            ["""
            for irange
                <> ioffset = srcranges[irange]
                <> npoints = srcranges[irange + 1] - srcranges[irange]

                proxy_center[idim, irange] = 1.0 / npoints * \
                    reduce(sum, i, sources[idim, srcindices[i + ioffset]]) \
                        {{dup=idim:i}}

                <> rblk = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
                        (proxy_center[idim, irange] -
                         sources[idim, srcindices[i + ioffset]]) ** 2)))

                <> rqbx_int = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
                        (proxy_center[idim, irange] -
                         center_int[idim, srcindices[i + ioffset]]) ** 2)) + \
                         expansion_radii[srcindices[i + ioffset]])
                <> rqbx_ext = simul_reduce(max, i, sqrt(simul_reduce(sum, idim, \
                        (proxy_center[idim, irange] -
                         center_ext[idim, srcindices[i + ioffset]]) ** 2)) + \
                         expansion_radii[srcindices[i + ioffset]])
                <> rqbx = rqbx_int if rqbx_ext < rqbx_int else rqbx_ext

                proxy_radius[irange] = {radius_expr}
            end
            """.format(radius_expr=radius_expr)],
            [
                lp.GlobalArg("sources", None,
                    shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
                lp.GlobalArg("center_int", None,
                    shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
                lp.GlobalArg("center_ext", None,
                    shape=(self.ambient_dim, "nsources"), dim_tags="sep,C"),
                lp.GlobalArg("proxy_center", None,
                    shape=(self.ambient_dim, "nranges")),
                lp.GlobalArg("proxy_radius", None,
                    shape="nranges"),
                lp.ValueArg("nsources", np.int64),
                "..."
            ],
            name="find_proxy_radii_knl",
            assumptions="dim>=1 and nranges>=1",
            fixed_parameters=dict(dim=self.ambient_dim),
            lang_version=MOST_RECENT_LANGUAGE_VERSION)

        knl = lp.tag_inames(knl, "idim*:unr")
        return knl
Example #15
def test_ilp_write_race_detection_global(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "[n] -> {[i,j]: 0<=i,j<n }",
            [
                "a[i] = 5+i+j",
                ],
            [
                lp.GlobalArg("a", np.float32),
                lp.ValueArg("n", np.int32, approximately=1000),
                ],
            assumptions="n>=1")

    knl = lp.tag_inames(knl, dict(j="ilp"))

    knl = lp.preprocess_kernel(knl, ctx.devices[0])

    with lp.CacheMode(False):
        from loopy.diagnostic import WriteRaceConditionWarning
        from warnings import catch_warnings
        with catch_warnings(record=True) as warn_list:
            list(lp.generate_loop_schedules(knl))

            assert any(isinstance(w.message, WriteRaceConditionWarning)
                    for w in warn_list)
Example #16
    def variant_simple_padding(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))

        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")

        arg_names = [
                prefix+name
                for name in ["u", "v", "w", "p"]
                for prefix in ["", "rhs"]]

        for name in arg_names:
            knl = lp.add_padding(knl, name, axis=0, align_bytes=32)

        knl = lp.tag_inames(knl, dict(m="unr"))

        return knl
Example #17
    def variant_simple_padding(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))

        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")

        arg_names = [
                prefix+name
                for name in ["u", "v", "w", "p"]
                for prefix in ["", "rhs"]]

        for name in arg_names:
            knl = lp.add_padding(knl, name, axis=0, align_bytes=32)

        knl = lp.tag_inames(knl, dict(m="unr"))

        return knl
Example #18
    def variant_prefetch_fields(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))
        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
        for name in ["u", "v", "w", "p"]:
            knl = lp.add_prefetch(knl, "%s[k,:]" % name, ["k_inner"])

        return knl
Example #19
        def knl():
            import loopy as lp
            knl = lp.make_kernel(
                """{[k,i,j]:
                    0<=k<nelements and
                    0<=i<n_to_nodes and
                    0<=j<n_from_nodes}""",
                "result[target_element_indices[k], i] \
                    = sum(j, resample_mat[i, j] \
                    * vec[source_element_indices[k], j])",
                [
                    lp.GlobalArg("result", None,
                        shape="nelements_result, n_to_nodes",
                        offset=lp.auto),
                    lp.GlobalArg("vec", None,
                        shape="nelements_vec, n_from_nodes",
                        offset=lp.auto),
                    lp.ValueArg("nelements_result", np.int32),
                    lp.ValueArg("nelements_vec", np.int32),
                    "...",
                    ],
                name="oversample")

            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
            return lp.tag_inames(knl, dict(k="g.0"))
Example #20
    def variant_prefetch_fields(knl):
        knl = lp.tag_inames(knl, dict(n="l.0"))
        knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
        for name in ["u", "v", "w", "p"]:
            knl = lp.add_prefetch(knl, "%s[k,:]" % name, ["k_inner"])

        return knl
Example #21
 def variant_prefetch_d(knl):
     knl = lp.tag_inames(knl, dict(n="l.0"))
     knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
     knl = lp.add_prefetch(knl, "DrDsDt[:,:]",
             fetch_outer_inames='k_outer',
             default_tag="l.auto")
     return knl
Example #22
 def variant_2(knl):
     knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0")
     knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1")
     knl = lp.tag_inames(knl, dict(ifeat="g.2"))
     knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]")
     knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y")
     return knl
Example #23
 def variant_2(knl):
     knl = lp.split_iname(knl, "im_x", 16, outer_tag="g.0", inner_tag="l.0")
     knl = lp.split_iname(knl, "im_y", 16, outer_tag="g.1", inner_tag="l.1")
     knl = lp.tag_inames(knl, dict(ifeat="g.2"))
     knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]")
     knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y")
     return knl
Example #24
def test_nested_scan(ctx_factory, i_tag, j_tag):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        [
            "[n] -> {[i]: 0 <= i < n}",
            "[i] -> {[j]: 0 <= j <= i}",
            "[i] -> {[k]: 0 <= k <= i}"
        ],
        """
        <>tmp[i] = sum(k, 1)
        out[i] = sum(j, tmp[j])
        """)

    knl = lp.fix_parameters(knl, n=10)
    knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag))

    knl = lp.realize_reduction(knl, force_scan=True)

    print(knl)

    evt, (out,) = knl(queue)

    print(out)
Example #25
        def pick_knl():
            import loopy as lp
            knl = lp.make_kernel(
                """{[k,i,j]:
                    0<=k<nelements and
                    0<=i<n_to_nodes}""",
                "result[to_element_indices[k], i] \
                    = vec[from_element_indices[k], pick_list[i]]", [
                    lp.GlobalArg("result",
                                 None,
                                 shape="nelements_result, n_to_nodes",
                                 offset=lp.auto),
                    lp.GlobalArg("vec",
                                 None,
                                 shape="nelements_vec, n_from_nodes",
                                 offset=lp.auto),
                    lp.ValueArg("nelements_result", np.int32),
                    lp.ValueArg("nelements_vec", np.int32),
                    lp.ValueArg("n_from_nodes", np.int32),
                    "...",
                ],
                name="resample_by_picking")

            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
            return lp.tag_inames(knl, dict(k="g.0"))
Example #26
def test_vector_types(ctx_factory, vec_len):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{ [i,j]: 0<=i<n and 0<=j<vec_len }",
            "out[i,j] = 2*a[i,j]",
            [
                lp.GlobalArg("a", np.float32, shape=lp.auto),
                lp.GlobalArg("out", np.float32, shape=lp.auto),
                "..."
                ])

    knl = lp.fix_parameters(knl, vec_len=vec_len)

    ref_knl = knl

    knl = lp.tag_data_axes(knl, "out", "c,vec")
    knl = lp.tag_inames(knl, dict(j="unr"))

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    lp.auto_test_vs_ref(ref_knl, ctx, knl,
            parameters=dict(
                n=20000
                ))
Example #27
 def parallelize(self, knl, lsize):
     # global hist is zeroed in its own kernel
     knl = lp.split_iname(knl,
                          "bb",
                          lsize[0],
                          outer_tag="g.0",
                          inner_tag="l.0",
                          within="id:zero_hist*")
     # outer_tag of loops writing/reading temp_hist cannot be a global index
     knl = lp.split_iname(knl,
                          "bb",
                          lsize[0],
                          inner_tag="l.0",
                          within="id:zero_temp*")
     knl = lp.split_iname(knl,
                          "k",
                          lsize[0],
                          inner_tag="l.0",
                          within="not id:zero* and not id:glb*")
     knl = lp.split_iname(knl,
                          "b",
                          lsize[0],
                          inner_tag="l.0",
                          within="id:glb*")
     knl = lp.tag_inames(knl, "j:g.0")
     return knl
Example #28
def test_nested_scan(ctx_factory, i_tag, j_tag):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        [
            "[n] -> {[i]: 0 <= i < n}",
            "[i] -> {[j]: 0 <= j <= i}",
            "[i] -> {[k]: 0 <= k <= i}"
        ],
        """
        <>tmp[i] = sum(k, 1)
        out[i] = sum(j, tmp[j])
        """)

    knl = lp.fix_parameters(knl, n=10)
    knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag))

    knl = lp.realize_reduction(knl, force_scan=True)

    print(knl)

    evt, (out,) = knl(queue)

    print(out)
Example #29
        def pick_knl():
            import loopy as lp
            knl = lp.make_kernel(
                """{[k,i,j]:
                    0<=k<nelements and
                    0<=i<n_to_nodes}""",
                "result[to_element_indices[k], i] \
                    = vec[from_element_indices[k], pick_list[i]]",
                [
                    lp.GlobalArg("result", None,
                        shape="nelements_result, n_to_nodes",
                        offset=lp.auto),
                    lp.GlobalArg("vec", None,
                        shape="nelements_vec, n_from_nodes",
                        offset=lp.auto),
                    lp.ValueArg("nelements_result", np.int32),
                    lp.ValueArg("nelements_vec", np.int32),
                    lp.ValueArg("n_from_nodes", np.int32),
                    "...",
                    ],
                name="resample_by_picking",
                lang_version=MOST_RECENT_LANGUAGE_VERSION)

            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
            return lp.tag_inames(knl, dict(k="g.0"))
Example #30
        def mat_knl():
            import loopy as lp
            knl = lp.make_kernel(
                """{[k,i,j]:
                    0<=k<nelements and
                    0<=i<n_to_nodes and
                    0<=j<n_from_nodes}""",
                "result[to_element_indices[k], i] \
                    = sum(j, resample_mat[i, j] \
                    * vec[from_element_indices[k], j])",
                [
                    lp.GlobalArg("result", None,
                        shape="nelements_result, n_to_nodes",
                        offset=lp.auto),
                    lp.GlobalArg("vec", None,
                        shape="nelements_vec, n_from_nodes",
                        offset=lp.auto),
                    lp.ValueArg("nelements_result", np.int32),
                    lp.ValueArg("nelements_vec", np.int32),
                    "...",
                    ],
                name="resample_by_mat",
                lang_version=MOST_RECENT_LANGUAGE_VERSION)

            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
            return lp.tag_inames(knl, dict(k="g.0"))
Example #31
def test_precompute_nested_subst(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}", """
        E:=a[i]
        D:=E*E
        b[i] = D
        """)

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    ref_knl = knl

    knl = lp.tag_inames(knl, dict(j="g.1"))
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
    knl = lp.precompute(knl, "D", "i_inner")

    # There's only one surviving 'E' rule.
    assert len([
        rule_name for rule_name in knl.substitutions
        if rule_name.startswith("E")
    ]) == 1

    # That rule should use the newly created prefetch inames,
    # not the prior 'i_inner'
    assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression)

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=12345))
Example #32
def test_precompute_confusing_subst_arguments(ctx_factory):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<5}",
        """
        D(i):=a[i+1]-a[i]
        b[i,j] = D(j)
        """)

    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    ref_knl = knl

    knl = lp.tag_inames(knl, dict(j="g.1"))
    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")

    from loopy.symbolic import get_dependencies
    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
    knl = lp.precompute(knl, "D", sweep_inames="j",
            precompute_outer_inames="j, i_inner, i_outer")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl,
            parameters=dict(n=12345))
Example #33
def test_gmem_access_counter_consec():

    knl = lp.make_kernel("[n,m,l] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<l}",
                         [
                             """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
            e[i, k] = g[i,k]*(2+h[i,k])
            """
                         ],
                         name="consec",
                         assumptions="n,m,l >= 1")
    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"})

    poly = lp.get_gmem_access_poly(knl)
    n = 512
    m = 256
    l = 128
    params = {'n': n, 'm': m, 'l': l}

    f64consec = poly[(np.dtype(np.float64), 'consecutive',
                      'load')].eval_with_dict(params)
    f32consec = poly[(np.dtype(np.float32), 'consecutive',
                      'load')].eval_with_dict(params)
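    # e[i, k] = g[i,k]*(2+h[i,k]) loads g and h once each per (i, k) iteration,
    # i.e. 2*n*m float64 loads; c[i, j, k] = a*b/3.0+a loads a twice and b once
    # per (i, j, k) iteration, i.e. 3*n*m*l float32 loads.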
    assert f64consec == 2 * n * m
    assert f32consec == 3 * n * m * l

    f64consec = poly[(np.dtype(np.float64), 'consecutive',
                      'store')].eval_with_dict(params)
    f32consec = poly[(np.dtype(np.float32), 'consecutive',
                      'store')].eval_with_dict(params)
    assert f64consec == n * m
    assert f32consec == n * m * l
Example #34
 def test_split_iname2(self):
     "Split useful for omp simd inner loop"
     knl = lp.split_iname(self.knl, 'i', 8)
     knl = lp.tag_inames(knl, [(
         'i_inner',
         'ilp.unr',
     )])
     print(self._dtype_and_code(knl))
Example #35
    def variant_pg4(knl):
        # This (mostly) reproduces the unlabeled code snippet on pg. 4.

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc")
        return knl, ["Ko", "Kloc", "i", "j", "q", "ax_b_insn"]
Example #36
def test_parallel_multi_output_reduction():
    knl = lp.make_kernel(
        "{[i]: 0<=i<128}", """
                max_val, max_indices = argmax(i, fabs(a[i]))
                """)
    knl = lp.tag_inames(knl, dict(i="l.0"))
    knl = lp.realize_reduction(knl)
    print(knl)
Example #37
        def knl():
            knl = lp.make_kernel(
                "{[k,i]: 0<=k<nelements and 0<=i<ndiscr_nodes}",
                "result[k,i] = weights[i]",
                name="quad_weights")

            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
            return lp.tag_inames(knl, dict(k="g.0"))
Example #38
    def variant_pg4(knl):
        # This (mostly) reproduces the unlabeled code snippet on pg. 4.

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc")
        return knl, ["Ko", "Kloc", "i", "j", "q", "ax_b_insn"]
Example #39
def test_interp_diff(ctx_factory):
    1/0 # not ready
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8
    M = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (N, N, N, K_sym)
    interim_field_shape = (M, M, M, K_sym)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal 
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,ip,j,jp,k,kp,e]: 0<=i,j,k<%d and 0<=ip,jp,kp<%d and 0<=e<K}" % (M, N),
            [
                "[|i,jp,kp] <float32>  u1[i ,jp,kp,e] = sum_float32(ip, I[i,ip]*u [ip,jp,kp,e])",
                "[|i,j ,kp] <float32>  u2[i ,j ,kp,e] = sum_float32(jp, I[j,jp]*u1[i ,jp,kp,e])",
                "[|i,j ,k ] <float32>  u3[i ,j ,k ,e] = sum_float32(kp, I[k,kp]*u2[i ,j ,kp,e])",
                "[|i,j ,k ] <float32>  Pu[i ,j ,k ,e] = P[i,j,k,e]*u3[i,j,k,e]",
                "[|i,j ,kp] <float32> Pu3[i ,j ,kp,e] = sum_float32(k, V[kp,k]*Pu[i ,j , k,e])",
                "[|i,jp,kp] <float32> Pu2[i ,jp,kp,e] = sum_float32(j, V[jp,j]*Pu[i ,j ,kp,e])",
                "Pu[ip,jp,kp,e] = sum_float32(i, V[ip,i]*Pu[i ,jp,kp,e])",
                ],
            [
            lp.GlobalArg("u",   dtype, shape=field_shape, order=order),
            lp.GlobalArg("P",   dtype, shape=interim_field_shape, order=order),
            lp.GlobalArg("I",   dtype, shape=(M, N), order=order),
            lp.GlobalArg("V",   dtype, shape=(N, M), order=order),
            lp.GlobalArg("Pu",  dtype, shape=field_shape, order=order),
            lp.ValueArg("K",  np.int32, approximately=1000),
            ],
            name="sem_lap_precon", assumptions="K>=1")

    print(knl)
    1/0

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    print(knl)
    #1/0

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)

    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=0,
            op_label="GFlops",
            parameters={"K": K}, print_seq_code=True,)
Example #40
def test_interp_diff(ctx_factory):
    1/0 # not ready
    dtype = np.float32
    ctx = ctx_factory()
    order = "C"

    N = 8
    M = 8

    from pymbolic import var
    K_sym = var("K")

    field_shape = (N, N, N, K_sym)
    interim_field_shape = (M, M, M, K_sym)

    # 1. direction-by-direction similarity transform on u
    # 2. invert diagonal 
    # 3. transform back (direction-by-direction)

    # K - run-time symbolic
    knl = lp.make_kernel(ctx.devices[0],
            "[K] -> {[i,ip,j,jp,k,kp,e]: 0<=i,j,k<%d and 0<=ip,jp,kp<%d and 0<=e<K}" % (M, N),
            [
                "[|i,jp,kp] <float32>  u1[i ,jp,kp,e] = sum_float32(ip, I[i,ip]*u [ip,jp,kp,e])",
                "[|i,j ,kp] <float32>  u2[i ,j ,kp,e] = sum_float32(jp, I[j,jp]*u1[i ,jp,kp,e])",
                "[|i,j ,k ] <float32>  u3[i ,j ,k ,e] = sum_float32(kp, I[k,kp]*u2[i ,j ,kp,e])",
                "[|i,j ,k ] <float32>  Pu[i ,j ,k ,e] = P[i,j,k,e]*u3[i,j,k,e]",
                "[|i,j ,kp] <float32> Pu3[i ,j ,kp,e] = sum_float32(k, V[kp,k]*Pu[i ,j , k,e])",
                "[|i,jp,kp] <float32> Pu2[i ,jp,kp,e] = sum_float32(j, V[jp,j]*Pu[i ,j ,kp,e])",
                "Pu[ip,jp,kp,e] = sum_float32(i, V[ip,i]*Pu[i ,jp,kp,e])",
                ],
            [
            lp.GlobalArg("u",   dtype, shape=field_shape, order=order),
            lp.GlobalArg("P",   dtype, shape=interim_field_shape, order=order),
            lp.GlobalArg("I",   dtype, shape=(M, N), order=order),
            lp.GlobalArg("V",   dtype, shape=(N, M), order=order),
            lp.GlobalArg("Pu",  dtype, shape=field_shape, order=order),
            lp.ValueArg("K",  np.int32, approximately=1000),
            ],
            name="sem_lap_precon", assumptions="K>=1")

    print(knl)
    1/0

    knl = lp.split_iname(knl, "e", 16, outer_tag="g.0")#, slabs=(0, 1))

    knl = lp.tag_inames(knl, dict(i="l.0", j="l.1"))

    print(knl)
    #1/0

    kernel_gen = lp.generate_loop_schedules(knl)
    kernel_gen = lp.check_kernels(kernel_gen, dict(K=1000), kill_level_min=5)

    lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen,
            op_count=0,
            op_label="GFlops",
            parameters={"K": K}, print_seq_code=True,)
Example #41
 def transform_surface_kernel(knl):
     #print knl
     knl = lp.tag_inames(knl, dict(k="g.0", n="l.0", m="l.0"))
     knl = lp.split_iname(knl, "mp", 4, inner_tag="unr")
     knl = lp.add_prefetch(knl, "LIFT")
     for name in ["nx", "ny", "nz", "Fscale", "bc"]:
         knl = lp.add_prefetch(knl, name)
     knl = lp.set_loop_priority(knl, "mp_outer,mp_inner")
     return knl
Example #42
    def variant_simple_gpu_prefetch(knl):
        # This adds prefetching to the GPU variant above.

        # In this variant (on my machine), loopy makes a silly choice
        # for the upper bound of Kloc (it uses Nc). I'll investigate and
        # fix that. (FIXME)

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc",
                outer_tag="g.0")
        knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"})
        knl = lp.add_prefetch(knl, "w", ["q"], default_tag="l.auto")
        knl = lp.add_prefetch(knl, "DPsi", [0, 1, 2], default_tag="l.auto")
        knl = lp.add_prefetch(knl, "jacInv", [0, 1, 3], default_tag="l.auto")
        knl = lp.add_prefetch(knl, "jacDet", [1], default_tag="l.auto")
        return knl, ["K", "i", "j", "q", "ax_b_insn"]
Example #43
        def knl():
            knl = lp.make_kernel(
                "{[k,i]: 0<=k<nelements and 0<=i<ndiscr_nodes}",
                "result[k,i] = weights[i]",
                name="quad_weights",
                default_offset=lp.auto)

            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
            return lp.tag_inames(knl, dict(k="g.0"))
Example #44
 def transform_surface_kernel(knl):
     #print knl
     knl = lp.tag_inames(knl, dict(k="g.0", n="l.0", m="l.0"))
     knl = lp.split_iname(knl, "mp", 4, inner_tag="unr")
     knl = lp.add_prefetch(knl, "LIFT")
     for name in ["nx", "ny", "nz", "Fscale", "bc"]:
         knl = lp.add_prefetch(knl, name)
     knl = lp.set_loop_priority(knl, "mp_outer,mp_inner")
     return knl
Example #45
    def variant_simple_gpu_prefetch(knl):
        # This adds prefetching to the GPU variant above.

        # In this variant (on my machine), loopy makes a silly choice
        # for the upper bound of Kloc (it uses Nc). I'll investigate and
        # fix that. (FIXME)

        knl = lp.tag_inames(knl, {"dx_axis": "unr"})
        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc",
                outer_tag="g.0")
        knl = lp.tag_inames(knl, {"i": "l.1", "j": "l.0"})
        knl = lp.add_prefetch(knl, "w", ["q"])
        knl = lp.add_prefetch(knl, "DPsi", [0, 1, 2])
        knl = lp.add_prefetch(knl, "jacInv", [0, 1, 3])
        knl = lp.add_prefetch(knl, "jacDet", [1])
        return knl, ["K", "i", "j", "q", "ax_b_insn"]
Example #46
 def knl():
     import loopy as lp
     knl = lp.make_kernel(
         "{[el, idof, jdof]: 0<=el<nelements and 0<=idof,jdof<ndofs}",
         "result[el, idof] = %s(jdof, input[el, jdof])" % reduction_name,
         default_offset=lp.auto,
         lang_version=MOST_RECENT_LANGUAGE_VERSION)
     knl = lp.tag_inames(knl, "el:g.0,idof:l.0")
     return knl
Example #47
def test_mem_access_counter_nonconsec():

    knl = lp.make_kernel(
        "[n,m,ell] -> {[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}", [
            """
            c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k]
            e[i, k] = g[i,k]*(2+h[i,k])
            """
        ],
        name="nonconsec",
        assumptions="n,m,ell >= 1")
    knl = lp.add_and_infer_dtypes(
        knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
    knl = lp.split_iname(knl, "i", 16)
    knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})

    mem_map = lp.get_mem_access_map(knl, count_redundant_work=True)  # noqa
    n = 512
    m = 256
    ell = 128
    params = {'n': n, 'm': m, 'ell': ell}
    f64nonconsec = mem_map[lp.MemAccess('global',
                                        np.float64,
                                        stride=Variable('m'),
                                        direction='load',
                                        variable='g')].eval_with_dict(params)
    f64nonconsec += mem_map[lp.MemAccess('global',
                                         np.float64,
                                         stride=Variable('m'),
                                         direction='load',
                                         variable='h')].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess('global',
                                        np.dtype(np.float32),
                                        stride=Variable('m') * Variable('ell'),
                                        direction='load',
                                        variable='a')].eval_with_dict(params)
    f32nonconsec += mem_map[lp.MemAccess('global',
                                         np.dtype(np.float32),
                                         stride=Variable('m') *
                                         Variable('ell'),
                                         direction='load',
                                         variable='b')].eval_with_dict(params)
    assert f64nonconsec == 2 * n * m
    assert f32nonconsec == 3 * n * m * ell

    f64nonconsec = mem_map[lp.MemAccess('global',
                                        np.float64,
                                        stride=Variable('m'),
                                        direction='store',
                                        variable='e')].eval_with_dict(params)
    f32nonconsec = mem_map[lp.MemAccess('global',
                                        np.float32,
                                        stride=Variable('m') * Variable('ell'),
                                        direction='store',
                                        variable='c')].eval_with_dict(params)
    assert f64nonconsec == n * m
    assert f32nonconsec == n * m * ell
Example #48
def test_batched_sparse():
    fortran_src = """
        subroutine sparse(rowstarts, colindices, values, m, n, nvecs, nvals, x, y)
          implicit none

          integer rowstarts(m+1), colindices(nvals)
          real*8 values(nvals)
          real*8 x(n, nvecs), y(n, nvecs), rowsum(nvecs)

          integer m, n, rowstart, rowend, length, nvals, nvecs
          integer i, j, k

          do i = 1, m
            rowstart = rowstarts(i)
            rowend = rowstarts(i+1)
            length = rowend - rowstart

            do k = 1, nvecs
              rowsum(k) = 0
            enddo
            do k = 1, nvecs
              do j = 1, length
                rowsum(k) = rowsum(k) + &
                  x(colindices(rowstart+j-1),k)*values(rowstart+j-1)
              end do
            end do
            do k = 1, nvecs
              y(i,k) = rowsum(k)
            end do
          end do
        end

        """

    knl, = lp.parse_fortran(fortran_src)

    knl = lp.split_iname(knl, "i", 128)
    knl = lp.tag_inames(knl, {"i_outer": "g.0"})
    knl = lp.tag_inames(knl, {"i_inner": "l.0"})
    knl = lp.add_prefetch(knl, "values",
            default_tag="l.auto")
    knl = lp.add_prefetch(knl, "colindices",
            default_tag="l.auto")
    knl = lp.fix_parameters(knl, nvecs=4)
Example #49
def test_batched_sparse():
    fortran_src = """
        subroutine sparse(rowstarts, colindices, values, m, n, nvecs, nvals, x, y)
          implicit none

          integer rowstarts(m+1), colindices(nvals)
          real*8 values(nvals)
          real*8 x(n, nvecs), y(n, nvecs), rowsum(nvecs)

          integer m, n, rowstart, rowend, length, nvals, nvecs
          integer i, j, k

          do i = 1, m
            rowstart = rowstarts(i)
            rowend = rowstarts(i+1)
            length = rowend - rowstart

            do k = 1, nvecs
              rowsum(k) = 0
            enddo
            do k = 1, nvecs
              do j = 1, length
                rowsum(k) = rowsum(k) + &
                  x(colindices(rowstart+j-1),k)*values(rowstart+j-1)
              end do
            end do
            do k = 1, nvecs
              y(i,k) = rowsum(k)
            end do
          end do
        end

        """

    knl, = lp.parse_fortran(fortran_src)

    knl = lp.split_iname(knl, "i", 128)
    knl = lp.tag_inames(knl, {"i_outer": "g.0"})
    knl = lp.tag_inames(knl, {"i_inner": "l.0"})
    knl = lp.add_prefetch(knl, "values",
            default_tag="l.auto")
    knl = lp.add_prefetch(knl, "colindices",
            default_tag="l.auto")
    knl = lp.fix_parameters(knl, nvecs=4)
Example #50
def test_rob_stroud_bernstein(ctx_factory):
    ctx = ctx_factory()

    # NOTE: tmp would have to be zero-filled beforehand

    knl = lp.make_kernel(
        "{[el, i2, alpha1,alpha2]: \
                    0 <= el < nels and \
                    0 <= i2 < nqp1d and \
                    0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }",
        """
            for el,i2
                <> xi = qpts[1, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init}

                for alpha1
                    <> w = s**(deg-alpha1) {id=init_w}

                    for alpha2
                        tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \
                                {id=write_tmp}
                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                                {id=update_w,dep=init_w:write_tmp}
                        aind = aind + 1 \
                                {id=aind_incr,dep=aind_init:write_tmp:update_w}
                    end
                end
            end
            """,
        [
            # Must declare coeffs to have "no" shape, to keep loopy
            # from trying to figure out the shape automatically.
            lp.GlobalArg("coeffs", None, shape=None),
            "..."
        ],
        assumptions="deg>=0 and nels>=1")

    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)
    knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
    knl = lp.split_iname(knl,
                         "el_outer",
                         2,
                         outer_tag="g.0",
                         inner_tag="ilp",
                         slabs=(0, 1))
    knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))

    print(
        lp.CompiledKernel(ctx, knl).get_highlighted_code(
            dict(
                qpts=np.float32,
                coeffs=np.float32,
                tmp=np.float32,
            )))
Example #51
        def knl():
            knl = lp.make_kernel(
                "{[k,i]: 0<=k<nelements and 0<=i<ndiscr_nodes}",
                "result[k,i] = weights[i]",
                name="quad_weights",
                default_offset=lp.auto,
                lang_version=MOST_RECENT_LANGUAGE_VERSION)

            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
            return lp.tag_inames(knl, dict(k="g.0"))
Example #52
        def knl():
            knl = lp.make_kernel(
                """{[k,i,j]:
                    0<=k<nelements and
                    0<=i,j<ndiscr_nodes}""",
                "result[k,i] = sum(j, diff_mat[i, j] * vec[k, j])",
                default_offset=lp.auto, name="diff")

            knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
            return lp.tag_inames(knl, dict(k="g.0"))
Example #53
    def variant_fig32(knl):
        # This (mostly) reproduces Figure 3.2.

        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc")
        knl = lp.precompute(knl, "dPsi", np.float32, ["i", "q", "dx_axis"],
                default_tag=None)
        knl = lp.tag_inames(knl, {"dx_axis": "unr", "dxi": "unr"})
        return knl, ["Ko", "Kloc", "dPsi_q", "ij", "i", "j", "q", "ax_b_insn"]
Example #54
    def variant_fig33(knl):
        # This is meant to (mostly) reproduce Figure 3.3.

        Ncloc = 16
        knl = lp.split_iname(knl, "K", Ncloc,
                outer_iname="Ko", inner_iname="Kloc")
        knl = lp.precompute(knl, "dPsi$one", np.float32, ["dx_axis"], default_tag=None)
        knl = lp.tag_inames(knl, {"j": "ilp.seq"})

        return knl, ["Ko", "Kloc"]
Example #55
File: p2e.py Project: inducer/sumpy
    def get_kernel(self):
        ncoeffs = len(self.expansion)

        from sumpy.tools import gather_loopy_source_arguments
        loopy_knl = lp.make_kernel(
                [
                    "{[isrc_box]: 0<=isrc_box<nsrc_boxes}",
                    "{[isrc,idim]: isrc_start<=isrc<isrc_end and 0<=idim<dim}",
                    ],
                ["""
                for isrc_box
                    <> src_ibox = source_boxes[isrc_box]
                    <> isrc_start = box_source_starts[src_ibox]
                    <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox]

                    <> center[idim] = centers[idim, src_ibox] {id=fetch_center}

                    for isrc
                        <> a[idim] = center[idim] - sources[idim, isrc] {dup=idim}

                        <> strength = strengths[isrc]
                        """] + self.get_loopy_instructions() + ["""
                    end
                    """] + ["""
                    tgt_expansions[src_ibox-tgt_base_ibox, {coeffidx}] = \
                            simul_reduce(sum, isrc, strength*coeff{coeffidx}) \
                            {{id_prefix=write_expn}}
                    """.format(coeffidx=i) for i in range(ncoeffs)] + ["""
                end
                """],
                [
                    lp.GlobalArg("sources", None, shape=(self.dim, "nsources"),
                        dim_tags="sep,c"),
                    lp.GlobalArg("strengths", None, shape="nsources"),
                    lp.GlobalArg("box_source_starts,box_source_counts_nonchild",
                        None, shape=None),
                    lp.GlobalArg("centers", None, shape="dim, aligned_nboxes"),
                    lp.ValueArg("rscale", None),
                    lp.GlobalArg("tgt_expansions", None,
                        shape=("nboxes", ncoeffs), offset=lp.auto),
                    lp.ValueArg("nboxes,aligned_nboxes,tgt_base_ibox", np.int32),
                    lp.ValueArg("nsources", np.int32),
                    "..."
                ] + gather_loopy_source_arguments([self.expansion]),
                name=self.name,
                assumptions="nsrc_boxes>=1",
                silenced_warnings="write_race(write_expn*)",
                default_offset=lp.auto,
                fixed_parameters=dict(dim=self.dim),
                lang_version=MOST_RECENT_LANGUAGE_VERSION)

        loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl)
        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")

        return loopy_knl
Example #56
        def transform_vol(knl):
            knl = lp.tag_inames(knl, dict(n="l.0", k="g.0"))
            #knl = lp.change_arg_to_image(knl, "DrDsDt")

            # knl = lp.split_iname(knl, "k", 3, outer_tag="g.0", inner_tag="l.1")
            for name in ["u", "v", "w", "p"]:
                knl = lp.add_prefetch(knl, "%s[k,:]" % name)
            for name in ["drst_dx", "drst_dy", "drst_dz"]:
                knl = lp.add_prefetch(knl, "%s" % name)
            knl = lp.add_prefetch(knl, "DrDsDt")
            return knl
Example #57
 def variant_gpu(knl):
     knl = lp.expand_subst(knl)
     knl = lp.split_iname(knl, "i", 256,
             outer_tag="g.0", inner_tag="l.0")
     knl = lp.split_iname(knl, "j", 256)
     knl = lp.add_prefetch(knl, "x[j,k]", ["j_inner", "k"],
             ["x_fetch_j", "x_fetch_k"], default_tag=None)
     knl = lp.tag_inames(knl, dict(x_fetch_k="unr", x_fetch_j="l.0"))
     knl = lp.add_prefetch(knl, "x[i,k]", ["k"], default_tag=None)
     knl = lp.prioritize_loops(knl, ["j_outer", "j_inner"])
     return knl
Example #58
def test_rob_stroud_bernstein(ctx_factory):
    ctx = ctx_factory()

    # NOTE: tmp would have to be zero-filled beforehand

    knl = lp.make_kernel(
            "{[el, i2, alpha1,alpha2]: \
                    0 <= el < nels and \
                    0 <= i2 < nqp1d and \
                    0 <= alpha1 <= deg and 0 <= alpha2 <= deg-alpha1 }",
            """
            for el,i2
                <> xi = qpts[1, i2]
                <> s = 1-xi
                <> r = xi/s
                <> aind = 0 {id=aind_init}

                for alpha1
                    <> w = s**(deg-alpha1) {id=init_w}

                    for alpha2
                        tmp[el,alpha1,i2] = tmp[el,alpha1,i2] + w * coeffs[aind] \
                                {id=write_tmp}
                        w = w * r * ( deg - alpha1 - alpha2 ) / (1 + alpha2) \
                                {id=update_w,dep=init_w:write_tmp}
                        aind = aind + 1 \
                                {id=aind_incr,dep=aind_init:write_tmp:update_w}
                    end
                end
            end
            """,
            [
                # Must declare coeffs to have "no" shape, to keep loopy
                # from trying to figure out the shape automatically.

                lp.GlobalArg("coeffs", None, shape=None),
                "..."
                ],
            assumptions="deg>=0 and nels>=1"
            )

    knl = lp.fix_parameters(knl, nqp1d=7, deg=4)
    knl = lp.split_iname(knl, "el", 16, inner_tag="l.0")
    knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
            slabs=(0, 1))
    knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))

    print(lp.CompiledKernel(ctx, knl).get_highlighted_code(
            dict(
                qpts=np.float32,
                coeffs=np.float32,
                tmp=np.float32,
                )))
Example #59
File: p2p.py Project: inducer/sumpy
    def get_kernel(self):
        loopy_insns, result_names = self.get_loopy_insns_and_result_names()
        kernel_exprs = self.get_kernel_exprs(result_names)
        arguments = (
            self.get_default_src_tgt_arguments()
            + [
                lp.GlobalArg("srcindices", None, shape="nresult"),
                lp.GlobalArg("tgtindices", None, shape="nresult"),
                lp.ValueArg("nresult", None)
            ]
            + [lp.GlobalArg("result_%d" % i, dtype, shape="nresult")
             for i, dtype in enumerate(self.value_dtypes)])

        loopy_knl = lp.make_kernel(
            "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}",
            self.get_kernel_scaling_assignments()
            # NOTE: itgt, isrc need to always be defined in case a statement
            # in loopy_insns or kernel_exprs needs them (e.g. hardcoded in
            # places like get_kernel_exprs)
            + ["""
                for imat
                    <> itgt = tgtindices[imat]
                    <> isrc = srcindices[imat]

                    <> d[idim] = targets[idim, itgt] - sources[idim, isrc]
            """]
            + ["""
                    <> is_self = (isrc == target_to_source[itgt])
                """ if self.exclude_self else ""]
            + loopy_insns + kernel_exprs
            + ["""
                    result_{i}[imat] = \
                        knl_{i}_scaling * pair_result_{i} \
                            {{id_prefix=write_p2p}}
                """.format(i=iknl)
                for iknl in range(len(self.kernels))]
            + ["end"],
            arguments,
            assumptions="nresult>=1",
            silenced_warnings="write_race(write_p2p*)",
            name=self.name,
            fixed_parameters=dict(dim=self.dim),
            lang_version=MOST_RECENT_LANGUAGE_VERSION)

        loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr")
        loopy_knl = lp.add_dtypes(loopy_knl,
            dict(nsources=np.int32, ntargets=np.int32))

        for knl in self.kernels:
            loopy_knl = knl.prepare_loopy_kernel(loopy_knl)

        return loopy_knl