Example #1
    def get_kernel(self, **kwargs):

        loopy_knl = lp.make_kernel(  # NOQA
            [
                "{ [ bid ] : 0 <= bid < n_boxes }",
                "{ [ nid ] : 0 <= nid < n_box_nodes }"
            ],
            [
                """
                for bid
                    <> box_id       = boxes[bid]
                    <> box_node_beg = box_node_starts[box_id]

                    result[bid] = sum(nid,
                                      func[box_node_beg + nid]
                                      * filter_multiplier[nid])
                end
                """
            ],
            [
                lp.ValueArg("n_box_nodes, n_boxes", np.int32),
                lp.GlobalArg("filter_multiplier", np.float64, "n_box_nodes"),
                lp.GlobalArg("func", np.float64, "n_box_nodes * n_boxes"),
                "...",
            ],
            name="box_filtered_sum",
            lang_version=(2018, 2),
        )

        loopy_knl = lp.set_options(loopy_knl, write_cl=False)
        loopy_knl = lp.set_options(loopy_knl, return_dict=True)

        return loopy_knl
Example #2
    def get_kernel(self, **kwargs):

        loopy_knl = lp.make_kernel(  # NOQA
            [
                "{ [ bid ] : 0 <= bid < n_boxes }",
                "{ [ mid ] : 0 <= mid < n_box_nodes }",
                "{ [ nid ] : 0 <= nid < n_box_nodes }"
            ],
            [
                """
                for bid
                    <> box_id       = boxes[bid]
                    <> box_node_beg = box_node_starts[box_id]

                    # Rescale weights based on template interval sizes.
                    # Not needed, since the rscl appears in both the numerator
                    # and the denominator and cancels out.
                    #
                    # <> box_level    = box_levels[box_id]
                    # <> box_extent   = root_extent * (1.0 / (2**box_level))
                    # <> weight_rscl  = (box_extent / 2.0)**dim

                    for mid

                        <> mode_id = box_node_beg + mid

                        for nid
                            <> user_node_id = user_node_ids[box_node_beg + nid]
                        end

                        result[mode_id] = sum(
                                              nid,
                                              (
                                              func[user_node_id]
                                              * weight[nid]
                                              * vandermonde[nid, mid]
                                              ) * filter_multiplier[nid]
                                             ) / normalizer[mid]
                    end
                end
                """
            ],
            [
                lp.ValueArg("n_box_nodes, n_boxes", np.int32),
                # lp.ValueArg("root_extent", np.float64),
                lp.GlobalArg("weight, normalizer, filter_multiplier",
                             np.float64, "n_box_nodes"),
                lp.GlobalArg("vandermonde", np.float64,
                             "n_box_nodes, n_box_nodes"),
                lp.GlobalArg("func", np.float64, "n_box_nodes * n_boxes"),
                "...",
            ],
            name="discrete_legendre_transform",
            lang_version=(2018, 2),
        )

        loopy_knl = lp.set_options(loopy_knl, write_cl=False)
        loopy_knl = lp.set_options(loopy_knl, return_dict=True)

        return loopy_knl
Example #3
    def get_kernel(self, **kwargs):

        extra_kernel_kwarg_types = ()
        if "extra_kernel_kwarg_types" in kwargs:
            extra_kernel_kwarg_types = kwargs["extra_kernel_kwarg_types"]

        eval_inames = frozenset(["itgt"])
        scalar_assignment = lp.Assignment(
            id=None,
            assignee="expr_val",
            expression=self.get_normalised_expr(),
            temp_var_type=None,
        )
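        # attach the itgt iname so the scalar evaluation runs inside the target loop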
        eval_insns = [
            insn.copy(within_inames=insn.within_inames | eval_inames)
            for insn in [scalar_assignment]
        ]

        loopy_knl = lp.make_kernel(  # NOQA
            "{ [itgt]: 0<=itgt<n_targets }",
            [
                """
                for itgt
                    VAR_ASSIGNMENT
                end
                """.replace("VAR_ASSIGNMENT",
                            self.get_variable_assignment_code())
            ] + eval_insns + [
                """
                for itgt
                    result[itgt] = expr_val
                end
                """
            ],
            [
                lp.ValueArg("dim, n_targets", np.int32),
                lp.GlobalArg("target_points", np.float64, "dim, n_targets"),
                lp.TemporaryVariable("expr_val", None, ()),
            ] + list(extra_kernel_kwarg_types) + [
                "...",
            ],
            name="eval_expr",
            lang_version=(2018, 2),
        )

        loopy_knl = lp.fix_parameters(loopy_knl, dim=self.dim)
        loopy_knl = lp.set_options(loopy_knl, write_cl=False)
        loopy_knl = lp.set_options(loopy_knl, return_dict=True)

        if self.function_manglers is not None:
            loopy_knl = lp.register_function_manglers(loopy_knl,
                                                      self.function_manglers)

        if self.preamble_generators is not None:
            loopy_knl = lp.register_preamble_generators(
                loopy_knl, self.preamble_generators)

        return loopy_knl
Example #4
def test_complex_support(ctx_factory, target):
    knl = lp.make_kernel("{[i, i1, i2]: 0<=i,i1,i2<10}",
                         """
            euler1[i]  = exp(3.14159265359j)
            euler2[i] = (2.7182818284 ** (3.14159265359j))
            euler1_real[i] = real(euler1[i])
            euler1_imag[i] = imag(euler1[i])
            real_times_complex[i] = in1[i]*(in2[i]*1j)
            real_plus_complex[i] = in1[i] + (in2[i]*1j)
            abs_complex[i] = abs(real_plus_complex[i])
            complex_div_complex[i] = (2jf + 7*in1[i])/(32jf + 37*in1[i])
            complex_div_real[i] = (2jf + 7*in1[i])/in1[i]
            real_div_complex[i] = in1[i]/(2jf + 7*in1[i])
            out_sum = sum(i1, 1.0*i1 + i1*1jf)*sum(i2, 1.0*i2 + i2*1jf)
            conj_out_sum = conj(out_sum)
            """, [
                             lp.GlobalArg("out_sum, euler1, real_plus_complex",
                                          is_input=False,
                                          shape=lp.auto), ...
                         ],
                         target=target(),
                         seq_dependencies=True)
    knl = lp.set_options(knl, "return_dict")

    n = 10

    in1 = np.random.rand(n)
    in2 = np.random.rand(n)

    kwargs = {"in1": in1, "in2": in2}

    if target == lp.PyOpenCLTarget:
        knl = lp.set_options(knl, "write_cl")
        cl_ctx = ctx_factory()
        evt, out = knl(cl.CommandQueue(cl_ctx), **kwargs)
    elif target == lp.ExecutableCTarget:
        evt, out = knl(**kwargs)
    else:
        raise NotImplementedError("unsupported target")

    np.testing.assert_allclose(out["euler1"], -1)
    np.testing.assert_allclose(out["euler2"], -1)
    np.testing.assert_allclose(out["euler1_real"], -1)
    np.testing.assert_allclose(out["euler1_imag"], 0, atol=1e-10)
    np.testing.assert_allclose(out["real_times_complex"], in1 * (in2 * 1j))
    np.testing.assert_allclose(out["real_plus_complex"], in1 + (in2 * 1j))
    np.testing.assert_allclose(out["abs_complex"],
                               np.abs(out["real_plus_complex"]))
    np.testing.assert_allclose(out["complex_div_complex"],
                               (2j + 7 * in1) / (32j + 37 * in1))
    np.testing.assert_allclose(out["complex_div_real"], (2j + 7 * in1) / in1)
    np.testing.assert_allclose(out["real_div_complex"], in1 / (2j + 7 * in1))
    np.testing.assert_allclose(out["out_sum"],
                               (0.5 * n * (n - 1) + 0.5 * n * (n - 1) * 1j)**2)
    np.testing.assert_allclose(out["conj_out_sum"],
                               (0.5 * n * (n - 1) - 0.5 * n * (n - 1) * 1j)**2)
Example #5
def test_shape_translation_through_sub_array_ref(ctx_factory, inline):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64)
    x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64)
    x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64)

    callee1 = lp.make_function("{[i]: 0<=i<6}",
                               """
            b[i] = 2*abs(a[i])
            """,
                               name="callee_fn1")

    callee2 = lp.make_function("{[i, j]: 0<=i<3 and 0 <= j < 2}",
                               """
            b[i, j] = 3*a[i, j]
            """,
                               name="callee_fn2")

    callee3 = lp.make_function("{[i]: 0<=i<6}",
                               """
            b[i] = 5*a[i]
            """,
                               name="callee_fn3")

    knl = lp.make_kernel(
        "{[i, j, k, l]:  0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}",
        """
            [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2])
            [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k])
            [l]: y3[l, l] = callee_fn3([l]: x3[l, l])
            """)

    knl = lp.merge([knl, callee1])
    knl = lp.merge([knl, callee2])
    knl = lp.merge([knl, callee3])

    if inline:
        knl = lp.inline_callable_kernel(knl, "callee_fn1")
        knl = lp.inline_callable_kernel(knl, "callee_fn2")
        knl = lp.inline_callable_kernel(knl, "callee_fn3")

    knl = lp.set_options(knl, "write_cl")
    knl = lp.set_options(knl, "return_dict")
    evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3)

    y1 = out_dict["y1"].get()
    y2 = out_dict["y2"].get()
    y3 = out_dict["y3"].get()

    assert (np.linalg.norm(y1 - 2 * x1.get())) < 1e-15
    assert (np.linalg.norm(y2 - 3 * x2.get())) < 1e-15
    assert (np.linalg.norm(np.diag(y3 - 5 * x3.get()))) < 1e-15
Example #6
def test_cuda_short_vector():
    knl = lp.make_kernel("{ [i]: 0<=i<n }",
                         "out[i] = 2*a[i]",
                         target=lp.CudaTarget())

    knl = lp.set_options(knl, write_code=True)
    knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
    knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
    knl = lp.tag_array_axes(knl, "a,out", "C,vec")

    knl = lp.set_options(knl, write_wrapper=True)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    print(lp.generate_code_v2(knl).device_code())
Example #7
def test_finite_difference_expr_subst(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    grid = np.linspace(0, 2*np.pi, 2048, endpoint=False)
    h = grid[1] - grid[0]
    u = cl.clmath.sin(cl.array.to_device(queue, grid))

    fin_diff_knl = lp.make_kernel(
        "{[i]: 1<=i<=n}",
        "out[i] = -(f[i+1] - f[i-1])/h",
        [lp.GlobalArg("out", shape="n+2"), "..."])

    flux_knl = lp.make_kernel(
        "{[j]: 1<=j<=n}",
        "f[j] = u[j]**2/2",
        [
            lp.GlobalArg("f", shape="n+2"),
            lp.GlobalArg("u", shape="n+2"),
            ])

    fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl],
            data_flow=[
                ("f", 1, 0)
                ])

    fused_knl = lp.set_options(fused_knl, write_cl=True)
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused_knl = lp.assignment_to_subst(fused_knl, "f")

    fused_knl = lp.set_options(fused_knl, write_cl=True)

    # This is the real test here: The automatically generated
    # shape expressions are '2+n' and the ones above are 'n+2'.
    # Is loopy smart enough to understand that these are equal?
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")

    gpu_knl = lp.split_iname(
            fused0_knl, "inew", 128, outer_tag="g.0", inner_tag="l.0")

    precomp_knl = lp.precompute(
            gpu_knl, "f_subst", "inew_inner", fetch_bounding_box=True)

    precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
    precomp_knl = lp.set_options(precomp_knl, return_dict=True)
    evt, _ = precomp_knl(queue, u=u, h=h)
Example #8
def test_cuda_short_vector():
    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "out[i] = 2*a[i]",
        target=lp.CudaTarget())

    knl = lp.set_options(knl, write_code=True)
    knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
    knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
    knl = lp.tag_array_axes(knl, "a,out", "C,vec")

    knl = lp.set_options(knl, write_wrapper=True)
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    print(lp.generate_code_v2(knl).device_code())
Example #9
def test_finite_difference_expr_subst(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    grid = np.linspace(0, 2 * np.pi, 2048, endpoint=False)
    h = grid[1] - grid[0]
    u = cl.clmath.sin(cl.array.to_device(queue, grid))

    fin_diff_knl = lp.make_kernel("{[i]: 1<=i<=n}",
                                  "out[i] = -(f[i+1] - f[i-1])/h",
                                  [lp.GlobalArg("out", shape="n+2"), "..."])

    flux_knl = lp.make_kernel("{[j]: 1<=j<=n}", "f[j] = u[j]**2/2", [
        lp.GlobalArg("f", shape="n+2"),
        lp.GlobalArg("u", shape="n+2"),
    ])

    fused_knl = lp.fuse_kernels([fin_diff_knl, flux_knl],
                                data_flow=[("f", 1, 0)])

    fused_knl = lp.set_options(fused_knl, write_cl=True)
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused_knl = lp.assignment_to_subst(fused_knl, "f")

    fused_knl = lp.set_options(fused_knl, write_cl=True)

    # This is the real test here: The automatically generated
    # shape expressions are '2+n' and the ones above are 'n+2'.
    # Is loopy smart enough to understand that these are equal?
    evt, _ = fused_knl(queue, u=u, h=np.float32(1e-1))

    fused0_knl = lp.affine_map_inames(fused_knl, "i", "inew", "inew+1=i")

    gpu_knl = lp.split_iname(fused0_knl,
                             "inew",
                             128,
                             outer_tag="g.0",
                             inner_tag="l.0")

    precomp_knl = lp.precompute(gpu_knl,
                                "f_subst",
                                "inew_inner",
                                fetch_bounding_box=True)

    precomp_knl = lp.tag_inames(precomp_knl, {"j_0_outer": "unr"})
    precomp_knl = lp.set_options(precomp_knl, return_dict=True)
    evt, _ = precomp_knl(queue, u=u, h=h)
Example #10
def test_fd_demo():
    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \
                + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
                + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]")
    #assumptions="n mod 16=0")
    knl = lp.split_iname(knl,
            "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl,
            "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.add_prefetch(knl, "u",
            ["i_inner", "j_inner"],
            fetch_bounding_box=True,
            default_tag="l.auto")

    #n = 1000
    #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32)

    knl = lp.set_options(knl, write_cl=True)
    knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32))
    code, inf = lp.generate_code(knl)
    print(code)

    assert "double" not in code
Example #11
def test_fd_demo():
    knl = lp.make_kernel(
        "{[i,j]: 0<=i,j<n}",
        "result[i+1,j+1] = u[i + 1, j + 1]**2 + -1 + (-4)*u[i + 1, j + 1] \
                + u[i + 1 + 1, j + 1] + u[i + 1 + -1, j + 1] \
                + u[i + 1, j + 1 + 1] + u[i + 1, j + 1 + -1]")
    #assumptions="n mod 16=0")
    knl = lp.split_iname(knl,
            "i", 16, outer_tag="g.1", inner_tag="l.1")
    knl = lp.split_iname(knl,
            "j", 16, outer_tag="g.0", inner_tag="l.0")
    knl = lp.add_prefetch(knl, "u",
            ["i_inner", "j_inner"],
            fetch_bounding_box=True,
            default_tag="l.auto")

    #n = 1000
    #u = cl.clrandom.rand(queue, (n+2, n+2), dtype=np.float32)

    knl = lp.set_options(knl, write_cl=True)
    knl = lp.add_and_infer_dtypes(knl, dict(u=np.float32))
    code, inf = lp.generate_code(knl)
    print(code)

    assert "double" not in code
Example #12
    def time_kernel(
        self,
        knl,
        param_dict,
    ):

        if param_dict is None:
            raise ValueError(
                "Wall time requires dictionary of kernel parameters.")

        ctx = self.get_cl_context(knl)
        queue = cl.CommandQueue(ctx)

        arg_arrays = create_rand_args(ctx, knl, param_dict)
        knl = lp.set_options(knl, no_numpy=True)
        compiled = lp.CompiledKernel(ctx, knl)

        wtimes = []

        import time
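        # warmup trials run first; their timings are dropped before averaging below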
        for t in range(self.n_time_trials + self.n_warmup_time_trials):
            queue.finish()
            tstart = time.time()
            evt, out = compiled(queue, **arg_arrays)
            queue.finish()
            tend = time.time()
            wtimes.append(tend - tstart)

        import numpy as np
        return np.average(wtimes[self.n_warmup_time_trials:])
Example #13
def test_random123(ctx_factory, tp):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    n = 150000

    knl = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            """
            <> key2 = make_uint2(i, 324830944) {inames=i}
            <> key4 = make_uint4(i, 324830944, 234181, 2233) {inames=i}
            <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
            <> real, ctr = philox4x32_TYPE(ctr, key2)  {id=realpart,dep=init_ctr}
            <> imag, ctr = threefry4x32_TYPE(ctr, key4)  {dep=init_ctr:realpart}

            out[i, 0] = real.s0 + 1j * imag.s0
            out[i, 1] = real.s1 + 1j * imag.s1
            out[i, 2] = real.s2 + 1j * imag.s2
            out[i, 3] = real.s3 + 1j * imag.s3
            """.replace("TYPE", tp))

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
    knl = lp.set_options(knl, write_cl=True)

    evt, (out,) = knl(queue, n=n)

    out = out.get()
    assert (out < 1).all()
    assert (0 <= out).all()
Example #14
    def get_wkb_knl(self):
        knl = lp.make_kernel(
            "[Nx, Ny, Nz] ->  { [i,j,k]: 0<=i<Nx and 0<=j<Ny and 0<=k<Nz }",
            """
            <> amp_1 = sqrt(- log(rands[0, i, j, k]))
            <> amp_2 = sqrt(- log(rands[2, i, j, k]))
            <> phs_1 = exp(1j * 2. * pi * rands[1, i, j, k])
            <> phs_2 = exp(1j * 2. * pi * rands[3, i, j, k])
            <> power = f_power[i, j, k]
            <> Lmode = phs_1 * amp_1 * sqrt(power)
            <> Rmode = phs_2 * amp_2 * sqrt(power)
            <> fk_ = (Lmode + Rmode) / sqrt2
            fk[i, j, k] = fk_
            dfk[i, j, k] = 1j * wk[i, j, k] * (Lmode - Rmode) / sqrt2 - hubble * fk_
            """,
            [
                lp.ValueArg("hubble", self.rdtype),
                lp.GlobalArg("fk, dfk", shape=lp.auto, dtype=self.cdtype), ...
            ],
            seq_dependencies=True,
            silenced_warnings=["inferred_iname"],
            lang_version=(2018, 2),
        )
        knl = lp.set_options(knl, return_dict=True)
        return knl
Example #15
def time_knl(knl, ctx, param_dict):
    def create_rand_args(ctx, knl, param_dict):
        queue = cl.CommandQueue(ctx)
        info = lp.generate_code_v2(knl).implemented_data_info
        args, arg_data = lp.auto_test.make_ref_args(knl, info, queue,
                                                    param_dict)
        args.clear()
        del args
        rand_args = lp.auto_test.make_args(knl, info, queue, arg_data,
                                           param_dict)
        del arg_data[:]
        del arg_data
        return rand_args

    queue = cl.CommandQueue(ctx)
    trial_wtimes = []
    arg_arrays = create_rand_args(ctx, knl, param_dict)
    knl = lp.set_options(knl, no_numpy=True)
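    # 2 warmup runs + 3 timed runs; the first two are discarded in the average below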
    for t in range(2 + 3):
        queue.finish()
        tstart = time.time()
        evt, out = knl(queue, **arg_arrays)
        queue.finish()
        tend = time.time()
        trial_wtimes.append(tend - tstart)
    return np.average(trial_wtimes[2:])
Example #16
def test_slab_decomposition_does_not_double_execute(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }",
        "a[i] = 2*a[i]",
        assumptions="n>=1")

    ref_knl = knl

    for outer_tag in ["for", "g.0"]:
        knl = ref_knl
        knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="unr",
                outer_tag=outer_tag)
        knl = lp.set_loop_priority(knl, "i_outer")

        a = cl.array.empty(queue, 20, np.float32)
        a.fill(17)
        a_ref = a.copy()
        a_knl = a.copy()

        knl = lp.set_options(knl, write_cl=True)
        print("TEST-----------------------------------------")
        knl(queue, a=a_knl)
        print("REF-----------------------------------------")
        ref_knl(queue, a=a_ref)
        print("DONE-----------------------------------------")

        print("REF", a_ref)
        print("KNL", a_knl)
        assert (a_ref == a_knl).get().all()

        print("_________________________________")
Example #17
def test_inf_support(ctx_factory, target, dtype):
    from loopy.symbolic import parse
    import math
    # See: https://github.com/inducer/loopy/issues/443 for some laughs
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel("{:}", [
        lp.Assignment(parse("out_inf"), math.inf),
        lp.Assignment(parse("out_neginf"), -math.inf)
    ], [
        lp.GlobalArg("out_inf", shape=lp.auto, dtype=dtype),
        lp.GlobalArg("out_neginf", shape=lp.auto, dtype=dtype)
    ],
                         target=target())

    knl = lp.set_options(knl, "return_dict")

    if target == lp.PyOpenCLTarget:
        _, out_dict = knl(queue)
        out_dict = {k: v.get() for k, v in out_dict.items()}
    elif target == lp.ExecutableCTarget:
        _, out_dict = knl()
    else:
        raise NotImplementedError("unsupported target")

    assert np.isinf(out_dict["out_inf"])
    assert np.isneginf(out_dict["out_neginf"])
Example #18
def test_random123(ctx_factory, tp):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    n = 150000

    knl = lp.make_kernel(
        "{ [i]: 0<=i<n }", """
            <> key2 = make_uint2(i, 324830944) {inames=i}
            <> key4 = make_uint4(i, 324830944, 234181, 2233) {inames=i}
            <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
            <> real, ctr = philox4x32_TYPE(ctr, key2)  {dep=init_ctr}
            <> imag, ctr = threefry4x32_TYPE(ctr, key4)  {dep=init_ctr}

            out[i, 0] = real.s0 + 1j * imag.s0
            out[i, 1] = real.s1 + 1j * imag.s1
            out[i, 2] = real.s2 + 1j * imag.s2
            out[i, 3] = real.s3 + 1j * imag.s3
            """.replace("TYPE", tp))

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
    knl = lp.set_options(knl, write_cl=True)

    evt, (out, ) = knl(queue, n=n)

    out = out.get()
    assert (out < 1).all()
    assert (0 <= out).all()
Example #19
def test_opencl_emits_ternary_operators_correctly(ctx_factory, target):
    # See: https://github.com/inducer/loopy/issues/390
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel("{:}",
                         """
            <> tmp1 = 3.1416
            <> tmp2 = 0.000
            y1 = 1729 if tmp1 else 1.414
            y2 = 42   if 2.7183 else 13
            y3 = 127 if tmp2 else 128
            """,
                         seq_dependencies=True,
                         target=target())

    knl = lp.set_options(knl, "return_dict")

    if target == lp.PyOpenCLTarget:
        evt, out_dict = knl(queue)
    elif target == lp.ExecutableCTarget:
        evt, out_dict = knl()
    else:
        raise NotImplementedError("unsupported target")

    assert out_dict["y1"] == 1729
    assert out_dict["y2"] == 42
    assert out_dict["y3"] == 128
Example #20
def test_global_parallel_reduction_simpler(ctx_factory, size):
    ctx = ctx_factory()

    pytest.xfail("very sensitive to kernel ordering, fails unused hw-axis check")

    knl = lp.make_kernel(
            "{[l,g,j]: 0 <= l < nl and 0 <= g,j < ng}",
            """
            <> key = make_uint2(l+nl*g, 1234)  {inames=l:g}
            <> ctr = make_uint4(0, 1, 2, 3)  {inames=l:g,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}

            <> tmp[g] = sum(l, vals.s0 + 1j*vals.s1 + vals.s2 + 1j*vals.s3)

            result = sum(j, tmp[j])
            """)

    ng = 50
    knl = lp.fix_parameters(knl, ng=ng)

    knl = lp.set_options(knl, write_cl=True)

    ref_knl = knl

    knl = lp.split_iname(knl, "l", 128, inner_tag="l.0")
    knl = lp.split_reduction_outward(knl, "l_inner")
    knl = lp.tag_inames(knl, "g:g.0,j:l.0")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"nl": size})
Example #21
def __set_editor(knl, script):
    # set the edit script as the 'editor'
    os.environ['EDITOR'] = script

    # turn on code editing
    edit_knl = lp.set_options(knl, edit_code=True)

    return edit_knl
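
A minimal usage sketch (the kernel and the script path below are made up, not part of the original example):

knl = lp.make_kernel("{[i]: 0<=i<n}", "out[i] = 2*a[i]")
edit_knl = __set_editor(knl, "./my_edit_script.sh")  # hypothetical editor script
# edit_code=True makes loopy hand the generated source to the EDITOR command
# before it is compiled, so the script above can rewrite the code in place.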
Example #22
def test_packing_unpacking(ctx_factory, inline):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64)
    x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64)

    callee1 = lp.make_function("{[i]: 0<=i<6}",
                               """
            b[i] = 2*a[i]
            """,
                               name="callee_fn1")

    callee2 = lp.make_function("{[i, j]: 0<=i<2 and 0 <= j < 3}",
                               """
            b[i, j] = 3*a[i, j]
            """,
                               name="callee_fn2")

    knl = lp.make_kernel(
        "{[i, j, k]:  0<= i < 3 and 0 <= j < 2 and 0 <= k < 6}", """
            [i, j]: y1[i, j] = callee_fn1([i, j]: x1[i, j])
            [k]: y2[k] = callee_fn2([k]: x2[k])
            """)

    knl = lp.merge([knl, callee1])
    knl = lp.merge([knl, callee2])

    knl = lp.pack_and_unpack_args_for_call(knl, "callee_fn1")
    knl = lp.pack_and_unpack_args_for_call(knl, "callee_fn2")

    if inline:
        knl = lp.inline_callable_kernel(knl, "callee_fn1")
        knl = lp.inline_callable_kernel(knl, "callee_fn2")

    knl = lp.set_options(knl, "write_cl")
    knl = lp.set_options(knl, "return_dict")
    evt, out_dict = knl(queue, x1=x1, x2=x2)

    y1 = out_dict["y1"].get()
    y2 = out_dict["y2"].get()

    assert np.linalg.norm(2 * x1.get() - y1) / np.linalg.norm(
        2 * x1.get()) < 1e-15
    assert np.linalg.norm(3 * x2.get() - y2) / np.linalg.norm(
        3 * x2.get()) < 1e-15
Example #23
def test_sci_notation_literal(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    set_kernel = lp.make_kernel(""" { [i]: 0<=i<12 } """,
                                """ out[i] = 1e-12""")

    set_kernel = lp.set_options(set_kernel, write_cl=True)

    evt, (out, ) = set_kernel(queue)

    assert (np.abs(out.get() - 1e-12) < 1e-20).all()
Example #24
def test_diff(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
         """{ [i,j]: 0<=i,j<n }""",
         """
         <> a = 1/(1+sinh(x[i] + y[j])**2)
         z[i] = sum(j, exp(a * x[j]))
         """, name="diff")

    knl = lp.fix_parameters(knl, n=50)

    from loopy.transform.diff import diff_kernel
    #FIXME Is this the correct interface. Does it make sense to take the entire
    #translation unit?
    dknl, diff_map = diff_kernel(knl["diff"], "z", "x")
    dknl = knl.with_kernel(dknl)
    dknl = lp.remove_unused_arguments(dknl)

    dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a")

    print(dknl)

    n = 50
    x = np.random.randn(n)
    y = np.random.randn(n)

    dx = np.random.randn(n)

    fac = 1e-1
    h1 = 1e-4
    h2 = h1 * fac

    evt, (z0,) = knl(queue, x=x, y=y)
    evt, (z1,) = knl(queue, x=(x + h1*dx), y=y)
    evt, (z2,) = knl(queue, x=(x + h2*dx), y=y)

    dknl = lp.set_options(dknl, write_cl=True)
    evt, (df,) = dknl(queue, x=x, y=y)

    diff1 = (z1-z0)
    diff2 = (z2-z0)

    diff1_predicted = df.dot(h1*dx)
    diff2_predicted = df.dot(h2*dx)

    err1 = la.norm(diff1 - diff1_predicted) / la.norm(diff1)
    err2 = la.norm(diff2 - diff2_predicted) / la.norm(diff2)
    print(err1, err2)

    assert (err2 < err1 * fac * 1.1).all()
Example #25
def test_sci_notation_literal(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    set_kernel = lp.make_kernel(
         ''' { [i]: 0<=i<12 } ''',
         ''' out[i] = 1e-12''')

    set_kernel = lp.set_options(set_kernel, write_cl=True)

    evt, (out,) = set_kernel(queue)

    assert (np.abs(out.get() - 1e-12) < 1e-20).all()
Example #26
def test_indexof(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(""" { [i,j]: 0<=i,j<5 } """,
                         """ out[i,j] = indexof(out[i,j])""")

    knl = lp.set_options(knl, write_cl=True)

    (evt, (out, )) = knl(queue)
    out = out.get()

    assert np.array_equal(out.ravel(order="C"), np.arange(25))
Example #27
def test_clamp(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 15 * 10**6
    x = cl.clrandom.rand(queue, n, dtype=np.float32)

    knl = lp.make_kernel("{ [i]: 0<=i<n }", "out[i] = clamp(x[i], a, b)")

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
    knl = lp.set_options(knl, write_cl=True)

    evt, (out, ) = knl(queue, x=x, a=np.float32(12), b=np.float32(15))
Example #28
def test_indexof(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
         ''' { [i,j]: 0<=i,j<5 } ''',
         ''' out[i,j] = indexof(out[i,j])''')

    knl = lp.set_options(knl, write_cl=True)

    (evt, (out,)) = knl(queue)
    out = out.get()

    assert np.array_equal(out.ravel(order="C"), np.arange(25))
Example #29
def test_diff(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
         """{ [i,j]: 0<=i,j<n }""",
         """
         <> a = 1/(1+sinh(x[i] + y[j])**2)
         z[i] = sum(j, exp(a * x[j]))
         """)

    knl = lp.fix_parameters(knl, n=50)

    from loopy.transform.diff import diff_kernel
    dknl, diff_map = diff_kernel(knl, "z", "x")
    dknl = lp.remove_unused_arguments(dknl)

    dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a")

    print(dknl)

    n = 50
    x = np.random.randn(n)
    y = np.random.randn(n)

    dx = np.random.randn(n)

    fac = 1e-1
    h1 = 1e-4
    h2 = h1 * fac

    evt, (z0,) = knl(queue, x=x, y=y)
    evt, (z1,) = knl(queue, x=(x + h1*dx), y=y)
    evt, (z2,) = knl(queue, x=(x + h2*dx), y=y)

    dknl = lp.set_options(dknl, write_cl=True)
    evt, (df,) = dknl(queue, x=x, y=y)

    diff1 = (z1-z0)
    diff2 = (z2-z0)

    diff1_predicted = df.dot(h1*dx)
    diff2_predicted = df.dot(h2*dx)

    err1 = la.norm(diff1 - diff1_predicted) / la.norm(diff1)
    err2 = la.norm(diff2 - diff2_predicted) / la.norm(diff2)
    print(err1, err2)

    assert (err2 < err1 * fac * 1.1).all()
Example #30
def test_clamp(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 15 * 10**6
    x = cl.clrandom.rand(queue, n, dtype=np.float32)

    knl = lp.make_kernel(
            "{ [i]: 0<=i<n }",
            "out[i] = clamp(x[i], a, b)")

    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
    knl = lp.set_options(knl, write_cl=True)

    evt, (out,) = knl(queue, x=x, a=np.float32(12), b=np.float32(15))
Example #31
def test_any_all(ctx_factory):
    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{[i, j]: 0<=i,j<10}", """
        out1 = reduce(any, [i], i == 4)
        out2 = reduce(all, [j], j == 5)
        """)
    knl = lp.set_options(knl, return_dict=True)

    _, out_dict = knl(cq)

    assert out_dict["out1"].get()
    assert not out_dict["out2"].get()
Example #32
def test_empty_reduction(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        ["{[i]: 0<=i<20}", "[i] -> {[j]: 0<=j<0}"],
        "a[i] = sum(j, j)",
    )

    knl = lp.preprocess_kernel(knl)
    print(knl)

    knl = lp.set_options(knl, write_cl=True)
    evt, (a, ) = knl(queue)

    assert (a.get() == 0).all()
Example #33
def test_indexof_vec(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if ctx.devices[0].platform.name.startswith("Portable"):
        # Accurate as of 2015-10-08
        pytest.skip("POCL miscompiles vector code")

    knl = lp.make_kernel(''' { [i,j,k]: 0<=i,j,k<4 } ''',
                         ''' out[i,j,k] = indexof_vec(out[i,j,k])''')

    knl = lp.tag_inames(knl, {"i": "vec"})
    knl = lp.tag_data_axes(knl, "out", "vec,c,c")
    knl = lp.set_options(knl, write_cl=True)

    (evt, (out, )) = knl(queue)
Example #34
def test_nan_support(ctx_factory):
    from loopy.symbolic import parse
    ctx = ctx_factory()
    knl = lp.make_kernel("{:}", [
        lp.Assignment(parse("a"), np.nan),
        lp.Assignment(parse("b"), parse("isnan(a)")),
        lp.Assignment(parse("c"), parse("isnan(3.14)"))
    ],
                         seq_dependencies=True)

    knl = lp.set_options(knl, "return_dict")

    evt, out_dict = knl(cl.CommandQueue(ctx))
    assert np.isnan(out_dict["a"].get())
    assert out_dict["b"] == 1
    assert out_dict["c"] == 0
Example #35
def test_diff(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
         """{ [i,j]: 0<=i,j<n }""",
         """
         <> a = 1/(1+sinh(x[i] + y[j])**2)
         z[i] = sum(j, exp(a * x[j]))
         """)

    knl = lp.fix_parameters(knl, n=50)

    from loopy.transform.diff import diff_kernel
    dknl, diff_map = diff_kernel(knl, "z", "x")
    dknl = lp.remove_unused_arguments(dknl)

    print(dknl)

    n = 50
    x = np.random.randn(n)
    y = np.random.randn(n)

    dx = np.random.randn(n)

    fac = 1e-1
    h1 = 1e-4
    h2 = h1 * fac

    evt, (z0,) = knl(queue, x=x, y=y)
    evt, (z1,) = knl(queue, x=(x + h1*dx), y=y)
    evt, (z2,) = knl(queue, x=(x + h2*dx), y=y)

    dknl = lp.set_options(dknl, write_cl=True)
    evt, (df,) = dknl(queue, x=x, y=y)

    diff1 = (z1-z0)
    diff2 = (z2-z0)

    diff1_predicted = df.dot(h1*dx)
    diff2_predicted = df.dot(h2*dx)

    err1 = la.norm(diff1 - diff1_predicted) / la.norm(diff1)
    err2 = la.norm(diff2 - diff2_predicted) / la.norm(diff2)
    print(err1, err2)

    assert (err2 < err1 * fac * 1.1).all()
Example #36
def test_register_knl_with_hw_axes(ctx_factory, inline):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 4

    x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64)
    y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64)

    callee_knl = lp.make_function("{[i, j]:0<=i, j < 4}",
                                  """
            g[i, j] = 2*e[i, j] + 3*f[i, j]
            """,
                                  name="linear_combo")

    callee_knl = lp.split_iname(callee_knl,
                                "i",
                                1,
                                inner_tag="l.0",
                                outer_tag="g.0")

    caller_knl = lp.make_kernel("{[i, j, k, l, m]: 0<=i, j, k, l, m<4}",
                                """
            [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m],
                                                     [j, l]: y[i, j, k, l, m])
            """,
                                name="caller")
    caller_knl = lp.split_iname(caller_knl,
                                "i",
                                4,
                                inner_tag="l.1",
                                outer_tag="g.1")

    knl = lp.merge([caller_knl, callee_knl])

    knl = lp.set_options(knl, "return_dict")

    if inline:
        knl = lp.inline_callable_kernel(knl, "linear_combo")

    evt, out = knl(queue, x=x_dev, y=y_dev)

    x_host = x_dev.get()
    y_host = y_dev.get()

    assert np.linalg.norm(2 * x_host + 3 * y_host - out["z"].get()
                          ) / np.linalg.norm(2 * x_host + 3 * y_host) < 1e-15
Example #37
def test_indexof_vec(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if ctx.devices[0].platform.name.startswith("Portable"):
        # Accurate as of 2015-10-08
        pytest.skip("POCL miscompiles vector code")

    knl = lp.make_kernel(
         ''' { [i,j,k]: 0<=i,j,k<4 } ''',
         ''' out[i,j,k] = indexof_vec(out[i,j,k])''')

    knl = lp.tag_inames(knl, {"i": "vec"})
    knl = lp.tag_data_axes(knl, "out", "vec,c,c")
    knl = lp.set_options(knl, write_cl=True)

    (evt, (out,)) = knl(queue)
Example #38
def test_pyopencl_target_with_global_temps_with_base_storage(ctx_factory):
    from pyopencl.tools import ImmediateAllocator

    class RecordingAllocator(ImmediateAllocator):
        def __init__(self, queue):
            super().__init__(queue)
            self.allocated_nbytes = 0

        def __call__(self, size):
            self.allocated_nbytes += size
            return super().__call__(size)

    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{[i, j]: 0<=i, j<10}",
        """
        tmp1[i] = 2*i    {id=w_tmp1}
        y[i] = tmp1[i] {nosync=w_tmp1}
        ... gbarrier
        tmp2[j] = 3*j    {id=w_tmp2}
        z[j] = tmp2[j] {nosync=w_tmp2}
        """, [
            lp.TemporaryVariable("tmp1",
                                 base_storage="base",
                                 address_space=lp.AddressSpace.GLOBAL),
            lp.TemporaryVariable("tmp2",
                                 base_storage="base",
                                 address_space=lp.AddressSpace.GLOBAL), ...
        ],
        seq_dependencies=True)
    knl = lp.tag_inames(knl, {"i": "g.0", "j": "g.0"})
    knl = lp.set_options(knl, "return_dict")

    my_allocator = RecordingAllocator(cq)
    _, out = knl(cq, allocator=my_allocator)

    np.testing.assert_allclose(out["y"].get(), 2 * np.arange(10))
    np.testing.assert_allclose(out["z"].get(), 3 * np.arange(10))
    assert my_allocator.allocated_nbytes == (
        40  # base
        + 40  # y
        + 40  # z
    )
Example #39
def test_sized_integer_c_codegen(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    from pymbolic import var
    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        [lp.Assignment("a[i]",
                       lp.TypeCast(np.int64, 1) << var("i"))])

    knl = lp.set_options(knl, write_code=True)
    n = 40

    evt, (a, ) = knl(queue, n=n)

    a_ref = 1 << np.arange(n, dtype=np.int64)

    assert np.array_equal(a_ref, a.get())
Example #40
def test_indexof_vec(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    if (
            # Accurate as of 2019-11-04
            ctx.devices[0].platform.name.startswith("Intel")):
        pytest.skip("target ICD miscompiles vector code")

    knl = lp.make_kernel(""" { [i,j,k]: 0<=i,j,k<4 } """,
                         """ out[i,j,k] = indexof_vec(out[i,j,k])""",
                         [lp.GlobalArg("out", shape=lp.auto, is_input=False)])

    knl = lp.tag_inames(knl, {"i": "vec"})
    knl = lp.tag_data_axes(knl, "out", "vec,c,c")
    knl = lp.set_options(knl, write_cl=True)

    (evt, (out, )) = knl(queue)
Example #41
def test_sized_integer_c_codegen(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    from pymbolic import var
    knl = lp.make_kernel(
        "{[i]: 0<=i<n}",
        [lp.Assignment("a[i]", lp.TypeCast(np.int64, 1) << var("i"))]
        )

    knl = lp.set_options(knl, write_code=True)
    n = 40

    evt, (a,) = knl(queue, n=n)

    a_ref = 1 << np.arange(n, dtype=np.int64)

    assert np.array_equal(a_ref, a.get())
Example #42
def test_empty_reduction(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            [
                "{[i]: 0<=i<20}",
                "[i] -> {[j]: 0<=j<0}"
                ],
            "a[i] = sum(j, j)",
            )

    knl = lp.realize_reduction(knl)
    print(knl)

    knl = lp.set_options(knl, write_cl=True)
    evt, (a,) = knl(queue)

    assert (a.get() == 0).all()
Example #43
def test_make_copy_kernel(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    intermediate_format = "f,f,sep"
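    # "f,f,sep": first two axes Fortran-ordered, third axis split into separate arrays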

    a1 = np.random.randn(1024, 4, 3)

    cknl1 = lp.make_copy_kernel(intermediate_format)

    cknl1 = lp.fix_parameters(cknl1, n2=3)

    cknl1 = lp.set_options(cknl1, write_cl=True)
    evt, a2 = cknl1(queue, input=a1)

    cknl2 = lp.make_copy_kernel("c,c,c", intermediate_format)
    cknl2 = lp.fix_parameters(cknl2, n2=3)

    evt, a3 = cknl2(queue, input=a2)

    assert (a1 == a3).all()
Example #44
def test_argmax(ctx_factory):
    logging.basicConfig(level=logging.INFO)

    dtype = np.dtype(np.float32)
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 10000

    knl = lp.make_kernel(
            "{[i]: 0<=i<%d}" % n,
            """
            max_val, max_idx = argmax(i, fabs(a[i]))
            """)

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    print(lp.preprocess_kernel(knl))
    knl = lp.set_options(knl, write_cl=True, highlight_cl=True)

    a = np.random.randn(10000).astype(dtype)
    evt, (max_idx, max_val) = knl(queue, a=a, out_host=True)
    assert max_val == np.max(np.abs(a))
    assert max_idx == np.where(np.abs(a) == max_val)[-1]
Example #45
def dense_bprop_input():
    """
    Given: dCost/dOut
    Compute: dCost/dIn via dCost/dOut * dOut/dIn.

    dOut/dIn is just the weight matrix.
    dOut/dIn[i, j] = w[i,j]
    """
    knl = lp.make_kernel(
        domains="""
        { [i, j]:
           0<=i<input_len and
           0<=j<output_len
        }
        """,
        instructions=[
            """
            d_cost_d_in[i] = sum(j, d_cost_d_out[j] * weights[i, j])
            """,
        ],
        assumptions="output_len,input_len>0"
    )
    knl = lp.set_options(knl, 'write_cl')
    return knl
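
For reference, a quick NumPy cross-check of the same contraction (sizes and data are made up for illustration):

import numpy as np

input_len, output_len = 4, 3
weights = np.random.randn(input_len, output_len)
d_cost_d_out = np.random.randn(output_len)

# d_cost_d_in[i] = sum(j, d_cost_d_out[j] * weights[i, j]) is a plain matrix-vector product
d_cost_d_in = weights @ d_cost_d_out
assert d_cost_d_in.shape == (input_len,)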
Example #46
def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):
    ctx = ctx_factory()

    filename = "strongVolumeKernels.f90"
    with open(filename, "r") as sourcef:
        source = sourcef.read()

    source = source.replace("datafloat", "real*4")

    hsv_r, hsv_s = [
           knl for knl in lp.parse_fortran(source, filename, auto_dependencies=False)
           if "KernelR" in knl.name or "KernelS" in knl.name
           ]
    hsv_r = lp.tag_instructions(hsv_r, "rknl")
    hsv_s = lp.tag_instructions(hsv_s, "sknl")
    hsv = lp.fuse_kernels([hsv_r, hsv_s], ["_r", "_s"])
    #hsv = hsv_s

    from gnuma_loopy_transforms import (
          fix_euler_parameters,
          set_q_storage_format, set_D_storage_format)

    hsv = lp.fix_parameters(hsv, Nq=Nq)
    hsv = lp.set_loop_priority(hsv, "e,k,j,i")
    hsv = lp.tag_inames(hsv, dict(e="g.0", j="l.1", i="l.0"))
    hsv = lp.assume(hsv, "elements >= 1")

    hsv = fix_euler_parameters(hsv, p_p0=1, p_Gamma=1.4, p_R=1)
    for name in ["Q", "rhsQ"]:
        hsv = set_q_storage_format(hsv, name)

    hsv = set_D_storage_format(hsv)
    #hsv = lp.add_prefetch(hsv, "volumeGeometricFactors")

    ref_hsv = hsv

    if opt_level == 0:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "D[:,:]")

    if opt_level == 1:
        tap_hsv = hsv

    # turn the first reads into subst rules
    local_prep_var_names = set()
    for insn in lp.find_instructions(hsv, "tag:local_prep"):
        assignee, = insn.assignee_var_names()
        local_prep_var_names.add(assignee)
        hsv = lp.assignment_to_subst(hsv, assignee)

    # precompute fluxes
    hsv = lp.assignment_to_subst(hsv, "JinvD_r")
    hsv = lp.assignment_to_subst(hsv, "JinvD_s")

    r_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:rknl")
    s_fluxes = lp.find_instructions(hsv, "tag:compute_fluxes and tag:sknl")

    if ilp_multiple > 1:
        hsv = lp.split_iname(hsv, "k", 2, inner_tag="ilp")
        ilp_inames = ("k_inner",)
        flux_ilp_inames = ("kk",)
    else:
        ilp_inames = ()
        flux_ilp_inames = ()

    rtmps = []
    stmps = []

    flux_store_idx = 0

    for rflux_insn, sflux_insn in zip(r_fluxes, s_fluxes):
        for knl_tag, insn, flux_inames, tmps, flux_precomp_inames in [
                  ("rknl", rflux_insn, ("j", "n",), rtmps, ("jj", "ii",)),
                  ("sknl", sflux_insn, ("i", "n",), stmps, ("ii", "jj",)),
                  ]:
            flux_var, = insn.assignee_var_names()
            print(insn)

            reader, = lp.find_instructions(hsv,
                  "tag:{knl_tag} and reads:{flux_var}"
                  .format(knl_tag=knl_tag, flux_var=flux_var))

            hsv = lp.assignment_to_subst(hsv, flux_var)

            flux_store_name = "flux_store_%d" % flux_store_idx
            flux_store_idx += 1
            tmps.append(flux_store_name)

            hsv = lp.precompute(hsv, flux_var+"_subst", flux_inames + ilp_inames,
                temporary_name=flux_store_name,
                precompute_inames=flux_precomp_inames + flux_ilp_inames,
                default_tag=None)
            if flux_var.endswith("_s"):
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N0,N1,N2?")
            else:
                hsv = lp.tag_data_axes(hsv, flux_store_name, "N1,N0,N2?")

            n_iname = "n_"+flux_var.replace("_r", "").replace("_s", "")
            if n_iname.endswith("_0"):
                n_iname = n_iname[:-2]
            hsv = lp.rename_iname(hsv, "n", n_iname, within="id:"+reader.id,
                  existing_ok=True)

    hsv = lp.tag_inames(hsv, dict(ii="l.0", jj="l.1"))
    for iname in flux_ilp_inames:
        hsv = lp.tag_inames(hsv, {iname: "ilp"})

    hsv = lp.alias_temporaries(hsv, rtmps)
    hsv = lp.alias_temporaries(hsv, stmps)

    if opt_level == 2:
        tap_hsv = hsv

    for prep_var_name in local_prep_var_names:
        if prep_var_name.startswith("Jinv") or "_s" in prep_var_name:
            continue
        hsv = lp.precompute(hsv,
            lp.find_one_rule_matching(hsv, prep_var_name+"_*subst*"))

    if opt_level == 3:
        tap_hsv = hsv

    hsv = lp.add_prefetch(hsv, "Q[ii,jj,k,:,:,e]", sweep_inames=ilp_inames)

    if opt_level == 4:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
              Q_dim_field_inner="unr",
              Q_dim_field_outer="unr"))

    hsv = lp.buffer_array(hsv, "rhsQ", ilp_inames,
          fetch_bounding_box=True, default_tag="for",
          init_expression="0", store_expression="base + buffer")

    if opt_level == 5:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
              rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
              rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
              Q_dim_field_inner="unr",
              Q_dim_field_outer="unr"))

    # buffer axes need to be vectorized in order for this to work
    hsv = lp.tag_data_axes(hsv, "rhsQ_buf", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "Q_fetch", "c?,vec,c")
    hsv = lp.tag_data_axes(hsv, "D_fetch", "f,f")
    hsv = lp.tag_inames(hsv,
            {"Q_dim_k": "unr", "rhsQ_init_k": "unr", "rhsQ_store_k": "unr"},
            ignore_nonexistent=True)

    if opt_level == 6:
        tap_hsv = hsv
        tap_hsv = lp.tag_inames(tap_hsv, dict(
              rhsQ_init_field_inner="unr", rhsQ_store_field_inner="unr",
              rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
              Q_dim_field_inner="unr",
              Q_dim_field_outer="unr"))

    hsv = lp.tag_inames(hsv, dict(
          rhsQ_init_field_inner="vec", rhsQ_store_field_inner="vec",
          rhsQ_init_field_outer="unr", rhsQ_store_field_outer="unr",
          Q_dim_field_inner="vec",
          Q_dim_field_outer="unr"))

    if opt_level == 7:
        tap_hsv = hsv

    hsv = lp.collect_common_factors_on_increment(hsv, "rhsQ_buf",
          vary_by_axes=(0,) if ilp_multiple > 1 else ())

    if opt_level >= 8:
        tap_hsv = hsv

    hsv = tap_hsv

    if 1:
        print("OPS")
        op_poly = lp.get_op_poly(hsv)
        print(lp.stringify_stats_mapping(op_poly))

        print("MEM")
        gmem_poly = lp.sum_mem_access_to_bytes(lp.get_gmem_access_poly(hsv))
        print(lp.stringify_stats_mapping(gmem_poly))

    hsv = lp.set_options(hsv, cl_build_options=[
         "-cl-denorms-are-zero",
         "-cl-fast-relaxed-math",
         "-cl-finite-math-only",
         "-cl-mad-enable",
         "-cl-no-signed-zeros",
         ])

    hsv = hsv.copy(name="horizontalStrongVolumeKernel")

    results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv, parameters=dict(elements=300),
            quiet=True)

    elapsed = results["elapsed_wall"]

    print("elapsed", elapsed)
Example #47
import numpy as np
import loopy as lp
import pyopencl as cl
import pyopencl.array  # noqa

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

n = 200
a = cl.array.arange(queue, n, dtype=np.float32)

knl = lp.make_kernel(
    "{ [i,k]: 0<=i,k<n }",
    "out[i] = a[i] + 1e-12"
    )

knl = lp.set_options(knl, write_cl=True)
evt, (out,) = knl(queue, a=a)
Example #48
import numpy as np
import pyopencl as cl
import loopy as lp

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

knl = lp.make_kernel(
    "{[i, j]: 0<=i<n and 0<=j<n}",
    "c[i, j] = a[i]*b[j]",
    assumptions="n >= 16")

a = np.arange(200, dtype=np.float32)
b = np.arange(200, dtype=np.float32)

knl = lp.set_options(knl, write_code=True)
evt, (c,) = knl(queue, a=a, b=b)
# SETUPEND

orig_knl = knl

# SPLITBEGIN
knl = lp.split_iname(knl, "i", 16,
        outer_tag="g.0", inner_tag="l.0")
knl = lp.split_iname(knl, "j", 16,
        outer_tag="g.1", inner_tag="l.1")
# SPLITEND

knl = lp.set_options(knl, write_code=True)
evt, (c,) = knl(queue, a=a, b=b)
Example #49
exp_kernel = lp.make_kernel(
     # domain assumed; the snippet's opening lines are truncated
     '{ [i]: 0<=i<n }',
     ''' exp[i] = E ** z[i]''',
     assumptions="n>0")

sum_kernel = lp.make_kernel(
    '{ [j]: 0<=j<n }',
    'total = sum(j, exp[j])',
    assumptions='n>0')

softmax_kernel = lp.make_kernel(
    '{ [k]: 0<=k<n }',
    'out3[k] = exp[k] / total',
    [
        lp.GlobalArg("total", None, shape=()),
        "..."
        ],
    assumptions='n>0')

big_honkin_knl = lp.fuse_kernels([exp_kernel, sum_kernel, softmax_kernel])
#big_honkin_knl = lp.fuse_kernels([exp_kernel, sum_kernel])

print(softmax_kernel.arg_dict["total"].shape)

#big_honkin_knl = lp.tag_inames(big_honkin_knl, dict(i="l.0"))

big_honkin_knl = lp.set_options(big_honkin_knl, write_cl=True)



a = np.random.randn(20)
evt, out = big_honkin_knl(queue, z=a, E=np.float64(5))
Example #50
# Kernel
# ------
knl = lp.make_kernel(
        "{ [i]: 0<=i<nx }",
        """
        double(s) := 2*s

        b[i] = if(i<nx/2, double(a[i]), a[i])
        """
        ) 

# transform
# ---------
wgs = 512   # Work Group Size
knl = lp.fix_parameters(knl, nx=nx)
knl = lp.set_options(knl, write_cl=True, write_wrapper=False)
print(knl)

#typed_knl = lp.add_dtypes(knl, dict(a=np.float32))
#code, _ = lp.generate_code(typed_knl)
#print(code)


# execute
# -------
evt, (out,) = knl(queue, a=a, b=b)
#a_assert(2*a.get(), b.get())
#print( np.linalg.norm(2*a.get()-out.get())==0 )
print(a.get())
print(b.get())
Example #51
import loopy as lp
import numpy as np
import pyopencl as cl
import math

knl = lp.make_kernel(
         """{ [i,j,k]:
             0<=i<n and
             0<=j<n and
             0<=k<n
          }
         """,
         instructions=[
             '<float32>exp[i] = E ** in_batch[i]',
             '<float32> total = sum(j, exp[j])',
             'out[k] = exp[k] / total'
         ],
         assumptions="n>0")

knl = lp.set_options(knl, 'write_cl')

if __name__ == "__main__":
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    x = np.array([1, 2, 3, 4, 5]).astype(np.float32)
    ex = math.e ** x
    print('np softmax:', ex / ex.sum())
    print(knl)
    evt, out = knl(queue, in_batch=x, E=np.float32(math.e))
    print('lp softmax:', out)