Example #1
def test_force_outer_iname_for_scan():
    knl = lp.make_kernel(
        "[n] -> {[i,j,k]: 0<=k<n and 0<=i<=k and 0<=j<=i}",
        "out[i] = product(j, a[j]) {inames=i:k}")

    knl = lp.add_dtypes(knl, dict(a=np.float32))

    # TODO: Maybe this deserves to work?
    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
        lp.realize_reduction(knl, force_scan=True)

    knl = lp.realize_reduction(knl, force_scan=True, force_outer_iname_for_scan="i")
Example #2
def test_force_outer_iname_for_scan():
    knl = lp.make_kernel(
        "[n] -> {[i,j,k]: 0<=k<n and 0<=i<=k and 0<=j<=i}",
        "out[i] = product(j, a[j]) {inames=i:k}")

    knl = lp.add_dtypes(knl, dict(a=np.float32))

    # TODO: Maybe this deserves to work?
    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
        lp.realize_reduction(knl, force_scan=True)

    knl = lp.realize_reduction(knl, force_scan=True, force_outer_iname_for_scan="i")
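Note: Examples #1 and #2 show that realize_reduction(force_scan=True) refuses this kernel on its own: with the assignment nested inside both i and k (the {inames=i:k} annotation), the scan over j has more than one candidate sweep iname and loopy reports ReductionIsNotTriangularError. Passing force_outer_iname_for_scan="i" apparently resolves that ambiguity and lets the scan be realized with i as the sweep iname.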
Example #3
def no_test_global_parallel_reduction(ctx_factory, size):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            <> key = make_uint2(i, 324830944)  {inames=i}
            <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
            """)

    # ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer")
    print(knl)
    1/0
    knl = lp.realize_reduction(knl)

    evt, (z,) = knl(queue, n=size)
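Note: the no_ prefix keeps pytest from collecting this function, and the print(knl) followed by 1/0 aborts it before realize_reduction or code generation can run, so this reads as a deliberately disabled work-in-progress variant. Example #4 below is the working version of the same Random123 (philox4x32) based global reduction.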
Example #4
def test_global_mc_parallel_reduction(ctx_factory, size):
    ctx = ctx_factory()

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    knl = lp.make_kernel(
        "{[i]: 0 <= i < n }", """
            for i
                <> key = make_uint2(i, 324830944)  {inames=i}
                <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
                <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
            end
            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
            """)

    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    knl = lp.precompute(knl,
                        "red_i_outer_arg",
                        "i_outer",
                        temporary_scope=lp.temp_var_scope.GLOBAL)
    knl = lp.realize_reduction(knl)
    knl = lp.add_dependency(knl, "writes:acc_i_outer",
                            "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters={"n": size})
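Note: the chain of transformations here is, roughly, the recurring two-stage global reduction recipe seen in several examples below: split_iname breaks the reduction range into per-group and in-group pieces, split_reduction_inward/outward restructure the nested reduction accordingly, reduction_arg_to_subst_rule turns the i_outer part of the reduction argument into a substitution rule so that precompute can stage the partial results in a temporary (a GLOBAL one in the variants that pass temporary_scope or temporary_address_space), realize_reduction then generates the accumulator code, and add_dependency makes the accumulator write wait for the barrier emitted for the precomputed temporary (compare the comment in Example #34).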
Example #5
def test_nested_scan(ctx_factory, i_tag, j_tag):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        [
            "[n] -> {[i]: 0 <= i < n}",
            "[i] -> {[j]: 0 <= j <= i}",
            "[i] -> {[k]: 0 <= k <= i}"
        ],
        """
        <>tmp[i] = sum(k, 1)
        out[i] = sum(j, tmp[j])
        """)

    knl = lp.fix_parameters(knl, n=10)
    knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag))

    knl = lp.realize_reduction(knl, force_scan=True)

    print(knl)

    evt, (out,) = knl(queue)

    print(out)
Example #6
def test_scan_not_triangular():
    knl = lp.make_kernel("{[i,j]: 0<=i<100 and 1<=j<=2*i}", """
        a[i] = sum(j, j**2)
        """)

    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
        knl = lp.realize_reduction(knl, force_scan=True)
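For contrast, the closely related domain with j starting at 0 is accepted by force_scan. The following is a minimal sketch, not part of the test suite, that reuses only calls appearing in these examples and assumes an OpenCL context can be created with cl.create_some_context(); it is essentially test_sequential_scan (further below) with stride 2 and n fixed at 100.

import numpy as np
import pyopencl as cl
import loopy as lp

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# Same shape of domain as above, but with j starting at 0 instead of 1.
knl = lp.make_kernel(
    "{[i,j]: 0<=i<100 and 0<=j<=2*i}",
    "a[i] = sum(j, j**2)")
knl = lp.realize_reduction(knl, force_scan=True)

evt, (a,) = knl(queue)
assert (a.get() == np.cumsum(np.arange(200)**2)[::2]).all()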
Example #7
def test_nested_scan(ctx_factory, i_tag, j_tag):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        [
            "[n] -> {[i]: 0 <= i < n}",
            "[i] -> {[j]: 0 <= j <= i}",
            "[i] -> {[k]: 0 <= k <= i}"
        ],
        """
        <>tmp[i] = sum(k, 1)
        out[i] = sum(j, tmp[j])
        """)

    knl = lp.fix_parameters(knl, n=10)
    knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag))

    knl = lp.realize_reduction(knl, force_scan=True)

    print(knl)

    evt, (out,) = knl(queue)

    print(out)
Example #8
def test_global_parallel_reduction(ctx_factory, size):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            # Using z[0] instead of z works around a bug in ancient PyOpenCL.
            z[0] = sum(i, i/13)
            """)

    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
            temporary_scope=lp.temp_var_scope.GLOBAL,
            default_tag="l.auto")
    knl = lp.realize_reduction(knl)
    knl = lp.add_dependency(
            knl, "writes:acc_i_outer",
            "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl, parameters={"n": size},
            print_ref_code=True)
Example #9
def test_parallel_multi_output_reduction():
    knl = lp.make_kernel(
        "{[i]: 0<=i<128}", """
                max_val, max_indices = argmax(i, fabs(a[i]))
                """)
    knl = lp.tag_inames(knl, dict(i="l.0"))
    knl = lp.realize_reduction(knl)
    print(knl)
Example #10
def test_scan_extra_constraints_on_domain():
    knl = lp.make_kernel("{[i,j,k]: 0<=i<n and 0<=j<=i and i=k}",
                         "out[i] = sum(j, a[j])")

    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
        knl = lp.realize_reduction(knl,
                                   force_scan=True,
                                   force_outer_iname_for_scan="i")
Example #11
def test_scan_extra_constraints_on_domain():
    knl = lp.make_kernel(
        "{[i,j,k]: 0<=i<n and 0<=j<=i and i=k}",
        "out[i] = sum(j, a[j])")

    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
        knl = lp.realize_reduction(
                knl, force_scan=True, force_outer_iname_for_scan="i")
Example #12
def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel("[n] -> {[i,j]: 1<=i<n+1 and 0<=j<=i-1}", """
        out[i-1] = sum(j, a[j]**2)
        """, "...")

    knl = lp.fix_parameters(knl, n=16)
    knl = lp.tag_inames(knl, dict(i="l.0"))
    knl = lp.realize_reduction(knl, force_scan=True)
    knl = lp.realize_reduction(knl)

    knl = lp.add_dtypes(knl, dict(a=int))
    evt, (out, ) = knl(queue, a=np.arange(1, 17))

    assert (out == np.cumsum(np.arange(1, 17)**2)).all()
Example #13
def test_scan_not_triangular():
    knl = lp.make_kernel(
        "{[i,j]: 0<=i<100 and 1<=j<=2*i}",
        """
        a[i] = sum(j, j**2)
        """
        )

    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
        knl = lp.realize_reduction(knl, force_scan=True)
Example #14
def test_local_parallel_scan(ctx_factory, n):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel("[n] -> {[i,j]: 0<=i<n and 0<=j<=i}", """
        out[i] = sum(j, a[j]**2)
        """, "...")

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.tag_inames(knl, dict(i="l.0"))
    knl = lp.realize_reduction(knl, force_scan=True)

    knl = lp.realize_reduction(knl)

    knl = lp.add_dtypes(knl, dict(a=int))

    print(knl)

    evt, (a, ) = knl(queue, a=np.arange(n))
    assert (a == np.cumsum(np.arange(n)**2)).all()
Example #15
def test_dependent_domain_scan(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(["[n] -> {[i]: 0<=i<n}", "{[j]: 0<=j<=2*i}"], """
        a[i] = sum(j, j**2) {id=scan}
        """)
    knl = lp.realize_reduction(knl, force_scan=True)
    evt, (a, ) = knl(queue, n=100)

    assert (a.get() == np.cumsum(np.arange(200)**2)[::2]).all()
Example #16
def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n] -> {[i,j]: 1<=i<n+1 and 0<=j<=i-1}",
        """
        out[i-1] = sum(j, a[j]**2)
        """,
        "..."
        )

    knl = lp.fix_parameters(knl, n=16)
    knl = lp.tag_inames(knl, dict(i="l.0"))
    knl = lp.realize_reduction(knl, force_scan=True)
    knl = lp.realize_reduction(knl)

    knl = lp.add_dtypes(knl, dict(a=int))
    evt, (out,) = knl(queue, a=np.arange(1, 17))

    assert (out == np.cumsum(np.arange(1, 17)**2)).all()
Example #17
def test_scan_data_types(ctx_factory, dtype):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel("{[i,j]: 0<=i<n and 0<=j<=i }",
                         "res[i] = reduce(sum, j, a[j])",
                         assumptions="n>=1")

    a = np.random.randn(20).astype(dtype)
    knl = lp.add_dtypes(knl, dict(a=dtype))
    knl = lp.realize_reduction(knl, force_scan=True)
    evt, (res, ) = knl(queue, a=a)

    assert np.allclose(res, np.cumsum(a))
Example #18
def test_sequential_scan(ctx_factory, n, stride):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel("[n] -> {[i,j]: 0<=i<n and 0<=j<=%d*i}" % stride, """
        a[i] = sum(j, j**2)
        """)

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.realize_reduction(knl, force_scan=True)

    evt, (a, ) = knl(queue)

    assert (a.get() == np.cumsum(np.arange(stride * n)**2)[::stride]).all()
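To make the assertion concrete: with stride 2, out[i] is the sum of j**2 for j from 0 to 2*i, which is every second entry of the cumulative sum of squares. A tiny NumPy check of that reference expression (hypothetical values, not part of the test):

import numpy as np

stride, n = 2, 3
# out[i] = sum of j**2 for 0 <= j <= stride*i
expected = np.cumsum(np.arange(stride * n) ** 2)[::stride]
print(expected)  # [ 0  5 30]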
Example #19
def test_scan_data_types(ctx_factory, dtype):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<=i }",
            "res[i] = reduce(sum, j, a[j])",
            assumptions="n>=1")

    a = np.random.randn(20).astype(dtype)
    knl = lp.add_dtypes(knl, dict(a=dtype))
    knl = lp.realize_reduction(knl, force_scan=True)
    evt, (res,) = knl(queue, a=a)

    assert np.allclose(res, np.cumsum(a))
Example #20
def test_local_parallel_scan(ctx_factory, n):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}",
        """
        out[i] = sum(j, a[j]**2)
        """,
        "..."
        )

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.tag_inames(knl, dict(i="l.0"))
    knl = lp.realize_reduction(knl, force_scan=True)

    knl = lp.realize_reduction(knl)

    knl = lp.add_dtypes(knl, dict(a=int))

    print(knl)

    evt, (a,) = knl(queue, a=np.arange(n))
    assert (a == np.cumsum(np.arange(n)**2)).all()
Example #21
def test_scan_library(ctx_factory, op_name, np_op):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel("{[i,j]: 0<=i<n and 0<=j<=i }",
                         "res[i] = reduce(%s, j, a[j])" % op_name,
                         assumptions="n>=1")

    a = np.random.randn(20)
    knl = lp.add_dtypes(knl, dict(a=np.float64))  # np.float (alias of float) was removed in NumPy 1.24
    knl = lp.realize_reduction(knl, force_scan=True)
    evt, (res, ) = knl(queue, a=a)

    assert np.allclose(res,
                       np.array([np_op(a[:i + 1]) for i in range(len(a))]))
Example #22
def test_empty_reduction(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        ["{[i]: 0<=i<20}", "[i] -> {[j]: 0<=j<0}"],
        "a[i] = sum(j, j)",
    )

    knl = lp.realize_reduction(knl)
    print(knl)

    knl = lp.set_options(knl, write_cl=True)
    evt, (a, ) = knl(queue)

    assert (a.get() == 0).all()
Example #23
def test_scan_library(ctx_factory, op_name, np_op):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<=i }",
            "res[i] = reduce(%s, j, a[j])" % op_name,
            assumptions="n>=1")

    a = np.random.randn(20)
    knl = lp.add_dtypes(knl, dict(a=np.float64))  # np.float (alias of float) was removed in NumPy 1.24
    knl = lp.realize_reduction(knl, force_scan=True)
    evt, (res,) = knl(queue, a=a)

    assert np.allclose(res, np.array(
            [np_op(a[:i+1]) for i in range(len(a))]))
Example #24
def test_parallel_multi_output_reduction(ctx_factory):
    knl = lp.make_kernel(
        "{[i]: 0<=i<128}", """
                max_val, max_indices = argmax(i, fabs(a[i]), i)
                """)
    knl = lp.tag_inames(knl, dict(i="l.0"))
    knl = lp.add_dtypes(knl, dict(a=np.float64))
    knl = lp.realize_reduction(knl)

    ctx = ctx_factory()

    with cl.CommandQueue(ctx) as queue:
        a = np.random.rand(128)
        out, (max_index, max_val) = knl(queue, a=a)

        assert max_val == np.max(a)
        assert max_index == np.argmax(np.abs(a))
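Note: with i tagged "l.0", realize_reduction lowers the two-output argmax into a work-group-parallel reduction that yields the maximum value and its index together. The first assert compares against np.max(a) rather than np.max(np.abs(a)); that only holds because a = np.random.rand(128) is nonnegative, so fabs(a[i]) equals a[i] here.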
Example #25
def test_dependent_domain_scan(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        [
            "[n] -> {[i]: 0<=i<n}",
            "{[j]: 0<=j<=2*i}"
        ],
        """
        a[i] = sum(j, j**2) {id=scan}
        """
        )
    knl = lp.realize_reduction(knl, force_scan=True)
    evt, (a,) = knl(queue, n=100)

    assert (a.get() == np.cumsum(np.arange(200)**2)[::2]).all()
Example #26
def test_sequential_scan(ctx_factory, n, stride):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n] -> {[i,j]: 0<=i<n and 0<=j<=%d*i}" % stride,
        """
        a[i] = sum(j, j**2)
        """
        )

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.realize_reduction(knl, force_scan=True)

    evt, (a,) = knl(queue)

    assert (a.get() == np.cumsum(np.arange(stride*n)**2)[::stride]).all()
Example #27
def test_parallel_multi_output_reduction(ctx_factory):
    knl = lp.make_kernel(
                "{[i]: 0<=i<128}",
                """
                max_val, max_indices = argmax(i, abs(a[i]), i)
                """)
    knl = lp.tag_inames(knl, dict(i="l.0"))
    knl = lp.add_dtypes(knl, dict(a=np.float64))
    knl = lp.realize_reduction(knl)

    ctx = ctx_factory()

    with cl.CommandQueue(ctx) as queue:
        a = np.random.rand(128)
        out, (max_index, max_val) = knl(queue, a=a)

        assert max_val == np.max(a)
        assert max_index == np.argmax(np.abs(a))
Example #28
def test_scan_with_outer_parallel_iname(ctx_factory, sweep_iname_tag):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        ["{[k]: 0<=k<=1}", "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}"],
        "out[k,i] = k + sum(j, j**2)")

    knl = lp.tag_inames(knl, dict(k="l.0", i=sweep_iname_tag))
    n = 10
    knl = lp.fix_parameters(knl, n=n)
    knl = lp.realize_reduction(knl, force_scan=True)

    evt, (out, ) = knl(queue)

    inner = np.cumsum(np.arange(n)**2)

    assert (out.get() == np.array([inner, 1 + inner])).all()
Example #29
def test_empty_reduction(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            [
                "{[i]: 0<=i<20}",
                "[i] -> {[j]: 0<=j<0}"
                ],
            "a[i] = sum(j, j)",
            )

    knl = lp.realize_reduction(knl)
    print(knl)

    knl = lp.set_options(knl, write_cl=True)
    evt, (a,) = knl(queue)

    assert (a.get() == 0).all()
Example #30
def test_scan_with_different_lower_bound_from_sweep(ctx_factory, sweep_lbound,
                                                    scan_lbound):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n, sweep_lbound, scan_lbound] -> "
        "{[i,j]: sweep_lbound<=i<n+sweep_lbound "
        "and scan_lbound<=j<=2*(i-sweep_lbound)+scan_lbound}", """
        out[i-sweep_lbound] = sum(j, j**2)
        """)

    n = 10

    knl = lp.fix_parameters(knl,
                            sweep_lbound=sweep_lbound,
                            scan_lbound=scan_lbound)
    knl = lp.realize_reduction(knl, force_scan=True)
    evt, (out, ) = knl(queue, n=n)

    assert (out.get() == np.cumsum(
        np.arange(scan_lbound, 2 * n + scan_lbound)**2)[::2]).all()
Example #31
def test_scan_with_outer_parallel_iname(ctx_factory, sweep_iname_tag):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        [
            "{[k]: 0<=k<=1}",
            "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}"
        ],
        "out[k,i] = k + sum(j, j**2)"
        )

    knl = lp.tag_inames(knl, dict(k="l.0", i=sweep_iname_tag))
    n = 10
    knl = lp.fix_parameters(knl, n=n)
    knl = lp.realize_reduction(knl, force_scan=True)

    evt, (out,) = knl(queue)

    inner = np.cumsum(np.arange(n)**2)

    assert (out.get() == np.array([inner, 1 + inner])).all()
Example #32
def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    arr = np.ones(n, dtype=np.float32)
    segment_boundaries = np.zeros(n, dtype=np.int32)
    segment_boundaries[(segment_boundaries_indices, )] = 1

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<=i}",
        "out[i], <>_ = reduce(segmented(sum), j, arr[j], segflag[j])", [
            lp.GlobalArg("arr", np.float32, shape=("n", )),
            lp.GlobalArg("segflag", np.int32, shape=("n", )), "..."
        ])

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.tag_inames(knl, dict(i=iname_tag))
    knl = lp.realize_reduction(knl, force_scan=True)

    (evt, (out, )) = knl(queue, arr=arr, segflag=segment_boundaries)

    check_segmented_scan_output(arr, segment_boundaries_indices, out)
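Note: reduce(segmented(sum), j, arr[j], segflag[j]) pairs each value with a flag; the intent is that the running sum restarts wherever the flag is set, so each segment gets its own prefix sum. The exact convention is verified by check_segmented_scan_output, which is defined elsewhere in the test module and not reproduced here.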
Example #33
def test_scan_with_different_lower_bound_from_sweep(
        ctx_factory, sweep_lbound, scan_lbound):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "[n, sweep_lbound, scan_lbound] -> "
        "{[i,j]: sweep_lbound<=i<n+sweep_lbound "
        "and scan_lbound<=j<=2*(i-sweep_lbound)+scan_lbound}",
        """
        out[i-sweep_lbound] = sum(j, j**2)
        """
        )

    n = 10

    knl = lp.fix_parameters(knl, sweep_lbound=sweep_lbound, scan_lbound=scan_lbound)
    knl = lp.realize_reduction(knl, force_scan=True)
    evt, (out,) = knl(queue, n=n)

    assert (out.get()
            == np.cumsum(np.arange(scan_lbound, 2*n+scan_lbound)**2)[::2]).all()
Example #34
def test_global_parallel_reduction(ctx_factory, size):
    ctx = ctx_factory()

    knl = lp.make_kernel(
        "{[i]: 0 <= i < n }", """
            # Using z[0] instead of z works around a bug in ancient PyOpenCL.
            z[0] = sum(i, a[i])
            """)

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0")
    knl = lp.split_reduction_outward(knl, "i_outer")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")

    knl = lp.precompute(knl,
                        "red_i_outer_arg",
                        "i_outer",
                        temporary_address_space=lp.AddressSpace.GLOBAL,
                        default_tag="l.auto")
    knl = lp.realize_reduction(knl)
    knl = lp.tag_inames(knl, "i_outer_0:g.0")

    # Keep the i_outer accumulator on the correct (lower) side of the barrier,
    # otherwise there will be useless save/reload code generated.
    knl = lp.add_dependency(knl, "writes:acc_i_outer",
                            "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(ref_knl,
                        ctx,
                        knl,
                        parameters={"n": size},
                        print_ref_code=True)
Example #35
def test_global_mc_parallel_reduction(ctx_factory, size):
    ctx = ctx_factory()

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    knl = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            for i
                <> key = make_uint2(i, 324830944)  {inames=i}
                <> ctr = make_uint4(0, 1, 2, 3)  {inames=i,id=init_ctr}
                <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}
            end
            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
            """)

    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
            temporary_scope=lp.temp_var_scope.GLOBAL,
            default_tag="l.auto")
    knl = lp.realize_reduction(knl)
    knl = lp.add_dependency(
            knl, "writes:acc_i_outer",
            "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl, parameters={"n": size})
Example #36
def test_argmax(ctx_factory, i_tag):
    logging.basicConfig(level=logging.INFO)

    dtype = np.dtype(np.float32)
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 128

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<%d and 0<=j<=i}" % n, """
            max_vals[i], max_indices[i] = argmax(j, fabs(a[j]), j)
            """)

    knl = lp.tag_inames(knl, dict(i=i_tag))
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    knl = lp.realize_reduction(knl, force_scan=True)

    a = np.random.randn(n).astype(dtype)
    evt, (max_indices, max_vals) = knl(queue, a=a, out_host=True)

    assert (max_vals == [np.max(np.abs(a)[0:i + 1]) for i in range(n)]).all()
    assert (max_indices == [np.argmax(np.abs(a[0:i + 1]))
                            for i in range(n)]).all()
Example #37
def test_argmax(ctx_factory, i_tag):
    logging.basicConfig(level=logging.INFO)

    dtype = np.dtype(np.float32)
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    n = 128

    knl = lp.make_kernel(
            "{[i,j]: 0<=i<%d and 0<=j<=i}" % n,
            """
            max_vals[i], max_indices[i] = argmax(j, abs(a[j]), j)
            """)

    knl = lp.tag_inames(knl, dict(i=i_tag))
    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    knl = lp.realize_reduction(knl, force_scan=True)

    a = np.random.randn(n).astype(dtype)
    evt, (max_indices, max_vals) = knl(queue, a=a, out_host=True)

    assert (max_vals == [np.max(np.abs(a)[0:i+1]) for i in range(n)]).all()
    assert (max_indices == [np.argmax(np.abs(a[0:i+1])) for i in range(n)]).all()
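Note: both copies of test_argmax compute, for every prefix a[0:i+1], the running maximum of |a| together with the index at which it occurs, exactly as the asserts restate in NumPy; the parametrized i_tag decides whether that scan sweep runs sequentially or on a parallel axis.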
Example #38
def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    arr = np.ones(n, dtype=np.float32)
    segment_boundaries = np.zeros(n, dtype=np.int32)
    segment_boundaries[(segment_boundaries_indices,)] = 1

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<=i}",
        "out[i], <>_ = reduce(segmented(sum), j, arr[j], segflag[j])",
        [
            lp.GlobalArg("arr", np.float32, shape=("n",)),
            lp.GlobalArg("segflag", np.int32, shape=("n",)),
            "..."
        ])

    knl = lp.fix_parameters(knl, n=n)
    knl = lp.tag_inames(knl, dict(i=iname_tag))
    knl = lp.realize_reduction(knl, force_scan=True)

    (evt, (out,)) = knl(queue, arr=arr, segflag=segment_boundaries)

    check_segmented_scan_output(arr, segment_boundaries_indices, out)
Example #39
def test_global_parallel_reduction(ctx_factory, size):
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            # Using z[0] instead of z works around a bug in ancient PyOpenCL.
            z[0] = sum(i, a[i])
            """)

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0")
    knl = lp.split_reduction_outward(knl, "i_outer")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")

    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
            temporary_scope=lp.temp_var_scope.GLOBAL,
            default_tag="l.auto")
    knl = lp.realize_reduction(knl)
    knl = lp.tag_inames(knl, "i_outer_0:g.0")

    # Keep the i_outer accumulator on the correct (lower) side of the barrier,
    # otherwise there will be useless save/reload code generated.
    knl = lp.add_dependency(
            knl, "writes:acc_i_outer",
            "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl, parameters={"n": size},
            print_ref_code=True)