Example #1
0
def test_global_parallel_reduction_simpler(ctx_factory, size):
    """Compare a transformed two-level sum reduction with its reference kernel.

    The kernel first sums over ``l`` into ``tmp[g]`` and then sums ``tmp``
    over ``j`` into ``result``.  The test calls ``pytest.xfail`` right after
    the context is created, so the statements following that call only
    record the intended check.

    :arg ctx_factory: callable returning the context handed to the comparison.
    :arg size: value used for the kernel parameter ``nl``.
    """
    ctx = ctx_factory()

    pytest.xfail("very sensitive to kernel ordering, fails unused hw-axis check")

    domain = "{[l,g,j]: 0 <= l < nl and 0 <= g,j < ng}"
    instructions = """
            <> key = make_uint2(l+nl*g, 1234)  {inames=l:g}
            <> ctr = make_uint4(0, 1, 2, 3)  {inames=l:g,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key)  {dep=init_ctr}

            <> tmp[g] = sum(l, vals.s0 + 1j*vals.s1 + vals.s2 + 1j*vals.s3)

            result = sum(j, tmp[j])
            """

    # Reference kernel: parameters fixed and options set, no transformations.
    reference = lp.make_kernel(domain, instructions)
    reference = lp.fix_parameters(reference, ng=50)
    reference = lp.set_options(reference, write_cl=True)

    # Transformed kernel: split "l", split the reduction over the inner part
    # outward, then tag the remaining inames.
    transformed = lp.split_iname(reference, "l", 128, inner_tag="l.0")
    transformed = lp.split_reduction_outward(transformed, "l_inner")
    transformed = lp.tag_inames(transformed, "g:g.0,j:l.0")

    lp.auto_test_vs_ref(
        reference, ctx, transformed, parameters={"nl": size})
Example #2
0
def test_split_reduction(ctx_factory):
    """Build a triple-sum kernel and split its reduction over ``j,k`` outward.

    ``ctx_factory`` is part of the signature but is not used in the body; the
    test only exercises kernel construction and the transformation call.
    """
    domain = "{[i,j,k]: 0<=i,j,k<n}"
    instructions = """
                b = sum((i,j,k), a[i,j,k])
                """
    global_args = lp.GlobalArg(
        "box_source_starts,box_source_counts_nonchild,a", None, shape=None)

    kernel = lp.make_kernel(domain, instructions, [global_args, "..."])
    kernel = lp.split_reduction_outward(kernel, "j,k")
Example #3
0
def test_split_reduction(ctx_factory):
    """Construct a kernel summing ``a[i,j,k]`` and split ``j,k`` outward.

    The ``ctx_factory`` argument is not referenced by the body.
    """
    kernel_data = [
        lp.GlobalArg(
            "box_source_starts,box_source_counts_nonchild,a",
            None,
            shape=None,
        ),
        "...",
    ]

    unsplit = lp.make_kernel(
        "{[i,j,k]: 0<=i,j,k<n}",
        """
                b = sum((i,j,k), a[i,j,k])
                """,
        kernel_data,
    )

    # The transformed kernel is not inspected further; the test passes as
    # long as the transformation call itself succeeds.
    split = lp.split_reduction_outward(unsplit, "j,k")
Example #4
0
def test_global_parallel_reduction(ctx_factory, size):
    """Compare a split, precomputed sum reduction against the plain kernel.

    The reference kernel sums ``a[i]`` over ``i`` into ``z[0]`` with ``a``
    declared as ``float32``.  The transformed kernel splits ``i`` twice,
    splits the reduction, precomputes the ``i_outer`` reduction argument into
    a temporary in the global address space, and is then checked against the
    reference.

    :arg ctx_factory: callable returning the context handed to the comparison.
    :arg size: value used for the kernel parameter ``n``.
    """
    from loopy.transform.data import reduction_arg_to_subst_rule

    ctx = ctx_factory()

    instructions = """
            # Using z[0] instead of z works around a bug in ancient PyOpenCL.
            z[0] = sum(i, a[i])
            """
    reference = lp.add_and_infer_dtypes(
        lp.make_kernel("{[i]: 0 <= i < n }", instructions),
        {"a": np.float32})

    group_size = 128

    # Two-level split of "i", followed by splitting the reduction itself.
    kernel = lp.split_iname(reference, "i", group_size * 20)
    kernel = lp.split_iname(kernel, "i_inner", group_size, inner_tag="l.0")
    kernel = lp.split_reduction_outward(kernel, "i_outer")
    kernel = lp.split_reduction_inward(kernel, "i_inner_outer")
    kernel = reduction_arg_to_subst_rule(kernel, "i_outer")

    kernel = lp.precompute(
        kernel,
        "red_i_outer_arg",
        "i_outer",
        temporary_address_space=lp.AddressSpace.GLOBAL,
        default_tag="l.auto",
    )
    kernel = lp.realize_reduction(kernel)
    kernel = lp.tag_inames(kernel, "i_outer_0:g.0")

    # Keep the i_outer accumulator on the correct (lower) side of the barrier,
    # otherwise there will be useless save/reload code generated.
    kernel = lp.add_dependency(
        kernel, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
        reference, ctx, kernel,
        parameters={"n": size},
        print_ref_code=True,
    )
Example #5
0
def test_global_parallel_reduction(ctx_factory, size):
    """Check a split, precomputed sum reduction against the untransformed kernel.

    The reference kernel sums ``a[i]`` over ``i`` into ``z[0]`` with ``a``
    declared as ``float32``.  The transformed kernel splits ``i`` twice,
    splits the reduction, and precomputes the ``i_outer`` reduction argument
    into a temporary in the global address space before being compared with
    the reference.

    :arg ctx_factory: callable returning the context handed to the comparison.
    :arg size: value used for the kernel parameter ``n``.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            # Using z[0] instead of z works around a bug in ancient PyOpenCL.
            z[0] = sum(i, a[i])
            """)

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0")
    knl = lp.split_reduction_outward(knl, "i_outer")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")

    # Use the temporary_address_space / AddressSpace spelling rather than the
    # deprecated temporary_scope / temp_var_scope names.
    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
            temporary_address_space=lp.AddressSpace.GLOBAL,
            default_tag="l.auto")
    knl = lp.realize_reduction(knl)
    knl = lp.tag_inames(knl, "i_outer_0:g.0")

    # Keep the i_outer accumulator on the correct (lower) side of the barrier,
    # otherwise there will be useless save/reload code generated.
    knl = lp.add_dependency(
            knl, "writes:acc_i_outer",
            "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl, parameters={"n": size},
            print_ref_code=True)
Example #6
0
        lp.GlobalArg("img,f", np.float32, shape=lp.auto),
        "..."
        ],
        
        defines=dict(ncolors=n_input_channels, 
                     nfeature_maps=n_output_channels, 
                     im_w=im_w, im_h=im_h, 
                     nimg=nimg,
                     f_w=f_w))

# Snapshot of the kernel before any of the transformations below are applied.
ref_knl = knl

##############################################################
# Play with things in here to optimize for speed

# loopy transformations return a new kernel instead of modifying their
# argument, so the result must be assigned back for the split to take effect.
knl = lp.split_reduction_outward(knl, "color")

## Order of loop variables
knl = lp.set_loop_priority(knl, [
                                 "im_x_outer",
                                 "im_y_outer",
                                 "im_x_inner",
                                 "im_y_inner",
                                 "n",
                                 "feat",
                                 "f_x",
                                 "f_y"])

## Split loop into inner/outer. 2nd arg is size of inner loop
knl = lp.split_iname(knl, "im_x", 16)
knl = lp.split_iname(knl, "im_y", 16)