def test_global_parallel_reduction_simpler(ctx_factory, size):
    """Compare a two-stage sum over Philox-generated values with a reference.

    The kernel first reduces over ``l`` into ``tmp[g]`` and then sums
    ``tmp[j]`` over ``j``.  The test currently bails out through
    ``pytest.xfail`` right after the context has been created, so the kernel
    construction below records the intended scenario but does not run.

    :arg ctx_factory: callable returning the context that is handed to
        ``lp.auto_test_vs_ref``.
    :arg size: value bound to the kernel parameter ``nl``.
    """
    ctx = ctx_factory()

    pytest.xfail("very sensitive to kernel ordering, fails unused hw-axis check")

    domain = "{[l,g,j]: 0 <= l < nl and 0 <= g,j < ng}"
    instructions = """
        <> key = make_uint2(l+nl*g, 1234) {inames=l:g}
        <> ctr = make_uint4(0, 1, 2, 3) {inames=l:g,id=init_ctr}
        <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr}
        <> tmp[g] = sum(l, vals.s0 + 1j*vals.s1 + vals.s2 + 1j*vals.s3)
        result = sum(j, tmp[j])
        """
    kernel = lp.make_kernel(domain, instructions)

    # Pin the extent shared by the 'g' and 'j' inames.
    num_groups = 50
    kernel = lp.fix_parameters(kernel, ng=num_groups)
    kernel = lp.set_options(kernel, write_cl=True)

    # Snapshot taken before any transformation; used as the reference.
    reference = kernel

    kernel = lp.split_iname(kernel, "l", 128, inner_tag="l.0")
    kernel = lp.split_reduction_outward(kernel, "l_inner")
    kernel = lp.tag_inames(kernel, "g:g.0,j:l.0")

    lp.auto_test_vs_ref(reference, ctx, kernel, parameters={"nl": size})
def test_split_reduction(ctx_factory):
    """Smoke-test ``lp.split_reduction_outward`` on a three-iname reduction.

    A kernel summing ``a[i,j,k]`` over ``i``, ``j`` and ``k`` is built and the
    ``j,k`` part of the reduction is split outward.  Nothing is asserted and
    *ctx_factory* is not used, so the test only checks that the
    transformation runs through without raising.
    """
    domain = "{[i,j,k]: 0<=i,j,k<n}"
    instructions = """
        b = sum((i,j,k), a[i,j,k])
        """
    kernel_data = [
        lp.GlobalArg(
            "box_source_starts,box_source_counts_nonchild,a",
            None,
            shape=None),
        "...",
    ]

    kernel = lp.make_kernel(domain, instructions, kernel_data)
    kernel = lp.split_reduction_outward(kernel, "j,k")
def test_split_reduction(ctx_factory):
    """Run ``lp.split_reduction_outward`` on a reduction over ``i, j, k``.

    The kernel computes ``b = sum((i,j,k), a[i,j,k])``; the reduction over
    ``j,k`` is then split outward.  The result is not inspected and the
    *ctx_factory* argument goes unused: the test passes as long as no
    exception is raised.
    """
    global_args = lp.GlobalArg(
        "box_source_starts,box_source_counts_nonchild,a", None, shape=None)

    summed = lp.make_kernel(
        "{[i,j,k]: 0<=i,j,k<n}",
        """
        b = sum((i,j,k), a[i,j,k])
        """,
        [global_args, "..."])

    summed = lp.split_reduction_outward(summed, "j,k")
def test_global_parallel_reduction(ctx_factory, size):
    """Check a split, precomputed sum of a float32 array against a reference.

    The reduction ``z[0] = sum(i, a[i])`` is split twice, its ``i_outer``
    argument is turned into a substitution rule and precomputed into a
    temporary in ``lp.AddressSpace.GLOBAL``, and the transformed kernel is
    compared with the untransformed one via ``lp.auto_test_vs_ref``.

    :arg ctx_factory: callable returning the context that is handed to
        ``lp.auto_test_vs_ref``.
    :arg size: value bound to the kernel parameter ``n``.
    """
    from loopy.transform.data import reduction_arg_to_subst_rule

    ctx = ctx_factory()

    domain = "{[i]: 0 <= i < n }"
    instructions = """
        # Using z[0] instead of z works around a bug in ancient PyOpenCL.
        z[0] = sum(i, a[i])
        """
    kernel = lp.make_kernel(domain, instructions)
    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.float32})

    # Snapshot taken before any transformation; used as the reference.
    reference = kernel

    group_size = 128
    chunk_size = group_size * 20

    kernel = lp.split_iname(kernel, "i", chunk_size)
    kernel = lp.split_iname(kernel, "i_inner", group_size, inner_tag="l.0")
    kernel = lp.split_reduction_outward(kernel, "i_outer")
    kernel = lp.split_reduction_inward(kernel, "i_inner_outer")

    kernel = reduction_arg_to_subst_rule(kernel, "i_outer")
    kernel = lp.precompute(
        kernel,
        "red_i_outer_arg",
        "i_outer",
        temporary_address_space=lp.AddressSpace.GLOBAL,
        default_tag="l.auto")
    kernel = lp.realize_reduction(kernel)
    kernel = lp.tag_inames(kernel, "i_outer_0:g.0")

    # Keep the i_outer accumulator on the correct (lower) side of the barrier,
    # otherwise there will be useless save/reload code generated.
    kernel = lp.add_dependency(
        kernel, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
        reference,
        ctx,
        kernel,
        parameters={"n": size},
        print_ref_code=True)
def test_global_parallel_reduction(ctx_factory, size):
    """Check a split, precomputed sum of a float32 array against a reference.

    The reduction ``z[0] = sum(i, a[i])`` is split twice, its ``i_outer``
    argument is turned into a substitution rule and precomputed into a
    global temporary, and the transformed kernel is compared with the
    untransformed one via ``lp.auto_test_vs_ref``.

    :arg ctx_factory: callable returning the context that is handed to
        ``lp.auto_test_vs_ref``.
    :arg size: value bound to the kernel parameter ``n``.
    """
    ctx = ctx_factory()

    knl = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            # Using z[0] instead of z works around a bug in ancient PyOpenCL.
            z[0] = sum(i, a[i])
            """)

    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})

    # Untransformed kernel, kept as the reference for the comparison below.
    ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0")
    knl = lp.split_reduction_outward(knl, "i_outer")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")

    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")

    # 'temporary_address_space' / 'lp.AddressSpace' supersede the older
    # 'temporary_scope' / 'lp.temp_var_scope' spelling.
    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
            temporary_address_space=lp.AddressSpace.GLOBAL,
            default_tag="l.auto")
    knl = lp.realize_reduction(knl)
    knl = lp.tag_inames(knl, "i_outer_0:g.0")

    # Keep the i_outer accumulator on the correct (lower) side of the barrier,
    # otherwise there will be useless save/reload code generated.
    knl = lp.add_dependency(
            knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
            ref_knl, ctx, knl, parameters={"n": size},
            print_ref_code=True)
lp.GlobalArg("img,f", np.float32, shape=lp.auto), "..." ], defines=dict(ncolors=n_input_channels, nfeature_maps=n_output_channels, im_w=im_w, im_h=im_h, nimg=nimg, f_w=f_w)) ref_knl = knl ############################################################## # Play with things in here to optimize for speed lp.split_reduction_outward(knl, "color") ## Order of loop variables knl = lp.set_loop_priority(knl, [ "im_x_outer", "im_y_outer", "im_x_inner", "im_y_inner", "n", "feat", "f_x", "f_y"]) ## Split loop into innner/outer. 2nd arg is size of inner loop knl = lp.split_iname(knl, "im_x", 16) knl = lp.split_iname(knl, "im_y", 16)