def test_force_outer_iname_for_scan():
    """Check that forcing a scan here needs an explicit outer iname.

    A forced scan without ``force_outer_iname_for_scan`` is expected to raise
    ``ReductionIsNotTriangularError``; the same call with ``"i"`` named as the
    outer iname is expected to complete.
    """
    kernel = lp.make_kernel(
        "[n] -> {[i,j,k]: 0<=k<n and 0<=i<=k and 0<=j<=i}",
        "out[i] = product(j, a[j]) {inames=i:k}")
    kernel = lp.add_dtypes(kernel, {"a": np.float32})

    # TODO: Maybe this deserves to work?
    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
        lp.realize_reduction(kernel, force_scan=True)

    # With the outer iname spelled out the realization must not raise.
    lp.realize_reduction(
        kernel, force_scan=True, force_outer_iname_for_scan="i")
def no_test_global_parallel_reduction(ctx_factory, size):
    """Work-in-progress variant of the global parallel reduction test.

    Builds a kernel that sums Philox random values over ``i``, splits the
    iname and the reduction, precomputes the ``i_outer`` reduction argument
    and prints the resulting kernel.

    NOTE(review): the ``no_test_`` prefix presumably keeps pytest's default
    ``test_*`` collection from running this -- confirm that is intended.
    As written the function cannot finish: ``1/0`` raises right after the
    kernel is printed, so the statements after it are unreachable.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            "{[i]: 0 <= i < n }",
            """
            <> key = make_uint2(i, 324830944) {inames=i}
            <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr}
            z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
            """)

    # ref_knl = knl

    gsize = 128
    knl = lp.split_iname(knl, "i", gsize * 20)
    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
    knl = lp.split_reduction_inward(knl, "i_inner_inner")
    knl = lp.split_reduction_inward(knl, "i_inner_outer")
    from loopy.transform.data import reduction_arg_to_subst_rule
    knl = reduction_arg_to_subst_rule(knl, "i_outer")
    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer")
    print(knl)
    # Raises ZeroDivisionError here; looks like a debugging stop placed after
    # the kernel dump -- TODO confirm before re-enabling this test.
    1/0
    knl = lp.realize_reduction(knl)

    evt, (z,) = knl(queue, n=size)
def test_global_mc_parallel_reduction(ctx_factory, size):
    """Compare a transformed Philox-based sum against the untransformed kernel.

    The test is skipped when the installed PyOpenCL is older than 2016.2.
    """
    context = ctx_factory()

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    from loopy.transform.data import reduction_arg_to_subst_rule

    reference = lp.make_kernel(
        "{[i]: 0 <= i < n }",
        """
        for i
            <> key = make_uint2(i, 324830944) {inames=i}
            <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr}
        end
        z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
        """)

    group_size = 128
    kernel = lp.split_iname(reference, "i", group_size * 20)
    kernel = lp.split_iname(kernel, "i_inner", group_size, outer_tag="l.0")
    for reduction_iname in ("i_inner_inner", "i_inner_outer"):
        kernel = lp.split_reduction_inward(kernel, reduction_iname)

    kernel = reduction_arg_to_subst_rule(kernel, "i_outer")
    kernel = lp.precompute(
        kernel, "red_i_outer_arg", "i_outer",
        temporary_scope=lp.temp_var_scope.GLOBAL)
    kernel = lp.realize_reduction(kernel)
    kernel = lp.add_dependency(
        kernel, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(reference, context, kernel, parameters={"n": size})
def test_nested_scan(ctx_factory, i_tag, j_tag):
    """Realize two chained reductions as scans, run them and print the output.

    No numerical assertion is made; the test only checks that realization
    and execution complete for the given iname tags.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    domains = [
        "[n] -> {[i]: 0 <= i < n}",
        "[i] -> {[j]: 0 <= j <= i}",
        "[i] -> {[k]: 0 <= k <= i}",
    ]
    kernel = lp.make_kernel(
        domains,
        """
        <>tmp[i] = sum(k, 1)
        out[i] = sum(j, tmp[j])
        """)

    kernel = lp.fix_parameters(kernel, n=10)
    kernel = lp.tag_inames(kernel, {"i": i_tag, "j": j_tag})
    kernel = lp.realize_reduction(kernel, force_scan=True)
    print(kernel)

    _, (out,) = kernel(queue)
    print(out)
def test_scan_not_triangular():
    """A forced scan over the ``1<=j<=2*i`` domain must be rejected."""
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i<100 and 1<=j<=2*i}",
        """
        a[i] = sum(j, j**2)
        """)

    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
        lp.realize_reduction(kernel, force_scan=True)
def test_global_parallel_reduction(ctx_factory, size):
    """Compare a split, precomputed global reduction against its reference."""
    from loopy.transform.data import reduction_arg_to_subst_rule

    context = ctx_factory()

    reference = lp.make_kernel(
        "{[i]: 0 <= i < n }",
        """
        # Using z[0] instead of z works around a bug in ancient PyOpenCL.
        z[0] = sum(i, i/13)
        """)

    group_size = 128
    kernel = lp.split_iname(reference, "i", group_size * 20)
    kernel = lp.split_iname(kernel, "i_inner", group_size, outer_tag="l.0")
    for reduction_iname in ("i_inner_inner", "i_inner_outer"):
        kernel = lp.split_reduction_inward(kernel, reduction_iname)

    kernel = reduction_arg_to_subst_rule(kernel, "i_outer")
    kernel = lp.precompute(
        kernel, "red_i_outer_arg", "i_outer",
        temporary_scope=lp.temp_var_scope.GLOBAL,
        default_tag="l.auto")
    kernel = lp.realize_reduction(kernel)
    kernel = lp.add_dependency(
        kernel, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
        reference, context, kernel,
        parameters={"n": size},
        print_ref_code=True)
def test_parallel_multi_output_reduction():
    """Realize a two-output ``argmax`` reduction over an ``l.0`` iname.

    Only realization is exercised; the resulting kernel is printed.
    """
    kernel = lp.make_kernel(
        "{[i]: 0<=i<128}",
        """
        max_val, max_indices = argmax(i, fabs(a[i]))
        """)
    kernel = lp.tag_inames(kernel, {"i": "l.0"})
    kernel = lp.realize_reduction(kernel)
    print(kernel)
def test_scan_extra_constraints_on_domain():
    """A forced scan must be rejected when the domain also constrains ``i=k``."""
    kernel = lp.make_kernel(
        "{[i,j,k]: 0<=i<n and 0<=j<=i and i=k}",
        "out[i] = sum(j, a[j])")

    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
        lp.realize_reduction(
            kernel, force_scan=True, force_outer_iname_for_scan="i")
def test_scan_extra_constraints_on_domain():
    """Expect ``ReductionIsNotTriangularError`` for a domain with ``i=k``."""
    domain = "{[i,j,k]: 0<=i<n and 0<=j<=i and i=k}"
    instruction = "out[i] = sum(j, a[j])"
    kernel = lp.make_kernel(domain, instruction)

    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
        lp.realize_reduction(
            kernel,
            force_scan=True,
            force_outer_iname_for_scan="i")
def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory):
    """Run a local-parallel scan whose sweep iname starts at 1.

    The result is compared against ``np.cumsum`` of the squared input.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    kernel = lp.make_kernel(
        "[n] -> {[i,j]: 1<=i<n+1 and 0<=j<=i-1}",
        """
        out[i-1] = sum(j, a[j]**2)
        """,
        "...")

    kernel = lp.fix_parameters(kernel, n=16)
    kernel = lp.tag_inames(kernel, {"i": "l.0"})
    kernel = lp.realize_reduction(kernel, force_scan=True)
    kernel = lp.realize_reduction(kernel)
    kernel = lp.add_dtypes(kernel, {"a": int})

    values = np.arange(1, 17)
    _, (out,) = kernel(queue, a=values)

    expected = np.cumsum(values**2)
    assert (out == expected).all()
def test_scan_not_triangular():
    """Expect ``ReductionIsNotTriangularError`` when forcing this scan."""
    domain = "{[i,j]: 0<=i<100 and 1<=j<=2*i}"
    kernel = lp.make_kernel(
        domain,
        """
        a[i] = sum(j, j**2)
        """)

    with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError):
        lp.realize_reduction(kernel, force_scan=True)
def test_local_parallel_scan(ctx_factory, n):
    """Run a local-parallel scan of ``a[j]**2`` and compare with ``np.cumsum``."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    kernel = lp.make_kernel(
        "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}",
        """
        out[i] = sum(j, a[j]**2)
        """,
        "...")

    kernel = lp.fix_parameters(kernel, n=n)
    kernel = lp.tag_inames(kernel, {"i": "l.0"})
    kernel = lp.realize_reduction(kernel, force_scan=True)
    kernel = lp.realize_reduction(kernel)
    kernel = lp.add_dtypes(kernel, {"a": int})
    print(kernel)

    values = np.arange(n)
    _, (out,) = kernel(queue, a=values)

    expected = np.cumsum(values**2)
    assert (out == expected).all()
def test_dependent_domain_scan(ctx_factory):
    """Scan over a ``j`` domain declared separately from the ``i`` domain.

    The output is compared with every second entry of the cumulative sum of
    squares of ``0..199``.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    domains = [
        "[n] -> {[i]: 0<=i<n}",
        "{[j]: 0<=j<=2*i}",
    ]
    kernel = lp.make_kernel(
        domains,
        """
        a[i] = sum(j, j**2) {id=scan}
        """)
    kernel = lp.realize_reduction(kernel, force_scan=True)

    _, (result,) = kernel(queue, n=100)

    expected = np.cumsum(np.arange(200)**2)[::2]
    assert (result.get() == expected).all()
def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory):
    """Local-parallel scan with ``1<=i<n+1``, checked against ``np.cumsum``."""
    queue = cl.CommandQueue(ctx_factory())

    domain = "[n] -> {[i,j]: 1<=i<n+1 and 0<=j<=i-1}"
    kernel = lp.make_kernel(
        domain,
        """
        out[i-1] = sum(j, a[j]**2)
        """,
        "...")

    kernel = lp.fix_parameters(kernel, n=16)
    kernel = lp.tag_inames(kernel, {"i": "l.0"})
    kernel = lp.realize_reduction(kernel, force_scan=True)
    kernel = lp.realize_reduction(kernel)
    kernel = lp.add_dtypes(kernel, {"a": int})

    data = np.arange(1, 17)
    _, (out,) = kernel(queue, a=data)

    assert (out == np.cumsum(data**2)).all()
def test_scan_data_types(ctx_factory, dtype):
    """Run a sum scan on 20 random values of *dtype*; compare with ``np.cumsum``."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    kernel = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<=i }",
        "res[i] = reduce(sum, j, a[j])",
        assumptions="n>=1")

    data = np.random.randn(20).astype(dtype)
    kernel = lp.add_dtypes(kernel, {"a": dtype})
    kernel = lp.realize_reduction(kernel, force_scan=True)

    _, (res,) = kernel(queue, a=data)

    expected = np.cumsum(data)
    assert np.allclose(res, expected)
def test_sequential_scan(ctx_factory, n, stride):
    """Sequential scan of ``j**2`` with upper bound ``stride*i``.

    The output is compared with every ``stride``-th entry of the cumulative
    sum of squares of ``0..stride*n-1``.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    domain = "[n] -> {[i,j]: 0<=i<n and 0<=j<=%d*i}" % stride
    kernel = lp.make_kernel(
        domain,
        """
        a[i] = sum(j, j**2)
        """)

    kernel = lp.fix_parameters(kernel, n=n)
    kernel = lp.realize_reduction(kernel, force_scan=True)

    _, (result,) = kernel(queue)

    expected = np.cumsum(np.arange(stride * n)**2)[::stride]
    assert (result.get() == expected).all()
def test_scan_data_types(ctx_factory, dtype):
    """Sum scan over random input cast to *dtype*, checked with ``np.cumsum``."""
    queue = cl.CommandQueue(ctx_factory())

    domain = "{[i,j]: 0<=i<n and 0<=j<=i }"
    kernel = lp.make_kernel(
        domain,
        "res[i] = reduce(sum, j, a[j])",
        assumptions="n>=1")

    samples = np.random.randn(20).astype(dtype)
    kernel = lp.add_dtypes(kernel, {"a": dtype})
    kernel = lp.realize_reduction(kernel, force_scan=True)

    _, (res,) = kernel(queue, a=samples)

    assert np.allclose(res, np.cumsum(samples))
def test_local_parallel_scan(ctx_factory, n):
    """Local-parallel scan of squared input, checked against ``np.cumsum``."""
    queue = cl.CommandQueue(ctx_factory())

    domain = "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}"
    kernel = lp.make_kernel(
        domain,
        """
        out[i] = sum(j, a[j]**2)
        """,
        "...")

    kernel = lp.fix_parameters(kernel, n=n)
    kernel = lp.tag_inames(kernel, {"i": "l.0"})
    kernel = lp.realize_reduction(kernel, force_scan=True)
    kernel = lp.realize_reduction(kernel)
    kernel = lp.add_dtypes(kernel, {"a": int})
    print(kernel)

    data = np.arange(n)
    _, (scanned,) = kernel(queue, a=data)

    assert (scanned == np.cumsum(data**2)).all()
def test_scan_library(ctx_factory, op_name, np_op):
    """Run the library reduction *op_name* as a scan and check every prefix.

    :arg op_name: name of the reduction operation substituted into the
        ``reduce(...)`` instruction.
    :arg np_op: callable applied to each prefix ``a[:i+1]`` to build the
        expected result.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<=i }",
        "res[i] = reduce(%s, j, a[j])" % op_name,
        assumptions="n>=1")

    a = np.random.randn(20)

    # np.float64 is spelled out: the bare ``np.float`` alias was deprecated in
    # NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    knl = lp.add_dtypes(knl, dict(a=np.float64))
    knl = lp.realize_reduction(knl, force_scan=True)

    evt, (res,) = knl(queue, a=a)

    # Entry i of the scan must equal the reduction of the first i+1 inputs.
    expected = np.array([np_op(a[:i + 1]) for i in range(len(a))])
    assert np.allclose(res, expected)
def test_empty_reduction(ctx_factory):
    """A sum over the empty domain ``0<=j<0`` must yield zero everywhere."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    domains = ["{[i]: 0<=i<20}", "[i] -> {[j]: 0<=j<0}"]
    kernel = lp.make_kernel(domains, "a[i] = sum(j, j)")

    kernel = lp.realize_reduction(kernel)
    print(kernel)

    kernel = lp.set_options(kernel, write_cl=True)
    _, (result,) = kernel(queue)

    assert (result.get() == 0).all()
def test_scan_library(ctx_factory, op_name, np_op):
    """Check a scan built from library reduction *op_name* against NumPy.

    :arg op_name: name of the reduction operation substituted into the
        ``reduce(...)`` instruction.
    :arg np_op: callable applied to each prefix ``a[:i+1]`` to build the
        expected result.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<=i }",
            "res[i] = reduce(%s, j, a[j])" % op_name,
            assumptions="n>=1")

    a = np.random.randn(20)

    # Use np.float64 rather than the ``np.float`` alias, which was deprecated
    # in NumPy 1.20 and removed in 1.24 (accessing it raises AttributeError).
    knl = lp.add_dtypes(knl, dict(a=np.float64))
    knl = lp.realize_reduction(knl, force_scan=True)

    evt, (res,) = knl(queue, a=a)

    # Each scan entry is compared with the reduction of the matching prefix.
    assert np.allclose(res, np.array(
            [np_op(a[:i+1]) for i in range(len(a))]))
def test_parallel_multi_output_reduction(ctx_factory):
    """Run a parallel ``argmax`` over 128 random values and check both outputs."""
    kernel = lp.make_kernel(
        "{[i]: 0<=i<128}",
        """
        max_val, max_indices = argmax(i, fabs(a[i]), i)
        """)
    kernel = lp.tag_inames(kernel, {"i": "l.0"})
    kernel = lp.add_dtypes(kernel, {"a": np.float64})
    kernel = lp.realize_reduction(kernel)

    context = ctx_factory()

    with cl.CommandQueue(context) as queue:
        data = np.random.rand(128)
        _, (max_index, max_val) = kernel(queue, a=data)

        assert max_val == np.max(data)
        assert max_index == np.argmax(np.abs(data))
def test_dependent_domain_scan(ctx_factory):
    """Forced scan where ``j`` lives in its own domain bounded by ``2*i``."""
    queue = cl.CommandQueue(ctx_factory())

    kernel = lp.make_kernel(
        ["[n] -> {[i]: 0<=i<n}", "{[j]: 0<=j<=2*i}"],
        """
        a[i] = sum(j, j**2) {id=scan}
        """)
    kernel = lp.realize_reduction(kernel, force_scan=True)

    _, (scanned,) = kernel(queue, n=100)

    squares = np.arange(200)**2
    assert (scanned.get() == np.cumsum(squares)[::2]).all()
def test_sequential_scan(ctx_factory, n, stride):
    """Forced sequential scan with a strided upper bound, checked via NumPy."""
    queue = cl.CommandQueue(ctx_factory())

    kernel = lp.make_kernel(
        "[n] -> {[i,j]: 0<=i<n and 0<=j<=%d*i}" % stride,
        """
        a[i] = sum(j, j**2)
        """)
    kernel = lp.fix_parameters(kernel, n=n)
    kernel = lp.realize_reduction(kernel, force_scan=True)

    _, (scanned,) = kernel(queue)

    squares = np.arange(stride*n)**2
    assert (scanned.get() == np.cumsum(squares)[::stride]).all()
def test_parallel_multi_output_reduction(ctx_factory):
    """Parallel ``argmax`` of ``abs(a[i])`` over 128 random values."""
    instruction = """
        max_val, max_indices = argmax(i, abs(a[i]), i)
        """
    kernel = lp.make_kernel("{[i]: 0<=i<128}", instruction)
    kernel = lp.tag_inames(kernel, {"i": "l.0"})
    kernel = lp.add_dtypes(kernel, {"a": np.float64})
    kernel = lp.realize_reduction(kernel)

    context = ctx_factory()

    with cl.CommandQueue(context) as queue:
        samples = np.random.rand(128)
        _, (max_index, max_val) = kernel(queue, a=samples)

        assert max_val == np.max(samples)
        assert max_index == np.argmax(np.abs(samples))
def test_scan_with_outer_parallel_iname(ctx_factory, sweep_iname_tag):
    """Scan inside an ``l.0``-tagged outer iname ``k``.

    Row ``k`` of the output is expected to be ``k`` plus the cumulative sum
    of squares of ``0..n-1``.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    domains = [
        "{[k]: 0<=k<=1}",
        "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}",
    ]
    kernel = lp.make_kernel(domains, "out[k,i] = k + sum(j, j**2)")
    kernel = lp.tag_inames(kernel, {"k": "l.0", "i": sweep_iname_tag})

    n = 10
    kernel = lp.fix_parameters(kernel, n=n)
    kernel = lp.realize_reduction(kernel, force_scan=True)

    _, (out,) = kernel(queue)

    inner = np.cumsum(np.arange(n)**2)
    expected = np.array([inner, 1 + inner])
    assert (out.get() == expected).all()
def test_empty_reduction(ctx_factory):
    """Sum over an empty ``j`` domain; every output entry must be zero."""
    queue = cl.CommandQueue(ctx_factory())

    kernel = lp.make_kernel(
        [
            "{[i]: 0<=i<20}",
            "[i] -> {[j]: 0<=j<0}",
        ],
        "a[i] = sum(j, j)")

    kernel = lp.realize_reduction(kernel)
    print(kernel)

    kernel = lp.set_options(kernel, write_cl=True)
    _, (summed,) = kernel(queue)

    assert (summed.get() == 0).all()
def test_scan_with_different_lower_bound_from_sweep(ctx_factory, sweep_lbound,
                                                    scan_lbound):
    """Scan whose sweep and scan inames start at different lower bounds.

    The output is compared with every second entry of the cumulative sum of
    squares of ``scan_lbound .. 2*n+scan_lbound-1``.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    domain = (
        "[n, sweep_lbound, scan_lbound] -> "
        "{[i,j]: sweep_lbound<=i<n+sweep_lbound "
        "and scan_lbound<=j<=2*(i-sweep_lbound)+scan_lbound}")
    kernel = lp.make_kernel(
        domain,
        """
        out[i-sweep_lbound] = sum(j, j**2)
        """)

    n = 10
    kernel = lp.fix_parameters(
        kernel, sweep_lbound=sweep_lbound, scan_lbound=scan_lbound)
    kernel = lp.realize_reduction(kernel, force_scan=True)

    _, (out,) = kernel(queue, n=n)

    scanned_range = np.arange(scan_lbound, 2 * n + scan_lbound)
    expected = np.cumsum(scanned_range**2)[::2]
    assert (out.get() == expected).all()
def test_scan_with_outer_parallel_iname(ctx_factory, sweep_iname_tag):
    """Forced scan nested under a parallel ``k`` iname, checked via NumPy."""
    queue = cl.CommandQueue(ctx_factory())

    kernel = lp.make_kernel(
        ["{[k]: 0<=k<=1}", "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}"],
        "out[k,i] = k + sum(j, j**2)")
    kernel = lp.tag_inames(kernel, {"k": "l.0", "i": sweep_iname_tag})

    n = 10
    kernel = lp.fix_parameters(kernel, n=n)
    kernel = lp.realize_reduction(kernel, force_scan=True)

    _, (result,) = kernel(queue)

    prefix_sums = np.cumsum(np.arange(n)**2)
    assert (result.get() == np.array([prefix_sums, 1 + prefix_sums])).all()
def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag):
    """Run a segmented sum scan over an array of ones.

    Segment flags are set to 1 at *segment_boundaries_indices*; the output is
    validated by ``check_segmented_scan_output``.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    arr = np.ones(n, dtype=np.float32)
    segment_flags = np.zeros(n, dtype=np.int32)
    segment_flags[(segment_boundaries_indices, )] = 1

    kernel_args = [
        lp.GlobalArg("arr", np.float32, shape=("n", )),
        lp.GlobalArg("segflag", np.int32, shape=("n", )),
        "...",
    ]
    kernel = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<=i}",
        "out[i], <>_ = reduce(segmented(sum), j, arr[j], segflag[j])",
        kernel_args)

    kernel = lp.fix_parameters(kernel, n=n)
    kernel = lp.tag_inames(kernel, {"i": iname_tag})
    kernel = lp.realize_reduction(kernel, force_scan=True)

    _, (out,) = kernel(queue, arr=arr, segflag=segment_flags)

    check_segmented_scan_output(arr, segment_boundaries_indices, out)
def test_scan_with_different_lower_bound_from_sweep(
        ctx_factory, sweep_lbound, scan_lbound):
    """Forced scan with distinct sweep/scan lower bounds, checked via NumPy."""
    queue = cl.CommandQueue(ctx_factory())

    kernel = lp.make_kernel(
        "[n, sweep_lbound, scan_lbound] -> "
        "{[i,j]: sweep_lbound<=i<n+sweep_lbound "
        "and scan_lbound<=j<=2*(i-sweep_lbound)+scan_lbound}",
        """
        out[i-sweep_lbound] = sum(j, j**2)
        """)

    n = 10
    fixed = {"sweep_lbound": sweep_lbound, "scan_lbound": scan_lbound}
    kernel = lp.fix_parameters(kernel, **fixed)
    kernel = lp.realize_reduction(kernel, force_scan=True)

    _, (result,) = kernel(queue, n=n)

    squares = np.arange(scan_lbound, 2*n+scan_lbound)**2
    assert (result.get() == np.cumsum(squares)[::2]).all()
def test_global_parallel_reduction(ctx_factory, size):
    """Compare a two-stage global reduction of ``a`` with the plain kernel."""
    from loopy.transform.data import reduction_arg_to_subst_rule

    context = ctx_factory()

    reference = lp.make_kernel(
        "{[i]: 0 <= i < n }",
        """
        # Using z[0] instead of z works around a bug in ancient PyOpenCL.
        z[0] = sum(i, a[i])
        """)
    reference = lp.add_and_infer_dtypes(reference, {"a": np.float32})

    group_size = 128
    kernel = lp.split_iname(reference, "i", group_size * 20)
    kernel = lp.split_iname(kernel, "i_inner", group_size, inner_tag="l.0")
    kernel = lp.split_reduction_outward(kernel, "i_outer")
    kernel = lp.split_reduction_inward(kernel, "i_inner_outer")

    kernel = reduction_arg_to_subst_rule(kernel, "i_outer")
    kernel = lp.precompute(
        kernel, "red_i_outer_arg", "i_outer",
        temporary_address_space=lp.AddressSpace.GLOBAL,
        default_tag="l.auto")
    kernel = lp.realize_reduction(kernel)
    kernel = lp.tag_inames(kernel, "i_outer_0:g.0")

    # Keep the i_outer accumulator on the correct (lower) side of the barrier,
    # otherwise there will be useless save/reload code generated.
    kernel = lp.add_dependency(
        kernel, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
        reference, context, kernel,
        parameters={"n": size},
        print_ref_code=True)
def test_global_mc_parallel_reduction(ctx_factory, size):
    """Check a transformed sum of Philox random values against its reference.

    Skipped when the installed PyOpenCL version is below 2016.2.
    """
    context = ctx_factory()

    import pyopencl.version  # noqa
    if cl.version.VERSION < (2016, 2):
        pytest.skip("Random123 RNG not supported in PyOpenCL < 2016.2")

    from loopy.transform.data import reduction_arg_to_subst_rule

    reference = lp.make_kernel(
        "{[i]: 0 <= i < n }",
        """
        for i
            <> key = make_uint2(i, 324830944) {inames=i}
            <> ctr = make_uint4(0, 1, 2, 3) {inames=i,id=init_ctr}
            <> vals, ctr = philox4x32_f32(ctr, key) {dep=init_ctr}
        end
        z = sum(i, vals.s0 + vals.s1 + vals.s2 + vals.s3)
        """)

    group_size = 128
    kernel = lp.split_iname(reference, "i", group_size * 20)
    kernel = lp.split_iname(kernel, "i_inner", group_size, outer_tag="l.0")
    for reduction_iname in ("i_inner_inner", "i_inner_outer"):
        kernel = lp.split_reduction_inward(kernel, reduction_iname)

    kernel = reduction_arg_to_subst_rule(kernel, "i_outer")
    kernel = lp.precompute(
        kernel, "red_i_outer_arg", "i_outer",
        temporary_scope=lp.temp_var_scope.GLOBAL,
        default_tag="l.auto")
    kernel = lp.realize_reduction(kernel)
    kernel = lp.add_dependency(
        kernel, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(reference, context, kernel, parameters={"n": size})
def test_argmax(ctx_factory, i_tag):
    """Run ``argmax`` of ``fabs(a[j])`` as a scan and check every prefix.

    Entry ``i`` of each output is compared with the NumPy max / argmax of the
    absolute values of ``a[0:i+1]``.
    """
    logging.basicConfig(level=logging.INFO)

    dtype = np.dtype(np.float32)
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    n = 128

    kernel = lp.make_kernel(
        "{[i,j]: 0<=i<%d and 0<=j<=i}" % n,
        """
        max_vals[i], max_indices[i] = argmax(j, fabs(a[j]), j)
        """)

    kernel = lp.tag_inames(kernel, {"i": i_tag})
    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.float32})
    kernel = lp.realize_reduction(kernel, force_scan=True)

    a = np.random.randn(n).astype(dtype)
    _, (max_indices, max_vals) = kernel(queue, a=a, out_host=True)

    expected_vals = [np.max(np.abs(a)[0:i + 1]) for i in range(n)]
    expected_indices = [np.argmax(np.abs(a[0:i + 1])) for i in range(n)]

    assert (max_vals == expected_vals).all()
    assert (max_indices == expected_indices).all()
def test_argmax(ctx_factory, i_tag):
    """Scan-style ``argmax`` of ``abs(a[j])``, checked prefix by prefix."""
    logging.basicConfig(level=logging.INFO)

    dtype = np.dtype(np.float32)
    queue = cl.CommandQueue(ctx_factory())

    n = 128
    domain = "{[i,j]: 0<=i<%d and 0<=j<=i}" % n

    kernel = lp.make_kernel(
        domain,
        """
        max_vals[i], max_indices[i] = argmax(j, abs(a[j]), j)
        """)

    kernel = lp.tag_inames(kernel, {"i": i_tag})
    kernel = lp.add_and_infer_dtypes(kernel, {"a": np.float32})
    kernel = lp.realize_reduction(kernel, force_scan=True)

    a = np.random.randn(n).astype(dtype)
    _, (max_indices, max_vals) = kernel(queue, a=a, out_host=True)

    want_vals = [np.max(np.abs(a)[0:i+1]) for i in range(n)]
    want_indices = [np.argmax(np.abs(a[0:i+1])) for i in range(n)]

    assert (max_vals == want_vals).all()
    assert (max_indices == want_indices).all()
def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag):
    """Segmented sum scan over ones, validated by ``check_segmented_scan_output``."""
    queue = cl.CommandQueue(ctx_factory())

    arr = np.ones(n, dtype=np.float32)
    flags = np.zeros(n, dtype=np.int32)
    # Mark the start of each segment.
    flags[(segment_boundaries_indices,)] = 1

    kernel = lp.make_kernel(
        "{[i,j]: 0<=i<n and 0<=j<=i}",
        "out[i], <>_ = reduce(segmented(sum), j, arr[j], segflag[j])",
        [
            lp.GlobalArg("arr", np.float32, shape=("n",)),
            lp.GlobalArg("segflag", np.int32, shape=("n",)),
            "...",
        ])

    kernel = lp.fix_parameters(kernel, n=n)
    kernel = lp.tag_inames(kernel, {"i": iname_tag})
    kernel = lp.realize_reduction(kernel, force_scan=True)

    _, (out,) = kernel(queue, arr=arr, segflag=flags)

    check_segmented_scan_output(arr, segment_boundaries_indices, out)
def test_global_parallel_reduction(ctx_factory, size):
    """Check a two-stage global reduction of ``a`` against the plain kernel."""
    from loopy.transform.data import reduction_arg_to_subst_rule

    context = ctx_factory()

    reference = lp.make_kernel(
        "{[i]: 0 <= i < n }",
        """
        # Using z[0] instead of z works around a bug in ancient PyOpenCL.
        z[0] = sum(i, a[i])
        """)
    reference = lp.add_and_infer_dtypes(reference, {"a": np.float32})

    group_size = 128
    kernel = lp.split_iname(reference, "i", group_size * 20)
    kernel = lp.split_iname(kernel, "i_inner", group_size, inner_tag="l.0")
    kernel = lp.split_reduction_outward(kernel, "i_outer")
    kernel = lp.split_reduction_inward(kernel, "i_inner_outer")

    kernel = reduction_arg_to_subst_rule(kernel, "i_outer")
    kernel = lp.precompute(
        kernel, "red_i_outer_arg", "i_outer",
        temporary_scope=lp.temp_var_scope.GLOBAL,
        default_tag="l.auto")
    kernel = lp.realize_reduction(kernel)
    kernel = lp.tag_inames(kernel, "i_outer_0:g.0")

    # Keep the i_outer accumulator on the correct (lower) side of the barrier,
    # otherwise there will be useless save/reload code generated.
    kernel = lp.add_dependency(
        kernel, "writes:acc_i_outer", "id:red_i_outer_arg_barrier")

    lp.auto_test_vs_ref(
        reference, context, kernel,
        parameters={"n": size},
        print_ref_code=True)