def get_kernels(self, key_dtype, value_dtype, starts_dtype):
    """Build the kernels used to sort values by key and locate key groups.

    :arg key_dtype: dtype of the key array given to the sorter and to the
        start finder; also used for the scalar ``nkeys`` of the scan.
    :arg value_dtype: dtype of the value array sorted alongside the keys.
    :arg starts_dtype: dtype of the array of group starts.
    :returns: a ``_KernelInfo`` with the attributes *by_target_sorter*,
        *start_finder* and *bound_propagation_scan*.
    """
    from pyopencl.algorithm import RadixSort
    from pyopencl.tools import VectorArg, ScalarArg

    by_target_sorter = RadixSort(
            self.context, [
                VectorArg(value_dtype, "values"),
                VectorArg(key_dtype, "keys"),
                ],
            key_expr="keys[i]",
            sort_arg_names=["values", "keys"])

    from pyopencl.elementwise import ElementwiseTemplate
    start_finder = ElementwiseTemplate(
            arguments="""//CL//
            starts_t *key_group_starts,
            key_t *keys_sorted_by_key,
            """,

            operation=r"""//CL//
            key_t my_key = keys_sorted_by_key[i];

            if (i == 0 || my_key != keys_sorted_by_key[i-1])
                key_group_starts[my_key] = i;
            """,
            name="find_starts").build(
                    self.context,
                    type_aliases=(
                        # key_t is the element type of the key array, so it
                        # has to follow key_dtype; aliasing it to
                        # starts_dtype misreads the keys whenever the two
                        # dtypes differ.
                        ("key_t", key_dtype),
                        ("starts_t", starts_dtype),
                        ),
                    var_values=())

    from pyopencl.scan import GenericScanKernel
    bound_propagation_scan = GenericScanKernel(
            self.context, starts_dtype,
            arguments=[
                VectorArg(starts_dtype, "starts"),
                # starts has length n+1
                ScalarArg(key_dtype, "nkeys"),
                ],
            # The scan walks "starts" back to front, taking a running minimum.
            input_expr="starts[nkeys-i]",
            scan_expr="min(a, b)",
            neutral=_make_cl_int_literal(
                np.iinfo(starts_dtype).max, starts_dtype),
            output_statement="starts[nkeys-i] = item;")

    return _KernelInfo(
            by_target_sorter=by_target_sorter,
            start_finder=start_finder,
            bound_propagation_scan=bound_propagation_scan)
def get_array_binop_kernel(context, operator, dtype_res, dtype_a, dtype_b,
        a_is_scalar=False, b_is_scalar=False):
    """Return an elementwise kernel computing ``out[i] = a <operator> b``.

    An operand flagged as scalar is read from index 0 of its array instead
    of index ``i``.
    """
    def operand(array_name, is_scalar):
        index = "0" if is_scalar else "i"
        return "{}[{}]".format(array_name, index)

    lhs = operand("a", a_is_scalar)
    rhs = operand("b", b_is_scalar)

    kernel_args = [
        VectorArg(arg_dtype, arg_name, with_offset=True)
        for arg_dtype, arg_name in (
            (dtype_res, "out"),
            (dtype_a, "a"),
            (dtype_b, "b"),
        )
    ]
    operation = "out[i] = {} {} {}".format(lhs, operator, rhs)

    return get_elwise_kernel(
            context, kernel_args, operation, name="binop_kernel")
def get_binary_func_kernel(context, func_name, x_dtype, y_dtype, out_dtype,
        preamble="", name=None):
    """Return an elementwise kernel computing ``z[i] = func_name(x[i], y[i])``.

    The kernel is called ``<func_name>_kernel`` unless *name* is given.
    """
    operation = "z[i] = %s(x[i], y[i])" % func_name

    if name is None:
        kernel_name = "%s_kernel" % func_name
    else:
        kernel_name = name

    kernel_args = [
        VectorArg(out_dtype, "z", with_offset=True),
        VectorArg(x_dtype, "x", with_offset=True),
        VectorArg(y_dtype, "y", with_offset=True),
    ]
    return get_elwise_kernel(
            context, kernel_args, operation,
            name=kernel_name, preamble=preamble)
def test_list_builder_with_offset(ctx_factory):
    """Check that ListOfListsBuilder accepts an array argument with an offset."""
    import pytest
    pytest.importorskip("mako")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    from pyopencl.algorithm import ListOfListsBuilder
    from pyopencl.tools import VectorArg

    generate_src = """//CL//
        void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
        {
            APPEND_mylist(input_list[i]);
        }
        """
    builder = ListOfListsBuilder(
            context,
            [("mylist", np.int32)],
            generate_src,
            arg_decls=[VectorArg(float, "input_list", with_offset=True)])

    n = 10000
    offset = 10

    # Only the part behind the offset holds ones; the leading entries stay 0.
    input_list = cl.array.zeros(queue, (n + offset,), float)
    input_list[offset:] = 1

    result, evt = builder(queue, n, input_list[offset:])

    inf = result["mylist"]
    assert inf.count == n
    assert (inf.lists.get() == 1).all()
def get_linear_combination_kernel(summand_descriptors, dtype_z):
    """Not ported yet: always raises :class:`NotImplementedError`.

    Everything after the ``raise`` is unreachable and is only kept as
    reference material for the pending port.
    """
    # TODO: Port this!
    raise NotImplementedError

    from pyopencl.tools import dtype_to_ctype
    from pyopencl.elementwise import \
            VectorArg, ScalarArg, get_elwise_module

    args = []
    preamble = []
    loop_prep = []
    summands = []
    tex_names = []

    for idx, descriptor in enumerate(summand_descriptors):
        is_gpu_scalar, scalar_dtype, vector_dtype = descriptor

        if is_gpu_scalar:
            preamble.append(
                    "texture <%s, 1, cudaReadModeElementType> tex_a%d;"
                    % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True),
                        idx))
            tex_names.append("tex_a%d" % idx)
            loop_prep.append(
                    "%s a%d = fp_tex1Dfetch(tex_a%d, 0)"
                    % (dtype_to_ctype(scalar_dtype), idx, idx))
        else:
            args.append(ScalarArg(scalar_dtype, "a%d" % idx))

        # Both kinds of summand contribute their vector argument.
        args.append(VectorArg(vector_dtype, "x%d" % idx, with_offset=True))
        summands.append("a%d*x%d[i]" % (idx, idx))

    args.append(VectorArg(dtype_z, "z", with_offset=True))
    args.append(ScalarArg(np.uintp, "n"))

    operation = "z[i] = " + " + ".join(summands)
    mod = get_elwise_module(
            args, operation, "linear_combination",
            preamble="\n".join(preamble),
            loop_prep=";\n".join(loop_prep))

    func = mod.get_function("linear_combination")
    tex_src = [mod.get_texref(tex_name) for tex_name in tex_names]
    arg_format = "".join(arg.struct_char for arg in args)
    func.prepare(arg_format, (1, 1, 1), texrefs=tex_src)

    return func, tex_src
def get_bessel_kernel(context, which_func, out_dtype=np.float64,
        order_dtype=np.int32, x_dtype=np.float64):
    """Return an elementwise kernel computing
    ``z[i] = bessel_<which_func>n(ord_n, x[i])``.

    *which_func* is spliced into the function name, the kernel name and the
    name of the included ``pyopencl-bessel-<which_func>.cl`` source.
    """
    kernel_args = [
        VectorArg(out_dtype, "z", with_offset=True),
        ScalarArg(order_dtype, "ord_n"),
        VectorArg(x_dtype, "x", with_offset=True),
    ]
    operation = "z[i] = bessel_%sn(ord_n, x[i])" % which_func
    kernel_name = "bessel_%sn_kernel" % which_func
    preamble = """
        #pragma OPENCL EXTENSION cl_khr_fp64: enable
        #define PYOPENCL_DEFINE_CDOUBLE
        #include <pyopencl-bessel-%s.cl>
        """ % which_func

    return get_elwise_kernel(
            context, kernel_args, operation,
            name=kernel_name, preamble=preamble)
def get_all_kernel(ctx, dtype_in):
    """Return a reduction kernel that combines ``(bool) in[i]`` with ``&&``.

    The reduction starts from ``true`` and produces an ``int8`` result.
    """
    from pyopencl.tools import VectorArg

    input_args = [VectorArg(dtype_in, "in")]
    return ReductionKernel(
            ctx, np.int8, "true", "a && b",
            map_expr="(bool) (in[i])",
            arguments=input_args)
def get_bessel_kernel(context, which_func, out_dtype=np.float64,
        order_dtype=np.int32, x_dtype=np.float64):
    """Return an elementwise kernel evaluating a Bessel function of order
    ``ord_n`` at ``x[i]`` and storing the result in ``z[i]``.

    :arg which_func: selects the function; spliced into
        ``bessel_<which_func>n`` for non-complex arguments. Complex
        arguments are only supported for ``"j"``.
    :arg out_dtype: dtype (or scalar type) of the output array.
    :arg order_dtype: dtype of the scalar order argument.
    :arg x_dtype: dtype (or scalar type) of the input array.
    :raises NotImplementedError: for complex *x_dtype* when *which_func*
        is not ``"j"``, when *x_dtype* is not ``complex128``, or when
        *out_dtype* differs from *x_dtype*.
    """
    # The defaults are NumPy scalar types, which have no ``.kind``; convert
    # to dtype instances so defaults and explicit dtypes behave alike.
    x_dtype = np.dtype(x_dtype)
    out_dtype = np.dtype(out_dtype)

    kernel_args = [
        VectorArg(out_dtype, "z", with_offset=True),
        ScalarArg(order_dtype, "ord_n"),
        VectorArg(x_dtype, "x", with_offset=True),
    ]

    if x_dtype.kind != "c":
        return get_elwise_kernel(
                context, kernel_args,
                "z[i] = bessel_%sn(ord_n, x[i])" % which_func,
                name="bessel_%sn_kernel" % which_func,
                preamble="""
                #if __OPENCL_C_VERSION__ < 120
                #pragma OPENCL EXTENSION cl_khr_fp64: enable
                #endif
                #define PYOPENCL_DEFINE_CDOUBLE
                #include <pyopencl-bessel-%s.cl>
                """ % which_func)

    if which_func != "j":
        raise NotImplementedError("complex arguments for Bessel Y")

    if x_dtype != np.complex128:
        raise NotImplementedError("non-complex double dtype")

    if x_dtype != out_dtype:
        raise NotImplementedError("different input/output types")

    # bessel_j_complex writes two values through its output pointers; only
    # the first one (jv_loc) is stored in z.
    return get_elwise_kernel(
            context, kernel_args,
            """
            cdouble_t jv_loc;
            cdouble_t jvp1_loc;
            bessel_j_complex(ord_n, x[i], &jv_loc, &jvp1_loc);
            z[i] = jv_loc;
            """,
            name="bessel_j_complex_kernel",
            preamble="""
            #if __OPENCL_C_VERSION__ < 120
            #pragma OPENCL EXTENSION cl_khr_fp64: enable
            #endif
            #define PYOPENCL_DEFINE_CDOUBLE
            #include <pyopencl-complex.h>
            #include <pyopencl-bessel-j-complex.cl>
            """)
def get_take_kernel(context, dtype, idx_dtype, vec_count=1):
    """Return an elementwise kernel computing ``destK[i] = srcK[idx[i]]``
    for each of the *vec_count* source/destination array pairs.
    """
    ctx = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }

    vec_numbers = range(vec_count)

    args = []
    for prefix in ("dest", "src"):
        args.extend(
            VectorArg(dtype, prefix + str(k), with_offset=True)
            for k in vec_numbers)
    args.append(VectorArg(idx_dtype, "idx", with_offset=True))

    index_load = "%(idx_tp)s src_idx = idx[i];\n" % ctx
    copies = "\n".join(
        "dest%d[i] = src%d[src_idx];" % (k, k) for k in vec_numbers)

    return get_elwise_kernel(context, args, index_load + copies, name="take")
def get_count_kernel(self, index_dtype):
    """Render, build and return the kernel for the counting stage.

    The list-builder template is rendered with ``is_count_stage=True``; the
    kernel is named ``<name_prefix>_count``.
    """
    from pyopencl.tools import VectorArg, OtherArg

    index_ctype = dtype_to_ctype(index_dtype)

    # Lists named in count_sharing get no counter of their own.
    counted_names = [
        name for name, dtype in self.list_names_and_dtypes
        if name not in self.count_sharing]

    kernel_list_args = [
        VectorArg(index_dtype, "plb_%s_count" % name)
        for name in counted_names]

    user_list_args = []
    for name in counted_names:
        local_count_name = "plb_loc_%s_count" % name
        user_list_args.append(
            OtherArg("%s *%s" % (index_ctype, local_count_name),
                local_count_name))

    kernel_name = self.name_prefix+"_count"

    from pyopencl.characterize import has_double_support
    render_args = dict(
        is_count_stage=True,
        kernel_name=kernel_name,
        double_support=all(
            has_double_support(dev) for dev in self.context.devices),
        debug=self.debug,
        do_not_vectorize=self.do_not_vectorize(),
        eliminate_empty_output_lists=self.eliminate_empty_output_lists,

        kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
        kernel_list_arg_values=_get_arg_list(user_list_args, prefix="&"),
        user_list_arg_decl=_get_arg_decl(user_list_args),
        user_list_args=_get_arg_list(user_list_args),
        user_arg_decl_with_offset=_get_arg_decl(self.arg_decls),
        user_arg_decl_no_offset=_get_arg_decl(self.arg_decls_no_offset),
        user_args_no_offset=_get_arg_list(self.arg_decls_no_offset),
        arg_offset_adjustment=get_arg_offset_adjuster_code(self.arg_decls),

        list_names_and_dtypes=self.list_names_and_dtypes,
        count_sharing=self.count_sharing,
        name_prefix=self.name_prefix,
        generate_template=self.generate_template,
        preamble=self.preamble,

        index_type=index_ctype,
    )
    source = str(_LIST_BUILDER_TEMPLATE.render(**render_args))

    program = cl.Program(self.context, source).build(self.options)
    kernel = getattr(program, kernel_name)

    from pyopencl.tools import get_arg_list_scalar_arg_dtypes
    scalar_arg_dtypes = get_arg_list_scalar_arg_dtypes(
        kernel_list_args+self.arg_decls)
    # One more index_dtype scalar follows the declared arguments.
    kernel.set_scalar_arg_dtypes(scalar_arg_dtypes + [index_dtype])

    return kernel
def get_reduction_kernel(stage, ctx, dtype_out, neutral, reduce_expr,
        arguments=None, name="reduce_kernel", preamble="", map_exprs=None,
        device=None, options=None, max_group_size=None):
    """Generate and build one stage of a multi-expression reduction.

    :arg stage: 1 or 2. In stage 1, *arguments* are parsed with offsets and
        offset-adjusting code is generated. In stage 2, one
        ``pyopencl_reduction_inp_<i>`` vector argument of *dtype_out* per map
        expression is prepended to the parsed *arguments*.
    :arg map_exprs: a sequence of map expressions. ``None`` entries are
        replaced by ``pyopencl_reduction_inp_<i>[i]`` in stage 2 and by
        ``in[i]`` otherwise. The sequence passed in is not modified.
    :arg options: build options handed to ``Program.build``; ``None`` means
        no options.
    :returns: the object produced by ``_get_reduction_source``, extended
        with the attributes *program*, *kernel* and *arg_types*.
    :raises ValueError: if *map_exprs* is ``None``.
    """
    if map_exprs is None:
        raise ValueError("map_exprs has to be given!")

    if options is None:
        options = []

    # Fill in the defaults on a copy so the caller's list stays untouched.
    def default_map_expr(i):
        if stage == 2:
            return "pyopencl_reduction_inp_%i[i]" % i
        return "in[i]"

    map_exprs = [
        default_map_expr(i) if m is None else m
        for i, m in enumerate(map_exprs)]

    from pyopencl.tools import (
            parse_arg_list, get_arg_list_scalar_arg_dtypes,
            get_arg_offset_adjuster_code, VectorArg)

    arg_prep = ""
    if stage == 1 and arguments is not None:
        arguments = parse_arg_list(arguments, with_offset=True)
        arg_prep = get_arg_offset_adjuster_code(arguments)

    if stage == 2 and arguments is not None:
        arguments = parse_arg_list(arguments)
        arguments = (
                [VectorArg(dtype_out, "pyopencl_reduction_inp_%i" % i)
                    for i in range(len(map_exprs))]
                + arguments)

    inf = _get_reduction_source(
            ctx, dtype_to_ctype(dtype_out), dtype_out.itemsize,
            neutral, reduce_expr, map_exprs, arguments,
            name, preamble, arg_prep, device, max_group_size)

    inf.program = cl.Program(ctx, inf.source)
    inf.program.build(options)
    inf.kernel = getattr(inf.program, name)

    inf.arg_types = arguments

    inf.kernel.set_scalar_arg_dtypes(
            [None] * len(map_exprs)
            + [np.int64]
            + get_arg_list_scalar_arg_dtypes(inf.arg_types)
            + [np.uint32] * 2)

    return inf
def get_float_binary_func_kernel(context, func_name, x_dtype, y_dtype,
        out_dtype, preamble="", name=None):
    """Return an elementwise kernel computing
    ``z[i] = func_name((T)x[i], (T)y[i])``.

    ``T`` is ``double`` when the product of an *x_dtype* and a *y_dtype*
    value occupies more than 4 bytes, and ``float`` otherwise. In the
    ``double`` case an fp64-enabling header is put in front of *preamble*.
    """
    product_itemsize = (np.array(0, x_dtype) * np.array(0, y_dtype)).itemsize
    use_double = product_itemsize > 4

    arg_type = 'double' if use_double else 'float'
    if use_double:
        preamble = """
        #if __OPENCL_C_VERSION__ < 120
        #pragma OPENCL EXTENSION cl_khr_fp64: enable
        #endif
        #define PYOPENCL_DEFINE_CDOUBLE
        """ + preamble

    kernel_args = [
        VectorArg(out_dtype, "z", with_offset=True),
        VectorArg(x_dtype, "x", with_offset=True),
        VectorArg(y_dtype, "y", with_offset=True),
    ]
    operation = "z[i] = %s((%s)x[i], (%s)y[i])" % (
        func_name, arg_type, arg_type)

    if name is None:
        kernel_name = "%s_kernel" % func_name
    else:
        kernel_name = name

    return get_elwise_kernel(
            context, kernel_args, operation,
            name=kernel_name, preamble=preamble)
def get_put_kernel(context, dtype, idx_dtype, vec_count=1):
    """Return an elementwise kernel computing
    ``destK[gmem_dest_idx[i]] = srcK[i]`` for each of the *vec_count*
    source/destination array pairs.
    """
    ctx = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }

    vec_numbers = range(vec_count)

    # Argument order: destinations, the index array, then the sources.
    args = [
        VectorArg(dtype, "dest%d" % k, with_offset=True)
        for k in vec_numbers]
    args.append(VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True))
    args.extend(
        VectorArg(dtype, "src%d" % k, with_offset=True)
        for k in vec_numbers)

    index_load = "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx
    copies = "\n".join(
        "dest%d[dest_idx] = src%d[i];" % (k, k) for k in vec_numbers)

    return get_elwise_kernel(context, args, index_load + copies, name="put")
def get_reduction_kernel(stage, ctx, dtype_out, neutral, reduce_expr,
        map_expr=None, arguments=None, name="reduce_kernel", preamble="",
        device=None, options=None, max_group_size=None):
    """Generate and build one stage of a reduction; return a
    ``_ReductionInfo``.

    Without *map_expr*, stage 2 maps ``pyopencl_reduction_inp[i]`` and any
    other stage maps ``in[i]``. *arguments* is mandatory and is parsed with
    offsets; in stage 2 a ``pyopencl_reduction_inp`` vector of *dtype_out*
    is put in front of it.

    :raises ValueError: if *arguments* is ``None``.
    """
    from pyopencl.tools import (
            parse_arg_list, get_arg_list_scalar_arg_dtypes,
            get_arg_offset_adjuster_code, VectorArg)

    if map_expr is None:
        map_expr = "pyopencl_reduction_inp[i]" if stage == 2 else "in[i]"

    if arguments is None:
        raise ValueError("arguments must not be None")

    arguments = parse_arg_list(arguments, with_offset=True)
    # The offset-adjusting code only covers the user-supplied arguments, so
    # it is generated before the stage-2 input is prepended.
    arg_prep = get_arg_offset_adjuster_code(arguments)

    if stage == 2:
        stage2_input = VectorArg(dtype_out, "pyopencl_reduction_inp")
        arguments = [stage2_input] + arguments

    source, group_size = _get_reduction_source(
            ctx, dtype_to_ctype(dtype_out), dtype_out.itemsize,
            neutral, reduce_expr, map_expr, arguments,
            name, preamble, arg_prep, device, max_group_size)

    program = cl.Program(ctx, source)
    program.build(options)
    kernel = getattr(program, name)

    scalar_arg_dtypes = [None, np.int64]
    scalar_arg_dtypes += get_arg_list_scalar_arg_dtypes(arguments)
    scalar_arg_dtypes += [np.int64] * 3
    scalar_arg_dtypes += [np.uint32, np.int64]
    kernel.set_scalar_arg_dtypes(scalar_arg_dtypes)

    return _ReductionInfo(
            context=ctx,
            source=source,
            group_size=group_size,
            program=program,
            kernel=kernel,
            arg_types=arguments)
def get_balls_to_leaves_kernel(self, dimensions, coord_dtype, box_id_dtype,
        ball_id_dtype, max_levels, stick_out_factor):
    """Render the balls-to-leaves template and wrap it in a
    ``ListOfListsBuilder`` producing the lists ``ball_numbers`` and
    ``overlapping_leaves``.
    """
    from pyopencl.tools import dtype_to_ctype
    from boxtree import box_flags_enum

    template_context = {
        "dimensions": dimensions,
        "dtype_to_ctype": dtype_to_ctype,
        "box_id_dtype": box_id_dtype,
        "particle_id_dtype": None,
        "ball_id_dtype": ball_id_dtype,
        "coord_dtype": coord_dtype,
        "vec_types": cl.array.vec.types,
        "max_levels": max_levels,
        "AXIS_NAMES": AXIS_NAMES,
        "box_flags_enum": box_flags_enum,
        "debug": False,
        "stick_out_factor": stick_out_factor,
    }

    logger.info("start building leaves-to-balls lookup kernel")

    from boxtree.traversal import TRAVERSAL_PREAMBLE_TEMPLATE
    template = Template(
        TRAVERSAL_PREAMBLE_TEMPLATE + BALLS_TO_LEAVES_TEMPLATE,
        strict_undefined=True)
    rendered = template.render(**template_context)

    from pyopencl.tools import VectorArg, ScalarArg
    from pyopencl.algorithm import ListOfListsBuilder

    tree_args = [
        VectorArg(box_flags_enum.dtype, "box_flags"),
        VectorArg(coord_dtype, "box_centers"),
        VectorArg(box_id_dtype, "box_child_ids"),
        VectorArg(np.uint8, "box_levels"),
        ScalarArg(coord_dtype, "root_extent"),
        ScalarArg(box_id_dtype, "aligned_nboxes"),
        VectorArg(coord_dtype, "ball_radii"),
    ]
    # One ball-center coordinate array per axis in use.
    ball_center_args = [
        VectorArg(coord_dtype, "ball_" + ax)
        for ax in AXIS_NAMES[:dimensions]]

    builder = ListOfListsBuilder(
        self.context,
        [
            ("ball_numbers", ball_id_dtype),
            ("overlapping_leaves", box_id_dtype),
        ],
        str(rendered),
        arg_decls=tree_args + ball_center_args,
        name_prefix="circles_to_balls",
        count_sharing={
            # /!\ This makes a promise that APPEND_ball_numbers will
            # always occur *before* APPEND_overlapping_leaves.
            "overlapping_leaves": "ball_numbers"
        },
        complex_kernel=True)

    logger.info("done building leaves-to-balls lookup kernel")

    return builder
def _get_kernel(self, dtype, src_index_dtype, map_values=False):
    """Render the gappy-copy template and return it as an
    ``ElementwiseKernel`` named ``gappy_copy_and_map``.

    A ``value_map`` array argument is added when *map_values* is true.
    """
    from pyopencl.tools import VectorArg, dtype_to_ctype
    from pyopencl.elementwise import ElementwiseKernel

    arg_specs = [
        (dtype, "input_ary"),
        (dtype, "output_ary"),
        (src_index_dtype, "from_indices"),
    ]
    if map_values:
        arg_specs.append((dtype, "value_map"))

    kernel_args = [
        VectorArg(arg_dtype, arg_name, with_offset=True)
        for arg_dtype, arg_name in arg_specs]

    rendered = GAPPY_COPY_TPL.render(
        dtype=dtype,
        dtype_to_ctype=dtype_to_ctype,
        map_values=map_values)

    return ElementwiseKernel(
        self.context, kernel_args, str(rendered),
        name="gappy_copy_and_map")
def get_arange_kernel(context, dtype):
    """Return an elementwise kernel computing ``z[i] = start + i*step``.

    :arg dtype: dtype of the output array and of the scalars ``start`` and
        ``step``. Its ``kind`` selects between real and complex arithmetic.
    """
    if dtype.kind == "c":
        # Complex values must be combined through the per-type helper
        # functions; plain "+" and "*" between complex operands do not
        # perform complex arithmetic in the kernel language.
        root = complex_dtype_to_name(dtype)
        expr = "%s_add(start, %s_rmul(i, step))" % (root, root)
    else:
        expr = "start + (%s) i*step" % dtype_to_ctype(dtype)

    return get_elwise_kernel(context, [
        VectorArg(dtype, "z", with_offset=True),
        ScalarArg(dtype, "start"),
        ScalarArg(dtype, "step"),
        ],
        "z[i] = " + expr,
        name="arange")
def get_arange_kernel(context, dtype):
    """Return an elementwise kernel filling ``z[i]`` with ``start + i*step``.

    Complex dtypes use the ``<name>_add`` / ``<name>_rmul`` helpers; all
    other dtypes cast ``i`` to the C type of *dtype* first.
    """
    if dtype.kind != "c":
        rhs = "start + ((%s) i)*step" % dtype_to_ctype(dtype)
    else:
        rhs = "{0}_add(start, {0}_rmul(i, step))".format(
            complex_dtype_to_name(dtype))

    kernel_args = [
        VectorArg(dtype, "z", with_offset=True),
        ScalarArg(dtype, "start"),
        ScalarArg(dtype, "step"),
    ]
    return get_elwise_kernel(
        context, kernel_args, "z[i] = " + rhs, name="arange")
def get_filter_target_lists_in_user_order_kernel(self, particle_id_dtype,
        user_order_flags_dtype):
    """Return a ``ListOfListsBuilder`` for the list ``filt_tgt_list``.

    For index ``i`` the generated code walks ``box_target_counts_nonchild[i]``
    entries of ``user_target_ids`` starting at ``box_target_starts[i]`` and
    appends every id whose ``user_order_flags`` entry is nonzero.
    """
    from pyopencl.tools import VectorArg, dtype_to_ctype
    from pyopencl.algorithm import ListOfListsBuilder
    from mako.template import Template

    generate_template = Template("""//CL//
        typedef ${dtype_to_ctype(particle_id_dtype)} particle_id_t;

        void generate(LIST_ARG_DECL USER_ARG_DECL index_type i)
        {
            particle_id_t b_t_start = box_target_starts[i];
            particle_id_t b_t_count = box_target_counts_nonchild[i];

            for (particle_id_t j = b_t_start; j < b_t_start+b_t_count; ++j)
            {
                particle_id_t user_target_id = user_target_ids[j];
                if (user_order_flags[user_target_id])
                {
                    APPEND_filt_tgt_list(user_target_id);
                }
            }
        }
        """, strict_undefined=True)

    generate_src = generate_template.render(
        dtype_to_ctype=dtype_to_ctype,
        particle_id_dtype=particle_id_dtype)

    arg_decls = [VectorArg(user_order_flags_dtype, "user_order_flags")]
    arg_decls.extend(
        VectorArg(particle_id_dtype, array_name)
        for array_name in (
            "user_target_ids",
            "box_target_starts",
            "box_target_counts_nonchild",
        ))

    return ListOfListsBuilder(
        self.context,
        [("filt_tgt_list", particle_id_dtype)],
        generate_src,
        arg_decls=arg_decls)
def get_take_put_kernel(context, dtype, idx_dtype, with_offsets, vec_count=1):
    """Return an elementwise kernel computing
    ``destK[gmem_dest_idx[i]] = srcK[gmem_src_idx[i]]`` for each of the
    *vec_count* array pairs.

    With *with_offsets*, a scalar ``offsetK`` per pair is added to the
    source index.
    """
    ctx = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }

    vec_numbers = range(vec_count)

    # Destination arrays are declared without an offset, unlike the rest.
    args = [VectorArg(dtype, "dest%d" % k) for k in vec_numbers]
    args += [
        VectorArg(idx_dtype, "gmem_dest_idx", with_offset=True),
        VectorArg(idx_dtype, "gmem_src_idx", with_offset=True),
    ]
    args += [
        VectorArg(dtype, "src%d" % k, with_offset=True)
        for k in vec_numbers]
    if with_offsets:
        args += [ScalarArg(idx_dtype, "offset%d" % k) for k in vec_numbers]

    def get_copy_insn(k):
        if with_offsets:
            return "dest%d[dest_idx] = src%d[src_idx+offset%d];" % (k, k, k)
        return "dest%d[dest_idx] = src%d[src_idx];" % (k, k)

    index_loads = (
        "%(idx_tp)s src_idx = gmem_src_idx[i];\n"
        "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % ctx)
    body = index_loads + "\n".join(get_copy_insn(k) for k in vec_numbers)

    return get_elwise_kernel(
        context, args, body,
        preamble=dtype_to_c_struct(context.devices[0], dtype),
        name="take_put")
def _get_kernel(self, dtype, src_index_dtype, dst_index_dtype,
        have_src_indices, have_dst_indices, map_values):
    """Render the gappy-copy template and return it as an
    ``ElementwiseKernel`` named ``gappy_copy_and_map``.

    ``from_indices``, ``to_indices`` and ``value_map`` array arguments are
    added only when the corresponding flag is true.
    """
    from pyopencl.tools import VectorArg, dtype_to_ctype
    from pyopencl.elementwise import ElementwiseKernel

    arg_specs = [(dtype, "input_ary"), (dtype, "output_ary")]
    if have_src_indices:
        arg_specs.append((src_index_dtype, "from_indices"))
    if have_dst_indices:
        arg_specs.append((dst_index_dtype, "to_indices"))
    if map_values:
        arg_specs.append((dtype, "value_map"))

    kernel_args = [
        VectorArg(arg_dtype, arg_name, with_offset=True)
        for arg_dtype, arg_name in arg_specs]

    rendered = GAPPY_COPY_TPL.render(
        dtype=dtype,
        dtype_to_ctype=dtype_to_ctype,
        from_dtype=src_index_dtype,
        to_dtype=dst_index_dtype,
        from_indices=have_src_indices,
        to_indices=have_dst_indices,
        map_values=map_values)

    return ElementwiseKernel(
        self.context, kernel_args, str(rendered),
        preamble=dtype_to_c_struct(self.context.devices[0], dtype),
        name="gappy_copy_and_map")
def get_reduction_kernel(stage, ctx, dtype_out, neutral, reduce_expr,
        map_expr=None, arguments=None, name="reduce_kernel", preamble="",
        device=None, options=[], max_group_size=None):
    """Generate and build one stage of a reduction.

    Without *map_expr*, stage 2 maps ``pyopencl_reduction_inp[i]`` and any
    other stage maps ``in[i]``. Stage 1 parses *arguments* with offsets and
    generates offset-adjusting code; stage 2 parses them without offsets and
    prepends a ``pyopencl_reduction_inp`` vector of *dtype_out*.

    :returns: the object produced by ``_get_reduction_source``, extended
        with the attributes *program*, *kernel* and *arg_types*.
    """
    from pyopencl.tools import (
            parse_arg_list, get_arg_list_scalar_arg_dtypes,
            get_arg_offset_adjuster_code, VectorArg)

    if map_expr is None:
        map_expr = "pyopencl_reduction_inp[i]" if stage == 2 else "in[i]"

    arg_prep = ""
    if arguments is not None:
        if stage == 1:
            arguments = parse_arg_list(arguments, with_offset=True)
            arg_prep = get_arg_offset_adjuster_code(arguments)
        elif stage == 2:
            stage2_input = VectorArg(dtype_out, "pyopencl_reduction_inp")
            arguments = [stage2_input] + parse_arg_list(arguments)

    inf = _get_reduction_source(
            ctx, dtype_to_ctype(dtype_out), dtype_out.itemsize,
            neutral, reduce_expr, map_expr, arguments,
            name, preamble, arg_prep, device, max_group_size)

    program = cl.Program(ctx, inf.source)
    inf.program = program
    program.build(options)
    inf.kernel = getattr(program, name)
    inf.arg_types = arguments

    scalar_arg_dtypes = [None, np.int64]
    scalar_arg_dtypes += get_arg_list_scalar_arg_dtypes(inf.arg_types)
    scalar_arg_dtypes += [np.int64] * 3
    scalar_arg_dtypes += [np.uint32, np.int64]
    inf.kernel.set_scalar_arg_dtypes(scalar_arg_dtypes)

    return inf
def extract_extra_args_types_values(extra_args):
    """Split ``(name, value)`` pairs into argument declarations and values.

    :arg extra_args: an iterable of ``(name, value)`` pairs. Each value must
        be a ``cl.array.Array`` (declared as a ``VectorArg`` without offset)
        or a NumPy scalar (declared as a ``ScalarArg``).
    :returns: a tuple ``(types, values)`` where *types* is a tuple of
        argument declarations and *values* is the list of the values, in
        input order.
    :raises RuntimeError: if a value is of neither kind.
    """
    from pyopencl.tools import VectorArg, ScalarArg

    extra_args_types = []
    extra_args_values = []
    for name, val in extra_args:
        if isinstance(val, cl.array.Array):
            extra_args_types.append(
                VectorArg(val.dtype, name, with_offset=False))
        elif isinstance(val, np.generic):
            extra_args_types.append(ScalarArg(val.dtype, name))
        else:
            # name is a string: "%d" would raise TypeError here and hide
            # the intended RuntimeError.
            raise RuntimeError("argument '%s' not understood" % (name,))

        extra_args_values.append(val)

    return tuple(extra_args_types), extra_args_values
def get_kernel_info(self, dimensions, particle_id_dtype, box_id_dtype,
        coord_dtype, box_level_dtype, max_levels, sources_are_targets,
        sources_have_extent, targets_have_extent, stick_out_factor):
    """Build all traversal kernels and return them as a ``_KernelInfo``.

    The result holds the ``sources_parents_and_targets_builder``, the
    ``level_start_box_nrs_extractor`` and one ``<list_name>_builder`` for
    each of ``colleagues``, ``neighbor_source_boxes``, ``sep_siblings``,
    ``sep_smaller`` and ``sep_bigger``.
    """
    logging.info("building traversal build kernels")

    debug = False

    from pyopencl.tools import dtype_to_ctype
    from boxtree.tree import box_flags_enum

    render_vars = {
        "dimensions": dimensions,
        "dtype_to_ctype": dtype_to_ctype,
        "particle_id_dtype": particle_id_dtype,
        "box_id_dtype": box_id_dtype,
        "box_flags_enum": box_flags_enum,
        "coord_dtype": coord_dtype,
        "vec_types": cl.array.vec.types,
        "max_levels": max_levels,
        "AXIS_NAMES": AXIS_NAMES,
        "debug": debug,
        "sources_are_targets": sources_are_targets,
        "sources_have_extent": sources_have_extent,
        "targets_have_extent": targets_have_extent,
        "stick_out_factor": stick_out_factor,
    }

    from pyopencl.algorithm import ListOfListsBuilder
    from pyopencl.tools import VectorArg, ScalarArg

    def render(*template_parts):
        # Concatenate the template pieces and expand them with render_vars.
        return Template(
            "".join(template_parts),
            strict_undefined=True).render(**render_vars)

    def box_id_vec(arg_name):
        return VectorArg(box_id_dtype, arg_name)

    result = {}

    # {{{ source boxes, their parents, target boxes

    src = render(
        TRAVERSAL_PREAMBLE_TEMPLATE, SOURCES_PARENTS_AND_TARGETS_TEMPLATE)

    box_lists = [
        ("source_parent_boxes", box_id_dtype),
        ("source_boxes", box_id_dtype),
        ("target_or_target_parent_boxes", box_id_dtype),
    ]
    if not sources_are_targets:
        # A separate target box list is only built when targets differ.
        box_lists.append(("target_boxes", box_id_dtype))

    result["sources_parents_and_targets_builder"] = ListOfListsBuilder(
        self.context,
        box_lists,
        str(src),
        arg_decls=[VectorArg(box_flags_enum.dtype, "box_flags")],
        debug=debug,
        name_prefix="sources_parents_and_targets")

    result["level_start_box_nrs_extractor"] = \
        LEVEL_START_BOX_NR_EXTRACTOR_TEMPLATE.build(
            self.context,
            type_aliases=(
                ("box_id_t", box_id_dtype),
                ("box_level_t", box_level_dtype),
            ),
        )

    # }}}

    # {{{ build list N builders

    base_args = [
        VectorArg(coord_dtype, "box_centers"),
        ScalarArg(coord_dtype, "root_extent"),
        VectorArg(np.uint8, "box_levels"),
        ScalarArg(box_id_dtype, "aligned_nboxes"),
        box_id_vec("box_child_ids"),
        VectorArg(box_flags_enum.dtype, "box_flags"),
    ]

    any_extent = sources_have_extent or targets_have_extent

    # (list name, template, extra kernel arguments, extra output lists)
    list_specs = [
        ("colleagues", COLLEAGUES_TEMPLATE, [], []),
        ("neighbor_source_boxes", NEIGBHOR_SOURCE_BOXES_TEMPLATE,
            [
                box_id_vec("target_boxes"),
            ], []),
        ("sep_siblings", SEP_SIBLINGS_TEMPLATE,
            [
                box_id_vec("target_or_target_parent_boxes"),
                box_id_vec("box_parent_ids"),
                box_id_vec("colleagues_starts"),
                box_id_vec("colleagues_list"),
            ], []),
        ("sep_smaller", SEP_SMALLER_TEMPLATE,
            [
                box_id_vec("target_boxes"),
                box_id_vec("colleagues_starts"),
                box_id_vec("colleagues_list"),
            ],
            ["sep_close_smaller"] if any_extent else []),
        ("sep_bigger", SEP_BIGGER_TEMPLATE,
            [
                box_id_vec("target_or_target_parent_boxes"),
                box_id_vec("box_parent_ids"),
                box_id_vec("colleagues_starts"),
                box_id_vec("colleagues_list"),
            ],
            ["sep_close_bigger"] if any_extent else []),
    ]

    for list_name, template, extra_args, extra_lists in list_specs:
        src = render(
            TRAVERSAL_PREAMBLE_TEMPLATE, HELPER_FUNCTION_TEMPLATE, template)

        output_lists = [(list_name, box_id_dtype)]
        output_lists += [
            (extra_list_name, box_id_dtype)
            for extra_list_name in extra_lists]

        result[list_name+"_builder"] = ListOfListsBuilder(
            self.context,
            output_lists,
            str(src),
            arg_decls=base_args + extra_args,
            debug=debug,
            name_prefix=list_name,
            complex_kernel=True)

    # }}}

    logging.info("traversal build kernels built")

    return _KernelInfo(**result)
def initialize(cls):
    ''' Compile the program and create the two particle sort kernels '''
    cls.program = cl.Program(cl_ctx, F(cls.KERNEL)).build()

    particle_fields = ("x", "px", "y", "py", "theta", "gamma")

    def particle_args():
        # Fresh argument objects for each sorter.
        return [VectorArg(cl_ftype, field) for field in particle_fields]

    cls.longitudinal_sort_kernel = RadixSort(
        cl_ctx,
        particle_args() + [ScalarArg(cl_ftype, "inv_slice_len")],
        key_expr="(int) floor(theta[i]*inv_slice_len)",
        sort_arg_names=list(particle_fields),
        key_dtype=np.int32)

    class LongitudinalTraverseScanKernel(GenericScanKernel):
        ''' Scan kernel whose preamble additionally defines sort_fun '''

        def __init__(self, *argl, **argd):
            ''' Put the sort_fun source in front of argd['preamble'] '''
            sort_fun = '''
            int sort_fun(FLOAT_TYPE x, FLOAT_TYPE y, FLOAT_TYPE theta,
                         FLOAT_TYPE inv_slice_len,
                         FLOAT_TYPE inv_traverse_len, int bins)
            {
                FLOAT_TYPE xnorm = 0.5 + (inv_traverse_len*x);
                FLOAT_TYPE ynorm = 0.5 + (inv_traverse_len*y);
                int xbin = (int) floor(xnorm * inv_traverse_len);
                int ybin = (int) floor(ynorm * inv_traverse_len);
                int zbin = (int) floor(theta*inv_slice_len);
                if ((xbin < 0) || (xbin >= bins) ||
                    (ybin < 0) || (ybin >= bins))
                {
                    xbin = 0;
                    ybin = 0;
                }
                return xbin+bins*(ybin+bins*zbin);
            }
            '''
            # Work on a copy so the caller's keyword dict is left alone.
            patched = dict(argd)
            patched['preamble'] = F(sort_fun + patched['preamble'])
            super().__init__(*argl, **patched)

    traverse_scalars = [
        ScalarArg(cl_ftype, "inv_slice_len"),
        ScalarArg(cl_ftype, "inv_traverse_len"),
        ScalarArg(np.int32, "bins"),
    ]
    cls.longitudinal_traverse_sort_kernel = RadixSort(
        cl_ctx,
        particle_args() + traverse_scalars,
        key_expr="sort_fun(x[i],y[i],theta[i], inv_slice_len, inv_traverse_len, bins)",
        sort_arg_names=list(particle_fields),
        scan_kernel=LongitudinalTraverseScanKernel,
        key_dtype=np.int32)
def __init__(self, dtype, name):
    """Initialize via ``VectorArg.__init__``, always with ``with_offset=True``.

    *dtype* and *name* are forwarded unchanged.
    """
    VectorArg.__init__(self, dtype, name, with_offset=True)
def get_write_kernel(self, index_dtype):
    """Render, build and return the kernel for the list-writing stage.

    The list-builder template is rendered with ``is_count_stage=False``; the
    kernel is named ``<name_prefix>_write``.
    """
    from pyopencl.tools import VectorArg, OtherArg

    index_ctype = dtype_to_ctype(index_dtype)

    kernel_list_args = []
    user_list_args = []
    arg_value_parts = []

    for name, dtype in self.list_names_and_dtypes:
        list_name = "plb_%s_list" % name
        list_arg = VectorArg(dtype, list_name)

        kernel_list_args.append(list_arg)
        user_list_args.append(list_arg)

        if name in self.count_sharing:
            # A count-sharing list has no start index of its own.
            arg_value_parts.append("%s, " % list_name)
        else:
            kernel_list_args.append(
                VectorArg(index_dtype, "plb_%s_start_index" % name))

            index_name = "plb_%s_index" % name
            user_list_args.append(
                OtherArg("%s *%s" % (index_ctype, index_name), index_name))

            arg_value_parts.append("%s, &%s, " % (list_name, index_name))

    kernel_list_arg_values = "".join(arg_value_parts)

    kernel_name = self.name_prefix + "_write"

    from pyopencl.characterize import has_double_support
    render_args = dict(
        is_count_stage=False,
        kernel_name=kernel_name,
        double_support=all(
            has_double_support(dev) for dev in self.context.devices),
        debug=self.debug,
        do_not_vectorize=self.do_not_vectorize(),

        kernel_list_arg_decl=_get_arg_decl(kernel_list_args),
        kernel_list_arg_values=kernel_list_arg_values,
        user_list_arg_decl=_get_arg_decl(user_list_args),
        user_list_args=_get_arg_list(user_list_args),
        user_arg_decl=_get_arg_decl(self.arg_decls),
        user_args=_get_arg_list(self.arg_decls),

        list_names_and_dtypes=self.list_names_and_dtypes,
        count_sharing=self.count_sharing,
        name_prefix=self.name_prefix,
        generate_template=self.generate_template,
        preamble=self.preamble,

        index_type=index_ctype,
    )
    source = str(_LIST_BUILDER_TEMPLATE.render(**render_args))

    program = cl.Program(self.context, source).build(self.options)
    kernel = getattr(program, kernel_name)

    from pyopencl.tools import get_arg_list_scalar_arg_dtypes
    scalar_arg_dtypes = get_arg_list_scalar_arg_dtypes(
        kernel_list_args + self.arg_decls)
    # One more index_dtype scalar follows the declared arguments.
    kernel.set_scalar_arg_dtypes(scalar_arg_dtypes + [index_dtype])

    return kernel
def __init__(self, context, arguments, key_expr, sort_arg_names,
        bits_at_a_time=2, index_dtype=np.int32, key_dtype=np.uint32,
        options=[]):
    """Set up the scan kernel that performs one radix-sort pass.

    :arg context: the context the scan kernel is built in; its first
        device is used to construct the scan type.
    :arg arguments: A string of comma-separated C argument declarations
        (handed to :func:`pyopencl.tools.parse_arg_list`).
        All types used here must be known to PyOpenCL.
        (see :func:`pyopencl.tools.get_or_register_dtype`).
    :arg key_expr: An integer-valued C expression returning the
        key based on which the sort is performed. The array index
        for which the key is to be computed is available as `i`.
        The expression may refer to any of the *arguments*.
    :arg sort_arg_names: A list of argument names whose corresponding
        array arguments will be sorted according to *key_expr*.
    :arg bits_at_a_time: converted with ``int`` and stored as ``self.bits``;
        the generated count-selection expression nests this many levels.
    :arg index_dtype: dtype used for the index type of the generated code.
    :arg key_dtype: dtype used for the key type of the generated code.
    :arg options: passed on to the scan kernel as its *options*.
    """

    # {{{ arg processing

    from pyopencl.tools import parse_arg_list
    self.arguments = parse_arg_list(arguments)
    # From here on only the parsed self.arguments may be used.
    del arguments

    self.sort_arg_names = sort_arg_names
    self.bits = int(bits_at_a_time)
    self.index_dtype = np.dtype(index_dtype)
    self.key_dtype = np.dtype(key_dtype)

    self.options = options

    # }}}

    # {{{ kernel creation

    scan_ctype, scan_dtype, scan_t_cdecl = \
            _make_sort_scan_type(context.devices[0], self.bits, self.index_dtype)

    from pyopencl.tools import VectorArg, ScalarArg
    # Scan arguments: the user arguments, a "sorted_<name>" array for every
    # argument named in sort_arg_names, and the scalar "base_bit".
    scan_arguments = (list(self.arguments) + [
        VectorArg(arg.dtype, "sorted_" + arg.name)
        for arg in self.arguments
        if arg.name in sort_arg_names
        ] + [ScalarArg(np.int32, "base_bit")])

    def get_count_branch(known_bits):
        # Recursively builds a nested C conditional on "mnr" that ends in
        # one "s.c<bits>" member per complete bit pattern.
        if len(known_bits) == self.bits:
            return "s.c%s" % known_bits

        # Smallest value whose next undecided bit is 1.
        boundary_mnr = known_bits + "1" + (self.bits - len(known_bits) - 1) * "0"

        return ("((mnr < %s) ? %s : %s)" % (int(boundary_mnr, 2),
            get_count_branch(known_bits + "0"),
            get_count_branch(known_bits + "1")))

    codegen_args = dict(
            bits=self.bits,
            key_ctype=dtype_to_ctype(self.key_dtype),
            key_expr=key_expr,
            index_ctype=dtype_to_ctype(self.index_dtype),
            index_type_max=np.iinfo(self.index_dtype).max,
            padded_bin=_padded_bin,
            scan_ctype=scan_ctype,
            sort_arg_names=sort_arg_names,
            get_count_branch=get_count_branch,
            )

    preamble = scan_t_cdecl + RADIX_SORT_PREAMBLE_TPL.render(
            **codegen_args)
    scan_preamble = preamble \
            + RADIX_SORT_SCAN_PREAMBLE_TPL.render(**codegen_args)

    from pyopencl.scan import GenericScanKernel
    self.scan_kernel = GenericScanKernel(
            context, scan_dtype,
            arguments=scan_arguments,
            input_expr="scan_t_from_value(%s, base_bit, i)" % key_expr,
            scan_expr="scan_t_add(a, b, across_seg_boundary)",
            neutral="scan_t_neutral()",
            output_statement=RADIX_SORT_OUTPUT_STMT_TPL.render(**codegen_args),
            preamble=scan_preamble, options=self.options)

    # NOTE(review): there is no break, so despite its name this attribute
    # ends up holding the index of the LAST VectorArg; it is never set when
    # there is no VectorArg at all.
    for i, arg in enumerate(self.arguments):
        if isinstance(arg, VectorArg):
            self.first_array_arg_idx = i
def __init__(self, context, list_names_and_dtypes, generate_template,
        arg_decls, count_sharing=None, devices=None,
        name_prefix="plb_build_list", options=[], preamble="",
        debug=False, complex_kernel=False,
        eliminate_empty_output_lists=[]):
    """
    :arg context: A :class:`pyopencl.Context`.
    :arg list_names_and_dtypes: a list of `(name, dtype)` tuples
        indicating the lists to be built.
    :arg generate_template: a snippet of C as described below
    :arg arg_decls: A string of comma-separated C argument declarations.
    :arg count_sharing: A mapping consisting of `(child, mother)`
        indicating that `mother` and `child` will always have the
        same number of indices, and the `APPEND` to `mother`
        will always happen *before* the `APPEND` to the child.
    :arg devices: the devices to use; defaults to `context.devices`.
    :arg name_prefix: the name prefix to use for the compiled kernels
    :arg options: OpenCL compilation options for kernels using
        *generate_template*.
    :arg preamble: stored and handed to the kernel template.
    :arg debug: stored and handed to the kernel template.
    :arg complex_kernel: If `True`, prevents vectorization on CPUs.
    :arg eliminate_empty_output_lists: A Python list of list names
        for which the empty output lists are eliminated. `True` selects
        every list and `False` selects none.

    *generate_template* may use the following C macros/identifiers:

    * `index_type`: expands to C identifier for the index type used
      for the calculation
    * `USER_ARG_DECL`: expands to the C declarator for `arg_decls`
    * `USER_ARGS`: a list of C argument values corresponding to
      `user_arg_decl`
    * `LIST_ARG_DECL`: expands to a C argument list representing the
      data for the output lists. These are escaped prefixed with
      `"plb_"` so as to not interfere with user-provided names.
    * `LIST_ARGS`: a list of C argument values corresponding to
      `LIST_ARG_DECL`
    * `APPEND_name(entry)`: inserts `entry` into the list `name`.
      *entry* must be a valid C expression of the correct type.

    All argument-list related macros have a trailing comma included
    if they are non-empty.

    *generate_template* must supply a function:

    .. code-block:: c

        void generate(USER_ARG_DECL LIST_ARG_DECL index_type i)
        {
            APPEND_mylist(5);
        }

    Internally, the `kernel_template` is expanded (at least) twice. Once,
    for a 'counting' stage where the size of all the lists is determined,
    and a second time, for a 'generation' stage where the lists are
    actually filled. A `generate` function that has side effects beyond
    calling `append` is therefore ill-formed.

    :raises ValueError: if *eliminate_empty_output_lists* names a list that
        is not in *list_names_and_dtypes*.

    .. versionchanged:: 2018.1

        Change *eliminate_empty_output_lists* argument type from `bool` to
        `list`.
    """
    from pyopencl.tools import VectorArg, parse_arg_list

    self.context = context
    self.devices = context.devices if devices is None else devices

    self.list_names_and_dtypes = list_names_and_dtypes
    self.generate_template = generate_template

    self.arg_decls = parse_arg_list(arg_decls)

    # To match with the signature of the user-supplied generate(), arguments
    # can't appear to have offsets.
    self.arg_decls_no_offset = [
        VectorArg(arg.dtype, arg.name)
        if isinstance(arg, VectorArg) and arg.with_offset
        else arg
        for arg in self.arg_decls]

    self.count_sharing = {} if count_sharing is None else count_sharing

    self.name_prefix = name_prefix
    self.preamble = preamble
    self.options = options

    self.debug = debug

    self.complex_kernel = complex_kernel

    # Boolean values are still accepted and mapped to all lists / no lists.
    if eliminate_empty_output_lists is True:
        eliminate_empty_output_lists = [
            name for name, _ in self.list_names_and_dtypes]
    elif eliminate_empty_output_lists is False:
        eliminate_empty_output_lists = []

    self.eliminate_empty_output_lists = eliminate_empty_output_lists
    for list_name in self.eliminate_empty_output_lists:
        declared_names = [name for name, _ in self.list_names_and_dtypes]
        if list_name not in declared_names:
            raise ValueError(
                "invalid list name '%s' in eliminate_empty_output_lists"
                % list_name)