def add_watches(self, watches):
    """Add quantities that are printed after every time step."""

    from pytools import Record

    class WatchInfo(Record):
        pass

    for watch in watches:
        if isinstance(watch, tuple):
            display, expr = watch
        else:
            display = watch
            expr = watch

        parsed = self._parse_expr(expr)
        parsed, dep_data = self._get_expr_dep_data(parsed)

        from pytools import any
        self.have_nonlocal_watches = self.have_nonlocal_watches or \
                any(dd.nonlocal_agg for dd in dep_data)

        from pymbolic import compile
        compiled = compile(parsed, [dd.varname for dd in dep_data])

        watch_info = WatchInfo(display=display, parsed=parsed,
                dep_data=dep_data, compiled=compiled)

        self.watches.append(watch_info)
def __init__(self, ctx, dtype_out,
        neutral, reduce_expr, map_expr=None, arguments=None,
        name="reduce_kernel", options=[], preamble=""):
    dtype_out = self.dtype_out = np.dtype(dtype_out)

    max_group_size = None
    trip_count = 0

    while True:
        self.stage_1_inf = get_reduction_kernel(1, ctx,
                dtype_out,
                neutral, reduce_expr, map_expr, arguments,
                name=name+"_stage1", options=options, preamble=preamble,
                max_group_size=max_group_size)

        kernel_max_wg_size = self.stage_1_inf.kernel.get_work_group_info(
                cl.kernel_work_group_info.WORK_GROUP_SIZE,
                ctx.devices[0])

        if self.stage_1_inf.group_size <= kernel_max_wg_size:
            break
        else:
            max_group_size = kernel_max_wg_size

        trip_count += 1
        assert trip_count <= 2

    self.stage_2_inf = get_reduction_kernel(2, ctx,
            dtype_out,
            neutral, reduce_expr, arguments=arguments,
            name=name+"_stage2", options=options, preamble=preamble,
            max_group_size=max_group_size)

    from pytools import any
    from pyopencl.tools import VectorArg
    assert any(
            isinstance(arg_tp, VectorArg)
            for arg_tp in self.stage_1_inf.arg_types), \
            "ReductionKernel can only be used with functions " \
            "that have at least one vector argument"
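# A usage sketch for the two-stage reduction constructor above (hedged: this
# assumes the public pyopencl.reduction.ReductionKernel API; the dot-product
# expressions are purely illustrative):
import numpy as np
import pyopencl as cl
import pyopencl.array
from pyopencl.reduction import ReductionKernel

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

# map_expr is applied per element, reduce_expr combines pairs of results
dot = ReductionKernel(ctx, np.float32, neutral="0",
        reduce_expr="a+b", map_expr="x[i]*y[i]",
        arguments="__global const float *x, __global const float *y")

x = cl.array.arange(queue, 400, dtype=np.float32)
y = cl.array.arange(queue, 400, dtype=np.float32)
result = dot(x, y).get()  # the scalar reduction result, as a numpy value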
def is_affine(self):
    from pytools import any

    has_high_order_geometry = any(
            sum(mid) >= 2 and abs(mc) >= 1e-13
            for mc_along_axis in self.modal_coeff.T
            for mid, mc in zip(
                self.ldis.generate_mode_identifiers(),
                mc_along_axis))

    return not has_high_order_geometry
def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z):
    # Name mangling: e.g. "min" becomes "fminf" for all-float32 operands
    # and "fmin" as soon as a float64 operand is involved.
    if np.float64 not in [dtype_x, dtype_y]:
        func = func + "f"

    from pytools import any
    if any(dt.kind == "f" for dt in [dtype_x, dtype_y, dtype_z]):
        func = "f" + func

    return get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z)
def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar):
    if np.float64 not in [dtype_x, dtype_y]:
        func = func + "f"

    from pytools import any
    if any(dt.kind == "f" for dt in [dtype_x, dtype_y, dtype_z]):
        func = "f" + func

    if use_scalar:
        return get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z)
    else:
        return get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z)
def has_barrier_within(kernel, sched_index):
    sched_item = kernel.schedule[sched_index]

    if isinstance(sched_item, EnterLoop):
        loop_contents, _ = gather_schedule_subloop(
                kernel.schedule, sched_index)
        from pytools import any
        return any(isinstance(subsched_item, Barrier)
                for subsched_item in loop_contents)
    elif isinstance(sched_item, Barrier):
        return True
    else:
        return False
def p2p(kernels):
    # Note: "self" and "actx" are not parameters here; this function
    # evidently relies on them being bound in an enclosing scope, i.e. it
    # is a closure.
    from pytools import any
    if any(knl.is_complex_valued for knl in kernels):
        value_dtype = self.complex_dtype
    else:
        value_dtype = self.real_dtype

    from sumpy.p2p import P2P
    return P2P(actx.context,
            kernels, exclude_self=False, value_dtypes=value_dtype)
def get_p2p(self, kernels):
    # needs to be separate method for caching
    from pytools import any
    if any(knl.is_complex_valued for knl in kernels):
        value_dtype = self.density_discr.complex_dtype
    else:
        value_dtype = self.density_discr.real_dtype

    from sumpy.p2p import P2P
    p2p = P2P(self.cl_context,
            kernels, exclude_self=False, value_dtypes=value_dtype)

    return p2p
def get_lpot_applier(self, kernels):
    # needs to be separate method for caching
    from pytools import any
    if any(knl.is_complex_valued for knl in kernels):
        value_dtype = self.density_discr.complex_dtype
    else:
        value_dtype = self.density_discr.real_dtype

    from sumpy.qbx import LayerPotential
    return LayerPotential(self.cl_context,
            [self.expansion_getter(knl, self.qbx_order)
                for knl in kernels],
            value_dtypes=value_dtype)
def __init__(self, dtype_out,
        neutral, reduce_expr, arguments=None, map_exprs=[None],
        name="reduce_kernel", options=[], preamble=""):
    ctx = get_device().context
    dtype_out = self.dtype_out = np.dtype(dtype_out)

    max_group_size = None
    trip_count = 0

    self.n_exprs = len(map_exprs)
    assert self.n_exprs > 0

    while True:
        self.stage_1_inf = get_reduction_kernel(1, ctx,
                dtype_out,
                neutral, reduce_expr, arguments,
                name=name+"_stage1", options=options, preamble=preamble,
                map_exprs=map_exprs,
                max_group_size=max_group_size)

        kernel_max_wg_size = self.stage_1_inf.kernel.get_work_group_info(
                cl.kernel_work_group_info.WORK_GROUP_SIZE,
                ctx.devices[0])

        if self.stage_1_inf.group_size <= kernel_max_wg_size:
            break
        else:
            max_group_size = kernel_max_wg_size

        trip_count += 1
        assert trip_count <= 2

    self.stage_2_inf = get_reduction_kernel(2, ctx,
            dtype_out,
            neutral, reduce_expr, arguments=arguments,
            name=name+"_stage2", options=options,
            map_exprs=map_exprs, preamble=preamble,
            max_group_size=max_group_size)

    from pytools import any
    from pyopencl.tools import VectorArg
    assert any(
            isinstance(arg_tp, VectorArg)
            for arg_tp in self.stage_1_inf.arg_types), \
            "ReductionKernel can only be used with functions " \
            "that have at least one vector argument"
def get_lpot_applier_on_tgt_subset(self, kernels):
    # needs to be separate method for caching
    from pytools import any
    if any(knl.is_complex_valued for knl in kernels):
        value_dtype = self.density_discr.complex_dtype
    else:
        value_dtype = self.density_discr.real_dtype

    from pytential.qbx.direct import LayerPotentialOnTargetAndCenterSubset
    from sumpy.expansion.local import VolumeTaylorLocalExpansion
    return LayerPotentialOnTargetAndCenterSubset(self.cl_context,
            [VolumeTaylorLocalExpansion(knl, self.qbx_order)
                for knl in kernels],
            value_dtypes=value_dtype)
def get_lpot_applier(self, kernels):
    # needs to be separate method for caching
    from pytools import any
    if any(knl.is_complex_valued for knl in kernels):
        value_dtype = self.density_discr.complex_dtype
    else:
        value_dtype = self.density_discr.real_dtype

    from sumpy.qbx import LayerPotential
    from sumpy.expansion.local import LineTaylorLocalExpansion
    return LayerPotential(self.cl_context,
            [LineTaylorLocalExpansion(knl, self.qbx_order)
                for knl in kernels],
            value_dtypes=value_dtype)
def stringify(self, coeff_stringifier, enclosing_prec):
    from pymbolic.mapper.stringifier import PREC_PRODUCT, PREC_SUM

    terms = []
    for bits in sorted(six.iterkeys(self.data),
            key=lambda bits: (bit_count(bits), bits)):
        coeff = self.data[bits]

        # {{{ try to find a stringifier

        strifier = None
        if coeff_stringifier is None:
            try:
                strifier = coeff.stringifier()()
            except AttributeError:
                pass
        else:
            strifier = coeff_stringifier

        # }}}

        if strifier is not None:
            if bits:
                coeff_str = strifier(coeff, PREC_PRODUCT)
            else:
                coeff_str = strifier(coeff, PREC_SUM)
        else:
            coeff_str = str(coeff)

        blade_str = self.space.blade_bits_to_str(bits)
        if bits:
            terms.append("%s * %s" % (blade_str, coeff_str))
        else:
            terms.append(coeff_str)

    if terms:
        if any(len(t) > 15 for t in terms):
            result = "\n    " + "\n    + ".join(terms)
        else:
            result = " + ".join(terms)
    else:
        result = "0"

    return "MV(%s)" % result
def finalize_multi_assign(self, names, exprs, do_not_return, priority):
    from pytools import any
    from hedge.tools import is_zero

    has_zero_assignees = any(is_zero(expr) for expr in exprs)
    if has_zero_assignees:
        if len(exprs) > 1:
            raise RuntimeError("found aggregated zero constant assignment")

    from hedge.optemplate import FlopCounter
    flop_count = sum(FlopCounter()(expr) for expr in exprs)

    if has_zero_assignees or flop_count == 0:
        return Assign(names, exprs, priority=priority,
                dep_mapper_factory=self.dep_mapper_factory)
    else:
        return VectorExprAssign(names=names, exprs=exprs,
                do_not_return=do_not_return,
                dep_mapper_factory=self.dep_mapper_factory,
                priority=priority)
def __init__(self, ctx, dtype_out,
        neutral, reduce_expr, map_expr=None, arguments=None,
        name="reduce_kernel", options=[], preamble=""):
    dtype_out = self.dtype_out = np.dtype(dtype_out)

    self.stage_1_inf = get_reduction_kernel(ctx,
            dtype_to_ctype(dtype_out), dtype_out.itemsize,
            neutral, reduce_expr, map_expr, arguments,
            name=name+"_stage1", options=options, preamble=preamble)

    # stage 2 has only one input and no map expression
    self.stage_2_inf = get_reduction_kernel(ctx,
            dtype_to_ctype(dtype_out), dtype_out.itemsize,
            neutral, reduce_expr,
            name=name+"_stage2", options=options, preamble=preamble)

    from pytools import any
    from pyopencl.tools import VectorArg
    assert any(
            isinstance(arg_tp, VectorArg)
            for arg_tp in self.stage_1_inf.arg_types), \
            "ReductionKernel can only be used with functions that have " \
            "at least one vector argument"
def map_logical_or(self, expr):
    from pytools import any
    return any(self.rec(ch) for ch in expr.children)
def __call__(self, queue, n_objects, *args, **kwargs):
    """
    :arg args: arguments corresponding to arg_decls in the constructor.
        :class:`pyopencl.array.Array` objects are not allowed directly
        and should be passed as their :attr:`pyopencl.array.Array.data`
        attribute instead.
    :arg allocator: optionally, the allocator to use to allocate new
        arrays.
    :arg omit_lists: An iterable of list names that should *not* be built
        with this invocation. The kernel code may *not* call
        ``APPEND_name`` for these omitted lists. If it does, undefined
        behavior will result. The returned *lists* dictionary will not
        contain an entry for names in *omit_lists*.
    :arg wait_for: |explain-waitfor|
    :returns: a tuple ``(lists, event)``, where *lists* is a mapping from
        (built) list names to objects which have attributes

        * ``count`` for the total number of entries in all lists combined
        * ``lists`` for the array containing all lists.
        * ``starts`` for the array of starting indices in ``lists``.
          ``starts`` is built so that it has n+1 entries, so that the
          *i*'th entry is the start of the *i*'th list, and the *i*+1'th
          entry is the index one past the *i*'th list's end, even for
          the last list. This implies that all lists are contiguous.

        *event* is a :class:`pyopencl.Event` for dependency management.

    .. versionchanged:: 2016.2

        Added omit_lists.
    """
    if n_objects >= int(np.iinfo(np.int32).max):
        index_dtype = np.int64
    else:
        index_dtype = np.int32
    index_dtype = np.dtype(index_dtype)

    allocator = kwargs.pop("allocator", None)
    omit_lists = kwargs.pop("omit_lists", [])
    wait_for = kwargs.pop("wait_for", None)
    if kwargs:
        raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

    for l in omit_lists:
        if not any(l == name for name, _ in self.list_names_and_dtypes):
            raise ValueError("invalid list name '%s' in omit_lists" % l)

    result = {}
    count_list_args = []

    if wait_for is None:
        wait_for = []

    count_kernel = self.get_count_kernel(index_dtype)
    write_kernel = self.get_write_kernel(index_dtype)
    scan_kernel = self.get_scan_kernel(index_dtype)

    # {{{ allocate memory for counts

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue
        if name in omit_lists:
            count_list_args.append(None)
            continue

        counts = cl.array.empty(queue,
                (n_objects + 1), index_dtype, allocator=allocator)
        counts[-1] = 0
        wait_for = wait_for + counts.events

        # The scan will turn the "counts" array into the "starts" array
        # in-place.
        result[name] = BuiltList(starts=counts)
        count_list_args.append(counts.data)

    # }}}

    if self.debug:
        gsize = (1,)
        lsize = (1,)
    elif self.complex_kernel and queue.device.type == cl.device_type.CPU:
        gsize = (4*queue.device.max_compute_units,)
        lsize = (1,)
    else:
        from pyopencl.array import splay
        gsize, lsize = splay(queue, n_objects)

    count_event = count_kernel(queue, gsize, lsize,
            *(tuple(count_list_args) + args + (n_objects,)),
            **dict(wait_for=wait_for))

    # {{{ run scans

    scan_events = []

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue
        if name in omit_lists:
            continue

        info_record = result[name]
        starts_ary = info_record.starts
        evt = scan_kernel(starts_ary, wait_for=[count_event],
                size=n_objects)

        starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
        scan_events.extend(starts_ary.events)

        # retrieve count
        info_record.count = int(starts_ary[-1].get())

    # }}}

    # {{{ deal with count-sharing lists, allocate memory for lists

    write_list_args = []
    for name, dtype in self.list_names_and_dtypes:
        if name in omit_lists:
            write_list_args.append(None)
            if name not in self.count_sharing:
                write_list_args.append(None)
            continue

        if name in self.count_sharing:
            sharing_from = self.count_sharing[name]

            info_record = result[name] = BuiltList(
                    count=result[sharing_from].count,
                    starts=result[sharing_from].starts,
                    )

        else:
            info_record = result[name]

        info_record.lists = cl.array.empty(queue,
                info_record.count, dtype, allocator=allocator)
        write_list_args.append(info_record.lists.data)

        if name not in self.count_sharing:
            write_list_args.append(info_record.starts.data)

    # }}}

    evt = write_kernel(queue, gsize, lsize,
            *(tuple(write_list_args) + args + (n_objects,)),
            **dict(wait_for=scan_events))

    return result, evt
def __init__(self, data, space=None):
    """
    :arg data: This may be one of the following:

        * a :class:`numpy.ndarray`, which will be turned into a
          grade-1 multivector,
        * a mapping from tuples of basis indices (together indicating a
          blade, order matters and will be mapped to 'normalized' blades)
          to coefficients,
        * an array as described in :attr:`data`,
        * a scalar--where everything that doesn't fall into the above
          cases is viewed as a scalar.
    :arg space: A :class:`Space` instance. If *None* or an integer,
        :func:`get_euclidean_space` is called to obtain a default space
        with the right number of dimensions for *data*. Note: dimension
        guessing only works when a :class:`numpy.ndarray` is being passed
        for *data*.
    """
    dimensions = None

    if isinstance(data, np.ndarray):
        if len(data.shape) != 1:
            raise ValueError(
                    "only numpy vectors (not higher-rank objects) "
                    "are supported for 'data'")
        dimensions, = data.shape
        data = dict(((i,), xi) for i, xi in enumerate(data))
    elif isinstance(data, dict):
        pass
    else:
        data = {0: data}

    if space is None:
        space = get_euclidean_space(dimensions)
    else:
        if dimensions is not None and space.dimensions != dimensions:
            raise ValueError(
                    "dimension count of 'space' does not match that of 'data'")

    # {{{ normalize data to bitmaps, if needed

    from pytools import single_valued
    from pymbolic.primitives import is_zero
    if data and single_valued(
            isinstance(k, tuple) for k in six.iterkeys(data)):
        # data is in non-normalized non-bits tuple form
        new_data = {}
        for basis_indices, coeff in six.iteritems(data):
            bits, sign = space.bits_and_sign(basis_indices)
            new_coeff = new_data.setdefault(bits, 0) + sign*coeff

            if is_zero(new_coeff):
                del new_data[bits]
            else:
                new_data[bits] = new_coeff

        data = new_data

    # }}}

    # assert that multivectors don't get nested
    from pytools import any
    assert not any(
            isinstance(coeff, MultiVector) for coeff in six.itervalues(data))

    self.space = space
    self.data = data
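# A brief usage sketch for the constructor above (hedged: assumes this is
# the MultiVector constructor from pymbolic.geometric_algebra, which the
# surrounding code suggests; the values are illustrative):
import numpy as np
from pymbolic.geometric_algebra import MultiVector

v = MultiVector(np.array([3.0, 4.0]))  # grade-1 vector, 2D Euclidean space
b = MultiVector({(0, 1): 1.0})         # blade given in basis-index tuple form
s = MultiVector(5.0)                   # scalar multivector
# Per the normalization step above, coefficients end up keyed by
# basis-blade bitmaps, e.g. e0 -> 0b01 and e1 -> 0b10:
assert set(v.data) == {0b01, 0b10}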
def do_not_vectorize(self):
    from pytools import any

    # Device types are bit fields, hence the bitwise "and" here.
    return (self.complex_kernel
            and any(dev.type & cl.device_type.CPU
                for dev in self.context.devices))
def do_not_vectorize(self):
    from pytools import any
    return self.complex_kernel and any(
            dev.type == cl.device_type.CPU
            for dev in self.context.devices)
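# Note the contrast between the two variants above: "dev.type &
# cl.device_type.CPU" treats the device type as a bit field and so also
# matches combined device types, while "==" requires an exact match.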
def is_name_conflicting(self, name):
    from pytools import any
    return any(
            _is_var_name_conflicting(name, other_name)
            for other_name in self.existing_names)
def __call__(self, queue, n_objects, *args, **kwargs):
    """
    :arg args: arguments corresponding to arg_decls in the constructor.
        :class:`pyopencl.array.Array` objects are not allowed directly
        and should be passed as their :attr:`pyopencl.array.Array.data`
        attribute instead.
    :arg allocator: optionally, the allocator to use to allocate new
        arrays.
    :arg omit_lists: An iterable of list names that should *not* be built
        with this invocation. The kernel code may *not* call
        ``APPEND_name`` for these omitted lists. If it does, undefined
        behavior will result. The returned *lists* dictionary will not
        contain an entry for names in *omit_lists*.
    :arg wait_for: |explain-waitfor|
    :returns: a tuple ``(lists, event)``, where *lists* is a mapping from
        (built) list names to objects which have attributes

        * ``count`` for the total number of entries in all lists combined
        * ``lists`` for the array containing all lists.
        * ``starts`` for the array of starting indices in ``lists``.
          ``starts`` is built so that it has n+1 entries, so that the
          *i*'th entry is the start of the *i*'th list, and the *i*+1'th
          entry is the index one past the *i*'th list's end, even for
          the last list. This implies that all lists are contiguous.

        *event* is a :class:`pyopencl.Event` for dependency management.

    .. versionchanged:: 2016.2

        Added omit_lists.
    """
    if n_objects >= int(np.iinfo(np.int32).max):
        index_dtype = np.int64
    else:
        index_dtype = np.int32
    index_dtype = np.dtype(index_dtype)

    allocator = kwargs.pop("allocator", None)
    omit_lists = kwargs.pop("omit_lists", [])
    wait_for = kwargs.pop("wait_for", None)
    if kwargs:
        raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

    for oml in omit_lists:
        if not any(oml == name for name, _ in self.list_names_and_dtypes):
            raise ValueError("invalid list name '%s' in omit_lists" % oml)

    result = {}
    count_list_args = []

    if wait_for is None:
        wait_for = []

    count_kernel = self.get_count_kernel(index_dtype)
    write_kernel = self.get_write_kernel(index_dtype)
    scan_kernel = self.get_scan_kernel(index_dtype)

    # {{{ allocate memory for counts

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue
        if name in omit_lists:
            count_list_args.append(None)
            continue

        counts = cl.array.empty(queue,
                (n_objects + 1), index_dtype, allocator=allocator)
        counts[-1] = 0
        wait_for = wait_for + counts.events

        # The scan will turn the "counts" array into the "starts" array
        # in-place.
        result[name] = BuiltList(starts=counts)
        count_list_args.append(counts.data)

    # }}}

    if self.debug:
        gsize = (1,)
        lsize = (1,)
    elif self.do_not_vectorize():
        gsize = (4*queue.device.max_compute_units,)
        lsize = (1,)
    else:
        from pyopencl.array import splay
        gsize, lsize = splay(queue, n_objects)

    count_event = count_kernel(queue, gsize, lsize,
            *(tuple(count_list_args) + args + (n_objects,)),
            **dict(wait_for=wait_for))

    # {{{ run scans

    scan_events = []

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue
        if name in omit_lists:
            continue

        info_record = result[name]
        starts_ary = info_record.starts
        evt = scan_kernel(starts_ary, wait_for=[count_event],
                size=n_objects)

        starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
        scan_events.extend(starts_ary.events)

        # retrieve count
        info_record.count = int(starts_ary[-1].get())

    # }}}

    # {{{ deal with count-sharing lists, allocate memory for lists

    write_list_args = []
    for name, dtype in self.list_names_and_dtypes:
        if name in omit_lists:
            write_list_args.append(None)
            if name not in self.count_sharing:
                write_list_args.append(None)
            continue

        if name in self.count_sharing:
            sharing_from = self.count_sharing[name]

            info_record = result[name] = BuiltList(
                    count=result[sharing_from].count,
                    starts=result[sharing_from].starts,
                    )

        else:
            info_record = result[name]

        info_record.lists = cl.array.empty(queue,
                info_record.count, dtype, allocator=allocator)
        write_list_args.append(info_record.lists.data)

        if name not in self.count_sharing:
            write_list_args.append(info_record.starts.data)

    # }}}

    evt = write_kernel(queue, gsize, lsize,
            *(tuple(write_list_args) + args + (n_objects,)),
            **dict(wait_for=scan_events))

    return result, evt
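# Hedged sketch of consuming the (lists, event) return value documented in
# the docstring above; "builder" and "mylist" are illustrative names:
lists, evt = builder(queue, n_objects)
mylist = lists["mylist"]
starts = mylist.starts.get()   # n_objects + 1 entries
data = mylist.lists.get()      # all lists, stored contiguously
for i in range(n_objects):
    # entries appended by object i live in the half-open range
    # [starts[i], starts[i+1])
    entries_i = data[starts[i]:starts[i+1]]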
def match_dtype_to_c_struct(device, name, dtype, context=None):
    """Return a tuple `(dtype, c_decl)` such that the C struct declaration
    in `c_decl` and the structure :class:`numpy.dtype` instance `dtype`
    have the same memory layout.

    Note that *dtype* may be modified from the value that was passed in,
    for example to insert padding.

    (As a remark on implementation, this routine runs a small kernel on
    the given *device* to ensure that :mod:`numpy` and C offsets and sizes
    match.)

    .. versionadded: 2013.1

    This example explains the use of this function::

        >>> import numpy as np
        >>> import pyopencl as cl
        >>> import pyopencl.tools
        >>> ctx = cl.create_some_context()
        >>> dtype = np.dtype([("id", np.uint32), ("value", np.float32)])
        >>> dtype, c_decl = pyopencl.tools.match_dtype_to_c_struct(
        ...     ctx.devices[0], 'id_val', dtype)
        >>> print(c_decl)
        typedef struct {
          unsigned id;
          float value;
        } id_val;
        >>> print(dtype)
        [('id', '<u4'), ('value', '<f4')]
        >>> cl.tools.get_or_register_dtype('id_val', dtype)

    As this example shows, it is important to call
    :func:`get_or_register_dtype` on the modified `dtype` returned by this
    function, not the original one.
    """

    fields = sorted(six.iteritems(dtype.fields),
            key=lambda name_dtype_offset: name_dtype_offset[1][1])

    c_fields = []
    for field_name, dtype_and_offset in fields:
        field_dtype, offset = dtype_and_offset[:2]
        c_fields.append("  %s %s;" % (dtype_to_ctype(field_dtype), field_name))

    c_decl = "typedef struct {\n%s\n} %s;\n\n" % (
            "\n".join(c_fields),
            name)

    cdl = _CDeclList(device)
    for field_name, dtype_and_offset in fields:
        field_dtype, offset = dtype_and_offset[:2]
        cdl.add_dtype(field_dtype)

    pre_decls = cdl.get_declarations()

    offset_code = "\n".join(
            "result[%d] = pycl_offsetof(%s, %s);" % (i+1, name, field_name)
            for i, (field_name, _) in enumerate(fields))

    src = r"""
        #define pycl_offsetof(st, m) \
                 ((size_t) ((__local char *) &(dummy.m) \
                 - (__local char *)&dummy ))

        %(pre_decls)s

        %(my_decl)s

        __kernel void get_size_and_offsets(__global size_t *result)
        {
            result[0] = sizeof(%(my_type)s);
            __local %(my_type)s dummy;
            %(offset_code)s
        }
    """ % dict(
            pre_decls=pre_decls,
            my_decl=c_decl,
            my_type=name,
            offset_code=offset_code)

    if context is None:
        context = cl.Context([device])

    queue = cl.CommandQueue(context)

    prg = cl.Program(context, src)
    knl = prg.build(devices=[device]).get_size_and_offsets

    import pyopencl.array  # noqa
    result_buf = cl.array.empty(queue, 1+len(fields), np.uintp)
    knl(queue, (1,), (1,), result_buf.data)
    queue.finish()
    size_and_offsets = result_buf.get()

    size = int(size_and_offsets[0])

    from pytools import any
    offsets = size_and_offsets[1:]
    if any(ofs >= size for ofs in offsets):
        # offsets not plausible

        if dtype.itemsize == size:
            # If sizes match, use numpy's idea of the offsets.
            offsets = [dtype_and_offset[1]
                    for field_name, dtype_and_offset in fields]
        else:
            raise RuntimeError(
                    "OpenCL compiler reported offsetof() past sizeof() "
                    "for struct layout on '%s'. "
                    "This makes no sense, and it usually indicates a "
                    "compiler bug. "
                    "Refusing to discover struct layout." % device)

    result_buf.data.release()
    del knl
    del prg
    del queue
    del context

    try:
        dtype_arg_dict = {
                "names": [field_name
                    for field_name, (field_dtype, offset) in fields],
                "formats": [field_dtype
                    for field_name, (field_dtype, offset) in fields],
                "offsets": [int(x) for x in offsets],
                "itemsize": int(size_and_offsets[0]),
                }
        dtype = np.dtype(dtype_arg_dict)
        if dtype.itemsize != size_and_offsets[0]:
            # "Old" versions of numpy (1.6.x?) silently ignore "itemsize".
            # Boo.
            dtype_arg_dict["names"].append("_pycl_size_fixer")
            dtype_arg_dict["formats"].append(np.uint8)
            dtype_arg_dict["offsets"].append(int(size_and_offsets[0])-1)
            dtype = np.dtype(dtype_arg_dict)

    except NotImplementedError:
        def calc_field_type():
            total_size = 0
            padding_count = 0
            for offset, (field_name, (field_dtype, _)) in zip(offsets, fields):
                if offset > total_size:
                    padding_count += 1
                    yield ("__pycl_padding%d" % padding_count,
                            "V%d" % (offset - total_size))

                yield field_name, field_dtype
                total_size = field_dtype.itemsize + offset

        dtype = np.dtype(list(calc_field_type()))

    assert dtype.itemsize == size_and_offsets[0]

    return dtype, c_decl
def aggregate_assignments(self, instructions, result):
    from pymbolic.primitives import Variable

    # aggregation helpers -------------------------------------------------
    def get_complete_origins_set(insn, skip_levels=0):
        if skip_levels < 0:
            skip_levels = 0

        result = set()
        for dep in insn.get_dependencies():
            if isinstance(dep, Variable):
                dep_origin = origins_map.get(dep.name, None)
                if dep_origin is not None:
                    if skip_levels <= 0:
                        result.add(dep_origin)
                    result |= get_complete_origins_set(
                            dep_origin, skip_levels-1)

        return result

    var_assignees_cache = {}

    def get_var_assignees(insn):
        try:
            return var_assignees_cache[insn]
        except KeyError:
            result = set(Variable(assignee)
                    for assignee in insn.get_assignees())
            var_assignees_cache[insn] = result
            return result

    def aggregate_two_assignments(ass_1, ass_2):
        names = ass_1.names + ass_2.names

        from pymbolic.primitives import Variable
        deps = (ass_1.get_dependencies() | ass_2.get_dependencies()) \
                - set(Variable(name) for name in names)

        return Assign(
                names=names, exprs=ass_1.exprs + ass_2.exprs,
                _dependencies=deps,
                dep_mapper_factory=self.dep_mapper_factory,
                priority=max(ass_1.priority, ass_2.priority))

    # main aggregation pass -----------------------------------------------
    origins_map = dict(
            (assignee, insn)
            for insn in instructions
            for assignee in insn.get_assignees())

    from pytools import partition
    unprocessed_assigns, other_insns = partition(
            lambda insn: isinstance(insn, Assign),
            instructions)

    # filter out zero-flop-count assigns--no need to bother with those
    processed_assigns, unprocessed_assigns = partition(
            lambda ass: ass.flop_count() == 0,
            unprocessed_assigns)

    # filter out zero assignments
    from pytools import any
    from hedge.tools import is_zero

    i = 0

    while i < len(unprocessed_assigns):
        my_assign = unprocessed_assigns[i]
        if any(is_zero(expr) for expr in my_assign.exprs):
            # move the zero assignment itself to the processed list
            processed_assigns.append(unprocessed_assigns.pop(i))
        else:
            i += 1

    # greedy aggregation
    while unprocessed_assigns:
        my_assign = unprocessed_assigns.pop()

        my_deps = my_assign.get_dependencies()
        my_assignees = get_var_assignees(my_assign)

        agg_candidates = []
        for i, other_assign in enumerate(unprocessed_assigns):
            other_deps = other_assign.get_dependencies()
            other_assignees = get_var_assignees(other_assign)

            if ((my_deps & other_deps
                    or my_deps & other_assignees
                    or other_deps & my_assignees)
                    and my_assign.priority == other_assign.priority):
                agg_candidates.append((i, other_assign))

        did_work = False

        if agg_candidates:
            my_indirect_origins = get_complete_origins_set(
                    my_assign, skip_levels=1)

            for other_assign_index, other_assign in agg_candidates:
                if self.max_vectors_in_batch_expr is not None:
                    new_assignee_count = len(
                            set(my_assign.get_assignees())
                            | set(other_assign.get_assignees()))
                    new_dep_count = len(
                            my_assign.get_dependencies(each_vector=True)
                            | other_assign.get_dependencies(each_vector=True))

                    if (new_assignee_count + new_dep_count
                            > self.max_vectors_in_batch_expr):
                        continue

                other_indirect_origins = get_complete_origins_set(
                        other_assign, skip_levels=1)

                if (my_assign not in other_indirect_origins
                        and other_assign not in my_indirect_origins):
                    did_work = True

                    # aggregate the two assignments
                    new_assignment = aggregate_two_assignments(
                            my_assign, other_assign)
                    del unprocessed_assigns[other_assign_index]
                    unprocessed_assigns.append(new_assignment)
                    for assignee in new_assignment.get_assignees():
                        origins_map[assignee] = new_assignment

                    break

        if not did_work:
            processed_assigns.append(my_assign)

    externally_used_names = set(
            expr
            for insn in processed_assigns + other_insns
            for expr in insn.get_dependencies())

    from hedge.tools import is_obj_array
    if is_obj_array(result):
        externally_used_names |= set(expr for expr in result)
    else:
        externally_used_names |= set([result])

    def schedule_and_finalize_assignment(ass):
        dep_mapper = self.dep_mapper_factory()

        names_exprs = zip(ass.names, ass.exprs)

        my_assignees = set(name for name, expr in names_exprs)
        names_exprs_deps = [
                (name, expr,
                    set(dep.name for dep in dep_mapper(expr)
                        if isinstance(dep, Variable)) & my_assignees)
                for name, expr in names_exprs]

        ordered_names_exprs = []
        available_names = set()

        while names_exprs_deps:
            schedulable = []

            i = 0
            while i < len(names_exprs_deps):
                name, expr, deps = names_exprs_deps[i]

                unsatisfied_deps = deps - available_names

                if not unsatisfied_deps:
                    schedulable.append((str(expr), name, expr))
                    del names_exprs_deps[i]
                else:
                    i += 1

            # make sure these come out in a constant order
            schedulable.sort()

            if schedulable:
                for key, name, expr in schedulable:
                    ordered_names_exprs.append((name, expr))
                    available_names.add(name)
            else:
                raise RuntimeError("aggregation resulted in an "
                        "impossible assignment")

        return self.finalize_multi_assign(
                names=[name for name, expr in ordered_names_exprs],
                exprs=[expr for name, expr in ordered_names_exprs],
                do_not_return=[Variable(name) not in externally_used_names
                    for name, expr in ordered_names_exprs],
                priority=ass.priority)

    return [schedule_and_finalize_assignment(ass)
            for ass in processed_assigns] + other_insns
def __init__(self, context, list_names_and_dtypes, generate_template,
        arg_decls, count_sharing=None, devices=None,
        name_prefix="plb_build_list", options=[], preamble="",
        debug=False, complex_kernel=False,
        eliminate_empty_output_lists=[]):
    """
    :arg context: A :class:`pyopencl.Context`.
    :arg list_names_and_dtypes: a list of `(name, dtype)` tuples
        indicating the lists to be built.
    :arg generate_template: a snippet of C as described below
    :arg arg_decls: A string of comma-separated C argument declarations.
    :arg count_sharing: A mapping consisting of `(child, mother)`
        indicating that `mother` and `child` will always have the same
        number of indices, and the `APPEND` to `mother` will always happen
        *before* the `APPEND` to the child.
    :arg name_prefix: the name prefix to use for the compiled kernels
    :arg options: OpenCL compilation options for kernels using
        *generate_template*.
    :arg complex_kernel: If `True`, prevents vectorization on CPUs.
    :arg eliminate_empty_output_lists: A Python list of list names
        for which the empty output lists are eliminated.

    *generate_template* may use the following C macros/identifiers:

    * `index_type`: expands to C identifier for the index type used
      for the calculation
    * `USER_ARG_DECL`: expands to the C declarator for `arg_decls`
    * `USER_ARGS`: a list of C argument values corresponding to
      `user_arg_decl`
    * `LIST_ARG_DECL`: expands to a C argument list representing the
      data for the output lists. These are escaped prefixed with
      `"plg_"` so as to not interfere with user-provided names.
    * `LIST_ARGS`: a list of C argument values corresponding to
      `LIST_ARG_DECL`
    * `APPEND_name(entry)`: inserts `entry` into the list `name`.
      *entry* must be a valid C expression of the correct type.

    All argument-list related macros have a trailing comma included
    if they are non-empty.

    *generate_template* must supply a function:

    .. code-block:: c

        void generate(USER_ARG_DECL LIST_ARG_DECL index_type i)
        {
            APPEND_mylist(5);
        }

    Internally, the `kernel_template` is expanded (at least) twice. Once,
    for a 'counting' stage where the size of all the lists is determined,
    and a second time, for a 'generation' stage where the lists are
    actually filled. A `generate` function that has side effects beyond
    calling `append` is therefore ill-formed.

    .. versionchanged:: 2018.1

        Change *eliminate_empty_output_lists* argument type from `bool` to
        `list`.
    """
    if devices is None:
        devices = context.devices

    if count_sharing is None:
        count_sharing = {}

    self.context = context
    self.devices = devices

    self.list_names_and_dtypes = list_names_and_dtypes
    self.generate_template = generate_template

    from pyopencl.tools import parse_arg_list
    self.arg_decls = parse_arg_list(arg_decls)

    # To match with the signature of the user-supplied generate(),
    # arguments can't appear to have offsets.
    arg_decls_no_offset = []
    from pyopencl.tools import VectorArg
    for arg in self.arg_decls:
        if isinstance(arg, VectorArg) and arg.with_offset:
            arg = VectorArg(arg.dtype, arg.name)
        arg_decls_no_offset.append(arg)
    self.arg_decls_no_offset = arg_decls_no_offset

    self.count_sharing = count_sharing

    self.name_prefix = name_prefix
    self.preamble = preamble
    self.options = options

    self.debug = debug

    self.complex_kernel = complex_kernel

    if eliminate_empty_output_lists is True:
        eliminate_empty_output_lists = \
                [name for name, _ in self.list_names_and_dtypes]

    if eliminate_empty_output_lists is False:
        eliminate_empty_output_lists = []

    self.eliminate_empty_output_lists = eliminate_empty_output_lists

    for list_name in self.eliminate_empty_output_lists:
        if not any(list_name == name
                for name, _ in self.list_names_and_dtypes):
            raise ValueError(
                    "invalid list name '%s' in eliminate_empty_output_lists"
                    % list_name)
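# A construction sketch matching the docstring above (hedged: this assumes
# the constructor belongs to pyopencl.algorithm.ListOfListsBuilder, which
# its signature matches; "mylist" is an illustrative list name, and
# "context"/"queue" are assumed to exist):
import numpy as np

builder = ListOfListsBuilder(context, [("mylist", np.int32)], """
    void generate(USER_ARG_DECL LIST_ARG_DECL index_type i)
    {
        int count = i % 4;
        for (int j = 0; j < count; ++j)
        {
            APPEND_mylist(count);
        }
    }
    """, arg_decls=[])

result, evt = builder(queue, 2000)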
def __call__(self, queue, n_objects, *args, **kwargs):
    """
    :arg args: arguments corresponding to arg_decls in the constructor.
        Array-like arguments must be either 1D
        :class:`pyopencl.array.Array` objects or
        :class:`pyopencl.MemoryObject` objects, of which the latter
        can be obtained from a :class:`pyopencl.array.Array` using the
        :attr:`pyopencl.array.Array.data` attribute.
    :arg allocator: optionally, the allocator to use to allocate new
        arrays.
    :arg omit_lists: An iterable of list names that should *not* be built
        with this invocation. The kernel code may *not* call
        ``APPEND_name`` for these omitted lists. If it does, undefined
        behavior will result. The returned *lists* dictionary will not
        contain an entry for names in *omit_lists*.
    :arg wait_for: |explain-waitfor|
    :returns: a tuple ``(lists, event)``, where *lists* is a mapping from
        (built) list names to objects which have attributes

        * ``count`` for the total number of entries in all lists combined
        * ``lists`` for the array containing all lists.
        * ``starts`` for the array of starting indices in ``lists``.
          ``starts`` is built so that it has n+1 entries, so that the
          *i*'th entry is the start of the *i*'th list, and the *i*+1'th
          entry is the index one past the *i*'th list's end, even for
          the last list. This implies that all lists are contiguous.

        If the list name is specified in the
        *eliminate_empty_output_lists* constructor argument, *lists* has
        two additional attributes ``num_nonempty_lists`` and
        ``nonempty_indices``

        * ``num_nonempty_lists`` for the number of nonempty lists.
        * ``nonempty_indices`` for the index of nonempty list in input
          objects.

        In this case, ``starts`` has ``num_nonempty_lists`` + 1 entries.
        The *i*'th entry is the start of the *i*'th nonempty list, which
        is generated by the object with index *nonempty_indices[i]*.

        *event* is a :class:`pyopencl.Event` for dependency management.

    .. versionchanged:: 2016.2

        Added omit_lists.
    """
    if n_objects >= int(np.iinfo(np.int32).max):
        index_dtype = np.int64
    else:
        index_dtype = np.int32
    index_dtype = np.dtype(index_dtype)

    allocator = kwargs.pop("allocator", None)
    omit_lists = kwargs.pop("omit_lists", [])
    wait_for = kwargs.pop("wait_for", None)
    if kwargs:
        raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

    for oml in omit_lists:
        if not any(oml == name for name, _ in self.list_names_and_dtypes):
            raise ValueError("invalid list name '%s' in omit_lists" % oml)

    result = {}
    count_list_args = []

    if wait_for is None:
        wait_for = []
    else:
        # We'll be modifying it below.
        wait_for = list(wait_for)

    count_kernel = self.get_count_kernel(index_dtype)
    write_kernel = self.get_write_kernel(index_dtype)
    scan_kernel = self.get_scan_kernel(index_dtype)
    if self.eliminate_empty_output_lists:
        compress_kernel = self.get_compress_kernel(index_dtype)

    data_args = []
    for i, (arg_descr, arg_val) in enumerate(zip(self.arg_decls, args)):
        from pyopencl.tools import VectorArg
        if isinstance(arg_descr, VectorArg):
            from pyopencl import MemoryObject
            if isinstance(arg_val, MemoryObject):
                data_args.append(arg_val)
                if arg_descr.with_offset:
                    raise ValueError(
                            "with_offset=True specified for argument %d "
                            "but the argument is not an array" % i)
                continue

            if arg_val.ndim != 1:
                raise ValueError("argument %d is a multidimensional array" % i)

            data_args.append(arg_val.base_data)
            if arg_descr.with_offset:
                data_args.append(arg_val.offset)
            wait_for.extend(arg_val.events)
        else:
            data_args.append(arg_val)

    del args
    data_args = tuple(data_args)

    # {{{ allocate memory for counts

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue
        if name in omit_lists:
            count_list_args.append(None)
            continue

        counts = cl.array.empty(queue,
                (n_objects + 1), index_dtype, allocator=allocator)
        counts[-1] = 0
        wait_for = wait_for + counts.events

        # The scan will turn the "counts" array into the "starts" array
        # in-place.
        if name in self.eliminate_empty_output_lists:
            result[name] = BuiltList(count=None, starts=counts, lists=None,
                    num_nonempty_lists=None, nonempty_indices=None)
        else:
            result[name] = BuiltList(count=None, starts=counts, lists=None)
        count_list_args.append(counts.data)

    # }}}

    if self.debug:
        gsize = (1,)
        lsize = (1,)
    elif self.do_not_vectorize():
        gsize = (4*queue.device.max_compute_units,)
        lsize = (1,)
    else:
        from pyopencl.array import splay
        gsize, lsize = splay(queue, n_objects)

    count_event = count_kernel(queue, gsize, lsize,
            *(tuple(count_list_args) + data_args + (n_objects,)),
            wait_for=wait_for)

    compress_events = {}
    for name, dtype in self.list_names_and_dtypes:
        if name in omit_lists:
            continue
        if name in self.count_sharing:
            continue
        if name not in self.eliminate_empty_output_lists:
            continue

        compressed_counts = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
        info_record = result[name]
        info_record.nonempty_indices = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
        info_record.num_nonempty_lists = cl.array.empty(
                queue, (1,), index_dtype, allocator=allocator)
        info_record.compressed_indices = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
        info_record.compressed_indices[0] = 0
        compress_events[name] = compress_kernel(
                info_record.starts,
                compressed_counts,
                info_record.nonempty_indices,
                info_record.compressed_indices,
                info_record.num_nonempty_lists,
                wait_for=[count_event]
                + info_record.compressed_indices.events)

        info_record.starts = compressed_counts

    # {{{ run scans

    scan_events = []

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue
        if name in omit_lists:
            continue

        info_record = result[name]
        if name in self.eliminate_empty_output_lists:
            compress_events[name].wait()
            num_nonempty_lists = info_record.num_nonempty_lists.get()[0]
            info_record.num_nonempty_lists = num_nonempty_lists
            info_record.starts = info_record.starts[:num_nonempty_lists + 1]
            info_record.nonempty_indices = \
                    info_record.nonempty_indices[:num_nonempty_lists]
            info_record.starts[-1] = 0

        starts_ary = info_record.starts
        if name in self.eliminate_empty_output_lists:
            evt = scan_kernel(
                    starts_ary,
                    size=info_record.num_nonempty_lists,
                    wait_for=starts_ary.events)
        else:
            evt = scan_kernel(starts_ary,
                    wait_for=[count_event],
                    size=n_objects)

        starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
        scan_events.extend(starts_ary.events)

        # retrieve count
        info_record.count = int(starts_ary[-1].get())

    # }}}

    # {{{ deal with count-sharing lists, allocate memory for lists

    write_list_args = []
    for name, dtype in self.list_names_and_dtypes:
        if name in omit_lists:
            write_list_args.append(None)
            if name not in self.count_sharing:
                write_list_args.append(None)
            if name in self.eliminate_empty_output_lists:
                write_list_args.append(None)
            continue

        if name in self.count_sharing:
            sharing_from = self.count_sharing[name]

            info_record = result[name] = BuiltList(
                    count=result[sharing_from].count,
                    starts=result[sharing_from].starts,
                    )

        else:
            info_record = result[name]

        info_record.lists = cl.array.empty(queue,
                info_record.count, dtype, allocator=allocator)
        write_list_args.append(info_record.lists.data)

        if name not in self.count_sharing:
            write_list_args.append(info_record.starts.data)

        if name in self.eliminate_empty_output_lists:
            write_list_args.append(info_record.compressed_indices.data)

    # }}}

    evt = write_kernel(queue, gsize, lsize,
            *(tuple(write_list_args) + data_args + (n_objects,)),
            wait_for=scan_events)

    return result, evt
def match_dtype_to_c_struct(device, name, dtype, context=None):
    """Return a tuple `(dtype, c_decl)` such that the C struct declaration
    in `c_decl` and the structure :class:`numpy.dtype` instance `dtype`
    have the same memory layout.

    Note that *dtype* may be modified from the value that was passed in,
    for example to insert padding.

    (As a remark on implementation, this routine runs a small kernel on
    the given *device* to ensure that :mod:`numpy` and C offsets and sizes
    match.)

    .. versionadded: 2013.1

    This example explains the use of this function::

        >>> import numpy as np
        >>> import pyopencl as cl
        >>> import pyopencl.tools
        >>> ctx = cl.create_some_context()
        >>> dtype = np.dtype([("id", np.uint32), ("value", np.float32)])
        >>> dtype, c_decl = pyopencl.tools.match_dtype_to_c_struct(
        ...     ctx.devices[0], 'id_val', dtype)
        >>> print c_decl
        typedef struct {
          unsigned id;
          float value;
        } id_val;
        >>> print dtype
        [('id', '<u4'), ('value', '<f4')]
        >>> cl.tools.get_or_register_dtype('id_val', dtype)

    As this example shows, it is important to call
    :func:`get_or_register_dtype` on the modified `dtype` returned by this
    function, not the original one.
    """

    # Note: this variant uses Python 2 syntax (dict.iteritems and
    # tuple-unpacking lambda parameters).
    fields = sorted(dtype.fields.iteritems(),
            key=lambda (name, (dtype, offset)): offset)

    c_fields = []
    for field_name, (field_dtype, offset) in fields:
        c_fields.append("  %s %s;" % (dtype_to_ctype(field_dtype), field_name))

    c_decl = "typedef struct {\n%s\n} %s;\n\n" % (
            "\n".join(c_fields),
            name)

    cdl = _CDeclList(device)
    for field_name, (field_dtype, offset) in fields:
        cdl.add_dtype(field_dtype)

    pre_decls = cdl.get_declarations()

    offset_code = "\n".join(
            "result[%d] = pycl_offsetof(%s, %s);" % (i+1, name, field_name)
            for i, (field_name, (field_dtype, offset)) in enumerate(fields))

    src = r"""
        #define pycl_offsetof(st, m) \
                 ((size_t) ((__local char *) &(dummy.m) \
                 - (__local char *)&dummy ))

        %(pre_decls)s

        %(my_decl)s

        __kernel void get_size_and_offsets(__global size_t *result)
        {
            result[0] = sizeof(%(my_type)s);
            __local %(my_type)s dummy;
            %(offset_code)s
        }
    """ % dict(
            pre_decls=pre_decls,
            my_decl=c_decl,
            my_type=name,
            offset_code=offset_code)

    if context is None:
        context = cl.Context([device])

    queue = cl.CommandQueue(context)

    prg = cl.Program(context, src)
    knl = prg.build(devices=[device]).get_size_and_offsets

    import pyopencl.array  # noqa
    result_buf = cl.array.empty(queue, 1+len(fields), np.uintp)
    knl(queue, (1,), (1,), result_buf.data)
    queue.finish()
    size_and_offsets = result_buf.get()

    size = int(size_and_offsets[0])

    from pytools import any
    offsets = size_and_offsets[1:]
    if any(ofs >= size for ofs in offsets):
        # offsets not plausible

        if dtype.itemsize == size:
            # If sizes match, use numpy's idea of the offsets.
            offsets = [offset
                    for field_name, (field_dtype, offset) in fields]
        else:
            raise RuntimeError(
                    "cannot discover struct layout on '%s'" % device)

    result_buf.data.release()
    del knl
    del prg
    del queue
    del context

    dtype_arg_dict = dict(
            names=[field_name
                for field_name, (field_dtype, offset) in fields],
            formats=[field_dtype
                for field_name, (field_dtype, offset) in fields],
            offsets=[int(x) for x in offsets],
            itemsize=int(size_and_offsets[0]),
            )
    dtype = np.dtype(dtype_arg_dict)

    if dtype.itemsize != size_and_offsets[0]:
        # "Old" versions of numpy (1.6.x?) silently ignore "itemsize". Boo.
        dtype_arg_dict["names"].append("_pycl_size_fixer")
        dtype_arg_dict["formats"].append(np.uint8)
        dtype_arg_dict["offsets"].append(int(size_and_offsets[0])-1)
        dtype = np.dtype(dtype_arg_dict)

    assert dtype.itemsize == size_and_offsets[0]

    return dtype, c_decl
def __call__(self, queue, n_objects, *args, **kwargs):
    """
    :arg args: arguments corresponding to arg_decls in the constructor.
        Array-like arguments must be either 1D
        :class:`pyopencl.array.Array` objects or
        :class:`pyopencl.MemoryObject` objects, of which the latter
        can be obtained from a :class:`pyopencl.array.Array` using the
        :attr:`pyopencl.array.Array.data` attribute.
    :arg allocator: optionally, the allocator to use to allocate new
        arrays.
    :arg omit_lists: An iterable of list names that should *not* be built
        with this invocation. The kernel code may *not* call
        ``APPEND_name`` for these omitted lists. If it does, undefined
        behavior will result. The returned *lists* dictionary will not
        contain an entry for names in *omit_lists*.
    :arg wait_for: |explain-waitfor|
    :returns: a tuple ``(lists, event)``, where *lists* is a mapping from
        (built) list names to objects which have attributes

        * ``count`` for the total number of entries in all lists combined
        * ``lists`` for the array containing all lists.
        * ``starts`` for the array of starting indices in ``lists``.
          ``starts`` is built so that it has n+1 entries, so that the
          *i*'th entry is the start of the *i*'th list, and the *i*+1'th
          entry is the index one past the *i*'th list's end, even for
          the last list. This implies that all lists are contiguous.

        If the list name is specified in the
        *eliminate_empty_output_lists* constructor argument, *lists* has
        two additional attributes ``num_nonempty_lists`` and
        ``nonempty_indices``

        * ``num_nonempty_lists`` for the number of nonempty lists.
        * ``nonempty_indices`` for the index of nonempty list in input
          objects.

        In this case, ``starts`` has ``num_nonempty_lists`` + 1 entries.
        The *i*'th entry is the start of the *i*'th nonempty list, which
        is generated by the object with index *nonempty_indices[i]*.

        *event* is a :class:`pyopencl.Event` for dependency management.

    .. versionchanged:: 2016.2

        Added omit_lists.
    """
    if n_objects >= int(np.iinfo(np.int32).max):
        index_dtype = np.int64
    else:
        index_dtype = np.int32
    index_dtype = np.dtype(index_dtype)

    allocator = kwargs.pop("allocator", None)
    omit_lists = kwargs.pop("omit_lists", [])
    wait_for = kwargs.pop("wait_for", None)
    if kwargs:
        raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

    for oml in omit_lists:
        if not any(oml == name for name, _ in self.list_names_and_dtypes):
            raise ValueError("invalid list name '%s' in omit_lists" % oml)

    result = {}
    count_list_args = []

    if wait_for is None:
        wait_for = []

    count_kernel = self.get_count_kernel(index_dtype)
    write_kernel = self.get_write_kernel(index_dtype)
    scan_kernel = self.get_scan_kernel(index_dtype)
    if self.eliminate_empty_output_lists:
        compress_kernel = self.get_compress_kernel(index_dtype)

    data_args = []
    for i, (arg_descr, arg_val) in enumerate(zip(self.arg_decls, args)):
        from pyopencl.tools import VectorArg
        if isinstance(arg_descr, VectorArg):
            from pyopencl import MemoryObject
            if isinstance(arg_val, MemoryObject):
                data_args.append(arg_val)
                if arg_descr.with_offset:
                    raise ValueError(
                            "with_offset=True specified for argument %d "
                            "but the argument is not an array" % i)
                continue

            if arg_val.ndim != 1:
                raise ValueError("argument %d is a multidimensional array" % i)

            data_args.append(arg_val.base_data)
            if arg_descr.with_offset:
                data_args.append(arg_val.offset)
        else:
            data_args.append(arg_val)

    del args
    data_args = tuple(data_args)

    # {{{ allocate memory for counts

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue
        if name in omit_lists:
            count_list_args.append(None)
            continue

        counts = cl.array.empty(queue,
                (n_objects + 1), index_dtype, allocator=allocator)
        counts[-1] = 0
        wait_for = wait_for + counts.events

        # The scan will turn the "counts" array into the "starts" array
        # in-place.
        if name in self.eliminate_empty_output_lists:
            result[name] = BuiltList(count=None, starts=counts, lists=None,
                    num_nonempty_lists=None, nonempty_indices=None)
        else:
            result[name] = BuiltList(count=None, starts=counts, lists=None)
        count_list_args.append(counts.data)

    # }}}

    if self.debug:
        gsize = (1,)
        lsize = (1,)
    elif self.do_not_vectorize():
        gsize = (4*queue.device.max_compute_units,)
        lsize = (1,)
    else:
        from pyopencl.array import splay
        gsize, lsize = splay(queue, n_objects)

    count_event = count_kernel(queue, gsize, lsize,
            *(tuple(count_list_args) + data_args + (n_objects,)),
            **dict(wait_for=wait_for))

    compress_events = {}
    for name, dtype in self.list_names_and_dtypes:
        if name in omit_lists:
            continue
        if name in self.count_sharing:
            continue
        if name not in self.eliminate_empty_output_lists:
            continue

        compressed_counts = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
        info_record = result[name]
        info_record.nonempty_indices = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
        info_record.num_nonempty_lists = cl.array.empty(
                queue, (1,), index_dtype, allocator=allocator)
        info_record.compressed_indices = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
        info_record.compressed_indices[0] = 0
        compress_events[name] = compress_kernel(
                info_record.starts,
                compressed_counts,
                info_record.nonempty_indices,
                info_record.compressed_indices,
                info_record.num_nonempty_lists,
                wait_for=[count_event]
                + info_record.compressed_indices.events)

        info_record.starts = compressed_counts

    # {{{ run scans

    scan_events = []

    for name, dtype in self.list_names_and_dtypes:
        if name in self.count_sharing:
            continue
        if name in omit_lists:
            continue

        info_record = result[name]
        if name in self.eliminate_empty_output_lists:
            compress_events[name].wait()
            num_nonempty_lists = info_record.num_nonempty_lists.get()[0]
            info_record.num_nonempty_lists = num_nonempty_lists
            info_record.starts = info_record.starts[:num_nonempty_lists + 1]
            info_record.nonempty_indices = \
                    info_record.nonempty_indices[:num_nonempty_lists]
            info_record.starts[-1] = 0

        starts_ary = info_record.starts
        if name in self.eliminate_empty_output_lists:
            evt = scan_kernel(
                    starts_ary,
                    size=info_record.num_nonempty_lists,
                    wait_for=starts_ary.events)
        else:
            evt = scan_kernel(starts_ary,
                    wait_for=[count_event],
                    size=n_objects)

        starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
        scan_events.extend(starts_ary.events)

        # retrieve count
        info_record.count = int(starts_ary[-1].get())

    # }}}

    # {{{ deal with count-sharing lists, allocate memory for lists

    write_list_args = []
    for name, dtype in self.list_names_and_dtypes:
        if name in omit_lists:
            write_list_args.append(None)
            if name not in self.count_sharing:
                write_list_args.append(None)
            if name in self.eliminate_empty_output_lists:
                write_list_args.append(None)
            continue

        if name in self.count_sharing:
            sharing_from = self.count_sharing[name]

            info_record = result[name] = BuiltList(
                    count=result[sharing_from].count,
                    starts=result[sharing_from].starts,
                    )

        else:
            info_record = result[name]

        info_record.lists = cl.array.empty(queue,
                info_record.count, dtype, allocator=allocator)
        write_list_args.append(info_record.lists.data)

        if name not in self.count_sharing:
            write_list_args.append(info_record.starts.data)

        if name in self.eliminate_empty_output_lists:
            write_list_args.append(info_record.compressed_indices.data)

    # }}}

    evt = write_kernel(queue, gsize, lsize,
            *(tuple(write_list_args) + data_args + (n_objects,)),
            **dict(wait_for=scan_events))

    return result, evt
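# Hedged sketch of reading back a list built with the
# eliminate_empty_output_lists option, following the docstring above
# ("mylist" is an illustrative name):
info = result["mylist"]
starts = info.starts.get()              # num_nonempty_lists + 1 entries
nonempty = info.nonempty_indices.get()  # generating object index per list
data = info.lists.get()
for k in range(info.num_nonempty_lists):
    # the k-th nonempty list was generated by object nonempty[k]
    entries = data[starts[k]:starts[k+1]]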
def aggregate_assignments(self, instructions, result):
    from pymbolic.primitives import Variable

    # {{{ aggregation helpers

    def get_complete_origins_set(insn, skip_levels=0):
        if skip_levels < 0:
            skip_levels = 0

        result = set()
        for dep in insn.get_dependencies():
            if isinstance(dep, Variable):
                dep_origin = origins_map.get(dep.name, None)
                if dep_origin is not None:
                    if skip_levels <= 0:
                        result.add(dep_origin)
                    result |= get_complete_origins_set(
                            dep_origin, skip_levels - 1)

        return result

    var_assignees_cache = {}

    def get_var_assignees(insn):
        try:
            return var_assignees_cache[insn]
        except KeyError:
            result = set(Variable(assignee)
                    for assignee in insn.get_assignees())
            var_assignees_cache[insn] = result
            return result

    def aggregate_two_assignments(ass_1, ass_2):
        names = ass_1.names + ass_2.names

        from pymbolic.primitives import Variable
        deps = (ass_1.get_dependencies() | ass_2.get_dependencies()) \
                - set(Variable(name) for name in names)

        return Assign(names=names, exprs=ass_1.exprs + ass_2.exprs,
                _dependencies=deps,
                dep_mapper_factory=self.dep_mapper_factory,
                priority=max(ass_1.priority, ass_2.priority))

    # }}}

    # {{{ main aggregation pass

    origins_map = dict(
            (assignee, insn)
            for insn in instructions
            for assignee in insn.get_assignees())

    from pytools import partition
    unprocessed_assigns, other_insns = partition(
            lambda insn: isinstance(insn, Assign)
                and not insn.is_scalar_valued,
            instructions)

    # filter out zero-flop-count assigns--no need to bother with those
    processed_assigns, unprocessed_assigns = partition(
            lambda ass: ass.flop_count() == 0,
            unprocessed_assigns)

    # filter out zero assignments
    from pytools import any
    from hedge.tools import is_zero

    i = 0

    while i < len(unprocessed_assigns):
        my_assign = unprocessed_assigns[i]
        if any(is_zero(expr) for expr in my_assign.exprs):
            # move the zero assignment itself to the processed list
            processed_assigns.append(unprocessed_assigns.pop(i))
        else:
            i += 1

    # greedy aggregation
    while unprocessed_assigns:
        my_assign = unprocessed_assigns.pop()

        my_deps = my_assign.get_dependencies()
        my_assignees = get_var_assignees(my_assign)

        agg_candidates = []
        for i, other_assign in enumerate(unprocessed_assigns):
            other_deps = other_assign.get_dependencies()
            other_assignees = get_var_assignees(other_assign)

            if ((my_deps & other_deps
                    or my_deps & other_assignees
                    or other_deps & my_assignees)
                    and my_assign.priority == other_assign.priority):
                agg_candidates.append((i, other_assign))

        did_work = False

        if agg_candidates:
            my_indirect_origins = get_complete_origins_set(
                    my_assign, skip_levels=1)

            for other_assign_index, other_assign in agg_candidates:
                if self.max_vectors_in_batch_expr is not None:
                    new_assignee_count = len(
                            set(my_assign.get_assignees())
                            | set(other_assign.get_assignees()))
                    new_dep_count = len(
                            my_assign.get_dependencies(each_vector=True)
                            | other_assign.get_dependencies(each_vector=True))

                    if (new_assignee_count + new_dep_count
                            > self.max_vectors_in_batch_expr):
                        continue

                other_indirect_origins = get_complete_origins_set(
                        other_assign, skip_levels=1)

                if (my_assign not in other_indirect_origins
                        and other_assign not in my_indirect_origins):
                    did_work = True

                    # aggregate the two assignments
                    new_assignment = aggregate_two_assignments(
                            my_assign, other_assign)
                    del unprocessed_assigns[other_assign_index]
                    unprocessed_assigns.append(new_assignment)
                    for assignee in new_assignment.get_assignees():
                        origins_map[assignee] = new_assignment

                    break

        if not did_work:
            processed_assigns.append(my_assign)

    externally_used_names = set(
            expr
            for insn in processed_assigns + other_insns
            for expr in insn.get_dependencies())

    from hedge.tools import is_obj_array
    if is_obj_array(result):
        externally_used_names |= set(expr for expr in result)
    else:
        externally_used_names |= set([result])

    def schedule_and_finalize_assignment(ass):
        dep_mapper = self.dep_mapper_factory()

        names_exprs = zip(ass.names, ass.exprs)

        my_assignees = set(name for name, expr in names_exprs)
        names_exprs_deps = [
                (name, expr,
                    set(dep.name for dep in dep_mapper(expr)
                        if isinstance(dep, Variable)) & my_assignees)
                for name, expr in names_exprs]

        ordered_names_exprs = []
        available_names = set()

        while names_exprs_deps:
            schedulable = []

            i = 0
            while i < len(names_exprs_deps):
                name, expr, deps = names_exprs_deps[i]

                unsatisfied_deps = deps - available_names

                if not unsatisfied_deps:
                    schedulable.append((str(expr), name, expr))
                    del names_exprs_deps[i]
                else:
                    i += 1

            # make sure these come out in a constant order
            schedulable.sort()

            if schedulable:
                for key, name, expr in schedulable:
                    ordered_names_exprs.append((name, expr))
                    available_names.add(name)
            else:
                raise RuntimeError("aggregation resulted in an "
                        "impossible assignment")

        return self.finalize_multi_assign(
                names=[name for name, expr in ordered_names_exprs],
                exprs=[expr for name, expr in ordered_names_exprs],
                do_not_return=[Variable(name) not in externally_used_names
                    for name, expr in ordered_names_exprs],
                priority=ass.priority)

    # }}}

    return [schedule_and_finalize_assignment(ass)
            for ass in processed_assigns] + other_insns