def combine(dtypes):
    # dtypes may just be a generator expr
    dtypes = list(dtypes)

    from loopy.types import LoopyType, NumpyType
    assert all(isinstance(dtype, LoopyType) for dtype in dtypes)

    if not all(isinstance(dtype, NumpyType) for dtype in dtypes):
        from pytools import is_single_valued, single_valued
        if not is_single_valued(dtypes):
            raise TypeInferenceFailure(
                    "Nothing known about operations between '%s'"
                    % ", ".join(str(dt) for dt in dtypes))

        return single_valued(dtypes)

    dtypes = [dtype.dtype for dtype in dtypes]

    result = dtypes.pop()
    while dtypes:
        other = dtypes.pop()

        if result.fields is None and other.fields is None:
            if (result, other) in [
                    (np.int32, np.float32), (np.float32, np.int32)]:
                # numpy makes this a double. I disagree.
                result = np.dtype(np.float32)
            else:
                result = (
                        np.empty(0, dtype=result)
                        + np.empty(0, dtype=other)
                        ).dtype

        elif result.fields is None and other.fields is not None:
            # assume the non-native type takes over
            # (This is used for vector types.)
            result = other
        elif result.fields is not None and other.fields is None:
            # assume the non-native type takes over
            # (This is used for vector types.)
            pass
        else:
            if result is not other:
                raise TypeInferenceFailure(
                        "nothing known about result of operation on "
                        "'%s' and '%s'" % (result, other))

    return NumpyType(result)
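# A minimal, self-contained sketch (not part of the function above) of the
# zero-size-array trick that `combine` uses: adding two empty arrays asks
# numpy for the result dtype without allocating real data. It also shows the
# int32/float32 case that `combine` deliberately overrides: numpy promotes
# that pair to float64, while the code above keeps float32.
import numpy as np

def promoted_dtype(a, b):
    # numpy's own promotion rule, probed via zero-size arrays
    return (np.empty(0, dtype=a) + np.empty(0, dtype=b)).dtype

assert promoted_dtype(np.int32, np.int64) == np.dtype(np.int64)
# numpy widens int32 + float32 to float64 -- the case `combine` special-cases
assert promoted_dtype(np.int32, np.float32) == np.dtype(np.float64)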
def __init__(self, basis=None, metric_matrix=None):
    """
    :arg basis: A sequence of names of basis vectors, or an integer (the
        number of dimensions) to use the default names ``e0`` through ``eN``.
    :arg metric_matrix: See :attr:`metric_matrix`. If *None*, the Euclidean
        metric is assumed.
    """
    if basis is None and metric_matrix is None:
        raise TypeError("at least one of 'basis' and 'metric_matrix' "
                "must be passed")

    if basis is None:
        basis = int(metric_matrix.shape[0])

    from numbers import Integral
    if isinstance(basis, Integral):
        basis = ["e%d" % i for i in range(basis)]

    if metric_matrix is None:
        # object dtype, so that symbolic metric entries are permitted
        # (np.object was removed from numpy; the builtin works everywhere)
        metric_matrix = np.eye(len(basis), dtype=object)

    from pytools import all
    if not (
            len(metric_matrix.shape) == 2
            and all(dim == len(basis) for dim in metric_matrix.shape)):
        raise ValueError("metric_matrix has the wrong shape")

    self.basis_names = basis
    self.metric_matrix = metric_matrix
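# Hedged sketch of the argument normalization above, pulled out as a
# standalone function so it can be exercised without the surrounding class.
# The name `normalize_space_args` is made up for illustration.
import numpy as np

def normalize_space_args(basis=None, metric_matrix=None):
    if basis is None and metric_matrix is None:
        raise TypeError("at least one of 'basis' and 'metric_matrix' "
                "must be passed")
    if basis is None:
        basis = int(metric_matrix.shape[0])
    if isinstance(basis, int):
        basis = ["e%d" % i for i in range(basis)]
    if metric_matrix is None:
        metric_matrix = np.eye(len(basis), dtype=object)
    return basis, metric_matrix

names, metric = normalize_space_args(3)
assert names == ["e0", "e1", "e2"]      # default names e0..eN
assert metric.shape == (3, 3)           # Euclidean metric by default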
def get_next_step(self, available_names, done_insns):
    from pytools import all, argmax2
    available_insns = [
            (insn, insn.priority) for insn in self.instructions
            if insn not in done_insns
            and all(dep.name in available_names
                for dep in insn.get_dependencies())]

    if not available_insns:
        raise self.NoInstructionAvailable

    from pytools import flatten
    discardable_vars = set(available_names) - set(flatten(
        [dep.name for dep in insn.get_dependencies()]
        for insn in self.instructions
        if insn not in done_insns))

    # {{{ make sure results do not get discarded

    dm = mappers.DependencyMapper(composite_leaves=False)

    def remove_result_variable(result_expr):
        # The extra dependency mapper run is necessary
        # because, for instance, subscripts can make it
        # into the result expression, which then does
        # not consist of just variables.

        for var in dm(result_expr):
            assert isinstance(var, Variable)
            discardable_vars.discard(var.name)

    obj_array_vectorize(remove_result_variable, self.result)

    # }}}

    return argmax2(available_insns), discardable_vars
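# Toy sketch of the greedy scheduling policy above, with stand-in instruction
# objects: among instructions whose dependencies are all available, pick the
# one with the highest priority (which is what pytools.argmax2 does for
# (item, weight) pairs).
from collections import namedtuple

ToyInsn = namedtuple("ToyInsn", "name priority deps")

def next_step(instructions, available_names, done):
    ready = [(insn, insn.priority)
            for insn in instructions
            if insn not in done
            and all(dep in available_names for dep in insn.deps)]
    if not ready:
        raise RuntimeError("no instruction available")
    # argmax2 equivalent: the item carrying the largest weight
    return max(ready, key=lambda pair: pair[1])[0]

insns = [ToyInsn("a", 0, deps=()), ToyInsn("b", 10, deps=("x",))]
assert next_step(insns, {"x"}, set()).name == "b"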
def _get_reduction_source(
        ctx, out_type, out_type_size,
        neutral, reduce_expr, map_expr, parsed_args,
        name="reduce_kernel", preamble="", arg_prep="",
        device=None, max_group_size=None):

    if device is not None:
        devices = [device]
    else:
        devices = ctx.devices

    # {{{ compute group size

    def get_dev_group_size(device):
        # dirty fix for the RV770 boards
        max_work_group_size = device.max_work_group_size
        if "RV770" in device.name:
            max_work_group_size = 64

        # compute lmem limit
        from pytools import div_ceil
        lmem_wg_size = div_ceil(max_work_group_size, out_type_size)
        result = min(max_work_group_size, lmem_wg_size)

        # round down to power of 2
        from pyopencl.tools import bitlog2
        return 2**bitlog2(result)

    group_size = min(get_dev_group_size(dev) for dev in devices)

    if max_group_size is not None:
        group_size = min(max_group_size, group_size)

    # }}}

    from mako.template import Template
    from pytools import all
    from pyopencl.characterize import has_double_support
    src = str(Template(KERNEL).render(
        out_type=out_type,
        arguments=", ".join(arg.declarator() for arg in parsed_args),
        group_size=group_size,
        neutral=neutral,
        reduce_expr=_process_code_for_macro(reduce_expr),
        map_expr=_process_code_for_macro(map_expr),
        name=name,
        preamble=preamble,
        arg_prep=arg_prep,
        double_support=all(has_double_support(dev) for dev in devices),
        ))

    from pytools import Record

    class ReductionInfo(Record):
        pass

    return ReductionInfo(
            context=ctx,
            source=src,
            group_size=group_size)
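# Self-contained sketch of the group-size computation above. div_ceil and
# bitlog2 mirror the pytools/pyopencl.tools helpers of the same names
# (assumed semantics: ceiling division, and floor(log2(n)) for n >= 1).
def div_ceil(a, b):
    return -(-a // b)

def bitlog2(n):
    return n.bit_length() - 1

def reduction_group_size(max_work_group_size, out_type_size):
    lmem_wg_size = div_ceil(max_work_group_size, out_type_size)
    result = min(max_work_group_size, lmem_wg_size)
    # round down to a power of 2, as the code above does
    return 2**bitlog2(result)

assert reduction_group_size(1024, 8) == 128
assert reduction_group_size(100, 1) == 64   # 100 rounds down to 64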
def get_expr_dataset(self, expression, description=None, unit=None):
    """Prepare a time-series dataset for a given expression.

    @arg expression: A C{pymbolic} expression that may involve
        the time-series variables and the constants in this
        :class:`LogManager`. If there is data from multiple ranks for
        a quantity occurring in this expression, an aggregator may have
        to be specified.
    @return: C{(description, unit, table)}, where C{table}
        is a list of tuples C{(tick_nbr, value)}.

    Aggregators are specified as follows:
    - C{qty.min}, C{qty.max}, C{qty.avg}, C{qty.sum}, C{qty.norm2}
    - C{qty[rank_nbr]}
    - C{qty.loc}
    """

    parsed = self._parse_expr(expression)
    parsed, dep_data = self._get_expr_dep_data(parsed)

    # aggregate table data
    for dd in dep_data:
        table = self.get_table(dd.name)
        table.sort(["step"])
        dd.table = table.aggregated(["step"], "value", dd.agg_func).data

    # evaluate unit and description, if necessary
    if unit is None:
        from pymbolic import substitute, parse

        unit_dict = dict((dd.varname, dd.qdat.unit) for dd in dep_data)
        from pytools import all
        if all(v is not None for v in six.itervalues(unit_dict)):
            unit_dict = dict(
                    (k, parse(v)) for k, v in six.iteritems(unit_dict))
            unit = substitute(parsed, unit_dict)
        else:
            unit = None

    if description is None:
        description = expression

    # compile and evaluate
    from pymbolic import compile
    compiled = compile(parsed, [dd.varname for dd in dep_data])

    data = []

    for key, values in _join_by_first_of_tuple(dd.table for dd in dep_data):
        try:
            data.append((key, compiled(*values)))
        except ZeroDivisionError:
            pass

    return (description, unit, data)
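# Hedged sketch of what _join_by_first_of_tuple is assumed to do above: given
# several (key, value) tables, yield (key, [values...]) only for keys present
# in every table -- an inner join on the first tuple entry.
def join_by_first_of_tuple(tables):
    tables = [dict(t) for t in tables]
    common_keys = set.intersection(*(set(t) for t in tables))
    for key in sorted(common_keys):
        yield key, [t[key] for t in tables]

t1 = [(0, 1.0), (1, 2.0), (2, 3.0)]
t2 = [(1, 10.0), (2, 20.0)]
assert list(join_by_first_of_tuple([t1, t2])) == [
        (1, [2.0, 10.0]), (2, [3.0, 20.0])]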
def map_sum(self, expr):
    dtypes = []
    small_integer_dtypes = []
    for child in expr.children:
        dtype = self.rec(child)
        if is_integer(child) and abs(child) < 1024:
            small_integer_dtypes.append(dtype)
        else:
            dtypes.append(dtype)

    from pytools import all
    if all(dtype.kind == "i" for dtype in dtypes):
        dtypes.extend(small_integer_dtypes)

    return self.combine(dtypes)
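# Sketch (my reading, not stated in the source) of why map_sum holds back
# small integer constants: when everything else is an integer, folding them
# into the promotion is harmless, but in a float expression a stray int32
# constant would make numpy widen the result.
import numpy as np

# all-integer sum: the small-int constant participates in promotion
assert np.result_type(np.int64, np.int32) == np.dtype(np.int64)

# mixed sum: including the int32 would widen float32 to float64 ...
assert np.result_type(np.float32, np.int32) == np.dtype(np.float64)
# ... so dropping it keeps the float32 result
assert np.result_type(np.float32) == np.dtype(np.float32)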
def get_contained_fluxes(self, expr):
    from hedge.optemplate.mappers import FluxCollector
    contained_flux_ops = FluxCollector()(expr)

    from hedge.optemplate.operators import WholeDomainFluxOperator
    from pytools import all
    assert all(isinstance(op, WholeDomainFluxOperator)
            for op in contained_flux_ops), \
                    "not all flux operators were of the expected type"

    return [self.FluxRecord(
        flux_expr=wdflux,
        dependencies=set(wdflux.interior_deps) | set(wdflux.boundary_deps),
        repr_op=wdflux.repr_op())
        for wdflux in contained_flux_ops]
def get_next_step(self, available_names, done_insns):
    from pytools import all, argmax2
    available_insns = [
            (insn, insn.priority) for insn in self.instructions
            if insn not in done_insns
            and all(dep.name in available_names
                for dep in insn.get_dependencies())]

    if not available_insns:
        raise self.NoInstructionAvailable

    needed_vars = set([
        dep.name
        for insn in self.instructions
        if insn not in done_insns
        for dep in insn.get_dependencies()
        ])
    discardable_vars = set(available_names) - needed_vars

    # {{{ make sure results do not get discarded

    from pytools.obj_array import with_object_array_or_scalar
    from pytential.symbolic.mappers import DependencyMapper
    dm = DependencyMapper(composite_leaves=False)

    def remove_result_variable(result_expr):
        # The extra dependency mapper run is necessary
        # because, for instance, subscripts can make it
        # into the result expression, which then does
        # not consist of just variables.

        for var in dm(result_expr):
            from pymbolic.primitives import Variable
            assert isinstance(var, Variable)
            discardable_vars.discard(var.name)

    with_object_array_or_scalar(remove_result_variable, self.result)

    # }}}

    return argmax2(available_insns), discardable_vars
def __init__(self, ctx, dtype,
        scan_expr, neutral=None,
        name_prefix="scan", options=[], preamble="", devices=None):

    if isinstance(self, ExclusiveScanKernel) and neutral is None:
        raise ValueError("neutral element is required for exclusive scan")

    self.context = ctx
    dtype = self.dtype = np.dtype(dtype)
    self.neutral = neutral

    if devices is None:
        devices = ctx.devices
    self.devices = devices

    max_wg_size = min(dev.max_work_group_size for dev in self.devices)

    # Thrust says these are good for GT200
    self.scan_wg_size = min(max_wg_size, 128)
    self.update_wg_size = min(max_wg_size, 256)

    if self.scan_wg_size < 16:
        # Hello, Apple CPU. Nice to see you.
        self.scan_wg_seq_batches = 128  # FIXME: guesswork
    else:
        self.scan_wg_seq_batches = 6

    from pytools import all
    from pyopencl.characterize import has_double_support

    kw_values = dict(
            preamble=preamble,
            name_prefix=name_prefix,
            scan_type=dtype_to_ctype(dtype),
            scan_expr=scan_expr,
            neutral=neutral,
            double_support=all(has_double_support(dev) for dev in devices))

    scan_intervals_src = str(SCAN_INTERVALS_SOURCE.render(
        wg_size=self.scan_wg_size,
        wg_seq_batches=self.scan_wg_seq_batches,
        **kw_values))
    scan_intervals_prg = cl.Program(ctx, scan_intervals_src).build(options)
    self.scan_intervals_knl = getattr(
            scan_intervals_prg,
            name_prefix+"_scan_intervals")
    self.scan_intervals_knl.set_scalar_arg_dtypes(
            (None, np.uint32, np.uint32, None, None))

    final_update_src = str(self.final_update_tp.render(
        wg_size=self.update_wg_size,
        **kw_values))
    final_update_prg = cl.Program(self.context, final_update_src).build(options)
    self.final_update_knl = getattr(
            final_update_prg,
            name_prefix+"_final_update")
    self.final_update_knl.set_scalar_arg_dtypes(
            (None, np.uint32, np.uint32, None))
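# Standalone sketch of the work-group sizing heuristics above (the 128/256
# numbers come from the Thrust-derived comment; the <16 branch handles
# devices that report tiny maximum group sizes, such as CPU drivers).
def scan_launch_params(max_wg_size):
    scan_wg_size = min(max_wg_size, 128)
    update_wg_size = min(max_wg_size, 256)
    if scan_wg_size < 16:
        scan_wg_seq_batches = 128   # guesswork, per the FIXME above
    else:
        scan_wg_seq_batches = 6
    return scan_wg_size, update_wg_size, scan_wg_seq_batches

assert scan_launch_params(1024) == (128, 256, 6)
assert scan_launch_params(1) == (1, 1, 128)   # CPU-like device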
def find_temporary_scope(kernel):
    logger.debug("%s: find temporary scope" % kernel.name)

    new_temp_vars = {}
    from loopy.kernel.data import (LocalIndexTagBase, GroupIndexTag,
            temp_var_scope)
    import loopy as lp

    writers = kernel.writer_map()

    for temp_var in six.itervalues(kernel.temporary_variables):
        # Only fill out for variables that do not yet know if they're
        # local. (I.e. those generated by implicit temporary generation.)

        if temp_var.scope is not lp.auto:
            new_temp_vars[temp_var.name] = temp_var
            continue

        my_writers = writers.get(temp_var.name, [])

        desired_scope_per_insn = []
        for insn_id in my_writers:
            insn = kernel.id_to_insn[insn_id]

            # A write race will emerge if:
            #
            # - the variable is local
            #   and
            # - the instruction is run across more inames (locally) parallel
            #   than are reflected in the assignee indices.

            locparallel_compute_inames = _get_compute_inames_tagged(
                    kernel, insn, LocalIndexTagBase)

            locparallel_assignee_inames = _get_assignee_inames_tagged(
                    kernel, insn, LocalIndexTagBase, temp_var.name)

            grpparallel_compute_inames = _get_compute_inames_tagged(
                    kernel, insn, GroupIndexTag)

            grpparallel_assignee_inames = _get_assignee_inames_tagged(
                    kernel, insn, GroupIndexTag, temp_var.name)

            assert locparallel_assignee_inames <= locparallel_compute_inames
            assert grpparallel_assignee_inames <= grpparallel_compute_inames

            desired_scope = temp_var_scope.PRIVATE
            for iname_descr, scope_descr, apin, cpin, scope in [
                    ("local", "local", locparallel_assignee_inames,
                        locparallel_compute_inames, temp_var_scope.LOCAL),
                    ("group", "global", grpparallel_assignee_inames,
                        grpparallel_compute_inames, temp_var_scope.GLOBAL),
                    ]:

                # (was: bool(locparallel_assignee_inames) -- a leftover from
                # before this loop also handled the group-iname case)
                if apin != cpin and bool(apin):
                    warn_with_kernel(kernel, "write_race_local(%s)" % insn_id,
                            "instruction '%s' looks invalid: "
                            "it assigns to indices based on %s IDs, but "
                            "its temporary '%s' cannot be made %s because "
                            "a write race across the iname(s) '%s' would "
                            "emerge. (Do you need to add an extra iname to "
                            "your prefetch?)"
                            % (insn_id, iname_descr, temp_var.name,
                                scope_descr, ", ".join(cpin - apin)),
                            WriteRaceConditionWarning)

                if (apin == cpin
                        # doesn't want to be in this scope if there aren't any
                        # parallel inames of that kind:
                        and bool(cpin)):
                    desired_scope = max(desired_scope, scope)
                    break

            desired_scope_per_insn.append(desired_scope)

        if not desired_scope_per_insn:
            if temp_var.initializer is None:
                warn_with_kernel(kernel, "temp_to_write(%s)" % temp_var.name,
                        "temporary variable '%s' never written, eliminating"
                        % temp_var.name, LoopyAdvisory)
            else:
                raise LoopyError("temporary variable '%s': never written, "
                        "cannot automatically determine scope"
                        % temp_var.name)

            continue

        overall_scope = max(desired_scope_per_insn)

        from pytools import all
        if not all(iscope == overall_scope
                for iscope in desired_scope_per_insn):
            raise LoopyError("not all instructions agree on the "
                    "desired scope (private/local/global) of the "
                    "temporary '%s'" % temp_var.name)

        new_temp_vars[temp_var.name] = temp_var.copy(scope=overall_scope)

    return kernel.copy(temporary_variables=new_temp_vars)
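# Toy sketch of the scope-resolution rule above: each writing instruction
# proposes a scope, scopes are ordered PRIVATE < LOCAL < GLOBAL, and the
# temporary gets the maximum -- but only if all proposals agree.
PRIVATE, LOCAL, GLOBAL = 0, 1, 2   # stand-ins for temp_var_scope.*

def resolve_scope(desired_scope_per_insn):
    overall = max(desired_scope_per_insn)
    if not all(scope == overall for scope in desired_scope_per_insn):
        raise ValueError("instructions disagree on the desired scope")
    return overall

assert resolve_scope([LOCAL, LOCAL]) == LOCAL
try:
    resolve_scope([PRIVATE, LOCAL])   # disagreement -> error, as above
except ValueError:
    pass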
def map_logical_and(self, expr):
    from pytools import all
    return all(self.rec(ch) for ch in expr.children)
def get_reduction_source(
        ctx, out_type, out_type_size,
        neutral, reduce_expr, map_expr, arguments,
        name="reduce_kernel", preamble="",
        device=None, max_group_size=None):

    if device is not None:
        devices = [device]
    else:
        devices = ctx.devices

    # {{{ compute group size

    def get_dev_group_size(device):
        return min(
                device.max_work_group_size,
                (device.local_mem_size + out_type_size - 1)
                // out_type_size)

    group_size = min(get_dev_group_size(dev) for dev in devices)

    if max_group_size is not None:
        group_size = min(max_group_size, group_size)

    # }}}

    # {{{ compute synchronization-less group size

    def get_dev_no_sync_size(device):
        try:
            return device.warp_size_nv
        except Exception:  # was a bare except; narrowed for safety
            if "nvidia" in device.vendor.lower():
                from warnings import warn
                warn("Reduction might be unnecessarily slow: "
                        "can't query warp size on Nvidia device")

            return 1

    no_sync_size = min(get_dev_no_sync_size(dev) for dev in devices)

    # }}}

    from mako.template import Template
    from pytools import all
    from pyopencl.characterize import has_double_support
    src = str(Template(KERNEL).render(
        out_type=out_type,
        arguments=arguments,
        group_size=group_size,
        no_sync_size=no_sync_size,
        neutral=neutral,
        reduce_expr=reduce_expr,
        map_expr=map_expr,
        name=name,
        preamble=preamble,
        double_support=all(
            has_double_support(dev) for dev in devices)
        ))

    from pytools import Record

    class ReductionInfo(Record):
        pass

    return ReductionInfo(
            context=ctx,
            source=src,
            group_size=group_size)
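# Sketch of the two sizing rules above: the group size is capped by how many
# partial results of size out_type_size fit in local memory (a ceiling
# division), and no_sync_size falls back to 1 whenever the warp size cannot
# be queried -- 1 is always safe, since barriers are then never skipped.
def reduction_group_size(max_work_group_size, local_mem_size, out_type_size):
    return min(max_work_group_size,
            (local_mem_size + out_type_size - 1) // out_type_size)

def no_sync_size(warp_size_or_none):
    return warp_size_or_none if warp_size_or_none is not None else 1

assert reduction_group_size(1024, 32768, 8) == 1024
assert reduction_group_size(1024, 2048, 8) == 256
assert no_sync_size(None) == 1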
def make_ref_args(kernel, impl_arg_info, queue, parameters):
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, TemporaryVariable

    from pymbolic import evaluate

    ref_args = {}
    ref_arg_data = []

    for arg in impl_arg_info:
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            if arg.offset_for_name:
                continue

            arg_value = parameters[arg.name]

            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            ref_args[arg.name] = arg_value

            ref_arg_data.append(None)

        elif arg.arg_class is GlobalArg or arg.arg_class is ImageArg:
            if arg.shape is None or any(saxis is None for saxis in arg.shape):
                raise LoopyError("array '%s' needs known shape to use automatic "
                        "testing" % arg.name)

            shape = evaluate_shape(arg.unvec_shape, parameters)
            dtype = kernel_arg.dtype

            is_output = arg.base_name in kernel.get_written_variables()

            if arg.arg_class is ImageArg:
                storage_array = ary = cl_array.empty(
                        queue, shape, dtype, order="C")
                numpy_strides = None
                alloc_size = None
                strides = None
            else:
                strides = evaluate(arg.unvec_strides, parameters)

                from pytools import all
                assert all(s > 0 for s in strides)
                alloc_size = sum(astrd*(alen-1)
                        for alen, astrd in zip(shape, strides)) + 1

                if dtype is None:
                    raise LoopyError("dtype for argument '%s' is not yet "
                            "known. Perhaps you want to use "
                            "loopy.add_dtypes "
                            "or loopy.infer_argument_dtypes?"
                            % arg.name)

                itemsize = dtype.itemsize
                numpy_strides = [itemsize*s for s in strides]

                storage_array = cl_array.empty(queue, alloc_size, dtype)

            if is_output and arg.arg_class is ImageArg:
                raise LoopyError("write-mode images not supported in "
                        "automatic testing")

            fill_rand(storage_array)

            if arg.arg_class is ImageArg:
                # must be contiguous
                pre_run_ary = pre_run_storage_array = storage_array.copy()

                ref_args[arg.name] = cl.image_from_array(
                        queue.context, ary.get())
            else:
                pre_run_storage_array = storage_array.copy()

                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
                pre_run_ary = cl_array.as_strided(
                        pre_run_storage_array, shape, numpy_strides)

                ref_args[arg.name] = ary

            ref_arg_data.append(
                    TestArgInfo(
                        name=arg.name,
                        ref_array=ary,
                        ref_storage_array=storage_array,
                        ref_pre_run_array=pre_run_ary,
                        ref_pre_run_storage_array=pre_run_storage_array,
                        ref_shape=shape,
                        ref_strides=strides,
                        ref_alloc_size=alloc_size,
                        ref_numpy_strides=numpy_strides,
                        needs_checking=is_output))

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type not understood")

    return ref_args, ref_arg_data
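# Self-contained sketch of the allocation-size rule above: for positive
# strides, the last touched element sits at offset sum(stride * (extent - 1)),
# so sum + 1 elements suffice. Demonstrated with numpy's as_strided.
import numpy as np
from numpy.lib.stride_tricks import as_strided

shape = (3, 4)
strides = (1, 3)            # in elements; a column-major-like layout
alloc_size = sum(astrd*(alen-1) for alen, astrd in zip(shape, strides)) + 1
assert alloc_size == 12     # 1*2 + 3*3 + 1

storage = np.zeros(alloc_size, dtype=np.float32)
itemsize = storage.dtype.itemsize
view = as_strided(storage, shape, [itemsize*s for s in strides])
view[...] = 1.0             # touches exactly the allocated elements
assert storage.sum() == 12  # every storage element is hit for this layout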
def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, TemporaryVariable

    from pymbolic import evaluate

    args = {}
    for arg, arg_desc in zip(impl_arg_info, ref_arg_data):
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            arg_value = parameters[arg.name]

            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            args[arg.name] = arg_value

        elif arg.arg_class is ImageArg:
            if arg.name in kernel.get_written_variables():
                raise NotImplementedError("write-mode images not supported in "
                        "automatic testing")

            shape = evaluate_shape(arg.unvec_shape, parameters)
            assert shape == arg_desc.ref_shape

            # must be contiguous
            args[arg.name] = cl.image_from_array(
                    queue.context, arg_desc.ref_pre_run_array.get())

        elif arg.arg_class is GlobalArg:
            shape = evaluate(arg.unvec_shape, parameters)
            strides = evaluate(arg.unvec_strides, parameters)

            dtype = kernel_arg.dtype
            itemsize = dtype.itemsize
            numpy_strides = [itemsize*s for s in strides]

            assert all(s > 0 for s in strides)
            alloc_size = sum(astrd*(alen-1)
                    for alen, astrd in zip(shape, strides)) + 1

            # use contiguous array to transfer to host
            host_ref_contig_array = arg_desc.ref_pre_run_storage_array.get()

            # use device shape/strides
            from pyopencl.compyte.array import as_strided
            host_ref_array = as_strided(host_ref_contig_array,
                    arg_desc.ref_shape, arg_desc.ref_numpy_strides)

            # flatten the thing
            host_ref_flat_array = host_ref_array.flatten()

            # create host array with test shape (but not strides)
            host_contig_array = np.empty(shape, dtype=dtype)

            common_len = min(
                    len(host_ref_flat_array),
                    len(host_contig_array.ravel()))
            host_contig_array.ravel()[:common_len] = \
                    host_ref_flat_array[:common_len]

            # create host array with test shape and storage layout
            host_storage_array = np.empty(alloc_size, dtype)
            host_array = as_strided(
                    host_storage_array, shape, numpy_strides)
            host_array[...] = host_contig_array

            host_contig_array = arg_desc.ref_storage_array.get()

            storage_array = cl_array.to_device(queue, host_storage_array)
            ary = cl_array.as_strided(storage_array, shape, numpy_strides)

            args[arg.name] = ary

            arg_desc.test_storage_array = storage_array
            arg_desc.test_array = ary
            arg_desc.test_shape = shape
            arg_desc.test_strides = strides
            arg_desc.test_numpy_strides = numpy_strides
            arg_desc.test_alloc_size = alloc_size

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type not understood")

    return args
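# Sketch of the layout-migration dance above, numpy-only: data is read from
# the reference layout, flattened in index order, and written into a buffer
# with the test layout's shape and strides.
import numpy as np
from numpy.lib.stride_tricks import as_strided

ref = np.arange(6, dtype=np.float32).reshape(2, 3)          # C order
flat = ref.flatten()                                        # index order

shape, strides = (2, 3), (1, 2)                             # in elements
alloc_size = sum(s*(n-1) for n, s in zip(shape, strides)) + 1
storage = np.empty(alloc_size, dtype=np.float32)
itemsize = storage.dtype.itemsize
test = as_strided(storage, shape, [itemsize*s for s in strides])

contig = np.empty(shape, dtype=np.float32)
contig.ravel()[:flat.size] = flat    # shape-compatible, stride-agnostic copy
test[...] = contig                   # now rewrite into the strided layout

assert (test == ref).all()           # same logical contents, new layout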
def mark_local_temporaries(kernel):
    logger.debug("%s: mark local temporaries" % kernel.name)

    new_temp_vars = {}
    from loopy.kernel.data import LocalIndexTagBase
    import loopy as lp

    writers = kernel.writer_map()

    from loopy.symbolic import get_dependencies

    for temp_var in six.itervalues(kernel.temporary_variables):
        # Only fill out for variables that do not yet know if they're
        # local. (I.e. those generated by implicit temporary generation.)

        if temp_var.is_local is not lp.auto:
            new_temp_vars[temp_var.name] = temp_var
            continue

        my_writers = writers.get(temp_var.name, [])

        wants_to_be_local_per_insn = []
        for insn_id in my_writers:
            insn = kernel.id_to_insn[insn_id]

            # A write race will emerge if:
            #
            # - the variable is local
            #   and
            # - the instruction is run across more inames (locally) parallel
            #   than are reflected in the assignee indices.

            locparallel_compute_inames = set(
                    iname for iname in kernel.insn_inames(insn_id)
                    if isinstance(
                        kernel.iname_to_tag.get(iname), LocalIndexTagBase))

            locparallel_assignee_inames = set(
                    iname
                    for _, assignee_indices in insn.assignees_and_indices()
                    for iname in get_dependencies(assignee_indices)
                        & kernel.all_inames()
                    if isinstance(
                        kernel.iname_to_tag.get(iname), LocalIndexTagBase))

            assert locparallel_assignee_inames <= locparallel_compute_inames

            if (locparallel_assignee_inames != locparallel_compute_inames
                    and bool(locparallel_assignee_inames)):
                warn(kernel, "write_race_local(%s)" % insn_id,
                        "instruction '%s' looks invalid: "
                        "it assigns to indices based on local IDs, but "
                        "its temporary '%s' cannot be made local because "
                        "a write race across the iname(s) '%s' would emerge. "
                        "(Do you need to add an extra iname to your prefetch?)"
                        % (insn_id, temp_var.name, ", ".join(
                            locparallel_compute_inames
                            - locparallel_assignee_inames)),
                        WriteRaceConditionWarning)

            wants_to_be_local_per_insn.append(
                    locparallel_assignee_inames == locparallel_compute_inames

                    # doesn't want to be local if there aren't any
                    # parallel inames:
                    and bool(locparallel_compute_inames))

        if not wants_to_be_local_per_insn:
            warn(kernel, "temp_to_write(%s)" % temp_var.name,
                    "temporary variable '%s' never written, eliminating"
                    % temp_var.name, LoopyAdvisory)

            continue

        is_local = any(wants_to_be_local_per_insn)

        from pytools import all
        if not all(wtbl == is_local for wtbl in wants_to_be_local_per_insn):
            raise LoopyError("not all instructions agree on whether "
                    "temporary '%s' should be in local memory" % temp_var.name)

        new_temp_vars[temp_var.name] = temp_var.copy(is_local=is_local)

    return kernel.copy(temporary_variables=new_temp_vars)
def get_reduction_source(ctx, out_type, out_type_size,
        neutral, reduce_expr, map_expr, arguments,
        name="reduce_kernel", preamble="",
        device=None, max_group_size=None):

    if device is not None:
        devices = [device]
    else:
        devices = ctx.devices

    # {{{ compute group size

    def get_dev_group_size(device):
        # dirty fix for the RV770 boards
        max_work_group_size = device.max_work_group_size
        if "RV770" in device.name:
            max_work_group_size = 64

        return min(max_work_group_size,
                (device.local_mem_size + out_type_size - 1)
                // out_type_size)

    group_size = min(get_dev_group_size(dev) for dev in devices)

    if max_group_size is not None:
        group_size = min(max_group_size, group_size)

    # }}}

    # {{{ compute synchronization-less group size

    def get_dev_no_sync_size(device):
        from pyopencl.characterize import get_simd_group_size
        result = get_simd_group_size(device, out_type_size)

        if result is None:
            from warnings import warn
            warn("Reduction might be unnecessarily slow: "
                    "can't query SIMD group size")
            return 1

        return result

    no_sync_size = min(get_dev_no_sync_size(dev) for dev in devices)

    # }}}

    from mako.template import Template
    from pytools import all
    from pyopencl.characterize import has_double_support, has_amd_double_support
    src = str(Template(KERNEL).render(
        out_type=out_type,
        arguments=arguments,
        group_size=group_size,
        no_sync_size=no_sync_size,
        neutral=neutral,
        reduce_expr=reduce_expr,
        map_expr=map_expr,
        name=name,
        preamble=preamble,
        double_support=all(has_double_support(dev) for dev in devices),
        amd_double_support=all(
            has_amd_double_support(dev) for dev in devices)
        ))

    from pytools import Record

    class ReductionInfo(Record):
        pass

    return ReductionInfo(
            context=ctx,
            source=src,
            group_size=group_size)
def get_reduction_source(
        ctx, out_type, out_type_size,
        neutral, reduce_expr, map_expr, arguments,
        name="reduce_kernel", preamble="",
        device=None, max_group_size=None):

    if device is not None:
        devices = [device]
    else:
        devices = ctx.devices

    # {{{ compute group size

    def get_dev_group_size(device):
        # dirty fix for the RV770 boards
        max_work_group_size = device.max_work_group_size
        if "RV770" in device.name:
            max_work_group_size = 64

        # compute lmem limit
        from pytools import div_ceil
        lmem_wg_size = div_ceil(max_work_group_size, out_type_size)
        result = min(max_work_group_size, lmem_wg_size)

        # round down to power of 2
        from pyopencl.tools import bitlog2
        return 2**bitlog2(result)

    group_size = min(get_dev_group_size(dev) for dev in devices)

    if max_group_size is not None:
        group_size = min(max_group_size, group_size)

    # }}}

    # {{{ compute synchronization-less group size

    def get_dev_no_sync_size(device):
        from pyopencl.characterize import get_simd_group_size
        result = get_simd_group_size(device, out_type_size)

        if result is None:
            from warnings import warn
            warn("Reduction might be unnecessarily slow: "
                    "can't query SIMD group size")
            return 1

        return result

    no_sync_size = min(get_dev_no_sync_size(dev) for dev in devices)

    # }}}

    from mako.template import Template
    from pytools import all
    from pyopencl.characterize import has_double_support, has_amd_double_support
    src = str(Template(KERNEL).render(
        out_type=out_type,
        arguments=arguments,
        group_size=group_size,
        no_sync_size=no_sync_size,
        neutral=neutral,
        reduce_expr=reduce_expr,
        map_expr=map_expr,
        name=name,
        preamble=preamble,
        double_support=all(
            has_double_support(dev) for dev in devices),
        amd_double_support=all(
            has_amd_double_support(dev) for dev in devices)
        ))

    from pytools import Record

    class ReductionInfo(Record):
        pass

    return ReductionInfo(
            context=ctx,
            source=src,
            group_size=group_size)
def make_ref_args(kernel, impl_arg_info, queue, parameters):
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, \
            TemporaryVariable, ConstantArg

    from pymbolic import evaluate

    ref_args = {}
    ref_arg_data = []

    for arg in impl_arg_info:
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            if arg.offset_for_name:
                continue

            arg_value = parameters[arg.name]

            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            ref_args[arg.name] = arg_value

            ref_arg_data.append(None)

        elif arg.arg_class is GlobalArg or arg.arg_class is ImageArg \
                or arg.arg_class is ConstantArg:
            if arg.shape is None or any(saxis is None for saxis in arg.shape):
                raise LoopyError("array '%s' needs known shape to use automatic "
                        "testing" % arg.name)

            shape = evaluate_shape(arg.unvec_shape, parameters)
            dtype = kernel_arg.dtype

            is_output = arg.base_name in kernel.get_written_variables()

            if arg.arg_class is ImageArg:
                storage_array = ary = cl_array.empty(
                        queue, shape, dtype, order="C")
                numpy_strides = None
                alloc_size = None
                strides = None
            else:
                strides = evaluate(arg.unvec_strides, parameters)

                from pytools import all
                assert all(s > 0 for s in strides)
                alloc_size = sum(astrd*(alen-1)
                        for alen, astrd in zip(shape, strides)) + 1

                if dtype is None:
                    raise LoopyError("dtype for argument '%s' is not yet "
                            "known. Perhaps you want to use "
                            "loopy.add_dtypes "
                            "or loopy.infer_argument_dtypes?"
                            % arg.name)

                itemsize = dtype.itemsize
                numpy_strides = [itemsize*s for s in strides]

                storage_array = cl_array.empty(queue, alloc_size, dtype)

            if is_output and arg.arg_class is ImageArg:
                raise LoopyError("write-mode images not supported in "
                        "automatic testing")

            fill_rand(storage_array)

            if arg.arg_class is ImageArg:
                # must be contiguous
                pre_run_ary = pre_run_storage_array = storage_array.copy()

                ref_args[arg.name] = cl.image_from_array(
                        queue.context, ary.get())
            else:
                pre_run_storage_array = storage_array.copy()

                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
                pre_run_ary = cl_array.as_strided(
                        pre_run_storage_array, shape, numpy_strides)

                ref_args[arg.name] = ary

            ref_arg_data.append(
                    TestArgInfo(
                        name=arg.name,
                        ref_array=ary,
                        ref_storage_array=storage_array,
                        ref_pre_run_array=pre_run_ary,
                        ref_pre_run_storage_array=pre_run_storage_array,
                        ref_shape=shape,
                        ref_strides=strides,
                        ref_alloc_size=alloc_size,
                        ref_numpy_strides=numpy_strides,
                        needs_checking=is_output))

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type not understood")

    return ref_args, ref_arg_data
def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, \
            TemporaryVariable, ConstantArg

    from pymbolic import evaluate

    args = {}
    for arg, arg_desc in zip(impl_arg_info, ref_arg_data):
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            arg_value = parameters[arg.name]

            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            args[arg.name] = arg_value

        elif arg.arg_class is ImageArg:
            if arg.name in kernel.get_written_variables():
                raise NotImplementedError("write-mode images not supported in "
                        "automatic testing")

            shape = evaluate_shape(arg.unvec_shape, parameters)
            assert shape == arg_desc.ref_shape

            # must be contiguous
            args[arg.name] = cl.image_from_array(
                    queue.context, arg_desc.ref_pre_run_array.get())

        elif arg.arg_class is GlobalArg or arg.arg_class is ConstantArg:
            shape = evaluate(arg.unvec_shape, parameters)
            strides = evaluate(arg.unvec_strides, parameters)

            dtype = kernel_arg.dtype
            itemsize = dtype.itemsize
            numpy_strides = [itemsize*s for s in strides]

            assert all(s > 0 for s in strides)
            alloc_size = sum(astrd*(alen-1)
                    for alen, astrd in zip(shape, strides)) + 1

            # use contiguous array to transfer to host
            host_ref_contig_array = arg_desc.ref_pre_run_storage_array.get()

            # use device shape/strides
            from pyopencl.compyte.array import as_strided
            host_ref_array = as_strided(host_ref_contig_array,
                    arg_desc.ref_shape, arg_desc.ref_numpy_strides)

            # flatten the thing
            host_ref_flat_array = host_ref_array.flatten()

            # create host array with test shape (but not strides)
            host_contig_array = np.empty(shape, dtype=dtype)

            common_len = min(
                    len(host_ref_flat_array),
                    len(host_contig_array.ravel()))
            host_contig_array.ravel()[:common_len] = \
                    host_ref_flat_array[:common_len]

            # create host array with test shape and storage layout
            host_storage_array = np.empty(alloc_size, dtype)
            host_array = as_strided(
                    host_storage_array, shape, numpy_strides)
            host_array[...] = host_contig_array

            host_contig_array = arg_desc.ref_storage_array.get()

            storage_array = cl_array.to_device(queue, host_storage_array)
            ary = cl_array.as_strided(storage_array, shape, numpy_strides)

            args[arg.name] = ary

            arg_desc.test_storage_array = storage_array
            arg_desc.test_array = ary
            arg_desc.test_shape = shape
            arg_desc.test_strides = strides
            arg_desc.test_numpy_strides = numpy_strides
            arg_desc.test_alloc_size = alloc_size

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type not understood")

    return args
def _get_reduction_source(ctx, out_type, out_type_size,
        neutral, reduce_expr, map_expr, parsed_args,
        name="reduce_kernel", preamble="",
        device=None, max_group_size=None):

    if device is not None:
        devices = [device]
    else:
        devices = ctx.devices

    # {{{ compute group size

    def get_dev_group_size(device):
        # dirty fix for the RV770 boards
        max_work_group_size = device.max_work_group_size
        if "RV770" in device.name:
            max_work_group_size = 64

        # compute lmem limit
        from pytools import div_ceil
        lmem_wg_size = div_ceil(max_work_group_size, out_type_size)
        result = min(max_work_group_size, lmem_wg_size)

        # round down to power of 2
        from pyopencl.tools import bitlog2
        return 2**bitlog2(result)

    group_size = min(get_dev_group_size(dev) for dev in devices)

    if max_group_size is not None:
        group_size = min(max_group_size, group_size)

    # }}}

    # {{{ compute synchronization-less group size

    def get_dev_no_sync_size(device):
        from pyopencl.characterize import get_simd_group_size
        result = get_simd_group_size(device, out_type_size)

        if result is None:
            from warnings import warn
            warn("Reduction might be unnecessarily slow: "
                    "can't query SIMD group size")
            return 1

        return result

    no_sync_size = min(get_dev_no_sync_size(dev) for dev in devices)

    # }}}

    from mako.template import Template
    from pytools import all
    from pyopencl.characterize import has_double_support
    src = str(Template(KERNEL).render(
        out_type=out_type,
        arguments=", ".join(arg.declarator() for arg in parsed_args),
        group_size=group_size,
        no_sync_size=no_sync_size,
        neutral=neutral,
        reduce_expr=reduce_expr,
        map_expr=map_expr,
        name=name,
        preamble=preamble,
        double_support=all(has_double_support(dev) for dev in devices),
        ))

    from pytools import Record

    class ReductionInfo(Record):
        pass

    return ReductionInfo(
            context=ctx,
            source=src,
            group_size=group_size)
def __init__(self, ctx, dtype,
        scan_expr, neutral=None,
        name_prefix="scan", options=[], preamble="", devices=None):

    if isinstance(self, ExclusiveScanKernel) and neutral is None:
        raise ValueError("neutral element is required for exclusive scan")

    self.context = ctx
    dtype = self.dtype = np.dtype(dtype)
    self.neutral = neutral

    if devices is None:
        devices = ctx.devices
    self.devices = devices

    max_wg_size = min(dev.max_work_group_size for dev in self.devices)

    # loop to find a suitable workgroup size
    trip_count = 0

    while True:
        # Thrust says these are good for GT200
        self.scan_wg_size = min(max_wg_size, 128)
        self.update_wg_size = min(max_wg_size, 256)

        if self.scan_wg_size < 16:
            # Hello, Apple CPU. Nice to see you.
            self.scan_wg_seq_batches = 128  # FIXME: guesswork
        else:
            self.scan_wg_seq_batches = 6

        from pytools import all
        from pyopencl.characterize import has_double_support

        kw_values = dict(
                preamble=preamble,
                name_prefix=name_prefix,
                scan_type=dtype_to_ctype(dtype),
                scan_expr=scan_expr,
                neutral=neutral,
                double_support=all(
                    has_double_support(dev) for dev in devices))

        scan_intervals_src = str(SCAN_INTERVALS_SOURCE.render(
            wg_size=self.scan_wg_size,
            wg_seq_batches=self.scan_wg_seq_batches,
            **kw_values))
        scan_intervals_prg = cl.Program(ctx, scan_intervals_src).build(options)
        self.scan_intervals_knl = getattr(
                scan_intervals_prg,
                name_prefix+"_scan_intervals")
        self.scan_intervals_knl.set_scalar_arg_dtypes(
                (None, np.uint32, np.uint32, None, None))

        kernel_max_wg_size = self.scan_intervals_knl.get_work_group_info(
                cl.kernel_work_group_info.WORK_GROUP_SIZE,
                ctx.devices[0])

        if self.scan_wg_size <= kernel_max_wg_size:
            break
        else:
            max_wg_size = kernel_max_wg_size

        trip_count += 1
        assert trip_count <= 2

    final_update_src = str(self.final_update_tp.render(
        wg_size=self.update_wg_size,
        **kw_values))
    final_update_prg = cl.Program(self.context, final_update_src).build(options)
    self.final_update_knl = getattr(
            final_update_prg,
            name_prefix+"_final_update")
    self.final_update_knl.set_scalar_arg_dtypes(
            (None, np.uint32, np.uint32, None))
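# Sketch of the retry pattern above: pick launch parameters, build, then ask
# the runtime how large a group the compiled kernel actually supports; if the
# guess was too big, rebuild once with the reported limit. The assert bounds
# the loop at two trips because the second guess is, by construction, valid.
def build_with_wg_size_retry(build, max_wg_size):
    trip_count = 0
    while True:
        wg_size = min(max_wg_size, 128)
        kernel_max = build(wg_size)   # stand-in for compile + query
        if wg_size <= kernel_max:
            return wg_size
        max_wg_size = kernel_max
        trip_count += 1
        assert trip_count <= 2

# toy "build": the runtime only supports groups of up to 64 work items
assert build_with_wg_size_retry(lambda wg: 64, 1024) == 64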