def check_sizes(kernel, device):
    """Check the launch configuration of *kernel* against limits of *device*.

    If *device* is ``None``, a :class:`LoopyAdvisory` is issued through
    :func:`loopy.diagnostic.warn` and nothing is checked.

    Otherwise, the following comparisons are made against attributes of
    *device*:

    * the number of grid/group axes against ``max_work_item_dimensions``,
    * each group axis length against ``max_work_item_sizes`` and the product
      of all group axis lengths against ``max_work_group_size``; these two
      are skipped (with a warning) when a size expression refers to a
      variable for which no ``approximately`` value is available,
    * ``kernel.local_mem_use()`` against the usable local memory size,
    * the number of ``ConstantArg`` arguments against ``max_constant_args``.

    :arg kernel: the kernel whose sizes are checked
    :arg device: the device to check against, or ``None``
        (presumably a :class:`pyopencl.Device` -- TODO confirm)
    :raises LoopyError: if one of the limits above is exceeded
    """
    import loopy as lp
    from loopy.diagnostic import LoopyAdvisory, LoopyError

    if device is None:
        from loopy.diagnostic import warn as warn_with_kernel
        warn_with_kernel(
            kernel, "no_device_in_pre_codegen_checks",
            "No device parameter was passed to the PyOpenCLTarget. "
            "Perhaps you want to pass a device to benefit from "
            "additional checking.",
            LoopyAdvisory)
        return

    # The 'approximately' hints of value arguments stand in for the
    # unknowns appearing in the size expressions.
    approx_values = {
        arg.name: arg.approximately
        for arg in kernel.args
        if isinstance(arg, lp.ValueArg) and arg.approximately is not None
    }

    global_sizes, local_sizes = kernel.get_grid_size_upper_bounds_as_exprs()

    axis_count = max(len(global_sizes), len(local_sizes))
    if axis_count > device.max_work_item_dimensions:
        raise LoopyError("too many work item dimensions")

    from pymbolic import evaluate
    from pymbolic.mapper.evaluator import UnknownVariableError

    try:
        global_sizes = evaluate(global_sizes, approx_values)
        local_sizes = evaluate(local_sizes, approx_values)
    except UnknownVariableError as missing_var:
        from warnings import warn
        warn("could not check axis bounds because no value "
             "for variable '%s' was passed to check_kernels()"
             % missing_var, LoopyAdvisory)
    else:
        # Only reached when every size expression evaluated to a number.
        for axis, extent in enumerate(local_sizes):
            if extent > device.max_work_item_sizes[axis]:
                raise LoopyError("group axis %d too big" % axis)

        from pytools import product
        if product(local_sizes) > device.max_work_group_size:
            raise LoopyError("work group too big")

    from pyopencl.characterize import usable_local_mem_size
    if kernel.local_mem_use() > usable_local_mem_size(device):
        raise LoopyError("using too much local memory")

    from loopy.kernel.data import ConstantArg
    constant_args = [
        arg for arg in kernel.args if isinstance(arg, ConstantArg)]

    if len(constant_args) > device.max_constant_args:
        raise LoopyError("too many constant arguments")
def check_sizes(kernel, device):
    """Check the launch configuration of *kernel* against limits of *device*.

    If *device* is ``None``, a :class:`LoopyAdvisory` is issued through
    :func:`loopy.diagnostic.warn` and nothing is checked.

    Otherwise, the following comparisons are made against attributes of
    *device*:

    * the number of grid/group axes (as returned by
      ``kernel.get_grid_sizes_as_exprs()``) against
      ``max_work_item_dimensions``,
    * each group axis length against ``max_work_item_sizes`` and the product
      of all group axis lengths against ``max_work_group_size``; these two
      are skipped (with a warning) when a size expression refers to a
      variable for which no ``approximately`` value is available,
    * ``kernel.local_mem_use()`` against the usable local memory size,
    * the number of ``ConstantArg`` arguments against ``max_constant_args``.

    :arg kernel: the kernel whose sizes are checked
    :arg device: the device to check against, or ``None``
        (presumably a :class:`pyopencl.Device` -- TODO confirm)
    :raises LoopyError: if one of the limits above is exceeded
    """
    import loopy as lp
    from loopy.diagnostic import LoopyAdvisory, LoopyError

    if device is None:
        from loopy.diagnostic import warn as warn_with_kernel
        warn_with_kernel(
            kernel, "no_device_in_pre_codegen_checks",
            "No device parameter was passed to the PyOpenCLTarget. "
            "Perhaps you want to pass a device to benefit from "
            "additional checking.",
            LoopyAdvisory)
        return

    # The 'approximately' hints of value arguments stand in for the
    # unknowns appearing in the size expressions.
    approx_values = {
        arg.name: arg.approximately
        for arg in kernel.args
        if isinstance(arg, lp.ValueArg) and arg.approximately is not None
    }

    global_sizes, local_sizes = kernel.get_grid_sizes_as_exprs()

    axis_count = max(len(global_sizes), len(local_sizes))
    if axis_count > device.max_work_item_dimensions:
        raise LoopyError("too many work item dimensions")

    from pymbolic import evaluate
    from pymbolic.mapper.evaluator import UnknownVariableError

    try:
        global_sizes = evaluate(global_sizes, approx_values)
        local_sizes = evaluate(local_sizes, approx_values)
    except UnknownVariableError as missing_var:
        from warnings import warn
        warn("could not check axis bounds because no value "
             "for variable '%s' was passed to check_kernels()"
             % missing_var, LoopyAdvisory)
    else:
        # Only reached when every size expression evaluated to a number.
        for axis, extent in enumerate(local_sizes):
            if extent > device.max_work_item_sizes[axis]:
                raise LoopyError("group axis %d too big" % axis)

        from pytools import product
        if product(local_sizes) > device.max_work_group_size:
            raise LoopyError("work group too big")

    from pyopencl.characterize import usable_local_mem_size
    if kernel.local_mem_use() > usable_local_mem_size(device):
        raise LoopyError("using too much local memory")

    from loopy.kernel.data import ConstantArg
    constant_args = [
        arg for arg in kernel.args if isinstance(arg, ConstantArg)]

    if len(constant_args) > device.max_constant_args:
        raise LoopyError("too many constant arguments")
def adjust_local_temp_var_storage(kernel, device):
    """Return a copy of *kernel* with ``storage_shape`` set on each temporary.

    Temporaries whose scope is not ``temp_var_scope.LOCAL`` get a
    ``storage_shape`` equal to their ``shape``.  For local temporaries the
    starting point is the existing ``storage_shape`` (or ``shape`` if that
    is ``None``), and the outcome depends on ``device.local_mem_type``:

    * ``GLOBAL``: the starting storage shape is kept.
    * ``LOCAL``: increments ``0, 1, ..., last_axis // 2 - 1`` are added to
      the last storage axis in ascending order.  A candidate replaces the
      current choice only if
      ``pyopencl.characterize.why_not_local_access_conflict_free`` reports
      a strictly lower conflict multiplicity for it and the resulting
      local memory use stays below the usable local memory size.  A
      :class:`LoopyAdvisory` warning is issued if the final multiplicity
      is not 1.  Temporaries with an empty storage shape (scalars) have no
      axis to pad and keep their storage shape.
    * anything else: a warning is issued and the starting storage shape is
      kept.

    :arg kernel: the kernel whose ``temporary_variables`` are processed
    :arg device: the device whose local memory properties are queried
        (presumably a :class:`pyopencl.Device` -- TODO confirm)
    :returns: ``kernel.copy(temporary_variables=...)`` with the updated
        temporaries
    """
    import pyopencl as cl
    import pyopencl.characterize as cl_char
    from warnings import warn

    from loopy.diagnostic import LoopyAdvisory
    from loopy.kernel.data import temp_var_scope
    from pytools import product

    logger.debug("%s: adjust temp var storage", kernel.name)

    new_temp_vars = {}
    lmem_size = cl_char.usable_local_mem_size(device)

    for temp_var in six.itervalues(kernel.temporary_variables):
        if temp_var.scope != temp_var_scope.LOCAL:
            new_temp_vars[temp_var.name] = temp_var.copy(
                storage_shape=temp_var.shape)
            continue

        storage_shape = temp_var.storage_shape
        if storage_shape is None:
            storage_shape = temp_var.shape
        storage_shape = list(storage_shape)

        local_mem_type = device.local_mem_type

        if local_mem_type == cl.device_local_mem_type.GLOBAL:
            # FIXME: could try to avoid cache associativity disasters
            new_storage_shape = storage_shape

        elif (local_mem_type == cl.device_local_mem_type.LOCAL
                and not storage_shape):
            # A scalar has no trailing axis to pad; indexing
            # storage_shape[-1] in the search below would raise IndexError.
            new_storage_shape = storage_shape

        elif local_mem_type == cl.device_local_mem_type.LOCAL:
            # Bytes taken by all *other* local temporaries; this does not
            # depend on the increment, so compute it once.
            other_local_nbytes = sum(
                tv.nbytes
                for tv in six.itervalues(kernel.temporary_variables)
                if tv.scope == temp_var_scope.LOCAL
                and tv.name != temp_var.name)
            itemsize = temp_var.dtype.itemsize

            # Start from the bank count as the multiplicity to beat.
            min_mult = cl_char.local_memory_bank_count(device)
            good_incr = None
            min_why_not = None
            new_storage_shape = storage_shape

            # Only the last axis is padded.
            # NOTE(review): this range is empty when the last axis has
            # length 0 or 1, so not even increment 0 is evaluated and the
            # warning below reports the bank count -- confirm intended.
            for increment in range(storage_shape[-1] // 2):
                test_storage_shape = storage_shape[:]
                test_storage_shape[-1] += increment
                new_mult, why_not = cl_char.why_not_local_access_conflict_free(
                    device, itemsize, temp_var.shape, test_storage_shape)

                # The strict '<' keeps the smallest increment among
                # candidates with equal multiplicity.
                if new_mult < min_mult:
                    new_lmem_use = (
                        other_local_nbytes
                        + itemsize*product(test_storage_shape))
                    if new_lmem_use < lmem_size:
                        new_storage_shape = test_storage_shape
                        min_mult = new_mult
                        min_why_not = why_not
                        good_incr = increment

            if min_mult != 1:
                warn("could not find a conflict-free mem layout "
                     "for local variable '%s' "
                     "(currently: %dx conflict, increment: %s, reason: %s)"
                     % (temp_var.name, min_mult, good_incr, min_why_not),
                     LoopyAdvisory)

        else:
            warn("unknown type of local memory")
            new_storage_shape = storage_shape

        new_temp_vars[temp_var.name] = temp_var.copy(
            storage_shape=new_storage_shape)

    return kernel.copy(temporary_variables=new_temp_vars)
def adjust_local_temp_var_storage(kernel, device):
    """Return a copy of *kernel* with ``storage_shape`` set on each temporary.

    Temporaries for which ``is_local`` is false get a ``storage_shape``
    equal to their ``shape``.  For local temporaries the starting point is
    the existing ``storage_shape`` (or ``shape`` if that is ``None``), and
    the outcome depends on ``device.local_mem_type``:

    * ``GLOBAL``: the starting storage shape is kept.
    * ``LOCAL``: increments ``0, 1, ..., last_axis // 2 - 1`` are added to
      the last storage axis in ascending order.  A candidate replaces the
      current choice only if ``cl_char.why_not_local_access_conflict_free``
      reports a strictly lower conflict multiplicity for it and the
      resulting local memory use stays below the usable local memory size.
      A :class:`LoopyAdvisory` warning is issued if the final multiplicity
      is not 1.  Temporaries with an empty storage shape (scalars) have no
      axis to pad and keep their storage shape.
    * anything else: a warning is issued and the starting storage shape is
      kept.

    :arg kernel: the kernel whose ``temporary_variables`` are processed
    :arg device: the device whose local memory properties are queried
        (presumably a :class:`pyopencl.Device` -- TODO confirm)
    :returns: ``kernel.copy(temporary_variables=...)`` with the updated
        temporaries
    """
    from warnings import warn

    from loopy.diagnostic import LoopyAdvisory
    from pytools import product

    logger.debug("%s: adjust temp var storage", kernel.name)

    new_temp_vars = {}
    lmem_size = cl_char.usable_local_mem_size(device)

    for temp_var in six.itervalues(kernel.temporary_variables):
        if not temp_var.is_local:
            new_temp_vars[temp_var.name] = temp_var.copy(
                storage_shape=temp_var.shape)
            continue

        storage_shape = temp_var.storage_shape
        if storage_shape is None:
            storage_shape = temp_var.shape
        storage_shape = list(storage_shape)

        local_mem_type = device.local_mem_type

        if local_mem_type == cl.device_local_mem_type.GLOBAL:
            # FIXME: could try to avoid cache associativity disasters
            new_storage_shape = storage_shape

        elif (local_mem_type == cl.device_local_mem_type.LOCAL
                and not storage_shape):
            # A scalar has no trailing axis to pad; indexing
            # storage_shape[-1] in the search below would raise IndexError.
            new_storage_shape = storage_shape

        elif local_mem_type == cl.device_local_mem_type.LOCAL:
            # Bytes taken by all *other* local temporaries; this does not
            # depend on the increment, so compute it once.
            other_local_nbytes = sum(
                tv.nbytes
                for tv in six.itervalues(kernel.temporary_variables)
                if tv.is_local and tv.name != temp_var.name)
            itemsize = temp_var.dtype.itemsize

            # Start from the bank count as the multiplicity to beat.
            min_mult = cl_char.local_memory_bank_count(device)
            good_incr = None
            min_why_not = None
            new_storage_shape = storage_shape

            # Only the last axis is padded.
            # NOTE(review): this range is empty when the last axis has
            # length 0 or 1, so not even increment 0 is evaluated and the
            # warning below reports the bank count -- confirm intended.
            for increment in range(storage_shape[-1] // 2):
                test_storage_shape = storage_shape[:]
                test_storage_shape[-1] += increment
                new_mult, why_not = cl_char.why_not_local_access_conflict_free(
                    device, itemsize, temp_var.shape, test_storage_shape)

                # The strict '<' keeps the smallest increment among
                # candidates with equal multiplicity.
                if new_mult < min_mult:
                    new_lmem_use = (
                        other_local_nbytes
                        + itemsize*product(test_storage_shape))
                    if new_lmem_use < lmem_size:
                        new_storage_shape = test_storage_shape
                        min_mult = new_mult
                        min_why_not = why_not
                        good_incr = increment

            if min_mult != 1:
                warn("could not find a conflict-free mem layout "
                     "for local variable '%s' "
                     "(currently: %dx conflict, increment: %s, reason: %s)"
                     % (temp_var.name, min_mult, good_incr, min_why_not),
                     LoopyAdvisory)

        else:
            warn("unknown type of local memory")
            new_storage_shape = storage_shape

        new_temp_vars[temp_var.name] = temp_var.copy(
            storage_shape=new_storage_shape)

    return kernel.copy(temporary_variables=new_temp_vars)