def get_temporary_decls(self, codegen_state, schedule_state):
    """Build the host-side statements that allocate global temporaries.

    Every global temporary is bound to ``allocator(<size in bytes>)`` and
    the collection of all of them is bound to ``_global_temporaries``.

    :arg codegen_state: passed to ``get_expression_to_code_mapper`` and
        ``_get_global_temporaries``.
    :arg schedule_state: not consulted here.
    :returns: a list of :mod:`genpy` nodes, empty when there are no
        global temporaries.
    """
    from genpy import Assign, Comment, Line
    from pymbolic.mapper.stringifier import PREC_NONE

    def size_in_bytes(temp_var):
        # Left-to-right product of the axis lengths, starting from 1,
        # scaled by the size of a single entry.
        entry_count = 1
        for axis_len in temp_var.shape:
            entry_count = entry_count * axis_len
        return temp_var.dtype.numpy_dtype.itemsize * entry_count

    ecm = self.get_expression_to_code_mapper(codegen_state)

    global_temporaries = self._get_global_temporaries(codegen_state)
    if not global_temporaries:
        return []

    decls = [Comment("{{{ allocate global temporaries"), Line()]

    for tv in global_temporaries:
        size_code = ecm(size_in_bytes(tv), PREC_NONE, "i")
        decls.append(Assign(tv.name, "allocator(%s)" % size_code))

    joined_names = ", ".join(tv.name for tv in global_temporaries)
    decls.append(
        Assign("_global_temporaries", "[{tvs}]".format(tvs=joined_names)))

    decls.extend([Line(), Comment("}}}"), Line()])
    return decls
def get_temporary_decls(self, codegen_state, schedule_state):
    """Build the host-side statements that allocate global temporaries.

    The kernel's temporaries whose scope is ``temp_var_scope.GLOBAL`` are
    processed in order of their names.  Each one is bound to
    ``allocator(<size in bytes>)`` and ``_global_temporaries`` is bound to
    the list of all of them.

    :arg codegen_state: supplies ``kernel.temporary_variables`` and is
        passed to ``get_expression_to_code_mapper``.
    :arg schedule_state: not consulted here.
    :returns: a list of :mod:`genpy` nodes.  When there are no global
        temporaries, ``_global_temporaries`` is still assigned (to an
        empty list).
    """
    from genpy import Assign, Comment, Line

    def size_in_bytes(temp_var):
        # Left-to-right product of the axis lengths, starting from 1,
        # scaled by the size of a single entry.
        entry_count = 1
        for axis_len in temp_var.shape:
            entry_count = entry_count * axis_len
        return temp_var.dtype.numpy_dtype.itemsize * entry_count

    from loopy.kernel.data import temp_var_scope

    global_temporaries = [
        tv
        for tv in six.itervalues(codegen_state.kernel.temporary_variables)
        if tv.scope == temp_var_scope.GLOBAL
    ]
    global_temporaries.sort(key=lambda tv: tv.name)

    from pymbolic.mapper.stringifier import PREC_NONE
    ecm = self.get_expression_to_code_mapper(codegen_state)

    if not global_temporaries:
        return [Assign("_global_temporaries", "[]"), Line()]

    decls = [Comment("{{{ allocate global temporaries"), Line()]

    for tv in global_temporaries:
        size_code = ecm(size_in_bytes(tv), PREC_NONE, "i")
        decls.append(Assign(tv.name, "allocator(%s)" % size_code))

    joined_names = ", ".join(tv.name for tv in global_temporaries)
    decls.append(
        Assign("_global_temporaries", "[{tvs}]".format(tvs=joined_names)))

    decls.extend([Line(), Comment("}}}"), Line()])
    return decls
def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
    """Generate the host code that sets arguments for and enqueues a kernel.

    :arg name: attribute name of the kernel on ``_lpy_cl_kernels``.
    :arg gsize: global size; replaced by ``(1,)`` when empty.
    :arg lsize: local size; replaced by ``(1,)`` when empty.
    :arg extra_args: appended to ``codegen_state.implemented_data_info``
        to form the full argument list.
    :returns: a :class:`genpy.Suite` that ends by rebinding ``wait_for``
        to a list holding the new event ``_lpy_evt``.

    A warning is issued if the installed PyOpenCL is older than 2020.2.
    """
    ecm = self.get_expression_to_code_mapper(codegen_state)

    gsize = gsize or (1, )
    lsize = lsize or (1, )

    all_args = codegen_state.implemented_data_info + extra_args

    value_arg_code, arg_idx_to_cl_arg_idx, cl_arg_count = \
        generate_value_arg_setup(
            codegen_state.kernel,
            [self.target.device],
            all_args)
    arry_arg_code = generate_array_arg_setup(
        codegen_state.kernel, all_args, arg_idx_to_cl_arg_idx)

    from genpy import Suite, Assign, Assert, Line, Comment
    from pymbolic.mapper.stringifier import PREC_NONE

    import pyopencl.version as cl_ver
    if cl_ver.VERSION < (2020, 2):
        from warnings import warn
        warn("Your kernel invocation will likely fail because your "
             "version of PyOpenCL does not support allow_empty_ndrange. "
             "Please upgrade to version 2020.2 or newer.")

    # TODO: Generate finer-grained dependency structure
    statements = [
        Comment("{{{ enqueue %s" % name),
        Line(),
        Assign("_lpy_knl", "_lpy_cl_kernels." + name),
        Assert("_lpy_knl.num_args == %d" % cl_arg_count),
        Line(),
        value_arg_code,
        arry_arg_code,
    ]

    enqueue_template = (
        "%(pyopencl_module_name)s.enqueue_nd_range_kernel("
        "queue, _lpy_knl, "
        "%(gsize)s, %(lsize)s, "
        # using positional args because pybind is slow with kwargs
        "None, "  # offset
        "wait_for, "
        "True, "  # g_times_l
        "True, "  # allow_empty_ndrange
        ")")
    enqueue_code = enqueue_template % {
        "pyopencl_module_name": self.target.pyopencl_module_name,
        "gsize": ecm(gsize, prec=PREC_NONE, type_context="i"),
        "lsize": ecm(lsize, prec=PREC_NONE, type_context="i"),
    }

    statements.extend([
        Assign("_lpy_evt", enqueue_code),
        Assign("wait_for", "[_lpy_evt]"),
        Line(),
        Comment("}}}"),
        Line(),
    ])
    return Suite(statements)
def get_temporary_decls(self, codegen_state, schedule_index):
    """Generate NumPy allocations for the kernel's array temporaries.

    Temporaries are visited in order of their names.  Each one with a
    non-empty shape is bound to ``_lpy_np.empty(<shape>, dtype=<dtype>)``;
    temporaries with an empty shape get no declaration.

    :arg codegen_state: supplies ``kernel`` and
        ``expression_to_code_mapper``.
    :arg schedule_index: not consulted here.
    :returns: a list of :class:`genpy.Assign` nodes.
    """
    from pymbolic.mapper.stringifier import PREC_NONE
    from genpy import Assign

    kernel = codegen_state.kernel
    ecm = codegen_state.expression_to_code_mapper

    result = []

    for tv in sorted(
            kernel.temporary_variables.values(),
            key=lambda tv: tv.name):
        if not tv.shape:
            continue

        dtype_name = tv.dtype.numpy_dtype.name
        if dtype_name == "bool":
            # The dtype reports its name as "bool", but the matching
            # NumPy attribute has not been stable: ``bool8`` was removed
            # in NumPy 2.0 and plain ``bool`` was absent for a few 1.x
            # releases.  ``bool_`` is available throughout.
            dtype_name = "bool_"

        result.append(
            Assign(
                tv.name,
                "_lpy_np.empty(%s, dtype=%s)" % (
                    ecm(tv.shape, PREC_NONE, "i"),
                    "_lpy_np." + dtype_name)))

    return result
def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
    """Generate the host code that sets arguments for and enqueues a kernel.

    :arg name: attribute name of the kernel on ``_lpy_cl_kernels``.
    :arg gsize: global size; replaced by ``(1,)`` when empty.
    :arg lsize: local size; replaced by ``(1,)`` when empty.
    :arg extra_args: appended to ``codegen_state.implemented_data_info``
        to form the full argument list.
    :returns: a :class:`genpy.Suite` that ends by rebinding ``wait_for``
        to a list holding the new event ``_lpy_evt``.
    """
    ecm = self.get_expression_to_code_mapper(codegen_state)

    gsize = gsize or (1,)
    lsize = lsize or (1,)

    all_args = codegen_state.implemented_data_info + extra_args

    value_arg_code, arg_idx_to_cl_arg_idx, cl_arg_count = \
        generate_value_arg_setup(
            codegen_state.kernel,
            [self.target.device],
            all_args)
    arry_arg_code = generate_array_arg_setup(
        codegen_state.kernel, all_args, arg_idx_to_cl_arg_idx)

    from genpy import Suite, Assign, Assert, Line, Comment
    from pymbolic.mapper.stringifier import PREC_NONE

    # TODO: Generate finer-grained dependency structure
    statements = [
        Comment("{{{ enqueue %s" % name),
        Line(),
        Assign("_lpy_knl", "_lpy_cl_kernels."+name),
        Assert("_lpy_knl.num_args == %d" % cl_arg_count),
        Line(),
        value_arg_code,
        arry_arg_code,
    ]

    enqueue_template = (
        "%(pyopencl_module_name)s.enqueue_nd_range_kernel("
        "queue, _lpy_knl, "
        "%(gsize)s, %(lsize)s, wait_for=wait_for, g_times_l=True)")
    enqueue_code = enqueue_template % {
        "pyopencl_module_name": self.target.pyopencl_module_name,
        "gsize": ecm(gsize, prec=PREC_NONE, type_context="i"),
        "lsize": ecm(lsize, prec=PREC_NONE, type_context="i"),
    }

    statements.extend([
        Assign("_lpy_evt", enqueue_code),
        Assign("wait_for", "[_lpy_evt]"),
        Line(),
        Comment("}}}"),
        Line(),
    ])
    return Suite(statements)
def emit_assignment(self, codegen_state, insn):
    """Translate an assignment instruction into a :class:`genpy.Assign`.

    Both sides of *insn* are rendered with the expression-to-code mapper
    taken from *codegen_state*.

    :raises NotImplementedError: if *insn* has a truthy ``atomicity``.
    """
    ecm = codegen_state.expression_to_code_mapper

    # Atomic updates are rejected before anything is generated.
    if insn.atomicity:
        raise NotImplementedError("atomic ops in Python")

    from pymbolic.mapper.stringifier import PREC_NONE
    from genpy import Assign

    lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None)
    rhs_code = ecm(insn.expression, prec=PREC_NONE, type_context=None)
    return Assign(lhs_code, rhs_code)
def get_function_definition(self, codegen_state, codegen_result,
        schedule_index, function_decl, function_body):
    """Wrap *function_body* into the generated host-side function.

    The generated function takes ``_lpy_cl_kernels`` and ``queue``, then
    one parameter per entry of ``codegen_state.implemented_data_info``
    whose ``arg_class`` is not a :class:`TemporaryVariable` subclass, then
    ``wait_for=None`` and ``allocator=None``.  Its body performs a few
    imports, sets up a default allocator, runs *function_body*, calls
    ``release()`` on everything in ``_global_temporaries`` and returns
    ``_lpy_evt``.

    :arg codegen_result: supplies the name of the current program.
    :arg schedule_index: not consulted here.
    :arg function_decl: not consulted here.
    :returns: a :class:`genpy.Function`.
    """
    from loopy.kernel.data import TemporaryVariable

    args = ["_lpy_cl_kernels", "queue"]
    for idi in codegen_state.implemented_data_info:
        if issubclass(idi.arg_class, TemporaryVariable):
            continue
        args.append(idi.name)
    args.extend(["wait_for=None", "allocator=None"])

    from genpy import (For, Function, Suite, Import, ImportAs, Return,
            FromImport, If, Assign, Line, Statement as S)

    function_name = codegen_result.current_program(codegen_state).name

    body = [
        FromImport("struct", ["pack as _lpy_pack"]),
        ImportAs("pyopencl", "_lpy_cl"),
        Import("pyopencl.tools"),
        Line(),
        If(
            "allocator is None",
            Assign(
                "allocator",
                "_lpy_cl_tools.DeferredAllocator(queue.context)")),
        Line(),
        Line(),
        function_body,
        Line(),
        # free global temporaries
        For("_tv", "_global_temporaries", S("_tv.release()")),
        Line(),
        Return("_lpy_evt"),
    ]

    return Function(function_name, args, Suite(body))
def generate_value_arg_setup(kernel, devices, implemented_data_info):
    """Generate host code that passes value arguments to ``_lpy_knl``.

    Walks *implemented_data_info* in order.  Entries whose ``arg_class``
    is not a :class:`loopy.ValueArg` subclass must be array arguments;
    no code is emitted for them here, but each is counted as one OpenCL
    argument slot.  For every value argument, code is emitted that
    (unless ``kernel.options.skip_arg_checks`` is set) checks it is not
    ``None``, casts integral values to a Python integer, and calls
    ``_lpy_knl.set_arg`` with the value packed according to its dtype.

    :arg kernel: supplies ``options`` and ``name`` (the latter is used in
        warning messages).
    :arg devices: a sequence of devices, entries may be *None*.  Used only
        to decide whether the struct-argument-count workaround applies.
    :arg implemented_data_info: the sequence of arguments to process.
    :returns: a tuple ``(code, arg_idx_to_cl_arg_idx, cl_arg_count)``
        where *code* is a :class:`genpy.Suite`, *arg_idx_to_cl_arg_idx*
        maps each index into *implemented_data_info* to the OpenCL
        argument index at which that argument starts, and *cl_arg_count*
        is the total number of OpenCL argument slots consumed.
    :raises TypeError: for a complex dtype other than ``complex64`` or
        ``complex128``.
    :raises LoopyError: for a value argument whose dtype is neither
        composite, complex, nor a :class:`NumpyType`.
    """
    options = kernel.options

    import loopy as lp
    from loopy.kernel.array import ArrayBase

    # {{{ arg counting bug handling

    # For example:
    # https://github.com/pocl/pocl/issues/197
    # (but Apple CPU has a similar bug)

    work_around_arg_count_bug = False
    warn_about_arg_count_bug = False

    try:
        from pyopencl.characterize import has_struct_arg_count_bug

    except ImportError:
        # Without the query function, no device is treated as affected.
        count_bug_per_dev = [False]*len(devices)

    else:
        count_bug_per_dev = [
                has_struct_arg_count_bug(dev)
                if dev is not None else False
                for dev in devices]

    if any(dev is None for dev in devices):
        warn("{knl_name}: device not supplied to PyOpenCLTarget--"
                "workarounds for broken OpenCL implementations "
                "(such as those relating to complex numbers) "
                "may not be enabled when needed"
                .format(knl_name=kernel.name))

    # The workaround is enabled only if *every* device reports the bug;
    # a mixed set of devices merely triggers a warning further down.
    if any(count_bug_per_dev):
        if all(count_bug_per_dev):
            work_around_arg_count_bug = True
        else:
            warn_about_arg_count_bug = True

    # }}}

    # Next OpenCL argument slot to be filled.
    cl_arg_idx = 0
    arg_idx_to_cl_arg_idx = {}

    # Running count of floating point values passed so far (a complex
    # argument counts as two).
    fp_arg_count = 0

    from genpy import (
            Comment, Line, If, Raise, Assign, Statement as S, Suite)

    result = []
    gen = result.append

    for arg_idx, idi in enumerate(implemented_data_info):
        arg_idx_to_cl_arg_idx[arg_idx] = cl_arg_idx

        if not issubclass(idi.arg_class, lp.ValueArg):
            assert issubclass(idi.arg_class, ArrayBase)

            # assume each of those generates exactly one...
            cl_arg_idx += 1

            continue

        gen(Comment("{{{ process %s" % idi.name))
        gen(Line())

        if not options.skip_arg_checks:
            gen(If("%s is None" % idi.name,
                Raise('RuntimeError("input argument \'{name}\' '
                        'must be supplied")'.format(name=idi.name))))

        if idi.dtype.is_integral():
            gen(Comment("cast to Python int to avoid trouble "
                "with struct packing or Boost.Python"))
            if sys.version_info < (3,):
                py_type = "long"
            else:
                py_type = "int"

            gen(Assign(idi.name, "%s(%s)" % (py_type, idi.name)))
            gen(Line())

        if idi.dtype.is_composite():
            # Composite values are handed to set_arg without packing.
            gen(S("_lpy_knl.set_arg(%d, %s)" % (cl_arg_idx, idi.name)))
            cl_arg_idx += 1

        elif idi.dtype.is_complex():
            assert isinstance(idi.dtype, NumpyType)

            dtype = idi.dtype

            if warn_about_arg_count_bug:
                warn("{knl_name}: arguments include complex numbers, and "
                        "some (but not all) of the target devices mishandle "
                        "struct kernel arguments (hence the workaround is "
                        "disabled".format(
                            knl_name=kernel.name))

            if dtype.numpy_dtype == np.complex64:
                arg_char = "f"
            elif dtype.numpy_dtype == np.complex128:
                arg_char = "d"
            else:
                raise TypeError("unexpected complex type: %s" % dtype)

            if (work_around_arg_count_bug
                    and dtype.numpy_dtype == np.complex128
                    and fp_arg_count + 2 <= 8):
                # Workaround path: the real and imaginary parts are passed
                # as two separate arguments, consuming two slots.
                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}', {arg_var}.real)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1

                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}', {arg_var}.imag)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1
            else:
                # Regular path: both parts are packed into one buffer and
                # passed as a single argument.
                gen(Assign(
                    "_lpy_buf",
                    "_lpy_pack('{arg_char}{arg_char}', "
                    "{arg_var}.real, {arg_var}.imag)"
                    .format(arg_char=arg_char, arg_var=idi.name)))
                gen(S(
                    "_lpy_knl.set_arg({cl_arg_idx}, _lpy_buf)"
                    .format(cl_arg_idx=cl_arg_idx)))
                cl_arg_idx += 1

            fp_arg_count += 2

        elif isinstance(idi.dtype, NumpyType):
            if idi.dtype.dtype.kind == "f":
                fp_arg_count += 1

            # The dtype's one-character code is used as the pack format.
            gen(S(
                "_lpy_knl.set_arg(%d, _lpy_pack('%s', %s))"
                % (cl_arg_idx, idi.dtype.dtype.char, idi.name)))

            cl_arg_idx += 1

        else:
            raise LoopyError("do not know how to pass argument of type '%s'"
                    % idi.dtype)

        gen(Line())

        gen(Comment("}}}"))
        gen(Line())

    return Suite(result), arg_idx_to_cl_arg_idx, cl_arg_idx
def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
    """Return a plain assignment that binds *name* to *val_str*.

    *codegen_state*, *dtype* and *is_const* are not consulted.

    :returns: a :class:`genpy.Assign`.
    """
    import genpy

    initializer = genpy.Assign(name, val_str)
    return initializer
def get_temporary_decls(self, codegen_state, schedule_index):
    """Build the host-side statements that allocate global temporaries.

    Temporaries that name a ``base_storage`` share one allocation per
    storage name, sized to the largest ``nbytes`` among its users, and
    are bound to that storage variable.  All other global temporaries
    get an allocation of their own.  ``_global_temporaries`` is bound to
    the list of variables that were actually allocated (storage names and
    stand-alone temporaries, not the aliases).

    :arg codegen_state: passed to ``get_expression_to_code_mapper`` and
        ``_get_global_temporaries``.
    :arg schedule_index: not consulted here.
    :returns: a list of :mod:`genpy` nodes, empty when there are no
        global temporaries.
    """
    from collections import defaultdict
    from numbers import Number

    from genpy import Assign, Comment, Line
    import pymbolic.primitives as prim
    from pymbolic.mapper.stringifier import PREC_NONE

    ecm = self.get_expression_to_code_mapper(codegen_state)

    global_temporaries = self._get_global_temporaries(codegen_state)
    if not global_temporaries:
        return []

    # {{{ allocate space for the base_storage

    base_storage_sizes = defaultdict(set)

    for tv in global_temporaries:
        if tv.base_storage:
            base_storage_sizes[tv.base_storage].add(tv.nbytes)

    # }}}

    allocated_var_names = []
    code_lines = [
        Line(),
        Comment("{{{ allocate global temporaries"),
        Line(),
    ]

    for name, sizes in base_storage_sizes.items():
        if all(isinstance(s, Number) for s in sizes):
            # All sizes are concrete numbers: pick the maximum right away.
            size = max(sizes)
        else:
            # At least one size is not a plain number, so the maximum is
            # left as an expression for the code mapper to render.
            size = prim.Max(tuple(sizes))

        allocated_var_names.append(name)
        code_lines.append(
            Assign(name, f"allocator({ecm(size, PREC_NONE, 'i')})"))

    for tv in global_temporaries:
        if tv.base_storage:
            assert tv.base_storage in base_storage_sizes
            # Alias the shared storage rather than allocating again.
            code_lines.append(Assign(tv.name, tv.base_storage))
        else:
            nbytes_str = ecm(tv.nbytes, PREC_NONE, "i")
            allocated_var_names.append(tv.name)
            code_lines.append(
                Assign(tv.name, f"allocator({nbytes_str})"))

    code_lines.append(
        Assign(
            "_global_temporaries",
            "[{tvs}]".format(tvs=", ".join(allocated_var_names))))

    code_lines.append(Line())
    code_lines.append(Comment("}}}"))
    code_lines.append(Line())

    return code_lines