def compile(
        self, template_src, render_args=None, render_kwds=None, fast_math=False,
        compiler_options=None, constant_arrays=None, keep=False):
    """
    Renders the given template and builds a module object from the result.

    :param template_src: Mako template source to render.
    :param render_args: an iterable with positional arguments
        to pass to the template.
    :param render_kwds: a dictionary with keyword parameters
        to pass to the template.
    :param fast_math: whether to enable fast mathematical operations
        during compilation.
    :param compiler_options: a list of strings to be passed to the compiler
        as arguments.
    :param constant_arrays: (**CUDA only**) a dictionary ``{name: metadata}``
        of constant memory arrays to be declared in the compiled program.
        ``metadata`` can be either an array-like object
        (possessing ``shape`` and ``dtype`` attributes), or a pair ``(shape, dtype)``.
    :param keep: if `True`, preserve the source file being compiled
        and the accompanying binaries (if any).
        With PyCUDA backend, it is used as the ``keep`` option
        when creating ``SourceModule``.
        With PyOpenCL backend, it is used as the ``cache_dir`` option
        for ``Program.build()`` (and, additionally, the kernel source itself
        is put there).
    :returns: a :py:class:`Program` object.
    """
    rendered_src = render_template_source(
        template_src, render_args=render_args, render_kwds=render_kwds)

    # Everything except the rendered source is handed to Program unchanged.
    build_kwds = dict(
        fast_math=fast_math,
        compiler_options=compiler_options,
        constant_arrays=constant_arrays,
        keep=keep)
    return Program(self, rendered_src, **build_kwds)
def __init__(
        self, thr, template_src, name, global_size, local_size=None,
        render_args=None, render_kwds=None, fast_math=False,
        compiler_options=None, constant_arrays=None, keep=False):
    """__init__()""" # hide the signature from Sphinx

    # Renders ``template_src``, then compiles it together with the virtual size
    # functions and picks the attribute called ``name`` from the resulting program
    # as the kernel. ``fast_math``, ``compiler_options``, ``constant_arrays``
    # and ``keep`` are forwarded to ``Program`` as is.

    self._thr = thr

    # Missing render arguments are replaced by empty containers before rendering.
    rendered_src = render_template_source(
        template_src,
        render_args=[] if render_args is None else render_args,
        render_kwds={} if render_kwds is None else render_kwds)

    build_kwds = dict(
        static=True,
        fast_math=fast_math,
        compiler_options=compiler_options,
        constant_arrays=constant_arrays,
        keep=keep)

    # The virtual size functions need some registers themselves,
    # so they influence the maximum local size of the compiled kernel.
    # The device limit serves as the initial guess; the kernel is rebuilt
    # with a lower limit until the chosen local size fits.
    local_size_limit = thr.device_params.max_work_group_size

    fits = False
    while not fits:
        # Look for launch parameters under the current limit.
        # If none exist, OutOfResourcesError may be raised here;
        # it is deliberately left to propagate to the caller.
        sizes = VirtualSizes(
            thr.device_params, global_size,
            virtual_local_size=local_size,
            max_local_size=local_size_limit)

        # Build the kernel together with the matching virtual size functions.
        built_program = Program(
            self._thr, sizes.vsize_functions + rendered_src, **build_kwds)
        built_kernel = getattr(built_program, name)

        fits = built_kernel.max_work_group_size >= product(sizes.real_local_size)
        if not fits:
            # VirtualSizes guarantees product(real_local_size) <= local_size_limit,
            # and we have just seen that the kernel's own maximum is below
            # product(real_local_size). Hence the limit strictly decreases
            # on every retry.
            local_size_limit = built_kernel.max_work_group_size

    self._program = built_program
    self._kernel = built_kernel

    self.virtual_local_size = sizes.virtual_local_size
    self.virtual_global_size = sizes.virtual_global_size
    self.local_size = sizes.real_local_size
    self.global_size = sizes.real_global_size

    self._kernel.prepare(self.global_size, local_size=self.local_size)
def __init__(
        self, thr, template_src, name, global_size, local_size=None,
        render_args=None, render_kwds=None, fast_math=False,
        compiler_options=None, constant_arrays=None, keep=False):
    """__init__()""" # hide the signature from Sphinx

    # Parameters:
    #   thr              - the thread object; its ``device_params`` are used
    #                      to pick the launch sizes, and it is passed to ``Program``.
    #   template_src     - template source to render into the kernel code.
    #   name             - attribute name of the kernel in the compiled program.
    #   global_size      - the requested (virtual) global size.
    #   local_size       - the requested (virtual) local size, or None.
    #   render_args      - positional arguments for the template (None -> []).
    #   render_kwds      - keyword arguments for the template (None -> {}).
    #   fast_math, compiler_options, constant_arrays, keep
    #                    - forwarded to ``Program`` unchanged. The last three
    #                      are optional, with defaults that keep old call sites
    #                      working.

    self._thr = thr

    if render_args is None:
        render_args = []
    if render_kwds is None:
        render_kwds = {}

    main_src = render_template_source(
        template_src, render_args=render_args, render_kwds=render_kwds)

    # Since virtual size functions require some registers,
    # they affect the maximum local size.
    # Start from the device's max work group size as the first approximation
    # and recompile kernels with smaller local sizes until convergence.
    max_local_size = thr.device_params.max_work_group_size

    while True:
        # Try to find kernel launch parameters for the requested local size.
        # May raise OutOfResourcesError if it's not possible,
        # just let it pass to the caller.
        vs = VirtualSizes(
            thr.device_params, global_size,
            virtual_local_size=local_size,
            max_local_size=max_local_size)

        # Try to compile the kernel with the corresponding virtual size functions
        program = Program(
            self._thr, vs.vsize_functions + main_src,
            static=True, fast_math=fast_math,
            compiler_options=compiler_options,
            constant_arrays=constant_arrays,
            keep=keep)
        kernel = getattr(program, name)

        if kernel.max_work_group_size >= product(vs.real_local_size):
            # Kernel will execute with this local size, use it
            break

        # By the contract of VirtualSizes,
        # product(vs.real_local_size) <= max_local_size
        # Also, since we're still in this loop,
        # kernel.max_work_group_size < product(vs.real_local_size).
        # Therefore the new max_local_size value is guaranteed
        # to be smaller than the previous one.
        max_local_size = kernel.max_work_group_size

    self._program = program
    self._kernel = kernel

    self.virtual_local_size = vs.virtual_local_size
    self.virtual_global_size = vs.virtual_global_size
    self.local_size = vs.real_local_size
    self.global_size = vs.real_global_size

    self._kernel.prepare(self.global_size, local_size=self.local_size)
def compile(
        self, template_src, render_args=None, render_kwds=None, fast_math=False,
        compiler_options=None, constant_arrays=None, keep=False):
    """
    Creates a module object from the given template.

    :param template_src: Mako template source to render
    :param render_args: an iterable with positional arguments to pass to the template.
    :param render_kwds: a dictionary with keyword parameters to pass to the template.
    :param fast_math: whether to enable fast mathematical operations during compilation.
    :param compiler_options: a list of strings to be passed to the compiler as arguments.
    :param constant_arrays: (**CUDA only**) a dictionary ``{name: metadata}``
        of constant memory arrays to be declared in the compiled program.
        ``metadata`` can be either an array-like object
        (possessing ``shape`` and ``dtype`` attributes), or a pair ``(shape, dtype)``.
    :param keep: if `True`, preserve the source file being compiled
        and the accompanying binaries (if any).
        With PyCUDA backend, it is used as the ``keep`` option
        when creating ``SourceModule``.
        With PyOpenCL backend, it is used as the ``cache_dir`` option
        for ``Program.build()`` (and, additionally, the kernel source itself
        is put there).
    :returns: a :py:class:`Program` object.
    """
    src = render_template_source(
        template_src, render_args=render_args, render_kwds=render_kwds)
    # The build options are passed through to Program untouched; their defaults
    # match the previous behaviour of calling Program with ``fast_math`` only.
    return Program(
        self, src,
        fast_math=fast_math,
        compiler_options=compiler_options,
        constant_arrays=constant_arrays,
        keep=keep)
def compile(self, template_src, render_args=None, render_kwds=None,
        fast_math=False, compiler_options=None, constant_arrays=None, keep=False):
    """
    Builds a :py:class:`Program` out of a rendered Mako template.

    :param template_src: Mako template source to render.
    :param render_args: an iterable with positional arguments
        to pass to the template.
    :param render_kwds: a dictionary with keyword parameters
        to pass to the template.
    :param fast_math: whether to enable fast mathematical operations
        during compilation.
    :param compiler_options: a list of strings to be passed to the compiler
        as arguments.
    :param constant_arrays: (**CUDA only**) a dictionary ``{name: metadata}``
        of constant memory arrays to be declared in the compiled program.
        ``metadata`` can be either an array-like object
        (possessing ``shape`` and ``dtype`` attributes), or a pair ``(shape, dtype)``.
    :param keep: if `True`, preserve the source file being compiled
        and the accompanying binaries (if any).
        With PyCUDA backend, it is used as the ``keep`` option
        when creating ``SourceModule``.
        With PyOpenCL backend, it is used as the ``cache_dir`` option
        for ``Program.build()`` (and, additionally, the kernel source itself
        is put there).
    :returns: a :py:class:`Program` object.
    """
    # Step 1: turn the template into plain source text.
    template_kwds = dict(render_args=render_args, render_kwds=render_kwds)
    program_src = render_template_source(template_src, **template_kwds)

    # Step 2: compile it, forwarding the build options unchanged.
    return Program(
        self,
        program_src,
        fast_math=fast_math,
        compiler_options=compiler_options,
        constant_arrays=constant_arrays,
        keep=keep,
    )