Ejemplo n.º 1
0
    def combine(dtypes):
        """Determine the result type of an operation combining *dtypes*.

        :arg dtypes: An iterable (possibly a generator expression) of
            ``LoopyType`` instances.
        :returns: If any entry is not a ``NumpyType``, the single type shared
            by all entries. Otherwise a ``NumpyType`` wrapping the dtype
            obtained by pairwise combination, starting from the last entry.
        :raises TypeInferenceFailure: If non-numpy types are involved and the
            entries are not single-valued, or if two different structured
            dtypes are combined.
        """
        # dtypes may just be a generator expr
        dtypes = list(dtypes)

        from loopy.types import LoopyType, NumpyType
        assert all(isinstance(dtype, LoopyType) for dtype in dtypes)

        if not all(isinstance(dtype, NumpyType) for dtype in dtypes):
            from pytools import is_single_valued, single_valued
            if not is_single_valued(dtypes):
                raise TypeInferenceFailure(
                        "Nothing known about operations between '%s'"
                        % ", ".join(str(dt) for dt in dtypes))

            return single_valued(dtypes)

        # From here on, work with the underlying numpy dtypes.
        dtypes = [dtype.dtype for dtype in dtypes]

        result = dtypes.pop()
        while dtypes:
            other = dtypes.pop()

            if result.fields is None and other.fields is None:
                if (result, other) in [
                        (np.int32, np.float32), (np.float32, np.int32)]:
                    # numpy makes this a double. I disagree.
                    result = np.dtype(np.float32)
                else:
                    # Let numpy's own promotion rules decide.
                    result = (
                            np.empty(0, dtype=result)
                            + np.empty(0, dtype=other)
                            ).dtype

            elif result.fields is None and other.fields is not None:
                # assume the non-native type takes over
                # (This is used for vector types.)
                result = other
            elif result.fields is not None and other.fields is None:
                # assume the non-native type takes over
                # (This is used for vector types.)
                pass
            else:
                # Compare by value, not identity: two separately constructed
                # but equal structured dtypes are distinct objects and must
                # not be rejected.
                if result != other:
                    raise TypeInferenceFailure(
                            "nothing known about result of operation on "
                            "'%s' and '%s'" % (result, other))

        return NumpyType(result)
Ejemplo n.º 2
0
    def __init__(self, basis=None, metric_matrix=None):
        """
        :arg basis: A sequence of names of basis vectors, or an integer (the
            number of dimensions) to use the default names ``e0`` through ``eN``.
            If *None*, the dimension is taken from ``metric_matrix.shape[0]``.
        :arg metric_matrix: See :attr:`metric_matrix`.
            If *None*, the Euclidean metric is assumed.
        :raises TypeError: if neither *basis* nor *metric_matrix* is passed.
        :raises ValueError: if *metric_matrix* is not two-dimensional with
            every extent equal to the number of basis vectors.
        """

        if basis is None and metric_matrix is None:
            raise TypeError("at least one of 'basis' and 'metric_matrix' "
                    "must be passed")

        if basis is None:
            basis = int(metric_matrix.shape[0])

        from numbers import Integral
        if isinstance(basis, Integral):
            basis = ["e%d" % i for i in range(basis)]

        if metric_matrix is None:
            # Use the builtin ``object``: the ``np.object`` alias was
            # deprecated in NumPy 1.20 and removed in NumPy 1.24.
            metric_matrix = np.eye(len(basis), dtype=object)

        # The builtin all() is used here; no compatibility import is needed.
        if not (
                len(metric_matrix.shape) == 2
                and
                all(dim == len(basis) for dim in metric_matrix.shape)):
            raise ValueError("metric_matrix has the wrong shape")

        self.basis_names = basis
        self.metric_matrix = metric_matrix
Ejemplo n.º 3
0
    def get_next_step(self, available_names, done_insns):
        """Choose the next instruction to run and find discardable variables.

        :arg available_names: names of the variables currently available;
            an instruction is eligible once all its dependency names are in
            this collection.
        :arg done_insns: instructions that have already been executed.
        :returns: a tuple ``(insn, discardable_vars)``. *insn* is picked by
            :func:`pytools.argmax2` from the eligible instructions keyed by
            their ``priority``. *discardable_vars* is the set of available
            names that no remaining instruction depends on and that do not
            occur in ``self.result``.
        :raises self.NoInstructionAvailable: if no instruction is eligible.
        """
        from pytools import argmax2

        # The builtin all() is sufficient; pytools' alias is not needed.
        available_insns = [(insn, insn.priority) for insn in self.instructions
                           if insn not in done_insns and all(
                               dep.name in available_names
                               for dep in insn.get_dependencies())]

        if not available_insns:
            raise self.NoInstructionAvailable

        # Names still needed by instructions that have not run yet.
        needed_vars = set(
            dep.name
            for insn in self.instructions if insn not in done_insns
            for dep in insn.get_dependencies())
        discardable_vars = set(available_names) - needed_vars

        # {{{ make sure results do not get discarded

        dm = mappers.DependencyMapper(composite_leaves=False)

        def remove_result_variable(result_expr):
            # The extra dependency mapper run is necessary
            # because, for instance, subscripts can make it
            # into the result expression, which then does
            # not consist of just variables.

            for var in dm(result_expr):
                assert isinstance(var, Variable)
                discardable_vars.discard(var.name)

        obj_array_vectorize(remove_result_variable, self.result)

        # }}}

        return argmax2(available_insns), discardable_vars
Ejemplo n.º 4
0
def _get_reduction_source(
        ctx, out_type, out_type_size,
        neutral, reduce_expr, map_expr, parsed_args,
        name="reduce_kernel", preamble="", arg_prep="",
        device=None, max_group_size=None):
    """Render the reduction kernel source and pick its work-group size.

    :arg ctx: context whose ``devices`` are targeted when *device* is *None*.
    :arg out_type: passed to the kernel template as ``out_type``.
    :arg out_type_size: divisor used when limiting the work-group size.
    :arg neutral: passed to the kernel template as ``neutral``.
    :arg reduce_expr: reduction expression, run through
        ``_process_code_for_macro`` before rendering.
    :arg map_expr: map expression, run through ``_process_code_for_macro``
        before rendering.
    :arg parsed_args: argument objects; each one's ``declarator()`` is joined
        into the kernel argument list.
    :arg name: kernel name passed to the template.
    :arg preamble: passed to the template as ``preamble``.
    :arg arg_prep: passed to the template as ``arg_prep``.
    :arg device: if given, the only device considered.
    :arg max_group_size: optional upper bound on the work-group size.
    :returns: a ``ReductionInfo`` record with attributes ``context``,
        ``source`` and ``group_size``.
    """

    if device is not None:
        devices = [device]
    else:
        devices = ctx.devices

    # {{{ compute group size

    def get_dev_group_size(device):
        # dirty fix for the RV770 boards
        max_work_group_size = device.max_work_group_size
        if "RV770" in device.name:
            max_work_group_size = 64

        # compute lmem limit
        # NOTE(review): this divides the work-group size (not the device's
        # local memory size) by out_type_size -- confirm this is intended.
        from pytools import div_ceil
        lmem_wg_size = div_ceil(max_work_group_size, out_type_size)
        result = min(max_work_group_size, lmem_wg_size)

        # round down to power of 2
        from pyopencl.tools import bitlog2
        return 2**bitlog2(result)

    # The smallest per-device size is usable on every device.
    group_size = min(get_dev_group_size(dev) for dev in devices)

    if max_group_size is not None:
        group_size = min(max_group_size, group_size)

    # }}}

    from mako.template import Template
    from pyopencl.characterize import has_double_support
    src = str(Template(KERNEL).render(
        out_type=out_type,
        arguments=", ".join(arg.declarator() for arg in parsed_args),
        group_size=group_size,
        neutral=neutral,
        reduce_expr=_process_code_for_macro(reduce_expr),
        map_expr=_process_code_for_macro(map_expr),
        name=name,
        preamble=preamble,
        arg_prep=arg_prep,
        # Builtin all(): double support is only reported if every device
        # has it.
        double_support=all(has_double_support(dev) for dev in devices),
        ))

    from pytools import Record

    class ReductionInfo(Record):
        pass

    return ReductionInfo(
            context=ctx,
            source=src,
            group_size=group_size)
Ejemplo n.º 5
0
    def __init__(self, basis=None, metric_matrix=None):
        """
        :arg basis: A sequence of names of basis vectors, or an integer (the
            number of dimensions) to use the default names ``e0`` through ``eN``.
            If *None*, the dimension is taken from ``metric_matrix.shape[0]``.
        :arg metric_matrix: See :attr:`metric_matrix`.
            If *None*, the Euclidean metric is assumed.
        :raises TypeError: if neither *basis* nor *metric_matrix* is passed.
        :raises ValueError: if *metric_matrix* is not two-dimensional with
            every extent equal to the number of basis vectors.
        """

        if basis is None and metric_matrix is None:
            raise TypeError("at least one of 'basis' and 'metric_matrix' "
                            "must be passed")

        if basis is None:
            basis = int(metric_matrix.shape[0])

        from numbers import Integral
        if isinstance(basis, Integral):
            basis = ["e%d" % i for i in range(basis)]

        if metric_matrix is None:
            # ``np.object`` was deprecated in NumPy 1.20 and removed in 1.24;
            # the builtin ``object`` is the supported spelling.
            metric_matrix = np.eye(len(basis), dtype=object)

        # Uses the builtin all() rather than a compatibility re-export.
        if not (len(metric_matrix.shape) == 2
                and all(dim == len(basis) for dim in metric_matrix.shape)):
            raise ValueError("metric_matrix has the wrong shape")

        self.basis_names = basis
        self.metric_matrix = metric_matrix
Ejemplo n.º 6
0
    def get_expr_dataset(self, expression, description=None, unit=None):
        """Prepare a time-series dataset for a given expression.

        @arg expression: A C{pymbolic} expression that may involve
          the time-series variables and the constants in this :class:`LogManager`.
          If there is data from multiple ranks for a quantity occurring in
          this expression, an aggregator may have to be specified.
        @arg description: Description to return; defaults to C{expression}.
        @arg unit: Unit to return. If C{None}, it is derived by substituting
          the parsed units of the involved quantities into the expression,
          provided every one of them has a unit; otherwise it stays C{None}.
        @return: C{(description, unit, table)}, where C{table}
          is a list of tuples C{(tick_nbr, value)}. Ticks at which evaluation
          raises C{ZeroDivisionError} are left out.

        Aggregators are specified as follows:
        - C{qty.min}, C{qty.max}, C{qty.avg}, C{qty.sum}, C{qty.norm2}
        - C{qty[rank_nbr]}
        - C{qty.loc}
        """

        parsed = self._parse_expr(expression)
        parsed, dep_data = self._get_expr_dep_data(parsed)

        # aggregate table data
        for dd in dep_data:
            table = self.get_table(dd.name)
            table.sort(["step"])
            dd.table = table.aggregated(["step"], "value", dd.agg_func).data

        # evaluate unit and description, if necessary
        if unit is None:
            from pymbolic import substitute, parse

            unit_dict = dict((dd.varname, dd.qdat.unit) for dd in dep_data)
            # Builtin all(): only derive a unit when every quantity has one.
            if all(v is not None for v in six.itervalues(unit_dict)):
                unit_dict = dict(
                    (k, parse(v)) for k, v in six.iteritems(unit_dict))
                unit = substitute(parsed, unit_dict)

        if description is None:
            description = expression

        # compile and evaluate
        from pymbolic import compile
        compiled = compile(parsed, [dd.varname for dd in dep_data])

        data = []

        for key, values in _join_by_first_of_tuple(dd.table
                                                   for dd in dep_data):
            try:
                data.append((key, compiled(*values)))
            except ZeroDivisionError:
                # Deliberately skip ticks where the expression is undefined.
                pass

        return (description, unit, data)
Ejemplo n.º 7
0
    def get_expr_dataset(self, expression, description=None, unit=None):
        """Prepare a time-series dataset for a given expression.

        @arg expression: A C{pymbolic} expression that may involve
          the time-series variables and the constants in this :class:`LogManager`.
          If there is data from multiple ranks for a quantity occurring in
          this expression, an aggregator may have to be specified.
        @arg description: Description to return; if C{None}, the
          C{expression} itself is used.
        @arg unit: Unit to return. When C{None}, a unit expression is built
          from the units of all involved quantities if each of them has one;
          otherwise C{None} is returned as the unit.
        @return: C{(description, unit, table)}, where C{table}
          is a list of tuples C{(tick_nbr, value)}. Entries whose evaluation
          raises C{ZeroDivisionError} are omitted.

        Aggregators are specified as follows:
        - C{qty.min}, C{qty.max}, C{qty.avg}, C{qty.sum}, C{qty.norm2}
        - C{qty[rank_nbr]}
        - C{qty.loc}
        """

        parsed = self._parse_expr(expression)
        parsed, dep_data = self._get_expr_dep_data(parsed)

        # aggregate table data
        for dd in dep_data:
            table = self.get_table(dd.name)
            table.sort(["step"])
            dd.table = table.aggregated(["step"], "value", dd.agg_func).data

        # evaluate unit and description, if necessary
        if unit is None:
            from pymbolic import substitute, parse

            unit_dict = dict((dd.varname, dd.qdat.unit) for dd in dep_data)
            # The builtin all() is used; a missing unit leaves ``unit`` None.
            if all(v is not None for v in six.itervalues(unit_dict)):
                unit_dict = dict((k, parse(v)) for k, v in six.iteritems(unit_dict))
                unit = substitute(parsed, unit_dict)

        if description is None:
            description = expression

        # compile and evaluate
        from pymbolic import compile
        compiled = compile(parsed, [dd.varname for dd in dep_data])

        data = []

        for key, values in _join_by_first_of_tuple(dd.table for dd in dep_data):
            try:
                data.append((key, compiled(*values)))
            except ZeroDivisionError:
                # Best effort: drop points where the expression divides by 0.
                pass

        return (description, unit, data)
Ejemplo n.º 8
0
    def map_sum(self, expr):
        """Infer the type of a sum from the types of its children.

        Children that are integers of magnitude below 1024 have their types
        set aside; those types only take part in the combination when every
        other child type has kind ``"i"``.

        :arg expr: expression whose ``children`` are the summands.
        :returns: the result of ``self.combine`` on the selected types.
        """
        dtypes = []
        small_integer_dtypes = []
        for child in expr.children:
            dtype = self.rec(child)
            if is_integer(child) and abs(child) < 1024:
                small_integer_dtypes.append(dtype)
            else:
                dtypes.append(dtype)

        # Builtin all(); no pytools compatibility alias is required.
        if all(dtype.kind == "i" for dtype in dtypes):
            dtypes.extend(small_integer_dtypes)

        return self.combine(dtypes)
Ejemplo n.º 9
0
    def get_contained_fluxes(self, expr):
        """Return one ``self.FluxRecord`` per flux operator found in *expr*.

        The operators are gathered with ``FluxCollector``. Each record holds
        the operator itself (``flux_expr``), the union of its interior and
        boundary dependencies (``dependencies``) and its ``repr_op()``.

        :arg expr: the expression searched for flux operators.
        :returns: a list of ``self.FluxRecord`` instances.
        """
        from hedge.optemplate.mappers import FluxCollector
        contained_flux_ops = FluxCollector()(expr)

        from hedge.optemplate.operators import WholeDomainFluxOperator
        # Builtin all() replaces the redundant pytools re-export.
        assert all(isinstance(op, WholeDomainFluxOperator)
                for op in contained_flux_ops), \
                        "not all flux operators were of the expected type"

        return [self.FluxRecord(
            flux_expr=wdflux,
            dependencies=set(wdflux.interior_deps) | set(wdflux.boundary_deps),
            repr_op=wdflux.repr_op())
            for wdflux in contained_flux_ops]
Ejemplo n.º 10
0
    def get_contained_fluxes(self, expr):
        """Collect the flux operators in *expr* and wrap them in records.

        :arg expr: the expression passed to ``FluxCollector``.
        :returns: a list with one ``self.FluxRecord`` per collected operator,
            carrying the operator, the union of its ``interior_deps`` and
            ``boundary_deps``, and the value of its ``repr_op()``.
        """
        from hedge.optemplate.mappers import FluxCollector
        contained_flux_ops = FluxCollector()(expr)

        from hedge.optemplate.operators import WholeDomainFluxOperator
        # The builtin all() is used here instead of pytools' alias.
        assert all(isinstance(op, WholeDomainFluxOperator)
                for op in contained_flux_ops), \
                        "not all flux operators were of the expected type"

        return [
            self.FluxRecord(flux_expr=wdflux,
                            dependencies=set(wdflux.interior_deps)
                            | set(wdflux.boundary_deps),
                            repr_op=wdflux.repr_op())
            for wdflux in contained_flux_ops
        ]
Ejemplo n.º 11
0
    def get_next_step(self, available_names, done_insns):
        """Choose the next instruction to run and find discardable variables.

        :arg available_names: names of the variables currently available;
            an instruction is eligible once all its dependency names are in
            this collection.
        :arg done_insns: instructions that have already been executed.
        :returns: a tuple ``(insn, discardable_vars)``. *insn* is picked by
            :func:`pytools.argmax2` from the eligible instructions keyed by
            their ``priority``. *discardable_vars* is the set of available
            names that no remaining instruction depends on and that do not
            occur in ``self.result``.
        :raises self.NoInstructionAvailable: if no instruction is eligible.
        """
        from pytools import argmax2

        # Builtin all(); the pytools alias of the same name is redundant.
        available_insns = [
                (insn, insn.priority) for insn in self.instructions
                if insn not in done_insns
                and all(dep.name in available_names
                    for dep in insn.get_dependencies())]

        if not available_insns:
            raise self.NoInstructionAvailable

        # Names still required by instructions that have not run yet.
        needed_vars = set([
            dep.name
            for insn in self.instructions
            if insn not in done_insns
            for dep in insn.get_dependencies()
            ])
        discardable_vars = set(available_names) - needed_vars

        # {{{ make sure results do not get discarded
        from pytools.obj_array import with_object_array_or_scalar

        from pymbolic.primitives import Variable
        from pytential.symbolic.mappers import DependencyMapper
        dm = DependencyMapper(composite_leaves=False)

        def remove_result_variable(result_expr):
            # The extra dependency mapper run is necessary
            # because, for instance, subscripts can make it
            # into the result expression, which then does
            # not consist of just variables.

            for var in dm(result_expr):
                assert isinstance(var, Variable)
                discardable_vars.discard(var.name)

        with_object_array_or_scalar(remove_result_variable, self.result)
        # }}}

        return argmax2(available_insns), discardable_vars
Ejemplo n.º 12
0
    def get_next_step(self, available_names, done_insns):
        """Select the next runnable instruction and the droppable variables.

        :arg available_names: names of variables that are available now.
        :arg done_insns: instructions that were already run.
        :returns: ``(insn, discardable_vars)``: the instruction chosen by
            :func:`pytools.argmax2` over ``(insn, insn.priority)`` pairs of
            all instructions whose dependencies are available, and the set of
            available names neither needed by a pending instruction nor
            referenced by ``self.result``.
        :raises self.NoInstructionAvailable: if nothing can be run.
        """
        from pytools import argmax2

        # all() below is the builtin; importing it from pytools is redundant.
        available_insns = [(insn, insn.priority) for insn in self.instructions
                           if insn not in done_insns and all(
                               dep.name in available_names
                               for dep in insn.get_dependencies())]

        if not available_insns:
            raise self.NoInstructionAvailable

        # Dependencies of everything that has not run yet must be kept.
        needed_vars = set([
            dep.name for insn in self.instructions if insn not in done_insns
            for dep in insn.get_dependencies()
        ])
        discardable_vars = set(available_names) - needed_vars

        # {{{ make sure results do not get discarded
        from pytools.obj_array import with_object_array_or_scalar

        from pymbolic.primitives import Variable
        from pytential.symbolic.mappers import DependencyMapper
        dm = DependencyMapper(composite_leaves=False)

        def remove_result_variable(result_expr):
            # The extra dependency mapper run is necessary
            # because, for instance, subscripts can make it
            # into the result expression, which then does
            # not consist of just variables.

            for var in dm(result_expr):
                assert isinstance(var, Variable)
                discardable_vars.discard(var.name)

        with_object_array_or_scalar(remove_result_variable, self.result)
        # }}}

        return argmax2(available_insns), discardable_vars
Ejemplo n.º 13
0
        def __init__(self,
                     ctx,
                     dtype,
                     scan_expr,
                     neutral=None,
                     name_prefix="scan",
                     options=None,
                     preamble="",
                     devices=None):
            """Build the two kernels used by this scan.

            :arg ctx: context the programs are built in; also supplies the
                device list when *devices* is *None*.
            :arg dtype: anything accepted by :class:`numpy.dtype`; the
                element type of the scan.
            :arg scan_expr: passed to the kernel templates as ``scan_expr``.
            :arg neutral: passed to the templates as ``neutral``; required
                when *self* is an ``ExclusiveScanKernel``.
            :arg name_prefix: prefix of the generated kernel names.
            :arg options: build options handed to ``Program.build``;
                *None* (the default) means an empty list.
            :arg preamble: passed to the templates as ``preamble``.
            :arg devices: devices to tune for; defaults to ``ctx.devices``.
            :raises ValueError: if an exclusive scan is requested without a
                neutral element.
            """

            if isinstance(self, ExclusiveScanKernel) and neutral is None:
                raise ValueError(
                    "neutral element is required for exclusive scan")

            # Avoid a shared mutable default argument.
            if options is None:
                options = []

            self.context = ctx
            dtype = self.dtype = np.dtype(dtype)
            self.neutral = neutral

            if devices is None:
                devices = ctx.devices
            self.devices = devices

            max_wg_size = min(dev.max_work_group_size for dev in self.devices)

            # Thrust says these are good for GT200
            self.scan_wg_size = min(max_wg_size, 128)
            self.update_wg_size = min(max_wg_size, 256)

            if self.scan_wg_size < 16:
                # Hello, Apple CPU. Nice to see you.
                self.scan_wg_seq_batches = 128  # FIXME: guesswork
            else:
                self.scan_wg_seq_batches = 6

            from pyopencl.characterize import has_double_support

            # Template parameters shared by both kernels. The builtin all()
            # reports double support only if every device has it.
            kw_values = dict(preamble=preamble,
                             name_prefix=name_prefix,
                             scan_type=dtype_to_ctype(dtype),
                             scan_expr=scan_expr,
                             neutral=neutral,
                             double_support=all(
                                 has_double_support(dev) for dev in devices))

            scan_intervals_src = str(
                SCAN_INTERVALS_SOURCE.render(
                    wg_size=self.scan_wg_size,
                    wg_seq_batches=self.scan_wg_seq_batches,
                    **kw_values))
            scan_intervals_prg = cl.Program(ctx,
                                            scan_intervals_src).build(options)
            self.scan_intervals_knl = getattr(scan_intervals_prg,
                                              name_prefix + "_scan_intervals")
            self.scan_intervals_knl.set_scalar_arg_dtypes(
                (None, np.uint32, np.uint32, None, None))

            final_update_src = str(
                self.final_update_tp.render(wg_size=self.update_wg_size,
                                            **kw_values))

            final_update_prg = cl.Program(self.context,
                                          final_update_src).build(options)
            self.final_update_knl = getattr(final_update_prg,
                                            name_prefix + "_final_update")
            self.final_update_knl.set_scalar_arg_dtypes(
                (None, np.uint32, np.uint32, None))
Ejemplo n.º 14
0
def find_temporary_scope(kernel):
    """Assign a scope to every temporary whose scope is still ``lp.auto``.

    For each writing instruction, the desired scope is derived from the
    local- and group-parallel inames of the instruction versus those used in
    the assignee indices. All writers must agree on the resulting scope.
    Temporaries with an explicit scope are passed through unchanged.
    Temporaries that are never written and have no initializer are dropped
    with an advisory.

    :arg kernel: the kernel to process.
    :returns: a copy of *kernel* with updated ``temporary_variables``.
    :raises LoopyError: if a never-written temporary has an initializer, or
        if the writers of a temporary disagree on its scope.
    """
    logger.debug("%s: mark local temporaries" % kernel.name)

    new_temp_vars = {}
    from loopy.kernel.data import (LocalIndexTagBase, GroupIndexTag,
            temp_var_scope)
    import loopy as lp

    writers = kernel.writer_map()

    for temp_var in six.itervalues(kernel.temporary_variables):
        # Only fill out for variables that do not yet know if they're
        # local. (I.e. those generated by implicit temporary generation.)

        if temp_var.scope is not lp.auto:
            new_temp_vars[temp_var.name] = temp_var
            continue

        my_writers = writers.get(temp_var.name, [])

        desired_scope_per_insn = []
        for insn_id in my_writers:
            insn = kernel.id_to_insn[insn_id]

            # A write race will emerge if:
            #
            # - the variable is local
            #   and
            # - the instruction is run across more inames (locally) parallel
            #   than are reflected in the assignee indices.

            locparallel_compute_inames = _get_compute_inames_tagged(
                    kernel, insn, LocalIndexTagBase)

            locparallel_assignee_inames = _get_assignee_inames_tagged(
                    kernel, insn, LocalIndexTagBase, temp_var.name)

            grpparallel_compute_inames = _get_compute_inames_tagged(
                    kernel, insn, GroupIndexTag)

            grpparallel_assignee_inames = _get_assignee_inames_tagged(
                    kernel, insn, GroupIndexTag, temp_var.name)

            assert locparallel_assignee_inames <= locparallel_compute_inames
            assert grpparallel_assignee_inames <= grpparallel_compute_inames

            desired_scope = temp_var_scope.PRIVATE
            for iname_descr, scope_descr, apin, cpin, scope in [
                    ("local", "local", locparallel_assignee_inames,
                        locparallel_compute_inames, temp_var_scope.LOCAL),
                    ("group", "global", grpparallel_assignee_inames,
                        grpparallel_compute_inames, temp_var_scope.GLOBAL),
                    ]:

                # Test the assignee inames of the kind currently being
                # examined (apin), not always the local ones, and name the
                # warning after the scope it is about.
                if apin != cpin and bool(apin):
                    warn_with_kernel(
                            kernel,
                            "write_race_%s(%s)" % (scope_descr, insn_id),
                            "instruction '%s' looks invalid: "
                            "it assigns to indices based on %s IDs, but "
                            "its temporary '%s' cannot be made %s because "
                            "a write race across the iname(s) '%s' would emerge. "
                            "(Do you need to add an extra iname to your prefetch?)"
                            % (insn_id, iname_descr, temp_var.name, scope_descr,
                                ", ".join(cpin - apin)),
                            WriteRaceConditionWarning)

                if (apin == cpin

                        # doesn't want to be in this scope if there aren't any
                        # parallel inames of that kind:
                        and bool(cpin)):
                    desired_scope = max(desired_scope, scope)
                    break

            desired_scope_per_insn.append(desired_scope)

        if not desired_scope_per_insn:
            if temp_var.initializer is None:
                warn_with_kernel(kernel, "temp_to_write(%s)" % temp_var.name,
                        "temporary variable '%s' never written, eliminating"
                        % temp_var.name, LoopyAdvisory)
            else:
                raise LoopyError("temporary variable '%s': never written, "
                        "cannot automatically determine scope"
                        % temp_var.name)

            # Not added to new_temp_vars, i.e. dropped from the kernel.
            continue

        overall_scope = max(desired_scope_per_insn)

        # Builtin all(): every writer must ask for the same scope.
        if not all(iscope == overall_scope for iscope in desired_scope_per_insn):
            raise LoopyError("not all instructions agree on "
                    "the desired scope (private/local/global) of the "
                    "temporary '%s'" % temp_var.name)

        new_temp_vars[temp_var.name] = temp_var.copy(scope=overall_scope)

    return kernel.copy(temporary_variables=new_temp_vars)
Ejemplo n.º 15
0
 def map_logical_and(self, expr):
     """Return whether every child of *expr* evaluates to a true value.

     Children are evaluated lazily through ``self.rec``; evaluation stops at
     the first false result. An expression without children yields ``True``.
     """
     # Builtin all(); importing the same name from pytools is redundant.
     return all(self.rec(ch) for ch in expr.children)
Ejemplo n.º 16
0
 def map_logical_and(self, expr):
     """Evaluate a logical conjunction over the children of *expr*.

     Each child is mapped with ``self.rec``; the result is ``True`` only if
     all mapped values are truthy (and ``True`` for no children at all).
     """
     # Uses the builtin all() instead of the pytools re-export.
     return all(self.rec(ch) for ch in expr.children)
Ejemplo n.º 17
0
def get_reduction_source(
         ctx, out_type, out_type_size,
         neutral, reduce_expr, map_expr, arguments,
         name="reduce_kernel", preamble="",
         device=None, max_group_size=None):
    """Render the reduction kernel source and pick its work-group size.

    :arg ctx: context whose ``devices`` are targeted when *device* is *None*.
    :arg out_type: passed to the kernel template as ``out_type``.
    :arg out_type_size: size of one output element, used to bound the group
        size by the device's local memory size.
    :arg neutral: passed to the template as ``neutral``.
    :arg reduce_expr: passed to the template as ``reduce_expr``.
    :arg map_expr: passed to the template as ``map_expr``.
    :arg arguments: passed to the template as ``arguments``.
    :arg name: kernel name passed to the template.
    :arg preamble: passed to the template as ``preamble``.
    :arg device: if given, the only device considered.
    :arg max_group_size: optional upper bound on the work-group size.
    :returns: a ``ReductionInfo`` record with attributes ``context``,
        ``source`` and ``group_size``.
    """

    if device is not None:
        devices = [device]
    else:
        devices = ctx.devices

    # {{{ compute group size

    def get_dev_group_size(device):
        return min(
                device.max_work_group_size,
                (device.local_mem_size + out_type_size - 1)
                // out_type_size)

    group_size = min(
            get_dev_group_size(dev) for dev in devices)

    if max_group_size is not None:
        group_size = min(max_group_size, group_size)

    # }}}

    # {{{ compute synchronization-less group size

    def get_dev_no_sync_size(device):
        try:
            return device.warp_size_nv
        except Exception:
            # The attribute query failed; fall back to 1. Only ordinary
            # errors are caught so that KeyboardInterrupt/SystemExit still
            # propagate.
            if "nvidia" in device.vendor.lower():
                from warnings import warn
                warn("Reduction might be unnecessarily slow: "
                        "can't query warp size on Nvidia device")
            return 1

    no_sync_size = min(get_dev_no_sync_size(dev) for dev in devices)

    # }}}

    from mako.template import Template
    from pyopencl.characterize import has_double_support
    src = str(Template(KERNEL).render(
        out_type=out_type,
        arguments=arguments,
        group_size=group_size,
        no_sync_size=no_sync_size,
        neutral=neutral,
        reduce_expr=reduce_expr,
        map_expr=map_expr,
        name=name,
        preamble=preamble,
        # Builtin all(): true only if every device supports doubles.
        double_support=all(
            has_double_support(dev) for dev in devices)
        ))

    from pytools import Record

    class ReductionInfo(Record):
        pass

    return ReductionInfo(
            context=ctx,
            source=src,
            group_size=group_size)
Ejemplo n.º 18
0
def make_ref_args(kernel, impl_arg_info, queue, parameters):
    """Create randomly filled arguments for the reference kernel run.

    :arg kernel: kernel the arguments belong to; used to look up the
        kernel-level argument and the set of written variables.
    :arg impl_arg_info: implemented-argument descriptions to iterate over.
    :arg queue: command queue used to allocate device arrays.
    :arg parameters: mapping from parameter name to value, used for value
        arguments and for evaluating shapes and strides.
    :returns: a tuple ``(ref_args, ref_arg_data)``. *ref_args* maps argument
        names to the values/arrays/images to pass. *ref_arg_data* holds
        *None* for each value argument and a ``TestArgInfo`` for each array
        or image argument.
    :raises LoopyError: for arrays without a fully known shape, arrays with
        unknown dtype, written-to images, or unknown argument classes.
    """
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, TemporaryVariable

    from pymbolic import evaluate

    ref_args = {}
    ref_arg_data = []

    for arg in impl_arg_info:
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            if arg.offset_for_name:
                continue

            arg_value = parameters[arg.name]

            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            # Coerce plain Python values (or mismatched dtypes) to the
            # argument's numpy scalar type.
            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            ref_args[arg.name] = arg_value

            ref_arg_data.append(None)

        elif arg.arg_class is GlobalArg or arg.arg_class is ImageArg:
            if arg.shape is None or any(saxis is None for saxis in arg.shape):
                raise LoopyError("array '%s' needs known shape to use automatic "
                        "testing" % arg.name)

            shape = evaluate_shape(arg.unvec_shape, parameters)
            dtype = kernel_arg.dtype

            is_output = arg.base_name in kernel.get_written_variables()

            if arg.arg_class is ImageArg:
                storage_array = ary = cl_array.empty(
                        queue, shape, dtype, order="C")
                numpy_strides = None
                alloc_size = None
                strides = None
            else:
                strides = evaluate(arg.unvec_strides, parameters)

                # Builtin all(), as in make_args.
                assert all(s > 0 for s in strides)
                # Number of elements spanned by the strided layout.
                alloc_size = sum(astrd*(alen-1)
                        for alen, astrd in zip(shape, strides)) + 1

                if dtype is None:
                    raise LoopyError("dtype for argument '%s' is not yet "
                            "known. Perhaps you want to use "
                            "loopy.add_dtypes "
                            "or loopy.infer_argument_dtypes?"
                            % arg.name)

                itemsize = dtype.itemsize
                numpy_strides = [itemsize*s for s in strides]

                storage_array = cl_array.empty(queue, alloc_size, dtype)

            if is_output and arg.arg_class is ImageArg:
                raise LoopyError("write-mode images not supported in "
                        "automatic testing")

            fill_rand(storage_array)

            if arg.arg_class is ImageArg:
                # must be contiguous
                pre_run_ary = pre_run_storage_array = storage_array.copy()

                ref_args[arg.name] = cl.image_from_array(
                        queue.context, ary.get())
            else:
                # Keep a copy of the initial contents before the kernel runs.
                pre_run_storage_array = storage_array.copy()

                ary = cl_array.as_strided(storage_array, shape, numpy_strides)
                pre_run_ary = cl_array.as_strided(
                        pre_run_storage_array, shape, numpy_strides)
                ref_args[arg.name] = ary

            ref_arg_data.append(
                    TestArgInfo(
                        name=arg.name,
                        ref_array=ary,
                        ref_storage_array=storage_array,

                        ref_pre_run_array=pre_run_ary,
                        ref_pre_run_storage_array=pre_run_storage_array,

                        ref_shape=shape,
                        ref_strides=strides,
                        ref_alloc_size=alloc_size,
                        ref_numpy_strides=numpy_strides,
                        needs_checking=is_output))

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type not understood")

    return ref_args, ref_arg_data
Ejemplo n.º 19
0
def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
    """Create the arguments for the test kernel run from reference data.

    :arg kernel: kernel the arguments belong to.
    :arg impl_arg_info: implemented-argument descriptions to iterate over.
    :arg queue: command queue used for device transfers.
    :arg ref_arg_data: per-argument reference records, iterated in lockstep
        with *impl_arg_info* (presumably aligned element-wise; verify
        against the caller).
    :arg parameters: mapping from parameter name to value, used for value
        arguments and for evaluating shapes and strides.
    :returns: a dict mapping argument names to the values/arrays/images to
        pass. For global arrays, the corresponding *ref_arg_data* entry is
        additionally given ``test_*`` attributes describing the test array.
    :raises NotImplementedError: if an image argument is written to.
    :raises LoopyError: if an argument class is not understood.
    """
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, TemporaryVariable

    from pymbolic import evaluate

    args = {}
    for arg, arg_desc in zip(impl_arg_info, ref_arg_data):
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            arg_value = parameters[arg.name]

            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            # Coerce plain Python values (or mismatched dtypes) to the
            # argument's numpy scalar type.
            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            args[arg.name] = arg_value

        elif arg.arg_class is ImageArg:
            if arg.name in kernel.get_written_variables():
                raise NotImplementedError("write-mode images not supported in "
                        "automatic testing")

            shape = evaluate_shape(arg.unvec_shape, parameters)
            assert shape == arg_desc.ref_shape

            # must be contiguous
            args[arg.name] = cl.image_from_array(
                    queue.context, arg_desc.ref_pre_run_array.get())

        elif arg.arg_class is GlobalArg:
            shape = evaluate(arg.unvec_shape, parameters)
            strides = evaluate(arg.unvec_strides, parameters)

            dtype = kernel_arg.dtype
            itemsize = dtype.itemsize
            numpy_strides = [itemsize*s for s in strides]

            assert all(s > 0 for s in strides)
            # Number of elements spanned by the strided layout.
            alloc_size = sum(astrd*(alen-1)
                    for alen, astrd in zip(shape, strides)) + 1

            # use contiguous array to transfer to host
            host_ref_contig_array = arg_desc.ref_pre_run_storage_array.get()

            # use device shape/strides
            from pyopencl.compyte.array import as_strided
            host_ref_array = as_strided(host_ref_contig_array,
                    arg_desc.ref_shape, arg_desc.ref_numpy_strides)

            # flatten the thing
            host_ref_flat_array = host_ref_array.flatten()

            # create host array with test shape (but not strides)
            host_contig_array = np.empty(shape, dtype=dtype)

            # Copy as many leading elements as both arrays have in common.
            common_len = min(
                    len(host_ref_flat_array),
                    len(host_contig_array.ravel()))
            host_contig_array.ravel()[:common_len] = \
                    host_ref_flat_array[:common_len]

            # create host array with test shape and storage layout
            host_storage_array = np.empty(alloc_size, dtype)
            host_array = as_strided(
                    host_storage_array, shape, numpy_strides)
            host_array[...] = host_contig_array

            # NOTE(review): host_contig_array is rebound here but not read
            # again in this function -- confirm whether this transfer is
            # still needed.
            host_contig_array = arg_desc.ref_storage_array.get()
            storage_array = cl_array.to_device(queue, host_storage_array)
            ary = cl_array.as_strided(storage_array, shape, numpy_strides)

            args[arg.name] = ary

            # Record the test-side layout on the reference record.
            arg_desc.test_storage_array = storage_array
            arg_desc.test_array = ary
            arg_desc.test_shape = shape
            arg_desc.test_strides = strides
            arg_desc.test_numpy_strides = numpy_strides
            arg_desc.test_alloc_size = alloc_size

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type not understood")

    return args
Ejemplo n.º 20
0
def mark_local_temporaries(kernel):
    """Resolve ``is_local`` for temporaries that still carry ``lp.auto``.

    Every instruction writing to such a temporary casts a vote: it wants the
    temporary to be local if the local-parallel inames found in its assignee
    indices are exactly the local-parallel inames it runs under, and there
    is at least one of them.

    Temporaries whose ``is_local`` is already decided are passed through
    unchanged. Temporaries without any writer are left out of the result
    (a ``LoopyAdvisory`` warning is issued for them).

    :arg kernel: the kernel whose temporaries are examined.
    :returns: a copy of *kernel* with the updated ``temporary_variables``.
    :raises LoopyError: if the writers of one temporary disagree on whether
        it should live in local memory.
    """
    logger.debug("%s: mark local temporaries" % kernel.name)

    import loopy as lp
    from loopy.kernel.data import LocalIndexTagBase
    from loopy.symbolic import get_dependencies

    def is_local_parallel(iname):
        # True if *iname* carries a local-index tag.
        return isinstance(kernel.iname_to_tag.get(iname), LocalIndexTagBase)

    writer_ids_by_var = kernel.writer_map()
    updated_temporaries = {}

    for tv in six.itervalues(kernel.temporary_variables):
        # Only decide for variables that do not yet know whether they are
        # local (i.e. those created by implicit temporary generation).
        if tv.is_local is not lp.auto:
            updated_temporaries[tv.name] = tv
            continue

        votes = []
        for insn_id in writer_ids_by_var.get(tv.name, []):
            insn = kernel.id_to_insn[insn_id]

            # A write race emerges if the variable is local *and* the
            # instruction runs across more local-parallel inames than are
            # reflected in the assignee indices.

            compute_inames = set()
            for iname in kernel.insn_inames(insn_id):
                if is_local_parallel(iname):
                    compute_inames.add(iname)

            assignee_inames = set()
            for _, assignee_indices in insn.assignees_and_indices():
                for iname in (get_dependencies(assignee_indices)
                        & kernel.all_inames()):
                    if is_local_parallel(iname):
                        assignee_inames.add(iname)

            assert assignee_inames <= compute_inames

            inames_agree = assignee_inames == compute_inames

            if not inames_agree and bool(assignee_inames):
                warn(kernel, "write_race_local(%s)" % insn_id,
                        "instruction '%s' looks invalid: "
                        "it assigns to indices based on local IDs, but "
                        "its temporary '%s' cannot be made local because "
                        "a write race across the iname(s) '%s' would emerge. "
                        "(Do you need to add an extra iname to your prefetch?)"
                        % (insn_id, tv.name, ", ".join(
                            compute_inames - assignee_inames)),
                        WriteRaceConditionWarning)

            # No parallel inames at all means there is no reason to be local.
            votes.append(inames_agree and bool(compute_inames))

        if not votes:
            warn(kernel, "temp_to_write(%s)" % tv.name,
                    "temporary variable '%s' never written, eliminating"
                    % tv.name, LoopyAdvisory)

            continue

        is_local = any(votes)

        from pytools import all
        if not all(vote == is_local for vote in votes):
            raise LoopyError("not all instructions agree on whether "
                    "temporary '%s' should be in local memory" % tv.name)

        updated_temporaries[tv.name] = tv.copy(is_local=is_local)

    return kernel.copy(temporary_variables=updated_temporaries)
Ejemplo n.º 21
0
def get_reduction_source(ctx,
                         out_type,
                         out_type_size,
                         neutral,
                         reduce_expr,
                         map_expr,
                         arguments,
                         name="reduce_kernel",
                         preamble="",
                         device=None,
                         max_group_size=None):
    """Render the ``KERNEL`` reduction template and pick a work-group size.

    :arg ctx: context; its ``devices`` are used when *device* is ``None``.
    :arg out_type: passed to the template as ``out_type``.
    :arg out_type_size: size used to bound the group size by local memory
        and to query the SIMD group size.
    :arg neutral, reduce_expr, map_expr, arguments, name, preamble: passed
        through to the template unchanged.
    :arg device: if given, the only device considered.
    :arg max_group_size: optional upper bound on the chosen group size.
    :returns: a ``ReductionInfo`` record with attributes ``context``,
        ``source`` and ``group_size``.
    """
    from warnings import warn

    devices = ctx.devices if device is None else [device]

    # {{{ compute group size

    def get_dev_group_size(dev):
        wg_limit = dev.max_work_group_size
        if "RV770" in dev.name:
            # dirty fix for the RV770 boards
            wg_limit = 64

        # NOTE(review): this rounds the local-memory quotient *up*;
        # presumably rounding down is what keeps group_size items within
        # local memory -- confirm against KERNEL before changing.
        lmem_limit = (dev.local_mem_size + out_type_size - 1) // out_type_size
        return min(wg_limit, lmem_limit)

    group_size = min(get_dev_group_size(dev) for dev in devices)
    if max_group_size is not None:
        group_size = min(max_group_size, group_size)

    # }}}

    # {{{ compute synchronization-less group size

    def get_dev_no_sync_size(dev):
        from pyopencl.characterize import get_simd_group_size
        simd_size = get_simd_group_size(dev, out_type_size)
        if simd_size is not None:
            return simd_size

        warn("Reduction might be unnecessarily slow: "
             "can't query SIMD group size")
        return 1

    no_sync_size = min(get_dev_no_sync_size(dev) for dev in devices)

    # }}}

    from mako.template import Template
    from pytools import all
    from pyopencl.characterize import has_double_support, has_amd_double_support

    template = Template(KERNEL)

    # Double support is only advertised if *every* device has it.
    render_kwargs = dict(
        out_type=out_type,
        arguments=arguments,
        group_size=group_size,
        no_sync_size=no_sync_size,
        neutral=neutral,
        reduce_expr=reduce_expr,
        map_expr=map_expr,
        name=name,
        preamble=preamble,
        double_support=all(has_double_support(dev) for dev in devices),
        amd_double_support=all(
            has_amd_double_support(dev) for dev in devices),
    )
    src = str(template.render(**render_kwargs))

    from pytools import Record

    class ReductionInfo(Record):
        pass

    return ReductionInfo(context=ctx, source=src, group_size=group_size)
Ejemplo n.º 22
0
def get_reduction_source(
        ctx, out_type, out_type_size,
        neutral, reduce_expr, map_expr, arguments,
        name="reduce_kernel", preamble="",
        device=None, max_group_size=None):
    """Render the ``KERNEL`` reduction template with a power-of-two group size.

    :arg ctx: context; its ``devices`` are used when *device* is ``None``.
    :arg out_type: passed to the template as ``out_type``.
    :arg out_type_size: size used when deriving the group size and when
        querying the SIMD group size.
    :arg neutral, reduce_expr, map_expr, arguments, name, preamble: passed
        through to the template unchanged.
    :arg device: if given, the only device considered.
    :arg max_group_size: optional upper bound applied after the per-device
        group size has been rounded down to a power of two.
    :returns: a ``ReductionInfo`` record with attributes ``context``,
        ``source`` and ``group_size``.
    """
    from warnings import warn

    devices = ctx.devices if device is None else [device]

    # {{{ compute group size

    def get_dev_group_size(dev):
        wg_limit = dev.max_work_group_size
        if "RV770" in dev.name:
            # dirty fix for the RV770 boards
            wg_limit = 64

        # compute lmem limit
        # NOTE(review): the limit is derived from the work-group size, not
        # from the device's local memory size -- confirm this is intended.
        from pytools import div_ceil
        candidate = min(wg_limit, div_ceil(wg_limit, out_type_size))

        # round down to power of 2
        from pyopencl.tools import bitlog2
        return 2**bitlog2(candidate)

    group_size = min(get_dev_group_size(dev) for dev in devices)
    if max_group_size is not None:
        group_size = min(max_group_size, group_size)

    # }}}

    # {{{ compute synchronization-less group size

    def get_dev_no_sync_size(dev):
        from pyopencl.characterize import get_simd_group_size
        simd_size = get_simd_group_size(dev, out_type_size)
        if simd_size is not None:
            return simd_size

        warn("Reduction might be unnecessarily slow: "
                "can't query SIMD group size")
        return 1

    no_sync_size = min(get_dev_no_sync_size(dev) for dev in devices)

    # }}}

    from mako.template import Template
    from pytools import all
    from pyopencl.characterize import has_double_support, has_amd_double_support

    template = Template(KERNEL)

    # Double support is only advertised if *every* device has it.
    render_kwargs = dict(
        out_type=out_type,
        arguments=arguments,
        group_size=group_size,
        no_sync_size=no_sync_size,
        neutral=neutral,
        reduce_expr=reduce_expr,
        map_expr=map_expr,
        name=name,
        preamble=preamble,
        double_support=all(
            has_double_support(dev) for dev in devices),
        amd_double_support=all(
            has_amd_double_support(dev) for dev in devices),
    )
    src = str(template.render(**render_kwargs))

    from pytools import Record

    class ReductionInfo(Record):
        pass

    return ReductionInfo(context=ctx, source=src, group_size=group_size)
Ejemplo n.º 23
0
def make_ref_args(kernel, impl_arg_info, queue, parameters):
    """Create the arguments for the reference run of *kernel*.

    Value arguments are taken from *parameters* (and cast to the argument's
    dtype if needed). Array and image arguments are allocated on *queue*,
    filled via ``fill_rand`` and copied so that the pre-run contents remain
    available.

    :arg kernel: the kernel being tested.
    :arg impl_arg_info: iterable of implementation-level argument
        descriptors.
    :arg queue: command queue used for all device allocations.
    :arg parameters: mapping from value-argument names to values; also used
        to evaluate shapes and strides.
    :returns: a tuple ``(ref_args, ref_arg_data)``. *ref_args* maps argument
        names to values, arrays or images. *ref_arg_data* holds ``None`` for
        each value argument and a ``TestArgInfo`` for each array/image.
    :raises LoopyError: for arrays without known shape or dtype, for
        written-to images, and for unknown argument classes.
    """
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg, \
            TemporaryVariable, ConstantArg

    from pymbolic import evaluate

    ref_args = {}
    ref_arg_data = []

    for arg in impl_arg_info:
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
        arg_class = arg.arg_class

        if arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            continue

        if arg_class is ValueArg:
            # NOTE(review): offset arguments are skipped without adding an
            # entry to ref_arg_data -- confirm consumers that pair
            # ref_arg_data with impl_arg_info positionally expect that.
            if arg.offset_for_name:
                continue

            value = parameters[arg.name]

            # Plain Python scalars have no dtype and always get converted.
            if getattr(value, "dtype", None) != arg.dtype:
                value = arg.dtype.numpy_dtype.type(value)

            ref_args[arg.name] = value
            ref_arg_data.append(None)
            continue

        if not any(arg_class is known
                for known in (GlobalArg, ImageArg, ConstantArg)):
            raise LoopyError("arg type not understood")

        # {{{ array-like arguments

        if arg.shape is None or any(saxis is None for saxis in arg.shape):
            raise LoopyError(
                "array '%s' needs known shape to use automatic "
                "testing" % arg.name)

        shape = evaluate_shape(arg.unvec_shape, parameters)
        dtype = kernel_arg.dtype

        is_output = arg.base_name in kernel.get_written_variables()
        is_image = arg_class is ImageArg

        if is_image:
            storage_array = ary = cl_array.empty(
                queue, shape, dtype, order="C")
            strides = alloc_size = numpy_strides = None
        else:
            strides = evaluate(arg.unvec_strides, parameters)

            from pytools import all
            assert all(s > 0 for s in strides)

            # Number of entries needed to reach the last addressed element.
            alloc_size = sum(astrd * (alen - 1)
                             for alen, astrd in zip(shape, strides)) + 1

            if dtype is None:
                raise LoopyError("dtype for argument '%s' is not yet "
                                 "known. Perhaps you want to use "
                                 "loopy.add_dtypes "
                                 "or loopy.infer_argument_dtypes?" %
                                 arg.name)

            itemsize = dtype.itemsize
            numpy_strides = [itemsize * s for s in strides]

            storage_array = cl_array.empty(queue, alloc_size, dtype)

        if is_output and is_image:
            raise LoopyError("write-mode images not supported in "
                             "automatic testing")

        fill_rand(storage_array)

        # Keep an untouched copy of the initial contents.
        pre_run_storage_array = storage_array.copy()

        if is_image:
            # must be contiguous
            pre_run_ary = pre_run_storage_array

            ref_args[arg.name] = cl.image_from_array(
                queue.context, ary.get())
        else:
            ary = cl_array.as_strided(storage_array, shape, numpy_strides)
            pre_run_ary = cl_array.as_strided(
                pre_run_storage_array, shape, numpy_strides)
            ref_args[arg.name] = ary

        ref_arg_data.append(
            TestArgInfo(name=arg.name,
                        ref_array=ary,
                        ref_storage_array=storage_array,
                        ref_pre_run_array=pre_run_ary,
                        ref_pre_run_storage_array=pre_run_storage_array,
                        ref_shape=shape,
                        ref_strides=strides,
                        ref_alloc_size=alloc_size,
                        ref_numpy_strides=numpy_strides,
                        needs_checking=is_output))

        # }}}

    return ref_args, ref_arg_data
Ejemplo n.º 24
0
def make_args(kernel, impl_arg_info, queue, ref_arg_data, parameters):
    """Create the arguments for the test run of *kernel* from reference data.

    Value arguments are taken from *parameters*. Images are rebuilt from the
    reference pre-run array. Global/constant arrays are filled with the
    reference pre-run contents, re-laid-out to the test kernel's shape and
    strides, and uploaded via *queue*; the resulting layout information is
    stored on the matching entry of *ref_arg_data* as ``test_*`` attributes.

    :arg kernel: the kernel being tested.
    :arg impl_arg_info: iterable of implementation-level argument
        descriptors.
    :arg queue: command queue used for device transfers.
    :arg ref_arg_data: per-argument reference data, paired positionally
        with *impl_arg_info*.
    :arg parameters: mapping from value-argument names to values; also used
        to evaluate shapes and strides.
    :returns: a dict mapping argument names to values, arrays or images.
    :raises NotImplementedError: for images that the kernel writes to.
    :raises LoopyError: if an array argument's dtype is not yet known, or
        for unknown argument classes.
    """
    import pyopencl as cl
    import pyopencl.array as cl_array

    from loopy.kernel.data import ValueArg, GlobalArg, ImageArg,\
            TemporaryVariable, ConstantArg

    from pymbolic import evaluate

    args = {}
    # NOTE(review): the two sequences are paired positionally; presumably
    # they are expected to line up entry for entry -- verify against the
    # producer of ref_arg_data.
    for arg, arg_desc in zip(impl_arg_info, ref_arg_data):
        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)

        if arg.arg_class is ValueArg:
            arg_value = parameters[arg.name]

            # Plain Python scalars have no dtype and always get converted.
            try:
                argv_dtype = arg_value.dtype
            except AttributeError:
                argv_dtype = None

            if argv_dtype != arg.dtype:
                arg_value = arg.dtype.numpy_dtype.type(arg_value)

            args[arg.name] = arg_value

        elif arg.arg_class is ImageArg:
            if arg.name in kernel.get_written_variables():
                raise NotImplementedError("write-mode images not supported in "
                                          "automatic testing")

            shape = evaluate_shape(arg.unvec_shape, parameters)
            assert shape == arg_desc.ref_shape

            # must be contiguous
            args[arg.name] = cl.image_from_array(
                queue.context, arg_desc.ref_pre_run_array.get())

        elif arg.arg_class is GlobalArg or\
                arg.arg_class is ConstantArg:
            shape = evaluate(arg.unvec_shape, parameters)
            strides = evaluate(arg.unvec_strides, parameters)

            dtype = kernel_arg.dtype
            if dtype is None:
                # Fail with a helpful message instead of an AttributeError
                # on 'dtype.itemsize' below.
                raise LoopyError("dtype for argument '%s' is not yet "
                                 "known. Perhaps you want to use "
                                 "loopy.add_dtypes "
                                 "or loopy.infer_argument_dtypes?" %
                                 arg.name)

            itemsize = dtype.itemsize
            numpy_strides = [itemsize * s for s in strides]

            assert all(s > 0 for s in strides)
            # Number of entries needed to reach the last addressed element.
            alloc_size = sum(astrd * (alen - 1)
                             for alen, astrd in zip(shape, strides)) + 1

            # use contiguous array to transfer to host
            host_ref_contig_array = arg_desc.ref_pre_run_storage_array.get()

            # use device shape/strides
            from pyopencl.compyte.array import as_strided
            host_ref_array = as_strided(host_ref_contig_array,
                                        arg_desc.ref_shape,
                                        arg_desc.ref_numpy_strides)

            # flatten the thing
            host_ref_flat_array = host_ref_array.flatten()

            # create host array with test shape (but not strides)
            host_contig_array = np.empty(shape, dtype=dtype)
            host_contig_flat = host_contig_array.ravel()

            # Copy as many entries as both layouts have in common.
            common_len = min(len(host_ref_flat_array), len(host_contig_flat))
            host_contig_flat[:common_len] = host_ref_flat_array[:common_len]

            # create host array with test shape and storage layout
            host_storage_array = np.empty(alloc_size, dtype)
            host_array = as_strided(host_storage_array, shape, numpy_strides)
            host_array[...] = host_contig_array

            # (A former extra download of ref_storage_array was dropped
            # here: its result was never read.)
            storage_array = cl_array.to_device(queue, host_storage_array)
            ary = cl_array.as_strided(storage_array, shape, numpy_strides)

            args[arg.name] = ary

            arg_desc.test_storage_array = storage_array
            arg_desc.test_array = ary
            arg_desc.test_shape = shape
            arg_desc.test_strides = strides
            arg_desc.test_numpy_strides = numpy_strides
            arg_desc.test_alloc_size = alloc_size

        elif arg.arg_class is TemporaryVariable:
            # global temporary, handled by invocation logic
            pass

        else:
            raise LoopyError("arg type not understood")

    return args
Ejemplo n.º 25
0
def _get_reduction_source(ctx,
                          out_type,
                          out_type_size,
                          neutral,
                          reduce_expr,
                          map_expr,
                          parsed_args,
                          name="reduce_kernel",
                          preamble="",
                          device=None,
                          max_group_size=None):
    """Render the ``KERNEL`` reduction template with a power-of-two group size.

    :arg ctx: context; its ``devices`` are used when *device* is ``None``.
    :arg out_type: passed to the template as ``out_type``.
    :arg out_type_size: size used when deriving the group size and when
        querying the SIMD group size.
    :arg neutral, reduce_expr, map_expr, name, preamble: passed through to
        the template unchanged.
    :arg parsed_args: argument objects; their ``declarator()`` results are
        joined with ``", "`` to form the template's ``arguments``.
    :arg device: if given, the only device considered.
    :arg max_group_size: optional upper bound applied after the per-device
        group size has been rounded down to a power of two.
    :returns: a ``ReductionInfo`` record with attributes ``context``,
        ``source`` and ``group_size``.
    """
    from warnings import warn

    devices = ctx.devices if device is None else [device]

    # {{{ compute group size

    def get_dev_group_size(dev):
        wg_limit = dev.max_work_group_size
        if "RV770" in dev.name:
            # dirty fix for the RV770 boards
            wg_limit = 64

        # compute lmem limit
        # NOTE(review): the limit is derived from the work-group size, not
        # from the device's local memory size -- confirm this is intended.
        from pytools import div_ceil
        candidate = min(wg_limit, div_ceil(wg_limit, out_type_size))

        # round down to power of 2
        from pyopencl.tools import bitlog2
        return 2**bitlog2(candidate)

    group_size = min(get_dev_group_size(dev) for dev in devices)
    if max_group_size is not None:
        group_size = min(max_group_size, group_size)

    # }}}

    # {{{ compute synchronization-less group size

    def get_dev_no_sync_size(dev):
        from pyopencl.characterize import get_simd_group_size
        simd_size = get_simd_group_size(dev, out_type_size)
        if simd_size is not None:
            return simd_size

        warn("Reduction might be unnecessarily slow: "
             "can't query SIMD group size")
        return 1

    no_sync_size = min(get_dev_no_sync_size(dev) for dev in devices)

    # }}}

    from mako.template import Template
    from pytools import all
    from pyopencl.characterize import has_double_support

    template = Template(KERNEL)

    # Double support is only advertised if *every* device has it.
    render_kwargs = dict(
        out_type=out_type,
        arguments=", ".join(arg.declarator() for arg in parsed_args),
        group_size=group_size,
        no_sync_size=no_sync_size,
        neutral=neutral,
        reduce_expr=reduce_expr,
        map_expr=map_expr,
        name=name,
        preamble=preamble,
        double_support=all(has_double_support(dev) for dev in devices),
    )
    src = str(template.render(**render_kwargs))

    from pytools import Record

    class ReductionInfo(Record):
        pass

    return ReductionInfo(context=ctx, source=src, group_size=group_size)
Ejemplo n.º 26
0
def find_temporary_scope(kernel):
    """Infer the scope of every temporary whose scope is still ``lp.auto``.

    Each instruction writing to such a temporary yields a desired scope,
    starting from ``temp_var_scope.PRIVATE``. The candidates (local, then
    global) are tried in order; the first one whose parallel assignee inames
    equal the instruction's parallel compute inames of that kind, and are
    non-empty, is taken. All writers must agree on the result.

    Temporaries with an already-decided scope are passed through unchanged.
    Temporaries without writers and without an initializer are left out of
    the result (a ``LoopyAdvisory`` warning is issued for them).

    :arg kernel: the kernel whose temporaries are examined.
    :returns: a copy of *kernel* with the updated ``temporary_variables``.
    :raises LoopyError: if a never-written temporary has an initializer (its
        scope cannot be determined automatically), or if the writers of a
        temporary disagree on the desired scope.
    """
    logger.debug("%s: mark local temporaries" % kernel.name)

    new_temp_vars = {}
    from loopy.kernel.data import (LocalIndexTagBase, GroupIndexTag,
            temp_var_scope)
    import loopy as lp

    writers = kernel.writer_map()

    for temp_var in six.itervalues(kernel.temporary_variables):
        # Only fill out for variables that do not yet know if they're
        # local. (I.e. those generated by implicit temporary generation.)

        if temp_var.scope is not lp.auto:
            new_temp_vars[temp_var.name] = temp_var
            continue

        my_writers = writers.get(temp_var.name, [])

        desired_scope_per_insn = []
        for insn_id in my_writers:
            insn = kernel.id_to_insn[insn_id]

            # A write race will emerge if:
            #
            # - the variable is local
            #   and
            # - the instruction is run across more inames (locally) parallel
            #   than are reflected in the assignee indices.

            locparallel_compute_inames = _get_compute_inames_tagged(
                    kernel, insn, LocalIndexTagBase)

            locparallel_assignee_inames = _get_assignee_inames_tagged(
                    kernel, insn, LocalIndexTagBase, temp_var.name)

            grpparallel_compute_inames = _get_compute_inames_tagged(
                    kernel, insn, GroupIndexTag)

            grpparallel_assignee_inames = _get_assignee_inames_tagged(
                    kernel, insn, GroupIndexTag, temp_var.name)

            assert locparallel_assignee_inames <= locparallel_compute_inames
            assert grpparallel_assignee_inames <= grpparallel_compute_inames

            desired_scope = temp_var_scope.PRIVATE
            for iname_descr, scope_descr, apin, cpin, scope in [
                    ("local", "local", locparallel_assignee_inames,
                        locparallel_compute_inames, temp_var_scope.LOCAL),
                    ("group", "global", grpparallel_assignee_inames,
                        grpparallel_compute_inames, temp_var_scope.GLOBAL),
                    ]:

                # The race check must look at the assignee inames of the
                # kind currently under consideration (apin), not always at
                # the local ones; otherwise the group-level warning depends
                # on unrelated local inames.
                # NOTE(review): the warning id says "local" for both kinds;
                # left as is so existing silencing by id keeps working.
                if apin != cpin and bool(apin):
                    warn_with_kernel(kernel, "write_race_local(%s)" % insn_id,
                            "instruction '%s' looks invalid: "
                            "it assigns to indices based on %s IDs, but "
                            "its temporary '%s' cannot be made %s because "
                            "a write race across the iname(s) '%s' would emerge. "
                            "(Do you need to add an extra iname to your prefetch?)"
                            % (insn_id, iname_descr, temp_var.name, scope_descr,
                                ", ".join(cpin - apin)),
                            WriteRaceConditionWarning)

                if (apin == cpin

                        # doesn't want to be in this scope if there aren't any
                        # parallel inames of that kind:
                        and bool(cpin)):
                    desired_scope = max(desired_scope, scope)
                    break

            desired_scope_per_insn.append(desired_scope)

        if not desired_scope_per_insn:
            if temp_var.initializer is None:
                warn_with_kernel(kernel, "temp_to_write(%s)" % temp_var.name,
                        "temporary variable '%s' never written, eliminating"
                        % temp_var.name, LoopyAdvisory)
            else:
                raise LoopyError("temporary variable '%s': never written, "
                        "cannot automatically determine scope"
                        % temp_var.name)

            continue

        overall_scope = max(desired_scope_per_insn)

        # Every writer has to ask for the same scope.
        from pytools import all
        if not all(iscope == overall_scope for iscope in desired_scope_per_insn):
            raise LoopyError("not all instructions agree on the "
                    "the desired scope (private/local/global) of  the "
                    "temporary '%s'" % temp_var.name)

        new_temp_vars[temp_var.name] = temp_var.copy(scope=overall_scope)

    return kernel.copy(temporary_variables=new_temp_vars)
Ejemplo n.º 27
0
Archivo: scan.py Proyecto: lichinka/cai
        def __init__(self, ctx, dtype,
                scan_expr, neutral=None,
                name_prefix="scan", options=[], preamble="", devices=None):
            """Build the scan-intervals and final-update kernels.

            :arg ctx: context the programs are built in; its ``devices``
                are used when *devices* is ``None``.
            :arg dtype: anything accepted by ``np.dtype``; stored as
                ``self.dtype``.
            :arg scan_expr: passed to the kernel templates as ``scan_expr``.
            :arg neutral: passed to the kernel templates as ``neutral``;
                must not be ``None`` for an ``ExclusiveScanKernel``.
            :arg name_prefix: prefix of the kernel names looked up in the
                built programs (``<prefix>_scan_intervals`` and
                ``<prefix>_final_update``).
            :arg options: passed to ``Program.build``.
            :arg preamble: passed to the kernel templates as ``preamble``.
            :arg devices: devices used to pick work-group sizes and to
                determine double support.
            :raises ValueError: if *self* is an ``ExclusiveScanKernel`` and
                *neutral* is ``None``.
            """
            # NOTE(review): 'options=[]' is a mutable default argument; it is
            # only passed on to build() in the code visible here.

            if isinstance(self, ExclusiveScanKernel) and neutral is None:
                raise ValueError("neutral element is required for exclusive scan")

            self.context = ctx
            dtype = self.dtype = np.dtype(dtype)
            self.neutral = neutral

            if devices is None:
                devices = ctx.devices
            self.devices = devices

            # Start from the smallest limit across all devices.
            max_wg_size = min(dev.max_work_group_size for dev in self.devices)

            # loop to find suitable workgroup size
            trip_count = 0
            while True:
                # Thrust says these are good for GT200
                self.scan_wg_size = min(max_wg_size, 128)
                self.update_wg_size = min(max_wg_size, 256)

                if self.scan_wg_size < 16:
                    # Hello, Apple CPU. Nice to see you.
                    self.scan_wg_seq_batches = 128 # FIXME: guesswork
                else:
                    self.scan_wg_seq_batches = 6

                from pytools import all
                from pyopencl.characterize import has_double_support

                # Template values shared by both kernels.
                kw_values = dict(
                    preamble=preamble,
                    name_prefix=name_prefix,
                    scan_type=dtype_to_ctype(dtype),
                    scan_expr=scan_expr,
                    neutral=neutral,
                    double_support=all(
                        has_double_support(dev) for dev in devices)
                    )

                scan_intervals_src = str(SCAN_INTERVALS_SOURCE.render(
                    wg_size=self.scan_wg_size,
                    wg_seq_batches=self.scan_wg_seq_batches,
                    **kw_values))
                scan_intervals_prg = cl.Program(ctx, scan_intervals_src).build(options)
                self.scan_intervals_knl = getattr(
                        scan_intervals_prg,
                        name_prefix+"_scan_intervals")
                self.scan_intervals_knl.set_scalar_arg_dtypes(
                        (None, np.uint32, np.uint32, None, None))

                # NOTE(review): the limit is queried for ctx.devices[0] only,
                # even when a 'devices' list was passed in -- confirm.
                kernel_max_wg_size = self.scan_intervals_knl.get_work_group_info(
                        cl.kernel_work_group_info.WORK_GROUP_SIZE,
                        ctx.devices[0])

                # Accept the build if the kernel can run at the chosen size;
                # otherwise retry with the kernel's own limit.
                if self.scan_wg_size <= kernel_max_wg_size:
                    break
                else:
                    max_wg_size = kernel_max_wg_size

                # Guard against retrying indefinitely.
                trip_count += 1
                assert trip_count <= 2

            final_update_src = str(self.final_update_tp.render(
                wg_size=self.update_wg_size,
                **kw_values))

            final_update_prg = cl.Program(self.context, final_update_src).build(options)
            self.final_update_knl = getattr(
                    final_update_prg,
                    name_prefix+"_final_update")
            self.final_update_knl.set_scalar_arg_dtypes(
                    (None, np.uint32, np.uint32, None))