Example #1
0
    def add_watches(self, watches):
        """Add quantities that are printed after every time step."""

        from pytools import Record

        class WatchInfo(Record):
            pass

        for watch in watches:
            if isinstance(watch, tuple):
                display, expr = watch
            else:
                display = watch
                expr = watch

            parsed = self._parse_expr(expr)
            parsed, dep_data = self._get_expr_dep_data(parsed)

            from pytools import any
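            # (pytools.any mirrors the builtin any(); the explicit import is a
            # holdover for Python versions that predate the builtin.)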
            self.have_nonlocal_watches = self.have_nonlocal_watches or \
                    any(dd.nonlocal_agg for dd in dep_data)

            from pymbolic import compile
            compiled = compile(parsed, [dd.varname for dd in dep_data])

            watch_info = WatchInfo(display=display, parsed=parsed, dep_data=dep_data,
                    compiled=compiled)

            self.watches.append(watch_info)
Example #2
0
    def __init__(self,
                 ctx,
                 dtype_out,
                 neutral,
                 reduce_expr,
                 map_expr=None,
                 arguments=None,
                 name="reduce_kernel",
                 options=[],
                 preamble=""):

        dtype_out = self.dtype_out = np.dtype(dtype_out)

        max_group_size = None
        trip_count = 0

        while True:
            self.stage_1_inf = get_reduction_kernel(
                1,
                ctx,
                dtype_out,
                neutral,
                reduce_expr,
                map_expr,
                arguments,
                name=name + "_stage1",
                options=options,
                preamble=preamble,
                max_group_size=max_group_size)

            kernel_max_wg_size = self.stage_1_inf.kernel.get_work_group_info(
                cl.kernel_work_group_info.WORK_GROUP_SIZE, ctx.devices[0])

            if self.stage_1_inf.group_size <= kernel_max_wg_size:
                break
            else:
                max_group_size = kernel_max_wg_size

            trip_count += 1
            assert trip_count <= 2

        self.stage_2_inf = get_reduction_kernel(2,
                                                ctx,
                                                dtype_out,
                                                neutral,
                                                reduce_expr,
                                                arguments=arguments,
                                                name=name + "_stage2",
                                                options=options,
                                                preamble=preamble,
                                                max_group_size=max_group_size)

        from pytools import any
        from pyopencl.tools import VectorArg
        assert any(
                isinstance(arg_tp, VectorArg)
                for arg_tp in self.stage_1_inf.arg_types), \
                "ReductionKernel can only be used with functions " \
                "that have at least one vector argument"
Example #3
0
    def is_affine(self):
        from pytools import any

        has_high_order_geometry = any(
            sum(mid) >= 2 and abs(mc) >= 1e-13
            for mc_along_axis in self.modal_coeff.T for mid, mc in zip(
                self.ldis.generate_mode_identifiers(), mc_along_axis))

        return not has_high_order_geometry
Example #4
0
def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z):
    if np.float64 not in [dtype_x, dtype_y]:
        func = func + "f"

    from pytools import any
    if any(dt.kind == "f" for dt in [dtype_x, dtype_y, dtype_z]):
        func = "f" + func

    return get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z)
Example #5
0
def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z):
    if np.float64 not in [dtype_x, dtype_y]:
        func = func +"f"

    from pytools import any
    if any(dt.kind == "f" for dt in [dtype_x, dtype_y, dtype_z]):
        func = "f"+func

    return get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z)
Example #6
0
    def is_affine(self):
        from pytools import any

        has_high_order_geometry = any(
            sum(mid) >= 2 and abs(mc) >= 1e-13
            for mc_along_axis in self.modal_coeff.T
            for mid, mc in zip(self.ldis.generate_mode_identifiers(), mc_along_axis)
        )

        return not has_high_order_geometry
Example #7
0
def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar):
    if np.float64 not in [dtype_x, dtype_y]:
        func = func + "f"

    from pytools import any
    if any(dt.kind == "f" for dt in [dtype_x, dtype_y, dtype_z]):
        func = "f"+func

    if use_scalar:
        return get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z)
    else:
        return get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z)
Example #8
0
def get_binary_minmax_kernel(func, dtype_x, dtype_y, dtype_z, use_scalar):
    if np.float64 not in [dtype_x, dtype_y]:
        func = func + "f"

    from pytools import any
    if any(dt.kind == "f" for dt in [dtype_x, dtype_y, dtype_z]):
        func = "f" + func

    if use_scalar:
        return get_binary_func_scalar_kernel(func, dtype_x, dtype_y, dtype_z)
    else:
        return get_binary_func_kernel(func, dtype_x, dtype_y, dtype_z)
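A short trace of the name mangling in the helper above (derived from the code
itself, not from library documentation):

    # func="max", all of dtype_x/y/z are np.float32:
    #   no float64 operand  -> suffix appended: "maxf"
    #   a float kind exists -> prefix appended: "fmaxf"
    # func="max", dtype_x is np.float64:
    #   float64 present, suffix skipped; float kind present -> "fmax"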
Example #9
0
def has_barrier_within(kernel, sched_index):
    sched_item = kernel.schedule[sched_index]

    if isinstance(sched_item, EnterLoop):
        loop_contents, _ = gather_schedule_subloop(kernel.schedule, sched_index)
        from pytools import any

        return any(isinstance(subsched_item, Barrier) for subsched_item in loop_contents)
    elif isinstance(sched_item, Barrier):
        return True
    else:
        return False
Example #10
0
        def p2p(kernels):
            from pytools import any
            if any(knl.is_complex_valued for knl in kernels):
                value_dtype = self.complex_dtype
            else:
                value_dtype = self.real_dtype

            from sumpy.p2p import P2P
            return P2P(actx.context,
                       kernels,
                       exclude_self=False,
                       value_dtypes=value_dtype)
Example #11
0
    def get_p2p(self, kernels):
        # needs to be separate method for caching

        from pytools import any
        if any(knl.is_complex_valued for knl in kernels):
            value_dtype = self.density_discr.complex_dtype
        else:
            value_dtype = self.density_discr.real_dtype

        from sumpy.p2p import P2P
        p2p = P2P(self.cl_context,
                    kernels, exclude_self=False, value_dtypes=value_dtype)

        return p2p
Example #12
0
    def get_p2p(self, kernels):
        # needs to be separate method for caching

        from pytools import any
        if any(knl.is_complex_valued for knl in kernels):
            value_dtype = self.density_discr.complex_dtype
        else:
            value_dtype = self.density_discr.real_dtype

        from sumpy.p2p import P2P
        p2p = P2P(self.cl_context,
                  kernels, exclude_self=False, value_dtypes=value_dtype)

        return p2p
Example #13
0
    def get_lpot_applier(self, kernels):
        # needs to be separate method for caching

        from pytools import any
        if any(knl.is_complex_valued for knl in kernels):
            value_dtype = self.density_discr.complex_dtype
        else:
            value_dtype = self.density_discr.real_dtype

        from sumpy.qbx import LayerPotential
        return LayerPotential(self.cl_context,
                    [self.expansion_getter(knl, self.qbx_order)
                        for knl in kernels],
                    value_dtypes=value_dtype)
Example #14
0
    def __init__(self, dtype_out,
                 neutral, reduce_expr, arguments=None,
                 map_exprs=[None],
                 name="reduce_kernel", options=[], preamble=""):

        ctx = get_device().context
        dtype_out = self.dtype_out = np.dtype(dtype_out)

        max_group_size = None
        trip_count = 0

        self.n_exprs = len(map_exprs)
        assert self.n_exprs>0

        while True:
            self.stage_1_inf = get_reduction_kernel(1, ctx,
                                                    dtype_out,
                                                    neutral, reduce_expr, arguments,
                                                    name=name+"_stage1", options=options, preamble=preamble,
                                                    map_exprs=map_exprs,
                                                    max_group_size=max_group_size)


            kernel_max_wg_size = self.stage_1_inf.kernel.get_work_group_info(
                cl.kernel_work_group_info.WORK_GROUP_SIZE,
                ctx.devices[0])

            if self.stage_1_inf.group_size<=kernel_max_wg_size:
                break
            else:
                max_group_size = kernel_max_wg_size

            trip_count += 1
            assert trip_count<=2

        self.stage_2_inf = get_reduction_kernel(2, ctx,
                                dtype_out,
                                neutral, reduce_expr, arguments=arguments,
                                name=name+"_stage2", options=options,
                                map_exprs=map_exprs,
                                preamble=preamble,
                                max_group_size=max_group_size)

        from pytools import any
        from pyopencl.tools import VectorArg
        assert any(
            isinstance(arg_tp, VectorArg)
            for arg_tp in self.stage_1_inf.arg_types), \
            "ReductionKernel can only be used with functions " \
            "that have at least one vector argument"
Example #15
0
    def get_lpot_applier_on_tgt_subset(self, kernels):
        # needs to be separate method for caching

        from pytools import any
        if any(knl.is_complex_valued for knl in kernels):
            value_dtype = self.density_discr.complex_dtype
        else:
            value_dtype = self.density_discr.real_dtype

        from pytential.qbx.direct import LayerPotentialOnTargetAndCenterSubset
        from sumpy.expansion.local import VolumeTaylorLocalExpansion
        return LayerPotentialOnTargetAndCenterSubset(self.cl_context, [
            VolumeTaylorLocalExpansion(knl, self.qbx_order) for knl in kernels
        ],
                                                     value_dtypes=value_dtype)
Example #16
0
    def get_lpot_applier(self, kernels):
        # needs to be separate method for caching

        from pytools import any
        if any(knl.is_complex_valued for knl in kernels):
            value_dtype = self.density_discr.complex_dtype
        else:
            value_dtype = self.density_discr.real_dtype

        from sumpy.qbx import LayerPotential
        from sumpy.expansion.local import LineTaylorLocalExpansion
        return LayerPotential(
            self.cl_context,
            [LineTaylorLocalExpansion(knl, self.qbx_order) for knl in kernels],
            value_dtypes=value_dtype)
Example #17
0
    def get_lpot_applier_on_tgt_subset(self, kernels):
        # needs to be separate method for caching

        from pytools import any
        if any(knl.is_complex_valued for knl in kernels):
            value_dtype = self.density_discr.complex_dtype
        else:
            value_dtype = self.density_discr.real_dtype

        from pytential.qbx.direct import LayerPotentialOnTargetAndCenterSubset
        from sumpy.expansion.local import VolumeTaylorLocalExpansion
        return LayerPotentialOnTargetAndCenterSubset(
                self.cl_context,
                [VolumeTaylorLocalExpansion(knl, self.qbx_order)
                    for knl in kernels],
                value_dtypes=value_dtype)
Example #18
0
    def stringify(self, coeff_stringifier, enclosing_prec):
        from pymbolic.mapper.stringifier import PREC_PRODUCT, PREC_SUM

        terms = []
        for bits in sorted(six.iterkeys(self.data),
                           key=lambda bits: (bit_count(bits), bits)):
            coeff = self.data[bits]

            # {{{ try to find a stringifier

            strifier = None
            if coeff_stringifier is None:
                try:
                    strifier = coeff.stringifier()()
                except AttributeError:
                    pass
            else:
                strifier = coeff_stringifier

            # }}}

            if strifier is not None:
                if bits:
                    coeff_str = strifier(coeff, PREC_PRODUCT)
                else:
                    coeff_str = strifier(coeff, PREC_SUM)
            else:
                coeff_str = str(coeff)

            blade_str = self.space.blade_bits_to_str(bits)
            if bits:
                terms.append("%s * %s" % (blade_str, coeff_str))
            else:
                terms.append(coeff_str)

        if terms:
            if any(len(t) > 15 for t in terms):
                result = "\n    " + "\n    + ".join(terms)
            else:
                result = " + ".join(terms)
        else:
            result = "0"

        return "MV(%s)" % result
Example #19
0
    def stringify(self, coeff_stringifier, enclosing_prec):
        from pymbolic.mapper.stringifier import PREC_PRODUCT, PREC_SUM

        terms = []
        for bits in sorted(six.iterkeys(self.data),
                key=lambda bits: (bit_count(bits), bits)):
            coeff = self.data[bits]

            # {{{ try to find a stringifier

            strifier = None
            if coeff_stringifier is None:
                try:
                    strifier = coeff.stringifier()()
                except AttributeError:
                    pass
            else:
                strifier = coeff_stringifier

            # }}}

            if strifier is not None:
                if bits:
                    coeff_str = strifier(coeff, PREC_PRODUCT)
                else:
                    coeff_str = strifier(coeff, PREC_SUM)
            else:
                coeff_str = str(coeff)

            blade_str = self.space.blade_bits_to_str(bits)
            if bits:
                terms.append("%s * %s" % (blade_str, coeff_str))
            else:
                terms.append(coeff_str)

        if terms:
            if any(len(t) > 15 for t in terms):
                result = "\n    " + "\n    + ".join(terms)
            else:
                result = " + ".join(terms)
        else:
            result = "0"

        return "MV(%s)" % result
Example #20
0
    def finalize_multi_assign(self, names, exprs, do_not_return, priority):
        from pytools import any
        from hedge.tools import is_zero

        has_zero_assignees = any(is_zero(expr) for expr in exprs)
        if has_zero_assignees:
            if len(exprs) > 1:
                raise RuntimeError("found aggregated zero constant assignment")

        from hedge.optemplate import FlopCounter
        flop_count = sum(FlopCounter()(expr) for expr in exprs)

        if has_zero_assignees or flop_count == 0:
            return Assign(names,
                          exprs,
                          priority=priority,
                          dep_mapper_factory=self.dep_mapper_factory)
        else:
            return VectorExprAssign(names=names,
                                    exprs=exprs,
                                    do_not_return=do_not_return,
                                    dep_mapper_factory=self.dep_mapper_factory,
                                    priority=priority)
Example #21
0
    def finalize_multi_assign(self, names, exprs, do_not_return, priority):
        from pytools import any
        from hedge.tools import is_zero

        has_zero_assignees = any(is_zero(expr) for expr in exprs)
        if has_zero_assignees:
            if len(exprs) > 1:
                raise RuntimeError("found aggregated zero constant assignment")

        from hedge.optemplate import FlopCounter

        flop_count = sum(FlopCounter()(expr) for expr in exprs)

        if has_zero_assignees or flop_count == 0:
            return Assign(names, exprs, priority=priority, dep_mapper_factory=self.dep_mapper_factory)
        else:
            return VectorExprAssign(
                names=names,
                exprs=exprs,
                do_not_return=do_not_return,
                dep_mapper_factory=self.dep_mapper_factory,
                priority=priority,
            )
Example #22
0
    def __init__(self, ctx, dtype_out,
            neutral, reduce_expr, map_expr=None, arguments=None,
            name="reduce_kernel", options=[], preamble=""):

        dtype_out = self.dtype_out = np.dtype(dtype_out)

        self.stage_1_inf = get_reduction_kernel(ctx,
                dtype_to_ctype(dtype_out), dtype_out.itemsize,
                neutral, reduce_expr, map_expr, arguments,
                name=name+"_stage1", options=options, preamble=preamble)

        # stage 2 has only one input and no map expression
        self.stage_2_inf = get_reduction_kernel(ctx,
                dtype_to_ctype(dtype_out), dtype_out.itemsize,
                neutral, reduce_expr,
                name=name+"_stage2", options=options, preamble=preamble)

        from pytools import any
        from pyopencl.tools import VectorArg
        assert any(
                isinstance(arg_tp, VectorArg)
                for arg_tp in self.stage_1_inf.arg_types), \
                "ReductionKernel can only be used with functions that have at least one " \
                "vector argument"
Example #23
0
    def map_logical_or(self, expr):
        from pytools import any
        return any(self.rec(ch) for ch in expr.children)
Example #24
0
    def __call__(self, queue, n_objects, *args, **kwargs):
        """
        :arg args: arguments corresponding to arg_decls in the constructor.
            :class:`pyopencl.array.Array` are not allowed directly and should
            be passed as their :attr:`pyopencl.array.Array.data` attribute instead.
        :arg allocator: optionally, the allocator to use to allocate new
            arrays.
        :arg omit_lists: An iterable of list names that should *not* be built
            with this invocation. The kernel code may *not* call ``APPEND_name``
            for these omitted lists. If it does, undefined behavior will result.
            The returned *lists* dictionary will not contain an entry for names
            in *omit_lists*.
        :arg wait_for: |explain-waitfor|
        :returns: a tuple ``(lists, event)``, where
            *lists* is a mapping from (built) list names to objects which
            have attributes

            * ``count`` for the total number of entries in all lists combined
            * ``lists`` for the array containing all lists.
            * ``starts`` for the array of starting indices in `lists`.
              `starts` is built so that it has n+1 entries, so that
              the *i*'th entry is the start of the *i*'th list, and the
              *i*+1'th entry is the index one past the *i*'th list's end,
              even for the last list.

              This implies that all lists are contiguous.

              *event* is a :class:`pyopencl.Event` for dependency management.

        .. versionchanged:: 2016.2

            Added omit_lists.
        """
        if n_objects >= int(np.iinfo(np.int32).max):
            index_dtype = np.int64
        else:
            index_dtype = np.int32
        index_dtype = np.dtype(index_dtype)

        allocator = kwargs.pop("allocator", None)
        omit_lists = kwargs.pop("omit_lists", [])
        wait_for = kwargs.pop("wait_for", None)
        if kwargs:
            raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

        for l in omit_lists:
            if not any(l == name for name, _ in self.list_names_and_dtypes):
                raise ValueError("invalid list name '%s' in omit_lists")

        result = {}
        count_list_args = []

        if wait_for is None:
            wait_for = []

        count_kernel = self.get_count_kernel(index_dtype)
        write_kernel = self.get_write_kernel(index_dtype)
        scan_kernel = self.get_scan_kernel(index_dtype)

        # {{{ allocate memory for counts

        for name, dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue
            if name in omit_lists:
                count_list_args.append(None)
                continue

            counts = cl.array.empty(queue,
                    (n_objects + 1), index_dtype, allocator=allocator)
            counts[-1] = 0
            wait_for = wait_for + counts.events

            # The scan will turn the "counts" array into the "starts" array
            # in-place.
            result[name] = BuiltList(starts=counts)
            count_list_args.append(counts.data)

        # }}}

        if self.debug:
            gsize = (1,)
            lsize = (1,)
        elif self.complex_kernel and queue.device.type == cl.device_type.CPU:
            gsize = (4*queue.device.max_compute_units,)
            lsize = (1,)
        else:
            from pyopencl.array import splay
            gsize, lsize = splay(queue, n_objects)

        count_event = count_kernel(queue, gsize, lsize,
                *(tuple(count_list_args) + args + (n_objects,)),
                **dict(wait_for=wait_for))

        # {{{ run scans

        scan_events = []

        for name, dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue
            if name in omit_lists:
                continue

            info_record = result[name]
            starts_ary = info_record.starts
            evt = scan_kernel(starts_ary, wait_for=[count_event],
                    size=n_objects)

            starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
            scan_events.extend(starts_ary.events)

            # retrieve count
            info_record.count = int(starts_ary[-1].get())

        # }}}

        # {{{ deal with count-sharing lists, allocate memory for lists

        write_list_args = []
        for name, dtype in self.list_names_and_dtypes:
            if name in omit_lists:
                write_list_args.append(None)
                if name not in self.count_sharing:
                    write_list_args.append(None)
                continue

            if name in self.count_sharing:
                sharing_from = self.count_sharing[name]

                info_record = result[name] = BuiltList(
                        count=result[sharing_from].count,
                        starts=result[sharing_from].starts,
                        )

            else:
                info_record = result[name]

            info_record.lists = cl.array.empty(queue,
                    info_record.count, dtype, allocator=allocator)
            write_list_args.append(info_record.lists.data)

            if name not in self.count_sharing:
                write_list_args.append(info_record.starts.data)

        # }}}

        evt = write_kernel(queue, gsize, lsize,
                *(tuple(write_list_args) + args + (n_objects,)),
                **dict(wait_for=scan_events))

        return result, evt
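Based on the docstring above, a hedged sketch of consuming the returned value
(the names builder, queue, and "mylist" are placeholders, not part of the API):

    result, evt = builder(queue, n_objects)
    info = result["mylist"]           # a BuiltList record
    starts = info.starts.get()        # n_objects + 1 entries
    flat = info.lists.get()           # all lists stored back to back

    # entries of the i-th list, exploiting that all lists are contiguous
    i = 0
    entries_i = flat[starts[i]:starts[i + 1]]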
Example #25
0
    def __init__(self, data, space=None):
        """
        :arg data: This may be one of the following:

            * a :class:`numpy.ndarray`, which will be turned into a grade-1
              multivector,
            * a mapping from tuples of basis indices (together indicating a blade,
              order matters and will be mapped to 'normalized' blades) to
              coefficients,
            * an array as described in :attr:`data`,
            * a scalar--where everything that doesn't fall into the above cases
              is viewed as a scalar.
        :arg space: A :class:`Space` instance. If *None* or an integer,
            :func:`get_euclidean_space` is called to obtain a default space with
            the right number of dimensions for *data*. Note: dimension guessing only
            works when a :class:`numpy.ndarray` is being passed for *data*.
        """

        dimensions = None

        if isinstance(data, np.ndarray):
            if len(data.shape) != 1:
                raise ValueError(
                    "only numpy vectors (not higher-rank objects) "
                    "are supported for 'data'")
            dimensions, = data.shape
            data = dict(((i, ), xi) for i, xi in enumerate(data))
        elif isinstance(data, dict):
            pass
        else:
            data = {0: data}

        if space is None:
            space = get_euclidean_space(dimensions)
        else:
            if dimensions is not None and space.dimensions != dimensions:
                raise ValueError(
                    "dimension count of 'space' does not match that of 'data'")

        # {{{ normalize data to bitmaps, if needed

        from pytools import single_valued
        from pymbolic.primitives import is_zero
        if data and single_valued(
                isinstance(k, tuple) for k in six.iterkeys(data)):
            # data is in non-normalized non-bits tuple form
            new_data = {}
            for basis_indices, coeff in six.iteritems(data):
                bits, sign = space.bits_and_sign(basis_indices)
                new_coeff = new_data.setdefault(bits, 0) + sign * coeff

                if is_zero(new_coeff):
                    del new_data[bits]
                else:
                    new_data[bits] = new_coeff

            data = new_data

        # }}}

        # assert that multivectors don't get nested
        from pytools import any
        assert not any(
            isinstance(coeff, MultiVector) for coeff in six.itervalues(data))

        self.space = space
        self.data = data
Example #26
0
    def do_not_vectorize(self):
        from pytools import any
        return (self.complex_kernel and any(dev.type & cl.device_type.CPU
                                            for dev in self.context.devices))
Example #27
0
    def do_not_vectorize(self):
        from pytools import any

        return self.complex_kernel and any(dev.type == cl.device_type.CPU for dev in self.context.devices)
Example #28
0
    def map_logical_or(self, expr):
        from pytools import any
        return any(self.rec(ch) for ch in expr.children)
Example #29
0
    def is_name_conflicting(self, name):
        from pytools import any
        return any(
                _is_var_name_conflicting(name, other_name)
                for other_name in self.existing_names)
Example #30
0
    def is_name_conflicting(self, name):
        from pytools import any
        return any(
            _is_var_name_conflicting(name, other_name)
            for other_name in self.existing_names)
Example #31
0
    def __init__(self, data, space=None):
        """
        :arg data: This may be one of the following:

            * a :class:`numpy.ndarray`, which will be turned into a grade-1
              multivector,
            * a mapping from tuples of basis indices (together indicating a blade,
              order matters and will be mapped to 'normalized' blades) to
              coefficients,
            * an array as described in :attr:`data`,
            * a scalar--where everything that doesn't fall into the above cases
              is viewed as a scalar.
        :arg space: A :class:`Space` instance. If *None* or an integer,
            :func:`get_euclidean_space` is called to obtain a default space with
            the right number of dimensions for *data*. Note: dimension guessing only
            works when a :class:`numpy.ndarray` is being passed for *data*.
        """

        dimensions = None

        if isinstance(data, np.ndarray):
            if len(data.shape) != 1:
                raise ValueError("only numpy vectors (not higher-rank objects) "
                        "are supported for 'data'")
            dimensions, = data.shape
            data = dict(
                    ((i,), xi) for i, xi in enumerate(data))
        elif isinstance(data, dict):
            pass
        else:
            data = {0: data}

        if space is None:
            space = get_euclidean_space(dimensions)
        else:
            if dimensions is not None and space.dimensions != dimensions:
                raise ValueError(
                        "dimension count of 'space' does not match that of 'data'")

        # {{{ normalize data to bitmaps, if needed

        from pytools import single_valued
        from pymbolic.primitives import is_zero
        if data and single_valued(isinstance(k, tuple) for k in six.iterkeys(data)):
            # data is in non-normalized non-bits tuple form
            new_data = {}
            for basis_indices, coeff in six.iteritems(data):
                bits, sign = space.bits_and_sign(basis_indices)
                new_coeff = new_data.setdefault(bits, 0) + sign*coeff

                if is_zero(new_coeff):
                    del new_data[bits]
                else:
                    new_data[bits] = new_coeff

            data = new_data

        # }}}

        # assert that multivectors don't get nested
        from pytools import any
        assert not any(isinstance(coeff, MultiVector)
                for coeff in six.itervalues(data))

        self.space = space
        self.data = data
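Hedged construction examples for the forms of *data* described in the docstring
above (MultiVector and get_euclidean_space are the names used in this code; the
specific values are illustrative only):

    import numpy as np

    space = get_euclidean_space(3)

    v = MultiVector(np.array([1., 2., 3.]))       # grade-1 vector, space inferred
    s = MultiVector(5.0, space=space)             # scalar multivector
    b = MultiVector({(0, 1): 2.0}, space=space)   # 2 * e0^e1, from basis indices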
Example #32
0
    def __call__(self, queue, n_objects, *args, **kwargs):
        """
        :arg args: arguments corresponding to arg_decls in the constructor.
            :class:`pyopencl.array.Array` are not allowed directly and should
            be passed as their :attr:`pyopencl.array.Array.data` attribute instead.
        :arg allocator: optionally, the allocator to use to allocate new
            arrays.
        :arg omit_lists: An iterable of list names that should *not* be built
            with this invocation. The kernel code may *not* call ``APPEND_name``
            for these omitted lists. If it does, undefined behavior will result.
            The returned *lists* dictionary will not contain an entry for names
            in *omit_lists*.
        :arg wait_for: |explain-waitfor|
        :returns: a tuple ``(lists, event)``, where
            *lists* is a mapping from (built) list names to objects which
            have attributes

            * ``count`` for the total number of entries in all lists combined
            * ``lists`` for the array containing all lists.
            * ``starts`` for the array of starting indices in `lists`.
              `starts` is built so that it has n+1 entries, so that
              the *i*'th entry is the start of the *i*'th list, and the
              *i*+1'th entry is the index one past the *i*'th list's end,
              even for the last list.

              This implies that all lists are contiguous.

              *event* is a :class:`pyopencl.Event` for dependency management.

        .. versionchanged:: 2016.2

            Added omit_lists.
        """
        if n_objects >= int(np.iinfo(np.int32).max):
            index_dtype = np.int64
        else:
            index_dtype = np.int32
        index_dtype = np.dtype(index_dtype)

        allocator = kwargs.pop("allocator", None)
        omit_lists = kwargs.pop("omit_lists", [])
        wait_for = kwargs.pop("wait_for", None)
        if kwargs:
            raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

        for oml in omit_lists:
            if not any(oml == name for name, _ in self.list_names_and_dtypes):
                raise ValueError("invalid list name '%s' in omit_lists")

        result = {}
        count_list_args = []

        if wait_for is None:
            wait_for = []

        count_kernel = self.get_count_kernel(index_dtype)
        write_kernel = self.get_write_kernel(index_dtype)
        scan_kernel = self.get_scan_kernel(index_dtype)

        # {{{ allocate memory for counts

        for name, dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue
            if name in omit_lists:
                count_list_args.append(None)
                continue

            counts = cl.array.empty(queue,
                    (n_objects + 1), index_dtype, allocator=allocator)
            counts[-1] = 0
            wait_for = wait_for + counts.events

            # The scan will turn the "counts" array into the "starts" array
            # in-place.
            result[name] = BuiltList(starts=counts)
            count_list_args.append(counts.data)

        # }}}

        if self.debug:
            gsize = (1,)
            lsize = (1,)
        elif self.do_not_vectorize():
            gsize = (4*queue.device.max_compute_units,)
            lsize = (1,)
        else:
            from pyopencl.array import splay
            gsize, lsize = splay(queue, n_objects)

        count_event = count_kernel(queue, gsize, lsize,
                *(tuple(count_list_args) + args + (n_objects,)),
                **dict(wait_for=wait_for))

        # {{{ run scans

        scan_events = []

        for name, dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue
            if name in omit_lists:
                continue

            info_record = result[name]
            starts_ary = info_record.starts
            evt = scan_kernel(starts_ary, wait_for=[count_event],
                    size=n_objects)

            starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
            scan_events.extend(starts_ary.events)

            # retrieve count
            info_record.count = int(starts_ary[-1].get())

        # }}}

        # {{{ deal with count-sharing lists, allocate memory for lists

        write_list_args = []
        for name, dtype in self.list_names_and_dtypes:
            if name in omit_lists:
                write_list_args.append(None)
                if name not in self.count_sharing:
                    write_list_args.append(None)
                continue

            if name in self.count_sharing:
                sharing_from = self.count_sharing[name]

                info_record = result[name] = BuiltList(
                        count=result[sharing_from].count,
                        starts=result[sharing_from].starts,
                        )

            else:
                info_record = result[name]

            info_record.lists = cl.array.empty(queue,
                    info_record.count, dtype, allocator=allocator)
            write_list_args.append(info_record.lists.data)

            if name not in self.count_sharing:
                write_list_args.append(info_record.starts.data)

        # }}}

        evt = write_kernel(queue, gsize, lsize,
                *(tuple(write_list_args) + args + (n_objects,)),
                **dict(wait_for=scan_events))

        return result, evt
Example #33
0
def match_dtype_to_c_struct(device, name, dtype, context=None):
    """Return a tuple `(dtype, c_decl)` such that the C struct declaration
    in `c_decl` and the structure :class:`numpy.dtype` instance `dtype`
    have the same memory layout.

    Note that *dtype* may be modified from the value that was passed in,
    for example to insert padding.

    (As a remark on implementation, this routine runs a small kernel on
    the given *device* to ensure that :mod:`numpy` and C offsets and
    sizes match.)

    .. versionadded:: 2013.1

    This example explains the use of this function::

        >>> import numpy as np
        >>> import pyopencl as cl
        >>> import pyopencl.tools
        >>> ctx = cl.create_some_context()
        >>> dtype = np.dtype([("id", np.uint32), ("value", np.float32)])
        >>> dtype, c_decl = pyopencl.tools.match_dtype_to_c_struct(
        ...     ctx.devices[0], 'id_val', dtype)
        >>> print(c_decl)
        typedef struct {
          unsigned id;
          float value;
        } id_val;
        >>> print(dtype)
        [('id', '<u4'), ('value', '<f4')]
        >>> cl.tools.get_or_register_dtype('id_val', dtype)

    As this example shows, it is important to call
    :func:`get_or_register_dtype` on the modified `dtype` returned by this
    function, not the original one.
    """

    fields = sorted(six.iteritems(dtype.fields),
                    key=lambda name_dtype_offset: name_dtype_offset[1][1])

    c_fields = []
    for field_name, dtype_and_offset in fields:
        field_dtype, offset = dtype_and_offset[:2]
        c_fields.append("  %s %s;" % (dtype_to_ctype(field_dtype), field_name))

    c_decl = "typedef struct {\n%s\n} %s;\n\n" % ("\n".join(c_fields), name)

    cdl = _CDeclList(device)
    for field_name, dtype_and_offset in fields:
        field_dtype, offset = dtype_and_offset[:2]
        cdl.add_dtype(field_dtype)

    pre_decls = cdl.get_declarations()

    offset_code = "\n".join("result[%d] = pycl_offsetof(%s, %s);" %
                            (i + 1, name, field_name)
                            for i, (field_name, _) in enumerate(fields))

    src = r"""
        #define pycl_offsetof(st, m) \
                 ((size_t) ((__local char *) &(dummy.m) \
                 - (__local char *)&dummy ))

        %(pre_decls)s

        %(my_decl)s

        __kernel void get_size_and_offsets(__global size_t *result)
        {
            result[0] = sizeof(%(my_type)s);
            __local %(my_type)s dummy;
            %(offset_code)s
        }
    """ % dict(pre_decls=pre_decls,
               my_decl=c_decl,
               my_type=name,
               offset_code=offset_code)

    if context is None:
        context = cl.Context([device])

    queue = cl.CommandQueue(context)

    prg = cl.Program(context, src)
    knl = prg.build(devices=[device]).get_size_and_offsets

    import pyopencl.array  # noqa
    result_buf = cl.array.empty(queue, 1 + len(fields), np.uintp)
    knl(queue, (1, ), (1, ), result_buf.data)
    queue.finish()
    size_and_offsets = result_buf.get()

    size = int(size_and_offsets[0])

    from pytools import any
    offsets = size_and_offsets[1:]
    if any(ofs >= size for ofs in offsets):
        # offsets not plausible

        if dtype.itemsize == size:
            # If sizes match, use numpy's idea of the offsets.
            offsets = [
                dtype_and_offset[1] for field_name, dtype_and_offset in fields
            ]
        else:
            raise RuntimeError(
                "OpenCL compiler reported offsetof() past sizeof() "
                "for struct layout on '%s'. "
                "This makes no sense, and it's usually indicates a "
                "compiler bug. "
                "Refusing to discover struct layout." % device)

    result_buf.data.release()
    del knl
    del prg
    del queue
    del context

    try:
        dtype_arg_dict = {
            'names':
            [field_name for field_name, (field_dtype, offset) in fields],
            'formats':
            [field_dtype for field_name, (field_dtype, offset) in fields],
            'offsets': [int(x) for x in offsets],
            'itemsize':
            int(size_and_offsets[0]),
        }
        dtype = np.dtype(dtype_arg_dict)
        if dtype.itemsize != size_and_offsets[0]:
            # "Old" versions of numpy (1.6.x?) silently ignore "itemsize". Boo.
            dtype_arg_dict["names"].append("_pycl_size_fixer")
            dtype_arg_dict["formats"].append(np.uint8)
            dtype_arg_dict["offsets"].append(int(size_and_offsets[0]) - 1)
            dtype = np.dtype(dtype_arg_dict)
    except NotImplementedError:

        def calc_field_type():
            total_size = 0
            padding_count = 0
            for offset, (field_name, (field_dtype, _)) in zip(offsets, fields):
                if offset > total_size:
                    padding_count += 1
                    yield ('__pycl_padding%d' % padding_count,
                           'V%d' % (offset - total_size))
                yield field_name, field_dtype
                total_size = field_dtype.itemsize + offset

        dtype = np.dtype(list(calc_field_type()))

    assert dtype.itemsize == size_and_offsets[0]

    return dtype, c_decl
Example #34
0
    def aggregate_assignments(self, instructions, result):
        from pymbolic.primitives import Variable

        # aggregation helpers -------------------------------------------------
        def get_complete_origins_set(insn, skip_levels=0):
            if skip_levels < 0:
                skip_levels = 0

            result = set()
            for dep in insn.get_dependencies():
                if isinstance(dep, Variable):
                    dep_origin = origins_map.get(dep.name, None)
                    if dep_origin is not None:
                        if skip_levels <= 0:
                            result.add(dep_origin)
                        result |= get_complete_origins_set(
                                dep_origin, skip_levels-1)

            return result

        var_assignees_cache = {}
        def get_var_assignees(insn):
            try:
                return var_assignees_cache[insn]
            except KeyError:
                result = set(Variable(assignee)
                        for assignee in insn.get_assignees())
                var_assignees_cache[insn] = result
                return result

        def aggregate_two_assignments(ass_1, ass_2):
            names = ass_1.names + ass_2.names

            from pymbolic.primitives import Variable
            deps = (ass_1.get_dependencies() | ass_2.get_dependencies()) \
                    - set(Variable(name) for name in names)

            return Assign(
                    names=names, exprs=ass_1.exprs + ass_2.exprs,
                    _dependencies=deps,
                    dep_mapper_factory=self.dep_mapper_factory,
                    priority=max(ass_1.priority, ass_2.priority))

        # main aggregation pass -----------------------------------------------
        origins_map = dict(
                    (assignee, insn)
                    for insn in instructions
                    for assignee in insn.get_assignees())

        from pytools import partition
        unprocessed_assigns, other_insns = partition(
                lambda insn: isinstance(insn, Assign),
                instructions)

        # filter out zero-flop-count assigns--no need to bother with those
        processed_assigns, unprocessed_assigns = partition(
                lambda ass: ass.flop_count() == 0,
                unprocessed_assigns)

        # filter out zero assignments
        from pytools import any
        from hedge.tools import is_zero

        i = 0

        while i < len(unprocessed_assigns):
            my_assign = unprocessed_assigns[i]
            if any(is_zero(expr) for expr in my_assign.exprs):
                processed_assigns.append(unprocessed_assigns.pop(i))
            else:
                i += 1

        # greedy aggregation
        while unprocessed_assigns:
            my_assign = unprocessed_assigns.pop()

            my_deps = my_assign.get_dependencies()
            my_assignees = get_var_assignees(my_assign)

            agg_candidates = []
            for i, other_assign in enumerate(unprocessed_assigns):
                other_deps = other_assign.get_dependencies()
                other_assignees = get_var_assignees(other_assign)

                if ((my_deps & other_deps
                        or my_deps & other_assignees
                        or other_deps & my_assignees)
                        and my_assign.priority == other_assign.priority):
                    agg_candidates.append((i, other_assign))

            did_work = False

            if agg_candidates:
                my_indirect_origins = get_complete_origins_set(
                        my_assign, skip_levels=1)

                for other_assign_index, other_assign in agg_candidates:
                    if self.max_vectors_in_batch_expr is not None:
                        new_assignee_count = len(
                                set(my_assign.get_assignees())
                                | set(other_assign.get_assignees()))
                        new_dep_count = len(
                                my_assign.get_dependencies(
                                    each_vector=True)
                                | other_assign.get_dependencies(
                                    each_vector=True))

                        if (new_assignee_count + new_dep_count \
                                > self.max_vectors_in_batch_expr):
                            continue

                    other_indirect_origins = get_complete_origins_set(
                            other_assign, skip_levels=1)

                    if (my_assign not in other_indirect_origins and
                            other_assign not in my_indirect_origins):
                        did_work = True

                        # aggregate the two assignments
                        new_assignment = aggregate_two_assignments(
                                my_assign, other_assign)
                        del unprocessed_assigns[other_assign_index]
                        unprocessed_assigns.append(new_assignment)
                        for assignee in new_assignment.get_assignees():
                            origins_map[assignee] = new_assignment

                        break

            if not did_work:
                processed_assigns.append(my_assign)

        externally_used_names = set(
                expr
                for insn in processed_assigns + other_insns
                for expr in insn.get_dependencies())

        from hedge.tools import is_obj_array
        if is_obj_array(result):
            externally_used_names |= set(expr for expr in result)
        else:
            externally_used_names |= set([result])

        def schedule_and_finalize_assignment(ass):
            dep_mapper = self.dep_mapper_factory()

            names_exprs = zip(ass.names, ass.exprs)

            my_assignees = set(name for name, expr in names_exprs)
            names_exprs_deps = [
                    (name, expr,
                        set(dep.name for dep in dep_mapper(expr) if
                            isinstance(dep, Variable)) & my_assignees)
                    for name, expr in names_exprs]

            ordered_names_exprs = []
            available_names = set()

            while names_exprs_deps:
                schedulable = []

                i = 0
                while i < len(names_exprs_deps):
                    name, expr, deps = names_exprs_deps[i]

                    unsatisfied_deps = deps - available_names

                    if not unsatisfied_deps:
                        schedulable.append((str(expr), name, expr))
                        del names_exprs_deps[i]
                    else:
                        i += 1

                # make sure these come out in a constant order
                schedulable.sort()

                if schedulable:
                    for key, name, expr in schedulable:
                        ordered_names_exprs.append((name, expr))
                        available_names.add(name)
                else:
                    raise RuntimeError("aggregation resulted in an "
                            "impossible assignment")

            return self.finalize_multi_assign(
                    names=[name for name, expr in ordered_names_exprs],
                    exprs=[expr for name, expr in ordered_names_exprs],
                    do_not_return=[Variable(name) not in externally_used_names
                        for name, expr in ordered_names_exprs],
                    priority=ass.priority)

        return [schedule_and_finalize_assignment(ass)
            for ass in processed_assigns] + other_insns
Example #35
0
    def __init__(self, context, list_names_and_dtypes, generate_template,
            arg_decls, count_sharing=None, devices=None,
            name_prefix="plb_build_list", options=[], preamble="",
            debug=False, complex_kernel=False,
            eliminate_empty_output_lists=[]):
        """
        :arg context: A :class:`pyopencl.Context`.
        :arg list_names_and_dtypes: a list of `(name, dtype)` tuples
            indicating the lists to be built.
        :arg generate_template: a snippet of C as described below
        :arg arg_decls: A string of comma-separated C argument declarations.
        :arg count_sharing: A mapping consisting of `(child, mother)`
            indicating that `mother` and `child` will always have the
            same number of indices, and the `APPEND` to `mother`
            will always happen *before* the `APPEND` to the child.
        :arg name_prefix: the name prefix to use for the compiled kernels
        :arg options: OpenCL compilation options for kernels using
            *generate_template*.
        :arg complex_kernel: If `True`, prevents vectorization on CPUs.
        :arg eliminate_empty_output_lists: A Python list of list names
            for which the empty output lists are eliminated.

        *generate_template* may use the following C macros/identifiers:

        * `index_type`: expands to C identifier for the index type used
          for the calculation
        * `USER_ARG_DECL`: expands to the C declarator for `arg_decls`
        * `USER_ARGS`: a list of C argument values corresponding to
          `USER_ARG_DECL`
        * `LIST_ARG_DECL`: expands to a C argument list representing the
          data for the output lists. These are prefixed with
          `"plg_"` so as to not interfere with user-provided names.
        * `LIST_ARGS`: a list of C argument values corresponding to
          `LIST_ARG_DECL`
        * `APPEND_name(entry)`: inserts `entry` into the list `name`.
          *entry* must be a valid C expression of the correct type.

        All argument-list related macros have a trailing comma included
        if they are non-empty.

        *generate_template* must supply a function:

        .. code-block:: c

            void generate(USER_ARG_DECL LIST_ARG_DECL index_type i)
            {
                APPEND_mylist(5);
            }

        Internally, the *generate_template* is expanded (at least) twice. Once,
        for a 'counting' stage where the size of all the lists is determined,
        and a second time, for a 'generation' stage where the lists are
        actually filled. A `generate` function that has side effects beyond
        calling `append` is therefore ill-formed.

        .. versionchanged:: 2018.1

            Change *eliminate_empty_output_lists* argument type from `bool` to
            `list`.
        """

        if devices is None:
            devices = context.devices

        if count_sharing is None:
            count_sharing = {}

        self.context = context
        self.devices = devices

        self.list_names_and_dtypes = list_names_and_dtypes
        self.generate_template = generate_template

        from pyopencl.tools import parse_arg_list
        self.arg_decls = parse_arg_list(arg_decls)

        # To match with the signature of the user-supplied generate(), arguments
        # can't appear to have offsets.
        arg_decls_no_offset = []
        from pyopencl.tools import VectorArg
        for arg in self.arg_decls:
            if isinstance(arg, VectorArg) and arg.with_offset:
                arg = VectorArg(arg.dtype, arg.name)
            arg_decls_no_offset.append(arg)
        self.arg_decls_no_offset = arg_decls_no_offset

        self.count_sharing = count_sharing

        self.name_prefix = name_prefix
        self.preamble = preamble
        self.options = options

        self.debug = debug

        self.complex_kernel = complex_kernel

        if eliminate_empty_output_lists is True:
            eliminate_empty_output_lists = \
                    [name for name, _ in self.list_names_and_dtypes]

        if eliminate_empty_output_lists is False:
            eliminate_empty_output_lists = []

        self.eliminate_empty_output_lists = eliminate_empty_output_lists
        for list_name in self.eliminate_empty_output_lists:
            if not any(list_name == name for name, _ in self.list_names_and_dtypes):
                raise ValueError(
                    "invalid list name '%s' in eliminate_empty_output_lists"
                    % list_name)
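A hedged end-to-end sketch of using this builder, following the docstring above
(the list name "mylist" and the simple generate() body are illustrative; the
context and queue objects are assumed to already exist):

    import numpy as np

    builder = ListOfListsBuilder(context, [("mylist", np.int32)], """
            void generate(USER_ARG_DECL LIST_ARG_DECL index_type i)
            {
                int count = i % 4;
                for (int j = 0; j < count; ++j)
                {
                    APPEND_mylist(count);
                }
            }
            """, arg_decls=[])

    result, evt = builder(queue, 2000)
    mylist = result["mylist"]
    # mylist.count, mylist.starts, mylist.lists as described for __call__ above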
Example #36
0
    def __call__(self, queue, n_objects, *args, **kwargs):
        """
        :arg args: arguments corresponding to arg_decls in the constructor.
            Array-like arguments must be either
            1D :class:`pyopencl.array.Array` objects or
            :class:`pyopencl.MemoryObject` objects, of which the latter
            can be obtained from a :class:`pyopencl.array.Array` using the
            :attr:`pyopencl.array.Array.data` attribute.
        :arg allocator: optionally, the allocator to use to allocate new
            arrays.
        :arg omit_lists: An iterable of list names that should *not* be built
            with this invocation. The kernel code may *not* call ``APPEND_name``
            for these omitted lists. If it does, undefined behavior will result.
            The returned *lists* dictionary will not contain an entry for names
            in *omit_lists*.
        :arg wait_for: |explain-waitfor|
        :returns: a tuple ``(lists, event)``, where
            *lists* is a mapping from (built) list names to objects which
            have attributes

            * ``count`` for the total number of entries in all lists combined
            * ``lists`` for the array containing all lists.
            * ``starts`` for the array of starting indices in `lists`.
              `starts` is built so that it has n+1 entries, so that
              the *i*'th entry is the start of the *i*'th list, and the
              *i*+1'th entry is the index one past the *i*'th list's end,
              even for the last list.

              This implies that all lists are contiguous.

            If the list name is specified in *eliminate_empty_output_lists*
            constructor argument, *lists* has two additional attributes
            ``num_nonempty_lists`` and ``nonempty_indices``

            * ``num_nonempty_lists`` for the number of nonempty lists.
            * ``nonempty_indices`` for the index of nonempty list in input objects.

            In this case, `starts` has `num_nonempty_lists` + 1 entries. The *i*'th
            entry is the start of the *i*'th nonempty list, which is generated by
            the object with index *nonempty_indices[i]*.

            *event* is a :class:`pyopencl.Event` for dependency management.

        .. versionchanged:: 2016.2

            Added omit_lists.
        """
        if n_objects >= int(np.iinfo(np.int32).max):
            index_dtype = np.int64
        else:
            index_dtype = np.int32
        index_dtype = np.dtype(index_dtype)

        allocator = kwargs.pop("allocator", None)
        omit_lists = kwargs.pop("omit_lists", [])
        wait_for = kwargs.pop("wait_for", None)
        if kwargs:
            raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

        for oml in omit_lists:
            if not any(oml == name for name, _ in self.list_names_and_dtypes):
                raise ValueError("invalid list name '%s' in omit_lists")

        result = {}
        count_list_args = []

        if wait_for is None:
            wait_for = []
        else:
            # We'll be modifying it below.
            wait_for = list(wait_for)

        count_kernel = self.get_count_kernel(index_dtype)
        write_kernel = self.get_write_kernel(index_dtype)
        scan_kernel = self.get_scan_kernel(index_dtype)
        if self.eliminate_empty_output_lists:
            compress_kernel = self.get_compress_kernel(index_dtype)

        data_args = []
        for i, (arg_descr, arg_val) in enumerate(zip(self.arg_decls, args)):
            from pyopencl.tools import VectorArg
            if isinstance(arg_descr, VectorArg):
                from pyopencl import MemoryObject
                if isinstance(arg_val, MemoryObject):
                    data_args.append(arg_val)
                    if arg_descr.with_offset:
                        raise ValueError(
                                "with_offset=True specified for argument %d "
                                "but the argument is not an array" % i)
                    continue

                if arg_val.ndim != 1:
                    raise ValueError("argument %d is a multidimensional array" % i)

                data_args.append(arg_val.base_data)
                if arg_descr.with_offset:
                    data_args.append(arg_val.offset)
                wait_for.extend(arg_val.events)
            else:
                data_args.append(arg_val)

        del args
        data_args = tuple(data_args)

        # {{{ allocate memory for counts

        for name, dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue
            if name in omit_lists:
                count_list_args.append(None)
                continue

            counts = cl.array.empty(queue,
                    (n_objects + 1), index_dtype, allocator=allocator)
            counts[-1] = 0
            wait_for = wait_for + counts.events

            # The scan will turn the "counts" array into the "starts" array
            # in-place.
            if name in self.eliminate_empty_output_lists:
                result[name] = BuiltList(count=None, starts=counts, lists=None,
                                         num_nonempty_lists=None,
                                         nonempty_indices=None)
            else:
                result[name] = BuiltList(count=None, starts=counts, lists=None)
            count_list_args.append(counts.data)

        # }}}

        if self.debug:
            gsize = (1,)
            lsize = (1,)
        elif self.do_not_vectorize():
            gsize = (4*queue.device.max_compute_units,)
            lsize = (1,)
        else:
            from pyopencl.array import splay
            gsize, lsize = splay(queue, n_objects)

        count_event = count_kernel(queue, gsize, lsize,
                *(tuple(count_list_args) + data_args + (n_objects,)),
                wait_for=wait_for)

        compress_events = {}
        for name, dtype in self.list_names_and_dtypes:
            if name in omit_lists:
                continue
            if name in self.count_sharing:
                continue
            if name not in self.eliminate_empty_output_lists:
                continue

            compressed_counts = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
            info_record = result[name]
            info_record.nonempty_indices = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
            info_record.num_nonempty_lists = cl.array.empty(
                queue, (1,), index_dtype, allocator=allocator)
            info_record.compressed_indices = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
            info_record.compressed_indices[0] = 0
            compress_events[name] = compress_kernel(
                info_record.starts,
                compressed_counts,
                info_record.nonempty_indices,
                info_record.compressed_indices,
                info_record.num_nonempty_lists,
                wait_for=[count_event] + info_record.compressed_indices.events)

            info_record.starts = compressed_counts

        # {{{ run scans

        scan_events = []

        for name, dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue
            if name in omit_lists:
                continue

            info_record = result[name]
            if name in self.eliminate_empty_output_lists:
                compress_events[name].wait()
                num_nonempty_lists = info_record.num_nonempty_lists.get()[0]
                info_record.num_nonempty_lists = num_nonempty_lists
                info_record.starts = info_record.starts[:num_nonempty_lists + 1]
                info_record.nonempty_indices = \
                    info_record.nonempty_indices[:num_nonempty_lists]
                info_record.starts[-1] = 0

            starts_ary = info_record.starts
            if name in self.eliminate_empty_output_lists:
                evt = scan_kernel(
                        starts_ary,
                        size=info_record.num_nonempty_lists,
                        wait_for=starts_ary.events)
            else:
                evt = scan_kernel(starts_ary, wait_for=[count_event],
                        size=n_objects)

            starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
            scan_events.extend(starts_ary.events)

            # retrieve count
            info_record.count = int(starts_ary[-1].get())

        # }}}

        # {{{ deal with count-sharing lists, allocate memory for lists

        write_list_args = []
        for name, dtype in self.list_names_and_dtypes:
            if name in omit_lists:
                write_list_args.append(None)
                if name not in self.count_sharing:
                    write_list_args.append(None)
                if name in self.eliminate_empty_output_lists:
                    write_list_args.append(None)
                continue

            if name in self.count_sharing:
                sharing_from = self.count_sharing[name]

                info_record = result[name] = BuiltList(
                        count=result[sharing_from].count,
                        starts=result[sharing_from].starts,
                        )

            else:
                info_record = result[name]

            info_record.lists = cl.array.empty(queue,
                    info_record.count, dtype, allocator=allocator)
            write_list_args.append(info_record.lists.data)

            if name not in self.count_sharing:
                write_list_args.append(info_record.starts.data)

            if name in self.eliminate_empty_output_lists:
                write_list_args.append(info_record.compressed_indices.data)

        # }}}

        evt = write_kernel(queue, gsize, lsize,
                *(tuple(write_list_args) + data_args + (n_objects,)),
                wait_for=scan_events)

        return result, evt
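
The ``starts``/``lists`` layout described in the docstring above can be illustrated with plain NumPy arrays. This is a hedged, toy sketch (not part of the source above): the concrete values are made up and no OpenCL objects are involved; it only demonstrates that list *i* occupies the half-open range ``[starts[i], starts[i+1])`` of ``lists``.

import numpy as np

# Three toy lists -- [10, 11], [], [20, 21, 22] -- stored contiguously,
# the way __call__ above returns them.
lists = np.array([10, 11, 20, 21, 22], dtype=np.int32)
starts = np.array([0, 2, 2, 5], dtype=np.int32)  # n + 1 entries for n lists


def get_list(i):
    # List i occupies the half-open index range [starts[i], starts[i+1]).
    return lists[starts[i]:starts[i + 1]]


assert list(get_list(0)) == [10, 11]
assert list(get_list(1)) == []            # empty list: start == end
assert list(get_list(2)) == [20, 21, 22]
assert starts[-1] == len(lists)           # total entry count across all lists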
Example #37
0
def match_dtype_to_c_struct(device, name, dtype, context=None):
    """Return a tuple `(dtype, c_decl)` such that the C struct declaration
    in `c_decl` and the structure :class:`numpy.dtype` instance `dtype`
    have the same memory layout.

    Note that *dtype* may be modified from the value that was passed in,
    for example to insert padding.

    (As a remark on implementation, this routine runs a small kernel on
    the given *device* to ensure that :mod:`numpy` and C offsets and
    sizes match.)

    .. versionadded:: 2013.1

    This example explains the use of this function::

        >>> import numpy as np
        >>> import pyopencl as cl
        >>> import pyopencl.tools
        >>> ctx = cl.create_some_context()
        >>> dtype = np.dtype([("id", np.uint32), ("value", np.float32)])
        >>> dtype, c_decl = pyopencl.tools.match_dtype_to_c_struct(
        ...     ctx.devices[0], 'id_val', dtype)
        >>> print(c_decl)
        typedef struct {
          unsigned id;
          float value;
        } id_val;
        >>> print(dtype)
        [('id', '<u4'), ('value', '<f4')]
        >>> cl.tools.get_or_register_dtype('id_val', dtype)

    As this example shows, it is important to call
    :func:`get_or_register_dtype` on the modified `dtype` returned by this
    function, not the original one.
    """

    fields = sorted(dtype.fields.items(),
            key=lambda name_dtype_offset: name_dtype_offset[1][1])

    c_fields = []
    for field_name, (field_dtype, offset) in fields:
        c_fields.append("  %s %s;" % (dtype_to_ctype(field_dtype), field_name))

    c_decl = "typedef struct {\n%s\n} %s;\n\n" % (
            "\n".join(c_fields),
            name)

    cdl = _CDeclList(device)
    for field_name, (field_dtype, offset) in fields:
        cdl.add_dtype(field_dtype)

    pre_decls = cdl.get_declarations()

    offset_code = "\n".join(
            "result[%d] = pycl_offsetof(%s, %s);" % (i+1, name, field_name)
            for i, (field_name, (field_dtype, offset)) in enumerate(fields))

    src = r"""
        #define pycl_offsetof(st, m) \
                 ((size_t) ((__local char *) &(dummy.m) \
                 - (__local char *)&dummy ))

        %(pre_decls)s

        %(my_decl)s

        __kernel void get_size_and_offsets(__global size_t *result)
        {
            result[0] = sizeof(%(my_type)s);
            __local %(my_type)s dummy;
            %(offset_code)s
        }
    """ % dict(
            pre_decls=pre_decls,
            my_decl=c_decl,
            my_type=name,
            offset_code=offset_code)

    if context is None:
        context = cl.Context([device])

    queue = cl.CommandQueue(context)

    prg = cl.Program(context, src)
    knl = prg.build(devices=[device]).get_size_and_offsets

    import pyopencl.array  # noqa
    result_buf = cl.array.empty(queue, 1+len(fields), np.uintp)
    knl(queue, (1,), (1,), result_buf.data)
    queue.finish()
    size_and_offsets = result_buf.get()

    size = int(size_and_offsets[0])

    from pytools import any
    offsets = size_and_offsets[1:]
    if any(ofs >= size for ofs in offsets):
        # offsets not plausible

        if dtype.itemsize == size:
            # If sizes match, use numpy's idea of the offsets.
            offsets = [offset
                    for field_name, (field_dtype, offset) in fields]
        else:
            raise RuntimeError(
                    "cannot discover struct layout on '%s'" % device)

    result_buf.data.release()
    del knl
    del prg
    del queue
    del context

    dtype_arg_dict = dict(
            names=[field_name for field_name, (field_dtype, offset) in fields],
            formats=[field_dtype
                for field_name, (field_dtype, offset) in fields],
            offsets=[int(x) for x in offsets],
            itemsize=int(size_and_offsets[0]),
            )
    dtype = np.dtype(dtype_arg_dict)

    if dtype.itemsize != size_and_offsets[0]:
        # "Old" versions of numpy (1.6.x?) silently ignore "itemsize". Boo.
        dtype_arg_dict["names"].append("_pycl_size_fixer")
        dtype_arg_dict["formats"].append(np.uint8)
        dtype_arg_dict["offsets"].append(int(size_and_offsets[0])-1)
        dtype = np.dtype(dtype_arg_dict)

    assert dtype.itemsize == size_and_offsets[0]

    return dtype, c_decl
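
As a hedged sketch of the workflow above: a struct mixing a 4-byte and an 8-byte field is a typical case where numpy's packed layout and the device compiler's aligned layout disagree, which is what match_dtype_to_c_struct reconciles. The struct name ``xt_pair`` is made up for illustration, and the exact device-side padding may vary.

import numpy as np
import pyopencl as cl
import pyopencl.tools

ctx = cl.create_some_context()

# numpy packs this as 12 bytes: "x" at offset 0, "t" at offset 4.
# Most C compilers will instead align the double to an 8-byte boundary.
packed = np.dtype([("x", np.float32), ("t", np.float64)])

fixed_dtype, c_decl = cl.tools.match_dtype_to_c_struct(
        ctx.devices[0], "xt_pair", packed)

# Register the *returned* dtype, not the original packed one.
cl.tools.get_or_register_dtype("xt_pair", fixed_dtype)

print(c_decl)                # C declaration matching the device layout
print(fixed_dtype.itemsize)  # typically larger than packed.itemsize == 12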
Example #38
0
    def __call__(self, queue, n_objects, *args, **kwargs):
        """
        :arg args: arguments corresponding to arg_decls in the constructor.
            Array-like arguments must be either
            1D :class:`pyopencl.array.Array` objects or
            :class:`pyopencl.MemoryObject` objects, of which the latter
            can be obtained from a :class:`pyopencl.array.Array` using the
            :attr:`pyopencl.array.Array.data` attribute.
        :arg allocator: optionally, the allocator to use to allocate new
            arrays.
        :arg omit_lists: An iterable of list names that should *not* be built
            with this invocation. The kernel code may *not* call ``APPEND_name``
            for these omitted lists. If it does, undefined behavior will result.
            The returned *lists* dictionary will not contain an entry for names
            in *omit_lists*.
        :arg wait_for: |explain-waitfor|
        :returns: a tuple ``(lists, event)``, where
            *lists* is a mapping from (built) list names to objects which
            have attributes

            * ``count`` for the total number of entries in all lists combined
            * ``lists`` for the array containing all lists.
            * ``starts`` for the array of starting indices in `lists`.
              `starts` is built so that it has n+1 entries: the
              *i*'th entry is the start of the *i*'th list, and the
              *i*+1'th entry is the index one past the *i*'th list's end,
              even for the last list.

              This implies that all lists are contiguous.

            If a list name is specified in the *eliminate_empty_output_lists*
            constructor argument, its entry in *lists* has two additional
            attributes, ``num_nonempty_lists`` and ``nonempty_indices``:

            * ``num_nonempty_lists`` for the number of nonempty lists.
            * ``nonempty_indices`` for the indices of the nonempty lists
              among the input objects.

            In this case, `starts` has `num_nonempty_lists` + 1 entries. The *i*'th
            entry is the start of the *i*'th nonempty list, which is generated by
            the object with index *nonempty_indices[i]*.

            *event* is a :class:`pyopencl.Event` for dependency management.

        .. versionchanged:: 2016.2

            Added omit_lists.
        """
        if n_objects >= int(np.iinfo(np.int32).max):
            index_dtype = np.int64
        else:
            index_dtype = np.int32
        index_dtype = np.dtype(index_dtype)

        allocator = kwargs.pop("allocator", None)
        omit_lists = kwargs.pop("omit_lists", [])
        wait_for = kwargs.pop("wait_for", None)
        if kwargs:
            raise TypeError("invalid keyword arguments: '%s'" % ", ".join(kwargs))

        for oml in omit_lists:
            if not any(oml == name for name, _ in self.list_names_and_dtypes):
                raise ValueError("invalid list name '%s' in omit_lists")

        result = {}
        count_list_args = []

        if wait_for is None:
            wait_for = []

        count_kernel = self.get_count_kernel(index_dtype)
        write_kernel = self.get_write_kernel(index_dtype)
        scan_kernel = self.get_scan_kernel(index_dtype)
        if self.eliminate_empty_output_lists:
            compress_kernel = self.get_compress_kernel(index_dtype)

        data_args = []
        for i, (arg_descr, arg_val) in enumerate(zip(self.arg_decls, args)):
            from pyopencl.tools import VectorArg
            if isinstance(arg_descr, VectorArg):
                from pyopencl import MemoryObject
                if isinstance(arg_val, MemoryObject):
                    data_args.append(arg_val)
                    if arg_descr.with_offset:
                        raise ValueError(
                                "with_offset=True specified for argument %d "
                                "but the argument is not an array" % i)
                    continue

                if arg_val.ndim != 1:
                    raise ValueError("argument %d is a multidimensional array" % i)

                data_args.append(arg_val.base_data)
                if arg_descr.with_offset:
                    data_args.append(arg_val.offset)
            else:
                data_args.append(arg_val)

        del args
        data_args = tuple(data_args)

        # {{{ allocate memory for counts

        for name, dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue
            if name in omit_lists:
                count_list_args.append(None)
                continue

            counts = cl.array.empty(queue,
                    (n_objects + 1), index_dtype, allocator=allocator)
            counts[-1] = 0
            wait_for = wait_for + counts.events

            # The scan will turn the "counts" array into the "starts" array
            # in-place.
            if name in self.eliminate_empty_output_lists:
                result[name] = BuiltList(count=None, starts=counts, lists=None,
                                         num_nonempty_lists=None,
                                         nonempty_indices=None)
            else:
                result[name] = BuiltList(count=None, starts=counts, lists=None)
            count_list_args.append(counts.data)

        # }}}

        if self.debug:
            gsize = (1,)
            lsize = (1,)
        elif self.do_not_vectorize():
            gsize = (4*queue.device.max_compute_units,)
            lsize = (1,)
        else:
            from pyopencl.array import splay
            gsize, lsize = splay(queue, n_objects)

        count_event = count_kernel(queue, gsize, lsize,
                *(tuple(count_list_args) + data_args + (n_objects,)),
                wait_for=wait_for)

        compress_events = {}
        for name, dtype in self.list_names_and_dtypes:
            if name in omit_lists:
                continue
            if name in self.count_sharing:
                continue
            if name not in self.eliminate_empty_output_lists:
                continue

            compressed_counts = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
            info_record = result[name]
            info_record.nonempty_indices = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
            info_record.num_nonempty_lists = cl.array.empty(
                queue, (1,), index_dtype, allocator=allocator)
            info_record.compressed_indices = cl.array.empty(
                queue, (n_objects + 1,), index_dtype, allocator=allocator)
            info_record.compressed_indices[0] = 0
            compress_events[name] = compress_kernel(
                info_record.starts,
                compressed_counts,
                info_record.nonempty_indices,
                info_record.compressed_indices,
                info_record.num_nonempty_lists,
                wait_for=[count_event] + info_record.compressed_indices.events)

            info_record.starts = compressed_counts

        # {{{ run scans

        scan_events = []

        for name, dtype in self.list_names_and_dtypes:
            if name in self.count_sharing:
                continue
            if name in omit_lists:
                continue

            info_record = result[name]
            if name in self.eliminate_empty_output_lists:
                compress_events[name].wait()
                num_nonempty_lists = info_record.num_nonempty_lists.get()[0]
                info_record.num_nonempty_lists = num_nonempty_lists
                info_record.starts = info_record.starts[:num_nonempty_lists + 1]
                info_record.nonempty_indices = \
                    info_record.nonempty_indices[:num_nonempty_lists]
                info_record.starts[-1] = 0

            starts_ary = info_record.starts
            if name in self.eliminate_empty_output_lists:
                evt = scan_kernel(
                        starts_ary,
                        size=info_record.num_nonempty_lists,
                        wait_for=starts_ary.events)
            else:
                evt = scan_kernel(starts_ary, wait_for=[count_event],
                        size=n_objects)

            starts_ary.setitem(0, 0, queue=queue, wait_for=[evt])
            scan_events.extend(starts_ary.events)

            # retrieve count
            info_record.count = int(starts_ary[-1].get())

        # }}}

        # {{{ deal with count-sharing lists, allocate memory for lists

        write_list_args = []
        for name, dtype in self.list_names_and_dtypes:
            if name in omit_lists:
                write_list_args.append(None)
                if name not in self.count_sharing:
                    write_list_args.append(None)
                if name in self.eliminate_empty_output_lists:
                    write_list_args.append(None)
                continue

            if name in self.count_sharing:
                sharing_from = self.count_sharing[name]

                info_record = result[name] = BuiltList(
                        count=result[sharing_from].count,
                        starts=result[sharing_from].starts,
                        )

            else:
                info_record = result[name]

            info_record.lists = cl.array.empty(queue,
                    info_record.count, dtype, allocator=allocator)
            write_list_args.append(info_record.lists.data)

            if name not in self.count_sharing:
                write_list_args.append(info_record.starts.data)

            if name in self.eliminate_empty_output_lists:
                write_list_args.append(info_record.compressed_indices.data)

        # }}}

        evt = write_kernel(queue, gsize, lsize,
                *(tuple(write_list_args) + data_args + (n_objects,)),
                wait_for=scan_events)

        return result, evt
Example #39
0
    def __init__(self, context, list_names_and_dtypes, generate_template,
            arg_decls, count_sharing=None, devices=None,
            name_prefix="plb_build_list", options=[], preamble="",
            debug=False, complex_kernel=False,
            eliminate_empty_output_lists=[]):
        """
        :arg context: A :class:`pyopencl.Context`.
        :arg list_names_and_dtypes: a list of `(name, dtype)` tuples
            indicating the lists to be built.
        :arg generate_template: a snippet of C as described below
        :arg arg_decls: A string of comma-separated C argument declarations.
        :arg count_sharing: A mapping consisting of `(child, mother)`
            indicating that `mother` and `child` will always have the
            same number of indices, and the `APPEND` to `mother`
            will always happen *before* the `APPEND` to the child.
        :arg name_prefix: the name prefix to use for the compiled kernels
        :arg options: OpenCL compilation options for kernels using
            *generate_template*.
        :arg complex_kernel: If `True`, prevents vectorization on CPUs.
        :arg eliminate_empty_output_lists: A Python list of list names
            for which the empty output lists are eliminated.

        *generate_template* may use the following C macros/identifiers:

        * `index_type`: expands to C identifier for the index type used
          for the calculation
        * `USER_ARG_DECL`: expands to the C declarator for `arg_decls`
        * `USER_ARGS`: a list of C argument values corresponding to
          `USER_ARG_DECL`
        * `LIST_ARG_DECL`: expands to a C argument list representing the
          data for the output lists. These are prefixed with
          `"plg_"` so as not to interfere with user-provided names.
        * `LIST_ARGS`: a list of C argument values corresponding to
          `LIST_ARG_DECL`
        * `APPEND_name(entry)`: inserts `entry` into the list `name`.
          *entry* must be a valid C expression of the correct type.

        All argument-list related macros have a trailing comma included
        if they are non-empty.

        *generate_template* must supply a function:

        .. code-block:: c

            void generate(USER_ARG_DECL LIST_ARG_DECL index_type i)
            {
                APPEND_mylist(5);
            }

        Internally, *generate_template* is expanded (at least) twice: once
        for a 'counting' stage where the size of all the lists is determined,
        and a second time for a 'generation' stage where the lists are
        actually filled. A `generate` function that has side effects beyond
        calling the `APPEND_*` macros is therefore ill-formed.

        .. versionchanged:: 2018.1

            Change *eliminate_empty_output_lists* argument type from `bool` to
            `list`.
        """

        if devices is None:
            devices = context.devices

        if count_sharing is None:
            count_sharing = {}

        self.context = context
        self.devices = devices

        self.list_names_and_dtypes = list_names_and_dtypes
        self.generate_template = generate_template

        from pyopencl.tools import parse_arg_list
        self.arg_decls = parse_arg_list(arg_decls)

        # To match with the signature of the user-supplied generate(), arguments
        # can't appear to have offsets.
        arg_decls_no_offset = []
        from pyopencl.tools import VectorArg
        for arg in self.arg_decls:
            if isinstance(arg, VectorArg) and arg.with_offset:
                arg = VectorArg(arg.dtype, arg.name)
            arg_decls_no_offset.append(arg)
        self.arg_decls_no_offset = arg_decls_no_offset

        self.count_sharing = count_sharing

        self.name_prefix = name_prefix
        self.preamble = preamble
        self.options = options

        self.debug = debug

        self.complex_kernel = complex_kernel

        if eliminate_empty_output_lists is True:
            eliminate_empty_output_lists = \
                    [name for name, _ in self.list_names_and_dtypes]

        if eliminate_empty_output_lists is False:
            eliminate_empty_output_lists = []

        self.eliminate_empty_output_lists = eliminate_empty_output_lists
        for list_name in self.eliminate_empty_output_lists:
            if not any(list_name == name for name, _ in self.list_names_and_dtypes):
                raise ValueError(
                    "invalid list name '%s' in eliminate_empty_output_lists"
                    % list_name)
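
A hedged end-to-end sketch of the constructor documented above, assuming it belongs to pyopencl.algorithm.ListOfListsBuilder (whose signature this matches). The generate() signature follows the docstring above; the list name ``mylist`` and the counting logic are made up for illustration.

import numpy as np
import pyopencl as cl
import pyopencl.array  # noqa
from pyopencl.algorithm import ListOfListsBuilder

ctx = cl.create_some_context()
queue = cl.CommandQueue(ctx)

builder = ListOfListsBuilder(ctx, [("mylist", np.int32)], """
        void generate(USER_ARG_DECL LIST_ARG_DECL index_type i)
        {
            int count = i % 4;
            for (int j = 0; j < count; ++j)
                APPEND_mylist(count);
        }
        """, arg_decls=[])

result, evt = builder(queue, 2000)

info = result["mylist"]
# Object i appends (i % 4) copies of (i % 4): 500 groups of 0+1+2+3 entries.
assert info.count == 3000
assert (info.lists.get()[-6:] == [1, 2, 2, 3, 3, 3]).all()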
Example #40
0
    def aggregate_assignments(self, instructions, result):
        from pymbolic.primitives import Variable

        # {{{ aggregation helpers

        def get_complete_origins_set(insn, skip_levels=0):
            if skip_levels < 0:
                skip_levels = 0

            result = set()
            for dep in insn.get_dependencies():
                if isinstance(dep, Variable):
                    dep_origin = origins_map.get(dep.name, None)
                    if dep_origin is not None:
                        if skip_levels <= 0:
                            result.add(dep_origin)
                        result |= get_complete_origins_set(
                            dep_origin, skip_levels - 1)

            return result

        var_assignees_cache = {}

        def get_var_assignees(insn):
            try:
                return var_assignees_cache[insn]
            except KeyError:
                result = set(
                    Variable(assignee) for assignee in insn.get_assignees())
                var_assignees_cache[insn] = result
                return result

        def aggregate_two_assignments(ass_1, ass_2):
            names = ass_1.names + ass_2.names

            from pymbolic.primitives import Variable
            deps = (ass_1.get_dependencies() | ass_2.get_dependencies()) \
                    - set(Variable(name) for name in names)

            return Assign(names=names,
                          exprs=ass_1.exprs + ass_2.exprs,
                          _dependencies=deps,
                          dep_mapper_factory=self.dep_mapper_factory,
                          priority=max(ass_1.priority, ass_2.priority))

        # }}}

        # {{{ main aggregation pass

        origins_map = dict((assignee, insn) for insn in instructions
                           for assignee in insn.get_assignees())

        from pytools import partition
        unprocessed_assigns, other_insns = partition(
            lambda insn: isinstance(insn, Assign)
                and not insn.is_scalar_valued,
            instructions)

        # filter out zero-flop-count assigns--no need to bother with those
        processed_assigns, unprocessed_assigns = partition(
            lambda ass: ass.flop_count() == 0, unprocessed_assigns)

        # filter out zero assignments
        from pytools import any
        from hedge.tools import is_zero

        i = 0

        while i < len(unprocessed_assigns):
            my_assign = unprocessed_assigns[i]
            if any(is_zero(expr) for expr in my_assign.exprs):
                processed_assigns.append(unprocessed_assigns.pop(i))
            else:
                i += 1

        # greedy aggregation
        while unprocessed_assigns:
            my_assign = unprocessed_assigns.pop()

            my_deps = my_assign.get_dependencies()
            my_assignees = get_var_assignees(my_assign)

            agg_candidates = []
            for i, other_assign in enumerate(unprocessed_assigns):
                other_deps = other_assign.get_dependencies()
                other_assignees = get_var_assignees(other_assign)

                if ((my_deps & other_deps or my_deps & other_assignees
                     or other_deps & my_assignees)
                        and my_assign.priority == other_assign.priority):
                    agg_candidates.append((i, other_assign))

            did_work = False

            if agg_candidates:
                my_indirect_origins = get_complete_origins_set(my_assign,
                                                               skip_levels=1)

                for other_assign_index, other_assign in agg_candidates:
                    if self.max_vectors_in_batch_expr is not None:
                        new_assignee_count = len(
                            set(my_assign.get_assignees())
                            | set(other_assign.get_assignees()))
                        new_dep_count = len(
                            my_assign.get_dependencies(each_vector=True)
                            | other_assign.get_dependencies(each_vector=True))

                        if (new_assignee_count + new_dep_count >
                                self.max_vectors_in_batch_expr):
                            continue

                    other_indirect_origins = get_complete_origins_set(
                        other_assign, skip_levels=1)

                    if (my_assign not in other_indirect_origins
                            and other_assign not in my_indirect_origins):
                        did_work = True

                        # aggregate the two assignments
                        new_assignment = aggregate_two_assignments(
                            my_assign, other_assign)
                        del unprocessed_assigns[other_assign_index]
                        unprocessed_assigns.append(new_assignment)
                        for assignee in new_assignment.get_assignees():
                            origins_map[assignee] = new_assignment

                        break

            if not did_work:
                processed_assigns.append(my_assign)

        externally_used_names = set(expr
                                    for insn in processed_assigns + other_insns
                                    for expr in insn.get_dependencies())

        from hedge.tools import is_obj_array
        if is_obj_array(result):
            externally_used_names |= set(expr for expr in result)
        else:
            externally_used_names |= set([result])

        def schedule_and_finalize_assignment(ass):
            dep_mapper = self.dep_mapper_factory()

            names_exprs = list(zip(ass.names, ass.exprs))

            my_assignees = set(name for name, expr in names_exprs)
            names_exprs_deps = [
                (name, expr,
                 set(dep.name
                     for dep in dep_mapper(expr) if isinstance(dep, Variable))
                 & my_assignees) for name, expr in names_exprs
            ]

            ordered_names_exprs = []
            available_names = set()

            while names_exprs_deps:
                schedulable = []

                i = 0
                while i < len(names_exprs_deps):
                    name, expr, deps = names_exprs_deps[i]

                    unsatisfied_deps = deps - available_names

                    if not unsatisfied_deps:
                        schedulable.append((str(expr), name, expr))
                        del names_exprs_deps[i]
                    else:
                        i += 1

                # make sure these come out in a constant order
                schedulable.sort()

                if schedulable:
                    for key, name, expr in schedulable:
                        ordered_names_exprs.append((name, expr))
                        available_names.add(name)
                else:
                    raise RuntimeError("aggregation resulted in an "
                                       "impossible assignment")

            return self.finalize_multi_assign(
                names=[name for name, expr in ordered_names_exprs],
                exprs=[expr for name, expr in ordered_names_exprs],
                do_not_return=[
                    Variable(name) not in externally_used_names
                    for name, expr in ordered_names_exprs
                ],
                priority=ass.priority)

        return [
            schedule_and_finalize_assignment(ass) for ass in processed_assigns
        ] + other_insns
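
The greedy loop above can be hard to follow amid the origin and priority bookkeeping. Below is a hedged, plain-Python toy sketch of just the core idea: repeatedly pop an assignment, merge it with any remaining assignment it shares data with, and move it to the processed pile only once no merge partner is left. The ToyAssign record and the expressions are made up; the priority check, the origin-cycle check, and the max_vectors_in_batch_expr limit are omitted.

from collections import namedtuple

ToyAssign = namedtuple("ToyAssign", "names exprs deps")


def shares_data(a, b):
    # Mirrors the candidate test above: overlapping dependencies,
    # or one assignment depending on what the other assigns.
    return bool(set(a.deps) & set(b.deps)
            or set(a.deps) & set(b.names)
            or set(b.deps) & set(a.names))


def merge(a, b):
    names = a.names + b.names
    deps = (set(a.deps) | set(b.deps)) - set(names)
    return ToyAssign(names, a.exprs + b.exprs, sorted(deps))


unprocessed = [
    ToyAssign(["u"], ["a + b"], ["a", "b"]),
    ToyAssign(["v"], ["a * c"], ["a", "c"]),   # shares dep "a" with "u"
    ToyAssign(["w"], ["d - e"], ["d", "e"]),   # shares nothing
]

processed = []
while unprocessed:
    cur = unprocessed.pop()
    for i, other in enumerate(unprocessed):
        if shares_data(cur, other):
            del unprocessed[i]
            unprocessed.append(merge(cur, other))
            break
    else:
        processed.append(cur)

print([a.names for a in processed])   # [['w'], ['v', 'u']]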