Example #1
0
def match_dtype_to_c_struct(device, name, dtype, context=None):
    """Return a tuple `(dtype, c_decl)` such that the C struct declaration
    in `c_decl` and the structure :class:`numpy.dtype` instance `dtype`
    have the same memory layout.

    Note that *dtype* may be modified from the value that was passed in,
    for example to insert padding.

    (As a remark on implementation, this routine runs a small kernel on
    the given *device* to ensure that :mod:`numpy` and C offsets and
    sizes match.)

    .. versionadded: 2013.1

    This example explains the use of this function::

        >>> import numpy as np
        >>> import pyopencl as cl
        >>> import pyopencl.tools
        >>> ctx = cl.create_some_context()
        >>> dtype = np.dtype([("id", np.uint32), ("value", np.float32)])
        >>> dtype, c_decl = pyopencl.tools.match_dtype_to_c_struct(
        ...     ctx.devices[0], 'id_val', dtype)
        >>> print(c_decl)
        typedef struct {
          unsigned id;
          float value;
        } id_val;
        >>> print(dtype)
        [('id', '<u4'), ('value', '<f4')]
        >>> cl.tools.get_or_register_dtype('id_val', dtype)

    As this example shows, it is important to call
    :func:`get_or_register_dtype` on the modified `dtype` returned by this
    function, not the original one.
    """

    import pyopencl as cl

    # Process fields in memory-layout order (sorted by byte offset), not in
    # the arbitrary order of the fields dict.
    fields = sorted(dtype.fields.items(),
                    key=lambda name_dtype_offset: name_dtype_offset[1][1])

    # Build the C member declaration for each field. Array-valued fields
    # (subdtypes) become C array members; nested arrays are unsupported.
    c_fields = []
    for field_name, dtype_and_offset in fields:
        # dtype.fields values may be (dtype, offset) or
        # (dtype, offset, title) tuples -- hence the [:2] slice.
        field_dtype, offset = dtype_and_offset[:2]
        if hasattr(field_dtype,
                   "subdtype") and field_dtype.subdtype is not None:
            array_dtype = field_dtype.subdtype[0]
            if hasattr(array_dtype,
                       "subdtype") and array_dtype.subdtype is not None:
                raise NotImplementedError(
                    "nested array dtypes are not supported")
            array_dims = field_dtype.subdtype[1]
            dims_str = ""
            try:
                # array_dims may be a tuple of dims or a single integer.
                for dim in array_dims:
                    dims_str += "[%d]" % dim
            except TypeError:
                dims_str = "[%d]" % array_dims
            c_fields.append("  {} {}{};".format(dtype_to_ctype(array_dtype),
                                                field_name, dims_str))
        else:
            c_fields.append("  {} {};".format(dtype_to_ctype(field_dtype),
                                              field_name))

    c_decl = "typedef struct {{\n{}\n}} {};\n\n".format(
        "\n".join(c_fields), name)

    # Collect declarations of any user-defined types that our struct's
    # fields refer to, so the probe kernel below compiles.
    cdl = _CDeclList(device)
    for field_name, dtype_and_offset in fields:
        field_dtype, offset = dtype_and_offset[:2]
        cdl.add_dtype(field_dtype)

    pre_decls = cdl.get_declarations()

    # result[0] receives sizeof(struct); result[1..n] receive the
    # device-side offset of each field, in layout order.
    offset_code = "\n".join("result[%d] = pycl_offsetof(%s, %s);" %
                            (i + 1, name, field_name)
                            for i, (field_name, _) in enumerate(fields))

    src = r"""
        #define pycl_offsetof(st, m) \
                 ((uint) ((__local char *) &(dummy.m) \
                 - (__local char *)&dummy ))

        %(pre_decls)s

        %(my_decl)s

        __kernel void get_size_and_offsets(__global uint *result)
        {
            result[0] = sizeof(%(my_type)s);
            __local %(my_type)s dummy;
            %(offset_code)s
        }
    """ % dict(pre_decls=pre_decls,
               my_decl=c_decl,
               my_type=name,
               offset_code=offset_code)

    if context is None:
        context = cl.Context([device])

    queue = cl.CommandQueue(context)

    prg = cl.Program(context, src)
    knl = prg.build(devices=[device]).get_size_and_offsets

    import pyopencl.array  # noqa
    result_buf = cl.array.empty(queue, 1 + len(fields), np.uint32)
    knl(queue, (1, ), (1, ), result_buf.data)
    queue.finish()
    size_and_offsets = result_buf.get()

    size = int(size_and_offsets[0])

    offsets = size_and_offsets[1:]
    if any(ofs >= size for ofs in offsets):
        # offsets not plausible

        if dtype.itemsize == size:
            # If sizes match, use numpy's idea of the offsets.
            offsets = [
                dtype_and_offset[1] for field_name, dtype_and_offset in fields
            ]
        else:
            raise RuntimeError(
                "OpenCL compiler reported offsetof() past sizeof() "
                "for struct layout on '%s'. "
                "This makes no sense, and it usually indicates a "
                "compiler bug. "
                "Refusing to discover struct layout." % device)

    result_buf.data.release()
    del knl
    del prg
    del queue
    del context

    try:
        dtype_arg_dict = {
            "names":
            [field_name for field_name, (field_dtype, offset) in fields],
            "formats":
            [field_dtype for field_name, (field_dtype, offset) in fields],
            "offsets": [int(x) for x in offsets],
            "itemsize":
            int(size_and_offsets[0]),
        }
        dtype = np.dtype(dtype_arg_dict)
        if dtype.itemsize != size_and_offsets[0]:
            # "Old" versions of numpy (1.6.x?) silently ignore "itemsize". Boo.
            dtype_arg_dict["names"].append("_pycl_size_fixer")
            dtype_arg_dict["formats"].append(np.uint8)
            dtype_arg_dict["offsets"].append(int(size_and_offsets[0]) - 1)
            dtype = np.dtype(dtype_arg_dict)
    except NotImplementedError:

        def calc_field_type():
            # Fallback for numpy versions without offset/itemsize support:
            # emit explicit void padding fields wherever the device-reported
            # offset exceeds the packed position.
            total_size = 0
            padding_count = 0
            for offset, (field_name, dtype_and_offset) in zip(offsets, fields):
                # Tolerate (dtype, offset, title) 3-tuples, as above.
                field_dtype = dtype_and_offset[0]
                if offset > total_size:
                    padding_count += 1
                    # The pad size is (offset - total_size) bytes; the
                    # parenthesization is essential -- without it, the
                    # subtraction would apply to the formatted string.
                    yield ("__pycl_padding%d" % padding_count,
                           "V%d" % (offset - total_size))
                yield field_name, field_dtype
                total_size = field_dtype.itemsize + offset

        dtype = np.dtype(list(calc_field_type()))

    assert dtype.itemsize == size_and_offsets[0]

    return dtype, c_decl
Example #2
0
def test_space_invader_query(ctx_factory, dims, dtype, do_plot=False):
    """Verify the space-invader query against a host-side reference.

    For every leaf box, the reference distance is recomputed from the
    leaves-to-balls lookup as the largest coordinate-wise deviation of
    any overlapping ball center from the box center, and compared with
    the device-side query result.
    """
    logging.basicConfig(level=logging.INFO)

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    dtype = np.dtype(dtype)
    nparticles = 10**5

    particles = make_normal_particle_array(queue, nparticles, dims, dtype)

    if do_plot:
        import matplotlib.pyplot as pt
        pt.plot(particles[0].get(), particles[1].get(), "x")

    from boxtree import TreeBuilder
    builder = TreeBuilder(ctx)

    queue.finish()
    tree, _ = builder(queue, particles, max_particles_in_box=30, debug=True)

    nballs = 10**4
    ball_centers = make_normal_particle_array(queue, nballs, dims, dtype)
    ball_radii = cl.array.empty(queue, nballs, dtype).fill(0.1)

    from boxtree.area_query import (LeavesToBallsLookupBuilder,
                                    SpaceInvaderQueryBuilder)

    invader_query_builder = SpaceInvaderQueryBuilder(ctx)
    # The leaves-to-balls lookup gives, per box, the set of overlapping
    # balls -- which is enough to recompute the outer space-invader
    # distance on the host below.
    lookup_builder = LeavesToBallsLookupBuilder(ctx)

    siq, _ = invader_query_builder(queue, tree, ball_centers, ball_radii)
    lbl, _ = lookup_builder(queue, tree, ball_centers, ball_radii)

    # Transfer device results to the host for checking.
    tree = tree.get(queue=queue)
    siq = siq.get(queue=queue)
    lbl = lbl.get(queue=queue)

    ball_centers = np.array([x.get() for x in ball_centers])
    ball_radii = ball_radii.get()

    from boxtree import box_flags_enum

    # Host-side reference result, filled in only for leaf boxes.
    outer_space_invader_dist = np.zeros(tree.nboxes)

    for ibox in range(tree.nboxes):
        if not (tree.box_flags[ibox] & box_flags_enum.HAS_CHILDREN):
            # Leaf box: gather its overlapping balls and measure the
            # farthest coordinate-wise deviation from the box center.
            start, end = lbl.balls_near_box_starts[ibox:ibox + 2]
            invaders = lbl.balls_near_box_lists[start:end]
            if len(invaders) > 0:
                deviations = np.abs(
                    tree.box_centers[:, ibox].reshape((-1, 1))
                    - ball_centers[:, invaders])
                outer_space_invader_dist[ibox] = np.max(deviations)

    assert np.allclose(siq, outer_space_invader_dist)
Example #3
0
def test_extent_tree(ctx_factory, dims, extent_norm, do_plot=False):
    """Build a tree over extent-carrying sources/targets and verify its
    structure: particle orderings, per-box particle counts, box extents,
    and the stick-out criterion under the given *extent_norm* ("linf" or
    "l2"); finally exercise point-source linking.
    """
    logging.basicConfig(level=logging.INFO)

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    nsources = 100000
    ntargets = 200000
    dtype = np.float64
    npoint_sources_per_source = 16

    sources = make_normal_particle_array(queue, nsources, dims, dtype, seed=12)
    targets = make_normal_particle_array(queue, ntargets, dims, dtype, seed=19)

    # Only sources carry refinement weight; targets contribute zero.
    refine_weights = cl.array.zeros(queue, nsources + ntargets, np.int32)
    refine_weights[:nsources] = 1

    # Radii drawn from 2**U(-10, 0), i.e. spread over several orders of
    # magnitude.
    from pyopencl.clrandom import PhiloxGenerator
    rng = PhiloxGenerator(queue.context, seed=13)
    source_radii = 2**rng.uniform(queue, nsources, dtype=dtype, a=-10, b=0)
    target_radii = 2**rng.uniform(queue, ntargets, dtype=dtype, a=-10, b=0)

    from boxtree import TreeBuilder
    tb = TreeBuilder(ctx)

    queue.finish()
    dev_tree, _ = tb(
        queue,
        sources,
        targets=targets,
        source_radii=source_radii,
        target_radii=target_radii,
        extent_norm=extent_norm,
        refine_weights=refine_weights,
        max_leaf_refine_weight=20,

        #max_particles_in_box=10,

        # Set artificially small, to exercise the reallocation code.
        nboxes_guess=10,
        debug=True,
        stick_out_factor=0)

    logger.info("transfer tree, check orderings")

    tree = dev_tree.get(queue=queue)

    if do_plot:
        import matplotlib.pyplot as pt
        pt.plot(sources[0].get(), sources[1].get(), "rx")
        pt.plot(targets[0].get(), targets[1].get(), "g+")

        from boxtree.visualization import TreePlotter
        plotter = TreePlotter(tree)
        plotter.draw_tree(fill=False, edgecolor="black", zorder=10)
        plotter.draw_box_numbers()
        plotter.set_bounding_box()

        pt.gca().set_aspect("equal", "datalim")
        pt.show()

    # Tree-ordered particle data, as stored on the tree.
    sorted_sources = np.array(list(tree.sources))
    sorted_targets = np.array(list(tree.targets))
    sorted_source_radii = tree.source_radii
    sorted_target_radii = tree.target_radii

    # User-ordered (original) particle data, pulled back to the host.
    unsorted_sources = np.array([pi.get() for pi in sources])
    unsorted_targets = np.array([pi.get() for pi in targets])
    unsorted_source_radii = source_radii.get()
    unsorted_target_radii = target_radii.get()

    # Applying user_source_ids to user-ordered data must reproduce the
    # tree ordering exactly.
    assert (sorted_sources == unsorted_sources[:, tree.user_source_ids]).all()
    assert (sorted_source_radii == unsorted_source_radii[tree.user_source_ids]
            ).all()

    # {{{ test box structure, stick-out criterion

    logger.info("test box structure, stick-out criterion")

    # Invert sorted_target_ids to get the tree-order -> user-order map.
    user_target_ids = np.empty(tree.ntargets, dtype=np.intp)
    user_target_ids[tree.sorted_target_ids] = np.arange(tree.ntargets,
                                                        dtype=np.intp)
    if ntargets:
        assert (sorted_targets == unsorted_targets[:, user_target_ids]).all()
        assert (sorted_target_radii == unsorted_target_radii[user_target_ids]
                ).all()

    all_good_so_far = True

    # {{{ check sources, targets

    # Non-child counts over all boxes must account for every particle.
    assert np.sum(tree.box_source_counts_nonchild) == nsources
    assert np.sum(tree.box_target_counts_nonchild) == ntargets

    # Cumulative count of a box = its own (non-child) count plus the
    # cumulative counts of its children (child id 0 means "no child").
    for ibox in range(tree.nboxes):
        kid_sum = sum(tree.box_target_counts_cumul[ichild_box]
                      for ichild_box in tree.box_child_ids[:, ibox]
                      if ichild_box != 0)
        assert (tree.box_target_counts_cumul[ibox] == (
            tree.box_target_counts_nonchild[ibox] + kid_sum)), ibox

    for ibox in range(tree.nboxes):
        extent_low, extent_high = tree.get_box_extent(ibox)

        # Every box extent must lie within the (slightly fuzzed)
        # bounding box of the whole tree.
        assert (extent_low >=
                tree.bounding_box[0] - 1e-12 * tree.root_extent).all(), ibox
        assert (extent_high <=
                tree.bounding_box[1] + 1e-12 * tree.root_extent).all(), ibox

        box_children = tree.box_child_ids[:, ibox]
        existing_children = box_children[box_children != 0]

        # Same parent/child count consistency, checked for sources and
        # targets via vectorized sums.
        assert (tree.box_source_counts_nonchild[ibox] +
                np.sum(tree.box_source_counts_cumul[existing_children]) ==
                tree.box_source_counts_cumul[ibox])
        assert (tree.box_target_counts_nonchild[ibox] +
                np.sum(tree.box_target_counts_cumul[existing_children]) ==
                tree.box_target_counts_cumul[ibox])

    del existing_children
    del box_children

    # Stick-out criterion: each particle (with its radius) must fit in
    # its box, enlarged by the stick-out factor.
    for ibox in range(tree.nboxes):
        lev = int(tree.box_levels[ibox])
        box_radius = 0.5 * tree.root_extent / (1 << lev)
        box_center = tree.box_centers[:, ibox]
        extent_low = box_center - box_radius
        extent_high = box_center + box_radius

        stick_out_dist = tree.stick_out_factor * box_radius
        radius_with_stickout = (1 + tree.stick_out_factor) * box_radius

        for what, starts, counts, points, radii in [
            ("source", tree.box_source_starts, tree.box_source_counts_cumul,
             sorted_sources, sorted_source_radii),
            ("target", tree.box_target_starts, tree.box_target_counts_cumul,
             sorted_targets, sorted_target_radii),
        ]:
            bstart = starts[ibox]
            bslice = slice(bstart, bstart + counts[ibox])
            check_particles = points[:, bslice]
            check_radii = radii[bslice]

            if extent_norm == "linf":
                # Per-axis containment of [particle - r, particle + r]
                # within the stick-out-enlarged box.
                good = ((check_particles + check_radii <
                         extent_high[:, np.newaxis] + stick_out_dist)
                        &  # noqa: W504
                        (extent_low[:, np.newaxis] - stick_out_dist <=
                         check_particles - check_radii)).all(axis=0)

            elif extent_norm == "l2":
                center_dists = np.sqrt(
                    np.sum((check_particles - box_center.reshape(-1, 1))**2,
                           axis=0))

                good = ((center_dists + check_radii)**2 <
                        dims * radius_with_stickout**2)

            else:
                raise ValueError("unexpected value of extent_norm")

            all_good_here = good.all()

            if not all_good_here:
                print("BAD BOX %s %d level %d" %
                      (what, ibox, tree.box_levels[ibox]))

            all_good_so_far = all_good_so_far and all_good_here
            assert all_good_here

    # }}}

    assert all_good_so_far

    # }}}

    # {{{ create, link point sources

    logger.info("creating point sources")

    np.random.seed(20)

    # Scatter npoint_sources_per_source point sources uniformly within
    # each source's radius.
    from pytools.obj_array import make_obj_array
    point_sources = make_obj_array([
        cl.array.to_device(
            queue, unsorted_sources[i][:, np.newaxis] +
            unsorted_source_radii[:, np.newaxis] * np.random.uniform(
                -1, 1, size=(nsources, npoint_sources_per_source)))
        for i in range(dims)
    ])

    # Start indices: a uniform stride of npoint_sources_per_source.
    point_source_starts = cl.array.arange(queue,
                                          0, (nsources + 1) *
                                          npoint_sources_per_source,
                                          npoint_sources_per_source,
                                          dtype=tree.particle_id_dtype)

    from boxtree.tree import link_point_sources
    dev_tree = link_point_sources(queue,
                                  dev_tree,
                                  point_source_starts,
                                  point_sources,
                                  debug=True)
Example #4
0
}

"""

import pyopencl as cl
from time import time
import numpy

# Work-group edge length used by the kernel below -- presumably a matrix
# tile size, judging from the a_height/a_width setup that follows; confirm
# against the kernel source.
block_size = 16

ctx = cl.create_some_context()

# The kernel relies on local (shared) memory; refuse to run on devices
# that report none.
for dev in ctx.devices:
    assert dev.local_mem_size > 0

# Profiling is enabled so kernel execution times can be read from events.
queue = cl.CommandQueue(
    ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)

#queue = cl.CommandQueue(ctx)

if False:
    a_height = 4096
    #a_height = 1024
    a_width = 2048
    #a_width = 256
    #b_height == a_width
    b_width = a_height

elif False:
    # like PyCUDA
    a_height = 2516
    a_width = 1472
Example #5
0
                                                3,
                                                *params,
                                                test_case='exact')


@pytest.mark.parametrize("params", [
    [2, 5, 4, 4],
    [3, 7, 5, 3],
    [4, 7, 3, 5],
])
def test_to_meshmode_interpolation_3d_nonexact(ctx_factory, params):
    """Check that 3D non-exact to-meshmode interpolation stays below a
    residual of 1e-3 for several parameter combinations."""
    cl_ctx = ctx_factory()
    command_queue = cl.CommandQueue(cl_ctx)
    residual = drive_test_to_meshmode_interpolation(
        cl_ctx, command_queue, 3, *params, test_case='non-exact')
    assert residual < 1e-3


# }}} End 3d tests

if __name__ == '__main__':
    # Run a single exact-interpolation case as a smoke test when this
    # file is invoked directly.
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)
    resid = drive_test_to_meshmode_interpolation(
        cl_ctx, queue,
        dim=3, degree=9, nel_1d=7, n_levels=2, q_order=10,
        test_case="exact")
Example #6
0
def test_proxy_generator(ctx_factory, ndim, factor, visualize=False):
    """Generate proxy point balls for a blocked QBX discretization and
    check that every proxy point lies exactly on its block's proxy
    sphere (distance from the block center equals the proxy radius).

    With *visualize* set, the proxies are plotted (2D via matplotlib,
    3D via meshmode VTK output).
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    qbx = _build_qbx_discr(queue, ndim=ndim)
    srcindices = _build_block_index(qbx.density_discr, factor=factor)

    from pytential.linalg.proxy import ProxyGenerator
    generator = ProxyGenerator(qbx, ratio=1.1)
    proxies, pxyranges, pxycenters, pxyradii = generator(queue, srcindices)

    # Transfer proxy data to the host; stack per-axis arrays into
    # (ndim, nproxies) shape.
    proxies = np.vstack([p.get() for p in proxies])
    pxyranges = pxyranges.get()
    pxycenters = np.vstack([c.get() for c in pxycenters])
    pxyradii = pxyradii.get()

    # Each block's proxies must sit on the sphere of radius pxyradii[i]
    # about pxycenters[:, i].
    for i in range(srcindices.nblocks):
        ipxy = np.s_[pxyranges[i]:pxyranges[i + 1]]

        r = la.norm(proxies[:, ipxy] - pxycenters[:, i].reshape(-1, 1), axis=0)
        assert np.allclose(r - pxyradii[i], 0.0, atol=1.0e-14)

    srcindices = srcindices.get(queue)
    if visualize:
        if qbx.ambient_dim == 2:
            import matplotlib.pyplot as pt

            # Expansion centers/radii on both sides of the curve, for
            # drawing the QBX disks alongside the proxies.
            density_nodes = qbx.density_discr.nodes().get(queue)
            ci = bind(qbx, sym.expansion_centers(qbx.ambient_dim, -1))(queue)
            ci = np.vstack([c.get(queue) for c in ci])
            ce = bind(qbx, sym.expansion_centers(qbx.ambient_dim, +1))(queue)
            ce = np.vstack([c.get(queue) for c in ce])
            r = bind(qbx, sym.expansion_radii(qbx.ambient_dim))(queue).get()

            # One figure per block.
            for i in range(srcindices.nblocks):
                isrc = srcindices.block_indices(i)
                ipxy = np.s_[pxyranges[i]:pxyranges[i + 1]]

                pt.figure(figsize=(10, 8))
                axis = pt.gca()
                for j in isrc:
                    c = pt.Circle(ci[:, j], r[j], color='k', alpha=0.1)
                    axis.add_artist(c)
                    c = pt.Circle(ce[:, j], r[j], color='k', alpha=0.1)
                    axis.add_artist(c)

                pt.plot(density_nodes[0],
                        density_nodes[1],
                        'ko',
                        ms=2.0,
                        alpha=0.5)
                pt.plot(density_nodes[0, srcindices.indices],
                        density_nodes[1, srcindices.indices],
                        'o',
                        ms=2.0)
                pt.plot(density_nodes[0, isrc],
                        density_nodes[1, isrc],
                        'o',
                        ms=2.0)
                pt.plot(proxies[0, ipxy], proxies[1, ipxy], 'o', ms=2.0)
                pt.xlim([-1.5, 1.5])
                pt.ylim([-1.5, 1.5])

                filename = "test_proxy_generator_{}d_{:04}.png".format(ndim, i)
                pt.savefig(filename, dpi=300)
                pt.clf()
        else:
            from meshmode.discretization.visualization import make_visualizer
            from meshmode.mesh.processing import (  # noqa
                affine_map, merge_disjoint_meshes)
            from meshmode.discretization import Discretization
            from meshmode.discretization.poly_element import \
                InterpolatoryQuadratureSimplexGroupFactory

            from meshmode.mesh.generation import generate_icosphere
            ref_mesh = generate_icosphere(1, generator.nproxy)

            # NOTE: this does not plot the actual proxy points
            for i in range(srcindices.nblocks):
                # Scale/translate the unit icosphere onto this block's
                # proxy sphere and merge it with the source geometry.
                mesh = affine_map(ref_mesh,
                                  A=(pxyradii[i] * np.eye(ndim)),
                                  b=pxycenters[:, i].reshape(-1))

                mesh = merge_disjoint_meshes([mesh, qbx.density_discr.mesh])
                discr = Discretization(
                    ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(10))

                vis = make_visualizer(queue, discr, 10)
                filename = "test_proxy_generator_{}d_{:04}.vtu".format(ndim, i)
                vis.write_vtk_file(filename, [])
Example #7
0
    def __init__(self, coefficients, nb_channel, dtype, chunksize,
                 overlapsize):
        """Set up the OpenCL forward/backward SOS filter: validate and
        reshape the filter coefficients, create the OpenCL context/queue,
        allocate host and device buffers, and build the kernel program.
        """
        SosFiltfilt_Base.__init__(self, coefficients, nb_channel, dtype,
                                  chunksize, overlapsize)

        # The OpenCL kernel only supports single precision and requires a
        # fixed chunk size (buffer sizes are baked in at build time).
        assert self.dtype == np.dtype('float32')
        assert self.chunksize is not None, 'chunksize for opencl must be fixed'

        self.coefficients = self.coefficients.astype(self.dtype)
        if self.coefficients.ndim == 2:  #(nb_section, 6) to (nb_channel, nb_section, 6)
            self.coefficients = np.tile(self.coefficients[None, :, :],
                                        (nb_channel, 1, 1))
        # The device buffer copy below requires contiguous host memory.
        if not self.coefficients.flags['C_CONTIGUOUS']:
            self.coefficients = self.coefficients.copy()
        assert self.coefficients.shape[
            0] == self.nb_channel, 'wrong coefficients.shape'
        assert self.coefficients.shape[2] == 6, 'wrong coefficients.shape'

        self.nb_section = self.coefficients.shape[1]

        self.ctx = pyopencl.create_some_context()
        #TODO : add arguments gpu_platform_index/gpu_device_index
        #self.devices =  [pyopencl.get_platforms()[self.gpu_platform_index].get_devices()[self.gpu_device_index] ]
        #self.ctx = pyopencl.Context(self.devices)
        self.queue = pyopencl.CommandQueue(self.ctx)

        #host arrays
        # zi1/zi2: per-channel, per-section filter state (2 delay values)
        # for the forward and backward passes, respectively.
        self.zi1 = np.zeros((nb_channel, self.nb_section, 2), dtype=self.dtype)
        self.zi2 = np.zeros((nb_channel, self.nb_section, 2), dtype=self.dtype)
        self.output1 = np.zeros((self.chunksize, self.nb_channel),
                                dtype=self.dtype)
        self.output2 = np.zeros((self.backward_chunksize, self.nb_channel),
                                dtype=self.dtype)

        #GPU buffers
        self.coefficients_cl = pyopencl.Buffer(self.ctx,
                                               mf.READ_ONLY | mf.COPY_HOST_PTR,
                                               hostbuf=self.coefficients)
        self.zi1_cl = pyopencl.Buffer(self.ctx,
                                      mf.READ_WRITE | mf.COPY_HOST_PTR,
                                      hostbuf=self.zi1)
        self.zi2_cl = pyopencl.Buffer(self.ctx,
                                      mf.READ_WRITE | mf.COPY_HOST_PTR,
                                      hostbuf=self.zi2)
        # Input/output buffers are sized from the host arrays but not
        # initialized from them (no COPY_HOST_PTR).
        self.input1_cl = pyopencl.Buffer(self.ctx,
                                         mf.READ_WRITE,
                                         size=self.output1.nbytes)
        self.output1_cl = pyopencl.Buffer(self.ctx,
                                          mf.READ_WRITE,
                                          size=self.output1.nbytes)
        self.input2_cl = pyopencl.Buffer(self.ctx,
                                         mf.READ_WRITE,
                                         size=self.output2.nbytes)
        self.output2_cl = pyopencl.Buffer(self.ctx,
                                          mf.READ_WRITE,
                                          size=self.output2.nbytes)

        #nb works
        # Specialize the kernel template with the fixed sizes, then build.
        kernel = self.kernel % dict(forward_chunksize=self.chunksize,
                                    backward_chunksize=self.backward_chunksize,
                                    nb_section=self.nb_section,
                                    nb_channel=self.nb_channel)
        prg = pyopencl.Program(self.ctx, kernel)
        self.opencl_prg = prg.build(options='-cl-mad-enable')
Example #8
0
import pyopencl as CL
# "import pyopencl" alone does not import the tools submodule; without
# this explicit import, CL.tools below raises AttributeError.
import pyopencl.tools  # noqa: F401
from pyopencl import array
import numpy

# Drop pyopencl's per-context caches so kernels/allocators start fresh.
CL.tools.clear_first_arg_caches()

# Use the first device of the first available platform.
c = CL.Context([CL.get_platforms()[0].get_devices()[0]])

with open("kernel.cl", "r") as k_src:
    k = CL.Program(c, k_src.read()).build("-I./src/cl")
q = CL.CommandQueue(c)

flags = CL.mem_flags

# 290 = i i
# 1323270 = (k i) k
# 659718 = (k* i) k
# 72218 = Ω

#λ. 1
#(λ. 1) 0
#λ. ((λ. 1) 0)
#0 (λ. 1)
#[(),(),(),()]
#*Main> map (\t -> encode_b $ reverse $ encode_t t) [a,b,c,d]
#[12,178,712,198]

#*Main> map (\t -> encode_b $ reverse $ encode_t (substitute 0 k t)) [a,b,c,d]
#[192,99074,712,98498]

#*Main> map (\t -> encode_b $ reverse $ encode_t t) [a,b,c,d]
Example #9
0
    def set_cl(self, targetOpenCL='auto', precisionOpenCL='auto'):
        """Select OpenCL devices and build the kernel program.

        *targetOpenCL* may be: a (platform, device) tuple/list or a list
        of such tuples; a platform index (int); one of the strings
        'GPU', 'CPU', 'ALL', or 'auto'; or None to disable OpenCL.
        *precisionOpenCL* is 'float32', 'float64', or 'auto' (float64
        when all selected devices support doubles).

        Raises EnvironmentError when pyopencl is unavailable.  No-op if
        called again with the same arguments.
        """
        if (targetOpenCL == self.lastTargetOpenCL) and\
                (precisionOpenCL == self.lastPrecisionOpenCL):
            return
        self.lastTargetOpenCL = targetOpenCL
        self.lastPrecisionOpenCL = precisionOpenCL
        if not isOpenCL:
            raise EnvironmentError("pyopencl is not available!")
        else:
            if isinstance(targetOpenCL, (tuple, list)):
                iDevice = []
                targetOpenCL = list(targetOpenCL)
                if isinstance(targetOpenCL[0], int):
                    # A single (platform, device) index pair.
                    nPlatform, nDevice = targetOpenCL
                    platform = cl.get_platforms()[nPlatform]
                    iDevice.extend([platform.get_devices()[nDevice]])
                else:
                    # A list of (platform, device) pairs or 1-element
                    # (platform,) entries meaning "all devices".
                    for target in targetOpenCL:
                        if isinstance(target, (tuple, list)):
                            target = list(target)
                            if len(target) > 1:
                                nPlatform, nDevice = target
                                platform = cl.get_platforms()[nPlatform]
                                iDevice.extend(
                                    [platform.get_devices()[nDevice]])
                            else:
                                nPlatform = target[0]
                                platform = cl.get_platforms()[nPlatform]
                                iDevice.extend(platform.get_devices())
            elif isinstance(targetOpenCL, int):
                # A bare platform index: take all its devices.
                nPlatform = targetOpenCL
                platform = cl.get_platforms()[nPlatform]
                iDevice = platform.get_devices()
            elif isinstance(targetOpenCL, str):
                # Scan all platforms, bucketing usable devices by type.
                iDeviceCPU = []
                iDeviceGPU = []
                iDeviceAcc = []
                iDevice = []
                for platform in cl.get_platforms():
                    CPUdevices = []
                    GPUdevices = []
                    AccDevices = []
                    try:  # at old pyopencl versions:
                        CPUdevices =\
                            platform.get_devices(
                                device_type=cl.device_type.CPU)
                        GPUdevices =\
                            platform.get_devices(
                                device_type=cl.device_type.GPU)
                        AccDevices =\
                            platform.get_devices(
                                device_type=cl.device_type.ACCELERATOR)
                    except cl.RuntimeError:
                        pass

                    if len(CPUdevices) > 0:
                        if len(iDeviceCPU) > 0:
                            # Prefer the vendor's own platform for its
                            # CPU; probe context creation before use.
                            if CPUdevices[0].vendor == \
                                    CPUdevices[0].platform.vendor:
                                try:
                                    tmpctx = cl.Context(devices=CPUdevices)
                                    iDeviceCPU = CPUdevices
                                except Exception:
                                    pass
                        else:
                            try:
                                tmpctx = cl.Context(devices=CPUdevices)
                                iDeviceCPU.extend(CPUdevices)
                            except Exception:
                                pass
                    for GPUDevice in GPUdevices:
                        # Keep only GPUs that both accept a context and
                        # advertise double-precision support.
                        try:
                            tmpctx = cl.Context(devices=[GPUDevice])
                            if GPUDevice.double_fp_config > 0:
                                iDeviceGPU.extend([GPUDevice])
                        except Exception:
                            pass
                    iDeviceAcc.extend(AccDevices)
                if _DEBUG > 10:
                    print("OpenCL: building {0} ...".format(self.cl_filename))
                    print("OpenCL: found {0} CPU{1}".format(
                        len(iDeviceCPU) if len(iDeviceCPU) > 0 else 'none',
                        's' if len(iDeviceCPU) > 1 else ''))
                    print("OpenCL: found {0} GPU{1}".format(
                        len(iDeviceGPU) if len(iDeviceGPU) > 0 else 'none',
                        's' if len(iDeviceGPU) > 1 else ''))
                    print("OpenCL: found {0} other accelerator{1}".format(
                        len(iDeviceAcc) if len(iDeviceAcc) > 0 else 'none',
                        's' if len(iDeviceAcc) > 1 else ''))

                if targetOpenCL.upper().startswith('GPU'):
                    iDevice.extend(iDeviceGPU)
                elif targetOpenCL.upper().startswith('CPU'):
                    iDevice.extend(iDeviceCPU)
                elif targetOpenCL.upper().startswith('ALL'):
                    iDevice.extend(iDeviceGPU)
                    iDevice.extend(iDeviceCPU)
                    iDevice.extend(iDeviceAcc)
                else:  # auto
                    # Preference order: GPU, then accelerator, then CPU.
                    if len(iDeviceGPU) > 0:
                        iDevice = iDeviceGPU
                    elif len(iDeviceAcc) > 0:
                        iDevice = iDeviceAcc
                    else:
                        iDevice = iDeviceCPU
                if _DEBUG > 10:
                    for idn, idv in enumerate(iDevice):
                        print("OpenCL: Autoselected device {0}: {1}".format(
                            idn, idv.name))
                if len(iDevice) == 0:
                    targetOpenCL = None
            else:  # None
                targetOpenCL = None

        if targetOpenCL is not None:
            if self.kernelsource is None:
                cl_file = os.path.join(os.path.dirname(__file__),
                                       self.cl_filename)
                with open(cl_file, 'r') as f:
                    kernelsource = f.read()
            else:
                kernelsource = self.kernelsource
            if precisionOpenCL == 'auto':
                # Use float64 only when *every* selected device reports
                # full double-precision support (double_fp_config == 63);
                # the AttributeError raise jumps to the float32 fallback.
                try:
                    for device in iDevice:
                        if device.double_fp_config == 63:
                            precisionOpenCL = 'float64'
                        else:
                            raise AttributeError
                except AttributeError:
                    precisionOpenCL = 'float32'
            if _DEBUG > 10:
                print('precisionOpenCL = {0}'.format(precisionOpenCL))
            if precisionOpenCL == 'float64':
                self.cl_precisionF = np.float64
                self.cl_precisionC = np.complex128
                # The kernels are written in float; retarget to double.
                kernelsource = kernelsource.replace('float', 'double')
            else:
                self.cl_precisionF = np.float32
                self.cl_precisionC = np.complex64
            # One context/queue/program per selected device.
            self.cl_queue = []
            self.cl_ctx = []
            self.cl_program = []
            for device in iDevice:
                cl_ctx = cl.Context(devices=[device])
                self.cl_queue.extend([cl.CommandQueue(cl_ctx, device)])
                self.cl_program.extend([
                    cl.Program(cl_ctx,
                               kernelsource).build(options=["-I " + __dir__])
                ])
                self.cl_ctx.extend([cl_ctx])

            self.cl_mf = cl.mem_flags
Example #10
0
def main():
    """Run the GPU particle demo: initialize particles once on the device,
    then loop clearing/drawing/updating frames until the user presses ESC.

    Relies on module-level names: ``cl``, ``np``, ``random``, ``cv2`` and the
    constants PARTICLES_NUM, PARTICLE_STRUCT_SIZE, WINDOW_SIZE,
    COLOR_CHANNELS, COLOR_CHANNEL_SIZE and KERNEL_PATH.
    """
    # setup OpenCL
    platforms = cl.get_platforms(
    )  # a platform corresponds to a driver (e.g. AMD)
    platform = platforms[0]  # take first platform
    devices = platform.get_devices(
        cl.device_type.GPU)  # get GPU devices of selected platform
    device = devices[0]  # take first GPU
    context = cl.Context([device])  # put selected GPU into context object
    queue = cl.CommandQueue(
        context, device)  # create command queue for selected GPU and context

    # setup buffer for particles (device-only; initialized by the kernel)
    particles_buff = cl.Buffer(context,
                               cl.mem_flags.READ_WRITE,
                               size=PARTICLE_STRUCT_SIZE * PARTICLES_NUM,
                               hostbuf=None)

    # setup random values (for random speed and color)
    random.seed()
    rand_values = np.array(
        [random.random() - 0.5 for _ in range(2 * PARTICLES_NUM)],
        dtype=np.float32)
    bufRandVals = cl.Buffer(context,
                            cl.mem_flags.READ_ONLY
                            | cl.mem_flags.COPY_HOST_PTR,
                            hostbuf=rand_values)

    img = np.zeros([WINDOW_SIZE, WINDOW_SIZE, COLOR_CHANNELS],
                   dtype=np.int32)  # must be square to ignore distortion
    # assumes COLOR_CHANNEL_SIZE is the byte size of one int32 channel --
    # TODO confirm against the constant's definition
    img_buff = cl.Buffer(context,
                         cl.mem_flags.WRITE_ONLY,
                         size=WINDOW_SIZE * WINDOW_SIZE * COLOR_CHANNELS *
                         COLOR_CHANNEL_SIZE)

    # load and compile OpenCL program; close the source file promptly
    # instead of leaking the handle until garbage collection
    compilerSettings = f'-DWINDOW_SIZE={WINDOW_SIZE}'
    with open(KERNEL_PATH) as kernel_file:
        kernel_source = kernel_file.read()
    program = cl.Program(context, kernel_source).build(compilerSettings)
    init_particles = cl.Kernel(program, 'init_particles')
    update_particles = cl.Kernel(program, 'update_particles')
    clear_canvas = cl.Kernel(program, 'clear_canvas')
    draw_particles = cl.Kernel(program, 'draw_particles')
    saturate = cl.Kernel(program, 'saturate')

    # init particles (https://documen.tician.de/pyopencl/runtime_program.html#pyopencl.enqueue_nd_range_kernel)
    init_particles.set_arg(0, particles_buff)
    init_particles.set_arg(1, bufRandVals)
    cl.enqueue_nd_range_kernel(queue, init_particles, (PARTICLES_NUM, ), None)

    # since all particles start from same place, they will go all up
    for _ in range(100):
        update_particles.set_arg(0, particles_buff)
        cl.enqueue_nd_range_kernel(queue, update_particles, (PARTICLES_NUM, ),
                                   None)
    while True:
        # clear canvas
        clear_canvas.set_arg(0, img_buff)
        cl.enqueue_nd_range_kernel(queue, clear_canvas,
                                   (WINDOW_SIZE, WINDOW_SIZE), None)

        # draw all particles
        draw_particles.set_arg(0, particles_buff)
        draw_particles.set_arg(1, img_buff)
        cl.enqueue_nd_range_kernel(queue, draw_particles, (PARTICLES_NUM, ),
                                   None)

        # saturate
        saturate.set_arg(0, img_buff)
        cl.enqueue_nd_range_kernel(queue, saturate, (WINDOW_SIZE, WINDOW_SIZE),
                                   None)

        # update particles
        update_particles.set_arg(0, particles_buff)
        cl.enqueue_nd_range_kernel(queue, update_particles, (PARTICLES_NUM, ),
                                   None)

        # copy result from GPU and show (blocking copy doubles as a sync point)
        cl.enqueue_copy(queue, img, img_buff, is_blocking=True)
        cv2.imshow("press ESC to exit", img.astype(np.uint8))

        # exit with ESC
        keyPressed = cv2.waitKey(10)
        if keyPressed == 27:
            break
Example #11
0
def demo_cost_model():
    """Calibrate the boxtree FMM cost model against measured timings.

    Runs the FMM on several source/target problem sizes collecting per-stage
    process times, fits calibration parameters on all but the last case,
    then predicts the held-out last case and logs actual-vs-predicted
    relative error for each FMM stage.
    """
    if not SUPPORTS_PROCESS_TIME:
        raise NotImplementedError(
            "Currently this script uses process time which only works on Python>=3.3"
        )

    from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler

    # Problem sizes for each calibration case (paired element-wise below).
    nsources_list = [1000, 2000, 3000, 4000, 5000]
    ntargets_list = [1000, 2000, 3000, 4000, 5000]
    dims = 3
    dtype = np.float64

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    traversals = []        # host-side traversal objects, one per case
    traversals_dev = []    # device-side traversal objects, one per case
    level_to_orders = []   # per-case expansion orders by tree level
    timing_results = []    # per-case measured stage timing dicts

    def fmm_level_to_nterms(tree, ilevel):
        # Fixed expansion order for every level.
        return 10

    for nsources, ntargets in zip(nsources_list, ntargets_list):
        # {{{ Generate sources, targets and target_radii

        from boxtree.tools import make_normal_particle_array as p_normal
        sources = p_normal(queue, nsources, dims, dtype, seed=15)
        targets = p_normal(queue, ntargets, dims, dtype, seed=18)

        from pyopencl.clrandom import PhiloxGenerator
        rng = PhiloxGenerator(queue.context, seed=22)
        target_radii = rng.uniform(
            queue, ntargets, a=0, b=0.05, dtype=dtype
        ).get()

        # }}}

        # {{{ Generate tree and traversal

        from boxtree import TreeBuilder
        tb = TreeBuilder(ctx)
        tree, _ = tb(
            queue, sources, targets=targets, target_radii=target_radii,
            stick_out_factor=0.15, max_particles_in_box=30, debug=True
        )

        from boxtree.traversal import FMMTraversalBuilder
        tg = FMMTraversalBuilder(ctx, well_sep_is_n_away=2)
        trav_dev, _ = tg(queue, tree, debug=True)
        trav = trav_dev.get(queue=queue)

        traversals.append(trav)
        traversals_dev.append(trav_dev)

        # }}}

        wrangler = FMMLibExpansionWrangler(trav.tree, 0, fmm_level_to_nterms)
        level_to_orders.append(wrangler.level_nterms)

        # Run the FMM once so drive_fmm fills timing_data with measured
        # per-stage timings for this case.
        timing_data = {}
        from boxtree.fmm import drive_fmm
        src_weights = np.random.rand(tree.nsources).astype(tree.coord_dtype)
        drive_fmm(trav, wrangler, (src_weights,), timing_data=timing_data)

        timing_results.append(timing_data)

    time_field_name = "process_elapsed"

    from boxtree.cost import FMMCostModel
    from boxtree.cost import make_pde_aware_translation_cost_model
    cost_model = FMMCostModel(make_pde_aware_translation_cost_model)

    # Evaluate the un-calibrated model on all but the last case; the last
    # case is held out for prediction below.
    model_results = []
    for icase in range(len(traversals)-1):
        traversal = traversals_dev[icase]
        model_results.append(
            cost_model.cost_per_stage(
                queue, traversal, level_to_orders[icase],
                FMMCostModel.get_unit_calibration_params(),
            )
        )
    queue.finish()

    # Fit calibration parameters from model estimates vs. measured times.
    params = cost_model.estimate_calibration_params(
        model_results, timing_results[:-1], time_field_name=time_field_name
    )

    # Predict the held-out case with the calibrated model.
    predicted_time = cost_model.cost_per_stage(
        queue, traversals_dev[-1], level_to_orders[-1], params,
    )
    queue.finish()

    # Report relative prediction error for each FMM stage.
    for field in ["form_multipoles", "eval_direct", "multipole_to_local",
                  "eval_multipoles", "form_locals", "eval_locals",
                  "coarsen_multipoles", "refine_locals"]:
        measured = timing_results[-1][field]["process_elapsed"]
        pred_err = (
                (measured - predicted_time[field])
                / measured)
        logger.info("actual/predicted time for %s: %.3g/%.3g -> %g %% error",
                field,
                measured,
                predicted_time[field],
                abs(100*pred_err))
Example #12
0
    return context, devices[0]


# Timing reference for the (commented-out) pure-Python loop, kept for
# comparison against the GPU path below.
t1 = time.time()
a = 0
#for i in range(A):
#	a = a + 1
t2 = time.time()
timeWithoutGPU = t2 - t1

a = numpy.zeros((5, ), dtype=numpy.float32)
b = numpy.zeros((5, ), dtype=numpy.float32)
c = numpy.zeros((5, ), dtype=numpy.float32)

context, device = CreateContext()
commandQueue = cl.CommandQueue(context, device)
program = cl.Program(context, kernelStr)
program.build(devices=[device])
mf = cl.mem_flags
# Keep host-side handles to the numpy arrays before the names a/b/c are
# rebound to device buffers below.
a2 = a
b2 = b
c2 = c
a = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a2)
b = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b2)
c = cl.Buffer(context, mf.READ_WRITE, c.nbytes)
t3 = time.time()
print("Time taken for setting GPU is " + str(t3 - t2))
for i in range(100):
    program.add(commandQueue, (5, 5, 5), None, a, b, c)
# enqueue_read_buffer() was removed from modern PyOpenCL; enqueue_copy()
# is the supported way to read a device buffer back into a host array.
cl.enqueue_copy(commandQueue, a2, a).wait()
cl.enqueue_copy(commandQueue, b2, b).wait()
def find_mode():
    """Search for a propagating-mode wavenumber ``beta`` of a two-material
    dielectric interface problem via Muller's method.

    The mode condition is detected as a singularity of the boundary operator:
    ``1 / (x . A(beta)^{-1} y)`` tends to zero at a mode for random probe
    vectors x, y.  Prints the converged beta.

    Relies on module-level names: ``cl``, ``np``, ``la``, ``sym``,
    ``partial``.
    """
    import warnings
    # Turn accidental complex->real downcasts into hard errors.
    warnings.simplefilter("error", np.ComplexWarning)

    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)

    k0 = 1.4447
    k1 = k0 * 1.02  # second domain's wavenumber, slightly above k0
    beta_sym = sym.var("beta")

    from pytential.symbolic.pde.scalar import (  # noqa
        DielectricSRep2DBoundaryOperator as SRep,
        DielectricSDRep2DBoundaryOperator as SDRep)
    pde_op = SDRep(mode="te",
                   k_vacuum=1,
                   interfaces=((0, 1, sym.DEFAULT_SOURCE), ),
                   domain_k_exprs=(k0, k1),
                   beta=beta_sym,
                   use_l2_weighting=False)

    u_sym = pde_op.make_unknown("u")
    op = pde_op.operator(u_sym)

    # {{{ discretization setup

    from meshmode.mesh.generation import ellipse, make_curve_mesh
    curve_f = partial(ellipse, 1)

    target_order = 7
    qbx_order = 4
    nelements = 30

    from meshmode.mesh.processing import affine_map
    mesh = make_curve_mesh(curve_f, np.linspace(0, 1, nelements + 1),
                           target_order)
    # Scale the unit circle so its radius is a few wavelengths.
    lambda_ = 1.55
    circle_radius = 3.4 * 2 * np.pi / lambda_
    mesh = affine_map(mesh, A=circle_radius * np.eye(2))

    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import \
            InterpolatoryQuadratureSimplexGroupFactory
    from pytential.qbx import QBXLayerPotentialSource
    density_discr = Discretization(
        cl_ctx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order))

    qbx = QBXLayerPotentialSource(
        density_discr,
        4 * target_order,
        qbx_order,
        # Don't use FMM for now
        fmm_order=False)

    # }}}

    # Random probe vectors for the singularity indicator.
    x_vec = np.random.randn(len(u_sym) * density_discr.nnodes)
    y_vec = np.random.randn(len(u_sym) * density_discr.nnodes)

    def muller_solve_func(beta):
        # Build the dense operator matrix at this trial beta and evaluate
        # the (reciprocal) singularity indicator.
        from pytential.symbolic.execution import build_matrix
        mat = build_matrix(queue, qbx, op, u_sym, context={"beta": beta}).get()

        return 1 / x_vec.dot(la.solve(mat, y_vec))

    starting_guesses = (1 + 0j) * (k0 + (k1 - k0) * np.random.rand(3))

    from pytential.muller import muller
    beta, niter = muller(muller_solve_func, z_start=starting_guesses)
    # Bug fix: previously printed the literal string "beta", discarding the
    # computed result.
    print("beta", beta)
Example #14
0
 def __init__(self):
     """Create a default OpenCL context and an in-order command queue.

     Attributes set here:
         ctx   -- pyopencl Context chosen via create_some_context()
         queue -- CommandQueue bound to that context
         tick  -- boolean flag, starts False (consumed by other methods of
                  this class not visible in this chunk -- verify usage)
     """
     self.ctx = cl.create_some_context()
     self.queue = cl.CommandQueue(self.ctx)
     # Toggle/phase flag; semantics defined by the rest of the class.
     self.tick = False
Example #15
0
def refine_and_generate_chart_function(mesh, filename, function):
    """Iteratively refine *mesh* wherever *function* flags elements, timing
    each refinement pass; save a time-vs-flagged-elements chart to
    *filename* (PDF) and write the final mesh as VTK files
    ("geometry2.vtu", "connectivity2.vtu").

    Relies on module-level names: ``Refiner``, ``get_function_flags``,
    ``remove_if_exists``, ``cl`` and ``order``.
    """
    # time.clock() was removed in Python 3.8; perf_counter() is the
    # documented replacement for measuring short elapsed intervals.
    from time import perf_counter
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)
    print("NELEMENTS: ", mesh.nelements)
    for i in range(len(mesh.groups[0].vertex_indices[0])):
        for k in range(len(mesh.vertices)):
            print(mesh.vertices[k, i])

    r = Refiner(mesh)
    num_elements = []  # flagged-element count per refinement pass
    time_t = []        # wall time per refinement pass
    while True:
        print("NELS:", mesh.nelements)
        flags = get_function_flags(mesh, function)
        # Count flagged elements; stop once nothing is left to refine.
        nels = 0
        for i in flags:
            if i:
                nels += 1
        if nels == 0:
            break
        print("LKJASLFKJALKASF:", nels)
        num_elements.append(nels)
        beg = perf_counter()
        mesh = r.refine(flags)
        end = perf_counter()
        time_taken = end - beg
        time_t.append(time_taken)

    import matplotlib.pyplot as pt
    pt.xlabel('Number of elements being refined')
    pt.ylabel('Time taken')
    pt.plot(num_elements, time_t, "o")
    pt.savefig(filename, format='pdf')
    pt.clf()
    print('DONE REFINING')

    # Discretize the refined mesh and write it out for inspection.
    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import \
            PolynomialWarpAndBlendGroupFactory
    discr = Discretization(cl_ctx, mesh,
                           PolynomialWarpAndBlendGroupFactory(order))
    from meshmode.discretization.visualization import make_visualizer
    vis = make_visualizer(queue, discr, order)
    remove_if_exists("connectivity2.vtu")
    remove_if_exists("geometry2.vtu")
    vis.write_vtk_file("geometry2.vtu", [
        ("f", discr.nodes()[0]),
    ])

    from meshmode.discretization.visualization import \
            write_nodal_adjacency_vtk_file

    write_nodal_adjacency_vtk_file("connectivity2.vtu", mesh)
Example #16
0
 def __init__(self):
     """Create an OpenCL context via the class's private helper, a command
     queue on it, and an empty cache for built programs.

     Attributes set here:
         ctx      -- Context returned by self.__create_context()
         queue    -- CommandQueue bound to that context
         programs -- dict cache of built cl.Program objects, keyed by the
                     rest of the class (not visible here -- verify)
     """
     self.ctx = self.__create_context()
     self.queue = cl.CommandQueue(self.ctx)
     self.programs = {}  # The built programs
Example #17
0
def test_partition_points(ctx_factory, use_tree, ndim, visualize=False):
    """Smoke test: building a block index over a QBX discretization with the
    given tree option and dimension must complete without error."""
    cl_context = ctx_factory()
    command_queue = cl.CommandQueue(cl_context)

    qbx_source = _build_qbx_discr(command_queue, ndim=ndim)
    _build_block_index(qbx_source.density_discr,
                       use_tree=use_tree,
                       factor=0.6)
Example #18
0
                          usage=GL_DYNAMIC_DRAW,
                          target=GL_ARRAY_BUFFER)
    gl_position.bind()
    np_color = np.ndarray((num_particles, 4), dtype=np.float32)
    gl_color = vbo.VBO(data=np_color,
                       usage=GL_DYNAMIC_DRAW,
                       target=GL_ARRAY_BUFFER)
    gl_color.bind()

    # Define pyopencl context and queue based on available hardware
    platform = cl.get_platforms()[0]
    dev = platform.get_devices(device_type=cl.device_type.GPU)
    context = cl.Context(
        properties=[(cl.context_properties.PLATFORM, platform)] +
        get_gl_sharing_context_properties())
    queue = cl.CommandQueue(context)

    cl_velocity = cl.Buffer(context, mf.COPY_HOST_PTR, hostbuf=np_velocity)
    cl_zmel = cl.Buffer(context, mf.COPY_HOST_PTR, hostbuf=np_zmel)
    cl_start_position = cl.Buffer(context,
                                  mf.READ_ONLY | mf.COPY_HOST_PTR,
                                  hostbuf=np_position)
    cl_start_velocity = cl.Buffer(context,
                                  mf.READ_ONLY | mf.COPY_HOST_PTR,
                                  hostbuf=np_velocity)

    # Buffer object depends on version of PyOpenCL
    if hasattr(gl_position, 'buffers'):
        cl_gl_position = cl.GLBuffer(context, mf.READ_WRITE,
                                     int(gl_position.buffers[0]))
        cl_gl_color = cl.GLBuffer(context, mf.READ_WRITE,
Example #19
0
def test_interaction_points(ctx_factory, ndim, factor, visualize=False):
    """Check that each block's gathered neighbor points are disjoint from
    the block's own source points; optionally plot the blocks (2D) or write
    per-block VTK marker files (3D).
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    qbx = _build_qbx_discr(queue, ndim=ndim)
    srcindices = _build_block_index(qbx.density_discr, factor=factor)

    # generate proxy points
    from pytential.linalg.proxy import ProxyGenerator
    generator = ProxyGenerator(qbx)
    _, _, pxycenters, pxyradii = generator(queue, srcindices)

    from pytential.linalg.proxy import (  # noqa
        gather_block_neighbor_points, gather_block_interaction_points)
    nbrindices = gather_block_neighbor_points(qbx.density_discr, srcindices,
                                              pxycenters, pxyradii)
    nodes, ranges = gather_block_interaction_points(qbx, srcindices)

    # Pull index sets back to the host for per-block checks.
    srcindices = srcindices.get(queue)
    nbrindices = nbrindices.get(queue)

    # Core assertion: neighbor points never overlap the block's own sources.
    for i in range(srcindices.nblocks):
        isrc = srcindices.block_indices(i)
        inbr = nbrindices.block_indices(i)

        assert not np.any(np.isin(inbr, isrc))

    if visualize:
        if ndim == 2:
            import matplotlib.pyplot as pt
            density_nodes = qbx.density_discr.nodes().get(queue)
            nodes = nodes.get(queue)
            ranges = ranges.get(queue)

            # One figure per block: all nodes, indexed nodes, block sources,
            # neighbors, and the block's interaction points.
            for i in range(srcindices.nblocks):
                isrc = srcindices.block_indices(i)
                inbr = nbrindices.block_indices(i)
                iall = np.s_[ranges[i]:ranges[i + 1]]

                pt.figure(figsize=(10, 8))
                pt.plot(density_nodes[0],
                        density_nodes[1],
                        'ko',
                        ms=2.0,
                        alpha=0.5)
                pt.plot(density_nodes[0, srcindices.indices],
                        density_nodes[1, srcindices.indices],
                        'o',
                        ms=2.0)
                pt.plot(density_nodes[0, isrc],
                        density_nodes[1, isrc],
                        'o',
                        ms=2.0)
                pt.plot(density_nodes[0, inbr],
                        density_nodes[1, inbr],
                        'o',
                        ms=2.0)
                pt.plot(nodes[0, iall], nodes[1, iall], 'x', ms=2.0)
                pt.xlim([-1.5, 1.5])
                pt.ylim([-1.5, 1.5])

                filename = "test_area_query_{}d_{:04}.png".format(ndim, i)
                pt.savefig(filename, dpi=300)
                pt.clf()
        elif ndim == 3:
            from meshmode.discretization.visualization import make_visualizer
            marker = np.empty(qbx.density_discr.nnodes)

            # One VTK file per block, marking sources (-42) and neighbors (+42).
            for i in range(srcindices.nblocks):
                isrc = srcindices.block_indices(i)
                inbr = nbrindices.block_indices(i)

                marker.fill(0.0)
                marker[srcindices.indices] = 0.0
                marker[isrc] = -42.0
                marker[inbr] = +42.0
                marker_dev = cl.array.to_device(queue, marker)

                vis = make_visualizer(queue, qbx.density_discr, 10)
                filename = "test_area_query_{}d_{:04}.vtu".format(ndim, i)
                vis.write_vtk_file(filename, [
                    ("marker", marker_dev),
                ])
Example #20
0
def main():
    """Bandwidth test for a Xilinx FPGA device via PyOpenCL.

    Parses -k/--kernel (xclbin path) and -d/--device (device index),
    programs the device, then runs the bandwidth1/bandwidth2 kernels with
    increasing burst lengths and reports throughput in MB/s.  Exits with
    status 1 on any setup or verification failure.
    """
    platform_ID = None
    globalbuffersize = 1024 * 1024 * 16  # 16 MB per buffer
    typesize = 512                       # bus width in bits
    threshold = 40000                    # min. acceptable MB/s (adjusted per shell)
    # Reference throughput table (MB/s) per bus width; informational only.
    expected = np.array([[300, 240, 450, 250, 250, 250],        # 32 bits
                         [600, 500, 1000, 500, 500, 500],       # 64 bits
                         [1100, 900, 1500, 1100, 1100, 1100],   # 128 bits
                         [1500, 1500, 1900, 2200, 2200, 2200],  # 256 bits
                         [1900, 2000, 2300, 3800, 3800, 3800]   # 512 bits
                         ])

    # Process cmd line args
    parser = OptionParser()
    parser.add_option("-k", "--kernel", help="xclbin path")
    parser.add_option("-d", "--device", help="device index")

    (options, args) = parser.parse_args()
    xclbin = options.kernel
    index = options.device

    if xclbin is None:
        print("No xclbin specified\nUsage: -k <path to xclbin>")
        sys.exit(1)

    if index is None:
        index = 0  # get default device

    platforms = cl.get_platforms()
    # get Xilinx platform
    for i in platforms:
        if i.name == "Xilinx":
            platform_ID = platforms.index(i)
            print("\nPlatform Information:")
            print("Platform name:       %s" % platforms[platform_ID].name)
            print("Platform version:    %s" % platforms[platform_ID].version)
            print("Platform profile:    %s" % platforms[platform_ID].profile)
            print("Platform extensions: %s" % platforms[platform_ID].extensions)
            break

    if platform_ID is None:
        # make sure xrt is sourced
        # run clinfo to make sure Xilinx platform is discoverable
        print("ERROR: Plaform not found")
        sys.exit(1)

    # choose device
    devices = platforms[platform_ID].get_devices()
    if int(index) > len(devices) - 1:
        print("\nERROR: Index out of range. %d devices were found" % len(devices))
        sys.exit(1)
    else:
        dev = devices[int(index)]

    # Some shells are known to sustain less bandwidth; lower the pass bar.
    if "qdma" in str(dev):
        threshold = 30000

    if "U2x4" in str(dev):
        threshold = 15000

    if "gen3x4" in str(dev):
        threshold = 20000

    ctx = cl.Context(devices=[dev])
    if not ctx:
        print("ERROR: Failed to create context")
        sys.exit(1)

    commands = cl.CommandQueue(
        ctx, dev,
        properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)

    if not commands:
        print("ERROR: Failed to create command queue")
        sys.exit(1)

    print("Loading xclbin")

    # xclbin is a binary container: open in binary mode (text mode may fail
    # to decode) and close the handle instead of leaking it.
    with open(xclbin, "rb") as xclbin_file:
        prg = cl.Program(ctx, [dev], [xclbin_file.read()])

    try:
        prg.build()
    except Exception:
        print("ERROR:")
        # Program.get_build_info takes the *device*, not the context.
        print(prg.get_build_info(dev, cl.program_build_info.LOG))
        raise

    knl1 = prg.bandwidth1
    knl2 = prg.bandwidth2

    # input host and buffer
    lst = [i % 256 for i in range(globalbuffersize)]
    input_host1 = np.array(lst).astype(np.uint8)
    input_host2 = np.array(lst).astype(np.uint8)

    input_buf1 = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=input_host1)
    input_buf2 = cl.Buffer(ctx, cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR, hostbuf=input_host2)

    if input_buf1.int_ptr is None or input_buf2.int_ptr is None:
        print("ERROR: Failed to allocate source buffer")
        sys.exit(1)

    # output host and buffer
    output_host1 = np.empty_like(input_host1, dtype=np.uint8)
    output_host2 = np.empty_like(input_host2, dtype=np.uint8)

    output_buf1 = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, output_host1.nbytes)
    output_buf2 = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, output_host2.nbytes)

    if output_buf1.int_ptr is None or output_buf2.int_ptr is None:
        print("ERROR: Failed to allocate destination buffer")
        sys.exit(1)

    # per-test result lists
    dnsduration = []
    dsduration = []
    dbytes = []
    dmbytes = []
    bpersec = []
    mbpersec = []

    # run tests with increasing burst length; quadruple beats each test
    test = 0
    beats = 16
    throughput = []
    # Integer bytes-per-beat: slice limits below must be ints (true division
    # produced a float, which raises TypeError as a slice bound on Python 3).
    bytes_per_beat = typesize // 8
    while beats <= 1024:
        print("LOOP PIPELINE %d beats" % beats)

        usduration = 0
        fiveseconds = 5 * 1000000
        reps = 64
        # Double reps until one run takes at least five seconds.
        while usduration < fiveseconds:
            start = current_micro_time()
            knl1(commands, (1, ), (1, ), output_buf1, input_buf1, np.uint32(beats), np.uint32(reps))
            knl2(commands, (1, ), (1, ), output_buf2, input_buf2, np.uint32(beats), np.uint32(reps))
            commands.finish()
            end = current_micro_time()

            usduration = end - start

            cl.enqueue_copy(commands, output_host1, output_buf1).wait()
            cl.enqueue_copy(commands, output_host2, output_buf2).wait()

            # verify the copied prefix matches the input
            limit = beats * bytes_per_beat
            if not np.array_equal(output_host1[:limit], input_host1[:limit]):
                print("ERROR: Failed to copy entries")
                input_buf1.release()
                input_buf2.release()
                output_buf1.release()
                output_buf2.release()
                sys.exit(1)

            if not np.array_equal(output_host2[:limit], input_host2[:limit]):
                print("ERROR: Failed to copy entries")
                input_buf1.release()
                input_buf2.release()
                output_buf1.release()
                output_buf2.release()
                sys.exit(1)

            if usduration < fiveseconds:
                reps = reps * 2

        dnsduration.append(usduration)
        dsduration.append(dnsduration[test] / 1000000)
        dbytes.append(reps * beats * bytes_per_beat)
        dmbytes.append(dbytes[test] / (1024 * 1024))
        # NOTE(review): a factor of 2 is applied both here and in the MB/s
        # conversion below (4x total) -- presumably two kernels x read+write;
        # verify against the kernel definitions.
        bpersec.append(2 * dbytes[test] / dsduration[test])
        mbpersec.append(2 * bpersec[test] / (1024 * 1024))
        throughput.append(mbpersec[test])
        print("Test %d, Throughput: %d MB/s" % (test, throughput[test]))
        beats = beats * 4
        test += 1

    # cleanup
    input_buf1.release()
    input_buf2.release()
    output_buf1.release()
    output_buf2.release()
    del ctx

    print("TTTT: %d" % throughput[0])
    print("Maximum throughput: %d MB/s" % max(throughput))
    if max(throughput) < threshold:
        print("ERROR: Throughput is less than expected value of %d GB/sec" % (threshold / 1000))
        sys.exit(1)

    print("PASSED")
Example #21
0
def kMerCount(file, nK):
    """Count k-mer frequencies of the DNA sequence in *file* on an OpenCL
    device.

    Returns h_numb_seq[2 : 4**nK + 2 + 4]: the four A/C/G/T base counts
    followed by the 4**nK k-mer frequency table (slot layout is defined by
    the kernels below).  Raises AssertionError if the sequence contains
    characters other than A/C/G/T/N.

    Relies on module-level names ``genSeq`` and ``CPU_SIDE_INT``.
    """
    K = nK
    h_seq = genSeq(file)
    # Prepend 2 + 4 + 4**K zeroed header slots: [unknown-char count,
    # N-in-kmer count, A, C, G, T counts, k-mer frequency table].
    h_seq = np.concatenate(
        (np.zeros(2 + 4 + 4**K).astype(CPU_SIDE_INT), h_seq))

    kernelsource = '''
	__kernel void mapToNumb(
		const int N,
		const int M,
		const int numbKmer,
		__global int* seq,
		__global int* numb_seq
	)
	{
		int gid = get_global_id(0);
		int idx = gid * M + numbKmer + 2 + 4;
		int i, letter;

		if(idx < N*M + numbKmer + 2 + 4) {
			for(i=0; i < M; i++) {
				letter = seq[idx+i];
				if(letter == 65) {
					numb_seq[idx+i] = 0;
					atomic_inc(&numb_seq[2]);
				} else {
				if(letter == 67) {
					numb_seq[idx+i] = 1;
					atomic_inc(&numb_seq[3]);
				} else {
				if(letter == 71) {
					numb_seq[idx+i] = 2;
					atomic_inc(&numb_seq[4]);
				} else {
				if(letter == 84) {
					numb_seq[idx+i] = 3;
					atomic_inc(&numb_seq[5]);
				} else {
				if(letter == 78) {
					numb_seq[idx+i] = -1;
				} else {
					numb_seq[idx+i] = -2;
				}
				}
				}
				}
				}
			}
		}
	}
	__kernel void freqTab(
		const int N,
		const int M,
		const int nK,
		const int numbKmer,
		__global int* numb_seq
	) {
		int gid = get_global_id(0);
		int idx = gid * M + numbKmer + 2 + 4;
		int i, numb;
		int k, p, loc_idx, ptn_idx;
		int dgt;
		int kmin;
		for(i=0; i < M; i++) {
			ptn_idx = 0;
			loc_idx = idx + i;
			kmin = 0;
			if(loc_idx <= (N*M + numbKmer + 2 + 4 - nK)) {
				for(k=0; k < nK; k++) {
					numb = numb_seq[loc_idx + k];
					switch(numb) {
						case (-1):
							atomic_inc(&numb_seq[1]);
							break;
						case (-2):
							atomic_inc(&numb_seq[0]);
							break;
						default:
							dgt = (int)(pow(4, (float)(nK-1-k)));
							ptn_idx += dgt * numb;
							break;
					}
					if(numb < kmin) {
						kmin = numb;
					}
				}
				if(kmin >= 0) {
					atomic_inc(&numb_seq[ptn_idx+2+4]);
				}
			}
		}
	}
	'''

    context = cl.create_some_context()
    device = context.devices[0]

    work_group_size = device.max_work_group_size
    work_item_size = device.max_work_item_sizes[0]
    print(work_group_size)
    print(work_item_size)

    numbGroups = work_group_size
    numbItems = work_item_size

    # Pad the sequence with 'N' (ASCII 78) so it splits evenly into
    # numbGroups * numbItems chunks of length q.
    seqLen = np.size(h_seq) - 4**K - 2 - 4
    q, r = divmod(seqLen, numbGroups * numbItems)
    q = q + 1
    h_seq = np.concatenate(
        (h_seq, np.repeat(78,
                          numbGroups * numbItems - r).astype(CPU_SIDE_INT)))
    h_numb_seq = np.zeros(np.size(h_seq)).astype(CPU_SIDE_INT)
    print(q)
    print(r)

    queue = cl.CommandQueue(context)
    program = cl.Program(context, kernelsource).build()
    mapToNumb = program.mapToNumb
    mapToNumb.set_scalar_arg_dtypes([np.int32, np.int32, np.int32, None, None])
    freqTab = program.freqTab
    freqTab.set_scalar_arg_dtypes(
        [np.int32, np.int32, np.int32, np.int32, None])

    d_seq = cl.Buffer(context,
                      cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                      hostbuf=h_seq)
    d_numb_seq = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_numb_seq.nbytes)
    # np.int was removed in NumPy 1.24; use an explicit 32-bit zero fill
    # pattern to match the kernel's "int" buffer elements.
    cl.enqueue_fill_buffer(queue, d_numb_seq,
                           np.zeros(1, dtype=np.int32), 0, h_numb_seq.nbytes)

    N = numbGroups * numbItems
    M = q
    numbKmer = 4**K
    globalsize = (N, )
    localsize = (numbItems, )

    # Pass 1: map ASCII bases to numeric codes and count single bases.
    mapToNumb(queue, globalsize, None, N, M, numbKmer, d_seq, d_numb_seq)

    queue.finish()

    # Pass 2: accumulate the k-mer frequency table.
    freqTab(queue, globalsize, None, N, M, K, numbKmer, d_numb_seq)

    queue.finish()

    cl.enqueue_copy(queue, h_numb_seq, d_numb_seq)

    print("Counting Done")

    print(h_numb_seq[:numbKmer + 2 + 4])
    assert (h_numb_seq[0] == 0
            ), "File contains unknown nucleotide characters"  #Sanity check

    return h_numb_seq[2:numbKmer + 2 + 4]
Example #22
0
class Runner:
    """Generates colour-mapped plasma animation frames, using an OpenCL
    kernel when pyopencl is available."""

    def __init__(self, dims):
        """Build the colour look-up table from several matplotlib colormaps
        and initialise the GPU path.

        dims: (width, height) of the output image.
        """
        self.width = dims[0]
        self.height = dims[1]
        import numpy as np
        from matplotlib import cm
        from itertools import cycle
        self.np = np

        color_maps = [
            'inferno', 'gnuplot', 'magma', 'viridis', 'plasma', 'cubehelix',
            'gnuplot2', 'ocean', 'terrain', 'CMRmap', 'nipy_spectral'
        ]
        # Sample each colormap at 256 points into uint8 RGB rows.
        # (Python 3 fix: map() returns an iterator, which np.array() would
        # wrap as a 0-d object array -- use a list comprehension instead.)
        maps = [
            np.array([
                (np.array(cm.get_cmap(x, 256)(i)[:-1]) * 255).astype(np.uint8)
                for i in np.arange(0, 256)
            ]) for x in color_maps
        ]

        # Hand-picked palette sequence, alternating forward/reversed ramps,
        # then mirrored so the animation loops smoothly.
        tup = (maps[0], maps[0][::-1], maps[1], maps[1][::-1], maps[2],
               maps[2][::-1], maps[5], maps[5][::-1], maps[3], maps[4][::-1],
               maps[6], maps[7][::-1], maps[8], maps[9][::-1], maps[10],
               maps[10][::-1])
        self.cols = np.concatenate(tup)
        self.cols = np.concatenate((self.cols, self.cols[::-1]))
        self.step = 0
        # Endless cycle of divisors that animates the ripple wavelength.
        denoms = np.cos(np.arange(0, 3 * np.pi, 0.01)) + (2 * np.pi)
        self.denom = cycle(denoms)
        self.up = True
        self.half_len = len(self.cols) / 64.
        self.init_gpu()

    def init_gpu(self):
        """Initialise the OpenCL context, LUT buffer and plasma kernel.

        Leaves self.use_cl False (and returns) if pyopencl cannot be
        imported, so callers can fall back to a CPU path.
        """
        self.use_cl = False
        try:
            import pyopencl as cl
            from pyopencl import array
        except Exception:
            # Python 3 fix: "except Exception, e" is Python-2-only syntax;
            # the bound name was unused anyway.
            import traceback
            traceback.print_exc()
            return
        self.use_cl = True
        self.cl = cl
        # NOTE(review): hard-codes platform index 1 / device 0 -- verify on
        # the target machine.
        self.ctx = cl.Context([cl.get_platforms()[1].get_devices()[0]])
        self.queue = cl.CommandQueue(self.ctx)
        # Pack the RGB LUT into char3 vectors for the kernel.
        self.lut = self.np.empty(len(self.cols), cl.array.vec.char3)
        for idx, i in enumerate(self.cols):
            self.lut[idx][0] = i[0]
            self.lut[idx][1] = i[1]
            self.lut[idx][2] = i[2]
        mf = cl.mem_flags
        self.lut_opencl = cl.Buffer(self.ctx,
                                    mf.READ_ONLY | mf.COPY_HOST_PTR,
                                    hostbuf=self.lut)

        self.prg = cl.Program(
            self.ctx, """
               __kernel void plasma(__global uchar4 *img, __constant uchar4 *lut, float const denom, uint step,
                                         uint const height, uint const width, uint const colours) {{
                   const int x = get_global_id(0);
                   const int y = get_global_id(1);
                   const int index = y * height + x;
                   const float half_len = {0};
                   const int h = step + half_len + (half_len * {1}sin((float){1}sqrt(pow(x - width / 2.,2)+ pow(y - height / 2.,2)) / denom));
                   if( h  < colours) {{
                       img[index] = lut[h];
                   }} else {{
                       img[index] = lut[colours - h];
                   }}
               }}
           """.format(self.half_len, 'native_')).build()
Example #23
0
def main(snapshot_pattern="wave-mpi-{step:04d}-{rank:04d}.pkl", restart_step=None,
         use_profiling=False, use_logmgr=False, actx_class=PyOpenCLArrayContext):
    """Drive the MPI-parallel wave example.

    :arg snapshot_pattern: filename template for restart snapshots, formatted
        with ``step`` and ``rank``.
    :arg restart_step: if not *None*, resume from that snapshot instead of
        generating a fresh mesh and initial condition.
    :arg use_profiling: enable OpenCL command-queue profiling.
    :arg use_logmgr: enable the log manager (SQLite output).
    :arg actx_class: array-context class to instantiate.
    """
    cl_ctx = cl.create_some_context()
    # (The command queue is created below once the profiling choice is known;
    # the extra unconditional CommandQueue that used to be created here was
    # dead and has been removed.)

    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    num_parts = comm.Get_size()

    logmgr = initialize_logmgr(use_logmgr,
        filename="wave-mpi.sqlite", mode="wu", mpi_comm=comm)
    if use_profiling:
        queue = cl.CommandQueue(cl_ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)
        actx = actx_class(queue,
            allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
            logmgr=logmgr)
    else:
        queue = cl.CommandQueue(cl_ctx)
        actx = actx_class(queue,
            allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)))

    if restart_step is None:
        # Fresh start: build the mesh on the manager rank and distribute it.
        from meshmode.distributed import MPIMeshDistributor, get_partition_by_pymetis
        mesh_dist = MPIMeshDistributor(comm)

        dim = 2
        nel_1d = 16

        if mesh_dist.is_mananger_rank():
            from meshmode.mesh.generation import generate_regular_rect_mesh
            mesh = generate_regular_rect_mesh(
                a=(-0.5,)*dim, b=(0.5,)*dim,
                nelements_per_axis=(nel_1d,)*dim)

            print("%d elements" % mesh.nelements)
            part_per_element = get_partition_by_pymetis(mesh, num_parts)
            local_mesh = mesh_dist.send_mesh_parts(mesh, part_per_element, num_parts)

            # The global mesh is no longer needed once it is partitioned.
            del mesh

        else:
            local_mesh = mesh_dist.receive_mesh_part()

        # Initial condition is set up below, after the discretization exists.
        fields = None

    else:
        # Restart: each rank reads its own snapshot.
        from mirgecom.restart import read_restart_data
        restart_data = read_restart_data(
            actx, snapshot_pattern.format(step=restart_step, rank=rank)
        )
        local_mesh = restart_data["local_mesh"]
        nel_1d = restart_data["nel_1d"]
        assert comm.Get_size() == restart_data["num_parts"]

    order = 3

    discr = EagerDGDiscretization(actx, local_mesh, order=order,
                                  mpi_communicator=comm)

    # CFL-based time step from the smallest characteristic length scale.
    current_cfl = 0.485
    wave_speed = 1.0
    from grudge.dt_utils import characteristic_lengthscales
    dt = current_cfl * characteristic_lengthscales(actx, discr) / wave_speed

    from grudge.op import nodal_min
    dt = nodal_min(discr, "vol", dt)

    t_final = 1

    if restart_step is None:
        t = 0
        istep = 0

        # Initial condition: a bump in u, zero velocity components.
        fields = flat_obj_array(
            bump(actx, discr),
            [discr.zeros(actx) for i in range(discr.dim)]
            )

    else:
        t = restart_data["t"]
        istep = restart_step
        assert istep == restart_step
        restart_fields = restart_data["fields"]
        old_order = restart_data["order"]
        if old_order != order:
            # Snapshot was written at a different order; project it onto the
            # current discretization via a same-mesh connection.
            old_discr = EagerDGDiscretization(actx, local_mesh, order=old_order,
                                              mpi_communicator=comm)
            from meshmode.discretization.connection import make_same_mesh_connection
            connection = make_same_mesh_connection(actx, discr.discr_from_dd("vol"),
                                                   old_discr.discr_from_dd("vol"))
            fields = connection(restart_fields)
        else:
            fields = restart_fields

    if logmgr:
        logmgr_add_cl_device_info(logmgr, queue)
        logmgr_add_device_memory_usage(logmgr, queue)

        logmgr.add_watches(["step.max", "t_step.max", "t_log.max"])

        # Memory-usage quantities are not available on every platform.
        try:
            logmgr.add_watches(["memory_usage_python.max", "memory_usage_gpu.max"])
        except KeyError:
            pass

        if use_profiling:
            logmgr.add_watches(["multiply_time.max"])

        vis_timer = IntervalTimer("t_vis", "Time spent visualizing")
        logmgr.add_quantity(vis_timer)

    vis = make_visualizer(discr)

    def rhs(t, w):
        return wave_operator(discr, c=wave_speed, w=w)

    compiled_rhs = actx.compile(rhs)

    while t < t_final:
        if logmgr:
            logmgr.tick_before()

        # restart must happen at beginning of step
        if istep % 100 == 0 and (
                # Do not overwrite the restart file that we just read.
                istep != restart_step):
            from mirgecom.restart import write_restart_file
            write_restart_file(
                actx, restart_data={
                    "local_mesh": local_mesh,
                    "order": order,
                    "fields": fields,
                    "t": t,
                    "step": istep,
                    "nel_1d": nel_1d,
                    "num_parts": num_parts},
                filename=snapshot_pattern.format(step=istep, rank=rank),
                comm=comm
            )

        if istep % 10 == 0:
            print(istep, t, discr.norm(fields[0]))
            vis.write_parallel_vtk_file(
                comm,
                "fld-wave-mpi-%03d-%04d.vtu" % (rank, istep),
                [
                    ("u", fields[0]),
                    ("v", fields[1:]),
                ], overwrite=True
            )

        # Freeze/thaw before stepping, then advance one RK4 step.
        fields = thaw(freeze(fields, actx), actx)
        fields = rk4_step(fields, t, dt, compiled_rhs)

        t += dt
        istep += 1

        if logmgr:
            set_dt(logmgr, dt)
            logmgr.tick_after()

    # Regression check against a known-good final norm.
    final_soln = discr.norm(fields[0])
    assert np.abs(final_soln - 0.04409852463947439) < 1e-14
Example #24
0
def main(use_profiling=False):
    """Drive the example."""
    cl_ctx = cl.create_some_context()

    # Choose the array-context flavor to match the queue: a profiling queue
    # gets the profiling-aware context.
    if use_profiling:
        queue = cl.CommandQueue(
            cl_ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
        actx_class = PyOpenCLProfilingArrayContext
    else:
        queue = cl.CommandQueue(cl_ctx)
        actx_class = PyOpenCLArrayContext
    actx = actx_class(
        queue,
        allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)))

    dim = 2
    nel_1d = 16
    from meshmode.mesh.generation import generate_regular_rect_mesh

    mesh = generate_regular_rect_mesh(
        a=(-0.5, ) * dim, b=(0.5, ) * dim,
        nelements_per_axis=(nel_1d, ) * dim)

    order = 3

    # Per-dimension time-step fudge factors (no deep meaning here).
    dt_fudge = {2: 0.7, 3: 0.4}
    if dim not in dt_fudge:
        raise ValueError("don't have a stable time step guesstimate")
    dt = dt_fudge[dim] / (nel_1d * order**2)

    print("%d elements" % mesh.nelements)

    discr = EagerDGDiscretization(actx, mesh, order=order)

    # Initial condition: a bump in u, zero velocity components.
    fields = flat_obj_array(
        bump(actx, discr),
        [discr.zeros(actx) for _ in range(discr.dim)])

    vis = make_visualizer(discr)

    def rhs(t, w):
        return wave_operator(discr, c=1, w=w)

    t = 0
    t_final = 3
    istep = 0
    while t < t_final:
        fields = rk4_step(fields, t, dt, rhs)

        # Periodic reporting and visualization output.
        if istep % 10 == 0:
            if use_profiling:
                print(actx.tabulate_profiling_data())
            print(istep, t, discr.norm(fields[0], np.inf))
            vis.write_vtk_file("fld-wave-eager-%04d.vtu" % istep, [
                ("u", fields[0]),
                ("v", fields[1:]),
            ])

        t += dt
        istep += 1
Example #25
0
def test_to_meshmode_interpolation_3d_nonexact(ctx_factory, params):
    """Check that 3D non-exact to-meshmode interpolation error stays small."""
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)
    err = drive_test_to_meshmode_interpolation(
        cl_ctx, queue, 3, *params, test_case='non-exact')
    assert err < 1e-3
Example #26
0
def initialise_opencl_object(self,
                             program_src='',
                             command_queue=None,
                             interactive=False,
                             platform_pref=None,
                             device_pref=None,
                             default_group_size=None,
                             default_num_groups=None,
                             default_tile_size=None,
                             default_threshold=None,
                             size_heuristics=None,
                             required_types=None,
                             all_sizes=None,
                             user_sizes=None):
    """Create an OpenCL context/queue, resolve tuning sizes and build the
    program.

    :arg program_src: OpenCL C source to build.
    :arg command_queue: reuse an existing queue (and its context) if given;
        otherwise a context is obtained via ``get_prefered_context``.
    :arg size_heuristics: heuristics consulted for unset default sizes.
    :arg required_types: types whose device support is verified.
    :arg all_sizes: mapping of size name -> ``{'class': ..., 'value': ...}``.
    :arg user_sizes: user-requested overrides for entries of *all_sizes*;
        may also carry ``default_group_size`` etc. entries, which are
        consumed here.
    :returns: the built :class:`pyopencl.Program`.
    """
    # Normalize mutable defaults (was "[]"/"{}" defaults shared across calls).
    size_heuristics = [] if size_heuristics is None else size_heuristics
    required_types = [] if required_types is None else required_types
    all_sizes = {} if all_sizes is None else all_sizes
    user_sizes = {} if user_sizes is None else user_sizes

    if command_queue is None:
        self.ctx = get_prefered_context(interactive, platform_pref,
                                        device_pref)
        self.queue = cl.CommandQueue(self.ctx)
    else:
        self.ctx = command_queue.context
        self.queue = command_queue
    self.device = self.queue.device
    self.platform = self.device.platform
    self.pool = cl.tools.MemoryPool(cl.tools.ImmediateAllocator(self.queue))

    check_types(self, required_types)

    max_group_size = int(self.device.max_work_group_size)
    max_tile_size = int(np.sqrt(self.device.max_work_group_size))

    self.max_group_size = max_group_size
    self.max_tile_size = max_tile_size
    self.max_threshold = 0
    self.max_num_groups = 0

    self.max_local_memory = int(self.device.local_mem_size)

    # Futhark reserves 4 bytes of local memory for its own purposes.
    self.max_local_memory -= 4

    # See comment in rts/c/opencl.h.
    if self.platform.name.find('NVIDIA CUDA') >= 0:
        self.max_local_memory -= 12
    elif self.platform.name.find('AMD') >= 0:
        self.max_local_memory -= 16

    self.free_list = {}

    # Per-kernel failure flag (-1 means "no failure") plus argument slots.
    self.global_failure = self.pool.allocate(np.int32().itemsize)
    cl.enqueue_fill_buffer(self.queue, self.global_failure, np.int32(-1), 0,
                           np.int32().itemsize)
    self.global_failure_args = self.pool.allocate(
        np.int32().itemsize * (self.global_failure_args_max + 1))
    self.failure_is_an_option = np.int32(0)

    # Pull default-size overrides out of user_sizes so the loop below only
    # sees genuine size entries.  (Was reading an undefined name "sizes".)
    if 'default_group_size' in user_sizes:
        default_group_size = user_sizes['default_group_size']
        del user_sizes['default_group_size']

    if 'default_num_groups' in user_sizes:
        default_num_groups = user_sizes['default_num_groups']
        del user_sizes['default_num_groups']

    if 'default_tile_size' in user_sizes:
        default_tile_size = user_sizes['default_tile_size']
        del user_sizes['default_tile_size']

    if 'default_threshold' in user_sizes:
        default_threshold = user_sizes['default_threshold']
        del user_sizes['default_threshold']

    # Remember whether the caller set these explicitly, so we only warn
    # about clamping when the caller's request is being overridden.
    default_group_size_set = default_group_size is not None
    default_tile_size_set = default_tile_size is not None
    default_sizes = apply_size_heuristics(
        self, size_heuristics, {
            'group_size': default_group_size,
            'tile_size': default_tile_size,
            'num_groups': default_num_groups,
            'lockstep_width': None,
            'threshold': default_threshold
        })
    default_group_size = default_sizes['group_size']
    default_num_groups = default_sizes['num_groups']
    default_threshold = default_sizes['threshold']
    default_tile_size = default_sizes['tile_size']
    lockstep_width = default_sizes['lockstep_width']

    if default_group_size > max_group_size:
        if default_group_size_set:
            # (Was incorrectly formatting max_tile_size into this message.)
            sys.stderr.write(
                'Note: Device limits group size to {} (down from {})\n'.format(
                    max_group_size, default_group_size))
        default_group_size = max_group_size

    if default_tile_size > max_tile_size:
        if default_tile_size_set:
            sys.stderr.write(
                'Note: Device limits tile size to {} (down from {})\n'.format(
                    max_tile_size, default_tile_size))
        default_tile_size = max_tile_size

    for (k, v) in user_sizes.items():
        if k in all_sizes:
            all_sizes[k]['value'] = v
        else:
            raise Exception('Unknown size: {}\nKnown sizes: {}'.format(
                k, ' '.join(all_sizes.keys())))

    # Resolve each size: explicit value (clamped to the device max) or the
    # class-appropriate default.
    self.sizes = {}
    for (k, v) in all_sizes.items():
        if v['class'] == 'group_size':
            max_value = max_group_size
            default_value = default_group_size
        elif v['class'] == 'num_groups':
            max_value = max_group_size  # Intentional!
            default_value = default_num_groups
        elif v['class'] == 'tile_size':
            max_value = max_tile_size
            default_value = default_tile_size
        elif v['class'].startswith('threshold'):
            max_value = None
            default_value = default_threshold
        else:
            # Bespoke sizes have no limit or default.
            max_value = None
            default_value = None
        if v['value'] is None:
            self.sizes[k] = default_value
        elif max_value is not None and v['value'] > max_value:
            # (Fixed the format string, which was missing its closing paren.)
            sys.stderr.write(
                'Note: Device limits {} to {} (down from {})\n'.format(
                    k, max_value, v['value']))
            self.sizes[k] = max_value
        else:
            self.sizes[k] = v['value']

    # XXX: we perform only a subset of z-encoding here.  Really, the
    # compiler should provide us with the variables to which
    # parameters are mapped.
    if (len(program_src) >= 0):  # NOTE: always true; build unconditionally.
        return cl.Program(
            self.ctx, program_src
        ).build(["-DLOCKSTEP_WIDTH={}".format(lockstep_width)] + [
            "-D{}={}".format(
                s.replace('z', 'zz').replace('.', 'zi').replace('#', 'zh'), v)
            for (s, v) in self.sizes.items()
        ])
Example #27
0
def test_source_target_tree(ctx_factory, dims, do_plot=False):
    """Build a source/target tree and verify its invariants.

    Checks that sorted particles are a permutation of the input, that box
    extents nest inside the bounding box, that per-box particle counts are
    consistent with the children, and that every particle lies inside its
    box's extent.
    """
    logging.basicConfig(level=logging.INFO)

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    nsources = 2 * 10**5
    ntargets = 3 * 10**5
    dtype = np.float64

    sources = make_normal_particle_array(queue, nsources, dims, dtype, seed=12)
    targets = make_normal_particle_array(queue, ntargets, dims, dtype, seed=19)

    if do_plot:
        import matplotlib.pyplot as pt
        pt.plot(sources[0].get(), sources[1].get(), "rx")
        pt.plot(targets[0].get(), targets[1].get(), "g+")
        pt.show()

    from boxtree import TreeBuilder
    tb = TreeBuilder(ctx)

    queue.finish()
    tree, _ = tb(queue,
                 sources,
                 targets=targets,
                 max_particles_in_box=10,
                 debug=True)
    tree = tree.get(queue=queue)

    sorted_sources = np.array(list(tree.sources))
    sorted_targets = np.array(list(tree.targets))

    # The sorted particle arrays must be index-permutations of the inputs.
    unsorted_sources = np.array([pi.get() for pi in sources])
    unsorted_targets = np.array([pi.get() for pi in targets])
    assert (sorted_sources == unsorted_sources[:, tree.user_source_ids]).all()

    user_target_ids = np.empty(tree.ntargets, dtype=np.intp)
    user_target_ids[tree.sorted_target_ids] = np.arange(tree.ntargets,
                                                        dtype=np.intp)
    assert (sorted_targets == unsorted_targets[:, user_target_ids]).all()

    all_good_so_far = True

    if do_plot:
        from boxtree.visualization import TreePlotter
        plotter = TreePlotter(tree)
        plotter.draw_tree(fill=False, edgecolor="black", zorder=10)
        plotter.set_bounding_box()

    tol = 1e-15

    for ibox in range(tree.nboxes):
        extent_low, extent_high = tree.get_box_extent(ibox)

        assert (extent_low >=
                tree.bounding_box[0] - 1e-12 * tree.root_extent).all(), ibox
        assert (extent_high <=
                tree.bounding_box[1] + 1e-12 * tree.root_extent).all(), ibox

        src_start = tree.box_source_starts[ibox]
        tgt_start = tree.box_target_starts[ibox]

        box_children = tree.box_child_ids[:, ibox]
        existing_children = box_children[box_children != 0]

        # Nonchild counts plus children's cumulative counts must add up.
        assert (tree.box_source_counts_nonchild[ibox] +
                np.sum(tree.box_source_counts_cumul[existing_children]) ==
                tree.box_source_counts_cumul[ibox])
        assert (tree.box_target_counts_nonchild[ibox] +
                np.sum(tree.box_target_counts_cumul[existing_children]) ==
                tree.box_target_counts_cumul[ibox])

        for what, particles in [
            ("sources", sorted_sources[:, src_start:src_start +
                                       tree.box_source_counts_cumul[ibox]]),
            ("targets", sorted_targets[:, tgt_start:tgt_start +
                                       tree.box_target_counts_cumul[ibox]]),
        ]:
            good = ((particles < extent_high[:, np.newaxis] + tol)
                    &
                    (extent_low[:, np.newaxis] - tol <= particles)).all(axis=0)

            all_good_here = good.all()

            if do_plot and not all_good_here:
                pt.plot(particles[0, np.where(~good)[0]],
                        particles[1, np.where(~good)[0]], "ro")

                plotter.draw_box(ibox, edgecolor="red")
                pt.show()

            # (These were dedented out of this loop, so only the "targets"
            # result per box was recorded; check both sources and targets.)
            if not all_good_here:
                print("BAD BOX %s %d" % (what, ibox))

            all_good_so_far = all_good_so_far and all_good_here

        assert all_good_so_far

    if do_plot:
        pt.gca().set_aspect("equal", "datalim")
        pt.show()
Example #28
0
def main(mesh_name="ellipsoid"):
    """Evaluate a QBX layer potential on a CAD geometry and write VTK output.

    Builds a gmsh surface mesh from a STEP file, binds a double-layer
    operator over it and plots the resulting volume potential and the
    boundary density/normals.

    NOTE(review): relies on module-level names not visible here
    (``target_order``, ``qbx_order``, ``k``, ``mode_nr``, ``thaw``, ``sym``,
    ``bind``, kernel classes, ``FieldPlotter``) -- confirm against the rest
    of the file.
    """
    import logging
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.WARNING)  # INFO for more progress info

    import pyopencl as cl
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)
    actx = PyOpenCLArrayContext(queue, force_device_scalars=True)

    # Pick the CAD file and gmsh characteristic length for the geometry.
    if mesh_name == "ellipsoid":
        cad_file_name = "geometries/ellipsoid.step"
        h = 0.6
    elif mesh_name == "two-cylinders":
        cad_file_name = "geometries/two-cylinders-smooth.step"
        h = 0.4
    else:
        raise ValueError("unknown mesh name: %s" % mesh_name)

    from meshmode.mesh.io import generate_gmsh, FileSource
    mesh = generate_gmsh(
        FileSource(cad_file_name),
        2,
        order=2,
        other_options=["-string",
                       "Mesh.CharacteristicLengthMax = %g;" % h],
        target_unit="MM")

    from meshmode.mesh.processing import perform_flips
    # Flip elements--gmsh generates inside-out geometry.
    mesh = perform_flips(mesh, np.ones(mesh.nelements))

    # Bounding box drives the extent of the volume plot below.
    from meshmode.mesh.processing import find_bounding_box
    bbox_min, bbox_max = find_bounding_box(mesh)
    bbox_center = 0.5 * (bbox_min + bbox_max)
    bbox_size = max(bbox_max - bbox_min) / 2

    logger.info("%d elements" % mesh.nelements)

    from pytential.qbx import QBXLayerPotentialSource
    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import \
            InterpolatoryQuadratureSimplexGroupFactory

    density_discr = Discretization(
        actx, mesh, InterpolatoryQuadratureSimplexGroupFactory(target_order))

    qbx = QBXLayerPotentialSource(density_discr,
                                  4 * target_order,
                                  qbx_order,
                                  fmm_order=qbx_order + 3,
                                  target_association_tolerance=0.15)

    from pytential.target import PointsTarget
    fplot = FieldPlotter(bbox_center, extent=3.5 * bbox_size, npoints=150)

    from pytential import GeometryCollection
    places = GeometryCollection(
        {
            "qbx": qbx,
            "targets": PointsTarget(actx.from_numpy(fplot.points))
        },
        auto_where="qbx")
    density_discr = places.get_discretization("qbx")

    nodes = thaw(density_discr.nodes(), actx)
    angle = actx.np.arctan2(nodes[1], nodes[0])

    # Helmholtz if the (module-level) wavenumber k is truthy, else Laplace.
    if k:
        kernel = HelmholtzKernel(3)
    else:
        kernel = LaplaceKernel(3)

    #op = sym.d_dx(sym.S(kernel, sym.var("sigma"), qbx_forced_limit=None))
    op = sym.D(kernel, sym.var("sigma"), qbx_forced_limit=None)
    #op = sym.S(kernel, sym.var("sigma"), qbx_forced_limit=None)

    # Density: either a few random point impulses (disabled) or a smooth
    # cosine mode of the angular coordinate.
    if 0:
        from random import randrange
        sigma = actx.zeros(density_discr.ndofs, angle.entry_dtype)
        for _ in range(5):
            sigma[randrange(len(sigma))] = 1

        from arraycontext import unflatten
        sigma = unflatten(angle, sigma, actx)
    else:
        sigma = actx.np.cos(mode_nr * angle)

    # Helmholtz evaluation needs a complex density.
    if isinstance(kernel, HelmholtzKernel):
        for i, elem in np.ndenumerate(sigma):
            sigma[i] = elem.astype(np.complex128)

    fld_in_vol = actx.to_numpy(
        bind(places, op, auto_where=("qbx", "targets"))(actx, sigma=sigma,
                                                        k=k))

    #fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5)
    fplot.write_vtk_file("layerpot-3d-potential.vts",
                         [("potential", fld_in_vol)])

    bdry_normals = bind(places, sym.normal(
        density_discr.ambient_dim))(actx).as_vector(dtype=object)

    from meshmode.discretization.visualization import make_visualizer
    bdry_vis = make_visualizer(actx, density_discr, target_order)
    bdry_vis.write_vtk_file("layerpot-3d-density.vtu", [
        ("sigma", sigma),
        ("bdry_normals", bdry_normals),
    ])
Example #29
0
def test_level_restriction(ctx_factory,
                           dims,
                           skip_prune,
                           lookbehind,
                           do_plot=False):
    """Check the 2:1 level restriction of an adaptive tree.

    Builds a level-restricted tree over a surface particle distribution,
    then verifies that every leaf's neighboring leaves (found via an area
    query) differ from it by at most one level.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    nparticles = 10**5
    dtype = np.float64

    from boxtree.tools import make_surface_particle_array
    particles = make_surface_particle_array(queue,
                                            nparticles,
                                            dims,
                                            dtype,
                                            seed=15)

    if do_plot:
        import matplotlib.pyplot as pt
        pt.plot(particles[0].get(), particles[1].get(), "x")

    from boxtree import TreeBuilder
    tb = TreeBuilder(ctx)

    queue.finish()
    tree_dev, _ = tb(
        queue,
        particles,
        kind="adaptive-level-restricted",
        max_particles_in_box=30,
        debug=True,
        skip_prune=skip_prune,
        lr_lookbehind=lookbehind,

        # Artificially low to exercise reallocation code
        nboxes_guess=10)

    def find_neighbors(leaf_box_centers, leaf_box_radii):
        # We use an area query with a ball that is slightly larger than
        # the size of a leaf box to find the neighboring leaves.
        #
        # Note that since this comes from an area query, the self box will be
        # included in the neighbor list.
        from boxtree.area_query import AreaQueryBuilder
        aqb = AreaQueryBuilder(ctx)

        ball_radii = cl.array.to_device(
            queue,
            np.min(leaf_box_radii) / 2 + leaf_box_radii)
        leaf_box_centers = [
            cl.array.to_device(queue, axis) for axis in leaf_box_centers
        ]

        area_query, _ = aqb(queue, tree_dev, leaf_box_centers, ball_radii)
        area_query = area_query.get(queue=queue)
        return (area_query.leaves_near_ball_starts,
                area_query.leaves_near_ball_lists)

    # Get data to host for test.
    tree = tree_dev.get(queue=queue)

    # Find leaf boxes.
    from boxtree import box_flags_enum
    leaf_boxes, = (tree.box_flags & box_flags_enum.HAS_CHILDREN == 0).nonzero()

    # Collect each leaf's center and radius (half the largest extent).
    leaf_box_radii = np.empty(len(leaf_boxes))
    leaf_box_centers = np.empty((dims, len(leaf_boxes)))

    for idx, leaf_box in enumerate(leaf_boxes):
        box_center = tree.box_centers[:, leaf_box]
        ext_l, ext_h = tree.get_box_extent(leaf_box)
        leaf_box_radii[idx] = np.max(ext_h - ext_l) * 0.5
        leaf_box_centers[:, idx] = box_center

    neighbor_starts, neighbor_and_self_lists = find_neighbors(
        leaf_box_centers, leaf_box_radii)

    # Check level restriction.
    for leaf_idx, leaf in enumerate(leaf_boxes):
        # Neighbors of this leaf live in the CSR-style slice of the lists.
        neighbors = neighbor_and_self_lists[
            neighbor_starts[leaf_idx]:neighbor_starts[leaf_idx + 1]]
        neighbor_levels = np.array(tree.box_levels[neighbors], dtype=int)
        leaf_level = int(tree.box_levels[leaf])
        assert (np.abs(neighbor_levels - leaf_level) <= 1).all(), \
                (neighbor_levels, leaf_level)
Example #30
0
def test_wait_for_events(ctx_factory):
    """Enqueue two markers and block until both events complete."""
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    markers = [cl.enqueue_marker(queue) for _ in range(2)]
    cl.wait_for_events(markers)