def match_dtype_to_c_struct(device, name, dtype, context=None):
    """Return a tuple `(dtype, c_decl)` such that the C struct declaration
    in `c_decl` and the structure :class:`numpy.dtype` instance `dtype`
    have the same memory layout.

    Note that *dtype* may be modified from the value that was passed in,
    for example to insert padding.

    (As a remark on implementation, this routine runs a small kernel on
    the given *device* to ensure that :mod:`numpy` and C offsets and sizes
    match.)

    .. versionadded: 2013.1

    This example explains the use of this function::

        >>> import numpy as np
        >>> import pyopencl as cl
        >>> import pyopencl.tools
        >>> ctx = cl.create_some_context()
        >>> dtype = np.dtype([("id", np.uint32), ("value", np.float32)])
        >>> dtype, c_decl = pyopencl.tools.match_dtype_to_c_struct(
        ...     ctx.devices[0], 'id_val', dtype)
        >>> print(c_decl)
        typedef struct {
          unsigned id;
          float value;
        } id_val;
        >>> print(dtype)
        [('id', '<u4'), ('value', '<f4')]
        >>> cl.tools.get_or_register_dtype('id_val', dtype)

    As this example shows, it is important to call
    :func:`get_or_register_dtype` on the modified `dtype` returned by this
    function, not the original one.
    """
    import pyopencl as cl

    # Walk the struct fields in order of increasing numpy offset.
    fields = sorted(
            dtype.fields.items(),
            key=lambda name_dtype_offset: name_dtype_offset[1][1])

    # Build the C member declarations; (non-nested) array fields become
    # C array declarators.
    c_fields = []
    for field_name, dtype_and_offset in fields:
        field_dtype, offset = dtype_and_offset[:2]
        if hasattr(field_dtype, "subdtype") and field_dtype.subdtype is not None:
            array_dtype = field_dtype.subdtype[0]
            if hasattr(array_dtype, "subdtype") \
                    and array_dtype.subdtype is not None:
                raise NotImplementedError(
                        "nested array dtypes are not supported")
            array_dims = field_dtype.subdtype[1]
            dims_str = ""
            try:
                for dim in array_dims:
                    dims_str += "[%d]" % dim
            except TypeError:
                # array_dims is a bare integer rather than a dim tuple
                dims_str = "[%d]" % array_dims
            c_fields.append(" {} {}{};".format(
                dtype_to_ctype(array_dtype), field_name, dims_str))
        else:
            c_fields.append(" {} {};".format(
                dtype_to_ctype(field_dtype), field_name))

    c_decl = "typedef struct {{\n{}\n}} {};\n\n".format(
            "\n".join(c_fields), name)

    # Declare any member struct types before ours.
    cdl = _CDeclList(device)
    for field_name, dtype_and_offset in fields:
        field_dtype, offset = dtype_and_offset[:2]
        cdl.add_dtype(field_dtype)

    pre_decls = cdl.get_declarations()

    offset_code = "\n".join(
            "result[%d] = pycl_offsetof(%s, %s);" % (i + 1, name, field_name)
            for i, (field_name, _) in enumerate(fields))

    # Tiny kernel that reports sizeof()/offsetof() as the device compiler
    # sees them, so the numpy layout can be made to match.
    src = r"""
        #define pycl_offsetof(st, m) \
                 ((uint) ((__local char *) &(dummy.m) \
                 - (__local char *)&dummy ))

        %(pre_decls)s

        %(my_decl)s

        __kernel void get_size_and_offsets(__global uint *result)
        {
            result[0] = sizeof(%(my_type)s);
            __local %(my_type)s dummy;
            %(offset_code)s
        }
    """ % dict(
            pre_decls=pre_decls,
            my_decl=c_decl,
            my_type=name,
            offset_code=offset_code)

    if context is None:
        context = cl.Context([device])

    queue = cl.CommandQueue(context)

    prg = cl.Program(context, src)
    knl = prg.build(devices=[device]).get_size_and_offsets

    import pyopencl.array  # noqa
    result_buf = cl.array.empty(queue, 1 + len(fields), np.uint32)
    knl(queue, (1, ), (1, ), result_buf.data)
    queue.finish()
    size_and_offsets = result_buf.get()

    size = int(size_and_offsets[0])

    offsets = size_and_offsets[1:]
    if any(ofs >= size for ofs in offsets):
        # offsets not plausible
        if dtype.itemsize == size:
            # If sizes match, use numpy's idea of the offsets.
            offsets = [dtype_and_offset[1]
                    for field_name, dtype_and_offset in fields]
        else:
            raise RuntimeError(
                    "OpenCL compiler reported offsetof() past sizeof() "
                    "for struct layout on '%s'. "
                    "This makes no sense, and it usually indicates a "
                    "compiler bug. "
                    "Refusing to discover struct layout." % device)

    result_buf.data.release()
    del knl
    del prg
    del queue
    del context

    try:
        dtype_arg_dict = {
            "names": [field_name
                      for field_name, (field_dtype, offset) in fields],
            "formats": [field_dtype
                        for field_name, (field_dtype, offset) in fields],
            "offsets": [int(x) for x in offsets],
            "itemsize": int(size_and_offsets[0]),
            }
        dtype = np.dtype(dtype_arg_dict)

        if dtype.itemsize != size_and_offsets[0]:
            # "Old" versions of numpy (1.6.x?) silently ignore "itemsize". Boo.
            dtype_arg_dict["names"].append("_pycl_size_fixer")
            dtype_arg_dict["formats"].append(np.uint8)
            dtype_arg_dict["offsets"].append(int(size_and_offsets[0]) - 1)
            dtype = np.dtype(dtype_arg_dict)

    except NotImplementedError:
        # Fallback for numpy versions without dict-based dtype construction:
        # emit explicit void-typed padding fields.
        def calc_field_type():
            total_size = 0
            padding_count = 0
            for offset, (field_name, (field_dtype, _)) in zip(offsets, fields):
                if offset > total_size:
                    padding_count += 1
                    # Fixed: the format operand must be the *difference*;
                    # the previous '"V%d" % offset - total_size' parsed as
                    # ("V%d" % offset) - total_size and raised TypeError.
                    yield ("__pycl_padding%d" % padding_count,
                            "V%d" % (offset - total_size))

                yield field_name, field_dtype
                total_size = field_dtype.itemsize + offset

        dtype = np.dtype(list(calc_field_type()))

    assert dtype.itemsize == size_and_offsets[0]

    return dtype, c_decl
def test_space_invader_query(ctx_factory, dims, dtype, do_plot=False):
    """Cross-check SpaceInvaderQueryBuilder against a reference computation
    of the outer space invader distance built from a leaves-to-balls lookup.
    """
    logging.basicConfig(level=logging.INFO)

    context = ctx_factory()
    queue = cl.CommandQueue(context)
    dtype = np.dtype(dtype)

    num_particles = 10**5
    particles = make_normal_particle_array(queue, num_particles, dims, dtype)

    if do_plot:
        import matplotlib.pyplot as pt
        pt.plot(particles[0].get(), particles[1].get(), "x")

    from boxtree import TreeBuilder
    builder = TreeBuilder(context)

    queue.finish()
    tree, _ = builder(queue, particles, max_particles_in_box=30, debug=True)

    num_balls = 10**4
    ball_centers = make_normal_particle_array(queue, num_balls, dims, dtype)
    ball_radii = cl.array.empty(queue, num_balls, dtype).fill(0.1)

    from boxtree.area_query import (LeavesToBallsLookupBuilder,
            SpaceInvaderQueryBuilder)

    siqb = SpaceInvaderQueryBuilder(context)
    # The leaves-to-balls lookup yields, per box, the set of overlapping
    # balls; from that we compute the reference outer space invader distance.
    lblb = LeavesToBallsLookupBuilder(context)

    siq, _ = siqb(queue, tree, ball_centers, ball_radii)
    lbl, _ = lblb(queue, tree, ball_centers, ball_radii)

    # Pull everything over to the host for checking.
    tree = tree.get(queue=queue)
    siq = siq.get(queue=queue)
    lbl = lbl.get(queue=queue)

    ball_centers = np.array([x.get() for x in ball_centers])
    ball_radii = ball_radii.get()

    from boxtree import box_flags_enum

    expected_dist = np.zeros(tree.nboxes)
    for ibox in range(tree.nboxes):
        # Only leaf boxes participate.
        if tree.box_flags[ibox] & box_flags_enum.HAS_CHILDREN:
            continue

        start, end = lbl.balls_near_box_starts[ibox:ibox + 2]
        invaders = lbl.balls_near_box_lists[start:end]

        if len(invaders) > 0:
            center = tree.box_centers[:, ibox].reshape((-1, 1))
            expected_dist[ibox] = np.abs(
                    center - ball_centers[:, invaders]).max()

    assert np.allclose(siq, expected_dist)
def test_extent_tree(ctx_factory, dims, extent_norm, do_plot=False):
    """Build a tree with per-particle extents (source/target radii) and check
    particle orderings, box counts, the stick-out containment criterion for
    the given *extent_norm* ("linf" or "l2"), and point-source linking.
    """
    logging.basicConfig(level=logging.INFO)

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    nsources = 100000
    ntargets = 200000
    dtype = np.float64
    npoint_sources_per_source = 16

    sources = make_normal_particle_array(queue, nsources, dims, dtype,
            seed=12)
    targets = make_normal_particle_array(queue, ntargets, dims, dtype,
            seed=19)

    # Only sources carry refinement weight; targets get weight 0.
    refine_weights = cl.array.zeros(queue, nsources + ntargets, np.int32)
    refine_weights[:nsources] = 1

    from pyopencl.clrandom import PhiloxGenerator
    rng = PhiloxGenerator(queue.context, seed=13)
    # Radii in (2**-10, 2**0), log-uniformly distributed.
    source_radii = 2**rng.uniform(queue, nsources, dtype=dtype, a=-10, b=0)
    target_radii = 2**rng.uniform(queue, ntargets, dtype=dtype, a=-10, b=0)

    from boxtree import TreeBuilder
    tb = TreeBuilder(ctx)

    queue.finish()
    dev_tree, _ = tb(queue, sources, targets=targets,
            source_radii=source_radii,
            target_radii=target_radii,
            extent_norm=extent_norm,
            refine_weights=refine_weights,
            max_leaf_refine_weight=20,
            #max_particles_in_box=10,

            # Set artificially small, to exercise the reallocation code.
            nboxes_guess=10,
            debug=True,
            stick_out_factor=0)

    logger.info("transfer tree, check orderings")

    tree = dev_tree.get(queue=queue)

    if do_plot:
        import matplotlib.pyplot as pt
        pt.plot(sources[0].get(), sources[1].get(), "rx")
        pt.plot(targets[0].get(), targets[1].get(), "g+")

        from boxtree.visualization import TreePlotter
        plotter = TreePlotter(tree)
        plotter.draw_tree(fill=False, edgecolor="black", zorder=10)
        plotter.draw_box_numbers()
        plotter.set_bounding_box()

        pt.gca().set_aspect("equal", "datalim")
        pt.show()

    sorted_sources = np.array(list(tree.sources))
    sorted_targets = np.array(list(tree.targets))
    sorted_source_radii = tree.source_radii
    sorted_target_radii = tree.target_radii

    unsorted_sources = np.array([pi.get() for pi in sources])
    unsorted_targets = np.array([pi.get() for pi in targets])
    unsorted_source_radii = source_radii.get()
    unsorted_target_radii = target_radii.get()

    # Tree-order data must be a permutation of user-order data.
    assert (sorted_sources
            == unsorted_sources[:, tree.user_source_ids]).all()
    assert (sorted_source_radii
            == unsorted_source_radii[tree.user_source_ids]).all()

    # {{{ test box structure, stick-out criterion

    logger.info("test box structure, stick-out criterion")

    # Invert sorted_target_ids to get the user-order -> tree-order map.
    user_target_ids = np.empty(tree.ntargets, dtype=np.intp)
    user_target_ids[tree.sorted_target_ids] = np.arange(tree.ntargets,
            dtype=np.intp)
    if ntargets:
        assert (sorted_targets
                == unsorted_targets[:, user_target_ids]).all()
        assert (sorted_target_radii
                == unsorted_target_radii[user_target_ids]).all()

    all_good_so_far = True

    # {{{ check sources, targets

    assert np.sum(tree.box_source_counts_nonchild) == nsources
    assert np.sum(tree.box_target_counts_nonchild) == ntargets

    # Cumulative counts must equal own (nonchild) counts plus the
    # cumulative counts of all children.
    for ibox in range(tree.nboxes):
        kid_sum = sum(
                tree.box_target_counts_cumul[ichild_box]
                for ichild_box in tree.box_child_ids[:, ibox]
                if ichild_box != 0)
        assert (tree.box_target_counts_cumul[ibox] == (
                tree.box_target_counts_nonchild[ibox]
                + kid_sum)), ibox

    for ibox in range(tree.nboxes):
        extent_low, extent_high = tree.get_box_extent(ibox)

        # Box extents must stay within the (slightly fuzzed) bounding box.
        assert (extent_low
                >= tree.bounding_box[0] - 1e-12 * tree.root_extent).all(), ibox
        assert (extent_high
                <= tree.bounding_box[1] + 1e-12 * tree.root_extent).all(), ibox

        box_children = tree.box_child_ids[:, ibox]
        existing_children = box_children[box_children != 0]

        assert (tree.box_source_counts_nonchild[ibox]
                + np.sum(tree.box_source_counts_cumul[existing_children])
                == tree.box_source_counts_cumul[ibox])
        assert (tree.box_target_counts_nonchild[ibox]
                + np.sum(tree.box_target_counts_cumul[existing_children])
                == tree.box_target_counts_cumul[ibox])

        del existing_children
        del box_children

    for ibox in range(tree.nboxes):
        lev = int(tree.box_levels[ibox])
        box_radius = 0.5 * tree.root_extent / (1 << lev)
        box_center = tree.box_centers[:, ibox]
        extent_low = box_center - box_radius
        extent_high = box_center + box_radius

        stick_out_dist = tree.stick_out_factor * box_radius
        radius_with_stickout = (1 + tree.stick_out_factor) * box_radius

        for what, starts, counts, points, radii in [
                ("source", tree.box_source_starts,
                    tree.box_source_counts_cumul,
                    sorted_sources, sorted_source_radii),
                ("target", tree.box_target_starts,
                    tree.box_target_counts_cumul,
                    sorted_targets, sorted_target_radii),
                ]:
            bstart = starts[ibox]
            bslice = slice(bstart, bstart + counts[ibox])
            check_particles = points[:, bslice]
            check_radii = radii[bslice]

            if extent_norm == "linf":
                # Each particle's extent box must fit inside the
                # (stick-out-enlarged) box extent.
                good = ((check_particles + check_radii
                        < extent_high[:, np.newaxis] + stick_out_dist)
                        &  # noqa: W504
                        (extent_low[:, np.newaxis] - stick_out_dist
                            <= check_particles - check_radii)).all(axis=0)
            elif extent_norm == "l2":
                # l2 criterion: particle ball must fit inside the
                # stick-out-enlarged circumscribed sphere.
                center_dists = np.sqrt(
                        np.sum(
                            (check_particles - box_center.reshape(-1, 1))**2,
                            axis=0))
                good = ((center_dists + check_radii)**2
                        < dims * radius_with_stickout**2)
            else:
                raise ValueError("unexpected value of extent_norm")

            all_good_here = good.all()
            if not all_good_here:
                print("BAD BOX %s %d level %d"
                        % (what, ibox, tree.box_levels[ibox]))

            all_good_so_far = all_good_so_far and all_good_here
            assert all_good_here

    # }}}

    assert all_good_so_far

    # }}}

    # {{{ create, link point sources

    logger.info("creating point sources")

    np.random.seed(20)

    from pytools.obj_array import make_obj_array
    # Scatter npoint_sources_per_source point sources inside each source's
    # extent ball.
    point_sources = make_obj_array([
        cl.array.to_device(queue,
            unsorted_sources[i][:, np.newaxis]
            + unsorted_source_radii[:, np.newaxis]
            * np.random.uniform(
                -1, 1, size=(nsources, npoint_sources_per_source)))
        for i in range(dims)])

    point_source_starts = cl.array.arange(queue,
            0, (nsources + 1) * npoint_sources_per_source,
            npoint_sources_per_source,
            dtype=tree.particle_id_dtype)

    from boxtree.tree import link_point_sources
    dev_tree = link_point_sources(queue, dev_tree,
            point_source_starts, point_sources,
            debug=True)
} """ import pyopencl as cl from time import time import numpy block_size = 16 ctx = cl.create_some_context() for dev in ctx.devices: assert dev.local_mem_size > 0 queue = cl.CommandQueue( ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) #queue = cl.CommandQueue(ctx) if False: a_height = 4096 #a_height = 1024 a_width = 2048 #a_width = 256 #b_height == a_width b_width = a_height elif False: # like PyCUDA a_height = 2516 a_width = 1472
3, *params, test_case='exact')


@pytest.mark.parametrize("params", [
    # [degree, nel_1d, n_levels, q_order] — presumably; TODO confirm against
    # drive_test_to_meshmode_interpolation's signature
    [2, 5, 4, 4],
    [3, 7, 5, 3],
    [4, 7, 3, 5],
    ])
def test_to_meshmode_interpolation_3d_nonexact(ctx_factory, params):
    """Interpolate a non-exactly-representable function in 3D and require
    the residual to stay below 1e-3."""
    cl_ctx = ctx_factory()
    queue = cl.CommandQueue(cl_ctx)
    assert drive_test_to_meshmode_interpolation(
        cl_ctx, queue, 3, *params, test_case='non-exact') < 1e-3

# }}} End 3d tests


if __name__ == '__main__':
    # Standalone run: a single exact-case interpolation check.
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)
    resid = drive_test_to_meshmode_interpolation(
        cl_ctx, queue, dim=3, degree=9, nel_1d=7,
        n_levels=2, q_order=10, test_case="exact")
def test_proxy_generator(ctx_factory, ndim, factor, visualize=False):
    """Generate proxy point balls for a blocked QBX discretization and check
    that every proxy point lies exactly on its block's proxy sphere.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    qbx = _build_qbx_discr(queue, ndim=ndim)
    srcindices = _build_block_index(qbx.density_discr, factor=factor)

    from pytential.linalg.proxy import ProxyGenerator
    generator = ProxyGenerator(qbx, ratio=1.1)
    proxies, pxyranges, pxycenters, pxyradii = generator(queue, srcindices)

    proxies = np.vstack([p.get() for p in proxies])
    pxyranges = pxyranges.get()
    pxycenters = np.vstack([c.get() for c in pxycenters])
    pxyradii = pxyradii.get()

    for i in range(srcindices.nblocks):
        ipxy = np.s_[pxyranges[i]:pxyranges[i + 1]]

        # Distance from each proxy point to its block center must equal
        # the block's proxy radius.
        r = la.norm(proxies[:, ipxy] - pxycenters[:, i].reshape(-1, 1),
                axis=0)
        assert np.allclose(r - pxyradii[i], 0.0, atol=1.0e-14)

    srcindices = srcindices.get(queue)
    if visualize:
        if qbx.ambient_dim == 2:
            # 2D: scatter plot of nodes, expansion circles, and proxies,
            # one PNG per block.
            import matplotlib.pyplot as pt

            density_nodes = qbx.density_discr.nodes().get(queue)
            ci = bind(qbx, sym.expansion_centers(qbx.ambient_dim, -1))(queue)
            ci = np.vstack([c.get(queue) for c in ci])
            ce = bind(qbx, sym.expansion_centers(qbx.ambient_dim, +1))(queue)
            ce = np.vstack([c.get(queue) for c in ce])
            r = bind(qbx, sym.expansion_radii(qbx.ambient_dim))(queue).get()

            for i in range(srcindices.nblocks):
                isrc = srcindices.block_indices(i)
                ipxy = np.s_[pxyranges[i]:pxyranges[i + 1]]

                pt.figure(figsize=(10, 8))
                axis = pt.gca()
                for j in isrc:
                    c = pt.Circle(ci[:, j], r[j], color='k', alpha=0.1)
                    axis.add_artist(c)
                    c = pt.Circle(ce[:, j], r[j], color='k', alpha=0.1)
                    axis.add_artist(c)

                pt.plot(density_nodes[0], density_nodes[1],
                        'ko', ms=2.0, alpha=0.5)
                pt.plot(density_nodes[0, srcindices.indices],
                        density_nodes[1, srcindices.indices],
                        'o', ms=2.0)
                pt.plot(density_nodes[0, isrc], density_nodes[1, isrc],
                        'o', ms=2.0)
                pt.plot(proxies[0, ipxy], proxies[1, ipxy],
                        'o', ms=2.0)
                pt.xlim([-1.5, 1.5])
                pt.ylim([-1.5, 1.5])

                filename = "test_proxy_generator_{}d_{:04}.png".format(ndim, i)
                pt.savefig(filename, dpi=300)
                pt.clf()
        else:
            # 3D: emit a VTU file per block with a sphere scaled/shifted to
            # the block's proxy ball, merged with the density mesh.
            from meshmode.discretization.visualization import make_visualizer
            from meshmode.mesh.processing import (  # noqa
                    affine_map, merge_disjoint_meshes)
            from meshmode.discretization import Discretization
            from meshmode.discretization.poly_element import \
                    InterpolatoryQuadratureSimplexGroupFactory
            from meshmode.mesh.generation import generate_icosphere

            ref_mesh = generate_icosphere(1, generator.nproxy)

            # NOTE: this does not plot the actual proxy points
            for i in range(srcindices.nblocks):
                mesh = affine_map(ref_mesh,
                        A=(pxyradii[i] * np.eye(ndim)),
                        b=pxycenters[:, i].reshape(-1))

                mesh = merge_disjoint_meshes([mesh, qbx.density_discr.mesh])
                discr = Discretization(ctx, mesh,
                        InterpolatoryQuadratureSimplexGroupFactory(10))

                vis = make_visualizer(queue, discr, 10)
                filename = "test_proxy_generator_{}d_{:04}.vtu".format(ndim, i)
                vis.write_vtk_file(filename, [])
    def __init__(self, coefficients, nb_channel, dtype, chunksize,
            overlapsize):
        """Set up OpenCL context, host arrays, device buffers, and the
        compiled forward/backward SOS filtfilt kernels.

        Only float32 data and a fixed chunksize are supported.
        """
        SosFiltfilt_Base.__init__(self, coefficients, nb_channel, dtype,
                chunksize, overlapsize)

        assert self.dtype == np.dtype('float32')
        assert self.chunksize is not None, 'chunksize for opencl must be fixed'

        self.coefficients = self.coefficients.astype(self.dtype)
        if self.coefficients.ndim == 2:
            # (nb_section, 6) to (nb_channel, nb_section, 6):
            # replicate the same sections for every channel
            self.coefficients = np.tile(self.coefficients[None, :, :],
                    (nb_channel, 1, 1))
        if not self.coefficients.flags['C_CONTIGUOUS']:
            # kernels expect C-contiguous memory
            self.coefficients = self.coefficients.copy()
        assert self.coefficients.shape[0] == self.nb_channel, \
                'wrong coefficients.shape'
        assert self.coefficients.shape[2] == 6, 'wrong coefficients.shape'
        self.nb_section = self.coefficients.shape[1]

        self.ctx = pyopencl.create_some_context()
        #TODO : add arguments gpu_platform_index/gpu_device_index
        #self.devices = [pyopencl.get_platforms()[self.gpu_platform_index].get_devices()[self.gpu_device_index] ]
        #self.ctx = pyopencl.Context(self.devices)
        self.queue = pyopencl.CommandQueue(self.ctx)

        #host arrays
        # zi1/zi2: per-channel, per-section filter state for the forward
        # and backward passes
        self.zi1 = np.zeros((nb_channel, self.nb_section, 2),
                dtype=self.dtype)
        self.zi2 = np.zeros((nb_channel, self.nb_section, 2),
                dtype=self.dtype)
        self.output1 = np.zeros((self.chunksize, self.nb_channel),
                dtype=self.dtype)
        # NOTE(review): self.backward_chunksize is presumably set by
        # SosFiltfilt_Base.__init__ — confirm in the base class
        self.output2 = np.zeros((self.backward_chunksize, self.nb_channel),
                dtype=self.dtype)

        #GPU buffers
        self.coefficients_cl = pyopencl.Buffer(self.ctx,
                mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=self.coefficients)
        self.zi1_cl = pyopencl.Buffer(self.ctx,
                mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=self.zi1)
        self.zi2_cl = pyopencl.Buffer(self.ctx,
                mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=self.zi2)
        self.input1_cl = pyopencl.Buffer(self.ctx, mf.READ_WRITE,
                size=self.output1.nbytes)
        self.output1_cl = pyopencl.Buffer(self.ctx, mf.READ_WRITE,
                size=self.output1.nbytes)
        self.input2_cl = pyopencl.Buffer(self.ctx, mf.READ_WRITE,
                size=self.output2.nbytes)
        self.output2_cl = pyopencl.Buffer(self.ctx, mf.READ_WRITE,
                size=self.output2.nbytes)

        #nb works

        # Substitute compile-time sizes into the kernel source, then build.
        kernel = self.kernel % dict(forward_chunksize=self.chunksize,
                backward_chunksize=self.backward_chunksize,
                nb_section=self.nb_section,
                nb_channel=self.nb_channel)
        prg = pyopencl.Program(self.ctx, kernel)
        self.opencl_prg = prg.build(options='-cl-mad-enable')
import pyopencl as CL
from pyopencl import array
import numpy

# Drop pyopencl's cached first-arg state so kernel rebuilds are clean.
CL.tools.clear_first_arg_caches()

# Context on the first device of the first platform; program is built
# from kernel.cl with the project's CL include path.
c = CL.Context([CL.get_platforms()[0].get_devices()[0]])
with open("kernel.cl", "r") as k_src:
    k = CL.Program(c, k_src.read()).build("-I./src/cl")
q = CL.CommandQueue(c)
flags = CL.mem_flags

# Notes on lambda-term encodings (term -> encoded value):
# 290 = i i
# 1323270 = (k i) k
# 659718 = (k* i) k
# 72218 = Ω
#λ. 1
#(λ. 1) 0
#λ. ((λ. 1) 0)
#0 (λ. 1)
#[(),(),(),()]
#*Main> map (\t -> encode_b $ reverse $ encode_t t) [a,b,c,d]
#[12,178,712,198]
#*Main> map (\t -> encode_b $ reverse $ encode_t (substitute 0 k t)) [a,b,c,d]
#[192,99074,712,98498]
#*Main> map (\t -> encode_b $ reverse $ encode_t t) [a,b,c,d]
    def set_cl(self, targetOpenCL='auto', precisionOpenCL='auto'):
        """Select OpenCL device(s) and precision, then build the kernel.

        *targetOpenCL* may be a (platform, device) pair, a list of such
        pairs/platform indices, a platform index, one of the strings
        'GPU'/'CPU'/'ALL'/'auto', or None (disable OpenCL).
        *precisionOpenCL* is 'float32', 'float64', or 'auto' (float64 when
        every selected device fully supports doubles).

        Populates self.cl_ctx/cl_queue/cl_program (one entry per selected
        device), self.cl_precisionF/C, and self.cl_mf.
        """
        # No-op if nothing changed since the last call.
        if (targetOpenCL == self.lastTargetOpenCL) and\
                (precisionOpenCL == self.lastPrecisionOpenCL):
            return
        self.lastTargetOpenCL = targetOpenCL
        self.lastPrecisionOpenCL = precisionOpenCL
        if not isOpenCL:
            raise EnvironmentError("pyopencl is not available!")
        else:
            if isinstance(targetOpenCL, (tuple, list)):
                iDevice = []
                targetOpenCL = list(targetOpenCL)
                if isinstance(targetOpenCL[0], int):
                    # A single (platform, device) pair.
                    nPlatform, nDevice = targetOpenCL
                    platform = cl.get_platforms()[nPlatform]
                    iDevice.extend([platform.get_devices()[nDevice]])
                else:
                    # A list of targets: each either a (platform, device)
                    # pair or a 1-element [platform] (all its devices).
                    for target in targetOpenCL:
                        if isinstance(target, (tuple, list)):
                            target = list(target)
                            if len(target) > 1:
                                nPlatform, nDevice = target
                                platform = cl.get_platforms()[nPlatform]
                                iDevice.extend(
                                    [platform.get_devices()[nDevice]])
                            else:
                                nPlatform = target[0]
                                platform = cl.get_platforms()[nPlatform]
                                iDevice.extend(platform.get_devices())
            elif isinstance(targetOpenCL, int):
                # Platform index: take all its devices.
                nPlatform = targetOpenCL
                platform = cl.get_platforms()[nPlatform]
                iDevice = platform.get_devices()
            elif isinstance(targetOpenCL, str):
                # Scan all platforms, bucketing usable devices by type.
                iDeviceCPU = []
                iDeviceGPU = []
                iDeviceAcc = []
                iDevice = []
                for platform in cl.get_platforms():
                    CPUdevices = []
                    GPUdevices = []
                    AccDevices = []
                    try:  # at old pyopencl versions:
                        CPUdevices =\
                            platform.get_devices(
                                device_type=cl.device_type.CPU)
                        GPUdevices =\
                            platform.get_devices(
                                device_type=cl.device_type.GPU)
                        AccDevices =\
                            platform.get_devices(
                                device_type=cl.device_type.ACCELERATOR)
                    except cl.RuntimeError:
                        pass
                    if len(CPUdevices) > 0:
                        if len(iDeviceCPU) > 0:
                            # Prefer the vendor's own driver for its CPU.
                            if CPUdevices[0].vendor == \
                                    CPUdevices[0].platform.vendor:
                                try:
                                    tmpctx = cl.Context(devices=CPUdevices)
                                    iDeviceCPU = CPUdevices
                                except:
                                    pass
                        else:
                            try:
                                tmpctx = cl.Context(devices=CPUdevices)
                                iDeviceCPU.extend(CPUdevices)
                            except:
                                pass
                    # Keep only GPUs that advertise double support and for
                    # which a context can actually be created.
                    for GPUDevice in GPUdevices:
                        try:
                            tmpctx = cl.Context(devices=[GPUDevice])
                            if GPUDevice.double_fp_config > 0:
                                iDeviceGPU.extend([GPUDevice])
                        except:
                            pass
                    iDeviceAcc.extend(AccDevices)
                if _DEBUG > 10:
                    print("OpenCL: bulding {0} ...".format(self.cl_filename))
                    print("OpenCL: found {0} CPU{1}".format(
                        len(iDeviceCPU) if len(iDeviceCPU) > 0 else 'none',
                        's' if len(iDeviceCPU) > 1 else ''))
                    print("OpenCL: found {0} GPU{1}".format(
                        len(iDeviceGPU) if len(iDeviceGPU) > 0 else 'none',
                        's' if len(iDeviceGPU) > 1 else ''))
                    print("OpenCL: found {0} other accelerator{1}".format(
                        len(iDeviceAcc) if len(iDeviceAcc) > 0 else 'none',
                        's' if len(iDeviceAcc) > 1 else ''))
                if targetOpenCL.upper().startswith('GPU'):
                    iDevice.extend(iDeviceGPU)
                elif targetOpenCL.upper().startswith('CPU'):
                    iDevice.extend(iDeviceCPU)
                elif targetOpenCL.upper().startswith('ALL'):
                    iDevice.extend(iDeviceGPU)
                    iDevice.extend(iDeviceCPU)
                    iDevice.extend(iDeviceAcc)
                else:  # auto: GPU > accelerator > CPU
                    if len(iDeviceGPU) > 0:
                        iDevice = iDeviceGPU
                    elif len(iDeviceAcc) > 0:
                        iDevice = iDeviceAcc
                    else:
                        iDevice = iDeviceCPU
                    if _DEBUG > 10:
                        for idn, idv in enumerate(iDevice):
                            print("OpenCL: Autoselected device {0}: {1}"
                                  .format(idn, idv.name))
                if len(iDevice) == 0:
                    targetOpenCL = None
            else:  # None
                targetOpenCL = None
        if targetOpenCL is not None:
            # Load kernel source either from the class's .cl file or from
            # an explicitly supplied string.
            if self.kernelsource is None:
                cl_file = os.path.join(os.path.dirname(__file__),
                                       self.cl_filename)
                with open(cl_file, 'r') as f:
                    kernelsource = f.read()
            else:
                kernelsource = self.kernelsource
            if precisionOpenCL == 'auto':
                # float64 only when *every* device reports full double
                # support (double_fp_config bitmask == 63); any other
                # device (or a missing attribute) drops us to float32.
                try:
                    for device in iDevice:
                        if device.double_fp_config == 63:
                            precisionOpenCL = 'float64'
                        else:
                            raise AttributeError
                except AttributeError:
                    precisionOpenCL = 'float32'
            if _DEBUG > 10:
                print('precisionOpenCL = {0}'.format(precisionOpenCL))
            if precisionOpenCL == 'float64':
                self.cl_precisionF = np.float64
                self.cl_precisionC = np.complex128
                # Textual promotion of the kernel source to doubles.
                kernelsource = kernelsource.replace('float', 'double')
            else:
                self.cl_precisionF = np.float32
                self.cl_precisionC = np.complex64
            # One context/queue/program per selected device.
            self.cl_queue = []
            self.cl_ctx = []
            self.cl_program = []
            for device in iDevice:
                cl_ctx = cl.Context(devices=[device])
                self.cl_queue.extend([cl.CommandQueue(cl_ctx, device)])
                self.cl_program.extend([
                    cl.Program(cl_ctx,
                               kernelsource).build(options=["-I " + __dir__])
                    ])
                self.cl_ctx.extend([cl_ctx])
            self.cl_mf = cl.mem_flags
def main():
    """Run the OpenCL particle demo: simulate particles on the GPU and
    render them to an OpenCV window until ESC is pressed."""
    # --- OpenCL setup: first GPU on the first platform ---
    first_platform = cl.get_platforms()[0]
    gpu = first_platform.get_devices(cl.device_type.GPU)[0]
    context = cl.Context([gpu])
    queue = cl.CommandQueue(context, gpu)

    # Device-side particle storage (uninitialized; filled by init kernel).
    particles_buff = cl.Buffer(context, cl.mem_flags.READ_WRITE,
                               size=PARTICLE_STRUCT_SIZE * PARTICLES_NUM,
                               hostbuf=None)

    # Random values in [-0.5, 0.5) feed initial speed and color.
    random.seed()
    rand_values = np.array(
        [random.random() - 0.5 for _ in range(2 * PARTICLES_NUM)],
        dtype=np.float32)
    bufRandVals = cl.Buffer(context,
                            cl.mem_flags.READ_ONLY |
                            cl.mem_flags.COPY_HOST_PTR,
                            hostbuf=rand_values)

    # Square canvas so there is no aspect distortion.
    img = np.zeros([WINDOW_SIZE, WINDOW_SIZE, COLOR_CHANNELS],
                   dtype=np.int32)
    img_buff = cl.Buffer(context, cl.mem_flags.WRITE_ONLY,
                         size=(WINDOW_SIZE * WINDOW_SIZE
                               * COLOR_CHANNELS * COLOR_CHANNEL_SIZE))

    # --- compile the OpenCL program and grab its kernels ---
    compilerSettings = f'-DWINDOW_SIZE={WINDOW_SIZE}'
    program = cl.Program(context,
                         open(KERNEL_PATH).read()).build(compilerSettings)
    init_particles = cl.Kernel(program, 'init_particles')
    update_particles = cl.Kernel(program, 'update_particles')
    clear_canvas = cl.Kernel(program, 'clear_canvas')
    draw_particles = cl.Kernel(program, 'draw_particles')
    saturate = cl.Kernel(program, 'saturate')

    # Initialize particles on the device
    # (https://documen.tician.de/pyopencl/runtime_program.html#pyopencl.enqueue_nd_range_kernel)
    init_particles.set_arg(0, particles_buff)
    init_particles.set_arg(1, bufRandVals)
    cl.enqueue_nd_range_kernel(queue, init_particles, (PARTICLES_NUM, ),
                               None)

    # All particles start at the same spot; run 100 warm-up steps so they
    # spread out before the first frame is drawn.
    for _ in range(100):
        update_particles.set_arg(0, particles_buff)
        cl.enqueue_nd_range_kernel(queue, update_particles,
                                   (PARTICLES_NUM, ), None)

    while True:
        # clear canvas
        clear_canvas.set_arg(0, img_buff)
        cl.enqueue_nd_range_kernel(queue, clear_canvas,
                                   (WINDOW_SIZE, WINDOW_SIZE), None)

        # draw all particles
        draw_particles.set_arg(0, particles_buff)
        draw_particles.set_arg(1, img_buff)
        cl.enqueue_nd_range_kernel(queue, draw_particles,
                                   (PARTICLES_NUM, ), None)

        # saturate
        saturate.set_arg(0, img_buff)
        cl.enqueue_nd_range_kernel(queue, saturate,
                                   (WINDOW_SIZE, WINDOW_SIZE), None)

        # advance the simulation one step
        update_particles.set_arg(0, particles_buff)
        cl.enqueue_nd_range_kernel(queue, update_particles,
                                   (PARTICLES_NUM, ), None)

        # copy the frame back from the GPU and display it
        cl.enqueue_copy(queue, img, img_buff, is_blocking=True)
        cv2.imshow("press ESC to exit", img.astype(np.uint8))

        # exit with ESC
        if cv2.waitKey(10) == 27:
            break
def demo_cost_model():
    """Calibrate the FMM cost model on a series of runs and report the
    prediction error on a held-out run.

    Builds trees/traversals for several problem sizes, times an actual FMM
    on each, fits calibration parameters on all but the last case, and
    compares the model's prediction against the last case's measured
    per-stage process time.
    """
    if not SUPPORTS_PROCESS_TIME:
        raise NotImplementedError(
            "Currently this script uses process time which only works on Python>=3.3"
        )

    from boxtree.pyfmmlib_integration import FMMLibExpansionWrangler

    nsources_list = [1000, 2000, 3000, 4000, 5000]
    ntargets_list = [1000, 2000, 3000, 4000, 5000]
    dims = 3
    dtype = np.float64

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    traversals = []
    traversals_dev = []
    level_to_orders = []
    timing_results = []

    # Fixed expansion order on every level.
    def fmm_level_to_nterms(tree, ilevel):
        return 10

    for nsources, ntargets in zip(nsources_list, ntargets_list):
        # {{{ Generate sources, targets and target_radii

        from boxtree.tools import make_normal_particle_array as p_normal
        sources = p_normal(queue, nsources, dims, dtype, seed=15)
        targets = p_normal(queue, ntargets, dims, dtype, seed=18)

        from pyopencl.clrandom import PhiloxGenerator
        rng = PhiloxGenerator(queue.context, seed=22)
        target_radii = rng.uniform(
            queue, ntargets, a=0, b=0.05, dtype=dtype
        ).get()

        # }}}

        # {{{ Generate tree and traversal

        from boxtree import TreeBuilder
        tb = TreeBuilder(ctx)
        tree, _ = tb(
            queue, sources, targets=targets, target_radii=target_radii,
            stick_out_factor=0.15, max_particles_in_box=30, debug=True
        )

        from boxtree.traversal import FMMTraversalBuilder
        tg = FMMTraversalBuilder(ctx, well_sep_is_n_away=2)
        trav_dev, _ = tg(queue, tree, debug=True)
        trav = trav_dev.get(queue=queue)

        traversals.append(trav)
        traversals_dev.append(trav_dev)

        # }}}

        wrangler = FMMLibExpansionWrangler(trav.tree, 0, fmm_level_to_nterms)
        level_to_orders.append(wrangler.level_nterms)

        # Run an actual FMM to collect per-stage timing data.
        timing_data = {}
        from boxtree.fmm import drive_fmm
        src_weights = np.random.rand(tree.nsources).astype(tree.coord_dtype)
        drive_fmm(trav, wrangler, (src_weights,), timing_data=timing_data)

        timing_results.append(timing_data)

    time_field_name = "process_elapsed"

    from boxtree.cost import FMMCostModel
    from boxtree.cost import make_pde_aware_translation_cost_model
    cost_model = FMMCostModel(make_pde_aware_translation_cost_model)

    # Evaluate the un-calibrated model on every case except the last,
    # which is held out for validation.
    model_results = []
    for icase in range(len(traversals)-1):
        traversal = traversals_dev[icase]
        model_results.append(
            cost_model.cost_per_stage(
                queue, traversal, level_to_orders[icase],
                FMMCostModel.get_unit_calibration_params(),
            )
        )
    queue.finish()

    # Fit calibration parameters from model predictions vs. measurements.
    params = cost_model.estimate_calibration_params(
        model_results, timing_results[:-1], time_field_name=time_field_name
    )

    # Predict the held-out (last) case with the calibrated model.
    predicted_time = cost_model.cost_per_stage(
        queue, traversals_dev[-1], level_to_orders[-1], params,
    )
    queue.finish()

    for field in ["form_multipoles", "eval_direct", "multipole_to_local",
                  "eval_multipoles", "form_locals", "eval_locals",
                  "coarsen_multipoles", "refine_locals"]:
        measured = timing_results[-1][field]["process_elapsed"]
        pred_err = (
            (measured - predicted_time[field]) / measured)
        logger.info("actual/predicted time for %s: %.3g/%.3g -> %g %% error",
                    field, measured, predicted_time[field],
                    abs(100*pred_err))
    return context, devices[0]


# Time the (commented-out) CPU reference loop for comparison.
t1 = time.time()
a = 0
#for i in range(A):
#    a = a + 1
t2 = time.time()
timeWithoutGPU = t2 - t1

a = numpy.zeros((5, ), dtype=numpy.float32)
b = numpy.zeros((5, ), dtype=numpy.float32)
c = numpy.zeros((5, ), dtype=numpy.float32)

context, device = CreateContext()
commandQueue = cl.CommandQueue(context, device)

program = cl.Program(context, kernelStr)
program.build(devices=[device])

mf = cl.mem_flags
# Keep host-side handles; `a`/`b`/`c` are rebound to device buffers below.
a2 = a
b2 = b
c2 = c
a = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=a2)
b = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=b2)
c = cl.Buffer(context, mf.READ_WRITE, c.nbytes)

t3 = time.time()
print("Time taken for setting GPU is " + str(t3 - t2))
for i in range(100):
    program.add(commandQueue, (5, 5, 5), None, a, b, c)
# NOTE(review): cl.enqueue_read_buffer is deprecated in modern pyopencl;
# cl.enqueue_copy is the current API — confirm the installed version.
cl.enqueue_read_buffer(commandQueue, a, a2).wait()
cl.enqueue_read_buffer(commandQueue, b, b2).wait()
def find_mode():
    """Search for a waveguide mode: set up a dielectric boundary-operator
    matrix on a circle and find a propagation constant ``beta`` where the
    matrix becomes singular, via Muller's root-finding method.
    """
    import warnings
    warnings.simplefilter("error", np.ComplexWarning)

    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)

    # Vacuum-relative wavenumbers of the two dielectric domains.
    k0 = 1.4447
    k1 = k0 * 1.02
    beta_sym = sym.var("beta")

    from pytential.symbolic.pde.scalar import (  # noqa
            DielectricSRep2DBoundaryOperator as SRep,
            DielectricSDRep2DBoundaryOperator as SDRep)
    pde_op = SDRep(
            mode="te",
            k_vacuum=1,
            interfaces=((0, 1, sym.DEFAULT_SOURCE),),
            domain_k_exprs=(k0, k1),
            beta=beta_sym,
            use_l2_weighting=False)

    u_sym = pde_op.make_unknown("u")
    op = pde_op.operator(u_sym)

    # {{{ discretization setup

    from meshmode.mesh.generation import ellipse, make_curve_mesh
    curve_f = partial(ellipse, 1)

    target_order = 7
    qbx_order = 4
    nelements = 30

    from meshmode.mesh.processing import affine_map
    mesh = make_curve_mesh(curve_f,
            np.linspace(0, 1, nelements + 1),
            target_order)
    # Scale the unit circle to a radius of 3.4 wavelengths.
    lambda_ = 1.55
    circle_radius = 3.4 * 2 * np.pi / lambda_
    mesh = affine_map(mesh, A=circle_radius * np.eye(2))

    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import \
            InterpolatoryQuadratureSimplexGroupFactory
    from pytential.qbx import QBXLayerPotentialSource
    density_discr = Discretization(
            cl_ctx, mesh,
            InterpolatoryQuadratureSimplexGroupFactory(target_order))

    qbx = QBXLayerPotentialSource(
            density_discr, 4 * target_order, qbx_order,
            # Don't use FMM for now
            fmm_order=False)

    # }}}

    # Random probe vectors: 1/(x^T A^-1 y) blows up where A is singular,
    # which is what Muller's method homes in on.
    x_vec = np.random.randn(len(u_sym) * density_discr.nnodes)
    y_vec = np.random.randn(len(u_sym) * density_discr.nnodes)

    def muller_solve_func(beta):
        from pytential.symbolic.execution import build_matrix
        mat = build_matrix(queue, qbx, op, u_sym,
                context={"beta": beta}).get()

        return 1 / x_vec.dot(la.solve(mat, y_vec))

    starting_guesses = (1 + 0j) * (k0 + (k1 - k0) * np.random.rand(3))

    from pytential.muller import muller
    beta, niter = muller(muller_solve_func, z_start=starting_guesses)
    # NOTE(review): prints the literal string "beta", not the value found —
    # looks like it was meant to be print("beta", beta); confirm intent.
    print("beta")
def __init__(self): self.ctx = cl.create_some_context() self.queue = cl.CommandQueue(self.ctx) self.tick = False
def refine_and_generate_chart_function(mesh, filename, function):
    """Repeatedly refine *mesh* where *function* flags elements, chart the cost.

    Each pass asks ``get_function_flags`` which elements to refine, times the
    refinement, and records (elements refined, seconds).  The chart is saved
    to *filename* (PDF); the final mesh is written out as VTK files.

    Relies on module-level names not visible in this chunk: ``cl``,
    ``Refiner``, ``get_function_flags``, ``remove_if_exists`` and ``order``
    (NOTE(review): ``order`` is read from module scope -- confirm it is
    defined at file level).
    """
    # time.clock() was removed in Python 3.12 (deprecated since 3.3);
    # perf_counter is the recommended timer for measuring durations.
    from time import perf_counter

    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)
    print("NELEMENTS: ", mesh.nelements)

    # Dump the vertex coordinates of the first group for inspection.
    for i in range(len(mesh.groups[0].vertex_indices[0])):
        for k in range(len(mesh.vertices)):
            print(mesh.vertices[k, i])

    r = Refiner(mesh)

    num_elements = []
    time_t = []
    while True:
        print("NELS:", mesh.nelements)
        flags = get_function_flags(mesh, function)
        # Count how many elements are flagged for refinement.
        nels = 0
        for flag in flags:
            if flag:
                nels += 1
        if nels == 0:
            break
        print("LKJASLFKJALKASF:", nels)
        num_elements.append(nels)

        beg = perf_counter()
        mesh = r.refine(flags)
        end = perf_counter()
        time_t.append(end - beg)

    import matplotlib.pyplot as pt
    pt.xlabel('Number of elements being refined')
    pt.ylabel('Time taken')
    pt.plot(num_elements, time_t, "o")
    pt.savefig(filename, format='pdf')
    pt.clf()
    print('DONE REFINING')

    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import \
            PolynomialWarpAndBlendGroupFactory
    discr = Discretization(cl_ctx, mesh,
            PolynomialWarpAndBlendGroupFactory(order))

    from meshmode.discretization.visualization import make_visualizer
    vis = make_visualizer(queue, discr, order)
    remove_if_exists("connectivity2.vtu")
    remove_if_exists("geometry2.vtu")
    vis.write_vtk_file("geometry2.vtu", [
        ("f", discr.nodes()[0]),
        ])

    from meshmode.discretization.visualization import \
            write_nodal_adjacency_vtk_file
    write_nodal_adjacency_vtk_file("connectivity2.vtu", mesh)
def __init__(self):
    """Create the OpenCL context/queue and an empty program cache."""
    # __create_context is defined elsewhere on this class (not in this chunk).
    self.ctx = self.__create_context()
    self.queue = cl.CommandQueue(self.ctx)
    self.programs = {}  # The built programs
def test_partition_points(ctx_factory, use_tree, ndim, visualize=False):
    """Smoke-test block-index construction over a QBX discretization."""
    context = ctx_factory()
    command_queue = cl.CommandQueue(context)

    qbx = _build_qbx_discr(command_queue, ndim=ndim)
    _build_block_index(qbx.density_discr, use_tree=use_tree, factor=0.6)
usage=GL_DYNAMIC_DRAW, target=GL_ARRAY_BUFFER) gl_position.bind() np_color = np.ndarray((num_particles, 4), dtype=np.float32) gl_color = vbo.VBO(data=np_color, usage=GL_DYNAMIC_DRAW, target=GL_ARRAY_BUFFER) gl_color.bind() # Define pyopencl context and queue based on available hardware platform = cl.get_platforms()[0] dev = platform.get_devices(device_type=cl.device_type.GPU) context = cl.Context( properties=[(cl.context_properties.PLATFORM, platform)] + get_gl_sharing_context_properties()) queue = cl.CommandQueue(context) cl_velocity = cl.Buffer(context, mf.COPY_HOST_PTR, hostbuf=np_velocity) cl_zmel = cl.Buffer(context, mf.COPY_HOST_PTR, hostbuf=np_zmel) cl_start_position = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_position) cl_start_velocity = cl.Buffer(context, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=np_velocity) # Buffer object depends on version of PyOpenCL if hasattr(gl_position, 'buffers'): cl_gl_position = cl.GLBuffer(context, mf.READ_WRITE, int(gl_position.buffers[0])) cl_gl_color = cl.GLBuffer(context, mf.READ_WRITE,
def test_interaction_points(ctx_factory, ndim, factor, visualize=False):
    """Check that proxy-neighbor points are disjoint from their own block.

    Builds a block index over a QBX discretization, gathers per-block
    neighbor and interaction points, and asserts that no block's neighbor
    set overlaps its own source indices.  Optional plots/VTK output when
    *visualize* is set.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    qbx = _build_qbx_discr(queue, ndim=ndim)
    srcindices = _build_block_index(qbx.density_discr, factor=factor)

    # generate proxy points
    from pytential.linalg.proxy import ProxyGenerator
    generator = ProxyGenerator(qbx)
    _, _, pxycenters, pxyradii = generator(queue, srcindices)

    from pytential.linalg.proxy import (  # noqa
            gather_block_neighbor_points,
            gather_block_interaction_points)
    nbrindices = gather_block_neighbor_points(qbx.density_discr,
            srcindices, pxycenters, pxyradii)
    nodes, ranges = gather_block_interaction_points(qbx, srcindices)

    # Bring index sets back to the host for the checks below.
    srcindices = srcindices.get(queue)
    nbrindices = nbrindices.get(queue)

    for i in range(srcindices.nblocks):
        isrc = srcindices.block_indices(i)
        inbr = nbrindices.block_indices(i)

        # A block's neighbors must not include its own sources.
        assert not np.any(np.isin(inbr, isrc))

    if visualize:
        if ndim == 2:
            import matplotlib.pyplot as pt
            density_nodes = qbx.density_discr.nodes().get(queue)
            nodes = nodes.get(queue)
            ranges = ranges.get(queue)

            for i in range(srcindices.nblocks):
                isrc = srcindices.block_indices(i)
                inbr = nbrindices.block_indices(i)
                iall = np.s_[ranges[i]:ranges[i + 1]]

                pt.figure(figsize=(10, 8))
                pt.plot(density_nodes[0], density_nodes[1],
                        'ko', ms=2.0, alpha=0.5)
                pt.plot(density_nodes[0, srcindices.indices],
                        density_nodes[1, srcindices.indices], 'o', ms=2.0)
                pt.plot(density_nodes[0, isrc], density_nodes[1, isrc],
                        'o', ms=2.0)
                pt.plot(density_nodes[0, inbr], density_nodes[1, inbr],
                        'o', ms=2.0)
                pt.plot(nodes[0, iall], nodes[1, iall], 'x', ms=2.0)
                pt.xlim([-1.5, 1.5])
                pt.ylim([-1.5, 1.5])

                filename = "test_area_query_{}d_{:04}.png".format(ndim, i)
                pt.savefig(filename, dpi=300)
                pt.clf()
        elif ndim == 3:
            from meshmode.discretization.visualization import make_visualizer
            marker = np.empty(qbx.density_discr.nnodes)

            for i in range(srcindices.nblocks):
                isrc = srcindices.block_indices(i)
                inbr = nbrindices.block_indices(i)

                # NOTE(review): this assignment is redundant after fill(0.0)
                # -- possibly a leftover from a nonzero marker value; confirm.
                marker.fill(0.0)
                marker[srcindices.indices] = 0.0
                marker[isrc] = -42.0
                marker[inbr] = +42.0
                marker_dev = cl.array.to_device(queue, marker)

                vis = make_visualizer(queue, qbx.density_discr, 10)
                filename = "test_area_query_{}d_{:04}.vtu".format(ndim, i)
                vis.write_vtk_file(filename, [
                    ("marker", marker_dev),
                    ])
def main():
    """Measure host<->device bandwidth on a Xilinx OpenCL device.

    Parses ``-k/--kernel`` (xclbin path) and ``-d/--device`` (device index),
    runs the ``bandwidth1``/``bandwidth2`` kernels with growing burst lengths,
    verifies the copied data, and exits non-zero when measured throughput
    falls below a device-dependent threshold.

    Relies on module-level ``cl``, ``np``, ``sys``, ``math``, ``OptionParser``
    and ``current_micro_time`` imports not visible in this chunk.
    """
    platform_ID = None
    xclbin = None
    globalbuffersize = 1024*1024*16    # 16 MB
    typesize = 512                     # bus width in bits
    threshold = 40000                  # MB/s; adjusted per device below
    expected = np.array([[300, 240, 450, 250, 250, 250],        # 32 bits
                         [600, 500, 1000, 500, 500, 500],       # 64 bits
                         [1100, 900, 1500, 1100, 1100, 1100],   # 128 bits
                         [1500, 1500, 1900, 2200, 2200, 2200],  # 256 bits
                         [1900, 2000, 2300, 3800, 3800, 3800]   # 512 bits
                         ])

    # Process cmd line args
    parser = OptionParser()
    parser.add_option("-k", "--kernel", help="xclbin path")
    parser.add_option("-d", "--device", help="device index")
    (options, args) = parser.parse_args()
    xclbin = options.kernel
    index = options.device
    if xclbin is None:
        print("No xclbin specified\nUsage: -k <path to xclbin>")
        sys.exit(1)
    if index is None:
        index = 0  # get default device

    platforms = cl.get_platforms()
    # get Xilinx platform
    for i in platforms:
        if i.name == "Xilinx":
            platform_ID = platforms.index(i)
            print("\nPlatform Information:")
            print("Platform name: %s" % platforms[platform_ID].name)
            print("Platform version: %s" % platforms[platform_ID].version)
            print("Platform profile: %s" % platforms[platform_ID].profile)
            print("Platform extensions: %s" % platforms[platform_ID].extensions)
            break
    if platform_ID is None:
        # make sure xrt is sourced
        # run clinfo to make sure Xilinx platform is discoverable
        print("ERROR: Plaform not found")
        sys.exit(1)

    # choose device
    devices = platforms[platform_ID].get_devices()
    if int(index) > len(devices) - 1:
        print("\nERROR: Index out of range. %d devices were found" % len(devices))
        sys.exit(1)
    else:
        dev = devices[int(index)]
        # Lower the pass threshold for slower shells/boards.
        if "qdma" in str(dev):
            threshold = 30000
        if "U2x4" in str(dev):
            threshold = 15000
        if "gen3x4" in str(dev):
            threshold = 20000

    ctx = cl.Context(devices=[dev])
    if not ctx:
        print("ERROR: Failed to create context")
        sys.exit(1)
    commands = cl.CommandQueue(
        ctx, dev,
        properties=cl.command_queue_properties.OUT_OF_ORDER_EXEC_MODE_ENABLE)
    if not commands:
        print("ERROR: Failed to create command queue")
        sys.exit(1)

    print("Loading xclbin")
    # xclbin is a binary container: read it in binary mode, and close the
    # handle promptly instead of leaking it (the original used a bare
    # text-mode open(...).read()).
    with open(xclbin, "rb") as xclbin_file:
        prg = cl.Program(ctx, [dev], [xclbin_file.read()])
    try:
        prg.build()
    except Exception:
        # Narrowed from a bare "except:"; still reports the build log and
        # re-raises the original error.
        print("ERROR:")
        print(prg.get_build_info(ctx, cl.program_build_info.LOG))
        raise

    knl1 = prg.bandwidth1
    knl2 = prg.bandwidth2

    # input host and buffer
    lst = [i % 256 for i in range(globalbuffersize)]
    input_host1 = np.array(lst).astype(np.uint8)
    input_host2 = np.array(lst).astype(np.uint8)
    input_buf1 = cl.Buffer(ctx,
                           cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=input_host1)
    input_buf2 = cl.Buffer(ctx,
                           cl.mem_flags.READ_WRITE | cl.mem_flags.COPY_HOST_PTR,
                           hostbuf=input_host2)
    if input_buf1.int_ptr is None or input_buf2.int_ptr is None:
        print("ERROR: Failed to allocate source buffer")
        sys.exit(1)

    # output host and buffer
    output_host1 = np.empty_like(input_host1, dtype=np.uint8)
    output_host2 = np.empty_like(input_host2, dtype=np.uint8)
    output_buf1 = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, output_host1.nbytes)
    output_buf2 = cl.Buffer(ctx, cl.mem_flags.READ_WRITE, output_host2.nbytes)
    if output_buf1.int_ptr is None or output_buf2.int_ptr is None:
        print("ERROR: Failed to allocate destination buffer")
        sys.exit(1)

    # copy dataset to OpenCL buffer
    # typesize is in bits, so typesize // 8 is the width in bytes.  Integer
    # division matters here: under Python 3, "/" would produce floats that
    # later break array slicing (slice bounds must be integers).
    globalbuffersizeinbeats = globalbuffersize // (typesize // 8)
    tests = int(math.log(globalbuffersizeinbeats, 2.0)) + 1

    # lists
    dnsduration = []
    dsduration = []
    dbytes = []
    dmbytes = []
    bpersec = []
    mbpersec = []

    # run tests with burst length 1 beat to globalbuffersize
    # double burst length each test
    test = 0
    beats = 16
    throughput = []
    while beats <= 1024:
        print("LOOP PIPELINE %d beats" % beats)

        usduration = 0
        fiveseconds = 5*1000000
        reps = 64
        # Keep doubling reps until a run takes at least five seconds.
        while usduration < fiveseconds:
            start = current_micro_time()
            knl1(commands, (1, ), (1, ), output_buf1, input_buf1,
                 np.uint32(beats), np.uint32(reps))
            knl2(commands, (1, ), (1, ), output_buf2, input_buf2,
                 np.uint32(beats), np.uint32(reps))
            commands.finish()
            end = current_micro_time()

            usduration = end - start

            cl.enqueue_copy(commands, output_host1, output_buf1).wait()
            cl.enqueue_copy(commands, output_host2, output_buf2).wait()

            # limit must be an int: it is used as a slice bound below
            # (the original computed a float via true division).
            limit = beats*(typesize // 8)
            if not np.array_equal(output_host1[:limit], input_host1[:limit]):
                print("ERROR: Failed to copy entries")
                input_buf1.release()
                input_buf2.release()
                output_buf1.release()
                output_buf2.release()
                sys.exit(1)
            if not np.array_equal(output_host2[:limit], input_host2[:limit]):
                print("ERROR: Failed to copy entries")
                input_buf1.release()
                input_buf2.release()
                output_buf1.release()
                output_buf2.release()
                sys.exit(1)

            # print("Reps = %d, Beats = %d, Duration = %lf us"
            #       % (reps, beats, usduration))  # for debug

            if usduration < fiveseconds:
                reps = reps*2

        dnsduration.append(usduration)
        dsduration.append(dnsduration[test]/1000000)
        dbytes.append(reps*beats*(typesize // 8))
        dmbytes.append(dbytes[test]/(1024 * 1024))
        # Factor 2: both kernels move the data (read + write).
        bpersec.append(2*dbytes[test]/dsduration[test])
        mbpersec.append(2*bpersec[test]/(1024 * 1024))
        throughput.append(mbpersec[test])
        print("Test %d, Throughput: %d MB/s" % (test, throughput[test]))
        beats = beats*4
        test += 1

    # cleanup
    input_buf1.release()
    input_buf2.release()
    output_buf1.release()
    output_buf2.release()
    del ctx

    print("TTTT: %d" % throughput[0])
    print("Maximum throughput: %d MB/s" % max(throughput))

    if max(throughput) < threshold:
        print("ERROR: Throughput is less than expected value of %d GB/sec"
              % (threshold/1000))
        sys.exit(1)

    print("PASSED")
def kMerCount(file, nK):
    """Count k-mer frequencies of the sequence in *file* on an OpenCL device.

    Returns the slice ``h_numb_seq[2:4**nK + 2 + 4]`` of the counter array:
    four per-base counters followed by the 4**nK k-mer frequency table.
    Raises AssertionError if the input contains characters other than
    A/C/G/T/N.

    Relies on module-level ``cl``, ``np``, ``genSeq`` and ``CPU_SIDE_INT``
    not visible in this chunk.
    """
    K = nK
    h_seq = genSeq(file)
    # Prepend 2 + 4 + 4**K zeroed counter slots ahead of the sequence:
    # [0] unknown-char count, [1] N count, [2:6] per-base counts, then the
    # 4**K k-mer table (layout per the kernels below).
    h_seq = np.concatenate(
        (np.zeros(2 + 4 + 4**K).astype(CPU_SIDE_INT), h_seq))
    kernelsource = '''
    __kernel void mapToNumb(
        const int N,
        const int M,
        const int numbKmer,
        __global int* seq,
        __global int* numb_seq )
    {
        int gid = get_global_id(0);
        int idx = gid * M + numbKmer + 2 + 4;
        int i, letter;
        if(idx < N*M + numbKmer + 2 + 4)
        {
            for(i=0; i < M; i++)
            {
                letter = seq[idx+i];
                if(letter == 65) { numb_seq[idx+i] = 0; atomic_inc(&numb_seq[2]); }
                else {
                    if(letter == 67) { numb_seq[idx+i] = 1; atomic_inc(&numb_seq[3]); }
                    else {
                        if(letter == 71) { numb_seq[idx+i] = 2; atomic_inc(&numb_seq[4]); }
                        else {
                            if(letter == 84) { numb_seq[idx+i] = 3; atomic_inc(&numb_seq[5]); }
                            else {
                                if(letter == 78) { numb_seq[idx+i] = -1; }
                                else { numb_seq[idx+i] = -2; }
                            }
                        }
                    }
                }
            }
        }
    }
    __kernel void freqTab(
        const int N,
        const int M,
        const int nK,
        const int numbKmer,
        __global int* numb_seq )
    {
        int gid = get_global_id(0);
        int idx = gid * M + numbKmer + 2 + 4;
        int i, numb;
        int k, p, loc_idx, ptn_idx;
        int dgt;
        int kmin;
        for(i=0; i < M; i++)
        {
            ptn_idx = 0;
            loc_idx = idx + i;
            kmin = 0;
            if(loc_idx <= (N*M + numbKmer + 2 + 4 - nK))
            {
                for(k=0; k < nK; k++)
                {
                    numb = numb_seq[loc_idx + k];
                    switch(numb)
                    {
                        case (-1):
                            atomic_inc(&numb_seq[1]);
                            break;
                        case (-2):
                            atomic_inc(&numb_seq[0]);
                            break;
                        default:
                            dgt = (int)(pow(4, (float)(nK-1-k)));
                            ptn_idx += dgt * numb;
                            break;
                    }
                    if(numb < kmin) { kmin = numb; }
                }
                if(kmin >= 0)
                {
                    atomic_inc(&numb_seq[ptn_idx+2+4]);
                }
            }
        }
    }
    '''
    context = cl.create_some_context()
    device = context.devices[0]
    work_group_size = device.max_work_group_size
    work_item_size = device.max_work_item_sizes[0]
    print(work_group_size)
    print(work_item_size)
    numbGroups = work_group_size
    numbItems = work_item_size

    # Pad the sequence with 'N' (78) so it divides evenly among work items.
    seqLen = np.size(h_seq) - 4**K - 2 - 4
    q, r = divmod(seqLen, numbGroups * numbItems)
    q = q + 1
    h_seq = np.concatenate(
        (h_seq,
         np.repeat(78, numbGroups * numbItems - r).astype(CPU_SIDE_INT)))
    h_numb_seq = np.zeros(np.size(h_seq)).astype(CPU_SIDE_INT)
    print(q)
    print(r)

    queue = cl.CommandQueue(context)
    program = cl.Program(context, kernelsource).build()
    mapToNumb = program.mapToNumb
    mapToNumb.set_scalar_arg_dtypes([np.int32, np.int32, np.int32, None, None])
    freqTab = program.freqTab
    freqTab.set_scalar_arg_dtypes(
        [np.int32, np.int32, np.int32, np.int32, None])

    d_seq = cl.Buffer(context,
                      cl.mem_flags.READ_ONLY | cl.mem_flags.COPY_HOST_PTR,
                      hostbuf=h_seq)
    d_numb_seq = cl.Buffer(context, cl.mem_flags.READ_WRITE, h_numb_seq.nbytes)
    # Fixed: the deprecated alias np.int was removed in NumPy 1.24;
    # int32 matches the kernel's C "int" element type.
    cl.enqueue_fill_buffer(queue, d_numb_seq,
                           np.zeros(1).astype(np.int32),
                           0, h_numb_seq.nbytes)

    N = numbGroups * numbItems
    M = q
    numbKmer = 4**K
    globalsize = (N, )
    localsize = (numbItems, )

    mapToNumb(queue, globalsize, None, N, M, numbKmer, d_seq, d_numb_seq)
    queue.finish()
    freqTab(queue, globalsize, None, N, M, K, numbKmer, d_numb_seq)
    queue.finish()
    cl.enqueue_copy(queue, h_numb_seq, d_numb_seq)

    print("Counting Done")
    print(h_numb_seq[:numbKmer + 2 + 4])
    # Sanity check
    assert (h_numb_seq[0] == 0
            ), "File contains unknown nucleotide characters"
    return h_numb_seq[2:numbKmer + 2 + 4]
class Runner:
    """Plasma-effect color-cycling renderer with an optional OpenCL backend.

    Builds a long color lookup table from several matplotlib colormaps, then
    tries to initialize OpenCL (``init_gpu``); on any failure ``use_cl`` stays
    False so callers can fall back to a CPU path.
    """

    def __init__(self, dims):
        """*dims* is a (width, height) pair for the render target."""
        self.width = dims[0]
        self.height = dims[1]
        import numpy as np
        from matplotlib import cm
        from itertools import cycle
        self.np = np
        color_maps = [
            'inferno', 'gnuplot', 'magma', 'viridis', 'plasma', 'cubehelix',
            'gnuplot2', 'ocean', 'terrain', 'CMRmap', 'nipy_spectral'
        ]
        # One 256-entry RGB (uint8) lookup table per colormap.
        # Fixed: the original wrapped a map() call in np.array(); under
        # Python 3 map() is a lazy iterator and np.array() would not expand
        # it into the intended 256x3 array.
        maps = [
            np.array([
                (np.array(cm.get_cmap(x, 256)(i)[:-1]) * 255).astype(np.uint8)
                for i in np.arange(0, 256)
            ])
            for x in color_maps
        ]
        # Hand-picked sequence of maps and their reverses.
        tup = (maps[0], maps[0][::-1], maps[1], maps[1][::-1], maps[2],
               maps[2][::-1], maps[5], maps[5][::-1], maps[3], maps[4][::-1],
               maps[6], maps[7][::-1], maps[8], maps[9][::-1], maps[10],
               maps[10][::-1])
        self.cols = np.concatenate(tup)
        # Mirror the whole table so the cycle wraps smoothly.
        self.cols = np.concatenate((self.cols, self.cols[::-1]))
        self.step = 0
        # Endless supply of denominators for the plasma wave function.
        denoms = np.cos(np.arange(0, 3 * np.pi, 0.01)) + (2 * np.pi)
        self.denom = cycle(denoms)
        self.up = True
        self.half_len = len(self.cols) / 64.
        self.init_gpu()

    def init_gpu(self):
        """Best-effort OpenCL setup; leaves ``use_cl`` False on any failure."""
        self.use_cl = False
        try:
            import pyopencl as cl
            from pyopencl import array
        except Exception:
            # Fixed: "except Exception, e" is Python-2-only syntax (a
            # SyntaxError under Python 3); the bound name was unused anyway.
            import traceback
            traceback.print_exc()
            return
        self.use_cl = True
        self.cl = cl
        # NOTE(review): hard-codes platform index 1 / device index 0 --
        # confirm this matches the target machine.
        self.ctx = cl.Context([cl.get_platforms()[1].get_devices()[0]])
        self.queue = cl.CommandQueue(self.ctx)
        # Copy the color table into a char3 LUT for the kernel.
        self.lut = self.np.empty(len(self.cols), cl.array.vec.char3)
        for idx, i in enumerate(self.cols):
            self.lut[idx][0] = i[0]
            self.lut[idx][1] = i[1]
            self.lut[idx][2] = i[2]
        mf = cl.mem_flags
        self.lut_opencl = cl.Buffer(self.ctx,
                                    mf.READ_ONLY | mf.COPY_HOST_PTR,
                                    hostbuf=self.lut)
        # {0} -> half_len constant, {1} -> 'native_' prefix for fast math.
        self.prg = cl.Program(
            self.ctx, """
            __kernel void plasma(__global uchar4 *img, __constant uchar4 *lut, float const denom, uint step, uint const height, uint const width, uint const colours)
            {{
                const int x = get_global_id(0);
                const int y = get_global_id(1);
                const int index = y * height + x;
                const float half_len = {0};
                const int h = step + half_len + (half_len * {1}sin((float){1}sqrt(pow(x - width / 2.,2)+ pow(y - height / 2.,2)) / denom));
                if( h < colours) {{
                    img[index] = lut[h];
                }} else {{
                    img[index] = lut[colours - h];
                }}
            }}
            """.format(self.half_len, 'native_')).build()
def main(snapshot_pattern="wave-mpi-{step:04d}-{rank:04d}.pkl", restart_step=None,
         use_profiling=False, use_logmgr=False,
         actx_class=PyOpenCLArrayContext):
    """Drive the example."""
    cl_ctx = cl.create_some_context()
    # NOTE(review): this queue is unconditionally replaced in both branches
    # of the use_profiling if/else below, so this first CommandQueue looks
    # redundant -- confirm before removing.
    queue = cl.CommandQueue(cl_ctx)

    from mpi4py import MPI
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    num_parts = comm.Get_size()

    logmgr = initialize_logmgr(use_logmgr,
        filename="wave-mpi.sqlite", mode="wu", mpi_comm=comm)
    if use_profiling:
        queue = cl.CommandQueue(cl_ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)
        actx = actx_class(queue,
            allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)),
            logmgr=logmgr)
    else:
        queue = cl.CommandQueue(cl_ctx)
        actx = actx_class(queue,
            allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)))

    if restart_step is None:
        # Fresh start: generate the mesh on the manager rank and distribute.
        from meshmode.distributed import MPIMeshDistributor, get_partition_by_pymetis
        mesh_dist = MPIMeshDistributor(comm)

        dim = 2
        nel_1d = 16

        if mesh_dist.is_mananger_rank():
            from meshmode.mesh.generation import generate_regular_rect_mesh
            mesh = generate_regular_rect_mesh(
                a=(-0.5,)*dim, b=(0.5,)*dim,
                nelements_per_axis=(nel_1d,)*dim)

            print("%d elements" % mesh.nelements)
            part_per_element = get_partition_by_pymetis(mesh, num_parts)
            local_mesh = mesh_dist.send_mesh_parts(mesh, part_per_element,
                                                   num_parts)
            del mesh
        else:
            local_mesh = mesh_dist.receive_mesh_part()

        fields = None
    else:
        # Restart: read this rank's snapshot for the given step.
        from mirgecom.restart import read_restart_data
        restart_data = read_restart_data(
            actx, snapshot_pattern.format(step=restart_step, rank=rank)
        )
        local_mesh = restart_data["local_mesh"]
        nel_1d = restart_data["nel_1d"]
        assert comm.Get_size() == restart_data["num_parts"]

    order = 3

    discr = EagerDGDiscretization(actx, local_mesh, order=order,
                                  mpi_communicator=comm)

    # CFL-based time step from the local characteristic length scales.
    current_cfl = 0.485
    wave_speed = 1.0
    from grudge.dt_utils import characteristic_lengthscales
    dt = current_cfl * characteristic_lengthscales(actx, discr) / wave_speed
    from grudge.op import nodal_min
    dt = nodal_min(discr, "vol", dt)

    t_final = 1

    if restart_step is None:
        t = 0
        istep = 0

        fields = flat_obj_array(
            bump(actx, discr),
            [discr.zeros(actx) for i in range(discr.dim)]
        )
    else:
        t = restart_data["t"]
        istep = restart_step
        assert istep == restart_step
        restart_fields = restart_data["fields"]
        old_order = restart_data["order"]
        if old_order != order:
            # Project the restart fields onto the (different-order) new
            # discretization via a same-mesh connection.
            old_discr = EagerDGDiscretization(actx, local_mesh,
                                              order=old_order,
                                              mpi_communicator=comm)
            from meshmode.discretization.connection import make_same_mesh_connection
            connection = make_same_mesh_connection(actx,
                                                   discr.discr_from_dd("vol"),
                                                   old_discr.discr_from_dd("vol"))
            fields = connection(restart_fields)
        else:
            fields = restart_fields

    if logmgr:
        logmgr_add_cl_device_info(logmgr, queue)
        logmgr_add_device_memory_usage(logmgr, queue)

        logmgr.add_watches(["step.max", "t_step.max", "t_log.max"])

        try:
            logmgr.add_watches(["memory_usage_python.max",
                                "memory_usage_gpu.max"])
        except KeyError:
            pass

        if use_profiling:
            logmgr.add_watches(["multiply_time.max"])

        vis_timer = IntervalTimer("t_vis", "Time spent visualizing")
        logmgr.add_quantity(vis_timer)

    vis = make_visualizer(discr)

    def rhs(t, w):
        return wave_operator(discr, c=wave_speed, w=w)

    compiled_rhs = actx.compile(rhs)

    while t < t_final:
        if logmgr:
            logmgr.tick_before()

        # restart must happen at beginning of step
        if istep % 100 == 0 and (
                # Do not overwrite the restart file that we just read.
                istep != restart_step):
            from mirgecom.restart import write_restart_file
            write_restart_file(
                actx, restart_data={
                    "local_mesh": local_mesh,
                    "order": order,
                    "fields": fields,
                    "t": t,
                    "step": istep,
                    "nel_1d": nel_1d,
                    "num_parts": num_parts},
                filename=snapshot_pattern.format(step=istep, rank=rank),
                comm=comm
            )

        if istep % 10 == 0:
            print(istep, t, discr.norm(fields[0]))
            vis.write_parallel_vtk_file(
                comm,
                "fld-wave-mpi-%03d-%04d.vtu" % (rank, istep),
                [
                    ("u", fields[0]),
                    ("v", fields[1:]),
                ], overwrite=True
            )

        fields = thaw(freeze(fields, actx), actx)
        fields = rk4_step(fields, t, dt, compiled_rhs)

        t += dt
        istep += 1

        if logmgr:
            set_dt(logmgr, dt)
            logmgr.tick_after()

    final_soln = discr.norm(fields[0])
    # Regression check against a known-good final solution norm.
    assert np.abs(final_soln - 0.04409852463947439) < 1e-14
def main(use_profiling=False):
    """Drive the example."""
    cl_ctx = cl.create_some_context()
    if use_profiling:
        queue = cl.CommandQueue(cl_ctx,
            properties=cl.command_queue_properties.PROFILING_ENABLE)
        actx = PyOpenCLProfilingArrayContext(queue,
            allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)))
    else:
        queue = cl.CommandQueue(cl_ctx)
        actx = PyOpenCLArrayContext(queue,
            allocator=cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue)))

    dim = 2
    nel_1d = 16
    from meshmode.mesh.generation import generate_regular_rect_mesh
    mesh = generate_regular_rect_mesh(a=(-0.5, ) * dim,
                                      b=(0.5, ) * dim,
                                      nelements_per_axis=(nel_1d, ) * dim)

    order = 3

    if dim == 2:
        # no deep meaning here, just a fudge factor
        dt = 0.7 / (nel_1d * order**2)
    elif dim == 3:
        # no deep meaning here, just a fudge factor
        dt = 0.4 / (nel_1d * order**2)
    else:
        raise ValueError("don't have a stable time step guesstimate")

    print("%d elements" % mesh.nelements)

    discr = EagerDGDiscretization(actx, mesh, order=order)

    # Initial condition: a bump in u, zero velocity components.
    fields = flat_obj_array(bump(actx, discr),
                            [discr.zeros(actx) for i in range(discr.dim)])

    vis = make_visualizer(discr)

    def rhs(t, w):
        return wave_operator(discr, c=1, w=w)

    t = 0
    t_final = 3
    istep = 0
    while t < t_final:
        fields = rk4_step(fields, t, dt, rhs)

        if istep % 10 == 0:
            if use_profiling:
                print(actx.tabulate_profiling_data())
            print(istep, t, discr.norm(fields[0], np.inf))
            vis.write_vtk_file("fld-wave-eager-%04d.vtu" % istep, [
                ("u", fields[0]),
                ("v", fields[1:]),
            ])

        t += dt
        istep += 1
def test_to_meshmode_interpolation_3d_nonexact(ctx_factory, params):
    """3-D to-meshmode interpolation error must stay below the tolerance."""
    context = ctx_factory()
    command_queue = cl.CommandQueue(context)

    error = drive_test_to_meshmode_interpolation(
        context, command_queue, 3, *params, test_case='non-exact')
    assert error < 1e-3
def initialise_opencl_object(self,
                             program_src='',
                             command_queue=None,
                             interactive=False,
                             platform_pref=None,
                             device_pref=None,
                             default_group_size=None,
                             default_num_groups=None,
                             default_tile_size=None,
                             default_threshold=None,
                             size_heuristics=[],
                             required_types=[],
                             all_sizes={},
                             user_sizes={}):
    """Set up context, queue, memory pool and tuning sizes; build *program_src*.

    Either adopts *command_queue* (and its context) or creates a fresh
    context/queue, computes device limits, resolves all tuning sizes
    (defaults, heuristics, user overrides), and returns the built
    ``cl.Program``.

    Relies on module-level ``cl``, ``np``, ``sys``, ``get_prefered_context``,
    ``check_types`` and ``apply_size_heuristics`` not visible in this chunk.
    """
    if command_queue is None:
        self.ctx = get_prefered_context(interactive, platform_pref, device_pref)
        self.queue = cl.CommandQueue(self.ctx)
    else:
        self.ctx = command_queue.context
        self.queue = command_queue
    self.device = self.queue.device
    self.platform = self.device.platform
    self.pool = cl.tools.MemoryPool(cl.tools.ImmediateAllocator(self.queue))
    device_type = self.device.type

    check_types(self, required_types)

    max_group_size = int(self.device.max_work_group_size)
    # Tiles are square, so the side length is bounded by sqrt(group size).
    max_tile_size = int(np.sqrt(self.device.max_work_group_size))

    self.max_group_size = max_group_size
    self.max_tile_size = max_tile_size
    self.max_threshold = 0
    self.max_num_groups = 0

    self.max_local_memory = int(self.device.local_mem_size)

    # Futhark reserves 4 bytes of local memory for its own purposes.
    self.max_local_memory -= 4

    # See comment in rts/c/opencl.h.
    if self.platform.name.find('NVIDIA CUDA') >= 0:
        self.max_local_memory -= 12
    elif self.platform.name.find('AMD') >= 0:
        self.max_local_memory -= 16

    self.free_list = {}

    # Device-visible failure flag; -1 means "no failure recorded".
    self.global_failure = self.pool.allocate(np.int32().itemsize)
    cl.enqueue_fill_buffer(self.queue, self.global_failure, np.int32(-1),
                           0, np.int32().itemsize)
    self.global_failure_args = self.pool.allocate(
        np.int32().itemsize * (self.global_failure_args_max + 1))
    self.failure_is_an_option = np.int32(0)

    # NOTE(review): 'sizes' is not defined in this scope; these four blocks
    # look like they were meant to read from 'user_sizes'.  Left unchanged
    # to avoid altering behavior if a module-level 'sizes' exists -- confirm.
    if 'default_group_size' in sizes:
        default_group_size = sizes['default_group_size']
        del sizes['default_group_size']

    if 'default_num_groups' in sizes:
        default_num_groups = sizes['default_num_groups']
        del sizes['default_num_groups']

    if 'default_tile_size' in sizes:
        default_tile_size = sizes['default_tile_size']
        del sizes['default_tile_size']

    if 'default_threshold' in sizes:
        default_threshold = sizes['default_threshold']
        del sizes['default_threshold']

    # Remember whether the caller set these explicitly, so warnings are only
    # printed for user-provided (not heuristic) values.
    default_group_size_set = default_group_size is not None
    default_tile_size_set = default_tile_size is not None
    default_sizes = apply_size_heuristics(
        self, size_heuristics, {
            'group_size': default_group_size,
            'tile_size': default_tile_size,
            'num_groups': default_num_groups,
            'lockstep_width': None,
            'threshold': default_threshold
        })
    default_group_size = default_sizes['group_size']
    default_num_groups = default_sizes['num_groups']
    default_threshold = default_sizes['threshold']
    default_tile_size = default_sizes['tile_size']
    lockstep_width = default_sizes['lockstep_width']

    if default_group_size > max_group_size:
        if default_group_size_set:
            # Fixed: this message previously reported max_tile_size, which
            # is the wrong limit for a group size.
            sys.stderr.write(
                'Note: Device limits group size to {} (down from {})\n'.format(
                    max_group_size, default_group_size))
        default_group_size = max_group_size

    if default_tile_size > max_tile_size:
        if default_tile_size_set:
            sys.stderr.write(
                'Note: Device limits tile size to {} (down from {})\n'.format(
                    max_tile_size, default_tile_size))
        default_tile_size = max_tile_size

    for (k, v) in user_sizes.items():
        if k in all_sizes:
            all_sizes[k]['value'] = v
        else:
            raise Exception('Unknown size: {}\nKnown sizes: {}'.format(
                k, ' '.join(all_sizes.keys())))

    self.sizes = {}
    for (k, v) in all_sizes.items():
        if v['class'] == 'group_size':
            max_value = max_group_size
            default_value = default_group_size
        elif v['class'] == 'num_groups':
            max_value = max_group_size  # Intentional!
            default_value = default_num_groups
        elif v['class'] == 'tile_size':
            max_value = max_tile_size
            default_value = default_tile_size
        elif v['class'].startswith('threshold'):
            max_value = None
            default_value = default_threshold
        else:
            # Bespoke sizes have no limit or default.
            max_value = None
            # NOTE(review): default_value is not reset in this branch; a
            # bespoke size without a user value would reuse the previous
            # iteration's default (or NameError on the first) -- confirm.
        if v['value'] is None:
            self.sizes[k] = default_value
        elif max_value is not None and v['value'] > max_value:
            # Fixed: added the missing ')' in this message.
            sys.stderr.write(
                'Note: Device limits {} to {} (down from {})\n'.format(
                    k, max_value, v['value']))
            self.sizes[k] = max_value
        else:
            self.sizes[k] = v['value']

    # XXX: we perform only a subset of z-encoding here. Really, the
    # compiler should provide us with the variables to which
    # parameters are mapped.
    if (len(program_src) >= 0):  # always true; kept to match generated-code shape
        return cl.Program(
            self.ctx, program_src
        ).build(["-DLOCKSTEP_WIDTH={}".format(lockstep_width)] + [
            "-D{}={}".format(
                s.replace('z', 'zz').replace('.', 'zi').replace('#', 'zh'), v)
            for (s, v) in self.sizes.items()
        ])
def test_source_target_tree(ctx_factory, dims, do_plot=False):
    """Build a source/target tree and verify ordering, counts and extents.

    Checks that (a) sorted particles are a permutation of the input via the
    tree's id arrays, (b) per-box child counts are consistent, and (c) every
    box's particles lie inside its extent (within a tolerance).
    """
    logging.basicConfig(level=logging.INFO)

    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    nsources = 2 * 10**5
    ntargets = 3 * 10**5
    dtype = np.float64

    sources = make_normal_particle_array(queue, nsources, dims, dtype,
                                         seed=12)
    targets = make_normal_particle_array(queue, ntargets, dims, dtype,
                                         seed=19)

    if do_plot:
        import matplotlib.pyplot as pt
        pt.plot(sources[0].get(), sources[1].get(), "rx")
        pt.plot(targets[0].get(), targets[1].get(), "g+")
        pt.show()

    from boxtree import TreeBuilder
    tb = TreeBuilder(ctx)

    queue.finish()
    tree, _ = tb(queue, sources, targets=targets,
                 max_particles_in_box=10, debug=True)
    tree = tree.get(queue=queue)

    sorted_sources = np.array(list(tree.sources))
    sorted_targets = np.array(list(tree.targets))

    unsorted_sources = np.array([pi.get() for pi in sources])
    unsorted_targets = np.array([pi.get() for pi in targets])

    # Sources: tree order indexed by user_source_ids must match the input.
    assert (sorted_sources
            == unsorted_sources[:, tree.user_source_ids]).all()

    # Targets expose the inverse map, so invert sorted_target_ids first.
    user_target_ids = np.empty(tree.ntargets, dtype=np.intp)
    user_target_ids[tree.sorted_target_ids] = np.arange(tree.ntargets,
                                                        dtype=np.intp)
    assert (sorted_targets
            == unsorted_targets[:, user_target_ids]).all()

    all_good_so_far = True

    if do_plot:
        from boxtree.visualization import TreePlotter
        plotter = TreePlotter(tree)
        plotter.draw_tree(fill=False, edgecolor="black", zorder=10)
        plotter.set_bounding_box()

    tol = 1e-15

    for ibox in range(tree.nboxes):
        extent_low, extent_high = tree.get_box_extent(ibox)

        # Every box must sit inside the (slightly padded) root bounding box.
        assert (extent_low >=
                tree.bounding_box[0] - 1e-12 * tree.root_extent).all(), ibox
        assert (extent_high <=
                tree.bounding_box[1] + 1e-12 * tree.root_extent).all(), ibox

        src_start = tree.box_source_starts[ibox]
        tgt_start = tree.box_target_starts[ibox]

        box_children = tree.box_child_ids[:, ibox]
        existing_children = box_children[box_children != 0]

        # Own (non-child) counts plus children's cumulative counts must add
        # up to this box's cumulative count -- for sources and targets.
        assert (tree.box_source_counts_nonchild[ibox]
                + np.sum(tree.box_source_counts_cumul[existing_children])
                == tree.box_source_counts_cumul[ibox])
        assert (tree.box_target_counts_nonchild[ibox]
                + np.sum(tree.box_target_counts_cumul[existing_children])
                == tree.box_target_counts_cumul[ibox])

        for what, particles in [
                ("sources", sorted_sources[:,
                    src_start:src_start + tree.box_source_counts_cumul[ibox]]),
                ("targets", sorted_targets[:,
                    tgt_start:tgt_start + tree.box_target_counts_cumul[ibox]]),
                ]:
            # All particles of the box must lie within its extent (+/- tol).
            good = ((particles < extent_high[:, np.newaxis] + tol)
                    & (extent_low[:, np.newaxis] - tol <= particles)
                    ).all(axis=0)

            all_good_here = good.all()
            if do_plot and not all_good_here:
                pt.plot(
                    particles[0, np.where(~good)[0]],
                    particles[1, np.where(~good)[0]], "ro")

                plotter.draw_box(ibox, edgecolor="red")
                pt.show()

            if not all_good_here:
                print("BAD BOX %s %d" % (what, ibox))

            all_good_so_far = all_good_so_far and all_good_here

    assert all_good_so_far

    if do_plot:
        pt.gca().set_aspect("equal", "datalim")
        pt.show()
def main(mesh_name="ellipsoid"):
    """Evaluate a 3-D layer potential on a CAD-generated surface and plot it.

    Meshes the named CAD geometry with gmsh, builds a QBX layer-potential
    source, evaluates a double-layer potential on a volume plot grid, and
    writes VTK output for both the volume potential and boundary density.
    """
    import logging
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.WARNING)  # INFO for more progress info

    import pyopencl as cl
    cl_ctx = cl.create_some_context()
    queue = cl.CommandQueue(cl_ctx)
    actx = PyOpenCLArrayContext(queue, force_device_scalars=True)

    if mesh_name == "ellipsoid":
        cad_file_name = "geometries/ellipsoid.step"
        h = 0.6
    elif mesh_name == "two-cylinders":
        cad_file_name = "geometries/two-cylinders-smooth.step"
        h = 0.4
    else:
        raise ValueError("unknown mesh name: %s" % mesh_name)

    from meshmode.mesh.io import generate_gmsh, FileSource
    mesh = generate_gmsh(
        FileSource(cad_file_name), 2, order=2,
        other_options=["-string",
                       "Mesh.CharacteristicLengthMax = %g;" % h],
        target_unit="MM")

    from meshmode.mesh.processing import perform_flips
    # Flip elements--gmsh generates inside-out geometry.
    mesh = perform_flips(mesh, np.ones(mesh.nelements))

    from meshmode.mesh.processing import find_bounding_box
    bbox_min, bbox_max = find_bounding_box(mesh)
    bbox_center = 0.5 * (bbox_min + bbox_max)
    bbox_size = max(bbox_max - bbox_min) / 2

    logger.info("%d elements" % mesh.nelements)

    from pytential.qbx import QBXLayerPotentialSource
    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import \
            InterpolatoryQuadratureSimplexGroupFactory

    # NOTE(review): target_order, qbx_order, k and mode_nr are read from
    # module globals not visible in this chunk -- confirm they are defined
    # at file scope.
    density_discr = Discretization(
        actx, mesh,
        InterpolatoryQuadratureSimplexGroupFactory(target_order))

    qbx = QBXLayerPotentialSource(
        density_discr, 4 * target_order, qbx_order,
        fmm_order=qbx_order + 3,
        target_association_tolerance=0.15)

    from pytential.target import PointsTarget
    fplot = FieldPlotter(bbox_center, extent=3.5 * bbox_size, npoints=150)

    from pytential import GeometryCollection
    places = GeometryCollection(
        {
            "qbx": qbx,
            "targets": PointsTarget(actx.from_numpy(fplot.points))
        }, auto_where="qbx")
    density_discr = places.get_discretization("qbx")

    nodes = thaw(density_discr.nodes(), actx)
    angle = actx.np.arctan2(nodes[1], nodes[0])

    if k:
        kernel = HelmholtzKernel(3)
    else:
        kernel = LaplaceKernel(3)

    #op = sym.d_dx(sym.S(kernel, sym.var("sigma"), qbx_forced_limit=None))
    op = sym.D(kernel, sym.var("sigma"), qbx_forced_limit=None)
    #op = sym.S(kernel, sym.var("sigma"), qbx_forced_limit=None)

    if 0:
        # Alternative density: a few randomly placed unit impulses.
        from random import randrange
        sigma = actx.zeros(density_discr.ndofs, angle.entry_dtype)
        for _ in range(5):
            sigma[randrange(len(sigma))] = 1
        from arraycontext import unflatten
        sigma = unflatten(angle, sigma, actx)
    else:
        # Default density: cosine of mode_nr times the azimuthal angle.
        sigma = actx.np.cos(mode_nr * angle)

    if isinstance(kernel, HelmholtzKernel):
        # Helmholtz evaluation needs a complex density.
        for i, elem in np.ndenumerate(sigma):
            sigma[i] = elem.astype(np.complex128)

    fld_in_vol = actx.to_numpy(
        bind(places, op, auto_where=("qbx", "targets"))(
            actx, sigma=sigma, k=k))

    #fplot.show_scalar_in_mayavi(fld_in_vol.real, max_val=5)
    fplot.write_vtk_file("layerpot-3d-potential.vts", [
        ("potential", fld_in_vol)
        ])

    bdry_normals = bind(places,
        sym.normal(density_discr.ambient_dim))(actx).as_vector(dtype=object)

    from meshmode.discretization.visualization import make_visualizer
    bdry_vis = make_visualizer(actx, density_discr, target_order)
    bdry_vis.write_vtk_file("layerpot-3d-density.vtu", [
        ("sigma", sigma),
        ("bdry_normals", bdry_normals),
        ])
def test_level_restriction(ctx_factory, dims, skip_prune, lookbehind,
        do_plot=False):
    """Check that a level-restricted tree build produces a 2:1-balanced tree:
    every leaf's adjacent leaves differ from it by at most one level.

    :arg ctx_factory: pytest/pyopencl fixture returning an OpenCL context.
    :arg dims: spatial dimension of the particle distribution.
    :arg skip_prune: passed through to the tree builder (skip empty-box
        pruning).
    :arg lookbehind: passed through as ``lr_lookbehind`` to the builder.
    :arg do_plot: if true, scatter-plot the particles with matplotlib.

    NOTE(review): relies on module-level ``cl`` (pyopencl, with
    ``cl.array``) and ``np`` imports not visible in this chunk.
    """
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)

    nparticles = 10**5
    dtype = np.float64

    from boxtree.tools import make_surface_particle_array
    # Fixed seed so the test is deterministic across runs.
    particles = make_surface_particle_array(
            queue, nparticles, dims, dtype, seed=15)

    if do_plot:
        import matplotlib.pyplot as pt
        pt.plot(particles[0].get(), particles[1].get(), "x")

    from boxtree import TreeBuilder
    tb = TreeBuilder(ctx)

    queue.finish()
    tree_dev, _ = tb(queue, particles,
                     kind="adaptive-level-restricted",
                     max_particles_in_box=30, debug=True,
                     skip_prune=skip_prune, lr_lookbehind=lookbehind,
                     # Artificially low to exercise reallocation code
                     nboxes_guess=10)

    def find_neighbors(leaf_box_centers, leaf_box_radii):
        # We use an area query with a ball that is slightly larger than
        # the size of a leaf box to find the neighboring leaves.
        #
        # Note that since this comes from an area query, the self box will be
        # included in the neighbor list.
        from boxtree.area_query import AreaQueryBuilder
        aqb = AreaQueryBuilder(ctx)

        # Radius = leaf radius + half the smallest leaf radius, so the ball
        # pokes just past the box boundary into the neighbors.
        ball_radii = cl.array.to_device(
                queue, np.min(leaf_box_radii) / 2 + leaf_box_radii)
        leaf_box_centers = [
                cl.array.to_device(queue, axis)
                for axis in leaf_box_centers]

        area_query, _ = aqb(queue, tree_dev, leaf_box_centers, ball_radii)
        area_query = area_query.get(queue=queue)
        # CSR-style (starts, flat lists) pair of near-leaf indices.
        return (area_query.leaves_near_ball_starts,
                area_query.leaves_near_ball_lists)

    # Get data to host for test.
    tree = tree_dev.get(queue=queue)

    # Find leaf boxes.
    from boxtree import box_flags_enum
    leaf_boxes, = (
            tree.box_flags & box_flags_enum.HAS_CHILDREN == 0).nonzero()

    leaf_box_radii = np.empty(len(leaf_boxes))
    leaf_box_centers = np.empty((dims, len(leaf_boxes)))

    # Collect center and half-extent ("radius") of every leaf box.
    for idx, leaf_box in enumerate(leaf_boxes):
        box_center = tree.box_centers[:, leaf_box]
        ext_l, ext_h = tree.get_box_extent(leaf_box)
        leaf_box_radii[idx] = np.max(ext_h - ext_l) * 0.5
        leaf_box_centers[:, idx] = box_center

    neighbor_starts, neighbor_and_self_lists = find_neighbors(
        leaf_box_centers, leaf_box_radii)

    # Check level restriction: each leaf's neighbors (self included) must be
    # within one level of the leaf itself.
    for leaf_idx, leaf in enumerate(leaf_boxes):
        neighbors = neighbor_and_self_lists[
            neighbor_starts[leaf_idx]:neighbor_starts[leaf_idx + 1]]
        neighbor_levels = np.array(tree.box_levels[neighbors], dtype=int)
        leaf_level = int(tree.box_levels[leaf])
        assert (np.abs(neighbor_levels - leaf_level) <= 1).all(), \
            (neighbor_levels, leaf_level)
def test_wait_for_events(ctx_factory):
    """Smoke test: cl.wait_for_events returns once enqueued markers finish.

    :arg ctx_factory: fixture callable returning an OpenCL context.
    """
    context = ctx_factory()
    cmd_queue = cl.CommandQueue(context)
    # Enqueue two markers and block until both have completed.
    markers = [cl.enqueue_marker(cmd_queue) for _ in range(2)]
    cl.wait_for_events(markers)