def potential_assembler(device_interface, space, operator_descriptor, points, parameters):
    """Create an OpenCL evaluator for a potential operator.

    The support elements of ``space`` are split into a main part, whose size
    is a multiple of ``WORKGROUP_SIZE_POTENTIAL``, and a remainder part. The
    main part is processed by the (possibly vectorised) potential kernel,
    which writes into an intermediate buffer that a separate sum kernel
    reduces into the result buffer. The remainder part is processed by a
    non-vectorised kernel that is handed the result buffer directly.

    Parameters
    ----------
    device_interface
        Not used by this function.
    space
        Function space. Its support elements, normal multipliers, grid,
        shapeset and ``map_to_full_grid`` are read here.
    operator_descriptor
        Supplies ``precision``, ``options``, ``kernel_dimension`` and
        ``is_complex``.
    points
        Evaluation points; ``points.shape[1]`` is used as the number of
        points.
    parameters
        ``parameters.quadrature.regular`` selects the triangle Gauss rule.

    Returns
    -------
    callable
        ``evaluator(x)`` that uploads the coefficient vector ``x`` and
        returns a flat array with ``kernel_dimension * npoints`` entries.

    Raises
    ------
    RuntimeError
        If ``bempp.api.POTENTIAL_OPERATOR_DEVICE_TYPE`` is neither ``"gpu"``
        nor ``"cpu"``.
    """
    import bempp.api
    from bempp.api.integration.triangle_gauss import rule
    from bempp.api.utils.helpers import get_type
    from bempp.core.opencl_kernels import (
        default_context,
        default_device,
        get_kernel_from_name,
        get_kernel_from_operator_descriptor,
        get_vector_width,
    )

    for candidate in ("gpu", "cpu"):
        if bempp.api.POTENTIAL_OPERATOR_DEVICE_TYPE == candidate:
            device_type = candidate
            break
    else:
        raise RuntimeError(
            f"Unknown device type {bempp.api.POTENTIAL_OPERATOR_DEVICE_TYPE}")

    mf = _cl.mem_flags
    ctx = default_context(device_type)
    device = default_device(device_type)

    def to_device(host_array):
        """Copy a host array into a new read-only device buffer."""
        return _cl.Buffer(ctx,
                          mf.READ_ONLY | mf.COPY_HOST_PTR,
                          hostbuf=host_array)

    quad_points, quad_weights = rule(parameters.quadrature.regular)

    precision = operator_descriptor.precision
    scalar_types = get_type(precision)
    dtype = scalar_types.real
    kernel_options = operator_descriptor.options
    kernel_dimension = operator_descriptor.kernel_dimension
    is_complex = operator_descriptor.is_complex

    result_type = _np.dtype(scalar_types.complex if is_complex else dtype)

    indices = space.support_elements
    nelements = len(indices)
    vector_width = get_vector_width(precision, device_type=device_type)
    npoints = points.shape[1]

    # Full workgroups go through the main kernel, the rest through the
    # remainder kernel.
    number_of_workgroups, remainder_size = divmod(nelements,
                                                  WORKGROUP_SIZE_POTENTIAL)
    main_size = nelements - remainder_size

    options = {
        "NUMBER_OF_QUAD_POINTS": len(quad_weights),
        "SHAPESET": space.shapeset.identifier,
        "NUMBER_OF_SHAPE_FUNCTIONS": space.number_of_shape_functions,
        "WORKGROUP_SIZE": WORKGROUP_SIZE_POTENTIAL // vector_width,
    }
    if is_complex:
        for flag in ("COMPLEX_KERNEL", "COMPLEX_COEFFICIENTS",
                     "COMPLEX_RESULT"):
            options[flag] = None

    main_kernel = remainder_kernel = sum_kernel = None

    if main_size > 0:
        main_kernel = get_kernel_from_operator_descriptor(
            operator_descriptor,
            options,
            "potential",
            device_type=device_type,
        )
        sum_kernel = get_kernel_from_name(
            "sum_for_potential_novec",
            options,
            precision,
            device_type=device_type,
        )

    if remainder_size > 0:
        # The remainder kernel runs as a single workgroup of exactly
        # ``remainder_size`` items.
        options["WORKGROUP_SIZE"] = remainder_size
        remainder_kernel = get_kernel_from_operator_descriptor(
            operator_descriptor,
            options,
            "potential",
            force_novec=True,
            device_type=device_type,
        )

    indices_buffer = to_device(indices)
    normals_buffer = to_device(space.normal_multipliers)
    points_buffer = to_device(points.ravel(order="F").astype(dtype))
    grid_buffer = to_device(space.grid.as_array.astype(dtype))
    quad_points_buffer = to_device(quad_points.ravel(order="F").astype(dtype))
    quad_weights_buffer = to_device(quad_weights.astype(dtype))

    result_nbytes = result_type.itemsize * kernel_dimension * npoints
    result_buffer = _cl.Buffer(ctx, mf.READ_WRITE, size=result_nbytes)
    coefficients_buffer = _cl.Buffer(
        ctx,
        mf.READ_ONLY,
        size=result_type.itemsize * space.map_to_full_grid.shape[0],
    )

    if main_size > 0:
        # One slot per (kernel component, point, workgroup); reduced by the
        # sum kernel afterwards.
        sum_size = (kernel_dimension * npoints * number_of_workgroups *
                    result_type.itemsize)
        sum_buffer = _cl.Buffer(ctx, mf.READ_WRITE, size=sum_size)

    kernel_options_array = _np.array(kernel_options or [0.0], dtype=dtype)
    kernel_options_buffer = to_device(kernel_options_array)

    # Arguments that both potential kernels receive in the same leading
    # positions.
    shared_args = (
        grid_buffer,
        indices_buffer,
        normals_buffer,
        points_buffer,
        coefficients_buffer,
        quad_points_buffer,
        quad_weights_buffer,
    )

    def evaluator(x):
        """Evaluate the potential for the coefficient vector ``x``."""
        result = _np.empty(kernel_dimension * npoints, dtype=result_type)

        with _cl.CommandQueue(ctx, device=device) as queue:
            _cl.enqueue_copy(queue, coefficients_buffer,
                             x.astype(result_type))
            _cl.enqueue_fill_buffer(queue, result_buffer, _np.uint8(0), 0,
                                    result_nbytes)

            if main_size > 0:
                _cl.enqueue_fill_buffer(queue, sum_buffer, _np.uint8(0), 0,
                                        sum_size)
                queue.finish()
                main_kernel(
                    queue,
                    (npoints, main_size // vector_width),
                    (1, WORKGROUP_SIZE_POTENTIAL // vector_width),
                    *shared_args,
                    sum_buffer,
                    kernel_options_buffer,
                )
                sum_kernel(
                    queue,
                    (kernel_dimension * npoints, ),
                    (1, ),
                    sum_buffer,
                    result_buffer,
                    _np.uint32(number_of_workgroups),
                )

            if remainder_size > 0:
                remainder_kernel(
                    queue,
                    (npoints, remainder_size),
                    (1, remainder_size),
                    *shared_args,
                    result_buffer,
                    kernel_options_buffer,
                    global_offset=(0, main_size),
                )

            _cl.enqueue_copy(queue, result, result_buffer)

        return result

    return evaluator
def singular_assembler(
    device_interface,
    operator_descriptor,
    grid,
    domain,
    dual_to_range,
    test_points,
    trial_points,
    quad_weights,
    test_elements,
    trial_elements,
    test_offsets,
    trial_offsets,
    weights_offsets,
    number_of_quad_points,
    kernel_options,
    result,
):
    """Run the OpenCL kernel for the singular part of an integral operator.

    All inputs are uploaded to read-only device buffers, the "singular"
    kernel is launched with one workgroup of ``WORKGROUP_SIZE_GALERKIN``
    items per entry of ``test_elements``, and the device result is copied
    into ``result`` in place. Nothing is returned.

    Parameters
    ----------
    device_interface
        Not used by this function.
    operator_descriptor
        Supplies ``precision`` and ``is_complex`` and selects the kernel.
    grid
        Grid whose ``as_array`` data is uploaded in the real working type.
    domain, dual_to_range
        Trial and test spaces; their shapeset identifiers, numbers of shape
        functions and normal multipliers are used.
    test_points, trial_points, quad_weights
        Quadrature data, converted to the real working type before upload.
    test_elements, trial_elements
        Element index arrays; ``len(test_elements)`` determines the number
        of workgroups.
    test_offsets, trial_offsets, weights_offsets
        Offset arrays uploaded unchanged.
    number_of_quad_points
        Divided by ``WORKGROUP_SIZE_GALERKIN`` and uploaded as a buffer
        (presumably an array, since the quotient is used as a host buffer).
    kernel_options
        Kernel parameters; ``[0.0]`` is substituted when empty.
    result
        Host array that receives the output; its ``nbytes`` sizes the
        device result buffer.
    """
    from bempp.api.utils.helpers import get_type
    from bempp.core.opencl_kernels import (
        default_context,
        default_device,
        get_kernel_from_operator_descriptor,
    )

    mf = _cl.mem_flags
    ctx = default_context()
    device = default_device()

    def to_device(host_array):
        """Copy a host array into a new read-only device buffer."""
        return _cl.Buffer(ctx,
                          mf.READ_ONLY | mf.COPY_HOST_PTR,
                          hostbuf=host_array)

    precision = operator_descriptor.precision
    dtype = get_type(precision).real

    options = {
        "WORKGROUP_SIZE": WORKGROUP_SIZE_GALERKIN,
        "TEST": dual_to_range.shapeset.identifier,
        "TRIAL": domain.shapeset.identifier,
        "NUMBER_OF_TEST_SHAPE_FUNCTIONS":
        dual_to_range.number_of_shape_functions,
        "NUMBER_OF_TRIAL_SHAPE_FUNCTIONS": domain.number_of_shape_functions,
    }
    if operator_descriptor.is_complex:
        options["COMPLEX_KERNEL"] = None

    kernel = get_kernel_from_operator_descriptor(operator_descriptor, options,
                                                 "singular")

    # Kernel arguments, listed in the order the kernel expects them.
    kernel_args = [
        to_device(grid.as_array.astype(dtype)),
        to_device(dual_to_range.normal_multipliers),
        to_device(domain.normal_multipliers),
        to_device(test_points.astype(dtype)),
        to_device(trial_points.astype(dtype)),
        to_device(quad_weights.astype(dtype)),
        to_device(test_elements),
        to_device(trial_elements),
        to_device(test_offsets),
        to_device(trial_offsets),
        to_device(weights_offsets),
    ]

    local_quad_points = number_of_quad_points // WORKGROUP_SIZE_GALERKIN
    kernel_args.append(to_device(local_quad_points))

    result_buffer = _cl.Buffer(ctx, mf.WRITE_ONLY, size=result.nbytes)
    kernel_args.append(result_buffer)

    kernel_options_array = _np.array(kernel_options or [0.0], dtype=dtype)
    kernel_args.append(to_device(kernel_options_array))

    number_of_singular_indices = len(test_elements)

    with _cl.CommandQueue(ctx, device=device) as queue:
        # g_times_l=True: the global size is given in units of workgroups.
        kernel(
            queue,
            (number_of_singular_indices, ),
            (WORKGROUP_SIZE_GALERKIN, ),
            *kernel_args,
            g_times_l=True,
        )
        _cl.enqueue_copy(queue, result, result_buffer)
def dense_assembler(device_interface, operator_descriptor, domain,
                    dual_to_range, parameters, result):
    """Assemble the regular part of a dense boundary operator with OpenCL.

    Test and trial elements are grouped by colour (as returned by
    ``get_elements_by_color``). For every (test colour, trial colour) pair
    the "regular" kernel is launched over the corresponding index ranges.
    The trial range is split into a part that is a multiple of the vector
    width, handled by the default kernel, and a remainder handled by a
    non-vectorised kernel. The device result buffer is zeroed first and
    copied into ``result`` in place at the end. Nothing is returned.

    Parameters
    ----------
    device_interface
        Not used by this function.
    operator_descriptor
        Supplies ``precision``, ``options`` and ``is_complex`` and selects
        the kernel.
    domain, dual_to_range
        Trial and test spaces.
    parameters
        ``parameters.quadrature.regular`` selects the triangle Gauss rule.
    result
        Host array that receives the output; its ``nbytes`` sizes the
        device result buffer.

    Raises
    ------
    RuntimeError
        If ``bempp.api.BOUNDARY_OPERATOR_DEVICE_TYPE`` is neither ``"gpu"``
        nor ``"cpu"``.
    """
    import bempp.api
    from bempp.api.integration.triangle_gauss import rule
    from bempp.api.utils.helpers import get_type
    from bempp.core.opencl_kernels import get_kernel_from_operator_descriptor
    from bempp.core.opencl_kernels import (
        default_context,
        default_device,
        get_vector_width,
    )

    if bempp.api.BOUNDARY_OPERATOR_DEVICE_TYPE == "gpu":
        device_type = "gpu"
    elif bempp.api.BOUNDARY_OPERATOR_DEVICE_TYPE == "cpu":
        device_type = "cpu"
    else:
        # Report the setting that was actually checked (the message used to
        # print POTENTIAL_OPERATOR_DEVICE_TYPE by mistake).
        raise RuntimeError(
            f"Unknown device type {bempp.api.BOUNDARY_OPERATOR_DEVICE_TYPE}")

    mf = _cl.mem_flags
    ctx = default_context(device_type)
    device = default_device(device_type)

    precision = operator_descriptor.precision
    dtype = get_type(precision).real
    kernel_options = operator_descriptor.options

    quad_points, quad_weights = rule(parameters.quadrature.regular)

    test_indices, test_color_indexptr = dual_to_range.get_elements_by_color()
    trial_indices, trial_color_indexptr = domain.get_elements_by_color()

    number_of_test_colors = len(test_color_indexptr) - 1
    number_of_trial_colors = len(trial_color_indexptr) - 1

    options = {
        "NUMBER_OF_QUAD_POINTS": len(quad_weights),
        "TEST": dual_to_range.shapeset.identifier,
        "TRIAL": domain.shapeset.identifier,
        "TRIAL_NUMBER_OF_ELEMENTS": domain.number_of_support_elements,
        "TEST_NUMBER_OF_ELEMENTS": dual_to_range.number_of_support_elements,
        "NUMBER_OF_TEST_SHAPE_FUNCTIONS":
        dual_to_range.number_of_shape_functions,
        "NUMBER_OF_TRIAL_SHAPE_FUNCTIONS": domain.number_of_shape_functions,
    }

    if operator_descriptor.is_complex:
        options["COMPLEX_KERNEL"] = None

    main_kernel = get_kernel_from_operator_descriptor(operator_descriptor,
                                                      options,
                                                      "regular",
                                                      device_type=device_type)
    remainder_kernel = get_kernel_from_operator_descriptor(
        operator_descriptor,
        options,
        "regular",
        force_novec=True,
        device_type=device_type,
    )

    read_only_copy = mf.READ_ONLY | mf.COPY_HOST_PTR

    test_indices_buffer = _cl.Buffer(ctx, read_only_copy, hostbuf=test_indices)
    trial_indices_buffer = _cl.Buffer(ctx,
                                      read_only_copy,
                                      hostbuf=trial_indices)

    test_normals_buffer = _cl.Buffer(ctx,
                                     read_only_copy,
                                     hostbuf=dual_to_range.normal_multipliers)
    trial_normals_buffer = _cl.Buffer(ctx,
                                      read_only_copy,
                                      hostbuf=domain.normal_multipliers)

    test_grid_buffer = _cl.Buffer(
        ctx,
        read_only_copy,
        hostbuf=dual_to_range.grid.as_array.astype(dtype),
    )
    trial_grid_buffer = _cl.Buffer(ctx,
                                   read_only_copy,
                                   hostbuf=domain.grid.as_array.astype(dtype))

    test_elements_buffer = _cl.Buffer(
        ctx,
        read_only_copy,
        hostbuf=dual_to_range.grid.elements.ravel(order="F"),
    )
    trial_elements_buffer = _cl.Buffer(
        ctx,
        read_only_copy,
        hostbuf=domain.grid.elements.ravel(order="F"),
    )

    test_local2global_buffer = _cl.Buffer(ctx,
                                          read_only_copy,
                                          hostbuf=dual_to_range.local2global)
    trial_local2global_buffer = _cl.Buffer(ctx,
                                           read_only_copy,
                                           hostbuf=domain.local2global)

    test_multipliers_buffer = _cl.Buffer(
        ctx,
        read_only_copy,
        hostbuf=dual_to_range.local_multipliers.astype(dtype),
    )
    trial_multipliers_buffer = _cl.Buffer(
        ctx,
        read_only_copy,
        hostbuf=domain.local_multipliers.astype(dtype),
    )

    quad_points_buffer = _cl.Buffer(
        ctx,
        read_only_copy,
        hostbuf=quad_points.ravel(order="F").astype(dtype),
    )
    quad_weights_buffer = _cl.Buffer(ctx,
                                     read_only_copy,
                                     hostbuf=quad_weights.astype(dtype))

    result_buffer = _cl.Buffer(ctx, mf.READ_WRITE, size=result.nbytes)

    if not kernel_options:
        kernel_options = [0.0]

    kernel_options_array = _np.array(kernel_options, dtype=dtype)
    kernel_options_buffer = _cl.Buffer(ctx,
                                       read_only_copy,
                                       hostbuf=kernel_options_array)

    vector_width = get_vector_width(precision, device_type=device_type)

    # The kernel arguments do not depend on the colour pair, so they are
    # assembled once rather than on every launch.
    kernel_args = [
        test_indices_buffer,
        trial_indices_buffer,
        test_normals_buffer,
        trial_normals_buffer,
        test_grid_buffer,
        trial_grid_buffer,
        test_elements_buffer,
        trial_elements_buffer,
        test_local2global_buffer,
        trial_local2global_buffer,
        test_multipliers_buffer,
        trial_multipliers_buffer,
        quad_points_buffer,
        quad_weights_buffer,
        result_buffer,
        kernel_options_buffer,
        _np.int32(dual_to_range.global_dof_count),
        _np.int32(domain.global_dof_count),
        # Flag telling the kernel whether test and trial grids differ.
        _np.uint8(domain.grid != dual_to_range.grid),
    ]

    def kernel_runner(
        queue,
        test_offset,
        trial_offset,
        test_number_of_indices,
        trial_number_of_indices,
    ):
        """Launch the kernels for one (test colour, trial colour) range."""
        remainder_size = trial_number_of_indices % vector_width
        main_size = trial_number_of_indices - remainder_size

        if main_size > 0:
            main_kernel(
                queue,
                (test_number_of_indices, main_size // vector_width),
                (1, 1),
                *kernel_args,
                global_offset=(test_offset, trial_offset),
            )

        if remainder_size > 0:
            # Trial indices left over after the vectorised part.
            remainder_kernel(
                queue,
                (test_number_of_indices, remainder_size),
                (1, 1),
                *kernel_args,
                global_offset=(test_offset, trial_offset + main_size),
            )

    with _cl.CommandQueue(ctx, device=device) as queue:
        _cl.enqueue_fill_buffer(queue, result_buffer, _np.uint8(0), 0,
                                result.nbytes)
        for test_index in range(number_of_test_colors):
            test_offset = test_color_indexptr[test_index]
            n_test_indices = (test_color_indexptr[1 + test_index] -
                              test_color_indexptr[test_index])
            for trial_index in range(number_of_trial_colors):
                n_trial_indices = (trial_color_indexptr[1 + trial_index] -
                                   trial_color_indexptr[trial_index])
                trial_offset = trial_color_indexptr[trial_index]
                kernel_runner(queue, test_offset, trial_offset,
                              n_test_indices, n_trial_indices)
        _cl.enqueue_copy(queue, result, result_buffer)
def get_local_interaction_evaluator_opencl(grid, local_points,
                                           kernel_function, kernel_parameters,
                                           dtype, result_type):
    """Return an evaluator for the local (near-field) interactions.

    Parameters
    ----------
    grid
        Grid whose ``as_array`` data and ``element_neighbors`` (``indices``
        and ``indexptr``) are uploaded to the device.
    local_points
        Array of local points; ``local_points.shape[1]`` is the number of
        points per element. Uploaded in column-major order without a dtype
        conversion.
    kernel_function
        Name containing ``"laplace"``, ``"modified_helmholtz"`` or
        ``"helmholtz"``; selects the ``near_field_evaluator_*`` kernel.
    kernel_parameters
        Kernel parameters, uploaded as ``float64``; ``[0]`` is substituted
        when empty.
    dtype
        Real type used for the grid data.
    result_type
        Type of coefficients and results; must expose ``itemsize``.

    Returns
    -------
    callable
        ``evaluator(coeffs)`` returning a flat array with
        ``4 * npoints * grid.number_of_elements`` entries.

    Raises
    ------
    ValueError
        If ``kernel_function`` matches none of the known kernel names.
    """
    import pyopencl as _cl
    import bempp.api
    from bempp.core.opencl_kernels import get_kernel_from_name
    from bempp.core.opencl_kernels import default_context, default_device

    # "modified_helmholtz" must be tested before "helmholtz" because the
    # latter is a substring of the former.
    if "laplace" in kernel_function:
        mode = "laplace"
    elif "modified_helmholtz" in kernel_function:
        mode = "modified_helmholtz"
    elif "helmholtz" in kernel_function:
        mode = "helmholtz"
    else:
        raise ValueError("Unknown value for kernel_function.")

    mf = _cl.mem_flags
    ctx = default_context()
    device = default_device()

    npoints = local_points.shape[1]
    ncoeffs = npoints * grid.number_of_elements

    # Largest number of neighbours of any element.
    max_nneighbors = _np.max(_np.diff(grid.element_neighbors.indexptr))

    read_only_copy = mf.READ_ONLY | mf.COPY_HOST_PTR

    grid_buffer = _cl.Buffer(
        ctx,
        read_only_copy,
        hostbuf=grid.as_array.astype(dtype),
    )

    points_buffer = _cl.Buffer(
        ctx,
        read_only_copy,
        hostbuf=local_points.ravel(order="F"),
    )

    neighbor_indices_buffer = _cl.Buffer(
        ctx,
        read_only_copy,
        hostbuf=grid.element_neighbors.indices,
    )

    neighbor_indexptr_buffer = _cl.Buffer(
        ctx,
        read_only_copy,
        hostbuf=grid.element_neighbors.indexptr,
    )

    coefficients_buffer = _cl.Buffer(ctx,
                                     mf.READ_ONLY,
                                     size=result_type.itemsize * ncoeffs)

    # Four result entries are stored per coefficient.
    result_nbytes = 4 * result_type.itemsize * ncoeffs
    result_buffer = _cl.Buffer(ctx, mf.READ_WRITE, size=result_nbytes)

    if len(kernel_parameters) == 0:
        kernel_parameters = [0]

    kernel_parameters_buffer = _cl.Buffer(
        ctx,
        read_only_copy,
        hostbuf=_np.array(kernel_parameters, dtype="float64"),
    )

    options = {"MAX_POINTS": max_nneighbors * npoints, "NPOINTS": npoints}
    if result_type == "complex128":
        options["COMPLEX_KERNEL"] = None

    kernel_name = "near_field_evaluator_" + mode
    kernel = get_kernel_from_name(kernel_name, options)

    def evaluator(coeffs):
        """Actually evaluate the near-field correction."""
        result = _np.empty(4 * ncoeffs, dtype=result_type)
        with bempp.api.Timer(message="Singular Corrections Evaluator"):
            with _cl.CommandQueue(ctx, device=device) as queue:
                _cl.enqueue_copy(queue, coefficients_buffer,
                                 coeffs.astype(result_type))
                # Zero the whole result buffer. Only the first quarter used
                # to be cleared although the full buffer is copied back, so
                # any entry the kernel does not write could hold
                # uninitialised device memory.
                _cl.enqueue_fill_buffer(
                    queue,
                    result_buffer,
                    _np.uint8(0),
                    0,
                    result_nbytes,
                )
                kernel(
                    queue,
                    (grid.number_of_elements, ),
                    (1, ),
                    grid_buffer,
                    neighbor_indices_buffer,
                    neighbor_indexptr_buffer,
                    points_buffer,
                    coefficients_buffer,
                    result_buffer,
                    kernel_parameters_buffer,
                    _np.uint32(grid.number_of_elements),
                )
                _cl.enqueue_copy(queue, result, result_buffer)
        return result

    return evaluator