Example #1
0
    def __call__(self, queue, tree, wait_for=None):
        """Run the peer list finder kernel on *tree* and wrap its output.

        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg wait_for: either *None* or a list of :class:`pyopencl.Event`
            instances that must complete before this command starts
            executing.
        :returns: a tuple *(pl, event)*, where *pl* is an instance of
            :class:`PeerListLookup`, and *event* is a :class:`pyopencl.Event`
            for dependency management.
        """
        from pytools import div_ceil

        # The level count ends up in the kernel as a stack bound. Rounding
        # it up to the next multiple of ten keeps the number of distinct
        # kernel versions small.
        level_bound = 10 * div_ceil(tree.nlevels, 10)

        finder = self.get_peer_list_finder_kernel(
            tree.dimensions, tree.coord_dtype, tree.box_id_dtype, level_bound)

        plog = ProcessLogger(logger, "find peer lists")

        # Positional kernel arguments, in the order the kernel expects them.
        kernel_args = (
            tree.nboxes,
            tree.box_centers.data, tree.root_extent,
            tree.box_levels.data, tree.aligned_nboxes,
            tree.box_child_ids.data, tree.box_flags.data,
        )
        lists_by_name, evt = finder(queue, *kernel_args, wait_for=wait_for)

        plog.done()

        peers = lists_by_name["peers"]
        lookup = PeerListLookup(
            tree=tree,
            peer_list_starts=peers.starts,
            peer_lists=peers.lists)

        # The returned lookup is detached from the queue.
        return lookup.with_queue(None), evt
Example #2
0
    def __call__(self, queue, tree, wait_for=None):
        """Find peer lists for *tree* by launching the peer list finder kernel.

        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg wait_for: either *None* or a list of :class:`pyopencl.Event`
            instances whose completion this command waits for before it
            starts executing.
        :returns: a tuple *(pl, event)*, where *pl* is an instance of
            :class:`PeerListLookup`, and *event* is a :class:`pyopencl.Event`
            for dependency management.
        """
        from pytools import div_ceil

        # The kernel receives the level count as a stack bound; it is
        # rounded up to a multiple of 10 so that only few distinct kernel
        # versions need to be built.
        stack_bound = div_ceil(tree.nlevels, 10) * 10

        knl = self.get_peer_list_finder_kernel(
            tree.dimensions, tree.coord_dtype, tree.box_id_dtype, stack_bound)

        plog = ProcessLogger(logger, "find peer lists")

        kernel_output, evt = knl(
            queue, tree.nboxes,
            tree.box_centers.data, tree.root_extent,
            tree.box_levels, tree.aligned_nboxes,
            tree.box_child_ids.data, tree.box_flags,
            wait_for=wait_for)

        plog.done()

        peer_data = kernel_output["peers"]
        peer_lookup = PeerListLookup(
            tree=tree,
            peer_list_starts=peer_data.starts,
            peer_lists=peer_data.lists)

        # Hand back a lookup that is not bound to any queue.
        return peer_lookup.with_queue(None), evt
Example #3
0
    def __call__(self,
                 queue,
                 tree,
                 ball_centers,
                 ball_radii,
                 peer_lists=None,
                 wait_for=None):
        """Run the space invader query for a set of balls against *tree*.

        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg ball_centers: an object array of coordinate
            :class:`pyopencl.array.Array` instances.
            Their *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg ball_radii: a
            :class:`pyopencl.array.Array`
            of positive numbers.
            Its *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg peer_lists: may either be *None* or an instance of
            :class:`PeerListLookup` associated with `tree`. If *None*, the
            peer lists are computed here via ``self.peer_list_finder``.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            execution.
        :returns: a tuple *(sqi, event)*, where *sqi* is an instance of
            :class:`pyopencl.array.Array`, and *event* is a :class:`pyopencl.Event`
            for dependency management. The *dtype* of *sqi* is
            *tree*'s :attr:`boxtree.Tree.coord_dtype` and its shape is
            *(tree.nboxes,)* (see :attr:`boxtree.Tree.nboxes`).
            The entries of *sqi* are indexed by the global box index and are
            as follows:

            * if *i* is not the index of a leaf box, *sqi[i] = 0*.
            * if *i* is the index of a leaf box, *sqi[i]* is the
              outer space invader distance for *i*.
        :raises TypeError: if the dtype of *ball_centers* or *ball_radii*
            differs from *tree*'s coordinate dtype.
        :raises ValueError: if the peer list size does not fit *tree*.
        """
        from pytools import div_ceil, single_valued

        coord_dtype = tree.coord_dtype

        if single_valued(bc.dtype for bc in ball_centers) != coord_dtype:
            raise TypeError("ball_centers dtype must match tree.coord_dtype")
        if ball_radii.dtype != coord_dtype:
            raise TypeError("ball_radii dtype must match tree.coord_dtype")

        # Round the level count up to a multiple of 10 so that not too many
        # different kernels get generated.
        level_bound = 10 * div_ceil(tree.nlevels, 10)

        if peer_lists is None:
            # No peer lists supplied: build them, and make everything that
            # follows depend on their construction.
            peer_lists, peer_evt = self.peer_list_finder(
                queue, tree, wait_for=wait_for)
            wait_for = [peer_evt]

        if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
            raise ValueError(
                "size of peer lists must match with number of boxes")

        query_knl = self.get_space_invader_query_kernel(
            tree.dimensions, coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype, level_bound)

        plog = ProcessLogger(logger, "space invader query")

        dists = cl.array.zeros(queue, tree.nboxes, np.float32)

        # The kernel must also wait for the zero-fill of the output array.
        dependencies = (wait_for or []) + dists.events

        kernel_args = SPACE_INVADER_QUERY_TEMPLATE.unwrap_args(
            tree, peer_lists, ball_radii, dists, *ball_centers)

        evt = query_knl(
            *kernel_args,
            wait_for=dependencies,
            queue=queue,
            range=slice(len(ball_radii)))

        if coord_dtype != np.dtype(np.float32):
            # The kernel output is always an array of float32 due to limited
            # support for atomic operations with float64 in OpenCL.
            # Convert the result so that it matches the coordinate dtype.
            dists.finish()
            dists = dists.astype(coord_dtype)
            (evt,) = dists.events

        plog.done()

        return dists, evt
Example #4
0
    def __call__(self,
                 queue,
                 tree,
                 ball_centers,
                 ball_radii,
                 peer_lists=None,
                 wait_for=None):
        """Build a leaves-to-balls lookup by inverting an area query.

        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg ball_centers: an object array of coordinate
            :class:`pyopencl.array.Array` instances.
            Their *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg ball_radii: a
            :class:`pyopencl.array.Array`
            of positive numbers.
            Its *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg peer_lists: may either be *None* or an instance of
            :class:`PeerListLookup` associated with `tree`. It is forwarded
            to the area query builder.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            execution.
        :returns: a tuple *(lbl, event)*, where *lbl* is an instance of
            :class:`LeavesToBallsLookup`, and *event* is a :class:`pyopencl.Event`
            for dependency management.
        :raises TypeError: if the dtype of *ball_centers* or *ball_radii*
            differs from *tree*'s coordinate dtype.
        """

        from pytools import single_valued

        coord_dtype = tree.coord_dtype
        if single_valued(bc.dtype for bc in ball_centers) != coord_dtype:
            raise TypeError("ball_centers dtype must match tree.coord_dtype")
        if ball_radii.dtype != coord_dtype:
            raise TypeError("ball_radii dtype must match tree.coord_dtype")

        plog = ProcessLogger(logger,
                             "leaves-to-balls lookup: run area query")

        area_query, evt = self.area_query_builder(
            queue, tree, ball_centers, ball_radii, peer_lists, wait_for)

        logger.debug("leaves-to-balls lookup: expand starts")

        box_count = tree.nboxes
        nstarts = len(area_query.leaves_near_ball_starts)
        # A "starts" array has one more entry than there are balls.
        assert nstarts == len(ball_radii) + 1

        # The area query is inverted in two steps:
        #
        # 1. The "starts expander kernel" turns the area query result into
        #    (ball number, box number) pairs.
        #
        # 2. A key-value sort orders those pairs by box number.

        expand_starts = self.get_starts_expander_kernel(tree.box_id_dtype)
        ball_numbers = cl.array.empty(
            queue, len(area_query.leaves_near_ball_lists), tree.box_id_dtype)

        # NOTE(review): the area query event is not handed to the expander
        # kernel; presumably this relies on in-order execution of *queue*
        # -- confirm.
        evt = expand_starts(
            ball_numbers,
            area_query.leaves_near_ball_starts.with_queue(queue),
            nstarts)

        logger.debug("leaves-to-balls lookup: key-value sort")

        # Keys are box numbers, values are the expanded ball numbers.
        box_numbers = area_query.leaves_near_ball_lists.with_queue(queue)
        starts, lists, evt = self.key_value_sorter(
            queue,
            box_numbers,
            ball_numbers,
            box_count,
            starts_dtype=tree.box_id_dtype,
            wait_for=[evt])

        plog.done()

        lookup = LeavesToBallsLookup(
            tree=tree,
            balls_near_box_starts=starts,
            balls_near_box_lists=lists)

        return lookup.with_queue(None), evt
Example #5
0
    def __call__(self,
                 queue,
                 tree,
                 ball_centers,
                 ball_radii,
                 peer_lists=None,
                 wait_for=None):
        """Run the area query kernel for a set of balls against *tree*.

        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg ball_centers: an object array of coordinate
            :class:`pyopencl.array.Array` instances.
            Their *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg ball_radii: a
            :class:`pyopencl.array.Array`
            of positive numbers.
            Its *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg peer_lists: may either be *None* or an instance of
            :class:`PeerListLookup` associated with `tree`. If *None*, the
            peer lists are computed here via ``self.peer_list_finder``.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            execution.
        :returns: a tuple *(aq, event)*, where *aq* is an instance of
            :class:`AreaQueryResult`, and *event* is a :class:`pyopencl.Event`
            for dependency management.
        :raises TypeError: if the dtype of *ball_centers* or *ball_radii*
            differs from *tree*'s coordinate dtype.
        :raises ValueError: if the peer list size does not fit *tree*.
        """

        from pytools import div_ceil, single_valued

        coord_dtype = tree.coord_dtype
        if single_valued(bc.dtype for bc in ball_centers) != coord_dtype:
            raise TypeError("ball_centers dtype must match tree.coord_dtype")
        if ball_radii.dtype != coord_dtype:
            raise TypeError("ball_radii dtype must match tree.coord_dtype")

        # NOTE(review): the original author marked this choice of dtype for
        # ball ids with a question mark -- confirm it is the intended one.
        ball_id_dtype = tree.particle_id_dtype

        # Round the level count up to a multiple of 10 so that not too many
        # different kernels get generated.
        level_bound = 10 * div_ceil(tree.nlevels, 10)

        if peer_lists is None:
            # Build the peer lists ourselves and wait on their construction.
            peer_lists, peer_evt = self.peer_list_finder(
                queue, tree, wait_for=wait_for)
            wait_for = [peer_evt]

        if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
            raise ValueError(
                "size of peer lists must match with number of boxes")

        query_knl = self.get_area_query_kernel(
            tree.dimensions, coord_dtype, tree.box_id_dtype,
            ball_id_dtype, peer_lists.peer_list_starts.dtype, level_bound)

        plog = ProcessLogger(logger, "area query")

        # The trailing positional arguments are the entries of
        # tree.bounding_box[0] followed by one array per ball center
        # coordinate.
        lists_by_name, evt = query_knl(
            queue,
            len(ball_radii),
            tree.box_centers.data,
            tree.root_extent,
            tree.box_levels,
            tree.aligned_nboxes,
            tree.box_child_ids.data,
            tree.box_flags,
            peer_lists.peer_list_starts,
            peer_lists.peer_lists,
            ball_radii,
            *tree.bounding_box[0],
            *ball_centers,
            wait_for=wait_for)

        plog.done()

        leaves = lists_by_name["leaves"]
        query_result = AreaQueryResult(
            tree=tree,
            leaves_near_ball_starts=leaves.starts,
            leaves_near_ball_lists=leaves.lists)

        return query_result.with_queue(None), evt
Example #6
0
def generate_code_v2(kernel):
    """Generate code for *kernel*, consulting the code generation cache.

    The kernel is preprocessed first if it is still in its initial state and
    scheduled if it has no schedule yet. If caching is enabled, a cached
    result for the (preprocessed, scheduled) kernel is returned when one
    exists; otherwise the freshly generated result is stored in the cache.

    :arg kernel: the kernel to generate code for.
    :returns: a :class:`CodeGenerationResult`
    :raises LoopyError: if the kernel is not in the linearized state after
        preprocessing and scheduling.
    :raises ValueError: if a kernel argument is neither an array argument
        nor a :class:`loopy.kernel.data.ValueArg`.
    """

    from loopy.kernel import KernelState
    if kernel.state == KernelState.INITIAL:
        from loopy.preprocess import preprocess_kernel
        kernel = preprocess_kernel(kernel)

    if kernel.schedule is None:
        from loopy.schedule import get_one_scheduled_kernel
        kernel = get_one_scheduled_kernel(kernel)

    if kernel.state != KernelState.LINEARIZED:
        raise LoopyError("cannot generate code for a kernel that has not been "
                         "scheduled")

    # {{{ cache retrieval

    from loopy import CACHING_ENABLED

    if CACHING_ENABLED:
        # The cache key is the kernel *before* type inference; the same
        # object is used again when storing the result below.
        input_kernel = kernel
        try:
            result = code_gen_cache[input_kernel]
        except KeyError:
            pass
        else:
            logger.debug("%s: code generation cache hit", kernel.name)
            return result

    # }}}

    from loopy.type_inference import infer_unknown_types
    kernel = infer_unknown_types(kernel, expect_completion=True)

    from loopy.check import pre_codegen_checks
    pre_codegen_checks(kernel)

    codegen_plog = ProcessLogger(logger, f"{kernel.name}: generate code")

    # {{{ examine arg list

    from loopy.kernel.data import ValueArg
    from loopy.kernel.array import ArrayBase

    implemented_data_info = []

    # Loop-invariant: query the set of written variables once rather than
    # once per argument.
    written_variables = kernel.get_written_variables()

    for arg in kernel.args:
        is_written = arg.name in written_variables
        if isinstance(arg, ArrayBase):
            implemented_data_info.extend(
                arg.decl_info(kernel.target,
                              is_written=is_written,
                              index_dtype=kernel.index_dtype))

        elif isinstance(arg, ValueArg):
            implemented_data_info.append(
                ImplementedDataInfo(target=kernel.target,
                                    name=arg.name,
                                    dtype=arg.dtype,
                                    arg_class=ValueArg,
                                    is_written=is_written))

        else:
            raise ValueError(f"argument type not understood: '{type(arg)}'")

    # Complex arithmetic is allowed as soon as any argument or temporary
    # has a dtype that involves complex numbers.
    allow_complex = any(
        var.dtype.involves_complex()
        for var in kernel.args + list(kernel.temporary_variables.values()))

    # }}}

    # These sets are handed to the code generation state and are read again
    # below when the preambles are assembled, so they must stay the very
    # same objects.
    seen_dtypes = set()
    seen_functions = set()
    seen_atomic_dtypes = set()

    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)

    from loopy.codegen.tools import CodegenOperationCacheManager

    codegen_state = CodeGenerationState(
        kernel=kernel,
        implemented_data_info=implemented_data_info,
        implemented_domain=initial_implemented_domain,
        implemented_predicates=frozenset(),
        seen_dtypes=seen_dtypes,
        seen_functions=seen_functions,
        seen_atomic_dtypes=seen_atomic_dtypes,
        var_subst_map={},
        allow_complex=allow_complex,
        var_name_generator=kernel.get_var_name_generator(),
        is_generating_device_code=False,
        gen_program_name=(kernel.target.host_program_name_prefix +
                          kernel.name +
                          kernel.target.host_program_name_suffix),
        schedule_index_end=len(kernel.schedule),
        codegen_cachemanager=CodegenOperationCacheManager.from_kernel(kernel),
    )

    from loopy.codegen.result import generate_host_or_device_program
    codegen_result = generate_host_or_device_program(codegen_state,
                                                     schedule_index=0)

    device_code_str = codegen_result.device_code()

    # NOTE(review): because this check sits inside an ``assert``, it is
    # skipped entirely when Python runs with -O.
    from loopy.check import check_implemented_domains
    assert check_implemented_domains(kernel,
                                     codegen_result.implemented_domains,
                                     device_code_str)

    # {{{ handle preambles

    seen_dtypes.update(
        idi.dtype for idi in codegen_state.implemented_data_info)

    for tv in kernel.temporary_variables.values():
        seen_dtypes.update(
            idi.dtype
            for idi in tv.decl_info(kernel.target,
                                    index_dtype=kernel.index_dtype))

    if kernel.all_inames():
        seen_dtypes.add(kernel.index_dtype)

    # Work on a copy so that the kernel's own preamble list is not modified.
    preambles = kernel.preambles[:]

    preamble_info = PreambleInfo(
        kernel=kernel,
        seen_dtypes=seen_dtypes,
        seen_functions=seen_functions,
        # a set of LoopyTypes (!)
        seen_atomic_dtypes=seen_atomic_dtypes,
        codegen_state=codegen_state)

    preamble_generators = (
        kernel.preamble_generators +
        kernel.target.get_device_ast_builder().preamble_generators())
    for prea_gen in preamble_generators:
        preambles.extend(prea_gen(preamble_info))

    codegen_result = codegen_result.copy(device_preambles=preambles)

    # }}}

    # For faster unpickling in the common case when implemented_domains isn't needed.
    from loopy.tools import LazilyUnpicklingDict
    codegen_result = codegen_result.copy(
        implemented_domains=LazilyUnpicklingDict(
            codegen_result.implemented_domains))

    codegen_plog.done()

    if CACHING_ENABLED:
        code_gen_cache.store_if_not_present(input_kernel, codegen_result)

    return codegen_result
Example #7
0
    def get_stored_ids_and_unscaled_projection_matrix(self):
        """Determine which Taylor coefficients are stored and how the
        remaining ones are recovered from them using the PDE.

        :returns: a tuple *(stored_identifiers, op)*. *stored_identifiers*
            is the list of multi-indices whose coefficients are stored.
            *op* is a :class:`CSEMatVecOperator` with one row per full
            coefficient identifier: the row of a stored multi-index copies
            the matching input coefficient, while the row of a multi-index
            that is not stored is a combination of other output
            coefficients obtained from the PDE.

            If the PDE involves a multi-index that is not among the full
            coefficient identifiers, every coefficient is stored and *op*
            is the identity.
        """
        from pytools import ProcessLogger
        plog = ProcessLogger(logger, "compute PDE for Taylor coefficients")

        mis = self.get_full_coefficient_identifiers()
        coeff_ident_enumerate_dict = {
            tuple(mi): i for i, mi in enumerate(mis)}

        diff_op = self.get_pde_as_diff_op()
        assert len(diff_op.eqs) == 1
        pde_dict = {k.mi: v for k, v in diff_op.eqs[0].items()}

        if any(ident not in coeff_ident_enumerate_dict for ident in pde_dict):
            # Order of the expansion is less than the order of the PDE.
            # In that case, the compression matrix is the identity matrix
            # and there's nothing to project
            nmis = len(mis)
            op = CSEMatVecOperator(
                [[(i, 1)] for i in range(nmis)],
                [[] for _ in range(nmis)],
                (nmis, nmis))
            # Finish the process logger on this path as well.
            plog.done()
            return mis, op

        # Calculate the multi-index that appears last in the PDE in
        # reverse degree lexicographic order (degrevlex).
        max_mi_idx = max(coeff_ident_enumerate_dict[ident]
                         for ident in pde_dict)
        max_mi = mis[max_mi_idx]
        max_mi_coeff = pde_dict[max_mi]
        max_mi_mult = -1 / sym.sympify(max_mi_coeff)

        def is_stored(mi):
            """
            A multi_index mi is not stored if mi >= max_mi
            """
            return any(mi[d] < max_mi[d] for d in range(self.dim))

        stored_identifiers = []

        from_input_coeffs_by_row = []
        from_output_coeffs_by_row = []
        for i, mi in enumerate(mis):
            # If the multi-index is to be stored, keep the projection matrix
            # entry empty
            if is_stored(mi):
                idx = len(stored_identifiers)
                stored_identifiers.append(mi)
                from_input_coeffs_by_row.append([(idx, 1)])
                from_output_coeffs_by_row.append([])
                continue
            diff = [mi[d] - max_mi[d] for d in range(self.dim)]

            # eg: u_xx + u_yy + u_zz is represented as
            # [((2, 0, 0), 1), ((0, 2, 0), 1), ((0, 0, 2), 1)]
            assignment = []
            for other_mi, coeff in pde_dict.items():
                j = coeff_ident_enumerate_dict[add_mi(other_mi, diff)]
                if i == j:
                    # Skip the u_zz part here.
                    continue
                # PDE might not have max_mi_coeff = -1, divide by -max_mi_coeff
                # to get a relation of the form, u_zz = - u_xx - u_yy for Laplace 3D.
                assignment.append((j, coeff * max_mi_mult))
            from_input_coeffs_by_row.append([])
            from_output_coeffs_by_row.append(assignment)

        plog.done()

        # *mis* already holds the full coefficient identifiers, so they are
        # not recomputed for the log message.
        logger.debug(
            "number of Taylor coefficients was reduced from %s to %s",
            len(mis), len(stored_identifiers))

        shape = (len(mis), len(stored_identifiers))
        op = CSEMatVecOperator(from_input_coeffs_by_row,
                               from_output_coeffs_by_row, shape)
        return stored_identifiers, op
Example #8
0
def drive_fmm(expansion_wrangler, src_weights, timing_data=None):
    """Top-level driver routine for the QBX fast multipole calculation.

    The geometry data is not passed in separately: it is read from
    ``expansion_wrangler.geo_data``, and the traversal and the tree are
    obtained from it.

    :arg expansion_wrangler: An object exhibiting the
        :class:`ExpansionWranglerInterface`. In addition, this routine
        reads its ``geo_data`` attribute and calls its QBX-specific stages
        (``form_global_qbx_locals``,
        ``translate_box_multipoles_to_qbx_local``,
        ``translate_box_local_to_qbx_local``, ``eval_qbx_expansions``).
    :arg src_weights: Source 'density/weights/charges'.
        Reordered by ``expansion_wrangler.reorder_sources`` before being
        handed to the individual stages.
    :arg timing_data: Either *None* or a dictionary that collects
        timing data. If it is not *None*, it is updated in place with the
        summary of the per-stage timings recorded during this call.

    Returns the potentials computed by *expansion_wrangler*, indexed by
    ``tree.sorted_target_ids`` and passed through
    ``expansion_wrangler.finalize_potentials``.

    See also :func:`boxtree.fmm.drive_fmm`.
    """
    wrangler = expansion_wrangler

    geo_data = wrangler.geo_data
    traversal = geo_data.traversal()
    tree = traversal.tree
    # Collects one timing future per stage; summarized at the very end.
    recorder = TimingRecorder()

    # Interface guidelines: Attributes of the tree are assumed to be known
    # to the expansion wrangler and should not be passed.

    fmm_proc = ProcessLogger(logger, "qbx fmm")

    src_weights = wrangler.reorder_sources(src_weights)

    # {{{ construct local multipoles

    mpole_exps, timing_future = wrangler.form_multipoles(
            traversal.level_start_source_box_nrs,
            traversal.source_boxes,
            src_weights)

    recorder.add("form_multipoles", timing_future)

    # }}}

    # {{{ propagate multipoles upward

    mpole_exps, timing_future = wrangler.coarsen_multipoles(
            traversal.level_start_source_parent_box_nrs,
            traversal.source_parent_boxes,
            mpole_exps)

    recorder.add("coarsen_multipoles", timing_future)

    # }}}

    # {{{ direct evaluation from neighbor source boxes ("list 1")

    non_qbx_potentials, timing_future = wrangler.eval_direct(
            traversal.target_boxes,
            traversal.neighbor_source_boxes_starts,
            traversal.neighbor_source_boxes_lists,
            src_weights)

    recorder.add("eval_direct", timing_future)

    # }}}

    # {{{ translate separated siblings' ("list 2") mpoles to local

    local_exps, timing_future = wrangler.multipole_to_local(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            traversal.from_sep_siblings_starts,
            traversal.from_sep_siblings_lists,
            mpole_exps)

    recorder.add("multipole_to_local", timing_future)

    # }}}

    # {{{ evaluate sep. smaller mpoles ("list 3") at particles

    # (the point of aiming this stage at particles is specifically to keep its
    # contribution *out* of the downward-propagating local expansions)

    mpole_result, timing_future = wrangler.eval_multipoles(
            traversal.target_boxes_sep_smaller_by_source_level,
            traversal.from_sep_smaller_by_level,
            mpole_exps)

    recorder.add("eval_multipoles", timing_future)

    non_qbx_potentials = non_qbx_potentials + mpole_result

    # assert that list 3 close has been merged into list 1
    assert traversal.from_sep_close_smaller_starts is None

    # }}}

    # {{{ form locals for separated bigger source boxes ("list 4")

    local_result, timing_future = wrangler.form_locals(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            traversal.from_sep_bigger_starts,
            traversal.from_sep_bigger_lists,
            src_weights)

    recorder.add("form_locals", timing_future)

    local_exps = local_exps + local_result

    # assert that list 4 close has been merged into list 1
    assert traversal.from_sep_close_bigger_starts is None

    # }}}

    # {{{ propagate local_exps downward

    local_exps, timing_future = wrangler.refine_locals(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            local_exps)

    recorder.add("refine_locals", timing_future)

    # }}}

    # {{{ evaluate locals

    local_result, timing_future = wrangler.eval_locals(
            traversal.level_start_target_box_nrs,
            traversal.target_boxes,
            local_exps)

    recorder.add("eval_locals", timing_future)

    non_qbx_potentials = non_qbx_potentials + local_result

    # }}}

    # {{{ wrangle qbx expansions

    # The QBX expansions are accumulated from three contributions: the
    # global QBX locals formed from the sources, the box multipoles, and
    # the box locals.

    qbx_expansions, timing_future = wrangler.form_global_qbx_locals(src_weights)

    recorder.add("form_global_qbx_locals", timing_future)

    local_result, timing_future = (
            wrangler.translate_box_multipoles_to_qbx_local(mpole_exps))

    recorder.add("translate_box_multipoles_to_qbx_local", timing_future)

    qbx_expansions = qbx_expansions + local_result

    local_result, timing_future = (
            wrangler.translate_box_local_to_qbx_local(local_exps))

    recorder.add("translate_box_local_to_qbx_local", timing_future)

    qbx_expansions = qbx_expansions + local_result

    qbx_potentials, timing_future = wrangler.eval_qbx_expansions(qbx_expansions)

    recorder.add("eval_qbx_expansions", timing_future)

    # }}}

    # {{{ reorder potentials

    nqbtl = geo_data.non_qbx_box_target_lists()

    all_potentials_in_tree_order = wrangler.full_output_zeros()

    # Scatter the non-QBX potentials into the full-size output arrays, then
    # add the QBX potentials on top.
    for ap_i, nqp_i in zip(all_potentials_in_tree_order, non_qbx_potentials):
        ap_i[nqbtl.unfiltered_from_filtered_target_indices] = nqp_i

    all_potentials_in_tree_order += qbx_potentials

    def reorder_and_finalize_potentials(x):
        # "finalize" gives host FMMs (like FMMlib) a chance to turn the
        # potential back into a CL array.
        return wrangler.finalize_potentials(x[tree.sorted_target_ids])

    from pytools.obj_array import with_object_array_or_scalar
    result = with_object_array_or_scalar(
            reorder_and_finalize_potentials, all_potentials_in_tree_order)

    # }}}

    fmm_proc.done()

    if timing_data is not None:
        timing_data.update(recorder.summarize())

    return result
Example #9
0
def as_scalar_pde(pde, vec_idx):
    r"""
    Returns a scalar PDE that is satisfied by the *vec_idx* component
    of *pde*.

    If the only component appearing in *pde* is *vec_idx*, *pde* itself is
    returned. Otherwise, derivatives of the equations of the system are
    taken up to increasing orders (2 through 99) until a linear relation
    involving only the *vec_idx* component is found.

    :arg pde: An instance of :class:`LinearPDESystemOperator`
    :arg vec_idx: the index of the vector-valued function that we
                  want as a scalar PDE
    :returns: an instance of :class:`LinearPDESystemOperator`
    :raises AssertionError: if no scalar PDE was found for any of the
        derivative orders that were tried.
    """
    indices = {
        deriv_ident.vec_idx
        for eq in pde.eqs
        for deriv_ident in eq.keys()}

    # this is already a scalar pde
    if indices == {vec_idx}:
        return pde

    # Only needed on the slow path, so imported after the early return.
    from pytools import ProcessLogger
    from pytools import (
            generate_nonnegative_integer_tuples_summing_to_at_most
            as gnitstam)
    from sumpy.tools import nullspace

    plog = ProcessLogger(logger, "computing single PDE for multiple PDEs")

    dim = pde.total_dims

    # Number of vector components that each matrix row has to make room
    # for. This does not change between iterations.
    nvec = max(indices, default=-1) + 1

    # slowly increase the order of the derivatives that we take of the
    # system of PDEs. Once we reach the order of the scalar PDE, this
    # loop will break
    for order in range(2, 100):
        mis = sorted(gnitstam(order, dim), key=sum)

        coeff_ident_enumerate_dict = {
            tuple(mi): i for i, mi in enumerate(mis)}
        offset = len(mis)

        # Create a matrix of equations that are derivatives of the
        # original system of PDEs
        pde_mat = []
        for mi in mis:
            for pde_dict in pde.eqs:
                eq = [0] * (offset * nvec)
                for ident, coeff in pde_dict.items():
                    c = tuple(add_mi(ident.mi, mi))
                    if c not in coeff_ident_enumerate_dict:
                        # This derivative exceeds the current order, so
                        # the whole equation is dropped.
                        break
                    idx = offset*ident.vec_idx + coeff_ident_enumerate_dict[c]
                    eq[idx] = coeff
                else:
                    pde_mat.append(eq)

        if not pde_mat:
            continue

        # Get the nullspace of the matrix and get the rows related to this
        # vec_idx
        n = nullspace(pde_mat)[offset*vec_idx:offset*(vec_idx+1), :]
        indep_row = find_linear_relationship(n)
        if len(indep_row) > 0:
            # Normalize by the coefficient that belongs to the largest key.
            mult = indep_row[max(indep_row.keys())]
            pde_dict = {
                DerivativeIdentifier(mis[k], 0): v / mult
                for k, v in indep_row.items()}
            plog.done()
            return LinearPDESystemOperator(pde.dim, pmap(pde_dict))

    plog.done()
    # Raised explicitly (rather than ``assert False``) so that the failure
    # is not silently turned into a ``None`` return value under ``-O``.
    raise AssertionError(
        "could not find a scalar PDE for the requested component "
        "using derivatives of order below 100")
Example #10
0
def drive_fmm(traversal, expansion_wrangler, src_weights, timing_data=None):
    """Top-level driver routine for a fast multipole calculation.

    In part, this is intended as a template for custom FMMs, in the sense that
    you may copy and paste its
    `source code <https://github.com/inducer/boxtree/blob/master/boxtree/fmm.py>`_
    as a starting point.

    Nonetheless, many common applications (such as point-to-point FMMs) can be
    covered by supplying the right *expansion_wrangler* to this routine.

    :arg traversal: A :class:`boxtree.traversal.FMMTraversalInfo` instance.
    :arg expansion_wrangler: An object exhibiting the
        :class:`ExpansionWranglerInterface`.
    :arg src_weights: Source 'density/weights/charges'.
        Handed to *expansion_wrangler*'s ``reorder_sources`` before any
        other stage sees it.
    :arg timing_data: Either *None*, or a :class:`dict` that is populated with
        timing information for the stages of the algorithm (in the form of
        :class:`TimingResult`), if such information is available.

    Returns the potentials computed by *expansion_wrangler*, after they have
    been passed through its ``reorder_potentials`` and
    ``finalize_potentials`` methods.

    """
    wrangler = expansion_wrangler

    # Interface guidelines: Attributes of the tree are assumed to be known
    # to the expansion wrangler and should not be passed.

    # This is the generic FMM driver, so it is logged as "fmm" (the
    # "qbx fmm" label belongs to the QBX-specific driver).
    fmm_proc = ProcessLogger(logger, "fmm")
    recorder = TimingRecorder()

    src_weights = wrangler.reorder_sources(src_weights)

    # {{{ "Step 2.1:" Construct local multipoles

    mpole_exps, timing_future = wrangler.form_multipoles(
            traversal.level_start_source_box_nrs,
            traversal.source_boxes,
            src_weights)

    recorder.add("form_multipoles", timing_future)

    # }}}

    # {{{ "Step 2.2:" Propagate multipoles upward

    mpole_exps, timing_future = wrangler.coarsen_multipoles(
            traversal.level_start_source_parent_box_nrs,
            traversal.source_parent_boxes,
            mpole_exps)

    recorder.add("coarsen_multipoles", timing_future)

    # mpole_exps is called Phi in [1]

    # }}}

    # {{{ "Stage 3:" Direct evaluation from neighbor source boxes ("list 1")

    potentials, timing_future = wrangler.eval_direct(
            traversal.target_boxes,
            traversal.neighbor_source_boxes_starts,
            traversal.neighbor_source_boxes_lists,
            src_weights)

    recorder.add("eval_direct", timing_future)

    # these potentials are called alpha in [1]

    # }}}

    # {{{ "Stage 4:" translate separated siblings' ("list 2") mpoles to local

    local_exps, timing_future = wrangler.multipole_to_local(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            traversal.from_sep_siblings_starts,
            traversal.from_sep_siblings_lists,
            mpole_exps)

    recorder.add("multipole_to_local", timing_future)

    # local_exps represents both Gamma and Delta in [1]

    # }}}

    # {{{ "Stage 5:" evaluate sep. smaller mpoles ("list 3") at particles

    # (the point of aiming this stage at particles is specifically to keep its
    # contribution *out* of the downward-propagating local expansions)

    mpole_result, timing_future = wrangler.eval_multipoles(
            traversal.target_boxes_sep_smaller_by_source_level,
            traversal.from_sep_smaller_by_level,
            mpole_exps)

    recorder.add("eval_multipoles", timing_future)

    potentials = potentials + mpole_result

    # these potentials are called beta in [1]

    # The "close" part of list 3 only exists if the traversal provides it;
    # when present it is evaluated directly and timed as "eval_direct".
    if traversal.from_sep_close_smaller_starts is not None:
        logger.debug("evaluate separated close smaller interactions directly "
                "('list 3 close')")

        direct_result, timing_future = wrangler.eval_direct(
                traversal.target_boxes,
                traversal.from_sep_close_smaller_starts,
                traversal.from_sep_close_smaller_lists,
                src_weights)

        recorder.add("eval_direct", timing_future)

        potentials = potentials + direct_result

    # }}}

    # {{{ "Stage 6:" form locals for separated bigger source boxes ("list 4")

    local_result, timing_future = wrangler.form_locals(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            traversal.from_sep_bigger_starts,
            traversal.from_sep_bigger_lists,
            src_weights)

    recorder.add("form_locals", timing_future)

    local_exps = local_exps + local_result

    # Same treatment for the "close" part of list 4, if the traversal has one.
    if traversal.from_sep_close_bigger_starts is not None:
        direct_result, timing_future = wrangler.eval_direct(
                traversal.target_boxes,
                traversal.from_sep_close_bigger_starts,
                traversal.from_sep_close_bigger_lists,
                src_weights)

        recorder.add("eval_direct", timing_future)

        potentials = potentials + direct_result

    # }}}

    # {{{ "Stage 7:" propagate local_exps downward

    local_exps, timing_future = wrangler.refine_locals(
            traversal.level_start_target_or_target_parent_box_nrs,
            traversal.target_or_target_parent_boxes,
            local_exps)

    recorder.add("refine_locals", timing_future)

    # }}}

    # {{{ "Stage 8:" evaluate locals

    local_result, timing_future = wrangler.eval_locals(
            traversal.level_start_target_box_nrs,
            traversal.target_boxes,
            local_exps)

    recorder.add("eval_locals", timing_future)

    potentials = potentials + local_result

    # }}}

    result = wrangler.reorder_potentials(potentials)

    result = wrangler.finalize_potentials(result)

    fmm_proc.done()

    # Timing information is only reported if the caller supplied a dict.
    if timing_data is not None:
        timing_data.update(recorder.summarize())

    return result
Example #11
0
    def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
                 wait_for=None):
        """Run the space invader query for a set of balls against *tree*.

        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg ball_centers: an object array of coordinate
            :class:`pyopencl.array.Array` instances.
            Their *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg ball_radii: a
            :class:`pyopencl.array.Array`
            of positive numbers.
            Its *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg peer_lists: may either be *None* or an instance of
            :class:`PeerListLookup` associated with `tree`. If *None*,
            the peer lists are computed here.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            execution.
        :returns: a tuple *(sqi, event)*, where *sqi* is an instance of
            :class:`pyopencl.array.Array`, and *event* is a :class:`pyopencl.Event`
            for dependency management. The *dtype* of *sqi* is
            *tree*'s :attr:`boxtree.Tree.coord_dtype` and its shape is
            *(tree.nboxes,)* (see :attr:`boxtree.Tree.nboxes`).
            The entries of *sqi* are indexed by the global box index and are
            as follows:

            * if *i* is not the index of a leaf box, *sqi[i] = 0*.
            * if *i* is the index of a leaf box, *sqi[i]* is the
              outer space invader distance for *i*.
        :raises TypeError: if the dtype of *ball_centers* or *ball_radii*
            differs from *tree.coord_dtype*.
        :raises ValueError: if the peer lists do not have one entry per box.
        """

        from pytools import single_valued

        # All coordinate arrays must share one dtype, and it has to be the
        # tree's coordinate dtype.
        centers_dtype = single_valued(bc.dtype for bc in ball_centers)
        if centers_dtype != tree.coord_dtype:
            raise TypeError("ball_centers dtype must match tree.coord_dtype")
        if ball_radii.dtype != tree.coord_dtype:
            raise TypeError("ball_radii dtype must match tree.coord_dtype")

        from pytools import div_ceil
        # Rounding the level count up to a multiple of ten limits the
        # number of distinct kernels that get generated.
        max_levels = 10 * div_ceil(tree.nlevels, 10)

        if peer_lists is None:
            peer_lists, peer_evt = self.peer_list_finder(
                    queue, tree, wait_for=wait_for)
            wait_for = [peer_evt]

        if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
            raise ValueError("size of peer lists must match with number of boxes")

        query_knl = self.get_space_invader_query_kernel(
            tree.dimensions, tree.coord_dtype, tree.box_id_dtype,
            peer_lists.peer_list_starts.dtype, max_levels)

        si_plog = ProcessLogger(logger, "space invader query")

        dists = cl.array.zeros(queue, tree.nboxes, np.float32)

        # The kernel must also wait for the zero-fill of the output array.
        dependencies = (wait_for or []) + dists.events

        kernel_args = SPACE_INVADER_QUERY_TEMPLATE.unwrap_args(
                tree, peer_lists,
                ball_radii,
                dists,
                *ball_centers)

        evt = query_knl(
                *kernel_args,
                wait_for=dependencies,
                queue=queue,
                range=slice(len(ball_radii)))

        if tree.coord_dtype != np.dtype(np.float32):
            # The kernel output is always an array of float32 due to limited
            # support for atomic operations with float64 in OpenCL.
            # Here the output is cast to match the coord dtype.
            dists.finish()
            dists = dists.astype(tree.coord_dtype)
            (evt,) = dists.events

        si_plog.done()

        return dists, evt
Example #12
0
    def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
                 wait_for=None):
        """Build a leaves-to-balls lookup by inverting an area query.

        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg ball_centers: an object array of coordinate
            :class:`pyopencl.array.Array` instances.
            Their *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg ball_radii: a
            :class:`pyopencl.array.Array`
            of positive numbers.
            Its *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg peer_lists: may either be *None* or an instance of
            :class:`PeerListLookup` associated with `tree`.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            execution.
        :returns: a tuple *(lbl, event)*, where *lbl* is an instance of
            :class:`LeavesToBallsLookup`, and *event* is a :class:`pyopencl.Event`
            for dependency management.
        :raises TypeError: if the dtype of *ball_centers* or *ball_radii*
            differs from *tree.coord_dtype*.
        """

        from pytools import single_valued
        if single_valued(bc.dtype for bc in ball_centers) != tree.coord_dtype:
            raise TypeError("ball_centers dtype must match tree.coord_dtype")
        if ball_radii.dtype != tree.coord_dtype:
            raise TypeError("ball_radii dtype must match tree.coord_dtype")

        ltb_plog = ProcessLogger(logger, "leaves-to-balls lookup: run area query")

        area_query, evt = self.area_query_builder(
                queue, tree, ball_centers, ball_radii, peer_lists, wait_for)
        wait_for = [evt]

        logger.debug("leaves-to-balls lookup: expand starts")

        nkeys = tree.nboxes
        nballs_p_1 = len(area_query.leaves_near_ball_starts)
        assert nballs_p_1 == len(ball_radii) + 1

        # We invert the area query in two steps:
        #
        # 1. Turn the area query result into (ball number, box number) pairs.
        #    This is done in the "starts expander kernel."
        #
        # 2. Key-value sort the (ball number, box number) pairs by box number.

        starts_expander_knl = self.get_starts_expander_kernel(tree.box_id_dtype)
        expanded_starts = cl.array.empty(
                queue, len(area_query.leaves_near_ball_lists), tree.box_id_dtype)
        # Explicitly wait on the area query so that the expansion cannot read
        # the starts array before it has been written.
        # NOTE(review): assumes this kernel accepts *wait_for* like the other
        # elementwise kernels in this module -- confirm.
        evt = starts_expander_knl(
                expanded_starts,
                area_query.leaves_near_ball_starts.with_queue(queue),
                nballs_p_1,
                wait_for=wait_for)
        wait_for = [evt]

        logger.debug("leaves-to-balls lookup: key-value sort")

        balls_near_box_starts, balls_near_box_lists, evt \
                = self.key_value_sorter(
                        queue,
                        # keys
                        area_query.leaves_near_ball_lists.with_queue(queue),
                        # values
                        expanded_starts,
                        nkeys, starts_dtype=tree.box_id_dtype,
                        wait_for=wait_for)

        ltb_plog.done()

        return LeavesToBallsLookup(
                tree=tree,
                balls_near_box_starts=balls_near_box_starts,
                balls_near_box_lists=balls_near_box_lists).with_queue(None), evt
Example #13
0
    def __call__(self, queue, tree, ball_centers, ball_radii, peer_lists=None,
                 wait_for=None):
        """Run an area query for a set of balls against *tree*.

        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg tree: a :class:`boxtree.Tree`.
        :arg ball_centers: an object array of coordinate
            :class:`pyopencl.array.Array` instances.
            Their *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg ball_radii: a
            :class:`pyopencl.array.Array`
            of positive numbers.
            Its *dtype* must match *tree*'s
            :attr:`boxtree.Tree.coord_dtype`.
        :arg peer_lists: may either be *None* or an instance of
            :class:`PeerListLookup` associated with `tree`. If *None*,
            the peer lists are computed here.
        :arg wait_for: may either be *None* or a list of :class:`pyopencl.Event`
            instances for whose completion this command waits before starting
            execution.
        :returns: a tuple *(aq, event)*, where *aq* is an instance of
            :class:`AreaQueryResult`, and *event* is a :class:`pyopencl.Event`
            for dependency management.
        :raises TypeError: if the dtype of *ball_centers* or *ball_radii*
            differs from *tree.coord_dtype*.
        :raises ValueError: if the peer lists do not have one entry per box.
        """

        from pytools import single_valued

        centers_dtype = single_valued(bc.dtype for bc in ball_centers)
        if centers_dtype != tree.coord_dtype:
            raise TypeError("ball_centers dtype must match tree.coord_dtype")
        if ball_radii.dtype != tree.coord_dtype:
            raise TypeError("ball_radii dtype must match tree.coord_dtype")

        # NOTE(review): balls are indexed with the tree's particle id dtype;
        # the original author marked this choice with a question mark.
        ball_id_dtype = tree.particle_id_dtype

        from pytools import div_ceil
        # Rounding the level count up to a multiple of ten limits the
        # number of distinct kernels that get generated.
        max_levels = 10 * div_ceil(tree.nlevels, 10)

        if peer_lists is None:
            peer_lists, peer_evt = self.peer_list_finder(
                    queue, tree, wait_for=wait_for)
            wait_for = [peer_evt]

        if len(peer_lists.peer_list_starts) != tree.nboxes + 1:
            raise ValueError("size of peer lists must match with number of boxes")

        query_knl = self.get_area_query_kernel(
            tree.dimensions,
            tree.coord_dtype, tree.box_id_dtype, ball_id_dtype,
            peer_lists.peer_list_starts.dtype, max_levels)

        aq_plog = ProcessLogger(logger, "area query")

        # Trailing kernel arguments: the first row of the bounding box,
        # followed by one data buffer per ball-center coordinate axis.
        bbox_row = tuple(tree.bounding_box[0])
        center_buffers = tuple(bc.data for bc in ball_centers)

        result, evt = query_knl(
                queue, len(ball_radii),
                tree.box_centers.data, tree.root_extent,
                tree.box_levels.data, tree.aligned_nboxes,
                tree.box_child_ids.data, tree.box_flags.data,
                peer_lists.peer_list_starts.data,
                peer_lists.peer_lists.data, ball_radii.data,
                *(bbox_row + center_buffers),
                wait_for=wait_for)

        aq_plog.done()

        leaves = result["leaves"]
        lookup = AreaQueryResult(
                tree=tree,
                leaves_near_ball_starts=leaves.starts,
                leaves_near_ball_lists=leaves.lists)

        return lookup.with_queue(None), evt
Example #14
0
    def __call__(self, queue, balls_to_leaves_lookup=None, wait_for=None):
        """Build the element-to-source lookup.

        An area query result is first refined with a point-in-simplex test
        into a source-to-element map, which is then inverted with a
        key-value sort.

        :arg queue: a :class:`pyopencl.CommandQueue`
        :arg balls_to_leaves_lookup: may either be *None*, in which case it
            is obtained from ``self.compute_short_lists``, or an object
            providing *leaves_near_ball_starts* and *leaves_near_ball_lists*.
        :arg wait_for: may either be *None* or a list of events to wait for
            before starting execution.
        :returns: a tuple *(lookup, event)*, where *lookup* is an instance of
            :class:`ElementsToSourcesLookup` and *event* is the event
            returned by the key-value sort.
        """
        slk_plog = ProcessLogger(logger,
                                 "element-to-source lookup: run area query")

        if balls_to_leaves_lookup is None:
            balls_to_leaves_lookup, short_list_evt = \
                self.compute_short_lists(queue, wait_for=wait_for)
            wait_for = [short_list_evt]

        # {{{ refine the area query using a point-in-simplex test

        logger.debug("element-to-source lookup: refine starts")

        simplex_knl = self.get_simplex_lookup_kernel()
        mesh = self.discr.mesh

        vertices_dev = make_obj_array([
            cl.array.to_device(queue, axis_vertices)
            for axis_vertices in mesh.vertices
        ])

        # One kernel argument per coordinate axis, for both the mesh
        # vertices and the source points.
        per_axis_kwargs = {}
        for iaxis in range(self.dim):
            per_axis_kwargs[f"mesh_vertices_{iaxis}"] = vertices_dev[iaxis]
        for iaxis in range(self.dim):
            per_axis_kwargs[f"source_points_{iaxis}"] = self.tree.sources[iaxis]

        # The result array starts out filled with -1.
        initial_result = cl.array.zeros(
            queue, self.tree.nsources, dtype=np.int32) - 1

        evt, outputs = simplex_knl(
            queue,
            dim=self.dim,
            nboxes=self.tree.nboxes,
            nelements=mesh.nelements,
            nsources=self.tree.nsources,
            result=initial_result,
            mesh_vertex_indices=mesh.groups[0].vertex_indices,
            box_source_starts=self.tree.box_source_starts,
            box_source_counts_cumul=self.tree.box_source_counts_cumul,
            leaves_near_ball_starts=(
                balls_to_leaves_lookup.leaves_near_ball_starts),
            leaves_near_ball_lists=(
                balls_to_leaves_lookup.leaves_near_ball_lists),
            wait_for=wait_for,
            **per_axis_kwargs)

        (source_to_element_lookup,) = outputs

        # }}}

        # {{{ invert the source-to-element lookup by a key-value sort

        logger.debug("element-to-source lookup: key-value sort")

        source_ids = cl.array.arange(
            queue, self.tree.nsources, dtype=self.tree.box_id_dtype)

        starts, lists, evt = self.key_value_sorter(
            queue,
            keys=source_to_element_lookup,
            values=source_ids,
            nkeys=mesh.nelements,
            starts_dtype=self.tree.box_id_dtype,
            wait_for=[evt])

        # }}}

        slk_plog.done()

        lookup = ElementsToSourcesLookup(
            tree=self.tree,
            discr=self.discr,
            sources_in_element_starts=starts,
            sources_in_element_lists=lists)

        return lookup, evt
Example #15
0
def drive_fmm(expansion_wrangler, src_weight_vecs, timing_data=None,
        traversal=None):
    """Top-level driver routine for the QBX fast multipole calculation.

    The geometry data is taken from the *geo_data* attribute of
    *expansion_wrangler*.

    :arg expansion_wrangler: An object exhibiting the
        :class:`boxtree.fmm.ExpansionWranglerInterface`.
    :arg src_weight_vecs: A sequence of source 'density/weights/charges'.
        Each entry is passed through *expansion_wrangler*'s
        ``reorder_sources`` before use.
    :arg timing_data: Either *None* or a dictionary that collects
        timing data.
    :arg traversal: Either *None*, in which case ``geo_data.traversal()``
        is used, or the traversal to drive the FMM with.

    Returns the potentials computed by *expansion_wrangler*.

    See also :func:`boxtree.fmm.drive_fmm`.
    """
    wrangler = expansion_wrangler
    geo_data = wrangler.geo_data

    if traversal is None:
        traversal = geo_data.traversal()

    tree = traversal.tree

    recorder = TimingRecorder()

    # Interface guidelines: Attributes of the tree are assumed to be known
    # to the expansion wrangler and should not be passed.

    fmm_proc = ProcessLogger(logger, "qbx fmm")

    def timed(stage_name, stage_output):
        # Each wrangler stage hands back a (value, timing future) pair;
        # record the future under the stage name and pass the value on.
        value, future = stage_output
        recorder.add(stage_name, future)
        return value

    src_weight_vecs = [wrangler.reorder_sources(vec) for vec in src_weight_vecs]

    # {{{ construct local multipoles

    mpole_exps = timed("form_multipoles", wrangler.form_multipoles(
        traversal.level_start_source_box_nrs,
        traversal.source_boxes,
        src_weight_vecs))

    # }}}

    # {{{ propagate multipoles upward

    mpole_exps = timed("coarsen_multipoles", wrangler.coarsen_multipoles(
        traversal.level_start_source_parent_box_nrs,
        traversal.source_parent_boxes,
        mpole_exps))

    # }}}

    # {{{ direct evaluation from neighbor source boxes ("list 1")

    non_qbx_potentials = timed("eval_direct", wrangler.eval_direct(
        traversal.target_boxes,
        traversal.neighbor_source_boxes_starts,
        traversal.neighbor_source_boxes_lists,
        src_weight_vecs))

    # }}}

    # {{{ translate separated siblings' ("list 2") mpoles to local

    local_exps = timed("multipole_to_local", wrangler.multipole_to_local(
        traversal.level_start_target_or_target_parent_box_nrs,
        traversal.target_or_target_parent_boxes,
        traversal.from_sep_siblings_starts,
        traversal.from_sep_siblings_lists,
        mpole_exps))

    # }}}

    # {{{ evaluate sep. smaller mpoles ("list 3") at particles

    # (the point of aiming this stage at particles is specifically to keep its
    # contribution *out* of the downward-propagating local expansions)

    list3_result = timed("eval_multipoles", wrangler.eval_multipoles(
        traversal.target_boxes_sep_smaller_by_source_level,
        traversal.from_sep_smaller_by_level,
        mpole_exps))

    non_qbx_potentials = non_qbx_potentials + list3_result

    # assert that list 3 close has been merged into list 1
    assert traversal.from_sep_close_smaller_starts is None

    # }}}

    # {{{ form locals for separated bigger source boxes ("list 4")

    list4_result = timed("form_locals", wrangler.form_locals(
        traversal.level_start_target_or_target_parent_box_nrs,
        traversal.target_or_target_parent_boxes,
        traversal.from_sep_bigger_starts,
        traversal.from_sep_bigger_lists,
        src_weight_vecs))

    local_exps = local_exps + list4_result

    # assert that list 4 close has been merged into list 1
    assert traversal.from_sep_close_bigger_starts is None

    # }}}

    # {{{ propagate local_exps downward

    local_exps = timed("refine_locals", wrangler.refine_locals(
        traversal.level_start_target_or_target_parent_box_nrs,
        traversal.target_or_target_parent_boxes,
        local_exps))

    # }}}

    # {{{ evaluate locals

    locals_result = timed("eval_locals", wrangler.eval_locals(
        traversal.level_start_target_box_nrs,
        traversal.target_boxes,
        local_exps))

    non_qbx_potentials = non_qbx_potentials + locals_result

    # }}}

    # {{{ wrangle qbx expansions

    # form_global_qbx_locals and eval_target_specific_qbx_locals are responsible
    # for the same interactions (directly evaluated portion of the potentials
    # via unified List 1).  Which one is used depends on the wrangler. If one of
    # them is unused the corresponding output entries will be zero.

    qbx_expansions = timed(
        "form_global_qbx_locals",
        wrangler.form_global_qbx_locals(src_weight_vecs))

    from_box_mpoles = timed(
        "translate_box_multipoles_to_qbx_local",
        wrangler.translate_box_multipoles_to_qbx_local(mpole_exps))

    qbx_expansions = qbx_expansions + from_box_mpoles

    from_box_locals = timed(
        "translate_box_local_to_qbx_local",
        wrangler.translate_box_local_to_qbx_local(local_exps))

    qbx_expansions = qbx_expansions + from_box_locals

    qbx_potentials = timed(
        "eval_qbx_expansions",
        wrangler.eval_qbx_expansions(qbx_expansions))

    target_specific = timed(
        "eval_target_specific_qbx_locals",
        wrangler.eval_target_specific_qbx_locals(src_weight_vecs))

    qbx_potentials = qbx_potentials + target_specific

    # }}}

    # {{{ reorder potentials

    nqbtl = geo_data.non_qbx_box_target_lists()

    all_potentials_in_tree_order = wrangler.full_output_zeros()

    # Scatter the non-QBX potentials into the full-size output arrays.
    for full_pot, filtered_pot in zip(
            all_potentials_in_tree_order, non_qbx_potentials):
        full_pot[nqbtl.unfiltered_from_filtered_target_indices] = filtered_pot

    all_potentials_in_tree_order += qbx_potentials

    def to_user_order(pot):
        # "finalize" gives host FMMs (like FMMlib) a chance to turn the
        # potential back into a CL array.
        return wrangler.finalize_potentials(pot[tree.sorted_target_ids])

    from pytools.obj_array import obj_array_vectorize
    result = obj_array_vectorize(to_user_order, all_potentials_in_tree_order)

    # }}}

    fmm_proc.done()

    if timing_data is not None:
        timing_data.update(recorder.summarize())
    return result
Example #16
0
def drive_fmm(traversal, expansion_wrangler, src_weights, timing_data=None):
    """Top-level driver routine for a fast multipole calculation.

    In part, this is intended as a template for custom FMMs, in the sense that
    you may copy and paste its
    `source code <https://github.com/inducer/boxtree/blob/master/boxtree/fmm.py>`_
    as a starting point.

    Nonetheless, many common applications (such as point-to-point FMMs) can be
    covered by supplying the right *expansion_wrangler* to this routine.

    :arg traversal: A :class:`boxtree.traversal.FMMTraversalInfo` instance.
    :arg expansion_wrangler: An object exhibiting the
        :class:`ExpansionWranglerInterface`.
    :arg src_weights: Source 'density/weights/charges'.
        Handed to *expansion_wrangler*'s ``reorder_sources`` before any
        other stage sees it.
    :arg timing_data: Either *None*, or a :class:`dict` that is populated with
        timing information for the stages of the algorithm (in the form of
        :class:`TimingResult`), if such information is available.

    Returns the potentials computed by *expansion_wrangler*.

    """
    wrangler = expansion_wrangler

    # Interface guidelines: Attributes of the tree are assumed to be known
    # to the expansion wrangler and should not be passed.

    fmm_proc = ProcessLogger(logger, "fmm")
    recorder = TimingRecorder()

    def timed(stage_name, stage_output):
        # Each wrangler stage hands back a (value, timing future) pair;
        # record the future under the stage name and pass the value on.
        value, future = stage_output
        recorder.add(stage_name, future)
        return value

    src_weights = wrangler.reorder_sources(src_weights)

    # {{{ "Step 2.1:" Construct local multipoles

    mpole_exps = timed("form_multipoles", wrangler.form_multipoles(
        traversal.level_start_source_box_nrs,
        traversal.source_boxes,
        src_weights))

    # }}}

    # {{{ "Step 2.2:" Propagate multipoles upward

    mpole_exps = timed("coarsen_multipoles", wrangler.coarsen_multipoles(
        traversal.level_start_source_parent_box_nrs,
        traversal.source_parent_boxes,
        mpole_exps))

    # mpole_exps is called Phi in [1]

    # }}}

    # {{{ "Stage 3:" Direct evaluation from neighbor source boxes ("list 1")

    potentials = timed("eval_direct", wrangler.eval_direct(
        traversal.target_boxes,
        traversal.neighbor_source_boxes_starts,
        traversal.neighbor_source_boxes_lists,
        src_weights))

    # these potentials are called alpha in [1]

    # }}}

    # {{{ "Stage 4:" translate separated siblings' ("list 2") mpoles to local

    local_exps = timed("multipole_to_local", wrangler.multipole_to_local(
        traversal.level_start_target_or_target_parent_box_nrs,
        traversal.target_or_target_parent_boxes,
        traversal.from_sep_siblings_starts,
        traversal.from_sep_siblings_lists,
        mpole_exps))

    # local_exps represents both Gamma and Delta in [1]

    # }}}

    # {{{ "Stage 5:" evaluate sep. smaller mpoles ("list 3") at particles

    # (the point of aiming this stage at particles is specifically to keep its
    # contribution *out* of the downward-propagating local expansions)

    list3_result = timed("eval_multipoles", wrangler.eval_multipoles(
        traversal.target_boxes_sep_smaller_by_source_level,
        traversal.from_sep_smaller_by_level,
        mpole_exps))

    potentials = potentials + list3_result

    # these potentials are called beta in [1]

    if traversal.from_sep_close_smaller_starts is not None:
        logger.debug("evaluate separated close smaller interactions directly "
                "('list 3 close')")

        list3_close_result = timed("eval_direct", wrangler.eval_direct(
            traversal.target_boxes,
            traversal.from_sep_close_smaller_starts,
            traversal.from_sep_close_smaller_lists,
            src_weights))

        potentials = potentials + list3_close_result

    # }}}

    # {{{ "Stage 6:" form locals for separated bigger source boxes ("list 4")

    list4_result = timed("form_locals", wrangler.form_locals(
        traversal.level_start_target_or_target_parent_box_nrs,
        traversal.target_or_target_parent_boxes,
        traversal.from_sep_bigger_starts,
        traversal.from_sep_bigger_lists,
        src_weights))

    local_exps = local_exps + list4_result

    if traversal.from_sep_close_bigger_starts is not None:
        list4_close_result = timed("eval_direct", wrangler.eval_direct(
            traversal.target_boxes,
            traversal.from_sep_close_bigger_starts,
            traversal.from_sep_close_bigger_lists,
            src_weights))

        potentials = potentials + list4_close_result

    # }}}

    # {{{ "Stage 7:" propagate local_exps downward

    local_exps = timed("refine_locals", wrangler.refine_locals(
        traversal.level_start_target_or_target_parent_box_nrs,
        traversal.target_or_target_parent_boxes,
        local_exps))

    # }}}

    # {{{ "Stage 8:" evaluate locals

    locals_result = timed("eval_locals", wrangler.eval_locals(
        traversal.level_start_target_box_nrs,
        traversal.target_boxes,
        local_exps))

    potentials = potentials + locals_result

    # }}}

    # Reorder the potentials, then let the wrangler post-process them.
    result = wrangler.finalize_potentials(
        wrangler.reorder_potentials(potentials))

    fmm_proc.done()

    if timing_data is not None:
        timing_data.update(recorder.summarize())

    return result
Example #17
0
def parse_fortran(source,
                  filename="<floopy code>",
                  free_form=None,
                  strict=None,
                  seq_dependencies=None,
                  auto_dependencies=None,
                  target=None):
    """Parse Fortran *source* with :mod:`fparser` and translate it to loopy.

    :arg source: the Fortran source code, handed to ``fparser.api.parse``.
    :arg filename: a name for *source*; used in the log message and passed
        to the translator.
    :arg free_form: passed to the parser as ``isfree``. *None* means *True*.
    :arg strict: passed to the parser as ``isstrict``. *None* means *True*.
    :arg seq_dependencies: forwarded to the translator's ``make_kernels``.
        *None* means *True*.
    :arg auto_dependencies: deprecated alias of *seq_dependencies*. May not
        be specified together with *seq_dependencies*.
    :arg target: forwarded to the translator.
    :returns: a :class:`loopy.TranslationUnit`
    :raises TypeError: if both *seq_dependencies* and *auto_dependencies*
        are given.
    :raises LoopyError: if the Fortran parser returns *None*.
    """

    parse_plog = ProcessLogger(logger, "parsing fortran file '%s'" % filename)

    if seq_dependencies is not None and auto_dependencies is not None:
        raise TypeError(
            "may not specify both seq_dependencies and auto_dependencies")
    if auto_dependencies is not None:
        from warnings import warn
        warn("auto_dependencies is deprecated, use seq_dependencies instead",
             DeprecationWarning,
             stacklevel=2)
        seq_dependencies = auto_dependencies

    if seq_dependencies is None:
        seq_dependencies = True
    if free_form is None:
        free_form = True
    if strict is None:
        strict = True

    import logging
    fparser_logger = logging.getLogger("fparser")

    # Install the console handler only once. Adding a fresh handler on every
    # call would make each fparser message show up once per previous call.
    if not any(getattr(hdlr, "_loopy_fparser_console", False)
               for hdlr in fparser_logger.handlers):
        console = logging.StreamHandler()
        console.setLevel(logging.INFO)
        formatter = logging.Formatter(
            "%(name)-12s: %(levelname)-8s %(message)s")
        console.setFormatter(formatter)
        # Marker used by the check above to recognize this handler.
        console._loopy_fparser_console = True
        fparser_logger.addHandler(console)

    from fparser import api
    tree = api.parse(source,
                     isfree=free_form,
                     isstrict=strict,
                     analyze=False,
                     ignore_comments=False)

    if tree is None:
        raise LoopyError("Fortran parser was unhappy with source code "
                         "and returned invalid data (Sorry!)")

    from loopy.frontend.fortran.translator import F2LoopyTranslator
    f2loopy = F2LoopyTranslator(filename, target=target)
    f2loopy(tree)

    kernels = f2loopy.make_kernels(seq_dependencies=seq_dependencies)

    from loopy.transform.callable import merge
    prog = merge(kernels)
    all_kernels = [clbl.subkernel for clbl in prog.callables_table.values()]

    for knl in all_kernels:
        # with_kernel() returns an updated copy rather than modifying *prog*
        # in place, so the result must be rebound or the update is lost.
        prog = prog.with_kernel(_add_assignees_to_calls(knl, all_kernels))

    if len(all_kernels) == 1:
        # guessing in the case of only one function
        prog = prog.with_entrypoints(all_kernels[0].name)

    from loopy.frontend.fortran.translator import specialize_fortran_division
    prog = specialize_fortran_division(prog)

    parse_plog.done()

    return prog