Esempio n. 1
0
def _make_cross_face_batches(
        queue, vol_discr, bdry_discr,
        i_tgt_grp, i_src_grp,
        i_face_tgt,
        adj_grp,
        vbc_tgt_grp_face_batch, src_grp_el_lookup):
    """Yield interpolation batches connecting source face elements to the
    target face elements selected by *i_face_tgt*.

    For every node of the selected target boundary elements, Gauss-Newton
    iteration finds the reference ("unit") coordinates in the neighboring
    source boundary element whose mapped position matches that node.
    Elements whose resulting unit nodes agree to within the tolerance are
    then grouped, and one batch is yielded per group.

    :arg queue: the command queue used for device-to-host transfers
        (``.get(queue=queue)``) and for uploading the index arrays.
    :arg vol_discr: only used by the (disabled) debugging visualization.
    :arg bdry_discr: supplies the boundary nodes, groups and mesh groups for
        both *i_tgt_grp* and *i_src_grp*.
    :arg i_tgt_grp: index of the target group in ``bdry_discr.groups``.
    :arg i_src_grp: index of the source group in ``bdry_discr.groups``.
    :arg i_face_tgt: face number compared against ``adj_grp.element_faces``
        to select the entries of *adj_grp* to process.
    :arg adj_grp: object with ``elements``, ``element_faces``, ``neighbors``
        and ``neighbor_faces`` arrays.
    :arg vbc_tgt_grp_face_batch: batch whose ``from_element_indices`` must
        match the selected ``adj_grp.elements`` and whose
        ``to_element_indices`` give the target boundary elements.
    :arg src_grp_el_lookup: array indexed by ``[volume element, face]``
        giving the source boundary element index.
    :raises RuntimeError: if Gauss-Newton has not converged after 11
        iterations.

    This is a generator; it yields ``InterpolationBatch`` instances.
    """

    # {{{ index wrangling

    # Assert that the adjacency group and the restriction
    # interpolation batch have the same element ordering.

    adj_grp_tgt_flags = adj_grp.element_faces == i_face_tgt

    assert (
            np.array_equal(
                adj_grp.elements[adj_grp_tgt_flags],
                vbc_tgt_grp_face_batch.from_element_indices
                .get(queue=queue)))

    # find to_element_indices

    to_bdry_element_indices = (
            vbc_tgt_grp_face_batch.to_element_indices
            .get(queue=queue))

    # find from_element_indices

    from_vol_element_indices = adj_grp.neighbors[adj_grp_tgt_flags]
    from_element_faces = adj_grp.neighbor_faces[adj_grp_tgt_flags]

    from_bdry_element_indices = src_grp_el_lookup[
            from_vol_element_indices, from_element_faces]

    # }}}

    # {{{ visualization (for debugging)

    if 0:
        print("TVE", adj_grp.elements[adj_grp_tgt_flags])
        print("TBE", to_bdry_element_indices)
        print("FVE", from_vol_element_indices)
        from meshmode.mesh.visualization import draw_2d_mesh
        import matplotlib.pyplot as pt
        draw_2d_mesh(vol_discr.mesh, draw_element_numbers=True,
                set_bounding_box=True,
                draw_vertex_numbers=False,
                draw_face_numbers=True,
                fill=None)
        pt.figure()

        draw_2d_mesh(bdry_discr.mesh, draw_element_numbers=True,
                set_bounding_box=True,
                draw_vertex_numbers=False,
                draw_face_numbers=True,
                fill=None)

        pt.show()
    # }}}

    # {{{ invert face map (using Gauss-Newton)

    to_bdry_nodes = (
            # FIXME: This should view-then-transfer (but PyOpenCL doesn't do
            # non-contiguous transfers for now).
            bdry_discr.groups[i_tgt_grp].view(
                bdry_discr.nodes().get(queue=queue))
            [:, to_bdry_element_indices])

    # Used both as the Gauss-Newton convergence threshold and as the
    # threshold for considering two sets of unit nodes identical.
    tol = 1e4 * np.finfo(to_bdry_nodes.dtype).eps

    from_mesh_grp = bdry_discr.mesh.groups[i_src_grp]
    from_grp = bdry_discr.groups[i_src_grp]

    dim = from_grp.dim
    # The leading axis (ambient dimension) is not needed by name below.
    _, nelements, nto_unit_nodes = to_bdry_nodes.shape

    if nelements == 0:
        # Nothing to connect. Bail out before the residual reduction below,
        # which cannot be taken over a zero-size array.
        return

    # Start every node at the mean of the reference element's vertices.
    initial_guess = np.mean(from_mesh_grp.vertex_unit_coordinates(), axis=0)
    from_unit_nodes = np.empty((dim, nelements, nto_unit_nodes))
    from_unit_nodes[:] = initial_guess.reshape(-1, 1, 1)

    import modepy as mp
    from_vdm = mp.vandermonde(from_grp.basis(), from_grp.unit_nodes)
    from_inv_t_vdm = la.inv(from_vdm.T)
    from_nfuncs = len(from_grp.basis())

    # (ambient_dim, nelements, nfrom_unit_nodes)
    from_bdry_nodes = (
            # FIXME: This should view-then-transfer (but PyOpenCL doesn't do
            # non-contiguous transfers for now).
            bdry_discr.groups[i_src_grp].view(
                bdry_discr.nodes().get(queue=queue))
            [:, from_bdry_element_indices])

    def apply_map(unit_nodes):
        """Interpolate the source element nodes to *unit_nodes*."""
        # unit_nodes: (dim, nelements, nto_unit_nodes)

        # basis_at_unit_nodes
        basis_at_unit_nodes = np.empty((from_nfuncs, nelements, nto_unit_nodes))

        for i, f in enumerate(from_grp.basis()):
            basis_at_unit_nodes[i] = (
                    f(unit_nodes.reshape(dim, -1))
                    .reshape(nelements, nto_unit_nodes))

        intp_coeffs = np.einsum("fj,jet->fet", from_inv_t_vdm, basis_at_unit_nodes)

        # If we're interpolating 1, we had better get 1 back.
        one_deviation = np.abs(np.sum(intp_coeffs, axis=0) - 1)
        assert (one_deviation < tol).all(), np.max(one_deviation)

        return np.einsum("fet,aef->aet", intp_coeffs, from_bdry_nodes)

    def get_map_jacobian(unit_nodes):
        """Return the derivative of :func:`apply_map` with respect to each
        reference axis, as an array indexed ``[rst_axis, ambient_axis, el, node]``.
        """
        # unit_nodes: (dim, nelements, nto_unit_nodes)

        # basis_at_unit_nodes
        dbasis_at_unit_nodes = np.empty(
                (dim, from_nfuncs, nelements, nto_unit_nodes))

        for i, df in enumerate(from_grp.grad_basis()):
            df_result = df(unit_nodes.reshape(dim, -1))

            for rst_axis, df_r in enumerate(df_result):
                dbasis_at_unit_nodes[rst_axis, i] = (
                        df_r.reshape(nelements, nto_unit_nodes))

        dintp_coeffs = np.einsum(
                "fj,rjet->rfet", from_inv_t_vdm, dbasis_at_unit_nodes)

        return np.einsum("rfet,aef->raet", dintp_coeffs, from_bdry_nodes)

    # {{{ test map applier and jacobian

    if 0:
        u = from_unit_nodes
        f = apply_map(u)
        for h in [1e-1, 1e-2]:
            du = h*np.random.randn(*u.shape)

            f_2 = apply_map(u+du)

            jf = get_map_jacobian(u)

            f2_2 = f + np.einsum("raet,ret->aet", jf, du)

            print(h, la.norm((f_2-f2_2).ravel()))

    # }}}

    # {{{ visualize initial guess

    if 0:
        import matplotlib.pyplot as pt
        guess = apply_map(from_unit_nodes)
        goals = to_bdry_nodes

        from meshmode.discretization.visualization import draw_curve
        draw_curve(bdry_discr)

        pt.plot(guess[0].reshape(-1), guess[1].reshape(-1), "or")
        pt.plot(goals[0].reshape(-1), goals[1].reshape(-1), "og")
        pt.plot(from_bdry_nodes[0].reshape(-1), from_bdry_nodes[1].reshape(-1), "o",
                color="purple")
        pt.show()

    # }}}

    logger.info("make_opposite_face_connection: begin gauss-newton")

    niter = 0
    while True:
        resid = apply_map(from_unit_nodes) - to_bdry_nodes

        df = get_map_jacobian(from_unit_nodes)

        # For the 1D/2D accelerated versions, we'll use the normal
        # equations and Cramer's rule. If you're looking for high-end
        # numerics, look no further than meshmode.

        if dim == 1:
            # A is df.T
            ata = np.einsum("iket,jket->ijet", df, df)
            atb = np.einsum("iket,ket->iet", df, resid)

            df_inv_resid = atb / ata[0, 0]

        elif dim == 2:
            # A is df.T
            ata = np.einsum("iket,jket->ijet", df, df)
            atb = np.einsum("iket,ket->iet", df, resid)

            det = ata[0, 0]*ata[1, 1] - ata[0, 1]*ata[1, 0]

            df_inv_resid = np.empty_like(from_unit_nodes)
            df_inv_resid[0] = 1/det * (ata[1, 1] * atb[0] - ata[1, 0]*atb[1])
            df_inv_resid[1] = 1/det * (-ata[0, 1] * atb[0] + ata[0, 0]*atb[1])

        else:
            # The boundary of a 3D mesh is 2D, so that's the
            # highest-dimensional case we genuinely care about.
            #
            # This stinks, performance-wise, because it's not vectorized.
            # But we'll only hit it for boundaries of 4+D meshes, in which
            # case... good luck. :)
            df_inv_resid = np.empty_like(from_unit_nodes)
            for e in range(nelements):
                for t in range(nto_unit_nodes):
                    df_inv_resid[:, e, t], _, _, _ = \
                            la.lstsq(df[:, :, e, t].T, resid[:, e, t])

        from_unit_nodes = from_unit_nodes - df_inv_resid

        # Note: this is the residual of the iterate *before* the update
        # that was just applied.
        max_resid = np.max(np.abs(resid))
        logger.debug("gauss-newton residual: %g", max_resid)

        if max_resid < tol:
            logger.info("make_opposite_face_connection: gauss-newton: done, "
                    "final residual: %g", max_resid)
            break

        niter += 1
        if niter > 10:
            raise RuntimeError("Gauss-Newton (for finding opposite-face reference "
                    "coordinates) did not converge")

    # }}}

    # {{{ find groups of from_unit_nodes

    def to_dev(ary):
        return cl.array.to_device(queue, ary, array_queue=None)

    from meshmode.discretization.connection import InterpolationBatch

    # The builtin bool is used because the np.bool alias no longer exists
    # in current NumPy releases.
    done_elements = np.zeros(nelements, dtype=bool)
    while True:
        todo_elements, = np.where(~done_elements)
        if len(todo_elements) == 0:
            return

        # Use the first not-yet-handled element as the template and gather
        # every remaining element whose unit nodes match it.
        template_unit_nodes = from_unit_nodes[:, todo_elements[0], :]

        unit_node_dist = np.max(np.max(np.abs(
                from_unit_nodes[:, todo_elements, :]
                -
                template_unit_nodes.reshape(dim, 1, -1)),
                axis=2), axis=0)

        close_els = todo_elements[unit_node_dist < tol]
        done_elements[close_els] = True

        yield InterpolationBatch(
                from_group_index=i_src_grp,
                from_element_indices=to_dev(from_bdry_element_indices[close_els]),
                to_element_indices=to_dev(to_bdry_element_indices[close_els]),
                result_unit_nodes=template_unit_nodes,
                to_element_face=None)

    # }}}
Esempio n. 2
0
def test_perf_data_gathering(ctx_getter, n_arms=5):
    """Evaluate a single-layer potential on a starfish curve while a
    geometry-data inspector collects performance data.

    The inspector returns *False* for every geometry-data object it sees, so
    the actual FMM is skipped. The collected data is appended to a local
    list.
    """
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    # prevent cache 'splosion
    from sympy.core.cache import clear_cache
    clear_cache()

    order = 8

    curve = NArmedStarfish(n_arms, 0.8)
    element_breaks = np.linspace(0, 1, n_arms * 30)
    mesh = make_curve_mesh(curve, element_breaks, order)

    density_var = sym.var("sigma")

    # The kernel doesn't really matter here
    from sumpy.kernel import LaplaceKernel
    kernel = LaplaceKernel(mesh.ambient_dim)

    layer_pot_expr = sym.S(kernel, density_var, qbx_forced_limit=+1)

    from meshmode.discretization import Discretization
    from meshmode.discretization.poly_element import (
            InterpolatoryQuadratureSimplexGroupFactory)
    group_factory = InterpolatoryQuadratureSimplexGroupFactory(order)
    coarse_discr = Discretization(queue.context, mesh, group_factory)

    gathered = []

    def inspect_geo_data(insn, bound_expr, geo_data):
        from pytential.qbx.fmm import assemble_performance_data
        gathered.append(
                assemble_performance_data(geo_data, uses_pde_expansions=True))

        # no need to do the actual FMM
        return False

    from pytential.qbx import QBXLayerPotentialSource
    unrefined_source = QBXLayerPotentialSource(
            coarse_discr, 4*order,
            # qbx order and fmm order don't really matter
            10, fmm_order=10,
            _expansions_in_tree_have_extent=True,
            _expansion_stick_out_factor=0.5,
            geometry_data_inspector=inspect_geo_data,
            target_association_tolerance=1e-10,
            )

    qbx_source, _ = unrefined_source.with_refinement()

    discr = qbx_source.density_discr

    if 0:
        from meshmode.discretization.visualization import draw_curve
        draw_curve(discr)
        import matplotlib.pyplot as plt
        plt.show()

    node_coords = discr.nodes().with_queue(queue)
    density = cl.clmath.sin(10 * node_coords[0])

    bound_op = bind(qbx_source, layer_pot_expr)
    bound_op(queue, sigma=density)
Esempio n. 3
0
    def plot(self,
             draw_circles=False,
             draw_center_numbers=False,
             highlight_centers=None):
        """Plot most of the information contained in a :class:`QBXFMMGeometryData`
        object, for debugging.

        The current matplotlib figure is cleared, drawn into, and saved to
        the relative path ``geodata-stage2-nelem<N>.pdf``, where ``<N>`` is
        the element count of the stage-2 density discretization's mesh.

        :arg draw_circles: If *True*, draw a dotted circle with the expansion
            radius around every center.
        :arg draw_center_numbers: If *True*, label every center with its
            index.
        :arg highlight_centers: If not *None*, an object with which the array of
            centers can be indexed to find the highlighted centers.
        :raises ValueError: if the geometry is not two-dimensional.

        .. note::

            This only works for two-dimensional geometries.
        """

        from pytential import sym
        import matplotlib.pyplot as pt
        pt.clf()

        dims = self.tree().targets.shape[0]
        if dims != 2:
            raise ValueError("only 2-dimensional geometry info can be plotted")

        with cl.CommandQueue(self.cl_context) as queue:
            stage2_density_discr = self.places.get_discretization(
                self.source_dd.geometry, sym.QBX_SOURCE_STAGE2)
            quad_stage2_density_discr = self.places.get_discretization(
                self.source_dd.geometry, sym.QBX_SOURCE_QUAD_STAGE2)
            from meshmode.discretization.visualization import draw_curve
            draw_curve(quad_stage2_density_discr)

            global_flags = self.global_qbx_flags().get(queue=queue)

            tree = self.tree().get(queue=queue)
            from boxtree.visualization import TreePlotter
            tp = TreePlotter(tree)
            tp.draw_tree()

            # {{{ draw centers and circles

            centers = self.flat_centers()
            centers = [centers[0].get(queue), centers[1].get(queue)]
            pt.plot(centers[0][global_flags == 0],
                    centers[1][global_flags == 0],
                    "oc",
                    label="centers needing local qbx")

            if highlight_centers is not None:
                pt.plot(centers[0][highlight_centers],
                        centers[1][highlight_centers],
                        "oc",
                        label="highlighted centers",
                        markersize=15)

            ax = pt.gca()

            if draw_circles:
                radii = self.flat_expansion_radii().get(queue)
                for cx, cy, r in zip(centers[0], centers[1], radii):
                    ax.add_artist(
                        pt.Circle((cx, cy), r, fill=False, ls="dotted", lw=1))

            if draw_center_numbers:
                # Only the two coordinates are zipped here, so exactly two
                # names are unpacked per center.
                for icenter, (cx, cy) in enumerate(zip(centers[0], centers[1])):
                    pt.text(cx,
                            cy,
                            str(icenter),
                            fontsize=8,
                            ha="left",
                            va="center",
                            bbox=dict(facecolor="white", alpha=0.5, lw=0))

            # }}}

            # {{{ draw target-to-center arrows

            ttc = self.user_target_to_center().get(queue)
            tinfo = self.target_info()
            targets = tinfo.targets.get(queue)

            pt.plot(targets[0], targets[1], "+")
            pt.plot(targets[0][ttc == target_state.FAILED],
                    targets[1][ttc == target_state.FAILED],
                    "dr",
                    markersize=15,
                    label="failed targets")

            for itarget in np.where(ttc == target_state.FAILED)[0]:
                pt.text(targets[0][itarget],
                        targets[1][itarget],
                        str(itarget),
                        fontsize=8,
                        ha="left",
                        va="center",
                        bbox=dict(facecolor="white", alpha=0.5, lw=0))

            # Only targets from index self.ncenters onward are connected to
            # their center.
            # NOTE(review): presumably the first ncenters targets are the
            # centers themselves -- confirm against target_info().
            tccount = 0
            checked = 0
            for tx, ty, tcenter in zip(targets[0][self.ncenters:],
                                       targets[1][self.ncenters:],
                                       ttc[self.ncenters:]):
                checked += 1
                # A non-negative entry is counted as "center found" and used
                # as an index into the center arrays.
                if tcenter >= 0:
                    tccount += 1
                    ax.add_artist(
                        pt.Line2D(
                            (tx, centers[0][tcenter]),
                            (ty, centers[1][tcenter]),
                        ))

            logger.info("found a center for %d/%d targets", tccount, checked)

            # }}}

            pt.gca().set_aspect("equal")
            #pt.legend()
            pt.savefig("geodata-stage2-nelem%d.pdf" %
                       stage2_density_discr.mesh.nelements)
Esempio n. 4
0
    def plot(self, draw_circles=False, draw_center_numbers=False,
            highlight_centers=None):
        """Plot most of the information contained in a :class:`QBXFMMGeometryData`
        object, for debugging.

        The current matplotlib figure is cleared, drawn into, and saved to
        the relative path ``geodata-stage2-nelem<N>.pdf``, where ``<N>`` is
        the element count of ``self.lpot_source.stage2_density_discr.mesh``.

        :arg draw_circles: If *True*, draw a dotted circle with the expansion
            radius around every center.
        :arg draw_center_numbers: If *True*, label every center with its
            index.
        :arg highlight_centers: If not *None*, an object with which the array of
            centers can be indexed to find the highlighted centers.
        :raises ValueError: if the geometry is not two-dimensional.

        .. note::

            This only works for two-dimensional geometries.
        """

        import matplotlib.pyplot as pt
        pt.clf()

        dims = self.tree().targets.shape[0]
        if dims != 2:
            raise ValueError("only 2-dimensional geometry info can be plotted")

        with cl.CommandQueue(self.cl_context) as queue:
            from meshmode.discretization.visualization import draw_curve
            draw_curve(self.lpot_source.quad_stage2_density_discr)

            global_flags = self.global_qbx_flags().get(queue=queue)

            tree = self.tree().get(queue=queue)
            from boxtree.visualization import TreePlotter
            tp = TreePlotter(tree)
            tp.draw_tree()

            # {{{ draw centers and circles

            centers = self.centers()
            centers = [
                    centers[0].get(queue),
                    centers[1].get(queue)]
            pt.plot(centers[0][global_flags == 0],
                    centers[1][global_flags == 0], "oc",
                    label="centers needing local qbx")

            if highlight_centers is not None:
                pt.plot(centers[0][highlight_centers],
                        centers[1][highlight_centers], "oc",
                        label="highlighted centers",
                        markersize=15)

            ax = pt.gca()

            if draw_circles:
                radii = self.expansion_radii().get(queue)
                for cx, cy, r in zip(centers[0], centers[1], radii):
                    ax.add_artist(
                            pt.Circle((cx, cy), r, fill=False, ls="dotted", lw=1))

            if draw_center_numbers:
                # Only the two coordinates are zipped here, so exactly two
                # names are unpacked per center.
                for icenter, (cx, cy) in enumerate(zip(centers[0], centers[1])):
                    pt.text(cx, cy,
                            str(icenter), fontsize=8,
                            ha="left", va="center",
                            bbox=dict(facecolor='white', alpha=0.5, lw=0))

            # }}}

            # {{{ draw target-to-center arrows

            ttc = self.user_target_to_center().get(queue)
            tinfo = self.target_info()
            targets = tinfo.targets.get(queue)

            pt.plot(targets[0], targets[1], "+")
            pt.plot(
                    targets[0][ttc == target_state.FAILED],
                    targets[1][ttc == target_state.FAILED],
                    "dr", markersize=15, label="failed targets")

            for itarget in np.where(ttc == target_state.FAILED)[0]:
                pt.text(
                        targets[0][itarget],
                        targets[1][itarget],
                        str(itarget), fontsize=8,
                        ha="left", va="center",
                        bbox=dict(facecolor='white', alpha=0.5, lw=0))

            # Only targets from index self.ncenters onward are connected to
            # their center.
            # NOTE(review): presumably the first ncenters targets are the
            # centers themselves -- confirm against target_info().
            tccount = 0
            checked = 0
            for tx, ty, tcenter in zip(
                    targets[0][self.ncenters:],
                    targets[1][self.ncenters:],
                    ttc[self.ncenters:]):
                checked += 1
                # A non-negative entry is counted as "center found" and used
                # as an index into the center arrays.
                if tcenter >= 0:
                    tccount += 1
                    ax.add_artist(
                            pt.Line2D(
                                (tx, centers[0][tcenter]),
                                (ty, centers[1][tcenter]),
                                ))

            print("found a center for %d/%d targets" % (tccount, checked))

            # }}}

            pt.gca().set_aspect("equal")
            #pt.legend()
            pt.savefig(
                    "geodata-stage2-nelem%d.pdf"
                    % self.lpot_source.stage2_density_discr.mesh.nelements)
Esempio n. 5
0
def _make_cross_face_batches(queue, tgt_bdry_discr, src_bdry_discr, i_tgt_grp,
                             i_src_grp, tgt_bdry_element_indices,
                             src_bdry_element_indices):
    """Yield interpolation batches connecting the given source boundary
    elements to the given target boundary elements.

    For every node of the selected target elements, Gauss-Newton iteration
    finds the reference ("unit") coordinates in the corresponding source
    element whose mapped position matches that node. Elements whose
    resulting unit nodes agree to within the tolerance are then grouped, and
    one batch is yielded per group.

    :arg queue: the command queue used for device-to-host transfers
        (``.get(queue=queue)``) and for uploading the index arrays.
    :arg tgt_bdry_discr: discretization supplying the target nodes.
    :arg src_bdry_discr: discretization supplying the source nodes, group
        and mesh group.
    :arg i_tgt_grp: index of the target group in ``tgt_bdry_discr.groups``.
    :arg i_src_grp: index of the source group in ``src_bdry_discr.groups``.
    :arg tgt_bdry_element_indices: indices of the target elements; entry *i*
        is paired with entry *i* of *src_bdry_element_indices*.
    :arg src_bdry_element_indices: indices of the source elements.
    :raises RuntimeError: if Gauss-Newton has not converged after 11
        iterations.

    This is a generator; it yields ``InterpolationBatch`` instances.
    """

    # FIXME: This should view-then-transfer
    # (but PyOpenCL doesn't do non-contiguous transfers for now).
    tgt_bdry_nodes = (tgt_bdry_discr.groups[i_tgt_grp].view(
        tgt_bdry_discr.nodes().get(queue=queue))[:, tgt_bdry_element_indices])

    # FIXME: This should view-then-transfer
    # (but PyOpenCL doesn't do non-contiguous transfers for now).
    src_bdry_nodes = (src_bdry_discr.groups[i_src_grp].view(
        src_bdry_discr.nodes().get(queue=queue))[:, src_bdry_element_indices])

    # Used both as the Gauss-Newton convergence threshold and as the
    # threshold for considering two sets of unit nodes identical.
    tol = 1e4 * np.finfo(tgt_bdry_nodes.dtype).eps

    src_mesh_grp = src_bdry_discr.mesh.groups[i_src_grp]
    src_grp = src_bdry_discr.groups[i_src_grp]

    dim = src_grp.dim
    # The leading axis (ambient dimension) is not needed by name below.
    _, nelements, ntgt_unit_nodes = tgt_bdry_nodes.shape
    assert tgt_bdry_nodes.shape == src_bdry_nodes.shape

    if nelements == 0:
        # Nothing to connect. Bail out before the residual reduction below,
        # which cannot be taken over a zero-size array.
        return

    # {{{ invert face map (using Gauss-Newton)

    # Start every node at the mean of the reference element's vertices.
    initial_guess = np.mean(src_mesh_grp.vertex_unit_coordinates(), axis=0)
    src_unit_nodes = np.empty((dim, nelements, ntgt_unit_nodes))
    src_unit_nodes[:] = initial_guess.reshape(-1, 1, 1)

    import modepy as mp
    vdm = mp.vandermonde(src_grp.basis(), src_grp.unit_nodes)
    inv_t_vdm = la.inv(vdm.T)
    nsrc_funcs = len(src_grp.basis())

    def apply_map(unit_nodes):
        """Interpolate the source element nodes to *unit_nodes*."""
        # unit_nodes: (dim, nelements, ntgt_unit_nodes)

        # basis_at_unit_nodes
        basis_at_unit_nodes = np.empty(
            (nsrc_funcs, nelements, ntgt_unit_nodes))

        for i, f in enumerate(src_grp.basis()):
            basis_at_unit_nodes[i] = (f(unit_nodes.reshape(dim, -1)).reshape(
                nelements, ntgt_unit_nodes))

        intp_coeffs = np.einsum("fj,jet->fet", inv_t_vdm, basis_at_unit_nodes)

        # If we're interpolating 1, we had better get 1 back.
        one_deviation = np.abs(np.sum(intp_coeffs, axis=0) - 1)
        assert (one_deviation < tol).all(), np.max(one_deviation)

        return np.einsum("fet,aef->aet", intp_coeffs, src_bdry_nodes)

    def get_map_jacobian(unit_nodes):
        """Return the derivative of :func:`apply_map` with respect to each
        reference axis, as an array indexed ``[rst_axis, ambient_axis, el, node]``.
        """
        # unit_nodes: (dim, nelements, ntgt_unit_nodes)

        # basis_at_unit_nodes
        dbasis_at_unit_nodes = np.empty(
            (dim, nsrc_funcs, nelements, ntgt_unit_nodes))

        for i, df in enumerate(src_grp.grad_basis()):
            df_result = df(unit_nodes.reshape(dim, -1))

            for rst_axis, df_r in enumerate(df_result):
                dbasis_at_unit_nodes[rst_axis, i] = (df_r.reshape(
                    nelements, ntgt_unit_nodes))

        dintp_coeffs = np.einsum("fj,rjet->rfet", inv_t_vdm,
                                 dbasis_at_unit_nodes)

        return np.einsum("rfet,aef->raet", dintp_coeffs, src_bdry_nodes)

    # {{{ test map applier and jacobian

    if 0:
        u = src_unit_nodes
        f = apply_map(u)
        for h in [1e-1, 1e-2]:
            du = h * np.random.randn(*u.shape)

            f_2 = apply_map(u + du)

            jf = get_map_jacobian(u)

            f2_2 = f + np.einsum("raet,ret->aet", jf, du)

            print(h, la.norm((f_2 - f2_2).ravel()))

    # }}}

    # {{{ visualize initial guess

    if 0:
        import matplotlib.pyplot as pt
        guess = apply_map(src_unit_nodes)
        goals = tgt_bdry_nodes

        from meshmode.discretization.visualization import draw_curve
        pt.figure(0)
        draw_curve(tgt_bdry_discr)
        pt.figure(1)
        draw_curve(src_bdry_discr)
        pt.figure(2)

        pt.plot(guess[0].reshape(-1), guess[1].reshape(-1), "or")
        pt.plot(goals[0].reshape(-1), goals[1].reshape(-1), "og")
        pt.plot(src_bdry_nodes[0].reshape(-1), src_bdry_nodes[1].reshape(-1),
                "xb")
        pt.show()

    # }}}

    logger.info("make_opposite_face_connection: begin gauss-newton")

    niter = 0
    while True:
        resid = apply_map(src_unit_nodes) - tgt_bdry_nodes

        df = get_map_jacobian(src_unit_nodes)

        # For the 1D/2D accelerated versions, we'll use the normal
        # equations and Cramer's rule. If you're looking for high-end
        # numerics, look no further than meshmode.

        if dim == 1:
            # A is df.T
            ata = np.einsum("iket,jket->ijet", df, df)
            atb = np.einsum("iket,ket->iet", df, resid)

            df_inv_resid = atb / ata[0, 0]

        elif dim == 2:
            # A is df.T
            ata = np.einsum("iket,jket->ijet", df, df)
            atb = np.einsum("iket,ket->iet", df, resid)

            det = ata[0, 0] * ata[1, 1] - ata[0, 1] * ata[1, 0]

            df_inv_resid = np.empty_like(src_unit_nodes)
            df_inv_resid[0] = 1 / det * (ata[1, 1] * atb[0] -
                                         ata[1, 0] * atb[1])
            df_inv_resid[1] = 1 / det * (-ata[0, 1] * atb[0] +
                                         ata[0, 0] * atb[1])

        else:
            # The boundary of a 3D mesh is 2D, so that's the
            # highest-dimensional case we genuinely care about.
            #
            # This stinks, performance-wise, because it's not vectorized.
            # But we'll only hit it for boundaries of 4+D meshes, in which
            # case... good luck. :)
            df_inv_resid = np.empty_like(src_unit_nodes)
            for e in range(nelements):
                for t in range(ntgt_unit_nodes):
                    df_inv_resid[:, e, t], _, _, _ = \
                            la.lstsq(df[:, :, e, t].T, resid[:, e, t])

        src_unit_nodes = src_unit_nodes - df_inv_resid

        # {{{ visualize next guess

        if 0:
            import matplotlib.pyplot as pt
            guess = apply_map(src_unit_nodes)
            goals = tgt_bdry_nodes

            pt.plot(guess[0].reshape(-1), guess[1].reshape(-1), "rx")
            pt.plot(goals[0].reshape(-1), goals[1].reshape(-1), "go")
            pt.show()

        # }}}

        # Note: this is the residual of the iterate *before* the update
        # that was just applied.
        max_resid = np.max(np.abs(resid))
        logger.debug("gauss-newton residual: %g", max_resid)

        if max_resid < tol:
            logger.info("make_opposite_face_connection: gauss-newton: done, "
                        "final residual: %g", max_resid)
            break

        niter += 1
        if niter > 10:
            raise RuntimeError(
                "Gauss-Newton (for finding opposite-face reference "
                "coordinates) did not converge")

    # }}}

    # {{{ find groups of src_unit_nodes

    def to_dev(ary):
        return cl.array.to_device(queue, ary, array_queue=None)

    from meshmode.discretization.connection.direct import InterpolationBatch

    # The builtin bool is used because the np.bool alias no longer exists
    # in current NumPy releases.
    done_elements = np.zeros(nelements, dtype=bool)
    while True:
        todo_elements, = np.where(~done_elements)
        if len(todo_elements) == 0:
            return

        # Use the first not-yet-handled element as the template and gather
        # every remaining element whose unit nodes match it.
        template_unit_nodes = src_unit_nodes[:, todo_elements[0], :]

        unit_node_dist = np.max(
            np.max(
                np.abs(src_unit_nodes[:, todo_elements, :] -
                       template_unit_nodes.reshape(dim, 1, -1)),
                axis=2),
            axis=0)

        close_els = todo_elements[unit_node_dist < tol]
        done_elements[close_els] = True

        yield InterpolationBatch(
            from_group_index=i_src_grp,
            from_element_indices=to_dev(src_bdry_element_indices[close_els]),
            to_element_indices=to_dev(tgt_bdry_element_indices[close_els]),
            result_unit_nodes=template_unit_nodes,
            to_element_face=None)

    # }}}