def _f(self, f, u, X):
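        """Evaluate a basis function `f` at all correspondences `u` against
        the control points `X`.

        Each `u[i]` is split into a segment index `s[i]` and a segment-local
        coordinate `t[i]`; grouping by segment allows each group of rows of
        the result to be computed with a single matrix product against that
        segment's control points.
        """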
        u, s, t = self._u_to_s_t(u)
        (N,) = u.shape

        X = np.atleast_2d(X)
        raise_if_not_shape('X', X, (self.num_control_points, self.dim))

        R = np.empty((N, self.dim), dtype=float)
        for s_, i in groupby(np.argsort(s), key=lambda j: s[j]):
            i = list(i)
            R[i] = np.dot(f(t[i]), X[self._i(s_)])

        return R
    def minimise(self,
                 Y,
                 w,
                 lambda_,
                 u,
                 X,
                 return_all=False,
                 max_num_iterations=100,
                 min_radius=1e-9,
                 max_radius=1e12,
                 initial_radius=1e4):
        """Minimise the sum of squared errors between the uniform B-spline
        specified by `X` and the positions of unstructured data points `Y`.
        The exact expression minimised with respect to `X` and `u` is:

            0.5 * ( sum(w * (Y - M(u, X))**2) + lambda_ * R(X) )

        where `M` is the uniform B-spline position function and `R` is the
        regularisation function (the sum of squared distances between
        adjacent control points).
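
        For an open contour, `R` can be written as (a sketch; a closed
        contour would also include the wrap-around difference):

            R(X) = ((X[1:] - X[:-1])**2).sum()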

        Parameters
        ----------
        Y : float, array_like of shape = (N, dim)
            The matrix of data point positions.

        w : float, array_like of shape = (N, dim)
            The matrix of positive weights applied to each squared residual
            on each dimension.

        lambda_ : float
            The positive float that specifies the amount of regularisation.

        u : float, array_like of shape = (N,)
            The vector of initial contour correspondences. Optimally, `u[i]` is
            the contour coordinate that minimises the weighted squared distance
            between the uniform B-spline and `Y[i]`. Here, only a coarse
            initialisation is (typically) required.

        X : float, array_like of shape = (num_control_points, dim)
            The matrix of initial control point positions.

        return_all : optional, bool
            If True, a tuple is returned of the form
            `(u, X, has_converged, states, n, t)` where:
                `u` is the optimised vector of correspondences;
                `X` is the optimised matrix of control point positions;
                `has_converged` is True if the optimisation terminated by
                    reaching the minimum trust region radius and False
                    otherwise;
                `states` is a list of optimisation states comprising the
                    `u`, `X`, energy, and trust region radius after each
                    successful optimisation step (includes the initialisation);
                `n` is the number of total optimisation steps;
                `t` is the total time taken (measured using `time.time`).
            Otherwise, `minimise` returns `(u, X)`.

        max_num_iterations : optional, int
            The maximum number of optimisation iterations.

        min_radius : optional, float
            The non-negative minimum trust region radius. If the trust region
            radius falls below this value, optimisation terminates.

        max_radius : optional, float
            The non-negative maximum trust region radius.

        initial_radius : optional, float
            The initial non-negative trust region radius.

        Returns
        -------
        See `return_all`.
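
        Examples
        --------
        A minimal sketch of typical use. The instance `solver` and the data
        arrays are hypothetical; only the `minimise` signature is taken from
        this method:

            # `solver` is an already-constructed instance of this class.
            u_opt, X_opt = solver.minimise(Y, w, 0.1, u0, X0)
            u_opt, X_opt, has_converged, states, n, t = solver.minimise(
                Y, w, 0.1, u0, X0, return_all=True)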

        Further Details
        ---------------
        The energy `e` to be minimised can be written as:

            e = 0.5 * (r(z)**2).sum()

        where `z` is the concatenated vector of correspondences `u` and control
        point positions `X` (row first), and `r` is a function which returns
        the vector of concatenated data point and regularisation residuals.
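
        Concretely, in the notation of the implementation below:

            z = r_[u, X.ravel()]    # N + num_control_points * dim entries
            r = r_[ra, rb]          # data residuals, then regularisation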

        Let `de` denote the vector of first derivatives. It is given by:

            de = dot(J(z).T, r(z))

        where `J` is the sparse Jacobian: `J[i, j]` is the first derivative of
        residual `i` with respect to `z[j]`.

        Similarly, using `J` and `r` instead of `J(z)` and `r(z)`, the matrix
        of second derivatives `de2` is given by:

            de2 = dot(J.T, J) + sum(r[i] * H[i])                            (1)

        where `H[i]` is the matrix of second derivatives (the "Hessian") for
        residual `i`.

        In Newton's method, the update `del_z` to minimise `e` is given by:

            del_z = -dot(inv(de2), de)

        If `de2` is not positive definite, then this update is invalid. As an
        alternative, a "damped" version (Levenberg's contribution) can be
        solved instead:

            del_z = -dot(inv(de2 + D), de)                                  (2)

        where `D` is a diagonal matrix with entries `1 / radius` so that
        `de2 + D` is positive definite. For large values of `radius`, the
        contribution of `D` has little effect. For small values, `del_z` tends
        to `-radius * de` (a scaled gradient descent step).
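
        As a dense sketch (illustrative only; the implementation below never
        forms `de2` explicitly):

            del_z = -np.linalg.solve(de2 + np.eye(de.shape[0]) / radius, de)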

        Here, 'dn' (damped Newton) computes `del_z` exactly using (2) and (1)
        and 'lm' (Levenberg-Marquardt) approximates `de2` by ignoring all
        second derivative terms.
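
        That is, for 'lm' the approximation is:

            de2 ~= dot(J.T, J)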

        To efficiently compute (2), the sparsity of the problem is leveraged.
        Since `z = r_[u, X.ravel()]`, and the data residuals are ordered before
        the regularisation residuals, `J` is block-sparse. Deviating from the
        Python-like notation so far:

            J = |E   F|
                |     |
                |0   G|

        where `E` is block-diagonal. Similarly, `H[i]`, where `i` indexes a
        data point residual, is also block-sparse:

            H[i] = |P[i]    Q[i]|
                   |            |
                   |Q[i].T     0|

        where `P[i]` is diagonal. (`H[i]` for regularisation residuals is 0.)

        Therefore, the linear system of (2), ignoring the leading minus sign,
        is of the form:

            |E.T*E + r[i]*P[i] + Da    E.T*F + r[i]*Q[i]|   | dza |   | a |
            |                                           | * |     | = |   |
            |(E.T*F + r[i]*Q[i]).T    F.T*F + G.T*G + Db|   | dzb |   | b |

        where `D` has been split into diagonal sub-blocks `Da` and `Db`,
        `del_z` and `de` have been partitioned into `(dza, dzb)` and `(a, b)`
        respectively, and summation over `i` is implicit.

        Expanding the above equation gives a pair of simultaneous equations in
        `dza` and `dzb`. Eliminating `dza`, it turns out that the only matrix
        inverse in the expression for `dzb` is of the upper left block above.
        That is, the linear system solved for `dzb` is the Schur complement of
        the complete system matrix. Since both `E.T * E` and `P[i]` are
        diagonal, this is trivial. Furthermore, the time taken to compute
        either a damped Newton or LM update is now linear in the number of data
        points.
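
        As a dense numpy sketch of this elimination (illustrative only; `A` is
        the diagonal of the upper left block, `B` the upper right block, `C`
        the lower right block):

            A_inv = 1.0 / A                   # upper left block is diagonal
            S = C - np.dot(B.T * A_inv, B)    # Schur complement
            dzb = np.linalg.solve(S, b - np.dot(B.T, A_inv * a))
            dza = A_inv * (a - np.dot(B, dzb))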
        """
        # Ensure that the dimensions and values of inputs are valid.
        w = np.atleast_2d(w)
        N = w.shape[0]
        raise_if_not_shape('w', w, (N, self._c.dim))
        if np.any(w <= 0.0):
            raise ValueError('w <= 0.0')

        Y = np.atleast_2d(Y)
        raise_if_not_shape('Y', Y, (N, self._c.dim))

        if lambda_ <= 0.0:
            raise ValueError('lambda_ <= 0.0 (= {})'.format(lambda_))

        u = np.atleast_1d(u)
        raise_if_not_shape('u', u, (N, ))
        u = self._c.clip(u)

        X = np.atleast_2d(X)
        raise_if_not_shape('X', X, (self._c.num_control_points, self._c.dim))

        # Set `_Y`, `_w`, and `_lambda` for internal evaluation methods.
        self._Y = Y
        self._w = np.sqrt(w)
        self._lambda = np.sqrt(lambda_)

        # `G` is constant and depends only on `_lambda`.
        G = self._G()

        # Set internal variables for `_accept_step` and `_reject_step`.
        self._min_radius = max(0.0, min_radius)
        self._max_radius = max(self._min_radius, max_radius)

        self._radius = max(self._min_radius,
                           min(initial_radius, self._max_radius))
        self._decrease_factor = 2.0

        # Set `save_state`.
        if return_all:
            states = []

            def save_state(u, X, *args):
                states.append((u.copy(), X.copy()) + args)
        else:

            def save_state(*args):
                pass

        save_state(u, X, self._e(u, X), self._radius)

        # Use `d` for dimension of the problem (convenience).
        d = self._c.dim

        t0 = time()
        i = 0
        update_schur_components, has_converged = True, False
        for i in range(max_num_iterations):
            if self._radius <= self._min_radius:
                # Terminate if the trust region radius is too small.
                has_converged = True
                break

            # Compute a damped Newton or Levenberg-Marquardt step depending on
            # `_solver_type`.
            if update_schur_components:
                # Error and residual components.
                e, (ra, rb, r) = self._e(u, X, return_all=True)

                # First derivatives.
                # The actual E is a block-diagonal matrix of `N` blocks, each
                # of shape `(dim, 1)` (where `N = u.shape[0]`).
                # Here, `E` is a list of length `N`, where `E[i]` is a vector
                # for the `i`th block and is of shape `(dim,)`.
                E, F = self._E(u, X), self._F(u)

                # Set (partially) the Schur diagonal.
                D_EtE_rP = (E * E).sum(axis=1)

                # Set the Schur upper right block.
                EtF_rQ = np.empty((N, F.shape[1]))
                for j in range(N):
                    EtF_rQ[j] = np.dot(E[j], F[d * j:d * (j + 1)])

                # For damped Newton, add the second and mixed derivative terms
                # to `D_EtE_rP` and `EtF_rQ`.
                if self._solver_type == 'dn':
                    # Second derivatives.
                    # `P` has the same dimensions as `E`.
                    P, Q = self._P(u, X), self._Q(u)

                    D_EtE_rP += (P * ra.reshape(-1, d)).sum(axis=1)

                    for j in range(N):
                        EtF_rQ[j] += np.dot(ra[d * j:d * (j + 1)],
                                            Q[d * j:d * (j + 1)])

                # Set the Schur lower left block.
                FtE_rQ = EtF_rQ.T

                # Set (partially) the Schur lower right block.
                S0 = np.dot(F.T, F) + np.dot(G.T, G)

                # Set the Schur right-hand side components (a = E.T * ra).
                a = (E * ra.reshape(-1, d)).sum(axis=1)
                b = np.dot(F.T, ra) + np.dot(G.T, rb)

            # `D` is the elementwise inverse of the complete (damped) upper
            # left block diagonal.
            D = 1.0 / (D_EtE_rP + 1.0 / self._radius)

            # Solve the Schur reduced system for `delta_u` and `delta_X`.
            S = (S0 + np.diag([1.0 / self._radius] * S0.shape[0]) -
                 np.dot(FtE_rQ, D[:, np.newaxis] * EtF_rQ))
            try:
                c_and_lower = scipy.linalg.cho_factor(S)
            except scipy.linalg.LinAlgError:
                # Step is invalid.
                self._reject_step()
                update_schur_components = False
                continue

            t = b - np.dot(FtE_rQ, D * a)
            v1 = scipy.linalg.cho_solve(c_and_lower, t)
            v0 = D * (a - np.dot(EtF_rQ, v1))
            delta_u = -v0
            delta_X = -v1.reshape(-1, d)

            # Evaluate the decrease in energy expected under the quadratic
            # approximation.
            # For `solver_type == 'lm'`, `D_EtE_rP` and `EtF_rQ` do not
            # contain the second and mixed derivative terms, so the following
            # is valid, although it could be done (slightly) more efficiently.
            Jdelta = np.r_[(E * delta_u[:, np.newaxis]).ravel() +
                           np.dot(F, delta_X.ravel()),
                           np.dot(G, delta_X.ravel())]

            Hdelta = np.r_[D_EtE_rP * delta_u +
                           np.dot(EtF_rQ, delta_X.ravel()),
                           np.dot(EtF_rQ.T, delta_u) +
                           np.dot(S0, delta_X.ravel())]
            model_e_decrease = -(np.dot(r, Jdelta) + 0.5 * np.dot(
                np.r_[delta_u, delta_X.ravel()], Hdelta))
            assert model_e_decrease >= 0.0

            # Evaluate the updated coordinates `u1` and control points `X1`.
            u1 = self._c.clip(u + delta_u)
            X1 = X + delta_X

            # Accept the update if the energy has decreased and reject it
            # otherwise. Also update the trust region radius depending on how
            # well the quadratic approximation modelled the change in energy.
            e1 = self._e(u1, X1)
            step_quality = (e - e1) / model_e_decrease
            if step_quality > 0:
                save_state(u1, X1, e1, self._radius)

                self._accept_step(step_quality)
                e, u, X = e1, u1, X1
                update_schur_components = True
            else:
                self._reject_step()
                update_schur_components = False

        t1 = time()

        return ((u, X, has_converged, states, i, t1 - t0) if return_all else
                (u, X))