Example #1
    def solve(self, X0, B, mmv, max_iter, callback=None):
        t_start = time.time()

        if X0 is None:
            R = copy_same_stride(B)
            X = create_same_stride(B.size(), B, B.dtype, B.device)
            X.fill_(0.0)
        else:
            R = B - mmv(X0)
            X = X0

        m_eps = self.params.cg_epsilon(X.dtype)

        P = R
        # noinspection PyArgumentList
        Rsold = torch.sum(R.pow(2), dim=0)

        e_train = time.time() - t_start

        for i in range(max_iter):
            with TicToc("Chol Iter", debug=False):  # TODO: FIXME
                t_start = time.time()
                AP = mmv(P)
                # noinspection PyArgumentList
                alpha = Rsold / (torch.sum(P * AP, dim=0) + m_eps)
                X.addmm_(P, torch.diag(alpha))

                if (i + 1) % self.params.cg_full_gradient_every == 0:
                    if X.is_cuda:
                        # addmm_ may not be finished yet causing mmv to get stale inputs.
                        torch.cuda.synchronize()
                    R = B - mmv(X)
                else:
                    R = R - torch.mm(AP, torch.diag(alpha))
                    # R.addmm_(mat1=AP, mat2=torch.diag(alpha), alpha=-1.0)

                # noinspection PyArgumentList
                Rsnew = torch.sum(R.pow(2), dim=0)
                if Rsnew.abs().max().sqrt() < self.params.cg_tolerance:
                    print("Stopping conjugate gradient descent at "
                          "iteration %d. Solution has converged." % (i + 1))
                    break

                P = R + torch.mm(P, torch.diag(Rsnew / (Rsold + m_eps)))
                if P.is_cuda:
                    # P must be synced so that it's correct for mmv in next iter.
                    torch.cuda.synchronize()
                Rsold = Rsnew

                e_iter = time.time() - t_start
                e_train += e_iter
            with TicToc("Chol callback", debug=False):
                if callback is not None:
                    callback(i + 1, X, e_train)

        return X
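
The `solve` method above runs (block) conjugate gradient for several right-hand sides at once: `mmv` is a closure applying the linear operator, and each column of `B` gets its own step size `alpha`. As a point of reference, here is a minimal, self-contained sketch of the same basic iteration in plain PyTorch; the names (`cg_solve`, `tol`, `eps`) are illustrative and no Falkon helpers or timing utilities are used.

import torch

def cg_solve(mmv, B, max_iter=100, tol=1e-7, eps=1e-12):
    # Solve A @ X = B column by column, where mmv(X) computes A @ X.
    X = torch.zeros_like(B)
    R = B.clone()                   # residual B - A @ X (X starts at zero)
    P = R.clone()                   # search directions
    Rsold = (R * R).sum(dim=0)      # squared residual norm, per column
    for i in range(max_iter):
        AP = mmv(P)
        alpha = Rsold / ((P * AP).sum(dim=0) + eps)
        X += P * alpha              # broadcasting scales column j by alpha[j]
        R -= AP * alpha
        Rsnew = (R * R).sum(dim=0)
        if Rsnew.max().sqrt() < tol:
            break
        P = R + P * (Rsnew / (Rsold + eps))
        Rsold = Rsnew
    return X

# Usage: solve against a small, well-conditioned SPD matrix and check the result.
torch.manual_seed(0)
M = torch.randn(50, 50, dtype=torch.float64)
A = M @ M.T + 50 * torch.eye(50, dtype=torch.float64)
B = torch.randn(50, 3, dtype=torch.float64)
X = cg_solve(lambda v: A @ v, B)
print(torch.allclose(A @ X, B, atol=1e-6))  # expect True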
Example #2
    def solve(self, X0, B, mmv, max_iter, callback=None):
        t_start = time.time()

        if X0 is None:
            R = copy_same_stride(B)
            X = create_same_stride(B.size(), B, B.dtype, B.device)
            X.fill_(0.0)
        else:
            R = B - mmv(X0)
            X = X0

        m_eps = self.params.cg_epsilon(X.dtype)

        P = R
        Rsold = torch.sum(R.pow(2), dim=0)

        e_train = time.time() - t_start

        for i in range(max_iter):
            with TicToc("Chol Iter", debug=False):
                t_start = time.time()
                AP = mmv(P)
                alpha = Rsold / (torch.sum(P * AP, dim=0) + m_eps)
                X.addmm_(P, torch.diag(alpha))

                if (i + 1) % self.params.cg_full_gradient_every == 0:
                    R = B - mmv(X)
                else:
                    R = R - torch.mm(AP, torch.diag(alpha))
                    # R.addmm_(mat1=AP, mat2=torch.diag(alpha), alpha=-1.0)

                Rsnew = torch.sum(R.pow(2), dim=0)
                if Rsnew.abs().max().sqrt() < self.params.cg_tolerance:
                    print("Stopping conjugate gradient descent at "
                          "iteration %d. Solution has converged." % (i + 1))
                    break

                P = R + torch.mm(P, torch.diag(Rsnew / (Rsold + m_eps)))
                Rsold = Rsnew

                e_iter = time.time() - t_start
                e_train += e_iter
            with TicToc("Chol callback", debug=False):
                if callback is not None:
                    callback(i + 1, X, e_train)

        return X
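
Both examples apply the per-column step sizes by right-multiplying with a diagonal matrix, e.g. `torch.mm(P, torch.diag(alpha))` and `X.addmm_(P, torch.diag(alpha))`. A quick standalone check (illustrative snippet, not part of the library) that this is equivalent to broadcasting a row of step sizes over the columns:

import torch

P = torch.randn(6, 3)      # 6 rows, 3 right-hand sides
alpha = torch.rand(3)      # one step size per column

scaled_diag = torch.mm(P, torch.diag(alpha))   # as written in the examples above
scaled_bcast = P * alpha                       # equivalent broadcasting form
print(torch.allclose(scaled_diag, scaled_bcast))  # True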
Example #3
def gpu_lauum(A,
              upper,
              overwrite=True,
              write_opposite=False,
              opt: Optional[FalkonOptions] = None):
    """
    Parameters
    -----------
    A : torch.Tensor
        (N x N) triangular matrix, stored in the upper triangle of `A` if
        `upper` is True and in the lower triangle otherwise. The LAUUM
        operation computes the product of this triangular factor with its
        own transpose.
    upper : bool
        Whether the triangular factor occupies the upper or the lower
        triangle of `A`.
    overwrite : bool
        Whether to overwrite matrix A or to output the result in a new
        buffer.
    write_opposite : bool
        Whether to write the result into the opposite triangle of the
        output matrix instead of the triangle holding the input factor.
    opt : FalkonOptions or None
        Options for the out-of-core implementation; if None, default
        options are used.

    Returns
    -------
    out : torch.Tensor
        A (N x N) tensor. This will share the same memory as the input tensor `A` if `overwrite`
        is set to True, otherwise it will be a newly allocated tensor.
    """
    if opt is None:
        opt = FalkonOptions()
    if not overwrite:
        A = copy_same_stride(A, pin_memory=True)
    # TODO: There is a helper function in mmv_ops for this.
    gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
    for g in gpu_info:
        g.actual_free_mem = min((g.free_memory - 300 * 2**20) * 0.95,
                                opt.max_gpu_mem * 0.95)

    # The parallel runner only handles the lower triangle of C- or F-contiguous arrays.
    # By transposing as necessary, it can handle every combination of inputs.
    transposed = False
    # noinspection PyUnresolvedReferences
    if upper:
        A = A.T
        transposed = True

    # The parallel runner chooses its implementation based on the contiguity pattern of the input.
    _parallel_lauum_runner(A, write_opposite, gpu_info)

    if transposed:
        A = A.T
    return A
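
A hedged usage sketch for `gpu_lauum` (illustrative size; assumes a CUDA-capable machine with Falkon installed, and that `FalkonOptions` is importable from the top-level `falkon` package as in the rest of the library):

import torch
from falkon import FalkonOptions

n = 2000
L = torch.tril(torch.randn(n, n, dtype=torch.float64))  # lower-triangular factor

# overwrite=False leaves L untouched and returns the product in a new buffer.
out = gpu_lauum(L, upper=False, overwrite=False, opt=FalkonOptions())
print(out.shape)  # torch.Size([2000, 2000])

Following the LAPACK ?lauum convention, the result for a lower-triangular input is the product of the factor with its own transpose; verify against a reference implementation if the exact ordering matters for your use case.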
Example #4
def gpu_cholesky(A: torch.Tensor, upper: bool, clean: bool, overwrite: bool,
                 opt: FalkonOptions) -> torch.Tensor:
    """
    Parameters
    -----------
    A : torch.Tensor
        2D positive-definite matrix of size (n x n) that will be factorized as
        A = U.T @ U (if `upper` is True) or A = L @ L.T if `upper`
        is False.
    upper : bool
        Whether the triangle which should be factorized is the upper or lower of `A`.
    clean : bool
        Whether the "other" triangle of the output matrix (the one that
        does not contain the factorization) will be filled with zeros or
        not.
    overwrite : bool
        Whether to overwrite matrix A or to output the result in a new
        buffer.
    opt : FalkonOptions
        Options forwarded for block calculation, and other knobs in the out-of-core
        parallel POTRF implementation. Useful options are the ones defined in
        :class:`~falkon.options.CholeskyOptions`.

    Returns
    -------
    out : torch.Tensor
        The (N x N) matrix containing the Cholesky factor (in its lower or upper
        triangle, see Notes). It shares memory with the input `A` if `overwrite`
        is True, otherwise it is a newly allocated tensor.

    Notes
    ------
    The factorization is always computed as the 'lower' variant; however, the
    factor may end up in the upper triangle of the output matrix if A is not
    Fortran-contiguous to begin with.
    """
    # Handle 'overwrite' option immediately so that its usage is reflected in memory
    # availability (in case A is on GPU).
    if not overwrite:
        # We could change the stride to be more favorable to the POTRF requirements
        # but it gets complicated. We leave such decisions to the user!
        A = copy_same_stride(A, pin_memory=True)

    # Decide which version of the algorithm to run: in-core or parallel (out-of-core).
    # (Note that the original OOC version is never run from here.)

    # Determine GPU free RAM
    gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
    for g in gpu_info:
        g.actual_free_mem = min((g.free_memory - 300 * 2**20) * 0.95,
                                opt.max_gpu_mem * 0.95)

    if A.is_cuda:
        try:
            device = [d for d in gpu_info if d.Id == A.device.index][0]
        except IndexError:
            # This should never happen!
            raise RuntimeError("Device of matrix A (%s) is not recognized" %
                               (A.device))
    else:
        device = max(gpu_info, key=lambda g: g.actual_free_mem)
    ic = can_do_ic(A, device) and not opt.chol_force_ooc
    if opt.chol_force_in_core and not ic:
        raise RuntimeError(
            "Cannot run in-core POTRF but `chol_force_in_core` was specified.")

    f_order = is_f_contig(A)
    transposed = False
    if not f_order:
        A = A.T
        upper = not upper
        transposed = True
    # Now A is always in f_order. So we can only allow upper=False (ooc)
    if upper:
        # Can do only in-core!
        if not ic:
            raise ValueError(
                "GPU POTRF is only implemented on the "
                "lower triangle for Fortran-ordered matrices (or on the upper "
                "triangle for C-ordered matrices)")
    if not ic and A.is_cuda:
        _msg = "Cannot run out-of-core POTRF on CUDA matrix 'A'."
        if opt.chol_force_ooc:
            _msg += " Set the `chol_force_ooc` option to `False` in to allow in-core POTRF."
        raise ValueError(_msg)

    # Handle different implementations for POTRF: in-core and out-of-core
    if ic:
        if opt.debug:
            print("Using in-core POTRF")
        _ic_cholesky(A,
                     upper,
                     device=device.Id,
                     cusolver_handle=initialization.cusolver_handle(device.Id))
    else:
        if opt.debug:
            print("Using parallel POTRF")
        _parallel_potrf_runner(A, opt, gpu_info)

    # Perform cleaning of the 'other side' of the matrix
    if clean:
        la_helpers.zero_triang(A, upper=not upper)
    # Undo previous matrix transformations
    if transposed:
        A = A.T

    return A
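
A hedged usage sketch for `gpu_cholesky` (illustrative size; assumes a CUDA device and a Falkon installation). Making the input Fortran-contiguous and passing `upper=False` keeps the factor in the lower triangle, so it can be compared directly against PyTorch's own Cholesky:

import torch
from falkon import FalkonOptions

n = 1000
M = torch.randn(n, n, dtype=torch.float64)
A = M @ M.T + n * torch.eye(n, dtype=torch.float64)  # positive-definite matrix
A_f = A.T.contiguous().T                             # Fortran-contiguous copy of A

L = gpu_cholesky(A_f, upper=False, clean=True, overwrite=False,
                 opt=FalkonOptions())
# With clean=True the upper triangle is zeroed, so L is a proper lower-triangular
# Cholesky factor; expect True up to floating-point error.
print(torch.allclose(L, torch.linalg.cholesky(A)))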