Example 1
def cuda_trsm(A: torch.Tensor,
              v: torch.Tensor,
              alpha: float,
              lower: int,
              transpose: int,
              stream: Optional[torch.cuda.Stream] = None) -> torch.Tensor:
    if not is_f_contig(A, strict=False):
        raise ValueError("A must be f-contiguous for CUDA TRSM to work.")
    if not check_same_device(A, v):
        raise ValueError("A and v must be on the same CUDA device.")
    if not A.is_cuda:
        raise ValueError("A and v must be CUDA tensors!")

    device = A.device
    s = stream
    if stream is None:
        s = torch.cuda.current_stream(device=device)
    cublas_hdl = cublas_handle(device.index)
    trsm_fn = choose_fn(A.dtype, cublasDtrsm, cublasStrsm, "TRSM")

    # noinspection PyProtectedMember
    with torch.cuda.device(device), torch.cuda.stream(s), cublas_stream(
            cublas_hdl, s._as_parameter_):
        # Deal with copying v, which may not be F-contiguous.
        vF = create_fortran(v.size(), v.dtype, device)
        if is_f_contig(v, strict=False):
            # We can just make a copy of v
            vF.copy_(v)
            # The sync is necessary here for correctness. Not sure why! TODO: Is it still needed?
            s.synchronize()
        else:
            vF = cuda_transpose(input=v, output=vF.T).T

        uplo = 'L' if lower else 'U'
        trans = 'T' if transpose else 'N'
        trsm_fn(cublas_hdl,
                side='L',
                uplo=uplo,
                trans=trans,
                diag='N',
                m=vF.shape[0],
                n=vF.shape[1],
                alpha=alpha,
                A=A.data_ptr(),
                lda=A.stride(1),
                B=vF.data_ptr(),
                ldb=vF.stride(1))
        if is_f_contig(v, strict=False):
            vout = vF
        else:
            vout = create_C(v.size(), v.dtype, device)
            vout = cuda_transpose(input=vF, output=vout.T).T
    return vout
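
A minimal usage sketch (an assumption of this sketch is that falkon's CUDA backend has already been initialized, so that `cublas_handle` is available). Since TRSM solves op(A) X = alpha * v, the result should match PyTorch's triangular solver:

import torch

# Hypothetical usage: solve A x = v for a well-conditioned lower-triangular CUDA matrix A.
n, k = 1024, 8
A = torch.randn(n, n, dtype=torch.float64, device='cuda').tril()
A += n * torch.eye(n, dtype=torch.float64, device='cuda')   # keep it well conditioned
A_f = A.T.contiguous().T                                     # F-contiguous copy, as required
v = torch.randn(n, k, dtype=torch.float64, device='cuda')

x = cuda_trsm(A_f, v, alpha=1.0, lower=1, transpose=0)
x_ref = torch.linalg.solve_triangular(A, v, upper=False)
assert torch.allclose(x, x_ref)
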
Example 2
def _sparse_matmul_cuda(A: SparseTensor, B: SparseTensor, out: torch.Tensor):
    """
    Typically D is very large and since `B` must be in CSR format, memory usage will be quite high.

    Parameters
    ----------
    A : SparseTensor
        N x D :class:`SparseTensor`. Must be in CSR format.
    B : SparseTensor
        D x M :class:`SparseTensor`. Must be in CSR format.
    out : torch.Tensor
        Dense N x M output tensor. Must be F-contiguous (column-contiguous)

    Notes
    ------
    This function runs in two steps:
    sparse*sparse->sparse multiplication and conversion of the output
    sparse matrix to a dense matrix.
    """
    from falkon.sparse.sparse_helpers import spspmm, csr2dense

    if not A.is_csr:
        raise ValueError("A must be CSR matrix")
    if not B.is_csr:
        raise ValueError("B must be CSR matrix")
    if not is_f_contig(out, strict=False):
        raise ValueError("out must be F-contiguous")

    # 1. MatMul
    out_indexptr, out_index, out_data = spspmm(A.indexptr, A.index, A.data,
                                               B.indexptr, B.index, B.data,
                                               A.shape[1])
    # 2. Convert to dense
    out = csr2dense(out_indexptr, out_index, out_data, out)
    return out
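
A CPU-only sketch of the same two-step structure (sparse @ sparse -> sparse, then densify), with scipy standing in for the `spspmm`/`csr2dense` helpers used above:

import numpy as np
import scipy.sparse as sp

A = sp.random(6, 10, density=0.3, format="csr")   # N x D, CSR
B = sp.random(10, 4, density=0.3, format="csr")   # D x M, CSR
C = A @ B                                         # step 1: sparse * sparse -> sparse (CSR)
out = np.asfortranarray(C.toarray())              # step 2: dense, column-contiguous result
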
Example 3
def cuda_trsm(A: torch.Tensor, v: torch.Tensor, alpha: float, lower: int,
              transpose: int) -> torch.Tensor:
    if not is_f_contig(A, strict=False):
        raise ValueError("A must be f-contiguous for CUDA TRSM to work.")
    if not check_same_device(A, v):
        raise ValueError("A and v must be on the same CUDA device.")
    if not A.is_cuda:
        raise ValueError("A and v must be CUDA tensors!")

    s = torch.cuda.Stream(device=A.device)
    cublas_hdl = cublas_handle(A.device.index)
    trsm_fn = choose_fn(A.dtype, cublasDtrsm, cublasStrsm, "TRSM")

    with torch.cuda.device(A.device), torch.cuda.stream(s), cublas_stream(
            cublas_hdl, s._as_parameter_):
        # Deal with copying v, which may not be F-contiguous.
        vF = create_fortran(v.size(), v.dtype, v.device)
        if is_f_contig(v, strict=False):
            # We can just make a copy of v
            vF.copy_(v)
        else:
            vF = cuda_transpose(input=v, output=vF.T).T

        uplo = 'L' if lower else 'U'
        trans = 'T' if transpose else 'N'
        trsm_fn(cublas_hdl,
                side='L',
                uplo=uplo,
                trans=trans,
                diag='N',
                m=vF.shape[0],
                n=vF.shape[1],
                alpha=alpha,
                A=A.data_ptr(),
                lda=A.stride(1),
                B=vF.data_ptr(),
                ldb=vF.stride(1))
        if not is_f_contig(v, strict=False):
            vout = create_C(v.size(), v.dtype, v.device)
            vout = cuda_transpose(input=vF, output=vout.T).T
        else:
            vout = vF
        s.synchronize()
    return vout
Example 4
    def init(self, X: Union[torch.Tensor, SparseTensor]):
        """Initialize the preconditioner matrix.

        This method must be called before the preconditioner can be used.

        Parameters
        ----------
        X : MxD tensor
            The matrix of Nystroem centers
        """
        dtype = X.dtype
        eps = self.params.pc_epsilon(X.dtype)

        M = X.size(0)

        with TicToc("Kernel", debug=self.params.debug):
            if isinstance(X, torch.Tensor):
                C = create_same_stride((M, M), X, dtype=dtype, device='cpu',
                                       pin_memory=self._use_cuda)
            else:  # If sparse tensor we need fortran for kernel calculation
                C = create_fortran((M, M), dtype=dtype, device='cpu', pin_memory=self._use_cuda)
            self.kernel(X, X, out=C, opt=self.params)
        self.fC = C.numpy()
        if not is_f_contig(C):
            self.fC = self.fC.T

        with TicToc("Cholesky 1", debug=self.params.debug):
            # Compute T: lower(fC) = T.T
            inplace_add_diag(self.fC, eps * M)
            self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                    use_cuda=self._use_cuda, opt=self.params)
            # Save the diagonal which will be overwritten when computing A
            self.dT = C.diag()

        with TicToc("Copy triangular", debug=self.params.debug):
            # Copy lower(fC) to upper(fC):  upper(fC) = T.
            copy_triang(self.fC, upper=False)

        if self._use_cuda:
            with TicToc("LAUUM", debug=self.params.debug):
                # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
                self.fC = lauum_wrapper(self.fC, upper=True, use_cuda=self._use_cuda, opt=self.params)
        else:
            with TicToc("LAUUM", debug=self.params.debug):
                # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
                self.fC = lauum_wrapper(self.fC, upper=False, use_cuda=self._use_cuda, opt=self.params)

        with TicToc("Cholesky 2", debug=self.params.debug):
            # lower(fC) = 1/M * T@T.T
            self.fC = mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / M)
            # lower(fC) = 1/M * T@T.T + lambda * I
            inplace_add_diag(self.fC, self._lambda)
            # Cholesky on lower(fC) : lower(fC) = A.T
            self.fC = potrf_wrapper(self.fC, clean=False, upper=False,
                                    use_cuda=self._use_cuda, opt=self.params)
            self.dA = C.diag()
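
Read together, the in-code comments trace the following computation (a sketch in math notation; K_MM is the kernel matrix on the Nystroem centers, ε is `pc_epsilon(dtype)`, λ is `self._lambda`, and T, A are upper triangular with their transposes stored in lower(fC)):

    T^\top T = K_{MM} + \varepsilon M I
    A^\top A = \tfrac{1}{M}\, T T^\top + \lambda I

The diagonals of T and A are saved in `self.dT` and `self.dA` before being overwritten by the subsequent steps.
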
Example 5
    def select(
            self, X: _tensor_type, Y: Union[torch.Tensor, None],
            M: int) -> Union[_tensor_type, Tuple[_tensor_type, torch.Tensor]]:
        """Select M rows from 2D array `X`, preserving the memory order of `X`.
        """
        N = X.size(0)
        if M > N:
            warnings.warn("Number of centers M greater than the "
                          "number of data-points. Setting M to %d" % (N))
            M = N
        idx = self.random_gen.choice(N, size=M, replace=False)

        if isinstance(X, SparseTensor):
            X = X.to_scipy()
            centers = X[idx, :].copy()
            Xc = SparseTensor.from_scipy(centers)
        else:
            Xnp = X.numpy()  # work on np array
            if is_f_contig(X):
                order = 'F'
            else:
                order = 'C'
            Xc_np = np.empty((M, Xnp.shape[1]), dtype=Xnp.dtype, order=order)
            Xc = torch.from_numpy(
                np.take(Xnp, idx, axis=0, out=Xc_np, mode='wrap'))

        if Y is not None:
            Ynp = Y.numpy()  # work on np array
            if is_f_contig(Y):
                order = 'F'
            else:
                order = 'C'
            Yc_np = np.empty((M, Ynp.shape[1]), dtype=Ynp.dtype, order=order)
            Yc = torch.from_numpy(
                np.take(Ynp, idx, axis=0, out=Yc_np, mode='wrap'))
            return Xc, Yc
        return Xc
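
A small numpy-only sketch of the row-selection trick used in the dense branch: sample M indices without replacement and gather them into a buffer that preserves the source's memory order.

import numpy as np

rng = np.random.default_rng(0)
X = np.asfortranarray(rng.standard_normal((100, 5)))        # F-ordered data
idx = rng.choice(X.shape[0], size=10, replace=False)
Xc = np.empty((10, X.shape[1]), dtype=X.dtype, order='F')   # same memory order as X
np.take(X, idx, axis=0, out=Xc, mode='wrap')
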
Example 6
def to_c_contig(tensor: Optional[torch.Tensor],
                name: str = "",
                warn: bool = False) -> Optional[torch.Tensor]:
    warning_text = (
        "Input '%s' is F-contiguous; to ensure KeOps compatibility, C-contiguous inputs "
        "are necessary. The data will be copied to change its order. To avoid this "
        "unnecessary copy, either disable KeOps (passing `keops_active='no'`) or make "
        "the input tensors C-contiguous.")
    if tensor is not None and is_f_contig(tensor):
        if warn:
            warnings.warn(warning_text % name)
        orig_device = tensor.device
        return torch.from_numpy(np.array(tensor.cpu().numpy(),
                                         order="C")).to(device=orig_device)
    return tensor
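
For context, a torch-only sketch of what "F-contiguous" means here: a 2D tensor whose transpose is C-contiguous. `to_c_contig` copies such inputs to C order; `Tensor.contiguous()` plays the same role below, purely for illustration.

import torch

a = torch.randn(4, 3).T        # shape (3, 4), strides (1, 3)
assert not a.is_contiguous()   # not C-contiguous ...
assert a.T.is_contiguous()     # ... but its transpose is, i.e. `a` is F-contiguous
b = a.contiguous()             # C-contiguous copy with the same values
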
Example 7
def _generic_fmm(proc_idx, queue, device_id):
    # Unpack the function arguments
    a: ArgsFmm = queue.get()
    X1: torch.Tensor = a.X1
    X2: torch.Tensor = a.X2
    cuda_inputs = X1.is_cuda
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem
    num_streams = a.num_streams

    # flags and local variables
    change_dtype = gpu_dtype != X1.dtype
    X1_equal_X2 = _gpu_tns_same_memory(X1, X2)
    use_gpu_bufs = change_dtype or not cuda_inputs
    stride = "F" if is_f_contig(out, strict=True) else "C"
    j_iter = 0
    dts = sizeof_dtype(gpu_dtype)
    tc_device = torch.device('cuda:%d' % (int(device_id)))
    avail_mem = max_mem / dts

    # Choose block sizes n, m such that we won't run out of GPU memory
    ntot, d = X1.shape
    mtot = X2.shape[0]
    extra_mem = kernel.extra_mem()
    if cuda_inputs and not change_dtype:
        # No allocation will be performed by us. Only in-kernel stuff.
        n, m = select_dim_over_nm(max_n=ntot,
                                  max_m=mtot,
                                  d=d,
                                  coef_nd=extra_mem.get('nd', 0),
                                  coef_md=extra_mem.get('md', 0),
                                  coef_nm=extra_mem.get('nm', 0),
                                  coef_n=extra_mem.get('n', 0),
                                  coef_m=extra_mem.get('m', 0),
                                  rest=extra_mem.get('d', 0),
                                  max_mem=avail_mem)
    else:
        n, m = select_dim_over_nm(
            max_n=ntot,
            max_m=mtot,
            d=d,
            coef_nd=num_streams * (extra_mem.get('nd', 0) + 1),
            coef_md=num_streams * (extra_mem.get('md', 0) + 1),
            coef_nm=num_streams * (extra_mem.get('nm', 0) + 1),
            coef_n=extra_mem.get('n', 0),
            coef_m=extra_mem.get('m', 0),
            rest=extra_mem.get('d', 0),
            max_mem=avail_mem)

    # Create streams
    streams = [tcd.Stream(device=tc_device) for _ in range(num_streams)]

    # Create buffers
    if use_gpu_bufs:
        gX1 = create_same_stride((n, d), X1, gpu_dtype, tc_device)
        gX2_list = [
            create_same_stride((m, d), X2, gpu_dtype, tc_device)
            for _ in range(num_streams)
        ]
        gout_list = [
            create_same_stride((n, m), out, gpu_dtype, tc_device)
            for _ in range(num_streams)
        ]
    if not cuda_inputs:
        cpu_buf_list = [
            create_same_stride((n, m), out, gpu_dtype, 'cpu', pin_memory=True)
            for _ in range(num_streams)
        ]

    # Define helpers for the copy-back operations (from cpu_buf to output)
    copy_ops = [None] * num_streams

    def wrap_copy_op(stream_idx):
        if copy_ops[stream_idx] is not None:
            copy_ops[stream_idx]()
            copy_ops[stream_idx] = None

    def do_copy_op(output, buf, i_, ic_, j_, jc_):
        # This function will also do the type conversion
        output[i_:i_ + ic_, j_:j_ + jc_].copy_(buf[:ic_, :jc_])

    # Kernel computation begin
    with tcd.device(tc_device):
        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            with tcd.stream(streams[j_iter % len(streams)]):
                X1_chunk = X1.narrow(0, i, ic)
                if use_gpu_bufs:
                    cur_gX1 = gX1.narrow(0, 0, ic)
                    cur_gX1.copy_(X1_chunk, non_blocking=True)
                else:
                    cur_gX1 = X1_chunk

            for j in range(0, mtot, m):
                jc = min(m, mtot - j)
                # Choose the buffers for this inner iteration
                stream_id = j_iter % len(streams)
                stream = streams[stream_id]
                if use_gpu_bufs:
                    gX2 = gX2_list[stream_id]
                    gout = gout_list[stream_id]
                if not cuda_inputs:
                    cpu_buf = cpu_buf_list[stream_id]

                # Sync for buffers we must use now (e.g. 2 previous iters)
                with tcd.stream(stream):  # Inner-loop
                    stream.synchronize()
                    wrap_copy_op(stream_id)

                    if X1_equal_X2 and j < i:  # Shortcut for symmetric kernels
                        jc = min(m, mtot - j)
                        out[i:i + ic, j:j + jc].copy_(out[j:j + jc,
                                                          i:i + ic].T,
                                                      non_blocking=True)
                        j_iter += 1
                        continue

                    # Copy (CPU->GPU)
                    X2_chunk = X2.narrow(0, j, jc)
                    if use_gpu_bufs:
                        cur_gX2 = gX2.narrow(0, 0, jc)
                        cur_gX2.copy_(X2_chunk, non_blocking=True)
                    else:
                        cur_gX2 = X2_chunk

                    if use_gpu_bufs:
                        cur_gout = gout[:ic, :jc]
                    else:
                        cur_gout = out[i:i + ic, j:j + jc]
                    cur_gout.fill_(0.0)

                    # Compute
                    ddd = kernel._prepare(cur_gX1, cur_gX2)
                    kernel._apply(cur_gX1, cur_gX2.T, cur_gout)
                    cur_gout = kernel._finalize(cur_gout, ddd)

                    # Copy Back (GPU->CPU)
                    if not cuda_inputs:
                        # copy_ does not care about the contiguity of its operands, as long as
                        # they are consistent. However, for C-contiguous inputs it creates an
                        # undesired intermediate array, so we use cuda_memcpy2d_async instead,
                        # which handles C-contiguous arrays well.
                        if stride == "F":
                            copy_to_host(ic,
                                         jc,
                                         cur_gout,
                                         0,
                                         0,
                                         cpu_buf,
                                         0,
                                         0,
                                         s=stream)
                        else:
                            cuda_memcpy2d_async(dst=cpu_buf.data_ptr(),
                                                dpitch=cpu_buf.stride(0) * dts,
                                                src=cur_gout.data_ptr(),
                                                spitch=cur_gout.stride(0) *
                                                dts,
                                                width=jc * dts,
                                                height=ic,
                                                stream=stream._as_parameter_)
                        copy_ops[stream_id] = partial(do_copy_op, out, cpu_buf,
                                                      i, ic, j, jc)
                    elif change_dtype:
                        out.narrow(0, i,
                                   ic).narrow(1, j,
                                              jc).copy_(cur_gout,
                                                        non_blocking=True)
                j_iter += 1

            for s_idx in range(num_streams):
                streams[s_idx].synchronize()
                wrap_copy_op(s_idx)

    return out
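
A CPU-only sketch of the tiling scheme implemented above: the (N x M) kernel matrix is filled one (n x m) tile at a time, so only a small buffer has to fit in device memory at once. A Gaussian kernel stands in for `kernel._apply`; streams, pinned buffers and dtype conversion are omitted.

import torch

def blockwise_gaussian(X1, X2, n=1024, m=1024, sigma=1.0):
    N, M = X1.shape[0], X2.shape[0]
    out = torch.empty(N, M, dtype=X1.dtype)
    for i in range(0, N, n):
        ic = min(n, N - i)
        for j in range(0, M, m):
            jc = min(m, M - j)
            d2 = torch.cdist(X1[i:i + ic], X2[j:j + jc]).square_()   # squared distances
            out[i:i + ic, j:j + jc] = torch.exp(-d2 / (2 * sigma ** 2))
    return out
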
Example 8
def _parallel_lauum_runner(A, write_opposite: bool, gpu_info):
    # Choose target:
    if is_f_contig(A):
        target = par_lauum_f_lower
    elif is_contig(A):
        target = par_lauum_c_lower
    else:
        raise NotImplementedError(
            "Parallel LAUUM is only implemented for contiguous matrices")

    N = A.shape[0]
    dt = A.dtype
    dts = sizeof_dtype(dt)
    if A.is_cuda:
        sync_current_stream(A.device)
        gpu_info = [g for g in gpu_info if g.Id == A.device.index]
        avail_ram = gpu_info[0].actual_free_mem / dts
        if target.__name__ == "par_lauum_f_lower":
            # Each GPU should hold in memory two additional blocks plus one full
            # block-column (2*B^2 + B*N <= M, where M is the available memory in
            # elements).
            max_block_size = int(
                math.floor((-N + math.sqrt(N**2 + 8 * avail_ram)) / 4))
        else:
            # Same RAM requirements as the out-of-core version
            max_block_size = int(
                math.floor((-2 * N + math.sqrt(4 * N**2 + 8 * avail_ram)) / 4))
        if max_block_size < 1:
            raise RuntimeError("Cannot run parallel LAUUM with minimum "
                               "available memory of %.2fMB" %
                               (avail_ram * dts / 2**20))
        # All computations on the same device (where data is stored). No multi-GPU support!
        block_sizes = calc_block_sizes3(max_block_size, 1, N)
    else:
        avail_ram = min([g.actual_free_mem for g in gpu_info]) / dts
        # Each GPU should be able to hold in memory two block columns plus two
        # blocks (=> quadratic equation 2*B^2 + 2*B*N - M <= 0, where M is the
        # available memory in elements).
        # An additional block is needed whenever write_opposite is True, due to
        # copying blocks between matrices with different strides!
        if write_opposite:
            max_block_size = int(
                math.floor(
                    (-2 * N + math.sqrt(4 * N**2 + 12 * avail_ram)) / 6))
        else:
            max_block_size = int(
                math.floor((-2 * N + math.sqrt(4 * N**2 + 8 * avail_ram)) / 4))
        if max_block_size < 1:
            raise RuntimeError("Cannot run parallel LAUUM with minimum "
                               "available memory of %.2fMB" %
                               (avail_ram * dts / 2**20))

        block_sizes = calc_block_sizes3(max_block_size, len(gpu_info), N)

    # Create BlockAlloc objects describing the subdivision of input
    block_allocations: List[BlockAlloc] = []
    cur_n = 0
    for bs in block_sizes:
        block_allocations.append(
            BlockAlloc(start=cur_n, end=cur_n + bs, length=bs))
        cur_n += bs

    num_gpus = len(gpu_info)
    if num_gpus < 1:
        raise ValueError(
            "Parallel LAUUM can only run when a GPU is available.")
    barrier = threading.Barrier(num_gpus, timeout=1000)
    threads = []
    for _gpu_idx, g in enumerate(gpu_info):
        # Assign rows to GPUs round-robin. Use _gpu_idx instead of g.Id since the
        # latter may not form a contiguous range starting at 0.
        gid_allocs = [
            i for i in range(len(block_allocations))
            if i % num_gpus == _gpu_idx
        ]
        cublas_handle = initialization.cublas_handle(g.Id)
        if cublas_handle is None:
            raise RuntimeError("CUBLAS must be initialized "
                               "on device %d before running parallel LAUUM." %
                               (g.Id))
        t = PropagatingThread(target=target,
                              name="GPU-%d" % (g.Id),
                              args=(A, block_allocations, gid_allocs, barrier,
                                    g.Id, cublas_handle, write_opposite))
        threads.append(t)

    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return A
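
The block-size formulas above are the positive roots of the memory constraints stated in the comments. A sketch of the derivation, writing B for the block size, N for the matrix side and R for the available element count (`avail_ram`):

    2B^2 + BN \le R  \;\Rightarrow\; B \le \frac{-N + \sqrt{N^2 + 8R}}{4}        \quad \text{(CUDA input, par\_lauum\_f\_lower)}
    2B^2 + 2BN \le R \;\Rightarrow\; B \le \frac{-2N + \sqrt{4N^2 + 8R}}{4}      \quad \text{(CUDA c\_lower, or CPU input without write\_opposite)}
    3B^2 + 2BN \le R \;\Rightarrow\; B \le \frac{-2N + \sqrt{4N^2 + 12R}}{6}     \quad \text{(CPU input with write\_opposite)}
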
Example 9
    def init(self, X: Union[torch.Tensor, SparseTensor], Y: torch.Tensor,
             alpha: torch.Tensor, penalty: float, N: int) -> None:
        """Initialize the preconditioner matrix.

        This method must be called before the preconditioner becomes usable.

        Parameters
        ----------
        X : MxD tensor
            Matrix of Nystroem centers
        Y : Mx1 tensor
            Vector of targets corresponding to the Nystroem centers `X`
        alpha : Mx1 tensor
            Parameter vector (of the same dimension as `Y`) which gives the current
            solution to the optimization problem.
        penalty : float
            Regularization amount
        N : int
            Number of points in the full data-set.

        Notes
        -----
        If `debug=True` is present in the options, this method will print a lot of extra
        information pertaining to the timings of the various preconditioner operations. This
        can be useful to understand how the preconditioner works.
        """
        if Y.shape[1] != 1:
            raise ValueError(
                "Logistic preconditioner can only deal with 1D outputs.")

        dtype = X.dtype
        M = X.size(0)

        eps = self.params.pc_epsilon(dtype)

        if self.fC is None:
            # This is done only at the first iteration of the logistic-falkon algorithm
            # It sets the `T` variable from the paper (chol(kMM)) to the upper part of `self.fC`
            with TicToc("Kernel", debug=self.params.debug):
                if isinstance(X, torch.Tensor):
                    C = create_same_stride((M, M),
                                           X,
                                           dtype=dtype,
                                           device='cpu',
                                           pin_memory=self._use_cuda)
                else:  # If sparse tensor we need fortran for kernel calculation
                    C = create_fortran((M, M),
                                       dtype=dtype,
                                       device='cpu',
                                       pin_memory=self._use_cuda)
                self.kernel(X, X, out=C, opt=self.params)
            self.fC = C.numpy()
            if not is_f_contig(C):
                self.fC = self.fC.T

            with TicToc("Add diag", debug=self.params.debug):
                # Compute T: lower(fC) = T.T
                inplace_add_diag(self.fC, eps * M)
            with TicToc("Cholesky 1", debug=self.params.debug):
                self.fC = potrf_wrapper(self.fC,
                                        clean=True,
                                        upper=False,
                                        use_cuda=self._use_cuda,
                                        opt=self.params)
                # Save the diagonal which will be overwritten when computing A
                self.dT = C.diag()
            with TicToc("Copy triangular", debug=self.params.debug):
                # Copy lower(fC) to upper(fC):  upper(fC) = T.
                copy_triang(self.fC, upper=False)
        else:
            if not self._use_cuda:
                # The copy is not necessary for CUDA, since LAUUM will do the copying
                with TicToc("Copy triangular", debug=self.params.debug):
                    # Copy upper(fC) to lower(fC): lower(fC) = T.T
                    copy_triang(self.fC,
                                upper=True)  # does not copy the diagonal
            # Setting diagonal necessary for trmm
            inplace_set_diag(self.fC, self.dT)

        # Compute W
        with TicToc("TRMM", debug=self.params.debug):
            # T is on upper(fC). Compute T.T @ alpha
            alpha = self._trmm(alpha.clone())
        with TicToc("W (ddf)", debug=self.params.debug):
            W = self.loss.ddf(Y, alpha)
        with TicToc("W-Multiply", debug=self.params.debug):
            W.sqrt_()
            self.fC = vec_mul_triang(self.fC,
                                     W.numpy().reshape(-1),
                                     side=0,
                                     upper=False)

        if self._use_cuda:
            with TicToc("LAUUM", debug=self.params.debug):
                # Product upper(fC) @ upper(fC).T : lower(fC) = T @ T.T
                self.fC = lauum_wrapper(self.fC,
                                        upper=True,
                                        use_cuda=self._use_cuda,
                                        opt=self.params)
        else:
            with TicToc("LAUUM", debug=self.params.debug):
                # Product lower(fC).T @ lower(fC) : lower(fC) = T @ T.T
                self.fC = lauum_wrapper(self.fC,
                                        upper=False,
                                        use_cuda=self._use_cuda,
                                        opt=self.params)

        # NOTE: Here the multiplier is 1/N instead of the more common 1/M!
        mul_triang(self.fC, upper=False, preserve_diag=False, multiplier=1 / N)

        with TicToc("Add diag", debug=self.params.debug):
            # lower(fC) = 1/N * T@T.T + lambda * I
            inplace_add_diag(self.fC, penalty)

        with TicToc("Cholesky 2", debug=self.params.debug):
            # Cholesky on lower(fC) : lower(fC) = A.T
            self.fC = potrf_wrapper(self.fC,
                                    clean=False,
                                    upper=False,
                                    use_cuda=self._use_cuda,
                                    opt=self.params)
            self.dA = torch.from_numpy(self.fC).diag()
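
As far as the comments indicate, each call refreshes the second factor as follows (a math sketch; T is built only on the first call, W collects the element-wise second derivatives of the loss, and λ is `penalty`):

    T^\top T = K_{MM} + \varepsilon M I
    W = \ell''\big(Y,\; T^\top \alpha\big)
    A^\top A = \tfrac{1}{N}\, T\,\mathrm{diag}(W)\,T^\top + \lambda I
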
Example 10
    def init(self,
             X: Union[torch.Tensor, SparseTensor],
             weight_vec: Optional[torch.Tensor] = None):
        """Initialize the preconditioner matrix.

        This method must be called before the preconditioner can be used.

        Parameters
        ----------
        X : torch.Tensor
            The (M x D) matrix of Nystroem centers
        weight_vec
            An optional vector of size (M x 1) which is used for reweighted least-squares.
            This vector should contain the weights corresponding to the Nystrom centers.
        """
        if X.is_cuda and not self._use_cuda:
            raise RuntimeError(
                "use_cuda is set to False, but data is CUDA tensor. "
                "Check your options.")
        if weight_vec is not None and not check_same_device(X, weight_vec):
            raise ValueError(f"Weights and data are not on the same device "
                             f"({weight_vec.device}, {X.device})")
        if weight_vec is not None and weight_vec.shape[0] != X.shape[0]:
            raise ValueError(
                f"Weights and Nystrom centers should have the same first dimension. "
                f"Found instead {weight_vec.shape[0]}, {X.shape[0]}.")
        dtype = X.dtype
        dev = X.device
        eps = self.params.pc_epsilon(X.dtype)
        M = X.size(0)

        with TicToc("Kernel", debug=self.params.debug):
            if isinstance(X, torch.Tensor):
                C = create_same_stride((M, M),
                                       X,
                                       dtype=dtype,
                                       device=dev,
                                       pin_memory=self._use_cuda)
            else:  # If sparse tensor we need fortran for kernel calculation
                C = create_fortran((M, M),
                                   dtype=dtype,
                                   device=dev,
                                   pin_memory=self._use_cuda)
            self.kernel(X, X, out=C, opt=self.params)
        if not is_f_contig(C):
            C = C.T

        with TicToc("Cholesky 1", debug=self.params.debug):
            # Compute T: lower(fC) = T.T
            inplace_add_diag_th(C, eps * M)
            C = potrf_wrapper(C,
                              clean=False,
                              upper=False,
                              use_cuda=self._use_cuda,
                              opt=self.params)
            # Save the diagonal which will be overwritten when computing A
            self.dT = C.diag()

        with TicToc("Copy triangular", debug=self.params.debug):
            # Copy lower(fC) to upper(fC):  upper(fC) = T.
            copy_triang(C, upper=False)

        # Weighted least-squares needs to weight the A matrix. We can apply the weights once
        # before LAUUM, but since CUDA-LAUUM touches both sides of C, weighting before LAUUM
        # would also modify the matrix T. Therefore, for CUDA inputs we weight twice after LAUUM!
        if weight_vec is not None and not self._use_cuda:
            with TicToc("Weighting(CPU)", debug=self.params.debug):
                weight_vec.sqrt_()
                vec_mul_triang(C, weight_vec, side=1, upper=False)

        if self._use_cuda:
            with TicToc("LAUUM(CUDA)", debug=self.params.debug):
                # Product upper(fC) @ upper(fC).T, store in lower(fC) = T @ T.T
                C = lauum_wrapper(C,
                                  upper=True,
                                  use_cuda=self._use_cuda,
                                  opt=self.params)
        else:
            with TicToc("LAUUM(CPU)", debug=self.params.debug):
                # Product lower(fC).T @ lower(fC), store in lower(fC) = T @ T.T
                C = lauum_wrapper(C,
                                  upper=False,
                                  use_cuda=self._use_cuda,
                                  opt=self.params)

        if weight_vec is not None and self._use_cuda:
            with TicToc("Weighting(CUDA)", debug=self.params.debug):
                weight_vec.sqrt_()
                vec_mul_triang(C, weight_vec, side=0, upper=False)
                vec_mul_triang(C, weight_vec, side=1, upper=False)

        with TicToc("Cholesky 2", debug=self.params.debug):
            # lower(fC) = 1/M * T@T.T
            mul_triang(C, upper=False, preserve_diag=False, multiplier=1 / M)
            # lower(fC) = 1/M * T@T.T + lambda * I
            inplace_add_diag_th(C, self._lambda)
            # Cholesky on lower(fC) : lower(fC) = A.T
            C = potrf_wrapper(C,
                              clean=False,
                              upper=False,
                              use_cuda=self._use_cuda,
                              opt=self.params)
            self.dA = C.diag()

        self.fC = C
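
When `weight_vec` is given, both the CPU path (weighting one side of T.T before LAUUM) and the CUDA path (weighting both sides of T@T.T after LAUUM) appear to arrive at the same weighted product before the second Cholesky; a math sketch, with D_w = diag(weight_vec):

    A^\top A = \tfrac{1}{M}\, D_w^{1/2}\, T T^\top D_w^{1/2} + \lambda I

Without weights this reduces to the formula of Example 4.
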
Example 11
def _ic_cholesky(A, upper, device, cusolver_handle):
    """Cholesky factorization of matrix `A` on the GPU

    Uses the cuSOLVER library for implementation of the POTRF function.

    Parameters
    ----------
    A : [n, n] CPU or GPU array (column-contiguous)
        The (positive definite) matrix which should be factorized
    upper : bool
        Whether we need to factorize the upper or lower portion of `A`. The other side
        of the matrix will not be touched.
    device : int
        The GPU device on which to run the factorization
    cusolver_handle
        Pointer to the cuSOLVER context, which needs to be initialized before calling
        the function.

    Returns
    -------
    A : [n, n] CPU or GPU array (column-contiguous)
        The factorization of A which overwrites the upper (or lower) triangular part
        of the matrix A. This is not a copy of the original matrix.
    """
    # Check library initialization
    if cusolver_handle is None:
        raise RuntimeError("CuSolver must be initialized "
                           "before running in-core Cholesky.")
    if not is_f_contig(A):
        raise RuntimeError("Cholesky input must be F-contiguous")

    uplo = 'U' if upper else 'L'
    n = A.shape[0]

    tc_device = torch.device("cuda:%d" % (device))
    tc_stream = torch.cuda.current_stream(tc_device)
    # Choose functions by dtype
    potrf_buf_size = choose_fn(A.dtype, cusolverDnDpotrf_bufferSize,
                               cusolverDnSpotrf_bufferSize,
                               "POTRF Buffer size")
    potrf_fn = choose_fn(A.dtype, cusolverDnDpotrf, cusolverDnSpotrf, "POTRF")

    with torch.cuda.device(tc_device), \
            torch.cuda.stream(tc_stream), \
            cusolver_stream(cusolver_handle, tc_stream._as_parameter_):
        # Determine necessary buffer size
        potrf_bsize = potrf_buf_size(handle=cusolver_handle,
                                     uplo=uplo,
                                     n=n,
                                     A=0,
                                     lda=n)

        # Allocate flat GPU buffer, and extract buffers
        if A.is_cuda:
            potrf_wspace = torch.empty(size=(potrf_bsize, ),
                                       dtype=A.dtype,
                                       device=tc_device)
            Agpu = A
        else:
            gpu_buf = torch.empty(size=(n * n + potrf_bsize, ),
                                  dtype=A.dtype,
                                  device=tc_device)
            potrf_wspace = gpu_buf[:potrf_bsize]
            Agpu = extract_fortran(gpu_buf, (n, n), offset=potrf_bsize)
            # Copy A to device memory
            copy_to_device(n, n, A, 0, 0, Agpu, 0, 0, s=tc_stream)

        dev_info = torch.tensor(4, dtype=torch.int32, device=tc_device)

        # Run cholesky
        potrf_fn(handle=cusolver_handle,
                 uplo=uplo,
                 n=n,
                 A=Agpu.data_ptr(),
                 lda=n,
                 workspace=potrf_wspace.data_ptr(),
                 Lwork=potrf_bsize,
                 devInfo=dev_info)

        # Copy back to CPU
        if not A.is_cuda:
            copy_to_host(n, n, Agpu, 0, 0, A, 0, 0, s=tc_stream)
            del Agpu
        del potrf_wspace, dev_info
        tc_stream.synchronize()
    return A
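
A plain-PyTorch reference sketch of what `_ic_cholesky` computes for `upper=False`. The actual call requires an initialized cuSOLVER handle, so it is only indicated in a comment.

import torch

n = 256
A = torch.randn(n, n, dtype=torch.float64)
A = A @ A.T + n * torch.eye(n, dtype=torch.float64)   # positive definite
A_f = A.T.contiguous().T                              # same values, F-contiguous
L_ref = torch.linalg.cholesky(A)                      # reference lower factor
# _ic_cholesky(A_f, upper=False, device=0, cusolver_handle=handle) would
# overwrite the lower triangle of A_f so that torch.tril(A_f) matches L_ref.
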
Example 12
def gpu_cholesky(A: torch.Tensor, upper: bool, clean: bool, overwrite: bool,
                 opt: FalkonOptions) -> torch.Tensor:
    """
    Parameters
    -----------
    A : torch.Tensor
        2D positive-definite matrix of size (n x n) that will be factorized as
        A = U.T @ U (if `upper` is True) or A = L @ L.T if `upper`
        is False.
    upper : bool
        Whether the triangle which should be factorized is the upper or lower of `A`.
    clean : bool
        Whether the "other" triangle of the output matrix (the one that
        does not contain the factorization) will be filled with zeros or
        not.
    overwrite : bool
        Whether to overwrite matrix A or to output the result in a new
        buffer.
    opt : FalkonOptions
        Options forwarded for block calculation, and other knobs in the out-of-core
        parallel POTRF implementation. Useful options are the ones defined in
        :class:`~falkon.options.CholeskyOptions` .

    Notes
    ------
    The factorization will always be the 'lower' version of the factorization
    which could however end up on the upper-triangular part of the matrix
    in case A is not Fortran contiguous to begin with.
    """
    # Handle 'overwrite' option immediately so that its usage is reflected in memory
    # availability (in case A is on GPU).
    if not overwrite:
        # We could change the stride to be more favorable to the POTRF requirements
        # but it gets complicated. We leave such decisions to the user!
        A = copy_same_stride(A, pin_memory=True)

    # Decide which version of the algo we run: can be in-core or parallel.
    # (Note that the original OOC version is not going to run).

    # Determine GPU free RAM
    gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
    for g in gpu_info:
        g.actual_free_mem = min((g.free_memory - 300 * 2**20) * 0.95,
                                opt.max_gpu_mem * 0.95)

    if A.is_cuda:
        try:
            device = [d for d in gpu_info if d.Id == A.device.index][0]
        except IndexError:
            # This should never happen!
            raise RuntimeError("Device of matrix A (%s) is not recognized" %
                               (A.device))
    else:
        device = max(gpu_info, key=lambda g: g.actual_free_mem)
    ic = can_do_ic(A, device) and not opt.chol_force_ooc
    if opt.chol_force_in_core and not ic:
        raise RuntimeError(
            "Cannot run in-core POTRF but `chol_force_in_core` was specified.")

    f_order = is_f_contig(A)
    transposed = False
    if not f_order:
        A = A.T
        upper = not upper
        transposed = True
    # Now A is always in f_order. So we can only allow upper=False (ooc)
    if upper:
        # Can do only in-core!
        if not ic:
            raise ValueError(
                "GPU POTRF is only implemented on the "
                "lower triangle for Fortran-ordered matrices (or on the upper "
                "triangle for C-ordered matrices)")
    if not ic and A.is_cuda:
        _msg = "Cannot run out-of-core POTRF on CUDA matrix 'A'."
        if opt.chol_force_ooc:
            _msg += " Set the `chol_force_ooc` option to `False` in to allow in-core POTRF."
        raise ValueError(_msg)

    # Handle different implementations for POTRF: in-core and out-of-core
    if ic:
        if opt.debug:
            print("Using in-core POTRF")
        _ic_cholesky(A,
                     upper,
                     device=device.Id,
                     cusolver_handle=initialization.cusolver_handle(device.Id))
    else:
        if opt.debug:
            print("Using parallel POTRF")
        _parallel_potrf_runner(A, opt, gpu_info)

    # Perform cleaning of the 'other side' of the matrix
    if clean:
        la_helpers.zero_triang(A, upper=not upper)
    # Undo previous matrix transformations
    if transposed:
        A = A.T

    return A
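
A hypothetical usage sketch. It assumes a CUDA device is available and that `FalkonOptions` can be imported from `falkon.options` (the module referenced in the docstring above); `overwrite=False` lets the function allocate its own pinned copy.

import torch
from falkon.options import FalkonOptions

n = 1000
A = torch.randn(n, n, dtype=torch.float64)
A = A @ A.T + n * torch.eye(n, dtype=torch.float64)   # positive definite
A_f = A.T.contiguous().T                              # F-contiguous copy (preferred layout)
L = gpu_cholesky(A_f, upper=False, clean=True, overwrite=False, opt=FalkonOptions())
assert torch.allclose(torch.tril(L), torch.linalg.cholesky(A))
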