Example No. 1
def incore_fmmv(mat: torch.Tensor,
                vec: torch.Tensor,
                out: Optional[torch.Tensor] = None,
                transpose: bool = False,
                opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    if not check_same_dtype(mat, vec, out):
        raise TypeError("Data types of input matrices must be equal.")
    if not check_same_device(mat, vec, out):
        raise RuntimeError("All input arguments to incore_fmmv must be on the same device")

    if out is None:
        if transpose:
            out_shape = (mat.shape[1], vec.shape[1])
        else:
            out_shape = (mat.shape[0], vec.shape[1])
        out = create_same_stride(out_shape, mat, mat.dtype, device=mat.device, pin_memory=False)
    out.fill_(0.0)

    if mat.is_cuda:
        s1 = torch.cuda.Stream()
        with torch.cuda.stream(s1):
            if transpose:
                out.addmm_(mat.T, vec, beta=0.0)
            else:
                out.addmm_(mat, vec, beta=0.0)
            s1.synchronize()
    else:
        if transpose:
            out.addmm_(mat.T, vec, beta=0.0)
        else:
            out.addmm_(mat, vec, beta=0.0)
    return out
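A minimal usage sketch of the function above (the shapes and the CUDA device are illustrative assumptions; the helpers it relies on come from falkon's internal modules): incore_fmmv computes mat @ vec, or mat.T @ vec when transpose=True, entirely on the device where the inputs live.

# Illustrative only: assumes a CUDA device is available.
mat = torch.randn(1000, 500, device='cuda', dtype=torch.float64)
vec = torch.randn(500, 3, device='cuda', dtype=torch.float64)
out = incore_fmmv(mat, vec)                      # out = mat @ vec, shape (1000, 3)
vec2 = torch.randn(1000, 3, device='cuda', dtype=torch.float64)
out_t = incore_fmmv(mat, vec2, transpose=True)   # out_t = mat.T @ vec2, shape (500, 3)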
Example No. 2
def cuda_trsm(A: torch.Tensor,
              v: torch.Tensor,
              alpha: float,
              lower: int,
              transpose: int,
              stream: Optional[torch.cuda.Stream] = None) -> torch.Tensor:
    if not is_f_contig(A, strict=False):
        raise ValueError("A must be f-contiguous for CUDA TRSM to work.")
    if not check_same_device(A, v):
        raise ValueError("A and v must be on the same CUDA device.")
    if not A.is_cuda:
        raise ValueError("A and v must be CUDA tensors!")

    device = A.device
    s = stream
    if stream is None:
        s = torch.cuda.current_stream(device=device)
    cublas_hdl = cublas_handle(device.index)
    trsm_fn = choose_fn(A.dtype, cublasDtrsm, cublasStrsm, "TRSM")

    # noinspection PyProtectedMember
    with torch.cuda.device(device), torch.cuda.stream(s), cublas_stream(
            cublas_hdl, s._as_parameter_):
        # Deal with copying v, which may not be F-contiguous.
        vF = create_fortran(v.size(), v.dtype, device)
        if is_f_contig(v, strict=False):
            # We can just make a copy of v
            vF.copy_(v)
            s.synchronize()  # sync is necessary here for correctness. Not sure why! TODO: Is it still needed?
        else:
            vF = cuda_transpose(input=v, output=vF.T).T

        uplo = 'L' if lower else 'U'
        trans = 'T' if transpose else 'N'
        trsm_fn(cublas_hdl,
                side='L',
                uplo=uplo,
                trans=trans,
                diag='N',
                m=vF.shape[0],
                n=vF.shape[1],
                alpha=alpha,
                A=A.data_ptr(),
                lda=A.stride(1),
                B=vF.data_ptr(),
                ldb=vF.stride(1))
        if is_f_contig(v, strict=False):
            vout = vF
        else:
            vout = create_C(v.size(), v.dtype, device)
            vout = cuda_transpose(input=vF, output=vout.T).T
    return vout
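The call above solves the triangular system op(A) @ X = alpha * v on the GPU (cuBLAS TRSM with side='L'). A rough sanity check against PyTorch's built-in solver, with illustrative shapes and assuming torch.linalg.solve_triangular is available (PyTorch 1.11+):

A = torch.randn(50, 50, device='cuda').tril() + 50 * torch.eye(50, device='cuda')
A = A.T.contiguous().T                                   # make A F-contiguous, as cuda_trsm requires
v = torch.randn(50, 4, device='cuda')
x1 = cuda_trsm(A, v, alpha=1.0, lower=1, transpose=0)
x2 = torch.linalg.solve_triangular(A, v, upper=False)    # solves A @ x2 = v
assert torch.allclose(x1, x2, atol=1e-4)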
Example No. 3
def cuda_trsm(A: torch.Tensor, v: torch.Tensor, alpha: float, lower: int,
              transpose: int) -> torch.Tensor:
    if not is_f_contig(A, strict=False):
        raise ValueError("A must be f-contiguous for CUDA TRSM to work.")
    if not check_same_device(A, v):
        raise ValueError("A and v must be on the same CUDA device.")
    if not A.is_cuda:
        raise ValueError("A and v must be CUDA tensors!")

    s = torch.cuda.Stream(device=A.device)
    cublas_hdl = cublas_handle(A.device.index)
    trsm_fn = choose_fn(A.dtype, cublasDtrsm, cublasStrsm, "TRSM")

    with torch.cuda.device(A.device), torch.cuda.stream(s), cublas_stream(
            cublas_hdl, s._as_parameter_):
        # Deal with copying v, which may not be F-contiguous.
        vF = create_fortran(v.size(), v.dtype, v.device)
        if is_f_contig(v, strict=False):
            # We can just make a copy of v
            vF.copy_(v)
        else:
            vF = cuda_transpose(input=v, output=vF.T).T

        uplo = 'L' if lower else 'U'
        trans = 'T' if transpose else 'N'
        trsm_fn(cublas_hdl,
                side='L',
                uplo=uplo,
                trans=trans,
                diag='N',
                m=vF.shape[0],
                n=vF.shape[1],
                alpha=alpha,
                A=A.data_ptr(),
                lda=A.stride(1),
                B=vF.data_ptr(),
                ldb=vF.stride(1))
        if not is_f_contig(v, strict=False):
            vout = create_C(v.size(), v.dtype, v.device)
            vout = cuda_transpose(input=vF, output=vout.T).T
        else:
            vout = vF
        s.synchronize()
    return vout
Example No. 4
def trsm(v: arr_type, A: arr_type, alpha: float, lower: int = 0, transpose: int = 0) -> arr_type:
    out_torch_convert = False
    if isinstance(A, torch.Tensor):
        if isinstance(v, torch.Tensor):
            if not check_same_device(A, v):
                raise ValueError("A and v must be on the same device.")
            if A.is_cuda and v.is_cuda:
                from falkon.la_helpers.cuda_trsm import cuda_trsm
                return cuda_trsm(A, v, alpha, lower, transpose)
            else:
                out_torch_convert = True
                A = A.numpy()
                v = v.numpy()
        else:  # v is numpy array (thus CPU)
            if A.is_cuda:
                raise ValueError("A and v must be on the same device.")
            else:
                out_torch_convert = True
                A = A.numpy()

    vout = cpu_trsm(A, v, alpha, lower, transpose)
    if out_torch_convert:
        return torch.from_numpy(vout)
    return vout
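Note the argument order of this dispatcher: the right-hand side v comes first, then A, the opposite of cuda_trsm above. CPU torch inputs are converted to NumPy, solved by cpu_trsm, and converted back to torch. An illustrative CPU call:

A = torch.randn(20, 20).tril() + 20 * torch.eye(20)   # well-conditioned lower-triangular matrix
v = torch.randn(20, 2)
x = trsm(v, A, alpha=1.0, lower=1, transpose=0)        # solves A @ x = v, returns a torch.Tensor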
Example No. 5
def _check_predict_inputs(self, X):
    if not check_same_device(X, self.alpha_):
        raise ValueError("X must be on device %s" % (self.alpha_.device))
    return super()._check_predict_inputs(X)
Example No. 6
def _check_fit_inputs(self, X, Y, Xts, Yts):
    if not check_same_device(X, Y, Xts, Yts) or (not X.is_cuda):
        raise ValueError(
            "All tensors for fitting InCoreFalkon must be CUDA tensors, "
            "located on the same GPU.")
    return super()._check_fit_inputs(X, Y, Xts, Yts)
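This check reflects the contract of InCoreFalkon: all training and validation tensors must already sit on a single GPU. A hedged sketch of a conforming call, assuming the public estimator constructor (kernel, penalty, M) and the Gaussian kernel from falkon.kernels:

import falkon
kernel = falkon.kernels.GaussianKernel(sigma=3.0)
model = falkon.InCoreFalkon(kernel=kernel, penalty=1e-6, M=500)
X = torch.randn(10000, 10, device='cuda:0')
Y = torch.randn(10000, 1, device='cuda:0')
model.fit(X, Y)   # would raise the ValueError above if X or Y were CPU tensors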
Example No. 7
def run_keops_mmv(X1: torch.Tensor,
                  X2: torch.Tensor,
                  v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor],
                  formula: str,
                  aliases: List[str],
                  axis: int,
                  reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    if opt is None:
        opt = FalkonOptions()
    # Choose backend
    N, D = X1.shape
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)
    device = X1.device

    if not check_same_device(X1, X2, v, out, *other_vars):
        raise RuntimeError("All input tensors must be on the same device.")
    if (device.type == 'cuda') and (not backend.startswith("GPU")):
        warnings.warn("KeOps backend was chosen to be CPU, but GPU input tensors found. "
                      "Defaulting to 'GPU_1D' backend. To force usage of the CPU backend, "
                      "please pass CPU tensors; to avoid this warning if the GPU backend is "
                      "desired, check your options (i.e. set 'use_cpu=False').")
        backend = "GPU_1D"

    # Define formula wrapper
    fn = Genred(formula, aliases,
                reduction_op=reduction, axis=axis,
                dtype=dtype, dtype_acc=opt.keops_acc_dtype,
                sum_scheme=opt.keops_sum_scheme)

    # Compile on a small data subset
    small_data_variables = [X1[:100], X2[:10], v[:10]] + other_vars
    small_data_out = torch.empty((100, T), dtype=X1.dtype, device=device)
    fn(*small_data_variables, out=small_data_out, backend=backend)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(N, T, dtype=X1.dtype, device=device,
                          pin_memory=(backend != 'CPU') and (device.type == 'cpu'))

    if backend.startswith("GPU") and device.type == 'cpu':
        # Info about GPUs
        ram_slack = 0.7  # slack is high due to imprecise memory usage estimates
        gpu_info = [v for k, v in devices.get_device_info(opt).items() if k >= 0]
        gpu_ram = [
            min((g.free_memory - 300 * 2 ** 20) * ram_slack, opt.max_gpu_mem * ram_slack)
            for g in gpu_info
        ]
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i in range(len(gpu_info)):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue

            args.append((ArgsFmmv(
                X1=X1.narrow(0, block_sizes[i], bwidth),
                X2=X2,
                v=v,
                out=out.narrow(0, block_sizes[i], bwidth),
                other_vars=other_vars,
                function=fn,
                backend=backend,
                gpu_ram=gpu_ram[i]
            ), gpu_info[i].Id))
        _start_wait_processes(_single_gpu_method, args)
    else:  # Run on CPU or GPU with CUDA inputs
        variables = [X1, X2, v] + other_vars
        out = fn(*variables, out=out, backend=backend)

    return out
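For context, formula and aliases follow pykeops' Genred syntax. A sketch for a Gaussian-kernel matrix-vector product with 3-dimensional inputs, where X1 is (N, 3), X2 is (M, 3) and v is (M, 1); the exact formulas Falkon's kernels build may differ:

formula = 'Exp(-g * SqDist(x1, x2)) * v'
aliases = ['x1 = Vi(3)',   # rows of X1 (i index, one per output row)
           'x2 = Vj(3)',   # rows of X2 (j index, reduced over)
           'v = Vj(1)',    # vector entries paired with the rows of X2
           'g = Pm(1)']    # scalar kernel parameter
gamma = torch.tensor([0.5], dtype=X1.dtype, device=X1.device)
out = run_keops_mmv(X1, X2, v, other_vars=[gamma], out=None,
                    formula=formula, aliases=aliases, axis=1,
                    reduction='Sum', opt=FalkonOptions())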
Example No. 8
def run_keops_mmv(X1: torch.Tensor,
                  X2: torch.Tensor,
                  v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor],
                  formula: str,
                  aliases: List[str],
                  axis: int,
                  reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    if opt is None:
        opt = FalkonOptions()
    # Choose backend
    N, D = X1.shape
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)
    device = X1.device

    if not check_same_device(X1, X2, v, out, *other_vars):
        raise RuntimeError("All input tensors must be on the same device.")
    if (device.type == 'cuda') and (not backend.startswith("GPU")):
        warnings.warn(
            "KeOps backend was chosen to be CPU, but GPU input tensors found. "
            "Defaulting to 'GPU_1D' backend. To force usage of the CPU backend, "
            "please pass CPU tensors; to avoid this warning if the GPU backend is "
            "desired, check your options (i.e. set 'use_cpu=False').")
        backend = "GPU_1D"

    # Define formula wrapper
    fn = Genred(formula,
                aliases,
                reduction_op=reduction,
                axis=axis,
                dtype=dtype,
                dtype_acc=opt.keops_acc_dtype,
                sum_scheme=opt.keops_sum_scheme)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(N,
                          T,
                          dtype=X1.dtype,
                          device=device,
                          pin_memory=(backend != 'CPU')
                          and (device.type == 'cpu'))

    if backend.startswith("GPU") and device.type == 'cpu':
        # slack is high due to imprecise memory usage estimates for keops
        gpu_info = _get_gpu_info(opt, slack=opt.keops_memory_slack)
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i, g in enumerate(gpu_info):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue
            args.append((ArgsFmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                                  X2=X2,
                                  v=v,
                                  out=out.narrow(0, block_sizes[i], bwidth),
                                  other_vars=other_vars,
                                  function=fn,
                                  backend=backend,
                                  gpu_ram=g.usable_ram), g.Id))
        _start_wait_processes(_single_gpu_method, args)
    else:  # Run on CPU or GPU with CUDA inputs
        variables = [X1, X2, v] + other_vars
        if device.type == 'cuda':
            with torch.cuda.device(device):
                sync_current_stream(device)
                out = fn(*variables, out=out, backend=backend)
        else:
            out = fn(*variables, out=out, backend=backend)

    return out
Example No. 9
def _check_device_properties(*args, fn_name: str, opt: FalkonOptions):
    if not check_same_device(*args):
        raise RuntimeError(
            "All input arguments to %s must be on the same device" %
            (fn_name))
Example No. 10
    def init(self,
             X: Union[torch.Tensor, SparseTensor],
             weight_vec: Optional[torch.Tensor] = None):
        """Initialize the preconditioner matrix.

        This method must be called before the preconditioner can be used.

        Parameters
        ----------
        X : torch.Tensor or SparseTensor
            The (M x D) matrix of Nystroem centers
        weight_vec : torch.Tensor, optional
            An optional vector of size (M x 1) which is used for reweighted least-squares.
            This vector should contain the weights corresponding to the Nystroem centers.
        """
        if X.is_cuda and not self._use_cuda:
            raise RuntimeError(
                "use_cuda is set to False, but data is CUDA tensor. "
                "Check your options.")
        if weight_vec is not None and not check_same_device(X, weight_vec):
            raise ValueError(f"Weights and data are not on the same device "
                             f"({weight_vec.device}, {X.device})")
        if weight_vec is not None and weight_vec.shape[0] != X.shape[0]:
            raise ValueError(
                f"Weights and Nystrom centers should have the same first dimension. "
                f"Found instead {weight_vec.shape[0]}, {X.shape[0]}.")
        dtype = X.dtype
        dev = X.device
        eps = self.params.pc_epsilon(X.dtype)
        M = X.size(0)

        with TicToc("Kernel", debug=self.params.debug):
            if isinstance(X, torch.Tensor):
                C = create_same_stride((M, M),
                                       X,
                                       dtype=dtype,
                                       device=dev,
                                       pin_memory=self._use_cuda)
            else:  # If sparse tensor we need fortran for kernel calculation
                C = create_fortran((M, M),
                                   dtype=dtype,
                                   device=dev,
                                   pin_memory=self._use_cuda)
            self.kernel(X, X, out=C, opt=self.params)
        if not is_f_contig(C):
            C = C.T

        with TicToc("Cholesky 1", debug=self.params.debug):
            # Compute T: lower(fC) = T.T
            inplace_add_diag_th(C, eps * M)
            C = potrf_wrapper(C,
                              clean=False,
                              upper=False,
                              use_cuda=self._use_cuda,
                              opt=self.params)
            # Save the diagonal which will be overwritten when computing A
            self.dT = C.diag()

        with TicToc("Copy triangular", debug=self.params.debug):
            # Copy lower(fC) to upper(fC):  upper(fC) = T.
            copy_triang(C, upper=False)

        # Weighted least-squares needs to weight the A matrix. We can weigh once before LAUUM,
        # but since CUDA-LAUUM touches both sides of C, weighting before LAUUM will also modify
        # the matrix T. Therefore for CUDA inputs we weigh twice after LAUUM!
        if weight_vec is not None and not self._use_cuda:
            with TicToc("Weighting(CPU)", debug=self.params.debug):
                weight_vec.sqrt_()
                vec_mul_triang(C, weight_vec, side=1, upper=False)

        if self._use_cuda:
            with TicToc("LAUUM(CUDA)", debug=self.params.debug):
                # Product upper(fC) @ upper(fC).T, store in lower(fC) = T @ T.T
                C = lauum_wrapper(C,
                                  upper=True,
                                  use_cuda=self._use_cuda,
                                  opt=self.params)
        else:
            with TicToc("LAUUM(CPU)", debug=self.params.debug):
                # Product lower(fC).T @ lower(fC), store in lower(fC) = T @ T.T
                C = lauum_wrapper(C,
                                  upper=False,
                                  use_cuda=self._use_cuda,
                                  opt=self.params)

        if weight_vec is not None and self._use_cuda:
            with TicToc("Weighting(CUDA)", debug=self.params.debug):
                weight_vec.sqrt_()
                vec_mul_triang(C, weight_vec, side=0, upper=False)
                vec_mul_triang(C, weight_vec, side=1, upper=False)

        with TicToc("Cholesky 2", debug=self.params.debug):
            # lower(fC) = 1/M * T @ T.T
            mul_triang(C, upper=False, preserve_diag=False, multiplier=1 / M)
            # lower(fC) = 1/M * T @ T.T + lambda * I
            inplace_add_diag_th(C, self._lambda)
            # Cholesky on lower(fC) : lower(fC) = A.T
            C = potrf_wrapper(C,
                              clean=False,
                              upper=False,
                              use_cuda=self._use_cuda,
                              opt=self.params)
            self.dA = C.diag()

        self.fC = C
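Stripped of the in-place triangular tricks, the factorization above amounts to two Cholesky decompositions. A small-scale dense sketch, where kernel_fn, eps and lam stand in for self.kernel, the pc_epsilon value and self._lambda:

M = X.shape[0]
K = kernel_fn(X, X)                                            # (M, M) kernel at the Nystroem centers
T = torch.linalg.cholesky(K + eps * M * torch.eye(M)).T        # upper-triangular T with T.T @ T = K + eps*M*I
A = torch.linalg.cholesky(T @ T.T / M + lam * torch.eye(M)).T  # upper-triangular A
# The preconditioner is later applied through triangular solves against T and A.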