import warnings
from typing import List, Optional, Union

import torch

from falkon import FalkonOptions

# NOTE: the falkon-internal helpers used below (check_same_dtype, check_same_device,
# create_same_stride, create_fortran, create_C, is_f_contig, cublas wrappers, and the
# pykeops Genred class) are assumed to be importable from their usual modules; their
# exact import paths are not shown in this listing.


def incore_fmmv(mat: torch.Tensor, vec: torch.Tensor,
                out: Optional[torch.Tensor] = None, transpose: bool = False,
                opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    if not check_same_dtype(mat, vec, out):
        raise TypeError("Data types of input matrices must be equal.")
    if not check_same_device(mat, vec, out):
        raise RuntimeError("All input arguments to incore_fmmv must be on the same device")

    if out is None:
        if transpose:
            out_shape = (mat.shape[1], vec.shape[1])
        else:
            out_shape = (mat.shape[0], vec.shape[1])
        out = create_same_stride(out_shape, mat, mat.dtype, device=mat.device,
                                 pin_memory=False)
    # With beta=0.0 addmm_ ignores the prior contents of `out`, so this fill is
    # purely defensive.
    out.fill_(0.0)

    if mat.is_cuda:
        s1 = torch.cuda.Stream()
        with torch.cuda.stream(s1):
            if transpose:
                out.addmm_(mat.T, vec, beta=0.0)
            else:
                out.addmm_(mat, vec, beta=0.0)
            s1.synchronize()
    else:
        if transpose:
            out.addmm_(mat.T, vec, beta=0.0)
        else:
            out.addmm_(mat, vec, beta=0.0)
    return out

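# A minimal usage sketch for incore_fmmv (hedged: it runs only when CUDA is
# available and relies solely on the function defined above plus torch). It
# computes mat @ vec, then reuses the result for a transposed product into a
# preallocated output buffer.
if torch.cuda.is_available():
    mat = torch.randn(1000, 200, device='cuda')
    vec = torch.randn(200, 1, device='cuda')
    res = incore_fmmv(mat, vec)                              # (1000, 1) = mat @ vec
    buf = torch.empty(200, 1, device='cuda')
    res_t = incore_fmmv(mat, res, out=buf, transpose=True)   # (200, 1) = mat.T @ res
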
def cuda_trsm(A: torch.Tensor, v: torch.Tensor, alpha: float, lower: int,
              transpose: int, stream: Optional[torch.cuda.Stream] = None) -> torch.Tensor:
    if not is_f_contig(A, strict=False):
        raise ValueError("A must be f-contiguous for CUDA TRSM to work.")
    if not check_same_device(A, v):
        raise ValueError("A and v must be on the same CUDA device.")
    if not A.is_cuda:
        raise ValueError("A and v must be CUDA tensors!")

    device = A.device
    s = stream
    if stream is None:
        s = torch.cuda.current_stream(device=device)
    cublas_hdl = cublas_handle(device.index)
    trsm_fn = choose_fn(A.dtype, cublasDtrsm, cublasStrsm, "TRSM")

    # noinspection PyProtectedMember
    with torch.cuda.device(device), torch.cuda.stream(s), cublas_stream(cublas_hdl, s._as_parameter_):
        # Deal with copying v, which may not be F-contiguous.
        vF = create_fortran(v.size(), v.dtype, device)
        if is_f_contig(v, strict=False):
            # We can just make a copy of v
            vF.copy_(v)
            s.synchronize()  # sync is necessary here for correctness. Not sure why! TODO: Is it still needed?
        else:
            vF = cuda_transpose(input=v, output=vF.T).T

        uplo = 'L' if lower else 'U'
        trans = 'T' if transpose else 'N'
        trsm_fn(cublas_hdl, side='L', uplo=uplo, trans=trans, diag='N',
                m=vF.shape[0], n=vF.shape[1], alpha=alpha,
                A=A.data_ptr(), lda=A.stride(1),
                B=vF.data_ptr(), ldb=vF.stride(1))

        if is_f_contig(v, strict=False):
            vout = vF
        else:
            vout = create_C(v.size(), v.dtype, device)
            vout = cuda_transpose(input=vF, output=vout.T).T
    return vout

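# Hedged sketch: solve the lower-triangular system L_mat @ x = b on the GPU with
# cuda_trsm. It assumes a CUDA build; cuda_trsm needs an F-contiguous A, which the
# .T.contiguous().T trick below produces. torch.linalg.solve_triangular (available
# in torch >= 1.11) is used only as a reference check.
if torch.cuda.is_available():
    n = 64
    L_mat = torch.randn(n, n, device='cuda', dtype=torch.float64).tril()
    L_mat += n * torch.eye(n, device='cuda', dtype=torch.float64)  # well-conditioned
    L_mat = L_mat.T.contiguous().T                                 # make F-contiguous
    b = torch.randn(n, 3, device='cuda', dtype=torch.float64)
    x = cuda_trsm(L_mat, b, alpha=1.0, lower=1, transpose=0)
    x_ref = torch.linalg.solve_triangular(L_mat, b, upper=False)
    assert torch.allclose(x, x_ref)
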
def cuda_trsm(A: torch.Tensor, v: torch.Tensor, alpha: float, lower: int,
              transpose: int) -> torch.Tensor:
    if not is_f_contig(A, strict=False):
        raise ValueError("A must be f-contiguous for CUDA TRSM to work.")
    if not check_same_device(A, v):
        raise ValueError("A and v must be on the same CUDA device.")
    if not A.is_cuda:
        raise ValueError("A and v must be CUDA tensors!")

    s = torch.cuda.Stream(device=A.device)
    cublas_hdl = cublas_handle(A.device.index)
    trsm_fn = choose_fn(A.dtype, cublasDtrsm, cublasStrsm, "TRSM")

    with torch.cuda.device(A.device), torch.cuda.stream(s), cublas_stream(cublas_hdl, s._as_parameter_):
        # Deal with copying v, which may not be F-contiguous.
        vF = create_fortran(v.size(), v.dtype, v.device)
        if is_f_contig(v, strict=False):
            # We can just make a copy of v
            vF.copy_(v)
        else:
            vF = cuda_transpose(input=v, output=vF.T).T

        uplo = 'L' if lower else 'U'
        trans = 'T' if transpose else 'N'
        trsm_fn(cublas_hdl, side='L', uplo=uplo, trans=trans, diag='N',
                m=vF.shape[0], n=vF.shape[1], alpha=alpha,
                A=A.data_ptr(), lda=A.stride(1),
                B=vF.data_ptr(), ldb=vF.stride(1))

        if not is_f_contig(v, strict=False):
            vout = create_C(v.size(), v.dtype, v.device)
            vout = cuda_transpose(input=vF, output=vout.T).T
        else:
            vout = vF
    s.synchronize()
    return vout

def trsm(v: arr_type, A: arr_type, alpha: float, lower: int = 0, transpose: int = 0) -> arr_type:
    out_torch_convert = False
    if isinstance(A, torch.Tensor):
        if isinstance(v, torch.Tensor):
            if not check_same_device(A, v):
                raise ValueError("A and v must be on the same device.")
            if A.is_cuda and v.is_cuda:
                from falkon.la_helpers.cuda_trsm import cuda_trsm
                return cuda_trsm(A, v, alpha, lower, transpose)
            else:
                out_torch_convert = True
                A = A.numpy()
                v = v.numpy()
        else:  # v is a numpy array (thus CPU)
            if A.is_cuda:
                raise ValueError("A and v must be on the same device.")
            else:
                out_torch_convert = True
                A = A.numpy()

    vout = cpu_trsm(A, v, alpha, lower, transpose)
    if out_torch_convert:
        return torch.from_numpy(vout)
    return vout

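# Hedged sketch of the CPU dispatch path of trsm: with CPU torch tensors the
# inputs are converted to numpy, solved via cpu_trsm, and the result is converted
# back to a torch tensor because A was a tensor on entry.
n = 32
A_cpu = torch.randn(n, n, dtype=torch.float64).tril() + n * torch.eye(n, dtype=torch.float64)
v_cpu = torch.randn(n, 2, dtype=torch.float64)
x_cpu = trsm(v_cpu, A_cpu, alpha=1.0, lower=1, transpose=0)  # solves A_cpu @ x = v_cpu
assert isinstance(x_cpu, torch.Tensor)
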
def _check_predict_inputs(self, X):
    if not check_same_device(X, self.alpha_):
        raise ValueError("X must be on device %s" % (self.alpha_.device))
    return super()._check_predict_inputs(X)

def _check_fit_inputs(self, X, Y, Xts, Yts):
    if not check_same_device(X, Y, Xts, Yts) or (not X.is_cuda):
        raise ValueError("All tensors for fitting InCoreFalkon must be CUDA tensors, "
                         "located on the same GPU.")
    return super()._check_fit_inputs(X, Y, Xts, Yts)

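# Hedged usage sketch: InCoreFalkon requires every tensor involved in fitting to
# live on the same GPU, which is what the check above enforces. The constructor
# arguments shown (kernel, penalty, M) follow the public falkon API but should be
# treated as assumptions here; a CUDA-enabled falkon build is required.
import falkon

if torch.cuda.is_available():
    X = torch.randn(5000, 10).cuda()
    Y = torch.randn(5000, 1).cuda()
    model = falkon.InCoreFalkon(kernel=falkon.kernels.GaussianKernel(1.0),
                                penalty=1e-6, M=500)
    model.fit(X, Y)  # would raise ValueError if X or Y were CPU tensors
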
def run_keops_mmv(X1: torch.Tensor, X2: torch.Tensor, v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor], formula: str, aliases: List[str],
                  axis: int, reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    if opt is None:
        opt = FalkonOptions()
    # Choose backend
    N, D = X1.shape
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)
    device = X1.device

    if not check_same_device(X1, X2, v, out, *other_vars):
        raise RuntimeError("All input tensors must be on the same device.")
    if (device.type == 'cuda') and (not backend.startswith("GPU")):
        warnings.warn("KeOps backend was chosen to be CPU, but GPU input tensors found. "
                      "Defaulting to 'GPU_1D' backend. To force usage of the CPU backend, "
                      "please pass CPU tensors; to avoid this warning if the GPU backend is "
                      "desired, check your options (i.e. set 'use_cpu=False').")
        backend = "GPU_1D"

    # Define formula wrapper
    fn = Genred(formula, aliases, reduction_op=reduction, axis=axis, dtype=dtype,
                dtype_acc=opt.keops_acc_dtype, sum_scheme=opt.keops_sum_scheme)

    # Compile on a small data subset
    small_data_variables = [X1[:100], X2[:10], v[:10]] + other_vars
    small_data_out = torch.empty((100, T), dtype=X1.dtype, device=device)
    fn(*small_data_variables, out=small_data_out, backend=backend)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(N, T, dtype=X1.dtype, device=device,
                          pin_memory=(backend != 'CPU') and (device.type == 'cpu'))

    if backend.startswith("GPU") and device.type == 'cpu':
        # Info about GPUs
        ram_slack = 0.7  # slack is high due to imprecise memory usage estimates
        gpu_info = [g for k, g in devices.get_device_info(opt).items() if k >= 0]
        gpu_ram = [min((g.free_memory - 300 * 2 ** 20) * ram_slack,
                       opt.max_gpu_mem * ram_slack)
                   for g in gpu_info]
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i in range(len(gpu_info)):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue
            args.append((ArgsFmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                                  X2=X2,
                                  v=v,
                                  out=out.narrow(0, block_sizes[i], bwidth),
                                  other_vars=other_vars,
                                  function=fn,
                                  backend=backend,
                                  gpu_ram=gpu_ram[i]),
                         gpu_info[i].Id))
        _start_wait_processes(_single_gpu_method, args)
    else:  # Run on CPU or GPU with CUDA inputs
        variables = [X1, X2, v] + other_vars
        out = fn(*variables, out=out, backend=backend)

    return out

def run_keops_mmv(X1: torch.Tensor, X2: torch.Tensor, v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor], formula: str, aliases: List[str],
                  axis: int, reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    if opt is None:
        opt = FalkonOptions()
    # Choose backend
    N, D = X1.shape
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)
    device = X1.device

    if not check_same_device(X1, X2, v, out, *other_vars):
        raise RuntimeError("All input tensors must be on the same device.")
    if (device.type == 'cuda') and (not backend.startswith("GPU")):
        warnings.warn("KeOps backend was chosen to be CPU, but GPU input tensors found. "
                      "Defaulting to 'GPU_1D' backend. To force usage of the CPU backend, "
                      "please pass CPU tensors; to avoid this warning if the GPU backend is "
                      "desired, check your options (i.e. set 'use_cpu=False').")
        backend = "GPU_1D"

    # Define formula wrapper
    fn = Genred(formula, aliases, reduction_op=reduction, axis=axis, dtype=dtype,
                dtype_acc=opt.keops_acc_dtype, sum_scheme=opt.keops_sum_scheme)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(N, T, dtype=X1.dtype, device=device,
                          pin_memory=(backend != 'CPU') and (device.type == 'cpu'))

    if backend.startswith("GPU") and device.type == 'cpu':
        # slack is high due to imprecise memory usage estimates for keops
        gpu_info = _get_gpu_info(opt, slack=opt.keops_memory_slack)
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i, g in enumerate(gpu_info):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue
            args.append((ArgsFmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                                  X2=X2,
                                  v=v,
                                  out=out.narrow(0, block_sizes[i], bwidth),
                                  other_vars=other_vars,
                                  function=fn,
                                  backend=backend,
                                  gpu_ram=g.usable_ram),
                         g.Id))
        _start_wait_processes(_single_gpu_method, args)
    else:  # Run on CPU or GPU with CUDA inputs
        variables = [X1, X2, v] + other_vars
        if device.type == 'cuda':
            with torch.cuda.device(device):
                sync_current_stream(device)
                out = fn(*variables, out=out, backend=backend)
        else:
            out = fn(*variables, out=out, backend=backend)
    return out

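# Hedged sketch of a run_keops_mmv call computing a Gaussian kernel-vector
# product (requires pykeops). The formula/alias strings use standard KeOps Genred
# notation; 's' is the kernel length-scale, passed through other_vars. With
# axis=1 the reduction runs over the X2 rows, so the output has shape (N, T).
X1 = torch.randn(1000, 5)
X2 = torch.randn(300, 5)
v = torch.randn(300, 1)
sigma = torch.tensor([1.0])
formula = 'Exp(SqDist(x1 / s, x2 / s) * IntInv(-2)) * v'
aliases = [
    'x1 = Vi(%d)' % X1.shape[1],
    'x2 = Vj(%d)' % X2.shape[1],
    'v = Vj(%d)' % v.shape[1],
    's = Pm(%d)' % sigma.shape[0],
]
out = run_keops_mmv(X1, X2, v, other_vars=[sigma], out=None, formula=formula,
                    aliases=aliases, axis=1, reduction='Sum')  # out: (1000, 1)
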
def _check_device_properties(*args, fn_name: str, opt: FalkonOptions):
    if not check_same_device(*args):
        raise RuntimeError("All input arguments to %s must be on the same device" % (fn_name))

def init(self, X: Union[torch.Tensor, SparseTensor],
         weight_vec: Optional[torch.Tensor] = None):
    """Initialize the preconditioner matrix.

    This method must be called before the preconditioner can be used.

    Parameters
    ----------
    X : torch.Tensor
        The (M x D) matrix of Nystroem centers
    weight_vec
        An optional vector of size (M x 1) which is used for reweighted
        least-squares. This vector should contain the weights corresponding
        to the Nystroem centers.
    """
    if X.is_cuda and not self._use_cuda:
        raise RuntimeError("use_cuda is set to False, but data is CUDA tensor. "
                           "Check your options.")
    if weight_vec is not None and not check_same_device(X, weight_vec):
        raise ValueError(f"Weights and data are not on the same device "
                         f"({weight_vec.device}, {X.device})")
    if weight_vec is not None and weight_vec.shape[0] != X.shape[0]:
        raise ValueError(f"Weights and Nystroem centers should have the same first dimension. "
                         f"Found instead {weight_vec.shape[0]}, {X.shape[0]}.")
    dtype = X.dtype
    dev = X.device
    eps = self.params.pc_epsilon(X.dtype)
    M = X.size(0)

    with TicToc("Kernel", debug=self.params.debug):
        if isinstance(X, torch.Tensor):
            C = create_same_stride((M, M), X, dtype=dtype, device=dev,
                                   pin_memory=self._use_cuda)
        else:  # If sparse tensor we need fortran for kernel calculation
            C = create_fortran((M, M), dtype=dtype, device=dev,
                               pin_memory=self._use_cuda)
        self.kernel(X, X, out=C, opt=self.params)
    if not is_f_contig(C):
        C = C.T

    with TicToc("Cholesky 1", debug=self.params.debug):
        # Compute T: lower(fC) = T.T
        inplace_add_diag_th(C, eps * M)
        C = potrf_wrapper(C, clean=False, upper=False,
                          use_cuda=self._use_cuda, opt=self.params)
        # Save the diagonal which will be overwritten when computing A
        self.dT = C.diag()

    with TicToc("Copy triangular", debug=self.params.debug):
        # Copy lower(fC) to upper(fC): upper(fC) = T.
        copy_triang(C, upper=False)

    # Weighted least-squares needs to weight the A matrix. We can weigh once before
    # LAUUM, but since CUDA-LAUUM touches both sides of C, weighting before LAUUM
    # will also modify the matrix T. Therefore for CUDA inputs we weigh twice after
    # LAUUM!
    if weight_vec is not None and not self._use_cuda:
        with TicToc("Weighting(CPU)", debug=self.params.debug):
            weight_vec.sqrt_()
            vec_mul_triang(C, weight_vec, side=1, upper=False)

    if self._use_cuda:
        with TicToc("LAUUM(CUDA)", debug=self.params.debug):
            # Product upper(fC) @ upper(fC).T, store in lower(fC) = T @ T.T
            C = lauum_wrapper(C, upper=True, use_cuda=self._use_cuda, opt=self.params)
    else:
        with TicToc("LAUUM(CPU)", debug=self.params.debug):
            # Product lower(fC).T @ lower(fC), store in lower(fC) = T @ T.T
            C = lauum_wrapper(C, upper=False, use_cuda=self._use_cuda, opt=self.params)

    if weight_vec is not None and self._use_cuda:
        with TicToc("Weighting(CUDA)", debug=self.params.debug):
            weight_vec.sqrt_()
            vec_mul_triang(C, weight_vec, side=0, upper=False)
            vec_mul_triang(C, weight_vec, side=1, upper=False)

    with TicToc("Cholesky 2", debug=self.params.debug):
        # lower(fC) = 1/M * T @ T.T
        mul_triang(C, upper=False, preserve_diag=False, multiplier=1 / M)
        # lower(fC) = 1/M * T @ T.T + lambda * I
        inplace_add_diag_th(C, self._lambda)
        # Cholesky on lower(fC): lower(fC) = A.T
        C = potrf_wrapper(C, clean=False, upper=False,
                          use_cuda=self._use_cuda, opt=self.params)
        self.dA = C.diag()

    self.fC = C

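# Hedged usage sketch for the init method above. Following the code comments, the
# first Cholesky factors K_MM + eps*M*I into T.T @ T, and the second factors
# 1/M * T @ T.T + lambda*I into A.T @ A. The FalkonPreconditioner class name and
# its constructor signature (penalty, kernel, opt) are assumptions based on the
# falkon public API, not confirmed by this listing.
import falkon
from falkon.preconditioner import FalkonPreconditioner

centers = torch.randn(500, 10, dtype=torch.float64)  # M=500 Nystroem centers
kernel = falkon.kernels.GaussianKernel(3.0)
opt = falkon.FalkonOptions(use_cpu=True)
prec = FalkonPreconditioner(penalty=1e-6, kernel=kernel, opt=opt)
prec.init(centers)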