def fmmv_cuda_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: torch.Tensor,
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (out, 'out'))
    N = X1.size(0)

    # Create output matrix
    if out is None:
        out = create_fortran((N, v.size(1)), v.dtype, 'cpu', pin_memory=True)
    out.fill_(0.0)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # Create queues
    args = []  # Arguments passed to each subprocess
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        args.append((ArgsFmmv(X1=X1.narrow_rows(block_sizes[i], bwidth),
                              X2=X2, v=v,
                              out=out.narrow(0, block_sizes[i], bwidth),
                              kernel=kernel, max_mem=g.usable_ram), g.Id))
    _start_wait_processes(sparse_fmmv, args)
    return out
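
# Illustrative usage sketch (hedged): none of the names below are defined in
# this module. Assumes `falkon.kernels.GaussianKernel` and two CSR-backed
# SparseTensor matrices `X1` (N x D) and `X2` (M x D):
#
#     kernel = falkon.kernels.GaussianKernel(sigma=2.0)
#     v = torch.randn(X2.size(0), 1)               # M x 1 right-hand side
#     res = fmmv_cuda_sparse(X1, X2, v, kernel)    # N x 1, pinned CPU memory
#
# Each GPU computes K(X1_block, X2) @ v for its row-block of X1, writing
# directly into the corresponding rows of the pinned output.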

def fdmmv_cuda_sparse(X1: SparseTensor,
                      X2: SparseTensor,
                      v: Optional[torch.Tensor],
                      w: Optional[torch.Tensor],
                      kernel,
                      out: Optional[torch.Tensor] = None,
                      opt: Optional[BaseOptions] = None) -> torch.Tensor:
    opt = _setup_opt(opt)
    _check_contiguity((v, 'v'), (w, 'w'), (out, 'out'))
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")
    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    gpu_info = _get_gpu_info(opt, slack=0.95)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    if out is None:
        out = create_C((M, T), X1.dtype, 'cpu', pin_memory=True)

    wrlk = []  # outputs for each subprocess.
    args = []
    for i, g in enumerate(gpu_info):
        bwidth = block_sizes[i + 1] - block_sizes[i]
        if bwidth <= 0:
            continue
        cur_out_gpu = create_C((M, T), X1.dtype, f'cuda:{gpu_info[i].Id}')  # M x T
        wrlk.append(cur_out_gpu)
        cur_w = None
        if w is not None:
            cur_w = w.narrow(0, block_sizes[i], bwidth)
        args.append((ArgsFdmmv(X1=X1.narrow_rows(block_sizes[i], bwidth),
                               X2=X2, v=v, w=cur_w, out=cur_out_gpu,
                               kernel=kernel, max_mem=g.usable_ram), g.Id))
    _start_wait_processes(sparse_fdmmv, args)

    if len(wrlk) > 1:
        # noinspection PyTypeChecker
        fastest_device: int = np.argmax([d.speed for d in gpu_info])
        out.copy_(torch.cuda.comm.reduce_add(
            wrlk, destination=gpu_info[fastest_device].Id))
    else:
        out.copy_(wrlk[0])
    return out
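
# Illustrative usage sketch (hedged; `kernel`, `X1`, `X2` as in the sketch
# above). The fused double kernel-vector product K' @ (K @ v + w) avoids ever
# materializing the full N x M kernel matrix:
#
#     v = torch.randn(X2.size(0), 5)                   # M x T
#     w = torch.randn(X1.size(0), 5)                   # N x T
#     res = fdmmv_cuda_sparse(X1, X2, v, w, kernel)    # M x T
#
# Either `v` or `w` may be None (but not both); per-GPU partial results are
# summed with reduce_add on the fastest device before the copy into `out`.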

def fmmv_cuda(X1: torch.Tensor,
              X2: torch.Tensor,
              v: torch.Tensor,
              kernel,
              out: Optional[torch.Tensor] = None,
              opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T

    performs fnc(X1*X2', X1, X2) * v : N x T
    in blocks on multiple GPUs
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (out, 'out'))
    device = X1.device
    N = X1.size(0)

    # Create output matrix
    if out is None:
        out = create_same_stride((N, v.size(1)), X1, v.dtype, device=device,
                                 pin_memory=device.type != 'cuda')
    out.fill_(0.0)

    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fmmv
    else:
        target = generic_fmmv

    gpu_info = _get_gpu_info(opt, slack=0.9)

    if device.type == 'cuda':
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFmmv(X1=X1, X2=X2, v=v, out=out, kernel=kernel,
                        max_mem=single_gpu_info.usable_ram)
        _call_direct(target, (args, device.index))
    else:
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # When using torch multiprocessing with spawn/forkserver, any consumer of
        # the queues must live on a different process than the queue producer,
        # since CUDA tensors are passed through the queue:
        # https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
        # Hence we cannot run the first task on the current process.
        args = []  # Arguments passed to each subprocess
        for i, g in enumerate(gpu_info):
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue
            args.append((ArgsFmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                                  X2=X2, v=v,
                                  out=out.narrow(0, block_sizes[i], bwidth),
                                  kernel=kernel, max_mem=g.usable_ram), g.Id))
        _start_wait_processes(target, args)
    return out
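
# Illustrative usage sketch (hedged): dense CPU inputs trigger the multi-GPU
# row-split path, while CUDA inputs run in-process on their own device.
#
#     X1 = torch.randn(10000, 20)             # N x D
#     X2 = torch.randn(2000, 20)              # M x D
#     v = torch.randn(2000, 3)                # M x T
#     kernel = falkon.kernels.GaussianKernel(sigma=1.0)
#     res = fmmv_cuda(X1, X2, v, kernel)      # N x T = K(X1, X2) @ v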

def run_keops_mmv(X1: torch.Tensor,
                  X2: torch.Tensor,
                  v: torch.Tensor,
                  other_vars: List[torch.Tensor],
                  out: Optional[torch.Tensor],
                  formula: str,
                  aliases: List[str],
                  axis: int,
                  reduction: str = 'Sum',
                  opt: Optional[FalkonOptions] = None) -> torch.Tensor:
    if opt is None:
        opt = FalkonOptions()

    # Choose backend
    N, D = X1.shape
    T = v.shape[1]
    backend = _decide_backend(opt, D)
    dtype = _keops_dtype(X1.dtype)
    device = X1.device

    if not check_same_device(X1, X2, v, out, *other_vars):
        raise RuntimeError("All input tensors must be on the same device.")
    if (device.type == 'cuda') and (not backend.startswith("GPU")):
        warnings.warn("KeOps backend was chosen to be CPU, but GPU input tensors found. "
                      "Defaulting to 'GPU_1D' backend. To force usage of the CPU backend, "
                      "please pass CPU tensors; to avoid this warning if the GPU backend is "
                      "desired, check your options (i.e. set 'use_cpu=False').")
        backend = "GPU_1D"

    # Define formula wrapper
    fn = Genred(formula, aliases, reduction_op=reduction, axis=axis,
                dtype=dtype, dtype_acc=opt.keops_acc_dtype,
                sum_scheme=opt.keops_sum_scheme)

    # Create output matrix
    if out is None:
        # noinspection PyArgumentList
        out = torch.empty(N, T, dtype=X1.dtype, device=device,
                          pin_memory=(backend != 'CPU') and (device.type == 'cpu'))

    if backend.startswith("GPU") and device.type == 'cpu':
        # Slack is high due to imprecise memory usage estimates for KeOps
        gpu_info = _get_gpu_info(opt, slack=opt.keops_memory_slack)
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # Create queues
        args = []  # Arguments passed to each subprocess
        for i, g in enumerate(gpu_info):
            # First round of subdivision
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue
            args.append((ArgsFmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                                  X2=X2, v=v,
                                  out=out.narrow(0, block_sizes[i], bwidth),
                                  other_vars=other_vars,
                                  function=fn,
                                  backend=backend,
                                  gpu_ram=g.usable_ram), g.Id))
        _start_wait_processes(_single_gpu_method, args)
    else:  # Run on CPU, or on GPU with CUDA inputs
        variables = [X1, X2, v] + other_vars
        if device.type == 'cuda':
            with torch.cuda.device(device):
                sync_current_stream(device)
                out = fn(*variables, out=out, backend=backend)
        else:
            out = fn(*variables, out=out, backend=backend)
    return out
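
# Illustrative sketch of calling this wrapper with a Gaussian-kernel formula
# (hedged: the formula/aliases below follow standard KeOps Genred syntax and
# are only an example, not a formula fixed by this module):
#
#     formula = 'Exp(-g * SqDist(x1, x2)) * v'
#     aliases = ['x1 = Vi(%d)' % D, 'x2 = Vj(%d)' % D,
#                'v = Vj(%d)' % T, 'g = Pm(1)']
#     g = torch.tensor([0.5], dtype=X1.dtype)
#     res = run_keops_mmv(X1, X2, v, other_vars=[g], out=None,
#                         formula=formula, aliases=aliases, axis=1)
#
# axis=1 reduces over the X2 (j) index, so `res` is N x T. With CPU inputs and
# a GPU backend, rows of X1 are split across devices exactly as in fmmv_cuda.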

def fdmmv_cuda(X1: torch.Tensor,
               X2: torch.Tensor,
               v: Optional[torch.Tensor],
               w: Optional[torch.Tensor],
               kernel,
               out: Optional[torch.Tensor] = None,
               opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """
    X1 : N x D
    X2 : M x D
    v  : M x T
    w  : N x T

    performs fnc(X1*X2', X1, X2)' * ( fnc(X1*X2', X1, X2) * v + w ) : M x T
    in blocks on multiple GPUs

    Assumes all inputs have the same data type
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (v, 'v'), (w, 'w'), (out, 'out'))
    device = X1.device
    if v is None and w is None:
        raise ValueError("one of 'v' or 'w' must not be None.")
    T = v.size(1) if v is not None else w.size(1)
    M = X2.size(0)
    N = X1.size(0)

    if out is None:
        out = create_same_stride((M, T), X1, X1.dtype, device=device,
                                 pin_memory=device.type != 'cuda')

    gpu_info = _get_gpu_info(opt, slack=0.9)

    if kernel.kernel_type == "l2distance" and kernel.name == "gaussian":
        target = distk_fdmmv
    else:
        target = generic_fdmmv

    if device.type == 'cuda':
        single_gpu_info = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFdmmv(X1=X1, X2=X2, v=v, w=w, out=out, kernel=kernel,
                         max_mem=single_gpu_info.usable_ram)
        _call_direct(target, (args, device.index))
    else:
        block_sizes = calc_gpu_block_sizes(gpu_info, N)

        # When using torch multiprocessing with spawn/forkserver, any consumer of
        # the queues must live on a different process than the queue producer,
        # since CUDA tensors are passed through the queue:
        # https://pytorch.org/docs/stable/multiprocessing.html#sharing-cuda-tensors
        # Hence we cannot run the first task on the current process.
        wrlk = []  # outputs for each subprocess.
        args = []
        for i, g in enumerate(gpu_info):
            bwidth = block_sizes[i + 1] - block_sizes[i]
            if bwidth <= 0:
                continue
            cur_out_gpu = create_same_stride((M, T), X1, X1.dtype,
                                             f'cuda:{gpu_info[i].Id}')  # M x T
            wrlk.append(cur_out_gpu)
            cur_w = None
            if w is not None:
                cur_w = w.narrow(0, block_sizes[i], bwidth)
            args.append((ArgsFdmmv(X1=X1.narrow(0, block_sizes[i], bwidth),
                                   X2=X2, v=v, w=cur_w, out=cur_out_gpu,
                                   kernel=kernel, max_mem=g.usable_ram), g.Id))
        _start_wait_processes(target, args)

        if len(wrlk) > 1:
            # noinspection PyTypeChecker
            fastest_device: int = np.argmax([d.speed for d in gpu_info])
            out.copy_(tcd.comm.reduce_add(
                wrlk, destination=gpu_info[fastest_device].Id))
        else:
            out.copy_(wrlk[0])
    return out
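
# Illustrative usage sketch (hedged; names as in the fmmv_cuda example above):
#
#     v = torch.randn(X2.size(0), 3)              # M x T
#     w = torch.randn(X1.size(0), 3)              # N x T
#     res = fdmmv_cuda(X1, X2, v, w, kernel)      # M x T = K' @ (K @ v + w)
#
# As with fmmv_cuda, CUDA inputs are handled in-process on their own device;
# CPU inputs are split row-wise across GPUs and the per-GPU M x T partial
# results are reduced onto the fastest device before the final copy to `out`.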