def fdmmv_cpu_sparse(X1: SparseTensor,
                     X2: SparseTensor,
                     v: Optional[torch.Tensor],
                     w: Optional[torch.Tensor],
                     kernel,
                     out: Optional[torch.Tensor] = None,
                     opt: Optional[BaseOptions] = None):
    """Blockwise sparse double kernel-vector product.

    Computes ``kernel(X1, X2).T @ (kernel(X1, X2) @ v + w)`` without ever
    materializing the full [N, M] kernel matrix: rows of `X1` are processed
    in chunks sized to fit the available CPU RAM.

    Parameters
    ----------
    X1
        [N, D] sparse tensor.
    X2
        [M, D] sparse tensor.
    v
        [M, T] tensor or None. At least one of `v`, `w` must be given.
    w
        [N, T] tensor or None. At least one of `v`, `w` must be given.
    kernel
        Object implementing the sparse `_prepare_sparse`/`_apply_sparse`/
        `_finalize` kernel protocol.
    out
        [M, T] output tensor; allocated here when None.
    opt
        Options used for determining available memory.
    """
    opt = _setup_opt(opt, is_cpu=True)

    # Parameter validation
    if v is None and w is None:
        raise ValueError("One of v and w must be specified to run fMMV.")
    T = v.size(1) if v is not None else w.size(1)
    ntot, dtot = X1.size()
    M = X2.size(0)
    dtype = X1.dtype

    # Create output matrix
    if out is None:
        out = torch.empty(M, T, dtype=dtype)
    out.fill_(0)

    avail_mem = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Per-row memory cost of one block:
    # Narrow X1 : n
    # ker_chunk : n*M
    # w_blk     : n*T
    # BUGFIX: the divisor must be (M + T + 1), the per-row element count of
    # the buffers above. The previous (M * T + 1) vastly over-counted memory,
    # shrinking n and spuriously raising MemoryError for large T.
    n = avail_mem / (M + T + 1)
    n = int(math.floor(n))
    if n < 1:
        raise MemoryError(("Available memory %.2fGB is insufficient "
                           "for blockwise fdMMv.") %
                          (avail_mem * sizeof_dtype(dtype) / 2**30))

    # Allocate fixed arrays
    ker_chunk = create_same_stride((n, M), out, dtype, device='cpu')
    w_blk = create_same_stride((n, T), out, dtype, device='cpu')
    # Run blocked fdmmv
    for i in range(0, ntot, n):
        ic = min(n, ntot - i)
        X1_chunk = X1.narrow_rows(i, ic)
        cur_ker_chunk = ker_chunk[:ic]
        cur_ker_chunk.fill_(0.0)
        ddd = kernel._prepare_sparse(X1_chunk, X2)
        kernel._apply_sparse(X1_chunk, X2.transpose_csc(), cur_ker_chunk)
        kernel._finalize(cur_ker_chunk, ddd)

        # Multiply by the vector v
        cur_w_blk = w_blk[:ic]  # n x T
        cur_w_blk.fill_(0.0)
        if w is not None:
            cur_w_blk.copy_(w[i:i + ic, :])
        if v is not None:
            # w_blk + c_out * v => (n x T) + (n x M)*(M x T)
            cur_w_blk.addmm_(cur_ker_chunk, v)
        out.addmm_(cur_ker_chunk.T, cur_w_blk)
    del ker_chunk, w_blk
    return out
def _parallel_potrf_runner(A: torch.Tensor, opt: CholeskyOptions, gpu_info) -> torch.Tensor:
    """Run the multi-GPU parallel Cholesky (POTRF) of `A` in-place.

    Splits `A` into block columns sized so that no participating GPU runs
    out of memory, assigns blocks round-robin to the devices, and delegates
    the factorization to :func:`parallel_potrf`.
    """
    n_devices = len(gpu_info)
    mat_size = A.shape[0]
    elem_size = sizeof_dtype(A.dtype)

    # Largest block size such that two whole block-columns plus one tile fit
    # on the *smallest* GPU (cuSOLVER's own buffer is negligible):
    #   bs^2 * ((N / bs) * 2 + 1) <= R
    #   => bs <= (sqrt((2*N)^2 + 4*R) - 2*N) / 2
    free_elems = min(g.actual_free_mem for g in gpu_info) / elem_size
    bs_limit = int(math.floor(
        (math.sqrt(4 * mat_size**2 + 4 * free_elems) - 2 * mat_size) / 2))
    if bs_limit < 1:
        raise RuntimeError("Cannot run parallel POTRF with minimum "
                           "available memory of %.2fMB" %
                           (free_elems * elem_size / 2**20))

    block_sizes = calc_block_sizes(bs_limit, n_devices, mat_size,
                                   opt.chol_par_blk_multiplier)
    # Each allocation: (start, end, length, device-id, block-index)
    allocations = []
    offset = 0
    for idx, length in enumerate(block_sizes):
        allocations.append((offset, offset + length, length,
                            idx % n_devices, idx))
        offset += length

    devices = [(0.0, initialization.cusolver_handle(gid), gid)
               for gid in range(n_devices)]
    parallel_potrf(devices, allocations, A)
    return A
def fmmv_cpu(X1, X2, v, kernel, out, opt):
    """Compute ``kernel(X1, X2) @ v`` block by block on the CPU.

    The [N, M] kernel matrix is never fully materialized: rows of `X1` and
    feature columns are processed in chunks sized to the available RAM.
    `v` may have any number of columns.

    Parameters
    -----------
    X1
        [N, D] array
    X2
        [M, D] array
    v
        [M, T] array
    kernel
        Class representing the desired kernel function
    out : torch.Tensor or None
        [N, T] array for the output; allocated here when None.
    opt
        Basic options dictionary, used for determining available memory.
    """
    opt = _setup_opt(opt, is_cpu=True)
    num_rows, num_feats = X1.size(0), X1.size(1)
    num_centers, num_cols = v.size()
    dtype = v.dtype
    if out is None:
        out = torch.empty(num_rows, num_cols, dtype=dtype)

    free_elems = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # The only mandatory allocation is the n*M kernel buffer; the kernel may
    # declare extra per-element costs through `extra_mem`.
    xtra = kernel.extra_mem()
    blk_n, blk_d = select_dim_over_nd(
        max_n=num_rows, max_d=num_feats,
        coef_nd=xtra.get('nd', 0),
        coef_n=num_centers + xtra.get('n', 0) + xtra.get('nm', 0) * num_centers,
        coef_d=xtra.get('d', 0) + xtra.get('md', 0) * num_centers,
        rest=xtra.get('m', 0),
        max_mem=free_elems)

    # Blocked kernel-vector product
    for row0 in range(0, num_rows, blk_n):
        rows = min(blk_n, num_rows - row0)
        prep = kernel._prepare(X1.narrow(0, row0, rows), X2)
        ker_buf = torch.zeros(rows, num_centers, dtype=dtype)
        # Accumulate the kernel over feature chunks.
        for col0 in range(0, num_feats, blk_d):
            feats = min(blk_d, num_feats - col0)
            kernel._apply(X1[row0:row0 + rows, col0:col0 + feats],
                          X2[:, col0:col0 + feats].T,
                          ker_buf)
        kernel._finalize(ker_buf, prep)
        torch.mm(ker_buf, v, out=out[row0:row0 + rows, :])
    return out
def fmm_cpu_sparse(X1: SparseTensor,
                   X2: SparseTensor,
                   kernel: 'falkon.kernels.Kernel',
                   out: Optional[torch.Tensor],
                   opt: BaseOptions) -> torch.Tensor:
    """Evaluate ``kernel(X1, X2)`` for sparse inputs on the CPU.

    Uses a blockwise implementation when inputs are float32 and the
    `no_single_kernel` option requests higher-precision accumulation;
    otherwise the whole kernel matrix is computed in a single pass.
    """
    opt = _setup_opt(opt, is_cpu=True)
    n_rows, _ = X1.size()
    n_cols = X2.size(0)
    if out is None:
        out = torch.empty(n_rows, n_cols, dtype=X1.dtype)

    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        avail_mem = _get_cpu_ram(opt, 0.9)
        if avail_mem <= 0:
            raise MemoryError("Memory insufficient for kernel evaluation.")
        blockwise_fmm_cpu_sparse(X1, X2, kernel, out, avail_mem)
    else:
        # Single-pass kernel evaluation.
        out.fill_(0.0)
        prep = kernel._prepare_sparse(X1, X2)
        kernel._apply_sparse(X1, X2.transpose_csc(), out)
        kernel._finalize(out, prep)
    return out
def test_no_overwrite(self, dtype, order, get_mat, expected_lower, expected_upper, device):
    """gpu_lauum with overwrite=False must leave the input matrix intact."""
    reference = get_mat(order=order, dtype=dtype)
    mat = get_mat(order=order, dtype=dtype, device=device)
    # CUDA inputs need extra GPU memory on top of the configured budget for
    # the full internal copy made when overwrite=False.
    mgpu_slack = 0
    if device.startswith("cuda"):
        mgpu_slack = (self.basic_opt.max_gpu_mem +
                      mat.shape[0]**2 * sizeof_dtype(mat.dtype))

    # Exercise both triangles; the input must survive each call untouched.
    for upper, expected in ((True, expected_upper), (False, expected_lower)):
        with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
            actual = gpu_lauum(mat, upper=upper, overwrite=False, opt=new_opt)
            torch.cuda.synchronize()
        np.testing.assert_allclose(expected, actual.cpu().numpy(),
                                   rtol=self.rtol[dtype])
        np.testing.assert_allclose(reference, mat.cpu())
def fmm_cuda_sparse(X1: SparseTensor,
                    X2: SparseTensor,
                    kernel: 'falkon.kernels.Kernel',
                    out: Optional[torch.Tensor] = None,
                    opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """Evaluate ``kernel(X1, X2)`` for sparse inputs, split across GPUs.

    Rows of `X1` are partitioned between the available devices; each one
    computes its slice of the output in a separate worker process.
    """
    opt = _setup_opt(opt)
    _check_contiguity((out, 'out'))
    n_rows = X1.size(0)
    n_cols = X2.size(0)
    if out is None:
        out = create_fortran((n_rows, n_cols), X1.dtype, 'cpu', pin_memory=True)

    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, n_rows)

    # Accumulate in float64 when requested, to limit float32 round-off
    # inside the kernel computation.
    gpu_dtype = X1.dtype
    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        gpu_dtype = torch.float64

    # One argument bundle per GPU with a non-empty row slice.
    proc_args = []
    for idx, g in enumerate(gpu_info):
        bwidth = block_sizes[idx + 1] - block_sizes[idx]
        if bwidth <= 0:
            continue
        proc_args.append((ArgsFmm(X1=X1.narrow_rows(block_sizes[idx], bwidth),
                                  X2=X2,
                                  out=out.narrow(0, block_sizes[idx], bwidth),
                                  kernel=kernel,
                                  gpu_dtype=gpu_dtype,
                                  max_mem=g.usable_ram), g.Id))
    _start_wait_processes(_sparse_fmm, proc_args)
    torch.cuda.empty_cache()
    return out
def _can_store_knm(self, X, ny_points, available_ram, store_threshold=1200):
    """Decide whether pre-computing the K_NM kernel matrix pays off.

    Heuristic: pre-computing trades N*M memory for the N*M*D kernel
    evaluations otherwise repeated at every CG iteration (the two N*M*T
    matrix-vector products are paid either way). The trade is worthwhile
    only for high-dimensional data, hence:
      - if D <= `store_threshold`, never pre-compute;
      - otherwise pre-compute iff the N*M matrix fits in `available_ram`.
    """
    if self.options.never_store_kernel:
        return False
    if X.size(1) <= store_threshold:
        return False
    needed_ram = X.size(0) * ny_points.size(0) * sizeof_dtype(X.dtype)
    if available_ram > needed_ram:
        if self.options.debug:
            print("%d*%d Kernel matrix will be stored" %
                  (X.size(0), ny_points.size(0)))
        return True
    if self.options.debug:
        print(
            "Cannot store full kernel matrix: not enough memory (have %.2fGB, need %.2fGB)"
            % (available_ram / 2 ** 30, needed_ram / 2 ** 30))
    return False
def test_size_of_dtype():
    """sizeof_dtype maps float dtypes to byte sizes and rejects int dtypes."""
    for dt, expected in ((np.float64, 8), (np.float32, 4),
                         (torch.float64, 8), (torch.float32, 4)):
        assert sizeof_dtype(dt) == expected
    for bad_dt in (np.int32, torch.int32):
        with pytest.raises(TypeError):
            sizeof_dtype(bad_dt)
def _parallel_lauum_runner(A, write_opposite: bool, opt: LauumOptions, gpu_info):
    """Run the multi-GPU parallel LAUUM of `A` in-place.

    Picks the F- or C-contiguous implementation, splits `A` into block
    columns that fit on the smallest GPU, and runs one worker thread per
    device, synchronized through a barrier.
    """
    # Pick the implementation matching A's memory layout.
    if is_f_contig(A):
        worker_fn = par_lauum_f_lower
    elif is_contig(A):
        worker_fn = par_lauum_c_lower
    else:
        raise NotImplementedError(
            "Parallel LAUUM is only implemented for contiguous matrices")

    num_gpus = len(gpu_info)
    if num_gpus < 1:
        raise ValueError(
            "Parallel LAUUM should only be run when some GPU is available.")
    N = A.shape[0]
    dts = sizeof_dtype(A.dtype)
    # Each GPU must be able to hold two block-columns at once.
    avail_ram = min(g.actual_free_mem for g in gpu_info) / dts
    max_block_size = int(math.floor(avail_ram / (2 * N)))
    if max_block_size < 1:
        raise RuntimeError("Cannot run parallel LAUUM with minimum "
                           "available memory of %.2fMB" % (avail_ram * dts / 2**20))

    block_sizes = calc_block_sizes2(max_block_size, num_gpus, N,
                                    opt.lauum_par_blk_multiplier)
    block_allocations: List[BlockAlloc] = []
    offset = 0
    for bs in block_sizes:
        block_allocations.append(BlockAlloc(start=offset, end=offset + bs, length=bs))
        offset += bs

    barrier = threading.Barrier(num_gpus, timeout=1000)
    workers = []
    for g in gpu_info:
        # Blocks are assigned to devices round-robin by index.
        my_blocks = [bi for bi in range(len(block_allocations))
                     if bi % num_gpus == g.Id]
        cublas_handle = initialization.cublas_handle(g.Id)
        if cublas_handle is None:
            raise RuntimeError("CUBLAS must be initialized "
                               "on device %d before running parallel LAUUM." % (g.Id))
        workers.append(PropagatingThread(
            target=worker_fn,
            name="GPU-%d" % (g.Id),
            args=(A, block_allocations, my_blocks, barrier, g.Id,
                  cublas_handle, write_opposite)))
    for t in workers:
        t.start()
    for t in workers:
        t.join()
    return A
def fmm_cuda(X1: torch.Tensor,
             X2: torch.Tensor,
             kernel: 'falkon.kernels.Kernel',
             out: Optional[torch.Tensor] = None,
             opt: Optional[BaseOptions] = None) -> torch.Tensor:
    """Evaluate ``kernel(X1, X2)`` in blocks, on one or more GPUs.

    When the inputs already live on a GPU, computation runs in the current
    process on that device; otherwise rows of `X1` are split across all
    available devices and computed in worker processes.
    """
    opt = _setup_opt(opt)
    _check_contiguity((X1, 'X1'), (X2, 'X2'), (out, 'out'))
    N = X1.shape[0]
    M = X2.shape[0]
    device = X1.device
    if out is None:
        out = create_same_stride((N, M), X1, X1.dtype, device=device,
                                 pin_memory=False)
    gpu_info = _get_gpu_info(opt, slack=0.9)
    block_sizes = calc_gpu_block_sizes(gpu_info, N)

    # Accumulate in float64 when requested, to limit float32 round-off
    # inside the kernel computation.
    gpu_dtype = X1.dtype
    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        gpu_dtype = torch.float64

    if device.type == 'cuda':
        # Inputs already on a GPU: run directly on that device.
        sync_current_stream(device)
        my_gpu = [g for g in gpu_info if g.Id == device.index][0]
        args = ArgsFmm(X1=X1, X2=X2, out=out, kernel=kernel,
                       gpu_dtype=gpu_dtype, max_mem=my_gpu.usable_ram,
                       num_streams=opt.num_fmm_streams)
        _call_direct(_generic_fmm, (args, device.index))
    else:
        # CPU inputs: one worker process per GPU with a non-empty row slice.
        proc_args = []
        for idx, g in enumerate(gpu_info):
            bwidth = block_sizes[idx + 1] - block_sizes[idx]
            if bwidth <= 0:
                continue
            proc_args.append((ArgsFmm(X1=X1.narrow(0, block_sizes[idx], bwidth),
                                      X2=X2,
                                      out=out.narrow(0, block_sizes[idx], bwidth),
                                      kernel=kernel,
                                      gpu_dtype=gpu_dtype,
                                      max_mem=g.usable_ram,
                                      num_streams=opt.num_fmm_streams), g.Id))
        _start_wait_processes(_generic_fmm, proc_args)
    return out
def generic_fmmv(proc_idx, queue, device_id):
    """GPU worker computing ``out = kernel(X1, X2) @ v`` for one row slice.

    Arguments arrive through `queue` packed in an `ArgsFmmv`; the result is
    written back into the host-side `out` tensor and also returned.
    """
    a: ArgsFmmv = queue.get()
    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    ntot, dtot = X1.size()
    M, T = v.size()

    # GPU buffers: ker n*M, v M*T, X1 slice n*d, X2 slice M*d, mmv n*T
    # => total n*d + n*(M + T) + d*M + M*T elements.
    avail_mem = max_mem / sizeof_dtype(dtype)
    n, d = select_dim_over_d(maxD=dtot, maxN=ntot,
                             coef_nd=1, coef_n=M + T, coef_d=M,
                             rest=M * T, tot=avail_mem)
    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        ker_gpu = torch.empty(n, M, dtype=dtype, device=ddev)
        v_gpu = v.to(device=ddev)  # M x T
        X1s_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        mmv_gpu = create_same_stride((n, T), out, dtype, ddev)

        for row0 in range(0, ntot, n):
            rows = min(n, ntot - row0)
            prep = kernel._prepare(X1.narrow(0, row0, rows), X2)
            ker_blk = ker_gpu.narrow(0, 0, rows)
            ker_blk.fill_(0.0)
            # Accumulate the kernel over feature chunks of size d.
            for col0 in range(0, dtot, d):
                feats = min(d, dtot - col0)
                gX1 = copy_to_device_noorder(rows, feats, X1, row0, col0,
                                             X1s_gpu, 0, 0)
                gX2 = copy_to_device_noorder(M, feats, X2, 0, col0,
                                             X2s_gpu, 0, 0)
                kernel._apply(gX1, gX2.T, ker_blk)
            kernel._finalize(ker_blk, prep)
            # Kernel-vector product, then ship the slice back to the host.
            mmv_blk = mmv_gpu[:rows, :]
            torch.mm(ker_blk, v_gpu, out=mmv_blk)  # n x T
            copy_to_host_noorder(rows, T, mmv_blk, 0, 0, out, row0, 0)
    return out
def test_ooc_mem(self, pd_data, dtype, order, upper, clean, overwrite):
    """Out-of-core POTRF must succeed with a tight (per-row) memory budget."""
    # 1600 bytes is the observed floor the routine needs even on the 4x4 data.
    mem_budget = max(1600, pd_data.shape[0] * sizeof_dtype(dtype) * 1000)
    opt = dataclasses.replace(self.basic_options, max_gpu_mem=mem_budget)
    run_potrf_test(pd_data, dtype=dtype, order=order, upper=upper,
                   clean=clean, overwrite=overwrite, start_cuda=False,
                   opt=opt)
def can_do_ic(A: torch.Tensor, device: DeviceInfo):
    """Return True if matrix `A` can be factorized in-core on `device`."""
    # noinspection PyUnresolvedReferences
    usable_ram = device.actual_free_mem
    # Keep a 15% safety margin: pushing closer to the limit has produced
    # hard-to-debug CUDA errors.
    usable_ram *= 0.85
    if A.is_cuda:
        # Data is already resident on the device: only a tiny workspace needed.
        needed_ram = 100 * 8
    else:
        needed_ram = A.shape[0] * A.shape[1] * sizeof_dtype(A.dtype)
    return usable_ram >= needed_ram
def fmm_cpu(
        X1: torch.Tensor,
        X2: torch.Tensor,
        kernel: 'falkon.kernels.Kernel',
        out: Optional[torch.Tensor],
        opt: BaseOptions) -> torch.Tensor:
    """Compute kernel values between `X1` and `X2`: ``out = kernel(X1, X2)``

    Parameters
    -----------
    X1
        [N, D] array
    X2
        [M, D] array
    kernel
        Class representing the desired kernel function
    out
        [N, M] array for storing the kernel output. If None, will be
        allocated within the function.
    opt
        Basic options dictionary, used for determining available memory.
        Additionally, the :attr:`~falkon.options.FalkonOptions.no_single_kernel`
        option is used to determine the accumulator data type.

    Returns
    --------
    out
        [N, M] array. The kernel between X1 and X2.
    """
    opt = _setup_opt(opt, is_cpu=True)
    n_rows, _ = X1.size()
    n_cols = X2.size(0)
    if out is None:
        out = torch.empty(n_rows, n_cols, dtype=X1.dtype)

    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        # float32 input with float64 accumulation requested: go blockwise.
        avail_mem = _get_cpu_ram(opt, 0.9)
        if avail_mem <= 0:
            raise MemoryError("Memory insufficient for kernel evaluation.")
        blockwise_fmm_cpu(X1, X2, kernel, out, avail_mem)
    else:
        # Single-pass evaluation.
        out.fill_(0.0)
        prep = kernel._prepare(X1, X2)
        kernel._apply(X1, X2.T, out)
        kernel._finalize(out, prep)
    return out
def fmm_cpu(X1: torch.Tensor, X2: torch.Tensor, kernel: 'falkon.kernels.Kernel', out: Optional[torch.Tensor], opt: BaseOptions) -> torch.Tensor:
    """Compute kernel value on matrices X1 and X2: ``out = kernel(X1, X2)``

    Parameters
    -----------
    X1 : [N, D] array
    X2 : [M, D] array
    kernel : Kernel
        Class representing the desired kernel function
    out : Optional([N, M] array)
        Array for storing the kernel output. If None, will be allocated
        within the function.
    opt : BaseOptions
        Options used for determining available memory; the
        `no_single_kernel` option selects blockwise float64 accumulation
        for float32 inputs.

    Returns
    --------
    out : [N, M] array
        The kernel between X1 and X2.
    """
    # NOTE(review): this definition duplicates an identical `fmm_cpu` appearing
    # earlier in the file; only the docstrings differed. The old docstring here
    # documented a 'final_type' option which the code never reads -- that claim
    # has been removed. Consider deleting one of the two definitions.
    opt = _setup_opt(opt, is_cpu=True)
    ntot, dtot = X1.size()
    mtot = X2.size(0)
    if out is None:
        out = torch.empty(ntot, mtot, dtype=X1.dtype)
    if sizeof_dtype(X1.dtype) < 8 and opt.no_single_kernel:
        # float32 input with float64 accumulation requested: go blockwise.
        avail_mem = _get_cpu_ram(opt, 0.9)
        if avail_mem <= 0:
            raise MemoryError("Memory insufficient for kernel evaluation.")
        blockwise_fmm_cpu(X1, X2, kernel, out, avail_mem)
    else:
        # Do the kernel computation on the spot
        out.fill_(0.0)
        ddd = kernel._prepare(X1, X2)
        kernel._apply(X1, X2.T, out)
        kernel._finalize(out, ddd)
    return out
def fmmv_cpu_sparse(X1: SparseTensor,
                    X2: SparseTensor,
                    v: torch.Tensor,
                    kernel: 'falkon.kernels.Kernel',
                    out: Optional[torch.Tensor],
                    opt: BaseOptions):
    """Blockwise sparse kernel-vector product: ``out = kernel(X1, X2) @ v``.

    Both operands are tiled (rows of `X1` and rows of `X2`) so only an
    n*m kernel buffer is ever resident in memory.
    """
    opt = _setup_opt(opt, is_cpu=True)
    dtype = X1.dtype
    ntot, _ = X1.size()
    mtot, T = v.size()

    if out is None:
        out = torch.empty(ntot, T, dtype=dtype)
    out.fill_(0.0)

    free_elems = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Memory model: row narrowing costs n + m, the kernel buffer n*m;
    # `_prepare`/`_finalize` costs are kernel-dependent and not counted here.
    n, m = select_dim_over_m(maxM=mtot, maxN=ntot,
                             coef_nm=1, coef_n=1, coef_m=1,
                             tot=free_elems)
    ker_buf = create_same_stride((n, m), out, dtype, device='cpu')

    for i0 in range(0, ntot, n):
        ni = min(n, ntot - i0)
        out_blk = out[i0:i0 + ni, :]
        row_blk = X1.narrow_rows(i0, ni)
        for j0 in range(0, mtot, m):
            mj = min(m, mtot - j0)
            col_blk = X2.narrow_rows(j0, mj)
            ker_blk = ker_buf[:ni, :mj]
            ker_blk.fill_(0.0)
            prep = kernel._prepare_sparse(row_blk, col_blk)
            kernel._apply_sparse(row_blk, col_blk.transpose_csc(), ker_blk)
            kernel._finalize(ker_blk, prep)
            # Accumulate this column-block's contribution to the output rows.
            out_blk.addmm_(ker_blk, v.narrow(0, j0, mj))
    return out
def test_diff_blk_sizes(self, dtype, order, get_mat, device):
    """gpu_lauum must produce identical results regardless of the memory
    budget (and hence of the internal block size chosen)."""
    # BUGFIX: removed an unused `omat = get_mat(...)` fixture allocation that
    # was never referenced by the assertions below.
    mat = get_mat(order=order, dtype=dtype, device=device)
    # For cuda inputs we must add to available GPU memory the amount used by the
    # input matrix, since overwrite=False and a full copy must be performed.
    mgpu_slack = 0
    if device.startswith("cuda"):
        mgpu_slack = mat.shape[0]**2 * sizeof_dtype(mat.dtype)

    # Run with three different memory budgets; results must all agree.
    results = []
    for mem_mb in (2, 4, 6):
        opt = dataclasses.replace(self.basic_opt,
                                  max_gpu_mem=mem_mb * 2**20 + mgpu_slack)
        results.append(gpu_lauum(mat, upper=True, overwrite=False, opt=opt))
    np.testing.assert_allclose(results[2].cpu().numpy(),
                               results[0].cpu().numpy(),
                               rtol=self.rtol[dtype])
    np.testing.assert_allclose(results[2].cpu().numpy(),
                               results[1].cpu().numpy(),
                               rtol=self.rtol[dtype])
def test_ic_mem(self, pd_data, dtype, order, upper, clean, overwrite, start_cuda):
    """In-core POTRF must succeed under a tightly bounded GPU memory budget."""
    if start_cuda:
        mem_budget = 2000
    else:
        # 1600 bytes is the minimum the routine needs, even on tiny data.
        mem_budget = max(
            1600,
            pd_data.shape[0] * pd_data.shape[1] * sizeof_dtype(dtype) * 1.5)
    opt = dataclasses.replace(self.basic_options, max_gpu_mem=mem_budget)
    run_potrf_test(pd_data, dtype=dtype, order=order, upper=upper,
                   clean=clean, overwrite=overwrite, start_cuda=start_cuda,
                   opt=opt)
def _single_gpu_method(proc_idx, queue, device_id):
    """Worker running a generic MMV function on a single GPU.

    Further splits the problem along both N and M when the GPU RAM budget
    requires it; partial results over the M splits are accumulated into
    the output tensor.
    """
    a: ArgsFmmv = queue.get()
    backend = a.backend
    X1, X2, v = a.X1, a.X2, a.v
    oout = a.out
    other_vars = a.other_vars
    fn = a.function
    R = a.gpu_ram
    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]

    # Second round of subdivision (only if necessary due to RAM constraints).
    n, m = _estimate_split(N, M, D, T, R, sizeof_dtype(X1.dtype))

    for mi in range(0, M, m):
        ml = min(m, M - mi)
        # After the first M-chunk we must accumulate, so write into a
        # temporary buffer rather than the real output.
        if ml != M and mi > 0:
            out = torch.empty_like(oout)
        else:
            out = oout
        cX2 = X2[mi:mi + ml, :]
        cv = v[mi:mi + ml, :]
        for ni in range(0, N, n):
            nl = min(n, N - ni)
            call_args = [X1[ni:ni + nl, :], cX2, cv] + other_vars
            fn(*call_args, out=out[ni:ni + nl, :],
               device_id=device_id, backend=backend)
        torch.cuda.synchronize(device_id)
        if ml != M and mi > 0:
            oout.add_(out)
    return oout
def test_write_opposite(self, dtype, order, get_mat, expected_lower, expected_upper, device):
    """With write_opposite=True, the LAUUM result lands in the opposite
    triangle while the original triangle is preserved."""
    omat = get_mat(order=order, dtype=dtype)
    mat = get_mat(order=order, dtype=dtype, device=device)
    mgpu_slack = mat.shape[0]**2 * sizeof_dtype(mat.dtype)

    # Upper factor, no overwrite: result goes into the lower triangle.
    with memory_checker(self.basic_opt, extra_mem=mgpu_slack) as new_opt:
        act_up = gpu_lauum(mat, upper=True, overwrite=False,
                           write_opposite=True, opt=new_opt)
        act_up = act_up.cpu()
    np.testing.assert_allclose(np.triu(omat, k=1),
                               np.triu(act_up.numpy(), k=1),
                               rtol=self.rtol[dtype])
    np.testing.assert_allclose(np.tril(act_up.numpy()),
                               np.triu(expected_upper).T,
                               rtol=self.rtol[dtype])

    # Lower factor, overwrite allowed: result goes into the upper triangle.
    mat = get_mat(order=order, dtype=dtype, device=device)
    with memory_checker(self.basic_opt) as new_opt:
        act_lo = gpu_lauum(mat, upper=False, overwrite=True,
                           write_opposite=True, opt=new_opt)
        torch.cuda.synchronize()
        act_lo = act_lo.cpu()
    np.testing.assert_allclose(np.tril(omat, k=-1),
                               np.tril(act_lo.numpy(), k=-1),
                               rtol=self.rtol[dtype])
    np.testing.assert_allclose(np.triu(act_lo.numpy()),
                               np.tril(expected_lower).T,
                               rtol=self.rtol[dtype])
def distk_fmmv(proc_idx, queue, device_id):
    """GPU worker for distance-kernel MMV: ``out = k(dist(X1, X2)) @ v``.

    Builds squared Euclidean distances blockwise from row norms plus a
    matrix product, applies the kernel's scalar transform in-place, and
    multiplies by `v`.
    """
    a: ArgsFmmv = queue.get()
    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem
    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]
    dtype = X1.dtype

    # GPU buffers: X1s n*D, X2s m*D, vs m*T, nm n*m, out n*T
    # => n*m + n*(D + T) + m*(D + T) elements in total.
    avail_mem = max_mem / sizeof_dtype(dtype)
    n, m = select_dim_over_m(maxM=M, maxN=N,
                             coef_nm=1.0, coef_n=D + T, coef_m=D + T,
                             tot=avail_mem)
    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        nm_gpu = create_same_stride((n, m), X1, dtype, ddev)
        out_gpu = create_same_stride((n, T), out, dtype, ddev)
        X1s_gpu = create_same_stride((n, D), X1, dtype, ddev)
        X2s_gpu = create_same_stride((m, D), X2, dtype, ddev)
        vs_gpu = create_same_stride((m, T), v, dtype, ddev)

        for i in range(0, N, n):
            nb = min(n, N - i)
            cur_X1s = copy_to_device_noorder(nb, D, X1, i, 0, X1s_gpu, 0, 0)
            sq1 = torch.norm(cur_X1s, p=2, dim=1, keepdim=True).pow_(2)
            cur_out = out_gpu.narrow(0, 0, nb)  # n x T
            cur_out.fill_(0.0)
            for j in range(0, M, m):
                mb = min(m, M - j)
                cur_X2s = copy_to_device_noorder(mb, D, X2, j, 0, X2s_gpu, 0, 0)
                cur_vs = copy_to_device_noorder(mb, T, v, j, 0, vs_gpu, 0, 0)  # m x T
                cur_nm = nm_gpu[:nb, :mb]  # n x m
                sq2 = torch.norm(cur_X2s, p=2, dim=1, keepdim=True).pow_(2)
                # ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2, clamped to >= 0.
                torch.mm(cur_X1s, cur_X2s.T, out=cur_nm)
                cur_nm.mul_(-2.0)
                cur_nm.add_(sq1)
                cur_nm.add_(sq2.T)
                cur_nm.clamp_min_(0)
                kernel._transform(cur_nm)
                # Multiply by the vector v.
                # FIXME: This is the cause of mapping errors in case of float32 calculations.
                cur_out.addmm_(cur_nm, cur_vs)  # n x T
            # Ship the finished row-block back to the host.
            copy_to_host_noorder(nb, T, out_gpu, 0, 0, out, i, 0)
    return out
def fdmmv_cpu(X1, X2, v, w, kernel, out, opt):
    """Blockwise double kernel-vector product on the CPU.

    Computes ``kernel(X1, X2).T @ (kernel(X1, X2) @ v + w)`` where one of
    `v`, `w` may be None (but not both). All arrays are 2-dimensional,
    although the second dimension may be unitary. The [N, M] kernel matrix
    is never fully materialized: the computation is broken into row blocks
    to bound memory consumption.

    Parameters
    -----------
    X1
        [N, D] array
    X2
        [M, D] array
    v : torch.Tensor or None
        [M, T] array. At least one of v or w must be specified.
    w : torch.Tensor or None
        [N, T] array. At least one of v or w must be specified.
    kernel
        Class representing the desired kernel function
    out : torch.Tensor or None
        [M, T] array for the output; allocated here when None.
    opt
        Basic options dictionary, used for determining available memory.
    """
    opt = _setup_opt(opt, is_cpu=True)
    # Parameter validation
    if v is None and w is None:
        raise ValueError("One of v and w must be specified to run fMMV.")
    T = v.shape[1] if v is not None else w.shape[1]
    ntot, dtot = X1.size()
    M = X2.size(0)
    dtype = X1.dtype

    if out is None:
        out = torch.empty(M, T, dtype=dtype)
    out.fill_(0)

    free_elems = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    # Temporary buffers per row-block: kernel chunk n*M and w-block n*T.
    n, d = select_dim_over_d(maxD=dtot, maxN=ntot,
                             coef_nd=0, coef_n=M + T, coef_d=0,
                             rest=0, tot=free_elems)

    # Blocked computation over row chunks of X1.
    for i0 in range(0, ntot, n):
        ni = min(n, ntot - i0)
        prep = kernel._prepare(X1[i0:i0 + ni, :], X2)
        ker_blk = torch.zeros(ni, M, dtype=dtype)
        for k0 in range(0, dtot, d):
            kd = min(d, dtot - k0)
            kernel._apply(X1[i0:i0 + ni, k0:k0 + kd],
                          X2[:, k0:k0 + kd].T,
                          ker_blk)
        kernel._finalize(ker_blk, prep)  # fnc(X1*X2', X1, X2) [n x M]

        w_blk = torch.zeros(ni, T, dtype=dtype)  # n x T
        if w is not None:
            w_blk.copy_(w[i0:i0 + ni, :])
        if v is not None:
            # w_blk += ker_blk @ v : (n x T) + (n x M)*(M x T)
            w_blk.addmm_(ker_blk, v)
        out.add_(torch.mm(ker_blk.T, w_blk))
    return out
def _sparse_fmm(proc_idx, queue, device_id):
    """Worker process: compute a slice of the sparse kernel matrix on one GPU.

    Pulls an `ArgsFmm` bundle from `queue`, tiles the [N, M] output into
    (n, m) blocks sized to the device memory budget, and writes each
    computed block back into the host-side `out` tensor.
    """
    a: ArgsFmm = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem
    ntot, dtot = X1.shape
    mtot = X2.size(0)

    avail_mem = max_mem / sizeof_dtype(gpu_dtype)
    # Memory usage:
    # X1_chunk : ntot + 2 * D * ntot * density
    # X2_chunk : dtot + 2 * D * mtot * density (because is transposed)
    # sparse_out : ntot + 2 * ntot * mtot * density (assume density=1 here)
    # ker_gpu : mtot * ntot
    n, m = select_dim_over_nm_v2(max_n=ntot, max_m=mtot, coef_nm=3,
                                 coef_n=2 + 2 * dtot * X1.density,
                                 coef_m=2 * dtot * X2.density,
                                 rest=dtot, max_mem=avail_mem)

    tc_device = torch.device('cuda:%d' % (int(device_id)))
    with torch.cuda.device(tc_device):
        # Initialize GPU buffers
        g_out = create_same_stride((n, m), out, gpu_dtype, tc_device)
        cpu_buf = None
        if X1.dtype != gpu_dtype:
            # Host staging buffer, needed to convert dtype before writing `out`.
            cpu_buf = create_same_stride((n, m), out, gpu_dtype, 'cpu', pin_memory=True)

        for j in range(0, mtot, m):
            jc = min(m, mtot - j)
            X2_chunk = X2.narrow_rows(j, jc).to(dtype=gpu_dtype)
            # Transpose X2 on the host (CSC -> scipy -> CSR round-trip)
            # before uploading to the device.
            X2_chunk_d = SparseTensor.from_scipy(
                X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                .index_to_int() \
                .to(device=tc_device)
            for i in range(0, ntot, n):
                ic = min(n, ntot - i)
                X1_chunk = X1.narrow_rows(i, ic).to(dtype=gpu_dtype)
                X1_chunk_d = X1_chunk.index_to_int().to(device=tc_device)

                cur_g_out = g_out.narrow(0, 0, ic).narrow(1, 0, jc)
                cur_g_out.fill_(0.0)

                # NOTE: `_prepare_sparse` runs on the host chunks while
                # `_apply_sparse` runs on the device copies.
                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)
                cur_g_out = kernel._apply_sparse(X1_chunk_d, X2_chunk_d, cur_g_out)
                cur_g_out = kernel._finalize(cur_g_out, ddd)
                copy_to_host_noorder(ic, jc, cur_g_out, 0, 0, out, i, j, cpu_buf)
                # Free the device-side chunk eagerly to keep memory bounded.
                del ddd, X1_chunk_d, X1_chunk
            del X2_chunk, X2_chunk_d
    del g_out
    return out
def _generic_fmm(proc_idx, queue, device_id):
    """Worker: compute a tile of ``kernel(X1, X2)`` on one GPU, multi-stream.

    Reads an `ArgsFmm` bundle from `queue` and fills (a slice of) `a.out`.
    Uses `num_streams` CUDA streams to overlap host<->device copies with
    kernel computation, and handles three configurations: inputs already
    on the GPU, CPU inputs staged through GPU buffers, and dtype
    up-conversion (when `gpu_dtype` differs from the input dtype).
    """
    # Unpack the function arguments
    a: ArgsFmm = queue.get()
    X1: torch.Tensor = a.X1
    X2: torch.Tensor = a.X2
    cuda_inputs = X1.is_cuda
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem
    num_streams = a.num_streams

    # flags and local variables
    change_dtype = gpu_dtype != X1.dtype
    # True when X1 and X2 share storage: enables the symmetric-output shortcut.
    X1_equal_X2 = _gpu_tns_same_memory(X1, X2)
    use_gpu_bufs = change_dtype or not cuda_inputs
    stride = "F" if is_f_contig(out, strict=True) else "C"
    j_iter = 0
    dts = sizeof_dtype(gpu_dtype)
    tc_device = torch.device('cuda:%d' % (int(device_id)))
    avail_mem = max_mem / dts

    # Choose block sizes n, m such that we won't run out of GPU memory
    ntot, d = X1.shape
    mtot = X2.shape[0]
    extra_mem = kernel.extra_mem()
    if cuda_inputs and not change_dtype:
        # No allocation will be performed by us. Only in-kernel stuff.
        n, m = select_dim_over_nm(max_n=ntot, max_m=mtot, d=d,
                                  coef_nd=extra_mem.get('nd', 0),
                                  coef_md=extra_mem.get('md', 0),
                                  coef_nm=extra_mem.get('nm', 0),
                                  coef_n=extra_mem.get('n', 0),
                                  coef_m=extra_mem.get('m', 0),
                                  rest=extra_mem.get('d', 0),
                                  max_mem=avail_mem)
    else:
        # The +1 on the nd/md/nm coefficients accounts for our own gX1/gX2/gout
        # buffers, replicated over `num_streams` streams.
        n, m = select_dim_over_nm(
            max_n=ntot, max_m=mtot, d=d,
            coef_nd=num_streams * (extra_mem.get('nd', 0) + 1),
            coef_md=num_streams * (extra_mem.get('md', 0) + 1),
            coef_nm=num_streams * (extra_mem.get('nm', 0) + 1),
            coef_n=extra_mem.get('n', 0),
            coef_m=extra_mem.get('m', 0),
            rest=extra_mem.get('d', 0),
            max_mem=avail_mem)

    # Create streams
    streams = [tcd.Stream(device=tc_device) for _ in range(num_streams)]

    # Create buffers
    if use_gpu_bufs:
        gX1 = create_same_stride((n, d), X1, gpu_dtype, tc_device)
        gX2_list = [
            create_same_stride((m, d), X2, gpu_dtype, tc_device)
            for _ in range(num_streams)
        ]
        gout_list = [
            create_same_stride((n, m), out, gpu_dtype, tc_device)
            for _ in range(num_streams)
        ]
    if not cuda_inputs:
        cpu_buf_list = [
            create_same_stride((n, m), out, gpu_dtype, 'cpu', pin_memory=True)
            for _ in range(num_streams)
        ]

    # Define helpers for the copy-back operations (from cpu_buf to output)
    copy_ops = [None] * num_streams

    def wrap_copy_op(stream_idx):
        # Run (and clear) the deferred copy-back registered for this stream.
        if copy_ops[stream_idx] is not None:
            copy_ops[stream_idx]()
            copy_ops[stream_idx] = None

    def do_copy_op(output, buf, i_, ic_, j_, jc_):
        # This function will also do the type conversion
        output[i_:i_ + ic_, j_:j_ + jc_].copy_(buf[:ic_, :jc_])

    # Kernel computation begin
    with tcd.device(tc_device):
        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            # Upload the X1 row-block once per outer iteration.
            with tcd.stream(streams[j_iter % len(streams)]):
                X1_chunk = X1.narrow(0, i, ic)
                if use_gpu_bufs:
                    cur_gX1 = gX1.narrow(0, 0, ic)
                    cur_gX1.copy_(X1_chunk, non_blocking=True)
                else:
                    cur_gX1 = X1_chunk

            for j in range(0, mtot, m):
                jc = min(m, mtot - j)
                # Choose the buffers for this inner iteration
                stream_id = j_iter % len(streams)
                stream = streams[stream_id]
                if use_gpu_bufs:
                    gX2 = gX2_list[stream_id]
                    gout = gout_list[stream_id]
                if not cuda_inputs:
                    cpu_buf = cpu_buf_list[stream_id]

                # Sync for buffers we must use now (e.g. 2 previous iters)
                with tcd.stream(stream):  # Inner-loop
                    stream.synchronize()
                    wrap_copy_op(stream_id)

                    if X1_equal_X2 and j < i:  # Shortcut for symmetric kernels
                        jc = min(m, mtot - j)
                        out[i:i + ic, j:j + jc].copy_(out[j:j + jc, i:i + ic].T,
                                                      non_blocking=True)
                        j_iter += 1
                        continue

                    # Copy (CPU->GPU)
                    X2_chunk = X2.narrow(0, j, jc)
                    if use_gpu_bufs:
                        cur_gX2 = gX2.narrow(0, 0, jc)
                        cur_gX2.copy_(X2_chunk, non_blocking=True)
                    else:
                        cur_gX2 = X2_chunk

                    if use_gpu_bufs:
                        cur_gout = gout[:ic, :jc]
                    else:
                        cur_gout = out[i:i + ic, j:j + jc]
                    cur_gout.fill_(0.0)

                    # Compute
                    ddd = kernel._prepare(cur_gX1, cur_gX2)
                    kernel._apply(cur_gX1, cur_gX2.T, cur_gout)
                    cur_gout = kernel._finalize(cur_gout, ddd)

                    # Copy Back (GPU->CPU)
                    if not cuda_inputs:
                        # copy_ does not care about the contiguity of copies, as long as it's consistent
                        # however, in case of C-contiguous inputs it will create an intermediate array
                        # which is undesired. We use cuda_memcpy2d_async which works well with C-contiguous
                        # arrays.
                        if stride == "F":
                            copy_to_host(ic, jc, cur_gout, 0, 0, cpu_buf, 0, 0, s=stream)
                        else:
                            cuda_memcpy2d_async(dst=cpu_buf.data_ptr(),
                                                dpitch=cpu_buf.stride(0) * dts,
                                                src=cur_gout.data_ptr(),
                                                spitch=cur_gout.stride(0) * dts,
                                                width=jc * dts,
                                                height=ic,
                                                stream=stream._as_parameter_)
                        # Defer the pinned-buffer -> `out` copy to this stream's
                        # next synchronization point.
                        copy_ops[stream_id] = partial(do_copy_op, out, cpu_buf,
                                                      i, ic, j, jc)
                    elif change_dtype:
                        # GPU inputs accumulated at higher precision: down-cast
                        # copy straight into the final output.
                        out.narrow(0, i, ic).narrow(1, j, jc).copy_(cur_gout, non_blocking=True)
                    j_iter += 1

        # Drain: wait for every stream and flush its pending copy-back.
        for i in range(num_streams):
            streams[i].synchronize()
            wrap_copy_op(i)
    return out
def par_lauum_c_lower(A: torch.Tensor, block_allocs: List[BlockAlloc],
                      my_rows: List[int], barrier: threading.Barrier,
                      device_id: int, cublas_handle, independent_output: bool):
    """Worker for the out-of-core parallel LAUUM on a C-contiguous matrix.

    Each worker thread owns one CUDA device and processes the block-rows
    listed in `my_rows`. All workers share the matrix `A` (data in the
    lower triangle) and the tiling described by `block_allocs`.

    Args:
        A: Full square matrix shared by all worker threads.
        block_allocs: Tiling of `A` into contiguous block rows/columns.
        my_rows: Block-row indices assigned to this worker.
        barrier: Inter-worker synchronization point. Only used when
            `independent_output` is False, since in that mode workers
            overwrite blocks that other workers still need to read.
        device_id: Index of the CUDA device this worker runs on.
        cublas_handle: cuBLAS handle; bound to stream `s1` below so all
            cuBLAS calls are ordered on that stream.
        independent_output: If True, each result block is written
            transposed into the opposite triangle `A[b, r]`; if False it
            is written in place at `A[r, b]`.
    """
    N = A.shape[0]
    dts = sizeof_dtype(A.dtype)
    # Select the float32/float64 variant of each LAPACK / cuBLAS routine.
    lauum_fn = choose_fn(A.dtype, scll.dlauum, scll.slauum, "Lapack LAUUM")
    trmm_fn = choose_fn(A.dtype, cublasDtrmm, cublasStrmm, "cuBlas TRMM")
    gemm_fn = choose_fn(A.dtype, cublasDgemm, cublasSgemm, "cuBlas GEMM")
    syrk_fn = choose_fn(A.dtype, cublasDsyrk, cublasSsyrk, "cuBlas SYRK")

    tc_device = torch.device('cuda:%d' % (device_id))
    s1 = torch.cuda.Stream(device=tc_device)
    s2 = torch.cuda.Stream(device=tc_device)
    s1_cuda, s2_cuda = s1._as_parameter_, s2._as_parameter_
    # All cuBLAS work goes on s1; s2 is only used for one copy-back below.
    cublasSetStream(cublas_handle, s1_cuda)

    max_block_size = max(ba.length for ba in block_allocs)
    my_rows = sorted(my_rows)

    with torch.cuda.device(tc_device), torch.cuda.stream(s1):
        # Preallocate 2 block-columns on the GPU. The single block
        # (`temp_bb`) is a pinned CPU buffer used for staged copy-backs.
        whole_col_b = create_fortran((A.shape[0] * max_block_size, ), A.dtype, tc_device)
        whole_col_r = create_fortran((A.shape[0] * max_block_size, ), A.dtype, tc_device)
        temp_bb = create_fortran((max_block_size, max_block_size), A.dtype, 'cpu',
                                 pin_memory=True).T

        for b in range(len(block_allocs)):
            bb = block_allocs[b]
            # Load col b.
            # Instead of loading the whole column only load the last rows
            # as necessary by inspecting the minimum value in my_rows which is >= b.
            try:
                min_row = min([r for r in my_rows if r >= b])
                b_start = block_allocs[min_row].start
                # Strided device copy: A is C-contiguous, the GPU column
                # buffer is F-ordered with pitch max_block_size.
                cuda_memcpy2d_async(
                    dst=whole_col_b.data_ptr(), dpitch=max_block_size * dts,
                    src=A[b_start, bb.start].data_ptr(), spitch=A.shape[1] * dts,
                    width=bb.length * dts, height=N - b_start, stream=s1_cuda)
            except ValueError:
                # all of `my_rows` are smaller than `b`: nothing to load.
                # (The inner loop below will then `continue` every row, so
                # the stale `b_start` is never read.)
                pass
            if not independent_output:
                # Workers write into blocks of A that other workers read in
                # later iterations; wait until everyone has loaded column b.
                barrier.wait()

            for r in my_rows:
                if r < b:
                    continue
                if r == b:
                    # Diagonal block: SYRK of the sub-column + CPU LAUUM.
                    is_last_row = b_start + bb.length == N
                    # SYRK on g_b[bb.length:, :] with output replacing g_b[:bb.length, :]
                    # C = beta*C + alpha * op(A) @ op(A).T
                    if not is_last_row:
                        syrk_fn(cublas_handle, uplo='U', trans='N',
                                n=bb.length, k=N - b_start - bb.length,
                                alpha=1.0,
                                A=whole_col_b[bb.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                beta=0.0, C=whole_col_b.data_ptr(), ldc=max_block_size)
                    # Run LAUUM on CPU on Abb.T (transpose because LAPACK works in F-order)
                    # Result will be on upper(uu). So if we copy back to lower(A), we must copy
                    # back uu.T -- otherwise we should copy back uu directly.
                    Abb = A[bb.start:bb.end, bb.start:bb.end]
                    if independent_output:
                        Abb_np = Abb.T.numpy().copy(order="F")  # U\L
                        copy_triang(Abb_np, upper=True)  # L\L
                        uu, info = lauum_fn(Abb_np, lower=1, overwrite_c=True)  # LAU\L
                        Abb.copy_(torch.from_numpy(uu.T))  # L \ LAU
                    else:
                        uu, info = lauum_fn(Abb.T.numpy(), lower=0, overwrite_c=False)
                        # Zeroing must happen if the SYRK output is to be added: otherwise the
                        # non-processed part of Abb (i.e. upper(Abb) if not independent_output)
                        # will be multiplied by 2.
                        if not is_last_row:
                            zero_triang(uu, upper=False)
                        Abb.copy_(torch.from_numpy(uu.T))
                    if not is_last_row:
                        # Stage the SYRK result through the pinned CPU buffer
                        # and add it onto the LAUUM output.
                        cuda_memcpy2d_async(
                            dst=temp_bb.data_ptr(), dpitch=max_block_size * dts,
                            src=whole_col_b.data_ptr(), spitch=max_block_size * dts,
                            width=bb.length * dts, height=bb.length, stream=s1_cuda)
                        s1.synchronize()  # TODO: Check if failure when this commented out.
                        if independent_output:
                            Abb.add_(torch.triu(temp_bb[:bb.length, :bb.length].T))
                        else:
                            Abb.add_(temp_bb[:bb.length, :bb.length])
                else:  # r > b
                    br = block_allocs[r]
                    # Load column r. Since r > b this column will be shorter than column b
                    cuda_memcpy2d_async(
                        dst=whole_col_r.data_ptr(), dpitch=max_block_size * dts,
                        src=A[br.start, br.start].data_ptr(), spitch=A.shape[1] * dts,
                        width=br.length * dts, height=N - br.start, stream=s1_cuda)
                    #s1.synchronize()
                    # Restrict column b to only the last 'r' rows
                    ccb = whole_col_b[(br.start - b_start) * max_block_size:]
                    # TRMM on g_r[0:br.length, :] which is triangular (r*r)
                    # and cur_g_b[0:br.length, :]
                    # output is a r*b matrix and should be stored in a separate g_out block
                    # Could store output in the first rows of g_b
                    # C = alpha * op(A) @ B -- A triangular
                    trmm_fn(handle=cublas_handle, side='R', uplo='U', trans='T', diag='N',
                            m=bb.length, n=br.length, alpha=1.0,
                            A=whole_col_r.data_ptr(), lda=max_block_size,
                            B=ccb.data_ptr(), ldb=max_block_size,
                            C=ccb.data_ptr(), ldc=max_block_size)
                    # GEMM on g_r[br.length:, :].T and cur_g_b[bb.length:, :]
                    # output is the same r*b matrix as before, outputs need to be summed.
                    # C = alpha * op(A) @ op(B) + beta * C
                    if br.end < N:
                        gemm_fn(handle=cublas_handle, transa='N', transb='T',
                                m=bb.length, n=br.length, k=N - br.start - br.length,
                                alpha=1.0,
                                A=ccb[br.length * max_block_size:].data_ptr(),
                                lda=max_block_size,
                                B=whole_col_r[br.length * max_block_size:].data_ptr(),
                                ldb=max_block_size,
                                beta=1.0, C=ccb.data_ptr(), ldc=max_block_size)
                    # Copy back to A[r, b]
                    if independent_output:
                        # Copy must be transposed, copy to temp_bb first.
                        cublasGetMatrixAsync(
                            rows=bb.length, cols=br.length, elem_size=dts,
                            A=ccb.data_ptr(), lda=max_block_size,
                            B=temp_bb.data_ptr(), ldb=max_block_size,
                            stream=s1_cuda)
                        s1.synchronize()
                        A[bb.start:bb.end, br.start:br.end].copy_(
                            temp_bb[:br.length, :bb.length].T)
                    else:
                        # Wait for compute on s1, then copy back on s2.
                        s1.synchronize()
                        cublasGetMatrixAsync(
                            rows=bb.length, cols=br.length, elem_size=dts,
                            A=ccb.data_ptr(), lda=max_block_size,
                            B=A[br.start, bb.start].data_ptr(), ldb=A.shape[0],
                            stream=s2_cuda)
                        s2.synchronize()
def _parallel_lauum_runner(A, write_opposite: bool, gpu_info):
    """Orchestrate the parallel out-of-core LAUUM over the blocks of `A`.

    Chooses the worker implementation based on the memory layout of `A`,
    computes a block size that fits the available GPU memory, deals the
    block-rows out to one thread per GPU, and runs the workers to
    completion.

    Args:
        A: Square matrix to process (CPU or single-GPU tensor).
        write_opposite: Passed through to the workers; when True results
            are written into the opposite triangle of `A`.
        gpu_info: Device descriptors with `.Id` and `.actual_free_mem`.

    Returns:
        `A`, modified in place.
    """
    # Select the worker matching the storage order of A.
    if is_f_contig(A):
        target = par_lauum_f_lower
    elif is_contig(A):
        target = par_lauum_c_lower
    else:
        raise NotImplementedError(
            "Parallel LAUUM is only implemented for contiguous matrices")

    N = A.shape[0]
    dt = A.dtype
    dts = sizeof_dtype(dt)

    if A.is_cuda:
        sync_current_stream(A.device)
        # Only the device which already holds A may participate.
        gpu_info = [g for g in gpu_info if g.Id == A.device.index]
        free_elems = gpu_info[0].actual_free_mem / dts
        if target.__name__ == "par_lauum_f_lower":
            # Two extra blocks plus one full column must fit:
            # 2*B^2 + B*N - M <= 0, solved for B.
            bound = (-N + math.sqrt(N ** 2 + 8 * free_elems)) / 4
        else:
            # Same RAM requirements as the out-of-core version.
            bound = (-2 * N + math.sqrt(4 * N ** 2 + 8 * free_elems)) / 4
        max_block_size = int(math.floor(bound))
        if max_block_size < 1:
            raise RuntimeError("Cannot run parallel LAUUM with minimum "
                               "available memory of %.2fMB" % (free_elems * dts / 2 ** 20))
        # Data already lives on one device: no multi-GPU support here.
        block_sizes = calc_block_sizes3(max_block_size, 1, N)
    else:
        free_elems = min(g.actual_free_mem for g in gpu_info) / dts
        # Each GPU holds two block-columns plus two blocks
        # (2*B^2 + 2*B*N - M <= 0). One additional block is needed when
        # write_opposite is True, due to copying blocks between matrices
        # with different strides.
        if write_opposite:
            bound = (-2 * N + math.sqrt(4 * N ** 2 + 12 * free_elems)) / 6
        else:
            bound = (-2 * N + math.sqrt(4 * N ** 2 + 8 * free_elems)) / 4
        max_block_size = int(math.floor(bound))
        if max_block_size < 1:
            raise RuntimeError("Cannot run parallel LAUUM with minimum "
                               "available memory of %.2fMB" % (free_elems * dts / 2 ** 20))
        block_sizes = calc_block_sizes3(max_block_size, len(gpu_info), N)

    # Describe the tiling of A as half-open [start, end) block ranges.
    block_allocations: List[BlockAlloc] = []
    offset = 0
    for size in block_sizes:
        block_allocations.append(BlockAlloc(start=offset, end=offset + size, length=size))
        offset += size

    num_gpus = len(gpu_info)
    if num_gpus < 1:
        raise ValueError("Parallel LAUUM can only run when a GPU is available.")
    barrier = threading.Barrier(num_gpus, timeout=1000)

    # One worker thread per GPU. Block-rows are dealt out round-robin by
    # enumeration index rather than by g.Id, since device IDs need not be
    # a contiguous range starting at 0.
    threads = []
    for worker_idx, g in enumerate(gpu_info):
        assigned_rows = [blk for blk in range(len(block_allocations))
                         if blk % num_gpus == worker_idx]
        cublas_handle = initialization.cublas_handle(g.Id)
        if cublas_handle is None:
            raise RuntimeError("CUBLAS must be initialized "
                               "on device %d before running parallel LAUUM." % (g.Id))
        threads.append(PropagatingThread(
            target=target, name="GPU-%d" % (g.Id),
            args=(A, block_allocations, assigned_rows, barrier, g.Id,
                  cublas_handle, write_opposite)))

    for t in threads:
        t.start()
    for t in threads:
        t.join()
    return A
def sparse_fmmv(proc_idx, queue, device_id):
    """GPU worker computing the blocked kernel-vector product for sparse inputs.

    Pops an `ArgsFmmv` bundle from `queue` and accumulates
    `out[i-block] = kernel(X1[i-block], X2) @ v` one (n x m) kernel tile at
    a time on device `device_id`, copying each finished row-block back to
    the (CPU) output.

    Returns:
        The `out` tensor from the argument bundle, filled with the result.
    """
    a: ArgsFmmv = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, out = a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    ntot, dtot = X1.shape
    mtot, T = v.size()

    # Budget in number of elements of `dtype`.
    avail_mem = max_mem / sizeof_dtype(dtype)
    # Memory needs:
    # X1_chunk : N + 2*D*N*density
    # X2_chunk : D + 2*D*M*density (because is transposed)
    # sparse_out : N + 2*N*M*(density) (assume density = 1)
    # ker_gpu  : M*N
    # mmv_gpu  : N*T
    # v_gpu    : M*T
    # Other: GPU buffer
    n, m = select_dim_over_m(
        maxM=mtot, maxN=ntot, tot=avail_mem,
        coef_nm=3, coef_n=2 + 2 * dtot * X1.density + T,
        coef_m=2 * dtot * X2.density + T, rest=dtot)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        v_gpu = v.to(device=ddev)  # M x T
        mmv_gpu = create_same_stride((n, T), out, dtype, ddev)
        # ker_gpu should be fortran-ordered due to cusparse csr2dense function
        ker_gpu = create_fortran((n, m), dtype=dtype, device=ddev)

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)
            cur_mmv_gpu = mmv_gpu[:ic]  # n x T accumulator for this row-block
            cur_mmv_gpu.fill_(0.0)
            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)
            for j in range(0, mtot, m):
                jc = min(m, mtot - j)
                X2_chunk = X2.narrow_rows(j, jc)
                # Prepare sparse on CPU
                ddd = kernel._prepare_sparse(X1_chunk, X2_chunk)
                # Transpose X2-chunk and convert it to CSR. This uses lots of RAM
                X2_chunk_d = SparseTensor.from_scipy(
                    X2_chunk.transpose_csc().to_scipy().tocsr(copy=False)) \
                    .index_to_int() \
                    .to(device=ddev)
                cur_ker_gpu = ker_gpu[:ic, :jc]
                cur_ker_gpu.fill_(0.0)
                # Run the matrix multiplication (kernel apply)
                cur_ker_gpu = kernel._apply_sparse(X1_chunk_d, X2_chunk_d, cur_ker_gpu)
                cur_ker_gpu = kernel._finalize(cur_ker_gpu, ddd)
                # Multiply by the vector v: accumulate K-tile @ v[j-block].
                cur_mmv_gpu.addmm_(cur_ker_gpu, v_gpu.narrow(0, j, jc))
                # Free the (large) per-tile intermediates eagerly.
                del ddd, X2_chunk, X2_chunk_d
            # send result to CPU
            copy_to_host_noorder(ic, T, cur_mmv_gpu, 0, 0, out, i, 0)
            del X1_chunk, X1_chunk_d
    return out
def distk_fdmmv(proc_idx, queue, device_id):
    """GPU worker for the double kernel-vector product with a distance kernel.

    Pops an `ArgsFdmmv` bundle from `queue` and computes
    `out = K.T @ (K @ v + w)` block-wise on device `device_id`, where the
    kernel tile K is built explicitly from squared Euclidean distances
    (||x1||^2 + ||x2||^2 - 2*x1@x2.T, clamped at 0) and then passed
    through `kernel._transform`.

    Returns:
        The `out` tensor from the argument bundle, filled with the result.
    """
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem
    N, D = X1.size()
    M = X2.size(0)
    T = v.size(1) if v is not None else w.size(1)
    dtype = X1.dtype

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1ss : n x d
    # X2s  : M x d
    # Kv   : n x T
    # out  : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where if we let avail_mem like it is
    # for 32-bit data-types some copy fails. In such case we need
    # to free up some more memory and then everything runs fine.
    # NOTE(review): the FIXME above describes an adjustment (generic_fdmmv
    # halves avail_mem for 4-byte dtypes) that is not actually applied
    # here -- confirm whether that was intentional.
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_d(maxD=D, maxN=N,
                             coef_nd=1, coef_n=M + T + 1, coef_d=M,
                             rest=rest_coef + M, tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    s1 = tcd.Stream()
    s2 = tcd.Stream()

    with tcd.device(ddev), tcd.stream(s1):
        if v is not None:
            v_gpu = create_same_stride((M, T), v, dtype, ddev)
            copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
        # Fixed-size GPU scratch buffers, reused across blocks.
        K_gpu = create_same_stride((n, M), X1, dtype, ddev)
        X1ss_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        Kv_gpu = create_same_stride((n, T), X1, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        # Squared row norms of the current X1/X2 feature chunks.
        sq1_gpu = create_same_stride((n, ), X1, dtype, ddev)
        sq2_gpu = create_same_stride((M, ), X1, dtype, ddev)

        #if (d == D):
        #    with torch.cuda.stream(s2):
        #        cur_X2s_gpu = copy_to_device_noorder(M, d, X2, 0, 0, X2s_gpu, 0, 0, s=s2)
        #        torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True, out=sq2_gpu).pow_(2)

        for i in range(0, N, n):
            nb = min(N - i, n)

            cur_K_gpu = K_gpu.narrow(0, 0, nb)  # nb x M
            cur_K_gpu.fill_(0.0)

            for j in range(0, D, d):
                db = min(D - j, d)
                # Parallelize two matrix transfers (probably pointless)
                #if d < D:
                # X2 chunk transfer + its squared norms run on s2, overlapped
                # with the X1 transfer on s1; both are joined before addmm_.
                with torch.cuda.stream(s2):
                    cur_X2s_gpu = copy_to_device_noorder(M, db, X2, 0, j, X2s_gpu, 0, 0, s=s2)
                    torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True, out=sq2_gpu).pow_(2)
                cur_X1ss_gpu = copy_to_device_noorder(nb, db, X1, i, j, X1ss_gpu, 0, 0, s=s1)
                torch.norm(cur_X1ss_gpu, p=2, dim=1, keepdim=True, out=sq1_gpu).pow_(2)
                s2.synchronize()
                s1.synchronize()
                # Accumulate -2 * X1 @ X2.T, then add the two norm vectors:
                # K_ij = ||x1_i||^2 + ||x2_j||^2 - 2 <x1_i, x2_j>.
                cur_K_gpu.addmm_(mat1=cur_X1ss_gpu, mat2=cur_X2s_gpu.T, alpha=-2.0)
                cur_K_gpu.add_(sq1_gpu)
                # NOTE(review): sq2_gpu is created 1-D, so `.T` here is a
                # no-op transpose; presumably broadcasting over rows is the
                # intent -- confirm against the norm `keepdim=True` output.
                cur_K_gpu.add_(sq2_gpu.T)
                # Guard against small negative values from floating-point error.
                cur_K_gpu.clamp_min_(0)
            cur_K_gpu = kernel._transform(cur_K_gpu)

            if w is not None:
                # Copy split w to GPU into cur_Kv_gpu,
                cur_Kv_gpu = copy_to_device_noorder(nb, T, w, i, 0, Kv_gpu, 0, 0, s=s1)  # n x T
                if v is not None:
                    cur_Kv_gpu.addmm_(cur_K_gpu, v_gpu)
            else:
                # v cannot be None if w is None
                cur_Kv_gpu = Kv_gpu.narrow(0, 0, nb)  # n x T
                torch.mm(cur_K_gpu, v_gpu, out=cur_Kv_gpu)  # n x T

            # Multiply transposed kernel with the Kv result.
            out_gpu.addmm_(cur_K_gpu.T, cur_Kv_gpu)  # M x T
            s1.synchronize()
        s1.synchronize()

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
def sparse_fdmmv(proc_idx, queue, device_id):
    """GPU worker for the double kernel-vector product with sparse inputs.

    Pops an `ArgsFdmmv` bundle from `queue` and computes
    `out = K.T @ (K @ v + w)` block-wise on device `device_id`, where each
    K row-block is built with the kernel's sparse apply path. `X2` is
    transposed/converted once up front and kept on the GPU for all blocks.

    Returns:
        The `out` tensor from the argument bundle, filled with the result.
    """
    a: ArgsFdmmv = queue.get()
    X1: SparseTensor = a.X1
    X2: SparseTensor = a.X2
    v, w, out = a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    N, D = X1.shape
    M = X2.size(0)
    # At least one of v, w is set (T is the number of output columns).
    if v is None:
        T = w.size(1)
    else:
        T = v.size(1)

    # Memory needs:
    # X1_chunk : ntot + 2 * D * ntot * density
    # X2       : dtot + 2 * D * M * density (because is transposed)
    # sparse_out : ntot + 2 * ntot * M * density (assume here density = 1)
    # ker_gpu  : M * ntot
    # w_gpu    : ntot * T
    # v_gpu    : M * T
    # out_gpu  : M * T
    avail_mem = max_mem / sizeof_dtype(dtype)
    # Solve the linear memory budget for the row-block size n.
    den = 2 * D * X1.density + 2 + 3 * M + T
    sub = D + 2 * D * M * X2.density + M * T
    if v is not None:
        sub += M * T
    n = (avail_mem - sub) / den
    n = min(int(n), N)
    if n < 1:
        raise MemoryError("Not enough memory to run sparse dfmmv")

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # Initialize GPU data
        w_gpu = create_same_stride((n, T), out, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        ker_gpu = create_fortran((n, M), dtype, ddev)
        if v is not None:
            v_gpu = v.to(device=ddev)  # M x T

        # Transpose X2 and convert to CSR once; reused for every row-block.
        X2_d = SparseTensor.from_scipy(
            X2.transpose_csc().to_scipy().tocsr(copy=False)) \
            .index_to_int() \
            .to(device=ddev)

        for i in range(0, N, n):
            ic = min(n, N - i)
            X1_chunk = X1.narrow_rows(i, ic)
            X1_chunk_d = X1_chunk.index_to_int().to(device=ddev)
            ker_chunk = ker_gpu[:ic]
            ker_chunk.fill_(0.0)
            # TODO: This is wasteful (X2 will be prepared many times over)
            ddd = kernel._prepare_sparse(X1_chunk, X2)
            ker_chunk = kernel._apply_sparse(X1_chunk_d, X2_d, ker_chunk)
            ker_chunk = kernel._finalize(ker_chunk, ddd)

            # Build the (K @ v + w) row-block in w_gpu.
            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)

            if v is not None:
                c_g_w.addmm_(ker_chunk, v_gpu)
            # out += K_block.T @ (K_block @ v + w_block)
            out_gpu.addmm_(ker_chunk.T, c_g_w)
            del ddd, X1_chunk, X1_chunk_d

        if not out.is_cuda:
            # NOTE(review): this GPU->CPU transfer uses copy_to_device_noorder,
            # while sparse_fmmv uses copy_to_host_noorder for the same kind of
            # copy -- presumably the helper handles both directions; confirm.
            copy_to_device_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
def generic_fdmmv(proc_idx, queue, device_id):
    """GPU worker for the double kernel-vector product with dense inputs.

    Pops an `ArgsFdmmv` bundle from `queue` and computes
    `out = K.T @ (K @ v + w)` block-wise on device `device_id`, where each
    kernel row-block K is accumulated over feature-dimension chunks via
    the kernel's prepare/apply/finalize hooks.

    Returns:
        The `out` tensor from the argument bundle, filled with the result.
    """
    args: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = args.X1, args.X2, args.v, args.w, args.out
    kernel, max_mem = args.kernel, args.max_mem
    dtype = X1.dtype
    N, D = X1.size()
    M = X2.size(0)
    # At least one of v, w is set; T is the number of output columns.
    T = w.size(1) if v is None else v.size(1)

    # Per-block GPU memory usage (in elements):
    #   v   : M x T      K    : n x M
    #   X1d : n x d      X2d  : M x d
    #   Kv  : n x T      out2 : M x T
    #   sq1 : n x 1      sq2  : M x 1
    # total : n*d + M*d + n*(M + T) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where if we let avail_mem like it is
    # for 32-bit data-types some copy fails. In such case we need
    # to free up some more memory and then everything runs fine.
    if sizeof_dtype(dtype) == 4:
        avail_mem /= 2
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_d(maxD=D, maxN=N,
                             coef_nd=1, coef_n=M + T + 1, coef_d=M,
                             rest=rest_coef + M, tot=avail_mem)

    ddev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(ddev):
        # GPU-side scratch buffers, allocated once and reused per block.
        ker_gpu = create_same_stride((n, M), out, dtype=dtype, device=ddev)
        X1s_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        w_gpu = create_same_stride((n, T), ker_gpu, dtype, ddev)
        out_gpu = out if out.is_cuda else create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        if v is not None:
            v_gpu = v.to(device=ddev)  # M x T

        for row in range(0, N, n):
            n_batch = min(n, N - row)

            # Build K[row : row + n_batch, :], chunking over features.
            ddd = kernel._prepare(X1.narrow(0, row, n_batch), X2)
            ker_blk = ker_gpu.narrow(0, 0, n_batch)
            ker_blk.fill_(0.0)
            for col in range(0, D, d):
                d_batch = min(d, D - col)
                X1_blk = copy_to_device_noorder(n_batch, d_batch, X1, row, col, X1s_gpu, 0, 0)
                X2_blk = copy_to_device_noorder(M, d_batch, X2, 0, col, X2s_gpu, 0, 0)
                kernel._apply(X1_blk, X2_blk.T, ker_blk)
            kernel._finalize(ker_blk, ddd)

            # w_blk holds w[row-block] (or zeros), then w_blk += K_blk @ v.
            if w is not None:
                w_blk = copy_to_device_noorder(n_batch, T, w, row, 0, w_gpu, 0, 0)
            else:
                w_blk = w_gpu.narrow(0, 0, n_batch)
                w_blk.fill_(0.0)
            if v is not None:
                w_blk.addmm_(ker_blk, v_gpu)
            # out += K_blk.T @ w_blk
            out_gpu.addmm_(ker_blk.T, w_blk)

        if not out.is_cuda:
            copy_to_device_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out