def fmmv_cpu_sparse(X1: SparseTensor,
                    X2: SparseTensor,
                    v: torch.Tensor,
                    kernel: 'falkon.kernels.Kernel',
                    out: Optional[torch.Tensor],
                    opt: BaseOptions):
    """Blocked kernel-matrix times vector product on the CPU for sparse data.

    For every (row-chunk of X1, row-chunk of X2) pair the kernel block is
    evaluated into a reusable dense scratch buffer through the kernel's
    ``_prepare_sparse`` / ``_apply_sparse`` / ``_finalize`` hooks, multiplied
    by the matching rows of ``v`` and accumulated into ``out``.

    Parameters
    ----------
    X1 : SparseTensor
        Left operand; ``X1.size()`` must unpack into two values, the first
        being the number of output rows.
    X2 : SparseTensor
        Right operand; narrowed by rows using the same offsets as ``v``
        (presumably it has as many rows as ``v`` -- not checked here).
    v : torch.Tensor
        2-D tensor; its rows are consumed chunk by chunk.
    kernel : falkon.kernels.Kernel
        Object providing ``_prepare_sparse``, ``_apply_sparse`` and
        ``_finalize``.
    out : torch.Tensor or None
        Destination. When ``None`` a new tensor of shape
        ``(X1.size()[0], v.size()[1])`` with ``X1.dtype`` is allocated.
        In both cases it is zero-filled before accumulation.
    opt : BaseOptions
        Options, normalised through ``_setup_opt(opt, is_cpu=True)``.

    Returns
    -------
    torch.Tensor
        ``out`` holding the accumulated product.
    """
    opt = _setup_opt(opt, is_cpu=True)

    dtype = X1.dtype
    num_rows, _ = X1.size()
    num_centers, num_targets = v.size()

    # Make sure there is a zeroed destination to accumulate into.
    if out is None:
        out = torch.empty(num_rows, num_targets, dtype=dtype)
    out.fill_(0.0)

    # Memory budget expressed in number of elements of `dtype`.
    # Accounting used for the block-size selection:
    #   narrowing X1, X2 : n + m
    #   prepare          : not computable, depends on the kernel
    #   kernel block     : n * m
    #   finalize         : 0 (if it can be done in place; kernel-dependent)
    budget = _get_cpu_ram(opt, 0.95) / sizeof_dtype(dtype)
    row_step, col_step = select_dim_over_m(
        maxN=num_rows, maxM=num_centers,
        coef_nm=1, coef_n=1, coef_m=1,
        tot=budget)

    # Dense scratch buffer, reused for every kernel block.
    kernel_buf = create_same_stride((row_step, col_step), out, dtype, device='cpu')

    for row_start in range(0, num_rows, row_step):
        rows = min(row_step, num_rows - row_start)
        out_rows = out[row_start:row_start + rows, :]
        x1_rows = X1.narrow_rows(row_start, rows)

        for col_start in range(0, num_centers, col_step):
            cols = min(col_step, num_centers - col_start)
            x2_rows = X2.narrow_rows(col_start, cols)

            # View on the part of the scratch buffer used by this block.
            block = kernel_buf.narrow(0, 0, rows).narrow(1, 0, cols)
            block.fill_(0.0)

            # Return values of the hooks are not used here: the block is
            # expected to be updated in place.
            prepared = kernel._prepare_sparse(x1_rows, x2_rows)
            kernel._apply_sparse(x1_rows, x2_rows.transpose_csc(), block)
            kernel._finalize(block, prepared)

            # out_rows += block @ v[col_start : col_start + cols]
            out_rows.addmm_(block, v[col_start:col_start + cols])
    return out
def distk_fmmv(proc_idx, queue, device_id):
    """Worker: blocked (distance-kernel matrix) @ v on one CUDA device.

    A single ``ArgsFmmv`` item is taken from ``queue``. For each pair of row
    blocks of ``X1`` and ``X2`` the squared Euclidean distances are formed as
    ``|x|^2 - 2 x.y + |y|^2`` (clamped at zero), handed to
    ``kernel._transform``, and the result is multiplied by the matching rows
    of ``v``. Each finished row block is copied into ``out`` on the host.

    Parameters
    ----------
    proc_idx
        Not used inside this function.
    queue
        Object whose ``get()`` returns the ``ArgsFmmv`` work item
        (fields read: ``X1``, ``X2``, ``v``, ``out``, ``kernel``, ``max_mem``).
    device_id
        CUDA device index; converted with ``int()``.

    Returns
    -------
    The ``out`` object from the work item.
    """
    args: ArgsFmmv = queue.get()
    X1, X2, v, out = args.X1, args.X2, args.v, args.out
    kernel: L2DistanceKernel = args.kernel
    max_mem = args.max_mem

    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]
    dtype = X1.dtype

    # GPU memory usage (in elements):
    #   X1 block : n x D
    #   X2 block : m x D
    #   v block  : m x T
    #   kernel   : n x m
    #   output   : n x T
    #   -----------------
    #   total    : n*m + n*(D + T) + m*(D + T)
    # NOTE: a disabled variant halved the budget when sizeof_dtype(dtype) == 4.
    budget = max_mem / sizeof_dtype(dtype)
    n, m = select_dim_over_m(
        maxN=N, maxM=M,
        coef_nm=1.0, coef_n=D + T, coef_m=D + T,
        tot=budget)

    gpu = torch.device('cuda:%d' % int(device_id))
    with tcd.device(gpu):
        # Device buffers, allocated once and reused for every block.
        dist_buf = create_same_stride((n, m), X1, dtype, gpu)
        out_buf = create_same_stride((n, T), out, dtype, gpu)
        x1_buf = create_same_stride((n, D), X1, dtype, gpu)
        x2_buf = create_same_stride((m, D), X2, dtype, gpu)
        v_buf = create_same_stride((m, T), v, dtype, gpu)

        for row_start in range(0, N, n):
            rows = min(n, N - row_start)

            x1_gpu = copy_to_device_noorder(rows, D, X1, row_start, 0, x1_buf, 0, 0)
            # Squared row norms of the X1 block, kept as a column.
            x1_sqnorm = torch.norm(x1_gpu, p=2, dim=1, keepdim=True)
            x1_sqnorm.pow_(2)

            acc = out_buf.narrow(0, 0, rows)  # rows x T
            acc.fill_(0.0)

            for col_start in range(0, M, m):
                cols = min(m, M - col_start)

                x2_gpu = copy_to_device_noorder(cols, D, X2, col_start, 0, x2_buf, 0, 0)
                v_gpu = copy_to_device_noorder(cols, T, v, col_start, 0, v_buf, 0, 0)  # cols x T
                dist = dist_buf.narrow(0, 0, rows).narrow(1, 0, cols)  # rows x cols

                x2_sqnorm = torch.norm(x2_gpu, p=2, dim=1, keepdim=True)
                x2_sqnorm.pow_(2)

                # dist = clamp(|x1|^2 - 2 * x1 @ x2^T + |x2|^2, min=0)
                torch.mm(x1_gpu, x2_gpu.t(), out=dist)
                dist.mul_(-2.0).add_(x1_sqnorm).add_(x2_sqnorm.t()).clamp_min_(0)
                # Return value unused: the transform is expected to act in place.
                kernel._transform(dist)

                # Multiply by the vector v
                # FIXME: This is the cause of mapping errors in case of float32 calculations.
                acc.addmm_(dist, v_gpu)  # rows x T

            # Ship the finished row block back to the host output.
            copy_to_host_noorder(rows, T, out_buf, 0, 0, out, row_start, 0)
    return out
def sparse_fmmv(proc_idx, queue, device_id):
    """Worker: blocked (kernel matrix) @ v for sparse inputs on one CUDA device.

    A single ``ArgsFmmv`` item is taken from ``queue``. ``v`` is moved to the
    device in full; ``X1`` and ``X2`` are processed in row chunks. For each
    chunk pair the kernel block is built through the kernel's
    ``_prepare_sparse`` (on the host chunks), ``_apply_sparse`` (on the device
    chunks) and ``_finalize`` hooks, then multiplied by the matching rows of
    ``v``. Each finished row block is copied into ``out`` on the host.

    Parameters
    ----------
    proc_idx
        Not used inside this function.
    queue
        Object whose ``get()`` returns the ``ArgsFmmv`` work item
        (fields read: ``X1``, ``X2``, ``v``, ``out``, ``kernel``, ``max_mem``).
    device_id
        CUDA device index; converted with ``int()``.

    Returns
    -------
    The ``out`` object from the work item.
    """
    args: ArgsFmmv = queue.get()

    X1: SparseTensor = args.X1
    X2: SparseTensor = args.X2
    v, out = args.v, args.out
    kernel, max_mem = args.kernel, args.max_mem
    dtype = X1.dtype
    num_rows, num_feats = X1.shape
    num_centers, num_targets = v.size()

    budget = max_mem / sizeof_dtype(dtype)
    # Memory needs (in elements):
    #   X1 chunk      : N + 2*D*N*density
    #   X2 chunk      : D + 2*D*M*density   (because it is transposed)
    #   sparse output : N + 2*N*M*density   (density assumed to be 1)
    #   kernel block  : M*N
    #   product block : N*T
    #   v on device   : M*T
    #   other         : GPU buffer
    n, m = select_dim_over_m(
        maxN=num_rows, maxM=num_centers, tot=budget,
        coef_nm=3,
        coef_n=2 + 2 * num_feats * X1.density + num_targets,
        coef_m=2 * num_feats * X2.density + num_targets,
        rest=num_feats,
    )

    dev = torch.device('cuda:%d' % int(device_id))
    with tcd.device(dev):
        v_dev = v.to(device=dev)  # M x T
        prod_buf = create_same_stride((n, num_targets), out, dtype, dev)
        # The kernel buffer should be fortran-ordered due to the cusparse
        # csr2dense function.
        ker_buf = create_fortran((n, m), dtype=dtype, device=dev)

        for row_start in range(0, num_rows, n):
            rows = min(n, num_rows - row_start)

            prod = prod_buf.narrow(0, 0, rows)  # rows x T
            prod.fill_(0.0)

            x1_rows = X1.narrow_rows(row_start, rows)
            x1_dev = x1_rows.index_to_int().to(device=dev)

            for col_start in range(0, num_centers, m):
                cols = min(m, num_centers - col_start)

                x2_rows = X2.narrow_rows(col_start, cols)
                # The prepare step runs on the host-side chunks.
                prepared = kernel._prepare_sparse(x1_rows, x2_rows)
                # Transpose the X2 chunk and convert it to CSR before moving
                # it to the device. This uses lots of RAM.
                x2_dev = (
                    SparseTensor.from_scipy(
                        x2_rows.transpose_csc().to_scipy().tocsr(copy=False))
                    .index_to_int()
                    .to(device=dev)
                )

                ker = ker_buf[:rows, :cols]
                ker.fill_(0.0)
                # Kernel evaluation (sparse matrix multiplication + finalize).
                ker = kernel._apply_sparse(x1_dev, x2_dev, ker)
                ker = kernel._finalize(ker, prepared)

                # prod += ker @ v[col_start : col_start + cols]
                prod.addmm_(ker, v_dev[col_start:col_start + cols])
                # Drop per-chunk references before the next iteration.
                del prepared, x2_rows, x2_dev

            # Ship the finished row block back to the host output.
            copy_to_host_noorder(rows, num_targets, prod, 0, 0, out, row_start, 0)
            del x1_rows, x1_dev
    return out
def _sparse_fmm(proc_idx, queue, device_id):
    """Worker: blocked kernel matrix between sparse X1 and X2 on one CUDA device.

    A single ``ArgsFmm`` item is taken from ``queue``. The outer loop walks
    row chunks of ``X2`` (columns of the result) so that each chunk is
    transposed, converted and moved to the device once; the inner loop walks
    row chunks of ``X1``. Every kernel block is produced through the kernel's
    ``_prepare_sparse`` / ``_apply_sparse`` / ``_finalize`` hooks and copied
    into ``out`` at offset ``(i, j)``.

    Parameters
    ----------
    proc_idx
        Not used inside this function.
    queue
        Object whose ``get()`` returns the ``ArgsFmm`` work item (fields
        read: ``X1``, ``X2``, ``out``, ``kernel``, ``gpu_dtype``, ``max_mem``).
    device_id
        CUDA device index; converted with ``int()``.

    Returns
    -------
    The ``out`` object from the work item.
    """
    args: ArgsFmm = queue.get()
    X1: SparseTensor = args.X1
    X2: SparseTensor = args.X2
    out = args.out
    kernel, gpu_dtype = args.kernel, args.gpu_dtype
    max_mem = args.max_mem

    num_rows, num_feats = X1.shape
    num_cols = X2.size(0)

    budget = max_mem / sizeof_dtype(gpu_dtype)
    # Memory usage (in elements):
    #   X1 chunk      : ntot + 2 * D * ntot * density
    #   X2 chunk      : dtot + 2 * D * mtot * density  (because it is transposed)
    #   sparse output : ntot + 2 * ntot * mtot * density  (density assumed 1)
    #   kernel block  : mtot * ntot
    n, m = select_dim_over_m(
        maxN=num_rows, maxM=num_cols, tot=budget,
        coef_nm=3,
        coef_n=2 + 2 * num_feats * X1.density,
        coef_m=2 * num_feats * X2.density,
        rest=num_feats)

    dev = torch.device('cuda:%d' % (int(device_id)))
    with torch.cuda.device(dev):
        # Device-side block buffer, reused for every (i, j) block.
        block_buf = create_same_stride((n, m), out, gpu_dtype, dev)
        # A pinned host buffer is only set up when X1's dtype differs from
        # the dtype used on the GPU; it is handed to the device-to-host copy.
        cpu_buf = (
            create_same_stride((n, m), out, gpu_dtype, 'cpu', pin_memory=True)
            if X1.dtype != gpu_dtype
            else None
        )

        for col_start in range(0, num_cols, m):
            cols = min(m, num_cols - col_start)

            x2_rows = X2.narrow_rows(col_start, cols).to(dtype=gpu_dtype)
            # Transpose the X2 chunk, convert it to CSR and move it to the GPU.
            x2_dev = (
                SparseTensor.from_scipy(
                    x2_rows.transpose_csc().to_scipy().tocsr(copy=False))
                .index_to_int()
                .to(device=dev)
            )

            for row_start in range(0, num_rows, n):
                rows = min(n, num_rows - row_start)

                x1_rows = X1.narrow_rows(row_start, rows).to(dtype=gpu_dtype)
                x1_dev = x1_rows.index_to_int().to(device=dev)

                block = block_buf[:rows, :cols]
                block.fill_(0.0)

                # Prepare on the host chunks, apply on the device chunks.
                prepared = kernel._prepare_sparse(x1_rows, x2_rows)
                block = kernel._apply_sparse(x1_dev, x2_dev, block)
                block = kernel._finalize(block, prepared)

                # Write the block into out[row_start:, col_start:].
                copy_to_host_noorder(rows, cols, block, 0, 0,
                                     out, row_start, col_start, cpu_buf)
                del prepared, x1_dev, x1_rows
            del x2_rows, x2_dev
        del block_buf
    return out