def __iter__(self):
    """Yield batches of (input, target) from the data loader, with the inputs normalized.

    :return: batch of (input, target).
    :rtype: (torch.Tensor, torch.Tensor)
    """
    stream = cuda.Stream(self.device)
    first_entry = True
    for next_input, next_target in self.data_loader:
        with cuda.stream(stream):
            # Pre-load a batch of inputs and targets to the GPU, and normalize the inputs:
            next_input = next_input.to(self.device, non_blocking=True)
            next_target = next_target.to(self.device, non_blocking=True)
            next_input = next_input.float()
            next_input = next_input.sub_(self.data_mean).div_(self.data_std)
        if not first_entry:
            yield input, target  # Yield the previously pre-loaded batch of inputs and targets.
        else:
            # On the first entry nothing has been pre-loaded yet, so there is nothing to yield.
            first_entry = False
        # Wait for the pre-loading to complete, then keep the batch for the next iteration.
        cuda.current_stream().wait_stream(stream)
        input = next_input
        target = next_target
    yield input, target  # Yield the last pre-loaded batch.
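# Usage sketch (illustrative, not part of the library): a minimal training loop
# over the prefetching iterator above. `prefetcher` stands for an instance of
# the class owning `__iter__`; `model`, `loss_fn` and `optimizer` are
# placeholders. By the time a batch is yielded, the host-to-device copy of the
# next batch is already in flight on the side stream, overlapping with the
# forward/backward pass.
def _example_train_loop(prefetcher, model, loss_fn, optimizer):
    for inputs, targets in prefetcher:
        # `inputs` arrive on the GPU, already cast to float and normalized.
        optimizer.zero_grad()
        loss = loss_fn(model(inputs), targets)
        loss.backward()
        optimizer.step()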
def _get_stream(device):
    """Gets a background stream for copying between CPU and GPU"""
    global _streams
    if device == -1:
        return None
    if _streams is None:
        _streams = [None] * cuda.device_count()
    if _streams[device] is None:
        _streams[device] = cuda.Stream(device)
    return _streams[device]
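# Sketch (assumed usage, not library code): how `_get_stream` supports an
# asynchronous host-to-device copy. The caller issues the copy on the cached
# background stream, then makes the current stream wait on it before touching
# the result. Note `non_blocking=True` only overlaps with compute when the
# source tensor lives in pinned memory.
def _example_async_h2d(cpu_tensor, device_index):
    copy_stream = _get_stream(device_index)
    with cuda.stream(copy_stream):
        gpu_tensor = cpu_tensor.to(torch.device('cuda', device_index), non_blocking=True)
    # Order the default stream after the copy before the result is consumed.
    cuda.current_stream(device_index).wait_stream(copy_stream)
    return gpu_tensor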
def _generic_fmm(proc_idx, queue, device_id):
    # Unpack the function arguments
    a: ArgsFmm = queue.get()
    X1: torch.Tensor = a.X1
    X2: torch.Tensor = a.X2
    cuda_inputs = X1.is_cuda
    out = a.out
    kernel, gpu_dtype = a.kernel, a.gpu_dtype
    max_mem = a.max_mem
    num_streams = a.num_streams

    # Flags and local variables
    change_dtype = gpu_dtype != X1.dtype
    X1_equal_X2 = _gpu_tns_same_memory(X1, X2)
    use_gpu_bufs = change_dtype or not cuda_inputs
    stride = "F" if is_f_contig(out, strict=True) else "C"
    j_iter = 0
    dts = sizeof_dtype(gpu_dtype)
    tc_device = torch.device('cuda:%d' % (int(device_id)))
    avail_mem = max_mem / dts

    # Choose block sizes n, m such that we won't run out of GPU memory
    ntot, d = X1.shape
    mtot = X2.shape[0]
    extra_mem = kernel.extra_mem()
    if cuda_inputs and not change_dtype:
        # No allocation will be performed by us. Only in-kernel stuff.
        n, m = select_dim_over_nm(max_n=ntot, max_m=mtot, d=d,
                                  coef_nd=extra_mem.get('nd', 0),
                                  coef_md=extra_mem.get('md', 0),
                                  coef_nm=extra_mem.get('nm', 0),
                                  coef_n=extra_mem.get('n', 0),
                                  coef_m=extra_mem.get('m', 0),
                                  rest=extra_mem.get('d', 0),
                                  max_mem=avail_mem)
    else:
        n, m = select_dim_over_nm(
            max_n=ntot, max_m=mtot, d=d,
            coef_nd=num_streams * (extra_mem.get('nd', 0) + 1),
            coef_md=num_streams * (extra_mem.get('md', 0) + 1),
            coef_nm=num_streams * (extra_mem.get('nm', 0) + 1),
            coef_n=extra_mem.get('n', 0),
            coef_m=extra_mem.get('m', 0),
            rest=extra_mem.get('d', 0),
            max_mem=avail_mem)

    # Create streams
    streams = [tcd.Stream(device=tc_device) for _ in range(num_streams)]

    # Create buffers
    if use_gpu_bufs:
        gX1 = create_same_stride((n, d), X1, gpu_dtype, tc_device)
        gX2_list = [
            create_same_stride((m, d), X2, gpu_dtype, tc_device)
            for _ in range(num_streams)
        ]
        gout_list = [
            create_same_stride((n, m), out, gpu_dtype, tc_device)
            for _ in range(num_streams)
        ]
    if not cuda_inputs:
        cpu_buf_list = [
            create_same_stride((n, m), out, gpu_dtype, 'cpu', pin_memory=True)
            for _ in range(num_streams)
        ]

    # Define helpers for the copy-back operations (from cpu_buf to output)
    copy_ops = [None] * num_streams

    def wrap_copy_op(stream_idx):
        if copy_ops[stream_idx] is not None:
            copy_ops[stream_idx]()
            copy_ops[stream_idx] = None

    def do_copy_op(output, buf, i_, ic_, j_, jc_):
        # This function will also do the type conversion
        output[i_:i_ + ic_, j_:j_ + jc_].copy_(buf[:ic_, :jc_])

    # Kernel computation begins here
    with tcd.device(tc_device):
        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            with tcd.stream(streams[j_iter % len(streams)]):
                X1_chunk = X1.narrow(0, i, ic)
                if use_gpu_bufs:
                    cur_gX1 = gX1.narrow(0, 0, ic)
                    cur_gX1.copy_(X1_chunk, non_blocking=True)
                else:
                    cur_gX1 = X1_chunk

            for j in range(0, mtot, m):
                jc = min(m, mtot - j)
                # Choose the buffers for this inner iteration
                stream_id = j_iter % len(streams)
                stream = streams[stream_id]
                if use_gpu_bufs:
                    gX2 = gX2_list[stream_id]
                    gout = gout_list[stream_id]
                if not cuda_inputs:
                    cpu_buf = cpu_buf_list[stream_id]

                # Sync for buffers we must use now (e.g. 2 previous iters)
                with tcd.stream(stream):  # Inner-loop
                    stream.synchronize()
                    wrap_copy_op(stream_id)

                    if X1_equal_X2 and j < i:  # Shortcut for symmetric kernels
                        jc = min(m, mtot - j)
                        out[i:i + ic, j:j + jc].copy_(out[j:j + jc, i:i + ic].T,
                                                      non_blocking=True)
                        j_iter += 1
                        continue

                    # Copy (CPU -> GPU)
                    X2_chunk = X2.narrow(0, j, jc)
                    if use_gpu_bufs:
                        cur_gX2 = gX2.narrow(0, 0, jc)
                        cur_gX2.copy_(X2_chunk, non_blocking=True)
                    else:
                        cur_gX2 = X2_chunk

                    if use_gpu_bufs:
                        cur_gout = gout[:ic, :jc]
                    else:
                        cur_gout = out[i:i + ic, j:j + jc]
                    cur_gout.fill_(0.0)

                    # Compute
                    ddd = kernel._prepare(cur_gX1, cur_gX2)
                    kernel._apply(cur_gX1, cur_gX2.T, cur_gout)
                    cur_gout = kernel._finalize(cur_gout, ddd)

                    # Copy back (GPU -> CPU)
                    if not cuda_inputs:
                        # copy_ does not care about the contiguity of copies, as long as
                        # it's consistent. However, in case of C-contiguous inputs it will
                        # create an intermediate array, which is undesired. We use
                        # cuda_memcpy2d_async, which works well with C-contiguous arrays.
                        if stride == "F":
                            copy_to_host(ic, jc, cur_gout, 0, 0, cpu_buf, 0, 0, s=stream)
                        else:
                            cuda_memcpy2d_async(dst=cpu_buf.data_ptr(),
                                                dpitch=cpu_buf.stride(0) * dts,
                                                src=cur_gout.data_ptr(),
                                                spitch=cur_gout.stride(0) * dts,
                                                width=jc * dts, height=ic,
                                                stream=stream._as_parameter_)
                        copy_ops[stream_id] = partial(do_copy_op, out, cpu_buf, i, ic, j, jc)
                    elif change_dtype:
                        out.narrow(0, i, ic).narrow(1, j, jc).copy_(cur_gout,
                                                                    non_blocking=True)
                    j_iter += 1

        for i in range(num_streams):
            streams[i].synchronize()
            wrap_copy_op(i)
    return out
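# Sketch (illustrative simplification of the symmetric shortcut above): when X1
# and X2 share memory the output matrix is symmetric, so every tile below the
# diagonal can be filled by transposing the tile already computed at the
# mirrored position. `kernel_fn` is a placeholder for the per-tile kernel
# computation; the real code additionally juggles streams and GPU buffers.
def _example_symmetric_tiling(X, tile, kernel_fn):
    n_tot = X.shape[0]
    out = torch.empty(n_tot, n_tot, dtype=X.dtype)
    for i in range(0, n_tot, tile):
        ic = min(tile, n_tot - i)
        for j in range(0, n_tot, tile):
            jc = min(tile, n_tot - j)
            if j < i:
                # Mirror the already-computed (j, i) tile instead of recomputing it.
                out[i:i + ic, j:j + jc].copy_(out[j:j + jc, i:i + ic].T)
            else:
                out[i:i + ic, j:j + jc] = kernel_fn(X[i:i + ic], X[j:j + jc])
    return out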
def distk_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem
    N, D = X1.size()
    M = X2.size(0)
    T = v.size(1) if v is not None else w.size(1)
    dtype = X1.dtype

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1ss : n x d
    # X2s  : M x d
    # Kv   : n x T
    # out  : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where, if we leave avail_mem as it is for
    #        32-bit data-types, some copy fails. In that case we need to free
    #        up some more memory, and then everything runs fine.
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_d(maxD=D, maxN=N,
                             coef_nd=1, coef_n=M + T + 1, coef_d=M,
                             rest=rest_coef + M, tot=avail_mem)
    ddev = torch.device('cuda:%d' % int(device_id))
    s1 = tcd.Stream()
    s2 = tcd.Stream()

    with tcd.device(ddev), tcd.stream(s1):
        if v is not None:
            v_gpu = create_same_stride((M, T), v, dtype, ddev)
            copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
        K_gpu = create_same_stride((n, M), X1, dtype, ddev)
        X1ss_gpu = create_same_stride((n, d), X1, dtype, ddev)
        X2s_gpu = create_same_stride((M, d), X2, dtype, ddev)
        Kv_gpu = create_same_stride((n, T), X1, dtype, ddev)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = create_same_stride((M, T), out, dtype, ddev)
        out_gpu.fill_(0.0)
        sq1_gpu = create_same_stride((n,), X1, dtype, ddev)
        sq2_gpu = create_same_stride((M,), X1, dtype, ddev)

        #if (d == D):
        #    with torch.cuda.stream(s2):
        #        cur_X2s_gpu = copy_to_device_noorder(M, d, X2, 0, 0, X2s_gpu, 0, 0, s=s2)
        #        torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True, out=sq2_gpu).pow_(2)

        for i in range(0, N, n):
            nb = min(N - i, n)

            cur_K_gpu = K_gpu.narrow(0, 0, nb)  # nb x M
            cur_K_gpu.fill_(0.0)

            for j in range(0, D, d):
                db = min(D - j, d)
                # Parallelize the two matrix transfers (probably pointless)
                #if d < D:
                with torch.cuda.stream(s2):
                    cur_X2s_gpu = copy_to_device_noorder(M, db, X2, 0, j, X2s_gpu, 0, 0, s=s2)
                    torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True, out=sq2_gpu).pow_(2)
                cur_X1ss_gpu = copy_to_device_noorder(nb, db, X1, i, j, X1ss_gpu, 0, 0, s=s1)
                torch.norm(cur_X1ss_gpu, p=2, dim=1, keepdim=True, out=sq1_gpu).pow_(2)
                s2.synchronize()
                s1.synchronize()

                cur_K_gpu.addmm_(mat1=cur_X1ss_gpu, mat2=cur_X2s_gpu.T, alpha=-2.0)
                cur_K_gpu.add_(sq1_gpu)
                cur_K_gpu.add_(sq2_gpu.T)
                cur_K_gpu.clamp_min_(0)
            cur_K_gpu = kernel._transform(cur_K_gpu)

            if w is not None:
                # Copy the split of w to the GPU, into cur_Kv_gpu
                cur_Kv_gpu = copy_to_device_noorder(nb, T, w, i, 0, Kv_gpu, 0, 0, s=s1)  # n x T
                if v is not None:
                    cur_Kv_gpu.addmm_(cur_K_gpu, v_gpu)
            else:
                # v cannot be None if w is None
                cur_Kv_gpu = Kv_gpu.narrow(0, 0, nb)  # n x T
                torch.mm(cur_K_gpu, v_gpu, out=cur_Kv_gpu)  # n x T

            # Multiply the transposed kernel with the Kv result.
            out_gpu.addmm_(cur_K_gpu.T, cur_Kv_gpu)  # M x T
            s1.synchronize()
        s1.synchronize()

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
    return out
def distk_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem
    N, D = X1.size()
    M = X2.size(0)
    T = v.shape[1] if v is not None else w.shape[1]
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1ss : n x d
    # X2s  : M x d
    # Kv   : n x T
    # out  : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T + 1) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    rest_coef = 2 * M * T if v is not None else M * T
    n, d = select_dim_over_nd(max_n=N, max_d=D,
                              coef_nd=1, coef_n=M + T + 1, coef_d=M,
                              rest=rest_coef + M, max_mem=avail_mem)
    ddev = torch.device('cuda:%d' % int(device_id))
    s1 = tcd.Stream(ddev)
    s2 = tcd.Stream(ddev)

    with tcd.device(ddev), tcd.stream(s1):
        # First collect the necessary memory
        mem_needed = n * M + n * T + n + M
        if not cuda_inputs:
            mem_needed += n * d + M * d
            if v is not None:
                mem_needed += M * T
        if not out.is_cuda:
            mem_needed += M * T
        # Create the flat tensor
        flat_gpu_tn = torch.empty(size=(mem_needed,), dtype=dtype, device=ddev)

        # Extract the sub-tensors
        flat_offset = 0
        if v is not None:
            if not cuda_inputs:
                v_gpu = extract_same_stride(flat_gpu_tn, size=(M, T), other=v,
                                            offset=flat_offset)
                flat_offset += np.prod(v_gpu.shape)
                copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0)
            else:
                v_gpu = v
        K_gpu = extract_same_stride(flat_gpu_tn, size=(n, M), other=X1, offset=flat_offset)
        flat_offset += np.prod(K_gpu.shape)
        Kv_gpu = extract_same_stride(flat_gpu_tn, size=(n, T), other=X1, offset=flat_offset)
        flat_offset += np.prod(Kv_gpu.shape)
        if out.is_cuda:
            out_gpu = out
        else:
            out_gpu = extract_same_stride(flat_gpu_tn, size=(M, T), other=out,
                                          offset=flat_offset)
            flat_offset += np.prod(out_gpu.shape)
        out_gpu.fill_(0.0)
        if not cuda_inputs:
            X1ss_gpu = extract_same_stride(flat_gpu_tn, size=(n, d), other=X1,
                                           offset=flat_offset)
            flat_offset += np.prod(X1ss_gpu.shape)
            X2s_gpu = extract_same_stride(flat_gpu_tn, size=(M, d), other=X2,
                                          offset=flat_offset)
            flat_offset += np.prod(X2s_gpu.shape)
        sq1_gpu = extract_same_stride(flat_gpu_tn, size=(n,), other=X1, offset=flat_offset)
        flat_offset += np.prod(sq1_gpu.shape)
        sq2_gpu = extract_same_stride(flat_gpu_tn, size=(M,), other=X1, offset=flat_offset)

        for i in range(0, N, n):
            nb = min(N - i, n)

            cur_K_gpu = K_gpu[:nb]  # nb x M
            cur_K_gpu.fill_(0.0)

            for j in range(0, D, d):
                db = min(D - j, d)
                # The add_(sq2_gpu.T) op must be complete before sq2_gpu is overwritten.
                s1.synchronize()
                # Parallelize the two matrix transfers.
                with tcd.stream(s2):
                    if cuda_inputs:
                        cur_X2s_gpu = X2[:, j:j + db]
                    else:
                        cur_X2s_gpu = copy_to_device_noorder(M, db, X2, 0, j, X2s_gpu,
                                                             0, 0, s=s2)
                    torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True, out=sq2_gpu).pow_(2)
                if cuda_inputs:
                    cur_X1ss_gpu = X1[i:i + nb, j:j + db]
                else:
                    cur_X1ss_gpu = copy_to_device_noorder(nb, db, X1, i, j, X1ss_gpu,
                                                          0, 0, s=s1)
                torch.norm(cur_X1ss_gpu, p=2, dim=1, keepdim=True, out=sq1_gpu).pow_(2)
                # cur_X2s_gpu and sq2_gpu must be available before they are consumed.
                s2.synchronize()

                cur_K_gpu.addmm_(mat1=cur_X1ss_gpu, mat2=cur_X2s_gpu.T, alpha=-2.0)
                cur_K_gpu.add_(sq1_gpu)
                cur_K_gpu.add_(sq2_gpu.T)
                cur_K_gpu.clamp_min_(0)
            cur_K_gpu = kernel._transform(cur_K_gpu)

            if w is not None:
                cur_Kv_gpu = copy_to_device_noorder(nb, T, w, i, 0, Kv_gpu, 0, 0, s=s1)  # n x T
                if v is not None:
                    cur_Kv_gpu.addmm_(cur_K_gpu, v_gpu)
            else:
                # v cannot be None if w is None
                cur_Kv_gpu = Kv_gpu.narrow(0, 0, nb)  # n x T
                torch.mm(cur_K_gpu, v_gpu, out=cur_Kv_gpu)  # n x T

            # Multiply the transposed kernel with the Kv result.
            out_gpu.addmm_(cur_K_gpu.T, cur_Kv_gpu)  # M x T

        if not out.is_cuda:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0)
        s1.synchronize()
    return out
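# Sketch of the algebra behind the tile update above: squared L2 distances are
# expanded as ||x||^2 + ||y||^2 - 2<x, y>, which is exactly what the
# addmm_/add_ calls accumulate chunk by chunk. The clamp guards against small
# negative values produced by floating-point cancellation. Illustrative helper,
# not library code.
def _example_sq_distances(X1, X2):
    sq1 = X1.pow(2).sum(dim=1, keepdim=True)   # n x 1
    sq2 = X2.pow(2).sum(dim=1, keepdim=True)   # m x 1
    D = sq1 - 2.0 * (X1 @ X2.T) + sq2.T        # n x m squared distances
    return D.clamp_min_(0)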
def distk_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()
    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel: L2DistanceKernel = a.kernel
    max_mem = a.max_mem

    N, D = X1.shape
    M = X2.shape[0]
    T = v.shape[1]
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda

    # GPU memory usage:
    # X1s : n x D
    # X2s : m x D
    # vs  : m x T
    # nm  : n x m
    # out : n x T
    # -----------
    # total: n*m + n*(D + T) + m*(D + T)
    avail_mem = max_mem / sizeof_dtype(dtype)
    n, m = select_dim_over_nm_v2(max_n=N, max_m=M, coef_nm=1,
                                 coef_n=D + T, coef_m=D + T, rest=0,
                                 max_mem=avail_mem)
    ddev = torch.device('cuda:%d' % int(device_id))
    s1 = tcd.Stream(ddev)

    with tcd.device(ddev), tcd.stream(s1):
        mem_needed = n * m
        if not cuda_inputs:
            mem_needed += n * T + n * D + m * D + m * T
        flat_gpu_tn = torch.empty(size=(mem_needed,), dtype=dtype, device=ddev)

        flat_offset = 0
        nm_gpu, flat_offset = _extract_flat(flat_gpu_tn, size=(n, m), other=X1,
                                            offset=flat_offset)
        if not cuda_inputs:
            out_gpu, flat_offset = _extract_flat(flat_gpu_tn, size=(n, T), other=out,
                                                 offset=flat_offset)
            X1s_gpu, flat_offset = _extract_flat(flat_gpu_tn, size=(n, D), other=X1,
                                                 offset=flat_offset)
            X2s_gpu, flat_offset = _extract_flat(flat_gpu_tn, size=(m, D), other=X2,
                                                 offset=flat_offset)
            vs_gpu, flat_offset = _extract_flat(flat_gpu_tn, size=(m, T), other=v,
                                                offset=flat_offset)

        for i in range(0, N, n):
            nb = min(n, N - i)
            if cuda_inputs:
                cur_X1s_gpu = X1.narrow(0, i, nb)  # n x D
            else:
                cur_X1s_gpu = copy_to_device_noorder(nb, D, X1, i, 0, X1s_gpu, 0, 0, s=s1)
            sq1 = torch.norm(cur_X1s_gpu, p=2, dim=1, keepdim=True).pow_(2)
            if cuda_inputs:
                cur_out_gpu = out.narrow(0, i, nb)  # n x T
            else:
                cur_out_gpu = out_gpu.narrow(0, 0, nb)  # n x T
            cur_out_gpu.fill_(0.0)

            for j in range(0, M, m):
                mb = min(m, M - j)
                if cuda_inputs:
                    cur_X2s_gpu = X2.narrow(0, j, mb)  # m x D
                    cur_vs_gpu = v.narrow(0, j, mb)  # m x T
                else:
                    cur_X2s_gpu = copy_to_device_noorder(mb, D, X2, j, 0, X2s_gpu,
                                                         0, 0, s=s1)  # m x D
                    cur_vs_gpu = copy_to_device_noorder(mb, T, v, j, 0, vs_gpu,
                                                        0, 0, s=s1)  # m x T
                cur_nm_gpu = nm_gpu[:nb, :mb]  # n x m

                sq2 = torch.norm(cur_X2s_gpu, p=2, dim=1, keepdim=True).pow_(2)
                torch.mm(cur_X1s_gpu, cur_X2s_gpu.T, out=cur_nm_gpu)

                cur_nm_gpu.mul_(-2.0)
                cur_nm_gpu.add_(sq1)
                cur_nm_gpu.add_(sq2.T)
                cur_nm_gpu.clamp_min_(0)
                kernel._transform(cur_nm_gpu)

                # Multiply by the vector v
                cur_out_gpu.addmm_(cur_nm_gpu, cur_vs_gpu)  # n x T

            if not cuda_inputs:
                # Send the result to the CPU
                copy_to_host_noorder(nb, T, out_gpu, 0, 0, out, i, 0, s=s1)
        s1.synchronize()
    return out
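# Hedged sketch of what an `_extract_flat`-style helper could look like: carve
# a view with the requested shape out of the pre-allocated flat tensor and
# return it together with the advanced offset, so a single allocation serves
# all scratch buffers. The real helper presumably also matches the memory
# layout (row- vs column-major) of `other`; this sketch assumes row-major.
def _example_extract_flat(flat, size, offset):
    numel = 1
    for s in size:
        numel *= s
    view = flat.narrow(0, offset, numel).view(size)
    return view, offset + numel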
def generic_fdmmv(proc_idx, queue, device_id):
    a: ArgsFdmmv = queue.get()
    X1, X2, v, w, out = a.X1, a.X2, a.v, a.w, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda
    N, D = X1.size()
    M = X2.shape[0]
    if v is None:
        T = w.shape[1]
    else:
        T = v.shape[1]

    # Memory usage:
    # v    : M x T
    # K    : n x M
    # X1d  : n x d
    # X2d  : M x d
    # Kv   : n x T
    # out2 : M x T
    # sq1  : n x 1
    # sq2  : M x 1
    # ------------
    # total : n*d + M*d + n*(M + T) + 2*M*T + M
    avail_mem = max_mem / sizeof_dtype(dtype)
    # FIXME: There seems to be a bug where, if we leave avail_mem as it is for
    #        32-bit data-types, some copy fails. In that case we need to free
    #        up some more memory, and then everything runs fine.
    if sizeof_dtype(dtype) == 4:
        avail_mem /= 2
    rest_coef = 2 * M * T if v is not None else M * T
    extra_mem = kernel.extra_mem()
    n, d = select_dim_over_nd(
        max_n=N, max_d=D,
        coef_nd=1 + extra_mem.get('nd', 0),
        coef_n=M + T + 1 + extra_mem.get('n', 0) + extra_mem.get('nm', 0) * M,
        coef_d=M + extra_mem.get('d', 0) + extra_mem.get('md', 0) * M,
        rest=rest_coef + M + extra_mem.get('m', 0),
        max_mem=avail_mem)
    ddev = torch.device('cuda:%d' % int(device_id))
    s1 = tcd.Stream(ddev)

    with tcd.device(ddev), tcd.stream(s1):
        # First collect the necessary memory
        mem_needed = n * M + n * T
        if not cuda_inputs:
            mem_needed += n * d + M * d + M * T
            if v is not None:
                mem_needed += M * T
        # Create the flat tensor
        flat_gpu_tn = torch.empty(size=(mem_needed,), dtype=dtype, device=ddev)

        # Extract the sub-tensors
        flat_offset = 0
        ker_gpu, flat_offset = _extract_flat(flat_gpu_tn, size=(n, M), other=out,
                                             offset=flat_offset)
        w_gpu, flat_offset = _extract_flat(flat_gpu_tn, size=(n, T), other=out,
                                           offset=flat_offset)
        if not cuda_inputs:
            X1s_gpu, flat_offset = _extract_flat(flat_gpu_tn, size=(n, d), other=X1,
                                                 offset=flat_offset)
            X2s_gpu, flat_offset = _extract_flat(flat_gpu_tn, size=(M, d), other=X2,
                                                 offset=flat_offset)
            out_gpu, flat_offset = _extract_flat(flat_gpu_tn, size=(M, T), other=out,
                                                 offset=flat_offset)
            if v is not None:
                v_gpu, flat_offset = _extract_flat(flat_gpu_tn, size=(M, T), other=v,
                                                   offset=flat_offset)
                copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0, s=s1)
        else:
            out_gpu = out
            if v is not None:
                v_gpu = v
        out_gpu.fill_(0.0)

        # Algorithm start
        for i in range(0, N, n):
            ic = min(n, N - i)

            ddd = kernel._prepare(X1.narrow(0, i, ic), X2)
            c_g_ker = ker_gpu.narrow(0, 0, ic)
            c_g_ker.fill_(0.0)
            for k in range(0, D, d):
                kc = min(d, D - k)
                if cuda_inputs:
                    c_g_X1s = X1[i:i + ic, k:k + kc]
                    c_g_X2s = X2[:, k:k + kc]
                else:
                    c_g_X1s = copy_to_device_noorder(ic, kc, X1, i, k, X1s_gpu, 0, 0, s=s1)
                    c_g_X2s = copy_to_device_noorder(M, kc, X2, 0, k, X2s_gpu, 0, 0, s=s1)
                kernel._apply(c_g_X1s, c_g_X2s.T, c_g_ker)
            kernel._finalize(c_g_ker, ddd)

            if w is not None:
                c_g_w = copy_to_device_noorder(ic, T, w, i, 0, w_gpu, 0, 0, s=s1)
            else:
                c_g_w = w_gpu.narrow(0, 0, ic)
                c_g_w.fill_(0.0)
            if v is not None:
                c_g_w.addmm_(c_g_ker, v_gpu)
            out_gpu.addmm_(c_g_ker.T, c_g_w)

        # Copy back to host
        if not cuda_inputs:
            copy_to_host_noorder(M, T, out_gpu, 0, 0, out, 0, 0, s=s1)
        s1.synchronize()
    return out
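# Sketch of the identity the loop over `k` relies on: the full inner-product
# matrix is the sum of the per-chunk products, so the kernel can be accumulated
# d feature-columns at a time without holding all of X1 and X2 on the GPU.
# Illustrative helper, not library code.
def _example_chunked_matmul(X1, X2, d):
    out = torch.zeros(X1.shape[0], X2.shape[0], dtype=X1.dtype)
    for k in range(0, X1.shape[1], d):
        kc = min(d, X1.shape[1] - k)
        out.addmm_(X1[:, k:k + kc], X2[:, k:k + kc].T)  # accumulate the partial product
    return out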
def generic_fmmv(proc_idx, queue, device_id):
    a: ArgsFmmv = queue.get()
    X1, X2, v, out = a.X1, a.X2, a.v, a.out
    kernel, max_mem = a.kernel, a.max_mem
    dtype = X1.dtype
    cuda_inputs = X1.is_cuda
    ntot, dtot = X1.size()
    M, T = v.size()

    # GPU memory usage:
    # ker_gpu : n x M
    # v_gpu   : M x T
    # X1s_gpu : n x d
    # X2s_gpu : M x d
    # mmv_gpu : n x T
    # ----------
    # total : n*d + n*(M + T) + d*M + M*T
    avail_mem = max_mem / sizeof_dtype(dtype)
    extra_mem = kernel.extra_mem()
    n, d = select_dim_over_nd(
        max_n=ntot, max_d=dtot,
        coef_nd=1 + extra_mem.get('nd', 0),
        coef_n=M + T + extra_mem.get('n', 0) + extra_mem.get('nm', 0) * M,
        coef_d=M + extra_mem.get('d', 0) + extra_mem.get('md', 0) * M,
        rest=M * T + extra_mem.get('m', 0),
        max_mem=avail_mem)
    ddev = torch.device('cuda:%d' % int(device_id))
    s1 = tcd.Stream(ddev)

    with tcd.device(ddev), tcd.stream(s1):
        # First collect the necessary memory
        mem_needed = n * M
        if not cuda_inputs:
            mem_needed += M * T + n * d + M * d + n * T
        # Create the flat tensor
        flat_gpu_tn = torch.empty(size=(mem_needed,), dtype=dtype, device=ddev)

        # Extract the sub-tensors
        flat_offset = 0
        ker_gpu, flat_offset = _extract_flat(flat_gpu_tn, size=(n, M), other=X1,
                                             offset=flat_offset)
        if not cuda_inputs:
            X1s_gpu, flat_offset = _extract_flat(flat_gpu_tn, size=(n, d), other=X1,
                                                 offset=flat_offset)
            X2s_gpu, flat_offset = _extract_flat(flat_gpu_tn, size=(M, d), other=X2,
                                                 offset=flat_offset)
            mmv_gpu, flat_offset = _extract_flat(flat_gpu_tn, size=(n, T), other=out,
                                                 offset=flat_offset)
            v_gpu, flat_offset = _extract_flat(flat_gpu_tn, size=(M, T), other=v,
                                               offset=flat_offset)
            copy_to_device_noorder(M, T, v, 0, 0, v_gpu, 0, 0, s=s1)
        else:
            v_gpu = v

        for i in range(0, ntot, n):
            ic = min(n, ntot - i)

            ddd = kernel._prepare(X1.narrow(0, i, ic), X2)
            c_g_ker = ker_gpu.narrow(0, 0, ic)
            c_g_ker.fill_(0.0)
            for k in range(0, dtot, d):
                kc = min(d, dtot - k)
                if cuda_inputs:
                    c_g_X1s = X1[i:i + ic, k:k + kc]
                    c_g_X2s = X2[:, k:k + kc]
                else:
                    c_g_X1s = copy_to_device_noorder(ic, kc, X1, i, k, X1s_gpu, 0, 0, s=s1)
                    c_g_X2s = copy_to_device_noorder(M, kc, X2, 0, k, X2s_gpu, 0, 0, s=s1)
                kernel._apply(c_g_X1s, c_g_X2s.T, c_g_ker)
            kernel._finalize(c_g_ker, ddd)

            # Multiply by the vector v
            if cuda_inputs:
                c_g_mmv = out[i:i + ic, :]
            else:
                c_g_mmv = mmv_gpu[:ic, :]
            torch.mm(c_g_ker, v_gpu, out=c_g_mmv)  # n x T

            # Copy back to host
            if not cuda_inputs:
                copy_to_host_noorder(ic, T, c_g_mmv, 0, 0, out, i, 0, s=s1)
        s1.synchronize()
    return out
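# Hedged usage sketch (not the library's actual launcher): the workers above
# only need an object exposing `.get()`, so for a single-process run a plain
# queue suffices. The keyword form of the `ArgsFmmv` constructor is assumed
# from the attribute accesses at the top of `generic_fmmv`.
def _example_run_fmmv(X1, X2, v, out, kernel, max_mem, device_id=0):
    from queue import Queue
    q = Queue()
    q.put(ArgsFmmv(X1=X1, X2=X2, v=v, out=out, kernel=kernel, max_mem=max_mem))
    return generic_fmmv(proc_idx=0, queue=q, device_id=device_id)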