def todense(self, out=None, allocator=mem_alloc, stream=None): if out is None: out = gpuarray.empty(self.shape, allocator=allocator, dtype=self.dtype, order="C") if self.nnz == 0: # weird but happens out.fill(0.0, stream=stream) return out # we need to out-of-place transpose if we want rowmajor outputs # thus we need a temporary to store our results if out.flags.c_contiguous: tmp = gpuarray.empty(self.shape, allocator=allocator, dtype=self.dtype, order="C") else: tmp = out if stream is not None: cusparse.cusparseSetStream(cusparse_handle, stream.handle) cublas.cublasSetStream(cublas_handle, stream.handle) cusparse.cusparseScsr2dense(cusparse_handle, self.shape[0], self.shape[1], self.descr, self.data.gpudata, self.indptr.gpudata, self.indices.gpudata, tmp.gpudata, tmp.shape[0]) if out.flags.c_contiguous: cublas.cublasSgeam(cublas_handle, 1, 1, tmp.shape[1], tmp.shape[0], 1.0, tmp.gpudata, tmp.shape[0], 0.0, 0, tmp.shape[0], out.gpudata, out.shape[1]) if stream is not None: cusparse.cusparseSetStream(cusparse_handle, 0) cublas.cublasSetStream(cublas_handle, 0) return out
def cgemm(A, B, transa=False, transb=False, alpha=1,beta=1): """This function uses the C-wrapper to use cuBLAS. """ CUBLAS_OP_N = cublas._CUBLAS_OP['n'] CUBLAS_OP_T = cublas._CUBLAS_OP['t'] m, n, k = A.size(1),B.size(2),A.size(2) batchCount = A.size(0) C = A.new(batchCount,m,n) lda = m ldb = k ldc = m trans_a_ptr = CUBLAS_OP_N if not transa else CUBLAS_OP_T trans_b_ptr = CUBLAS_OP_N if not transb else CUBLAS_OP_T alpha_tensor = torch.cuda.tensor([1]).fill_(alpha) beta_tensor = torch.cuda.tensor([1]).fill_(beta) handle = torch.cuda.current_blas_handle() stream = torch.cuda.current_stream()._as_parameter_ cublas.cublasSetStream(handle, stream) cublas.cublasCgemmBatched(handle, trans_a_ptr, trans_b_ptr, m, n, k, alpha_tensor.data_ptr(), A.data_ptr(), lda, B.data_ptr(),ldb, beta_tensor.data_ptr(), C.data_ptr(), ldc, batchCount) return C
def backward(ctx, grad_output): A, B = ctx.saved_tensors conjA = A.clone() conjB = B.clone() conjA[..., 1] = -A[..., 1] conjB[..., 1] = -B[..., 1] #conjA[:,:,:,:,1] = -A[:,:,:,:,1] #conjB[:,:,1] = -B[:,:,1] m, n = conjB.nelement() // 2, conjA.nelement() // conjB.nelement() # n is the B*C # m is the M*N gradA = conjA.new(conjA.size()) # (n,m), col-major gradC = grad_output.contiguous() # (n,m), col-major # grad_A = grad_C * conj(B) lda = m ldc = m incx = 1 handle = torch.cuda.current_blas_handle() stream = torch.cuda.current_stream()._as_parameter_ cublas.cublasSetStream(handle, stream) cublas.cublasCdgmm(handle, 'l', m, n, gradC.data_ptr(), lda, conjB.data_ptr(), incx, gradA.data_ptr(), ldc) # grad_B = sum_n grad_C * conj(A) # view grad_C and conjA as one vector of size n*m gradB_ = gradC.new(gradC.size()) # mul(gradC,conjA) # (B,C,M,N,2) lda = m * n ldc = m * n incx = 1 cublas.cublasSetStream(handle, stream) cublas.cublasCdgmm(handle, 'l', m * n, 1, gradC.data_ptr(), lda, conjA.data_ptr(), incx, gradB_.data_ptr(), ldc) gradB = torch.sum(torch.sum(gradB_, 0), 0) # (m) return gradA, gradB
def forward(ctx, A, B): # assume A and B has the same size , with last dim = 2 A, B = A.contiguous(), B.contiguous() ctx.save_for_backward(A, B) if not iscomplex(A) or not iscomplex(B): raise TypeError('The input, filter and output should be complex') if A.nelement() != B.nelement(): raise TypeError('The input and filter should have same size') if type(A) is not type(B): raise RuntimeError('A and B should be same type!') if not A.is_cuda: raise RuntimeError('Use the torch backend for cpu tensors!') C = A.new(A.size()) m, n = B.nelement() // 2, A.nelement() // B.nelement() lda = m ldc = m incx = 1 handle = torch.cuda.current_blas_handle() stream = torch.cuda.current_stream()._as_parameter_ cublas.cublasSetStream(handle, stream) cublas.cublasCdgmm(handle, 'l', m, n, A.data_ptr(), lda, B.data_ptr(), incx, C.data_ptr(), ldc) return C
def cdgmm(A, B, inplace=False): """ Complex pointwise multiplication between (batched) tensor A and tensor B. Parameters ---------- A : tensor A is a complex tensor of size (B, C, M, N, 2) B : tensor B is a complex tensor of size (M, N, 2) or real tensor of (M, N, 1) inplace : boolean, optional if set to True, all the operations are performed inplace Returns ------- C : tensor output tensor of size (B, C, M, N, 2) such that: C[b, c, m, n, :] = A[b, c, m, n, :] * B[m, n, :] """ if not iscomplex(A): raise TypeError('The input must be complex, indicated by a last ' 'dimension of size 2') if B.ndimension() != 3: raise RuntimeError('The filter must be a 3-tensor, with a last ' 'dimension of size 1 or 2 to indicate it is real ' 'or complex, respectively') if not iscomplex(B) and not isreal(B): raise TypeError('The filter must be complex or real, indicated by a ' 'last dimension of size 2 or 1, respectively') if A.size()[-3:-1] != B.size()[-3:-1]: raise RuntimeError('The filters are not compatible for multiplication!') if A.dtype is not B.dtype: raise RuntimeError('A and B must be of the same dtype') if A.device != B.device: raise RuntimeError('A and B must be on the same device') if isreal(B): if inplace: return A.mul_(B) else: return A * B else: A, B = A.contiguous(), B.contiguous() C = A.new(A.size()) if not inplace else A m, n = B.nelement() // 2, A.nelement() // B.nelement() lda = m ldc = m incx = 1 handle = torch.cuda.current_blas_handle() stream = torch.cuda.current_stream()._as_parameter_ cublas.cublasSetStream(handle, stream) cublas.cublasCdgmm(handle, 'l', m, n, A.data_ptr(), lda, B.data_ptr(), incx, C.data_ptr(), ldc) return C
def cublas_dot(a, b, out=None, transpose_a=False, transpose_b=False, increment=False, stream=None): """Matrix multiplication using CUBLAS.""" assert not increment or out is not None dtype = a.dtype if transpose_a: a1, a0 = a.shape else: a0, a1 = a.shape if transpose_b: b1, b0 = b.shape else: b0, b1 = b.shape assert a1 == b0 if out is None: out = gpuarray.zeros((a0, b1), dtype=dtype) assert a.dtype == b.dtype == out.dtype # note: we swap the order of a and b and swap the transposes because # cublas assumes column-major ordering transa = "t" if transpose_a else "n" transb = "t" if transpose_b else "n" beta = dtype.type(1.0) if increment else dtype.type(0.0) lda = a0 if transpose_a else a1 ldb = b0 if transpose_b else b1 ldout = b1 if stream is not None: # note: this assumes that the stream is set to the default stream to # start cublas.cublasSetStream(misc._global_cublas_handle, stream.handle) if dtype == np.float32: gemm = cublas.cublasSgemm else: gemm = cublas.cublasDgemm gemm(misc._global_cublas_handle, transb, transa, b1, a0, a1, dtype.type(1.0), b.gpudata, ldb, a.gpudata, lda, beta, out.gpudata, ldout) if stream is not None: cublas.cublasSetStream(misc._global_cublas_handle, 0) return out
def todense(self, out=None, allocator=mem_alloc, stream=None): if out is None: out = gpuarray.empty(self.shape, allocator=allocator, dtype=self.dtype, order="C") if self.nnz == 0: # weird but happens out.fill(0.0, stream=stream) return out # we need to out-of-place transpose if we want rowmajor outputs # thus we need a temporary to store our results if out.flags.c_contiguous: tmp = gpuarray.empty(self.shape, allocator=allocator, dtype=self.dtype, order="C") else: tmp = out if stream is not None: cusparse.cusparseSetStream(cusparse_handle, stream.handle) cublas.cublasSetStream(cublas_handle, stream.handle) cusparse.cusparseScsr2dense( cusparse_handle, self.shape[0], self.shape[1], self.descr, self.data.gpudata, self.indptr.gpudata, self.indices.gpudata, tmp.gpudata, tmp.shape[0], ) if out.flags.c_contiguous: cublas.cublasSgeam( cublas_handle, 1, 1, tmp.shape[1], tmp.shape[0], 1.0, tmp.gpudata, tmp.shape[0], 0.0, 0, tmp.shape[0], out.gpudata, out.shape[1], ) if stream is not None: cusparse.cusparseSetStream(cusparse_handle, 0) cublas.cublasSetStream(cublas_handle, 0) return out
def forward(ctx, A, B): """ Complex pointwise multiplication between (batched) tensor A and tensor B. Parameters ---------- A : tensor input tensor with size (B, C, M, N, 2) B : tensor B is a complex tensor of size (M, N, 2) inplace : boolean, optional if set to True, all the operations are performed inplace Returns ------- C : tensor output tensor of size (B, C, M, N, 2) such that: C[b, c, m, n, :] = A[b, c, m, n, :] * B[m, n, :] """ A, B = A.contiguous(), B.contiguous() ctx.save_for_backward(A, B) if A.size()[-3:] != B.size(): raise RuntimeError( 'The filters are not compatible for multiplication!') if not iscomplex(A) or not iscomplex(B): raise TypeError('The input, filter and output should be complex') if B.ndimension() != 3: raise RuntimeError('The filters must be simply a complex array!') if type(A) is not type(B): raise RuntimeError('A and B should be same type!') if not A.is_cuda: raise RuntimeError('Use the torch backend for cpu tensors!') C = A.new(A.size()) m, n = B.nelement() // 2, A.nelement() // B.nelement() lda = m ldc = m incx = 1 handle = torch.cuda.current_blas_handle() stream = torch.cuda.current_stream()._as_parameter_ cublas.cublasSetStream(handle, stream) cublas.cublasCdgmm(handle, 'l', m, n, A.data_ptr(), lda, B.data_ptr(), incx, C.data_ptr(), ldc) return C
def cdgmm(A, B, jit=True, inplace=False): """This function uses the C-wrapper to use cuBLAS. """ A, B = A.contiguous(), B.contiguous() if A.size()[-3:] != B.size(): raise RuntimeError( 'The filters are not compatible for multiplication!') if not iscomplex(A) or not iscomplex(B): raise TypeError('The input, filter and output should be complex') if B.ndimension() != 3: raise RuntimeError('The filters must be simply a complex array!') if type(A) is not type(B): raise RuntimeError('A and B should be same type!') if not jit or isinstance(A, (torch.FloatTensor, torch.DoubleTensor)): C = A.new(A.size()) A_r = A[..., 0].contiguous().view(-1, A.size(-2) * A.size(-3)) A_i = A[..., 1].contiguous().view(-1, A.size(-2) * A.size(-3)) B_r = B[..., 0].contiguous().view( B.size(-2) * B.size(-3)).unsqueeze(0).expand_as(A_i) B_i = B[..., 1].contiguous().view( B.size(-2) * B.size(-3)).unsqueeze(0).expand_as(A_r) C[..., 0].copy_(A_r * B_r - A_i * B_i) C[..., 1].copy_(A_r * B_i + A_i * B_r) # faster if B is actually real #B[...,1] = B[...,0] #C = A * B.unsqueeze(0).expand_as(A) return C if not inplace else A.copy_(C) else: C = A.new(A.size()) if not inplace else A m, n = B.nelement() // 2, A.nelement() // B.nelement() lda = m ldc = m incx = 1 handle = torch.cuda.current_blas_handle() stream = torch.cuda.current_stream()._as_parameter_ cublas.cublasSetStream(handle, stream) cublas.cublasCdgmm(handle, 'l', m, n, A.data_ptr(), lda, B.data_ptr(), incx, C.data_ptr(), ldc) return C
def cublas_dgmm(A, x, out=None): if out is not None: assert out.is_contiguous() and out.size() == A.size() else: out = A.new(A.size()) assert x.dim() == 1 assert x.numel() == A.size(-1) or x.numel() == A.size(0) assert A.type() == x.type() == out.type() assert A.is_contiguous() if not isinstance(A, (torch.cuda.FloatTensor, torch.cuda.DoubleTensor)): if x.numel() == A.size(-1): return A.mm(torch.diag(x), out=out.view_as(A)) else: return torch.diag(x).mm(A, out=out.view_as(A)) else: if x.numel() == A.size(-1): m, n = A.size(-1), A.numel() // A.size(-1) mode = 'l' # A.mm(x.diag(), out=out) # return out elif x.numel() == A.size(0): n, m = A.size(0), A.numel() // A.size(0) mode = 'r' # if A.stride(0) == 1: # mode = 'l' # n, m = m, n # x.diag().mm(A, out=out) # return out lda, ldc = m, m incx = 1 handle = torch.cuda.current_blas_handle() stream = torch.cuda.current_stream()._as_parameter_ from skcuda import cublas cublas.cublasSetStream(handle, stream) args = [ handle, mode, m, n, A.data_ptr(), lda, x.data_ptr(), incx, out.data_ptr(), ldc ] if isinstance(A, torch.cuda.FloatTensor): cublas.cublasSdgmm(*args) elif isinstance(A, torch.cuda.DoubleTensor): cublas.cublasDdgmm(*args) return out
def forward(self, vector1, vector2): with torch.cuda.device_of(vector1): output = vector1.new(1) handle = torch.cuda.current_blas_handle() stream = torch.cuda.current_stream() cublas.cublasSetStream(handle, stream) if isinstance(vector1, torch.cuda.FloatTensor): result = cublas.cublasSdot(handle, vector1.numel(), vector1.data_ptr(), 1, vector2.data_ptr(), 1) elif isinstance(vector1, torch.cuda.DoubleTensor): result = cublas.cublasDdot(handle, vector1.numel(), vector1.data_ptr(), 1, vector2.data_ptr(), 1) output = output.fill_(float(result)) self.save_for_backward(vector1, vector2) return output
def cdgmm3d(A, B, inplace=False): """ Pointwise multiplication of complex tensors. ---------- A: complex tensor B: complex tensor of the same size as A Returns ------- output : tensor of the same size as A containing the result of the elementwise complex multiplication of A with B """ if not A.is_contiguous(): warnings.warn("cdgmm3d: tensor A is converted to a contiguous array") A = A.contiguous() if not B.is_contiguous(): warnings.warn("cdgmm3d: tensor B is converted to a contiguous array") B = B.contiguous() if A.size()[-4:] != B.size(): raise RuntimeError('The filters are not compatible for multiplication.') if not iscomplex(A) or not iscomplex(B): raise TypeError('The input, filter and output should be complex.') if B.ndimension() != 4: raise RuntimeError('The filters must be simply a complex array.') if type(A) is not type(B): raise RuntimeError('A and B should be same type.') if not A.is_cuda: raise RuntimeError('Use the torch backend for cpu tensors.') C = A.new(A.size()) if not inplace else A m, n = B.nelement() // 2, A.nelement() // B.nelement() lda = m ldc = m incx = 1 handle = torch.cuda.current_blas_handle() stream = torch.cuda.current_stream()._as_parameter_ cublas.cublasSetStream(handle, stream) cublas.cublasCdgmm(handle, 'l', m, n, A.data_ptr(), lda, B.data_ptr(), incx, C.data_ptr(), ldc) return C
def cdgmm(A, B, jit=True, inplace=False): """This function uses the C-wrapper to use cuBLAS. """ A, B = A.contiguous(), B.contiguous() if A.size()[-3:] != B.size(): raise RuntimeError('The filters are not compatible for multiplication!') if not iscomplex(A) or not iscomplex(B): raise TypeError('The input, filter and output should be complex') if B.ndimension() != 3: raise RuntimeError('The filters must be simply a complex array!') if type(A) is not type(B): raise RuntimeError('A and B should be same type!') if not jit or isinstance(A, (torch.FloatTensor, torch.DoubleTensor)): C = A.new(A.size()) A_r = A[..., 0].contiguous().view(-1, A.size(-2)*A.size(-3)) A_i = A[..., 1].contiguous().view(-1, A.size(-2)*A.size(-3)) B_r = B[...,0].contiguous().view(B.size(-2)*B.size(-3)).unsqueeze(0).expand_as(A_i) B_i = B[..., 1].contiguous().view(B.size(-2)*B.size(-3)).unsqueeze(0).expand_as(A_r) C[..., 0].copy_(A_r * B_r - A_i * B_i) C[..., 1].copy_(A_r * B_i + A_i * B_r) # faster if B is actually real #B[...,1] = B[...,0] #C = A * B.unsqueeze(0).expand_as(A) return C if not inplace else A.copy_(C) else: C = A.new(A.size()) if not inplace else A m, n = B.nelement() // 2, A.nelement() // B.nelement() lda = m ldc = m incx = 1 handle = torch.cuda.current_blas_handle() stream = torch.cuda.current_stream()._as_parameter_ cublas.cublasSetStream(handle, stream) cublas.cublasCdgmm(handle, 'l', m, n, A.data_ptr(), lda, B.data_ptr(), incx, C.data_ptr(), ldc) return C
def forward(self, matrix1, matrix2): with torch.cuda.device_of(matrix1): dim1, dim2 = matrix1.size() dim2, dim3 = matrix2.size() output = matrix1.new(dim1, dim3) handle = torch.cuda.current_blas_handle() stream = torch.cuda.current_stream() cublas.cublasSetStream(handle, stream) if isinstance(matrix1, torch.cuda.FloatTensor): cublas.cublasSgemm(handle, 'n', 'n', dim3, dim1, dim2, 1, matrix2.data_ptr(), dim3, matrix1.data_ptr(), dim2, 0, output.data_ptr(), dim3) elif isinstance(matrix1, torch.cuda.DoubleTensor): cublas.cublasDgemm(handle, 'n', 'n', dim3, dim1, dim2, 1, matrix2.data_ptr(), dim3, matrix1.data_ptr(), dim2, 0, output.data_ptr(), dim3) self.save_for_backward(matrix1, matrix2) return output
def cublas_cdgmm(A, x, out=None): if out is not None: assert out.is_contiguous() and out.size() == A.size() else: out = A.new(A.size()) assert x.dim() == 2 and x.size(-1) == 2 and A.size(-1) == 2 assert A.dim() == 3 assert x.size(0) == A.size(1) or x.size(0) == A.size(0) assert A.type() == x.type() == out.type() assert A.is_contiguous() if not isinstance(A, (torch.cuda.FloatTensor, torch.cuda.DoubleTensor)): raise NotImplementedError else: m, n = A.size(1), A.size(0) if x.size(0) == A.size(1): mode = 'l' elif x.size(0) == A.size(0): mode = 'r' lda, ldc = m, m incx = 1 handle = torch.cuda.current_blas_handle() stream = torch.cuda.current_stream()._as_parameter_ from skcuda import cublas cublas.cublasSetStream(handle, stream) args = [ handle, mode, m, n, A.data_ptr(), lda, x.data_ptr(), incx, out.data_ptr(), ldc ] if isinstance(A, torch.cuda.FloatTensor): cublas.cublasCdgmm(*args) elif isinstance(A, torch.cuda.DoubleTensor): cublas.cublasZdgmm(*args) return out
def cublas_handle(): cublas_handle = cublas.cublasCreate() cublas.cublasSetStream(cublas_handle, stream().handle) return cublas_handle
def cdgmm3d(A, B, inplace=False): """Complex pointwise multiplication. Complex pointwise multiplication between (batched) tensor A and tensor B. Parameters ---------- A : torch tensor Complex torch tensor. B : torch tensor Complex of the same size as A. inplace : boolean, optional If set True, all the operations are performed inplace. Raises ------ RuntimeError In the event that the tensors are not compatibile for multiplication (i.e. the final four dimensions of A do not match with the dimensions of B), or in the event that B is not complex, or in the event that the type of A and B are not the same. TypeError In the event that x is not complex i.e. does not have a final dimension of 2, or in the event that both tensors are not on the same device. Returns ------- output : torch tensor Torch tensor of the same size as A containing the result of the elementwise complex multiplication of A with B. """ if not A.is_contiguous(): warnings.warn("cdgmm3d: tensor A is converted to a contiguous array") A = A.contiguous() if not B.is_contiguous(): warnings.warn("cdgmm3d: tensor B is converted to a contiguous array") B = B.contiguous() if A.shape[-4:] != B.shape: raise RuntimeError( 'The filters are not compatible for multiplication.') if not _is_complex(A) or not _is_complex(B): raise TypeError('The input, filter and output should be complex.') if B.ndimension() != 4: raise RuntimeError('The filters must be simply a complex array.') if type(A) is not type(B): raise RuntimeError('A and B should be same type.') if not A.is_cuda: raise RuntimeError('Use the torch backend for CPU tensors.') C = A.new(A.shape) if not inplace else A m, n = B.nelement() // 2, A.nelement() // B.nelement() lda = m ldc = m incx = 1 handle = torch.cuda.current_blas_handle() stream = torch.cuda.current_stream()._as_parameter_ cublas.cublasSetStream(handle, stream) cublas.cublasCdgmm(handle, 'l', m, n, A.data_ptr(), lda, B.data_ptr(), incx, C.data_ptr(), ldc) return C
def vis_gpu(antpos, freq, eq2tops, crd_eq, I_sky, bm_cube, nthreads=NTHREADS, max_memory=MAX_MEMORY, real_dtype=np.float32, complex_dtype=np.complex64, verbose=False): # ensure shapes nant = antpos.shape[0] assert (antpos.shape == (nant, 3)) npix = crd_eq.shape[1] assert (crd_eq.shape == (3, npix)) assert (I_sky.shape == (npix, )) beam_px = bm_cube.shape[1] assert (bm_cube.shape == (nant, beam_px, beam_px)) ntimes = eq2tops.shape[0] assert (eq2tops.shape == (ntimes, 3, 3)) # ensure data types antpos = antpos.astype(real_dtype) eq2tops = eq2tops.astype(real_dtype) crd_eq = crd_eq.astype(real_dtype) Isqrt = np.sqrt(I_sky).astype(real_dtype) bm_cube = bm_cube.astype(real_dtype) # XXX complex? chunk = max(min(npix, MIN_CHUNK), 2**int(ceil(np.log2(float(nant * npix) / max_memory / 2)))) npixc = npix / chunk # blocks of threads are mapped to (pixels,ants,freqs) block = (max(1, nthreads / nant), min(nthreads, nant), 1) grid = (int(ceil(npixc / float(block[0]))), int(ceil(nant / float(block[1])))) gpu_code = GPU_TEMPLATE % { 'NANT': nant, 'NPIX': npixc, 'BEAM_PX': beam_px, 'BLOCK_PX': block[0], } gpu_module = compiler.SourceModule(gpu_code) bm_interp = gpu_module.get_function("InterpolateBeam") meas_eq = gpu_module.get_function("MeasEq") bm_texref = gpu_module.get_texref("bm_tex") import pycuda.autoinit h = cublasCreate() # handle for managing cublas # define GPU buffers and transfer initial values bm_texref.set_array( numpy3d_to_array(bm_cube) ) # never changes, transpose happens in copy so cuda bm_tex is (BEAM_PX,BEAM_PX,NANT) antpos_gpu = gpuarray.to_gpu( antpos) # never changes, set to -2*pi*antpos/c Isqrt_gpu = gpuarray.empty(shape=(npixc, ), dtype=real_dtype) A_gpu = gpuarray.empty(shape=(nant, npixc), dtype=real_dtype) # will be set on GPU by bm_interp crd_eq_gpu = gpuarray.empty(shape=(3, npixc), dtype=real_dtype) eq2top_gpu = gpuarray.empty(shape=(3, 3), dtype=real_dtype) # sent from CPU each time crdtop_gpu = gpuarray.empty(shape=(3, npixc), dtype=real_dtype) # will be set on GPU tau_gpu = gpuarray.empty(shape=(nant, npixc), dtype=real_dtype) # will be set on GPU v_gpu = gpuarray.empty(shape=(nant, npixc), dtype=complex_dtype) # will be set on GPU vis_gpus = [ gpuarray.empty(shape=(nant, nant), dtype=complex_dtype) for i in xrange(chunk) ] # output CPU buffers for downloading answers vis_cpus = [ np.empty(shape=(nant, nant), dtype=complex_dtype) for i in xrange(chunk) ] streams = [driver.Stream() for i in xrange(chunk)] event_order = ('start', 'upload', 'eq2top', 'tau', 'interpolate', 'meas_eq', 'vis', 'end') vis = np.empty((ntimes, nant, nant), dtype=complex_dtype) for t in xrange(ntimes): if verbose: print '%d/%d' % (t + 1, ntimes) eq2top_gpu.set( eq2tops[t]) # defines sky orientation for this time step events = [{e: driver.Event() for e in event_order} for i in xrange(chunk)] for c in xrange(chunk + 2): cc = c - 1 ccc = c - 2 if 0 <= ccc < chunk: stream = streams[ccc] vis_gpus[ccc].get_async(ary=vis_cpus[ccc], stream=stream) events[ccc]['end'].record(stream) if 0 <= cc < chunk: stream = streams[cc] cublasSetStream(h, stream.handle) ## compute crdtop = dot(eq2top,crd_eq) # cublas arrays are in Fortran order, so P=M*N is actually # peformed as P.T = N.T * M.T cublasSgemm(h, 'n', 'n', npixc, 3, 3, 1., crd_eq_gpu.gpudata, npixc, eq2top_gpu.gpudata, 3, 0., crdtop_gpu.gpudata, npixc) events[cc]['eq2top'].record(stream) ## compute tau = dot(antpos,crdtop) cublasSgemm(h, 'n', 'n', npixc, nant, 3, 1., crdtop_gpu.gpudata, npixc, antpos_gpu.gpudata, 3, 0., tau_gpu.gpudata, npixc) events[cc]['tau'].record(stream) ## interpolate bm_tex at specified topocentric coords, store interpolation in A ## threads are parallelized across pixel axis bm_interp(crdtop_gpu, A_gpu, grid=grid, block=block, stream=stream) events[cc]['interpolate'].record(stream) # compute v = A * I * exp(1j*tau*freq) meas_eq(A_gpu, Isqrt_gpu, tau_gpu, real_dtype(freq), v_gpu, grid=grid, block=block, stream=stream) events[cc]['meas_eq'].record(stream) # compute vis = dot(v, v.T) # transpose below incurs about 20% overhead cublasCgemm(h, 'c', 'n', nant, nant, npixc, 1., v_gpu.gpudata, npixc, v_gpu.gpudata, npixc, 0., vis_gpus[cc].gpudata, nant) events[cc]['vis'].record(stream) if c < chunk: stream = streams[c] events[c]['start'].record(stream) crd_eq_gpu.set_async(crd_eq[:, c * npixc:(c + 1) * npixc], stream=stream) Isqrt_gpu.set_async(Isqrt[c * npixc:(c + 1) * npixc], stream=stream) events[c]['upload'].record(stream) events[chunk - 1]['end'].synchronize() vis[t] = sum(vis_cpus) if verbose: for c in xrange(chunk): print '%d:%d START->END:' % ( c, chunk), events[c]['start'].time_till( events[c]['end']) * 1e-3 #for i,e in enumerate(event_order[:-1]): # print c, e,'->',event_order[i+1], ':', events[c][e].time_till(events[c][event_order[i+1]]) * 1e-3 print 'TOTAL:', events[0]['start'].time_till( events[chunk - 1]['end']) * 1e-3 # teardown GPU configuration cublasDestroy(h) return vis
def cdgmm(A, B, inplace=False): """Complex pointwise multiplication. Complex pointwise multiplication between (batched) tensor A and tensor B. Parameters ---------- A : tensor A is a complex tensor of size (B, C, M, N, 2). B : tensor B is a complex tensor of size (M, N, 2) or real tensor of (M, N, 1). inplace : boolean, optional If set to True, all the operations are performed in place. Raises ------ RuntimeError In the event that the filter B is not a 3-tensor with a last dimension of size 1 or 2, or A and B are not compatible for multiplication, or if A or B are not contiguous. TypeError In the event that A is not complex, or B does not have a final dimension of 1 or 2, or A and B are not of the same dtype, or if A or B are not cuda tensors, or if A and B are not on the same device. Returns ------- C : tensor Output tensor of size (B, C, M, N, 2) such that: C[b, c, m, n, :] = A[b, c, m, n, :] * B[m, n, :]. """ if not _is_complex(A): raise TypeError( 'The input should be complex (i.e. last dimension is 2).') if not _is_complex(B) and not _is_real(B): raise TypeError('The filter should be complex or real, indicated by a ' 'last dimension of size 2 or 1, respectively.') if A.shape[-len(B.shape):-1] != B.shape[:-1]: raise RuntimeError( 'The filters are not compatible for multiplication.') if A.dtype is not B.dtype: raise TypeError('Input and filter must be of the same dtype.') if not A.is_cuda or not B.is_cuda: raise TypeError('Input and filter must be CUDA tensors.') if A.device.index != B.device.index: raise TypeError('Input and filter must be on the same GPU.') if _is_real(B): if inplace: return A.mul_(B) else: return A * B else: if not A.is_contiguous() or not B.is_contiguous(): raise RuntimeError('Tensors must be contiguous.') C = A.new(A.shape) if not inplace else A m, n = B.nelement() // 2, A.nelement() // B.nelement() lda = m ldc = m incx = 1 handle = torch.cuda.current_blas_handle() stream = torch.cuda.current_stream()._as_parameter_ cublas.cublasSetStream(handle, stream) cublas.cublasCdgmm(handle, 'l', m, n, A.data_ptr(), lda, B.data_ptr(), incx, C.data_ptr(), ldc) return C
cuda.register_host_memory(x[i * N / nStreams:(i + 1) * N / nStreams]) for i in range(nStreams) ] y_pin = [ cuda.register_host_memory(y[i * N / nStreams:(i + 1) * N / nStreams]) for i in range(nStreams) ] h = cublas.cublasCreate() x_gpu = np.empty(nStreams, dtype=object) y_gpu = np.empty(nStreams, dtype=object) ans = np.empty(nStreams, dtype=object) for i in range(nStreams): cublas.cublasSetStream(h, streams[i].handle) x_gpu[i] = gpuarray.to_gpu_async(x_pin[i], stream=streams[i]) y_gpu[i] = gpuarray.to_gpu_async(y_pin[i], stream=streams[i]) cublas.cublasSaxpy(h, x_gpu[i].size, a, x_gpu[i].gpudata, 1, y_gpu[i].gpudata, 1) ans[i] = y_gpu[i].get_async(stream=streams[i]) cublas.cublasDestroy(h) # Uncomment to check for errors in the calculation #y_gpu = np.array([yg.get() for yg in y_gpu]) #y_gpu = np.array(y_gpu).reshape(y.shape) #print np.allclose(y_gpu, a*x + y)