def test_cublasZgemmBatched(self):
    """Check batched complex128 GEMM against a NumPy reference product.

    A batch of ``l`` matrix pairs with shapes ``(m, k)`` and ``(k, n)`` is
    multiplied on the device and compared to ``np.einsum`` on the host.
    """
    l, m, k, n = 11, 7, 5, 3

    def random_complex(shape):
        # Real part is drawn first, then the imaginary part.
        real = np.random.rand(*shape)
        imag = np.random.rand(*shape)
        return (real + 1j * imag).astype(np.complex128)

    lhs = random_complex((l, m, k))
    rhs = random_complex((l, k, n))
    expected = np.einsum('nij,njk->nik', lhs, rhs)

    lhs_gpu = gpuarray.to_gpu(lhs)
    rhs_gpu = gpuarray.to_gpu(rhs)
    result_gpu = gpuarray.empty((l, m, n), np.complex128)

    alpha = np.complex128(1.0)
    beta = np.complex128(0.0)

    # Device-side arrays of per-matrix pointers, one entry per batch item.
    lhs_ptrs = bptrs(lhs_gpu)
    rhs_ptrs = bptrs(rhs_gpu)
    result_ptrs = bptrs(result_gpu)

    # The operands are handed over in swapped order (rhs first, with
    # dimensions n, m, k): cuBLAS works column-major, so this yields the
    # row-major product lhs . rhs in result_gpu.
    cublas.cublasZgemmBatched(
        self.cublas_handle, 'n', 'n',
        n, m, k,
        alpha,
        rhs_ptrs.gpudata, n,
        lhs_ptrs.gpudata, k,
        beta,
        result_ptrs.gpudata, n,
        l,
    )

    assert np.allclose(expected, result_gpu.get())
def eps_l_noop_batch(x_ptrs, A1_ptrs, A2_ptrs, out, tmp_ptrs, tmp2_ptrs, tmp2, handle):
    """Accumulate a batched two-stage matrix product into ``out``.

    In cuBLAS's column-major convention, for every batch index ``s``:

        tmp[s]  = x[s]  * A1[s]^H
        tmp2[s] = A2[s] * tmp[s]

    and finally ``out = sum_s tmp2[s]``.

    All matrices are treated as ``D x D`` with ``D = out.shape[0]``; the
    batch size is ``len(tmp2)``.

    NOTE(review): presumably each ``*_ptrs`` argument is a device array of
    per-matrix pointers and ``tmp2_ptrs`` points at the entries of ``tmp2``
    -- confirm against the caller.

    Returns ``out`` (overwritten in place).
    """
    dim = out.shape[0]
    batch = len(tmp2)
    n_elems = dim * dim

    # Stage 1: x times the conjugate transpose of A1, for the whole batch.
    cb.cublasZgemmBatched(
        handle, 'N', 'C',
        dim, dim, dim,
        1.,
        x_ptrs.gpudata, dim,
        A1_ptrs.gpudata, dim,
        0.,
        tmp_ptrs.gpudata, dim,
        batch,
    )

    # Stage 2: A2 times the stage-1 result, for the whole batch.
    cb.cublasZgemmBatched(
        handle, 'N', 'N',
        dim, dim, dim,
        1.,
        A2_ptrs.gpudata, dim,
        tmp_ptrs.gpudata, dim,
        0.,
        tmp2_ptrs.gpudata, dim,
        batch,
    )

    # Reduce over the batch: out starts at zero and every tmp2[s] is added
    # element-wise through a flat axpy over all D*D entries.
    out.fill(0)
    for idx in range(batch):
        term = tmp2[idx]
        cb.cublasZaxpy(handle, n_elems, 1., term.gpudata, 1, out.gpudata, 1)

    return out
def eps_r_noop_batch(x_ptrs, A1_ptrs, A2_ptrs, out, tmp_ptrs, tmp2_ptrs, tmp2, handle):
    """Accumulate a batched two-stage matrix product into ``out``.

    In cuBLAS's column-major convention, for every batch index ``s``:

        tmp[s]  = x[s] * A1[s]
        tmp2[s] = A2[s]^H * tmp[s]

    and finally ``out = sum_s tmp2[s]``.

    Both matrix dimensions are taken from ``out.shape[0]``, so only the
    square case is handled. The batch size is ``len(tmp2)``.

    NOTE(review): presumably each ``*_ptrs`` argument is a device array of
    per-matrix pointers and ``tmp2_ptrs`` points at the entries of ``tmp2``
    -- confirm against the caller.

    Returns ``out`` (overwritten in place).
    """
    D = out.shape[0]
    # Kept as a separate name to mark which GEMM dimensions play the
    # "other" role; here it is always equal to D.
    Dm1 = D
    d = len(tmp2)

    # Stage 1: x times A1, for the whole batch.
    cb.cublasZgemmBatched(
        handle, "N", "N",
        D, Dm1, D,
        1.0,
        x_ptrs.gpudata, D,
        A1_ptrs.gpudata, D,
        0.0,
        tmp_ptrs.gpudata, D,
        d,
    )

    # Stage 2: conjugate transpose of A2 times the stage-1 result.
    cb.cublasZgemmBatched(
        handle, "C", "N",
        Dm1, Dm1, D,
        1.0,
        A2_ptrs.gpudata, D,
        tmp_ptrs.gpudata, D,
        0.0,
        tmp2_ptrs.gpudata, Dm1,
        d,
    )

    # Reduce over the batch with flat element-wise additions into out.
    # ``range`` (not ``xrange``) keeps this working on Python 3 and matches
    # eps_l_noop_batch.
    out.fill(0)
    for s in range(d):
        cb.cublasZaxpy(handle, Dm1 * Dm1, 1.0, tmp2[s].gpudata, 1, out.gpudata, 1)

    return out