def test_convolution_backward_bias(self): logging.debug("ENTER: test_convolution_backward_bias") bperr_data = numpy.zeros((100, 64, 104, 226), dtype=numpy.float32) bperr_data[:] = 0.1 bperr_desc = cudnn.TensorDescriptor() bperr_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT, *bperr_data.shape) bperr_buf = cu.MemAlloc(self.ctx, bperr_data) gd_data = numpy.zeros(64, dtype=numpy.float32) gd_desc = cudnn.TensorDescriptor() gd_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT, 1, gd_data.size, 1, 1) gd_buf = cu.MemAlloc(self.ctx, gd_data) alpha = numpy.ones(1, dtype=numpy.float32) beta = numpy.zeros(1, dtype=numpy.float32) self.cudnn.convolution_backward_bias(alpha, bperr_desc, bperr_buf, beta, gd_desc, gd_buf) gd_buf.to_host(gd_data) self.assertEqual(numpy.count_nonzero(gd_data), gd_data.size) logging.debug("EXIT: test_convolution_backward_bias")
def test_transform_tensor(self): logging.debug("ENTER: test_transform_tensor") sh_interleaved = (2, 5, 5, 3) sh_splitted = (2, 3, 5, 5) inp_data = numpy.arange(numpy.prod(sh_interleaved), dtype=numpy.float32).reshape(sh_interleaved) inp_desc = cudnn.TensorDescriptor() inp_desc.set_4d(cudnn.CUDNN_TENSOR_NHWC, cudnn.CUDNN_DATA_FLOAT, *sh_splitted) inp_buf = cu.MemAlloc(self.ctx, inp_data) out_data = numpy.zeros(sh_splitted, dtype=numpy.float32) out_desc = cudnn.TensorDescriptor() out_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT, *sh_splitted) out_buf = cu.MemAlloc(self.ctx, out_data) alpha = numpy.ones(1, dtype=numpy.float32) beta = numpy.zeros(1, dtype=numpy.float32) self.cudnn.transform_tensor(alpha, inp_desc, inp_buf, beta, out_desc, out_buf) out_buf.to_host(out_data) max_diff = numpy.fabs(out_data - inp_data.transpose(0, 3, 1, 2)).max() self.assertEqual(max_diff, 0.0) logging.debug("EXIT: test_transform_tensor")
def test_memcpy(self): logging.debug("ENTER: test_memcpy") ctx = cu.Devices().create_some_context() a = cu.MemAlloc(ctx, 4096) a.memset32_async(123) b = cu.MemAlloc(ctx, 4096) b.memset32_async(456) test = numpy.zeros(a.size // 4, dtype=numpy.int32) a.from_device_async(b) a.to_host(test) for x in test: self.assertEqual(x, 456) a.memset32_async(123) a.from_device_async(b, 12) a.to_host(test) for x in test[:3]: self.assertEqual(x, 123) for x in test[3:]: self.assertEqual(x, 456) a.memset32_async(123) a.from_device_async(b, 12, 64) a.to_host(test) for x in test[:3]: self.assertEqual(x, 123) for x in test[3:19]: self.assertEqual(x, 456) for x in test[19:]: self.assertEqual(x, 123) logging.debug("EXIT: test_memcpy")
def _test_exec_complex(self, dtype): x = numpy.zeros([32, 64], dtype=dtype) x.real = numpy.random.rand(x.size).reshape(x.shape) - 0.5 x.imag = numpy.random.rand(x.size).reshape(x.shape) - 0.5 y = numpy.ones_like(x) x_gold = x.copy() try: y_gold = numpy.fft.fft2(x) except TypeError: y_gold = None # for pypy xbuf = cu.MemAlloc(self.ctx, x) ybuf = cu.MemAlloc(self.ctx, y) # Forward transform fft = cufft.CUFFT(self.ctx) fft.auto_allocation = False sz = fft.make_plan_many(x.shape, 1, { numpy.complex64: cufft.CUFFT_C2C, numpy.complex128: cufft.CUFFT_Z2Z }[dtype]) tmp = cu.MemAlloc(self.ctx, sz) fft.workarea = tmp self.assertEqual(fft.workarea, tmp) self.assertEqual(fft.execute, { numpy.complex64: fft.exec_c2c, numpy.complex128: fft.exec_z2z }[dtype]) fft.execute(xbuf, ybuf, cufft.CUFFT_FORWARD) ybuf.to_host(y) if y_gold is not None: delta = y - y_gold max_diff = numpy.fabs( numpy.sqrt(delta.real * delta.real + delta.imag * delta.imag)).max() logging.debug("Forward max_diff is %.6e", max_diff) self.assertLess(max_diff, { numpy.complex64: 1.0e-3, numpy.complex128: 1.0e-6 }[dtype]) # Inverse transform y /= x.size # correct scale before inverting ybuf.to_device_async(y) xbuf.memset32_async(0) # reset the resulting vector fft.execute(ybuf, xbuf, cufft.CUFFT_INVERSE) xbuf.to_host(x) delta = x - x_gold max_diff = numpy.fabs( numpy.sqrt(delta.real * delta.real + delta.imag * delta.imag)).max() logging.debug("Inverse max_diff is %.6e", max_diff) self.assertLess(max_diff, { numpy.complex64: 1.0e-3, numpy.complex128: 1.0e-6 }[dtype])
def test_memcpy_3d_async(self): logging.debug("ENTER: test_memcpy_3d_async") p_copy = cu.get_ffi().new("CUDA_MEMCPY3D *") self.assertEqual(cu.get_ffi().sizeof(p_copy[0]), 200) ctx = cu.Devices().create_some_context() logging.debug("Context created") # Create arrays with some values for testing a = numpy.arange(35 * 25 * 15, dtype=numpy.float32).reshape(35, 25, 15) b = numpy.arange(37 * 27 * 17, dtype=numpy.float32).reshape(37, 27, 17) b *= 0.5 c = numpy.empty_like(b) c[:] = 1.0e30 # Create buffers a_ = cu.MemAlloc(ctx, a) b_ = cu.MemAlloc(ctx, b) # Copy 3D rect from one device buffer to another logging.debug("Testing device -> device memcpy_3d_async") sz = a.itemsize a_.memcpy_3d_async( (3 * sz, 4, 5), (6 * sz, 7, 8), (5 * sz, 10, 20), a.shape[2] * sz, a.shape[1], b.shape[2] * sz, b.shape[1], dst=b_) b_.to_host(c) diff = numpy.fabs(c[8:28, 7:17, 6:11] - a[5:25, 4:14, 3:8]).max() self.assertEqual(diff, 0) # Copy 3D rect from host buffer to device buffer logging.debug("Testing host -> device memcpy_3d_async") sz = a.itemsize b_.memset32_async() b_.memcpy_3d_async( (3 * sz, 4, 5), (6 * sz, 7, 8), (5 * sz, 10, 20), a.shape[2] * sz, a.shape[1], b.shape[2] * sz, b.shape[1], src=a) c[:] = 1.0e30 b_.to_host(c) diff = numpy.fabs(c[8:28, 7:17, 6:11] - a[5:25, 4:14, 3:8]).max() self.assertEqual(diff, 0) # Copy 3D rect from device buffer to host buffer logging.debug("Testing device -> host memcpy_3d_async") sz = a.itemsize c[:] = 1.0e30 a_.memcpy_3d_async( (3 * sz, 4, 5), (6 * sz, 7, 8), (5 * sz, 10, 20), a.shape[2] * sz, a.shape[1], b.shape[2] * sz, b.shape[1], dst=c) ctx.synchronize() diff = numpy.fabs(c[8:28, 7:17, 6:11] - a[5:25, 4:14, 3:8]).max() self.assertEqual(diff, 0) logging.debug("EXIT: test_memcpy_3d_async")
def test_convolution_backward_data(self): logging.debug("ENTER: test_convolution_backward_data") conv_desc = cudnn.ConvolutionDescriptor() conv_desc.set_2d(5, 5, 1, 1) inp_data = numpy.zeros((100, 8, 96, 96), dtype=numpy.float32) inp_desc = cudnn.TensorDescriptor() inp_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT, *inp_data.shape) inp_buf = cu.MemAlloc(self.ctx, inp_data) filter_data = numpy.zeros((64, 8, 11, 11), dtype=numpy.float32) filter_data[:] = 0.1 filter_desc = cudnn.FilterDescriptor() filter_desc.set_4d(cudnn.CUDNN_DATA_FLOAT, *filter_data.shape) filter_buf = cu.MemAlloc(self.ctx, filter_data) bperr_data = numpy.zeros((100, 64, 96, 96), dtype=numpy.float32) bperr_data[:] = 0.1 bperr_desc = cudnn.TensorDescriptor() bperr_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT, *bperr_data.shape) bperr_buf = cu.MemAlloc(self.ctx, bperr_data) alpha = numpy.ones(1, dtype=numpy.float32) beta = numpy.zeros(1, dtype=numpy.float32) self.cudnn.convolution_backward_data(alpha, filter_desc, filter_buf, bperr_desc, bperr_buf, conv_desc, beta, inp_desc, inp_buf) inp_buf.to_host(inp_data) self.assertEqual(numpy.count_nonzero(inp_data), inp_data.size) if self.cudnn.version >= 4000: algo = self.cudnn.get_convolution_backward_data_algorithm( filter_desc, bperr_desc, conv_desc, inp_desc) logging.debug("Fastest algo is %d", algo) sz = self.cudnn.get_convolution_backward_data_workspace_size( filter_desc, bperr_desc, conv_desc, inp_desc, algo) logging.debug("Workspace size for it is %d", sz) algo = self.cudnn.get_convolution_backward_data_algorithm( filter_desc, bperr_desc, conv_desc, inp_desc, cudnn.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, 512 * 1024 * 1024) logging.debug("With 512 Mb limit: %d", algo) workspace = cu.MemAlloc(self.ctx, 512 * 1024 * 1024) inp_buf.memset32_async() self.cudnn.convolution_backward_data(alpha, filter_desc, filter_buf, bperr_desc, bperr_buf, conv_desc, beta, inp_desc, inp_buf, algo, workspace, workspace.size) inp_buf.to_host(inp_data) self.assertEqual(numpy.count_nonzero(inp_data), inp_data.size) logging.debug("EXIT: test_convolution_backward_data")
def _test_gemm_with_mode(self, gemm, dtype, mode): self.blas.set_pointer_mode(mode) a = numpy.zeros([127, 353], dtype=dtype) b = numpy.zeros([135, a.shape[1]], dtype=dtype) c = numpy.zeros([a.shape[0], b.shape[0]], dtype=dtype) try: numpy.random.seed(123) except AttributeError: # PyPy workaround pass a[:] = numpy.random.rand(a.size).astype(dtype).reshape(a.shape) - 0.5 b[:] = numpy.random.rand(b.size).astype(dtype).reshape(b.shape) - 0.5 gold_c = numpy.dot(a.astype(numpy.float64), b.transpose().astype(numpy.float64)) a_buf = cu.MemAlloc(self.ctx, a.nbytes) b_buf = cu.MemAlloc(self.ctx, b.nbytes) c_buf = cu.MemAlloc(self.ctx, c.nbytes * 2) alpha = numpy.ones(1, dtype={numpy.float16: numpy.float32}.get(dtype, dtype)) beta = numpy.zeros(1, dtype={numpy.float16: numpy.float32}.get(dtype, dtype)) if mode == blas.CUBLAS_POINTER_MODE_DEVICE: alpha = cu.MemAlloc(self.ctx, alpha) beta = cu.MemAlloc(self.ctx, beta) a_buf.to_device_async(a) b_buf.to_device_async(b) c_buf.to_device_async(c) c_buf.to_device_async(c, c.nbytes) gemm(blas.CUBLAS_OP_T, blas.CUBLAS_OP_N, b.shape[0], a.shape[0], a.shape[1], alpha, b_buf, a_buf, beta, c_buf) c_buf.to_host(c) max_diff = numpy.fabs(gold_c - c.astype(numpy.float64)).max() logging.debug("Maximum difference is %.6f", max_diff) self.assertLess(max_diff, { numpy.float32: 1.0e-5, numpy.float64: 1.0e-13, numpy.float16: 3.0e-3 }[dtype]) c_buf.to_host(c, c.nbytes) max_diff = numpy.fabs(c).max() # To avoid destructor call before gemm completion del beta del alpha self.assertEqual(max_diff, 0, "Written some values outside of the target array")
def test_pooling(self): logging.debug("ENTER: test_pooling") input_data = numpy.zeros((5, 96, 64, 48), dtype=numpy.float32) input_data[:] = numpy.random.rand(input_data.size).reshape( input_data.shape) - 0.5 input_desc = cudnn.TensorDescriptor() input_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT, *input_data.shape) input_buf = cu.MemAlloc(self.ctx, input_data) pooling_desc = cudnn.PoolingDescriptor() pooling_desc.set_2d((5, 3), (2, 1), (3, 2)) if self.cudnn.version < 4000: output_shape = (5, 96, 22, 24) else: output_shape = cudnn.CUDNN.get_pooling_2d_forward_output_dim( pooling_desc, input_desc) self.assertEqual(len(output_shape), 4) logging.debug("Output shape is %s", output_shape) output_data = numpy.zeros(output_shape, dtype=numpy.float32) output_data[:] = numpy.nan output_desc = cudnn.TensorDescriptor() output_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT, *output_data.shape) output_buf = cu.MemAlloc(self.ctx, output_data) np_one = numpy.ones(1, dtype=numpy.float32) np_zero = numpy.zeros(1, dtype=numpy.float32) self.cudnn.pooling_forward(pooling_desc, np_one, input_desc, input_buf, np_zero, output_desc, output_buf) output_buf.to_host(output_data) self.assertEqual(numpy.count_nonzero(numpy.isnan(output_data)), 0) diff_desc = output_desc diff_buf = cu.MemAlloc(self.ctx, output_data) grad_desc = input_desc grad_data = input_data.copy() grad_data[:] = numpy.nan grad_buf = cu.MemAlloc(self.ctx, grad_data) self.cudnn.pooling_backward(pooling_desc, np_one, output_desc, output_buf, diff_desc, diff_buf, input_desc, input_buf, np_zero, grad_desc, grad_buf) grad_buf.to_host(grad_data) self.assertEqual(numpy.count_nonzero(numpy.isnan(grad_data)), 0) logging.debug("EXIT: test_pooling")
def test_mem_alloc(self): logging.debug("ENTER: test_mem_alloc") ctx = cu.Devices().create_some_context() self._test_alloc(lambda a: cu.MemAlloc(ctx, a)) self._test_alloc(ctx.mem_alloc) logging.debug("MemAlloc succeeded") logging.debug("EXIT: test_mem_alloc")
def test_softmax(self): logging.debug("ENTER: test_softmax") x_arr = numpy.zeros((7, 37, 1, 1), dtype=numpy.float32) x_arr[:] = numpy.random.rand(x_arr.size).reshape(x_arr.shape) y_gold = x_arr.copy().reshape(x_arr.shape[0], x_arr.size // x_arr.shape[0]) y_gold[:] = (y_gold.transpose() - y_gold.max(axis=1)).transpose() numpy.exp(y_gold, y_gold) y_gold[:] = (y_gold.transpose() / y_gold.sum(axis=1)).transpose() x_desc = cudnn.TensorDescriptor() x_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT, *x_arr.shape) x = cu.MemAlloc(self.ctx, x_arr) y_arr = numpy.zeros_like(x_arr) y = cu.MemAlloc(self.ctx, y_arr) np_one = numpy.ones(1, dtype=numpy.float32) np_zero = numpy.zeros(1, dtype=numpy.float32) self.cudnn.softmax_forward(np_one, x_desc, x, np_zero, x_desc, y) y.to_host(y_arr) max_diff = numpy.fabs(y_arr.ravel() - y_gold.ravel()).max() self.assertLess(max_diff, 1.0e-5) # Backpropagtion test dy_arr = numpy.zeros_like(y_gold) dy_arr[:] = numpy.random.rand(dy_arr.size).reshape(dy_arr.shape) + 0.01 dy = cu.MemAlloc(self.ctx, dy_arr) dx_arr = numpy.zeros_like(dy_arr) dx = cu.MemAlloc(self.ctx, dx_arr) self.cudnn.softmax_backward(np_one, x_desc, y, x_desc, dy, np_zero, x_desc, dx) dx.to_host(dx_arr) self.assertEqual(numpy.count_nonzero(dx_arr), dx_arr.size) # TODO(a.kazantsev): add test for gradient correctness. logging.debug("EXIT: test_softmax")
def test_kernel(self): logging.debug("ENTER: test_kernel") cap = self.ctx.device.compute_capability if cap < (3, 5): logging.debug("Requires compute capability >= (3, 5), got %s", cap) logging.debug("EXIT: test_kernel") return with self.ctx: module = cu.Module(self.ctx, source_file=("%s/cublas.cu" % self.path), nvcc_options2=cu.Module.OPTIONS_CUBLAS, compute_capability=(cap[0], 0) if cap >= (6, 0) else cap) # minor version of compute has to be set to 0 # to work on Pascal with CUDA 8.0 logging.debug("Compiled") f = module.create_function("test") logging.debug("Got function") n = 256 a = numpy.random.rand(n, n).astype(numpy.float32) b = numpy.random.rand(n, n).astype(numpy.float32) c = numpy.zeros_like(a) c_gold = numpy.dot(a.transpose(), b.transpose()).transpose() a_ = cu.MemAlloc(self.ctx, a) b_ = cu.MemAlloc(self.ctx, b) c_ = cu.MemAlloc(self.ctx, c) zero_ = cu.MemAlloc(self.ctx, numpy.zeros(1, dtype=numpy.float32)) one_ = cu.MemAlloc(self.ctx, numpy.ones(1, dtype=numpy.float32)) logging.debug("Allocated arrays") f.set_args(numpy.array([n], dtype=numpy.int64), one_, a_, b_, zero_, c_) logging.debug("Set args") f((1, 1, 1), (1, 1, 1)) logging.debug("Executed") c_.to_host(c) max_diff = numpy.fabs(c - c_gold).max() logging.debug("Maximum difference is %.6f", max_diff) self.assertLess(max_diff, 1.0e-3) logging.debug("EXIT: test_kernel")
def test_convolution_forward(self): logging.debug("ENTER: test_convolution_forward") conv_desc = cudnn.ConvolutionDescriptor() conv_desc.set_2d(5, 4, 2, 1) inp_data = numpy.zeros((100, 8, 104, 112), dtype=numpy.float32) inp_data[:] = 0.1 inp_desc = cudnn.TensorDescriptor() inp_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT, *inp_data.shape) inp_buf = cu.MemAlloc(self.ctx, inp_data) filter_data = numpy.zeros((64, 8, 11, 7), dtype=numpy.float32) filter_data[:] = 0.3 filter_desc = cudnn.FilterDescriptor() filter_desc.set_4d(cudnn.CUDNN_DATA_FLOAT, *filter_data.shape) filter_buf = cu.MemAlloc(self.ctx, filter_data) n, c, h, w = cudnn.CUDNN.get_convolution_2d_forward_output_dim( conv_desc, inp_desc, filter_desc) out_data = numpy.zeros((n, c, h, w), dtype=numpy.float32) out_desc = cudnn.TensorDescriptor() out_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT, *out_data.shape) out_buf = cu.MemAlloc(self.ctx, out_data) workspace = cu.MemAlloc(self.ctx, 512 * 1024 * 1024) algo = self.cudnn.get_convolution_forward_algorithm( inp_desc, filter_desc, conv_desc, out_desc, cudnn.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, workspace.size) alpha = numpy.ones(1, dtype=numpy.float32) beta = numpy.zeros(1, dtype=numpy.float32) self.cudnn.convolution_forward(alpha, inp_desc, inp_buf, filter_desc, filter_buf, conv_desc, algo, workspace, workspace.size, beta, out_desc, out_buf) out_buf.to_host(out_data) self.assertEqual(numpy.count_nonzero(out_data), out_data.size) logging.debug("EXIT: test_convolution_forward")
def _test_good(self): ctx = cu.Devices().create_some_context() a = Container() a.ctx = ctx b = Container() b.mem = cu.MemAlloc(ctx, 4096) b.module = cu.Module(ctx, source=""" __global__ void test(float *a) { a[blockIdx.x * blockDim.x + threadIdx.x] *= 1.1f; }""") b.blas = blas.CUBLAS(ctx) logging.debug("Remaining context count: %d", cu.Context.context_count) # self.assertEqual(cu.Context.context_count, 1) self.assertIsNotNone(ctx) # to hold ctx up to this point
def test_memset(self): logging.debug("ENTER: test_memset") ctx = cu.Devices().create_some_context() mem = cu.MemAlloc(ctx, 4096) mem.memset32_async(123) mem.memset32_async(456, 1) mem.memset32_async(789, 2, 3) a = numpy.zeros(mem.size // 4, dtype=numpy.int32) mem.to_host(a) self.assertEqual(a[0], 123) self.assertEqual(a[1], 456) for i in range(2, 2 + 3): self.assertEqual(a[i], 789) for i in range(2 + 3, a.size): self.assertEqual(a[i], 456) logging.debug("EXIT: test_memset")
def _test_generate_normal(self, ctx, dtype, func, pass_size): a = numpy.zeros(65536, dtype=dtype) a_buf = cu.MemAlloc(ctx, a) if ctx is not None else numpy.zeros_like(a) mean = 1.0 stddev = 2.0 if pass_size: func(a_buf, a.size) func(a_buf, a.size, mean, stddev) else: func(a_buf) func(a_buf, mean=mean, stddev=stddev) if ctx is not None: a_buf.to_host(a) else: a[:] = a_buf[:] # TODO(a.kazantsev): add better test for correctness. self.assertGreater(numpy.count_nonzero(a), a.size - a.size // 512) return a
def _test_generate_uniform(self, ctx, dtype, func, pass_size): a = numpy.zeros(65536, dtype=dtype) a_buf = cu.MemAlloc(ctx, a) if ctx is not None else numpy.zeros_like(a) if pass_size: func(a_buf, a.size) else: func(a_buf) if ctx is not None: a_buf.to_host(a) else: a[:] = a_buf[:] # Simple test for correctness N = 20 counts = [0 for _i in range(N)] for x in a: counts[int(x * N)] += 1 for c in counts: self.assertLess(abs(c - a.size // N), a.size // N // 8) return a
def test_poisson(self): for ctx in (self.ctx, None): res = [] for pass_size in (True, False): rng = curand.CURAND(ctx) rng.seed = 123 a = numpy.zeros(65536, dtype=numpy.uint32) a_buf = (cu.MemAlloc(ctx, a) if ctx is not None else numpy.zeros_like(a)) if pass_size: rng.generate_poisson(a_buf, a.size) rng.generate_poisson(a_buf, a.size, 1.0) else: rng.generate_poisson(a_buf) rng.generate_poisson(a_buf, lam=1.0) if ctx is not None: a_buf.to_host(a) else: a[:] = a_buf[:] # TODO(a.kazantsev): add better test for correctness. self.assertGreater(numpy.count_nonzero(a), a.size // 2) res.append(a) for i in range(1): self.assertEqual(numpy.count_nonzero(res[i] - res[i + 1]), 0)
def _test_generate(self, ctx): rng = curand.CURAND(ctx) rng.seed = 123 a = numpy.zeros(65536, dtype=numpy.int32) a_buf = cu.MemAlloc(ctx, a) if ctx is not None else numpy.zeros_like(a) rng.generate32(a_buf, a.size) if ctx is not None: a_buf.to_host(a) else: a[:] = a_buf[:] self.assertGreater(numpy.count_nonzero(a), a.size - a.size // 512) # Check that seed matters rng = curand.CURAND(ctx) rng.seed = 123 if ctx is not None: a_buf.memset32_async() else: a_buf[:] = 0 rng.generate32(a_buf, a.size) b = numpy.zeros_like(a) if ctx is not None: a_buf.to_host(b) else: b[:] = a_buf[:] self.assertEqual(numpy.count_nonzero(a - b), 0) rng = curand.CURAND(ctx) rng.seed = 456 if ctx is not None: a_buf.memset32_async() else: a_buf[:] = 0 rng.generate32(a_buf, a.size) if ctx is not None: a_buf.to_host(b) else: b[:] = a_buf[:] self.assertGreater(numpy.count_nonzero(a - b), a.size - a.size // 512) # Check that result will be the same when the size is not passed rng = curand.CURAND(ctx) rng.seed = 123 if ctx is not None: a_buf.memset32_async() else: a_buf[:] = 0 rng.generate32(a_buf) b = numpy.zeros_like(a) if ctx is not None: a_buf.to_host(b) else: b[:] = a_buf[:] self.assertEqual(numpy.count_nonzero(a - b), 0) # Check 64-bit version rng = curand.CURAND(ctx, curand.CURAND_RNG_QUASI_SOBOL64) try: rng.seed = 123 self.assertTrue( False, "CURAND_RNG_QUASI_SOBOL64 should not support seed") except cu.CUDARuntimeError: pass rng.dimensions = 64 if ctx is not None: a_buf.memset32_async() else: a_buf[:] = 0 try: rng.generate32(a_buf, a.size) self.assertTrue( False, "CURAND_RNG_QUASI_SOBOL64 should not support generate32") except cu.CUDARuntimeError: pass a64 = numpy.zeros(a.size // 2, dtype=numpy.int64) rng.generate64(a_buf, a64.size) if ctx is not None: a_buf.to_host(a64) else: a64[:] = a_buf.view(numpy.int64)[:] self.assertGreater(numpy.count_nonzero(a64), a64.size - a64.size // 256) # Check that result will be the same when the size is not passed rng = curand.CURAND(ctx, curand.CURAND_RNG_QUASI_SOBOL64) rng.dimensions = 64 if ctx is not None: a_buf.memset32_async() else: a_buf[:] = 0 rng.generate64(a_buf) b64 = numpy.zeros_like(a64) if ctx is not None: a_buf.to_host(b64) else: b64[:] = a_buf.view(numpy.int64)[:] self.assertEqual(numpy.count_nonzero(a64 - b64), 0)
def test_rnn(self): if self.cudnn.version < 5000: return logging.debug("ENTER: test_rnn") drop = cudnn.DropoutDescriptor() drop_states = cu.MemAlloc(self.ctx, self.cudnn.dropout_states_size) self.cudnn.set_dropout_descriptor(drop, 0.5, drop_states, drop_states.size, 1234) rnn = cudnn.RNNDescriptor() self.assertEqual(rnn.hidden_size, 0) self.assertEqual(rnn.num_layers, 0) self.assertIsNone(rnn.dropout_desc) self.assertEqual(rnn.input_mode, -1) self.assertEqual(rnn.direction, -1) self.assertEqual(rnn.mode, -1) self.assertEqual(rnn.data_type, -1) self.assertEqual(rnn.num_linear_layers, 0) batch_size = 4 x = numpy.zeros( (batch_size, 32), # minibatch, input size dtype=numpy.float32) numpy.random.seed(1234) x[:] = numpy.random.rand(x.size).reshape(x.shape) - 0.5 x_desc = cudnn.TensorDescriptor() # Set input as 3-dimensional like in cudnn example: # minibatch, input_size, 1 x_desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (x.shape[0], x.shape[1], 1)) n_unroll = 16 hidden_size = 64 n_layers = 3 def assert_values(): self.assertEqual(rnn.hidden_size, hidden_size) self.assertEqual(rnn.num_layers, n_layers) self.assertIs(rnn.dropout_desc, drop) self.assertEqual(rnn.input_mode, cudnn.CUDNN_LINEAR_INPUT) self.assertEqual(rnn.direction, cudnn.CUDNN_UNIDIRECTIONAL) self.assertEqual(rnn.mode, cudnn.CUDNN_LSTM) self.assertEqual(rnn.data_type, cudnn.CUDNN_DATA_FLOAT) self.assertEqual(rnn.num_linear_layers, 8) # Short syntax rnn.set(hidden_size, n_layers, drop) assert_values() # Check num_linear_layers property for mode, n in ((cudnn.CUDNN_RNN_RELU, 2), (cudnn.CUDNN_RNN_TANH, 2), (cudnn.CUDNN_GRU, 6)): rnn = cudnn.RNNDescriptor() rnn.set(hidden_size, n_layers, drop, mode=mode) self.assertEqual(rnn.num_linear_layers, n) # Full syntax rnn = cudnn.RNNDescriptor() rnn.set(hidden_size, n_layers, drop, input_mode=cudnn.CUDNN_LINEAR_INPUT, direction=cudnn.CUDNN_UNIDIRECTIONAL, mode=cudnn.CUDNN_LSTM, data_type=cudnn.CUDNN_DATA_FLOAT) assert_values() def get_sz(func): sz = func(rnn, (x_desc for _i in range(n_unroll))) self.assertIsInstance(sz, int) return sz sz_work = get_sz(self.cudnn.get_rnn_workspace_size) logging.debug("RNN workspace size for %s with %d unrolls is %d", x.shape, n_unroll, sz_work) sz_train = get_sz(self.cudnn.get_rnn_training_reserve_size) logging.debug("RNN train size for %s with %d unrolls is %d", x.shape, n_unroll, sz_train) sz_params = self.cudnn.get_rnn_params_size(rnn, x_desc) logging.debug("RNN params size for %s is %d", x.shape, sz_params) x_desc2 = cudnn.TensorDescriptor() x_desc2.set_nd(cudnn.CUDNN_DATA_DOUBLE, (x.shape[0], x.shape[1], 1)) sz_params2 = self.cudnn.get_rnn_params_size(rnn, x_desc2, cudnn.CUDNN_DATA_DOUBLE) self.assertEqual(sz_params2, sz_params * 2) params_desc = cudnn.FilterDescriptor() params_desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (sz_params >> 2, 1, 1)) params = cu.MemAlloc(self.ctx, sz_params) params.memset32_async() w_desc = cudnn.FilterDescriptor() w = self.cudnn.get_rnn_lin_layer_matrix_params(rnn, 0, x_desc, params_desc, params, 0, w_desc) logging.debug("Got matrix 0 of dimensions: %s, fmt=%d, sz=%d", w_desc.dims, w_desc.fmt, w.size) self.assertEqual(w.size, hidden_size * x.shape[1] * 4) b_desc = cudnn.FilterDescriptor() b = self.cudnn.get_rnn_lin_layer_bias_params(rnn, 0, x_desc, params_desc, params, 0, b_desc) logging.debug("Got bias 0 of dimensions: %s, fmt=%d, sz=%d", b_desc.dims, b_desc.fmt, b.size) self.assertEqual(b.size, hidden_size * 4) workspace = cu.MemAlloc(self.ctx, sz_work) x_buf = cu.MemAlloc(self.ctx, x.nbytes * n_unroll) for i in range(n_unroll): # will feed the same input x_buf.to_device(x, x.nbytes * i, x.nbytes) y_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_unroll) hx_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_layers) hx_buf.memset32_async() hy_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_layers) cx_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_layers) cx_buf.memset32_async() cy_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_layers) y_desc = cudnn.TensorDescriptor() y_desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (batch_size, hidden_size, 1)) h_desc = cudnn.TensorDescriptor() h_desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (n_layers, batch_size, hidden_size)) self.ctx.synchronize() logging.debug("Starting forward inference") for i in range(5): self.cudnn.rnn_forward_inference( rnn, (x_desc for _i in range(n_unroll)), x_buf, h_desc, hx_buf, h_desc, cx_buf, params_desc, params, (y_desc for _i in range(n_unroll)), y_buf, h_desc, hy_buf, h_desc, cy_buf, workspace, sz_work) if i == 0: self.ctx.synchronize() t0 = time.time() self.ctx.synchronize() logging.debug("Forward inference done in %.6f sec", (time.time() - t0) / 4) train_space = cu.MemAlloc(self.ctx, sz_train) self.cudnn.rnn_forward_training( rnn, (x_desc for _i in range(n_unroll)), x_buf, h_desc, hx_buf, h_desc, cx_buf, params_desc, params, (y_desc for _i in range(n_unroll)), y_buf, h_desc, hy_buf, h_desc, cy_buf, workspace, sz_work, train_space, sz_train) self.ctx.synchronize() logging.debug("Forward training done") dy_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_unroll) dy_buf.from_device_async(y_buf) dhy_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_layers) dhy_buf.memset32_async() dcy_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_layers) dcy_buf.memset32_async() dx_buf = cu.MemAlloc(self.ctx, x_buf.size) dhx_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_layers) dcx_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_layers) self.ctx.synchronize() logging.debug("Starting backpropagation") for i in range(5): self.cudnn.rnn_backward_data( rnn, (y_desc for _i in range(n_unroll)), y_buf, (y_desc for _i in range(n_unroll)), dy_buf, h_desc, dhy_buf, h_desc, dcy_buf, params_desc, params, h_desc, hx_buf, h_desc, cx_buf, (x_desc for _i in range(n_unroll)), dx_buf, h_desc, dhx_buf, h_desc, dcx_buf, workspace, sz_work, train_space, sz_train) if i == 0: self.ctx.synchronize() t0 = time.time() self.ctx.synchronize() logging.debug("Backpropagation done in %.6f sec", (time.time() - t0) / 4) dw = cu.MemAlloc(self.ctx, params.size) logging.debug("Starting gradient computation") for i in range(5): self.cudnn.rnn_backward_weights( rnn, (x_desc for _i in range(n_unroll)), x_buf, h_desc, hx_buf, (y_desc for _i in range(n_unroll)), y_buf, workspace, sz_work, params_desc, dw, train_space, sz_train) if i == 0: self.ctx.synchronize() t0 = time.time() self.ctx.synchronize() logging.debug("Gradient computation done in %.6f sec", (time.time() - t0) / 4) logging.debug("EXIT: test_rnn")
def test_dropout(self): if self.cudnn.version < 5000: return logging.debug("ENTER: test_dropout") drop_ss = self.cudnn.dropout_states_size self.assertIsInstance(drop_ss, int) logging.debug("Dropout states size is %d", drop_ss) input_data = numpy.zeros((5, 16, 32, 24), dtype=numpy.float32) numpy.random.seed(1234) input_data[:] = numpy.random.rand(input_data.size).reshape( input_data.shape) - 0.5 input_desc = cudnn.TensorDescriptor() input_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT, *input_data.shape) drop_rss = input_desc.dropout_reserve_space_size self.assertIsInstance(drop_rss, int) logging.debug("Dropout reserve space size for %s is %d", input_data.shape, drop_rss) drop = cudnn.DropoutDescriptor() self.assertIsNone(self.cudnn.dropout_desc) self.assertIsNone(self.cudnn.dropout_states) self.cudnn.set_dropout_descriptor(drop) # with default parameters self.assertIs(self.cudnn.dropout_desc, drop) self.assertIsNone(self.cudnn.dropout_states) states = cu.MemAlloc(self.ctx, drop_ss) self.cudnn.set_dropout_descriptor(drop, 0.5, states, drop_ss, 1234) self.assertIs(self.cudnn.dropout_desc, drop) self.assertIs(self.cudnn.dropout_states, states) input_buf = cu.MemAlloc(self.ctx, input_data) output_buf = cu.MemAlloc(self.ctx, input_buf.size) reserve = cu.MemAlloc(self.ctx, drop_rss) self.cudnn.dropout_forward(drop, input_desc, input_buf, input_desc, output_buf, reserve, reserve.size) output_data = numpy.ones_like(input_data) output_buf.to_host(output_data) n_z = 0 for i, y in numpy.ndenumerate(output_data): if not y: n_z += 1 continue x = input_data[i] self.assertEqual(y, x * 2) self.assertGreater(n_z, (input_data.size >> 1) - (input_data.size >> 5)) err_data = numpy.ones_like(input_data) err_data[:] = numpy.random.rand(output_data.size).reshape( output_data.shape) - 0.5 err_buf = cu.MemAlloc(self.ctx, err_data) self.cudnn.dropout_backward(drop, input_desc, err_buf, input_desc, input_buf, reserve, reserve.size) input_buf.to_host(input_data) n_z = 0 for i, x in numpy.ndenumerate(input_data): if not output_data[i]: self.assertEqual(x, 0.0) n_z += 1 continue y = err_data[i] if self.cudnn.version == 5004: self.assertEqual(x, y) # strangely, it doesn't scale gradient else: self.assertEqual(x, y * 2) self.assertGreater(n_z, (input_data.size >> 1) - (input_data.size >> 5)) logging.debug("EXIT: test_dropout")
def test_lstm(self): if self.cudnn.version < 5000: return logging.debug("ENTER: test_lstm") drop = cudnn.DropoutDescriptor() drop_states = cu.MemAlloc(self.ctx, self.cudnn.dropout_states_size) self.cudnn.set_dropout_descriptor(drop, 0.0, drop_states, drop_states.size, 1234) rnn = cudnn.RNNDescriptor() self.assertEqual(rnn.hidden_size, 0) self.assertEqual(rnn.num_layers, 0) self.assertIsNone(rnn.dropout_desc) self.assertEqual(rnn.input_mode, -1) self.assertEqual(rnn.direction, -1) self.assertEqual(rnn.mode, -1) self.assertEqual(rnn.data_type, -1) self.assertEqual(rnn.num_linear_layers, 0) batch_size = 8 x_arr = numpy.zeros( (batch_size, 16), # minibatch, input size dtype=DTYPE) numpy.random.seed(1234) x_arr[:] = numpy.random.rand(x_arr.size).reshape(x_arr.shape) - 0.5 x_desc = cudnn.TensorDescriptor() # Set input as 3-dimensional like in cudnn example. x_desc.set_nd(CUTYPE, (x_arr.shape[0], x_arr.shape[1], 1)) n_unroll = 5 hidden_size = 16 n_layers = 3 def assert_values(): self.assertEqual(rnn.hidden_size, hidden_size) self.assertEqual(rnn.num_layers, n_layers) self.assertIs(rnn.dropout_desc, drop) self.assertEqual(rnn.input_mode, cudnn.CUDNN_LINEAR_INPUT) self.assertEqual(rnn.direction, cudnn.CUDNN_UNIDIRECTIONAL) self.assertEqual(rnn.mode, cudnn.CUDNN_LSTM) self.assertEqual(rnn.data_type, CUTYPE) self.assertEqual(rnn.num_linear_layers, 8) # Full syntax rnn = cudnn.RNNDescriptor() rnn.set(hidden_size, n_layers, drop, input_mode=cudnn.CUDNN_LINEAR_INPUT, direction=cudnn.CUDNN_UNIDIRECTIONAL, mode=cudnn.CUDNN_LSTM, data_type=CUTYPE) assert_values() x_descs = tuple(x_desc for _i in range(n_unroll)) def get_sz(func): sz = func(rnn, x_descs) self.assertIsInstance(sz, int) return sz sz_work = get_sz(self.cudnn.get_rnn_workspace_size) logging.debug("RNN workspace size for %s with %d unrolls is %d", x_arr.shape, n_unroll, sz_work) sz_train = get_sz(self.cudnn.get_rnn_training_reserve_size) logging.debug("RNN train size for %s with %d unrolls is %d", x_arr.shape, n_unroll, sz_train) sz_weights = self.cudnn.get_rnn_params_size(rnn, x_desc) logging.debug("RNN weights size for %s is %d", x_arr.shape, sz_weights) sz_expected = ITEMSIZE * ( 4 * (x_arr.shape[1] + hidden_size + 2) * hidden_size + 4 * (hidden_size + hidden_size + 2) * hidden_size * (n_layers - 1)) self.assertEqual(sz_weights, sz_expected) weights_desc = cudnn.FilterDescriptor() weights_desc.set_nd(CUTYPE, (sz_weights // ITEMSIZE, 1, 1)) weights = cu.MemAlloc(self.ctx, sz_weights) weights_arr = numpy.random.rand(sz_weights // ITEMSIZE).astype(DTYPE) weights_arr -= 0.5 weights_arr *= 0.1 weights.to_device(weights_arr) w_desc = cudnn.FilterDescriptor() w = self.cudnn.get_rnn_lin_layer_matrix_params(rnn, 0, x_desc, weights_desc, weights, 0, w_desc) logging.debug("Got matrix 0 of dimensions: %s, fmt=%d, sz=%d", w_desc.dims, w_desc.fmt, w.size) self.assertEqual(w.size, hidden_size * x_arr.shape[1] * ITEMSIZE) b_desc = cudnn.FilterDescriptor() b = self.cudnn.get_rnn_lin_layer_bias_params(rnn, 0, x_desc, weights_desc, weights, 0, b_desc) logging.debug("Got bias 0 of dimensions: %s, fmt=%d, sz=%d", b_desc.dims, b_desc.fmt, b.size) self.assertEqual(b.size, hidden_size * ITEMSIZE) work_buf = cu.MemAlloc(self.ctx, sz_work) work_buf.memset32_async() x = cu.MemAlloc(self.ctx, x_arr.nbytes * n_unroll) for i in range(n_unroll): # will feed the same input x.to_device(x_arr, x_arr.nbytes * i, x_arr.nbytes) y_arr = numpy.zeros((n_unroll, batch_size, hidden_size), dtype=DTYPE) y = cu.MemAlloc(self.ctx, y_arr) hx_arr = numpy.zeros((n_layers, batch_size, hidden_size), dtype=DTYPE) hx_arr[:] = numpy.random.rand(hx_arr.size).reshape(hx_arr.shape) hx_arr -= 0.5 hx = cu.MemAlloc(self.ctx, hx_arr) hy = cu.MemAlloc(self.ctx, hx.size) hy.memset32_async() cx_arr = numpy.zeros((n_layers, batch_size, hidden_size), dtype=DTYPE) cx_arr[:] = numpy.random.rand(cx_arr.size).reshape(cx_arr.shape) cx_arr -= 0.5 cx = cu.MemAlloc(self.ctx, cx_arr) cy = cu.MemAlloc(self.ctx, cx.size) cy.memset32_async() y_desc = cudnn.TensorDescriptor() y_desc.set_nd(CUTYPE, (batch_size, hidden_size, 1)) y_descs = tuple(y_desc for _i in range(n_unroll)) h_desc = cudnn.TensorDescriptor() h_desc.set_nd(CUTYPE, (n_layers, batch_size, hidden_size)) train_buf = cu.MemAlloc(self.ctx, sz_train) train_buf.memset32_async() self.cudnn.rnn_forward_training(rnn, x_descs, x, h_desc, hx, h_desc, cx, weights_desc, weights, y_descs, y, h_desc, hy, h_desc, cy, work_buf, sz_work, train_buf, sz_train) self.ctx.synchronize() logging.debug("Forward training done") y.to_host(y_arr) target = numpy.random.rand(y_arr.size).reshape(y_arr.shape).astype( y_arr.dtype) - 0.5 dy_arr = y_arr - target dy = cu.MemAlloc(self.ctx, dy_arr) dhy = cu.MemAlloc(self.ctx, hx.size) dhy.memset32_async() dcy = cu.MemAlloc(self.ctx, cx.size) dcy.memset32_async() dx_arr = numpy.zeros_like(x_arr) dx = cu.MemAlloc(self.ctx, dx_arr) dhx_arr = numpy.zeros_like(hx_arr) dhx = cu.MemAlloc(self.ctx, dhx_arr) dcx_arr = numpy.zeros_like(cx_arr) dcx = cu.MemAlloc(self.ctx, dcx_arr) self.cudnn.rnn_backward_data(rnn, y_descs, y, y_descs, dy, h_desc, dhy, h_desc, dcy, weights_desc, weights, h_desc, hx, h_desc, cx, x_descs, dx, h_desc, dhx, h_desc, dcx, work_buf, sz_work, train_buf, sz_train) logging.debug("Backpropagation done") dx.to_host(dx_arr) dhx.to_host(dhx_arr) dcx.to_host(dcx_arr) def forward(): x.to_device_async(x_arr) hx.to_device_async(hx_arr) cx.to_device_async(cx_arr) self.cudnn.rnn_forward_inference(rnn, x_descs, x, h_desc, hx, h_desc, cx, weights_desc, weights, y_descs, y, h_desc, hy, h_desc, cy, work_buf, sz_work) y.to_host(y_arr) numdiff = NumDiff() logging.debug("Checking dx...") numdiff.check_diff(x_arr, y_arr, target, dx_arr, forward) logging.debug("Checking dhx...") numdiff.check_diff(hx_arr, y_arr, target, dhx_arr, forward) logging.debug("Checking dcx...") numdiff.check_diff(cx_arr, y_arr, target, dcx_arr, forward) logging.debug("EXIT: test_lstm")
def _alloc_temp_buffer(self, size): # Allocate the buffer return cu.MemAlloc(self.context, size)
def cuda_create_devmem(self): self._devmem_ = cu.MemAlloc(self.device.context, self.plain.nbytes) self.devmem.to_device(self.mem)