Exemple #1
0
    def test_convolution_backward_bias(self):
        logging.debug("ENTER: test_convolution_backward_bias")

        bperr_data = numpy.zeros((100, 64, 104, 226), dtype=numpy.float32)
        bperr_data[:] = 0.1
        bperr_desc = cudnn.TensorDescriptor()
        bperr_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                          *bperr_data.shape)
        bperr_buf = cu.MemAlloc(self.ctx, bperr_data)

        gd_data = numpy.zeros(64, dtype=numpy.float32)
        gd_desc = cudnn.TensorDescriptor()
        gd_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT, 1,
                       gd_data.size, 1, 1)
        gd_buf = cu.MemAlloc(self.ctx, gd_data)

        alpha = numpy.ones(1, dtype=numpy.float32)
        beta = numpy.zeros(1, dtype=numpy.float32)
        self.cudnn.convolution_backward_bias(alpha, bperr_desc, bperr_buf,
                                             beta, gd_desc, gd_buf)

        gd_buf.to_host(gd_data)
        self.assertEqual(numpy.count_nonzero(gd_data), gd_data.size)

        logging.debug("EXIT: test_convolution_backward_bias")
Exemple #2
0
    def test_transform_tensor(self):
        logging.debug("ENTER: test_transform_tensor")

        sh_interleaved = (2, 5, 5, 3)
        sh_splitted = (2, 3, 5, 5)

        inp_data = numpy.arange(numpy.prod(sh_interleaved),
                                dtype=numpy.float32).reshape(sh_interleaved)
        inp_desc = cudnn.TensorDescriptor()
        inp_desc.set_4d(cudnn.CUDNN_TENSOR_NHWC, cudnn.CUDNN_DATA_FLOAT,
                        *sh_splitted)
        inp_buf = cu.MemAlloc(self.ctx, inp_data)

        out_data = numpy.zeros(sh_splitted, dtype=numpy.float32)
        out_desc = cudnn.TensorDescriptor()
        out_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                        *sh_splitted)
        out_buf = cu.MemAlloc(self.ctx, out_data)

        alpha = numpy.ones(1, dtype=numpy.float32)
        beta = numpy.zeros(1, dtype=numpy.float32)
        self.cudnn.transform_tensor(alpha, inp_desc, inp_buf, beta, out_desc,
                                    out_buf)
        out_buf.to_host(out_data)

        max_diff = numpy.fabs(out_data - inp_data.transpose(0, 3, 1, 2)).max()
        self.assertEqual(max_diff, 0.0)

        logging.debug("EXIT: test_transform_tensor")
Exemple #3
0
 def test_memcpy(self):
     logging.debug("ENTER: test_memcpy")
     ctx = cu.Devices().create_some_context()
     a = cu.MemAlloc(ctx, 4096)
     a.memset32_async(123)
     b = cu.MemAlloc(ctx, 4096)
     b.memset32_async(456)
     test = numpy.zeros(a.size // 4, dtype=numpy.int32)
     a.from_device_async(b)
     a.to_host(test)
     for x in test:
         self.assertEqual(x, 456)
     a.memset32_async(123)
     a.from_device_async(b, 12)
     a.to_host(test)
     for x in test[:3]:
         self.assertEqual(x, 123)
     for x in test[3:]:
         self.assertEqual(x, 456)
     a.memset32_async(123)
     a.from_device_async(b, 12, 64)
     a.to_host(test)
     for x in test[:3]:
         self.assertEqual(x, 123)
     for x in test[3:19]:
         self.assertEqual(x, 456)
     for x in test[19:]:
         self.assertEqual(x, 123)
     logging.debug("EXIT: test_memcpy")
Exemple #4
0
    def _test_exec_complex(self, dtype):
        x = numpy.zeros([32, 64], dtype=dtype)
        x.real = numpy.random.rand(x.size).reshape(x.shape) - 0.5
        x.imag = numpy.random.rand(x.size).reshape(x.shape) - 0.5
        y = numpy.ones_like(x)
        x_gold = x.copy()
        try:
            y_gold = numpy.fft.fft2(x)
        except TypeError:
            y_gold = None  # for pypy
        xbuf = cu.MemAlloc(self.ctx, x)
        ybuf = cu.MemAlloc(self.ctx, y)

        # Forward transform
        fft = cufft.CUFFT(self.ctx)
        fft.auto_allocation = False
        sz = fft.make_plan_many(x.shape, 1, {
            numpy.complex64: cufft.CUFFT_C2C,
            numpy.complex128: cufft.CUFFT_Z2Z
        }[dtype])
        tmp = cu.MemAlloc(self.ctx, sz)
        fft.workarea = tmp
        self.assertEqual(fft.workarea, tmp)

        self.assertEqual(fft.execute, {
            numpy.complex64: fft.exec_c2c,
            numpy.complex128: fft.exec_z2z
        }[dtype])
        fft.execute(xbuf, ybuf, cufft.CUFFT_FORWARD)
        ybuf.to_host(y)

        if y_gold is not None:
            delta = y - y_gold
            max_diff = numpy.fabs(
                numpy.sqrt(delta.real * delta.real +
                           delta.imag * delta.imag)).max()
            logging.debug("Forward max_diff is %.6e", max_diff)
            self.assertLess(max_diff, {
                numpy.complex64: 1.0e-3,
                numpy.complex128: 1.0e-6
            }[dtype])

        # Inverse transform
        y /= x.size  # correct scale before inverting
        ybuf.to_device_async(y)
        xbuf.memset32_async(0)  # reset the resulting vector
        fft.execute(ybuf, xbuf, cufft.CUFFT_INVERSE)
        xbuf.to_host(x)

        delta = x - x_gold
        max_diff = numpy.fabs(
            numpy.sqrt(delta.real * delta.real +
                       delta.imag * delta.imag)).max()
        logging.debug("Inverse max_diff is %.6e", max_diff)
        self.assertLess(max_diff, {
            numpy.complex64: 1.0e-3,
            numpy.complex128: 1.0e-6
        }[dtype])
Exemple #5
0
    def test_memcpy_3d_async(self):
        logging.debug("ENTER: test_memcpy_3d_async")

        p_copy = cu.get_ffi().new("CUDA_MEMCPY3D *")
        self.assertEqual(cu.get_ffi().sizeof(p_copy[0]), 200)

        ctx = cu.Devices().create_some_context()
        logging.debug("Context created")

        # Create arrays with some values for testing
        a = numpy.arange(35 * 25 * 15, dtype=numpy.float32).reshape(35, 25, 15)
        b = numpy.arange(37 * 27 * 17, dtype=numpy.float32).reshape(37, 27, 17)
        b *= 0.5
        c = numpy.empty_like(b)
        c[:] = 1.0e30

        # Create buffers
        a_ = cu.MemAlloc(ctx, a)
        b_ = cu.MemAlloc(ctx, b)

        # Copy 3D rect from one device buffer to another
        logging.debug("Testing device -> device memcpy_3d_async")
        sz = a.itemsize
        a_.memcpy_3d_async(
            (3 * sz, 4, 5), (6 * sz, 7, 8), (5 * sz, 10, 20),
            a.shape[2] * sz, a.shape[1], b.shape[2] * sz, b.shape[1],
            dst=b_)
        b_.to_host(c)
        diff = numpy.fabs(c[8:28, 7:17, 6:11] - a[5:25, 4:14, 3:8]).max()
        self.assertEqual(diff, 0)

        # Copy 3D rect from host buffer to device buffer
        logging.debug("Testing host -> device memcpy_3d_async")
        sz = a.itemsize
        b_.memset32_async()
        b_.memcpy_3d_async(
            (3 * sz, 4, 5), (6 * sz, 7, 8), (5 * sz, 10, 20),
            a.shape[2] * sz, a.shape[1], b.shape[2] * sz, b.shape[1],
            src=a)
        c[:] = 1.0e30
        b_.to_host(c)
        diff = numpy.fabs(c[8:28, 7:17, 6:11] - a[5:25, 4:14, 3:8]).max()
        self.assertEqual(diff, 0)

        # Copy 3D rect from device buffer to host buffer
        logging.debug("Testing device -> host memcpy_3d_async")
        sz = a.itemsize
        c[:] = 1.0e30
        a_.memcpy_3d_async(
            (3 * sz, 4, 5), (6 * sz, 7, 8), (5 * sz, 10, 20),
            a.shape[2] * sz, a.shape[1], b.shape[2] * sz, b.shape[1],
            dst=c)
        ctx.synchronize()
        diff = numpy.fabs(c[8:28, 7:17, 6:11] - a[5:25, 4:14, 3:8]).max()
        self.assertEqual(diff, 0)

        logging.debug("EXIT: test_memcpy_3d_async")
Exemple #6
0
    def test_convolution_backward_data(self):
        logging.debug("ENTER: test_convolution_backward_data")

        conv_desc = cudnn.ConvolutionDescriptor()
        conv_desc.set_2d(5, 5, 1, 1)

        inp_data = numpy.zeros((100, 8, 96, 96), dtype=numpy.float32)
        inp_desc = cudnn.TensorDescriptor()
        inp_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                        *inp_data.shape)
        inp_buf = cu.MemAlloc(self.ctx, inp_data)

        filter_data = numpy.zeros((64, 8, 11, 11), dtype=numpy.float32)
        filter_data[:] = 0.1
        filter_desc = cudnn.FilterDescriptor()
        filter_desc.set_4d(cudnn.CUDNN_DATA_FLOAT, *filter_data.shape)
        filter_buf = cu.MemAlloc(self.ctx, filter_data)

        bperr_data = numpy.zeros((100, 64, 96, 96), dtype=numpy.float32)
        bperr_data[:] = 0.1
        bperr_desc = cudnn.TensorDescriptor()
        bperr_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                          *bperr_data.shape)
        bperr_buf = cu.MemAlloc(self.ctx, bperr_data)

        alpha = numpy.ones(1, dtype=numpy.float32)
        beta = numpy.zeros(1, dtype=numpy.float32)
        self.cudnn.convolution_backward_data(alpha, filter_desc, filter_buf,
                                             bperr_desc, bperr_buf, conv_desc,
                                             beta, inp_desc, inp_buf)

        inp_buf.to_host(inp_data)
        self.assertEqual(numpy.count_nonzero(inp_data), inp_data.size)

        if self.cudnn.version >= 4000:
            algo = self.cudnn.get_convolution_backward_data_algorithm(
                filter_desc, bperr_desc, conv_desc, inp_desc)
            logging.debug("Fastest algo is %d", algo)
            sz = self.cudnn.get_convolution_backward_data_workspace_size(
                filter_desc, bperr_desc, conv_desc, inp_desc, algo)
            logging.debug("Workspace size for it is %d", sz)
            algo = self.cudnn.get_convolution_backward_data_algorithm(
                filter_desc, bperr_desc, conv_desc, inp_desc,
                cudnn.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
                512 * 1024 * 1024)
            logging.debug("With 512 Mb limit: %d", algo)
            workspace = cu.MemAlloc(self.ctx, 512 * 1024 * 1024)
            inp_buf.memset32_async()
            self.cudnn.convolution_backward_data(alpha, filter_desc,
                                                 filter_buf, bperr_desc,
                                                 bperr_buf, conv_desc, beta,
                                                 inp_desc, inp_buf, algo,
                                                 workspace, workspace.size)
            inp_buf.to_host(inp_data)
            self.assertEqual(numpy.count_nonzero(inp_data), inp_data.size)

        logging.debug("EXIT: test_convolution_backward_data")
Exemple #7
0
    def _test_gemm_with_mode(self, gemm, dtype, mode):
        self.blas.set_pointer_mode(mode)
        a = numpy.zeros([127, 353], dtype=dtype)
        b = numpy.zeros([135, a.shape[1]], dtype=dtype)
        c = numpy.zeros([a.shape[0], b.shape[0]], dtype=dtype)
        try:
            numpy.random.seed(123)
        except AttributeError:  # PyPy workaround
            pass
        a[:] = numpy.random.rand(a.size).astype(dtype).reshape(a.shape) - 0.5
        b[:] = numpy.random.rand(b.size).astype(dtype).reshape(b.shape) - 0.5
        gold_c = numpy.dot(a.astype(numpy.float64),
                           b.transpose().astype(numpy.float64))
        a_buf = cu.MemAlloc(self.ctx, a.nbytes)
        b_buf = cu.MemAlloc(self.ctx, b.nbytes)
        c_buf = cu.MemAlloc(self.ctx, c.nbytes * 2)

        alpha = numpy.ones(1,
                           dtype={numpy.float16:
                                  numpy.float32}.get(dtype, dtype))
        beta = numpy.zeros(1,
                           dtype={numpy.float16:
                                  numpy.float32}.get(dtype, dtype))
        if mode == blas.CUBLAS_POINTER_MODE_DEVICE:
            alpha = cu.MemAlloc(self.ctx, alpha)
            beta = cu.MemAlloc(self.ctx, beta)

        a_buf.to_device_async(a)
        b_buf.to_device_async(b)
        c_buf.to_device_async(c)
        c_buf.to_device_async(c, c.nbytes)

        gemm(blas.CUBLAS_OP_T, blas.CUBLAS_OP_N, b.shape[0], a.shape[0],
             a.shape[1], alpha, b_buf, a_buf, beta, c_buf)

        c_buf.to_host(c)
        max_diff = numpy.fabs(gold_c - c.astype(numpy.float64)).max()
        logging.debug("Maximum difference is %.6f", max_diff)
        self.assertLess(max_diff, {
            numpy.float32: 1.0e-5,
            numpy.float64: 1.0e-13,
            numpy.float16: 3.0e-3
        }[dtype])
        c_buf.to_host(c, c.nbytes)
        max_diff = numpy.fabs(c).max()

        # To avoid destructor call before gemm completion
        del beta
        del alpha

        self.assertEqual(max_diff, 0,
                         "Written some values outside of the target array")
Exemple #8
0
    def test_pooling(self):
        logging.debug("ENTER: test_pooling")

        input_data = numpy.zeros((5, 96, 64, 48), dtype=numpy.float32)
        input_data[:] = numpy.random.rand(input_data.size).reshape(
            input_data.shape) - 0.5
        input_desc = cudnn.TensorDescriptor()
        input_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                          *input_data.shape)
        input_buf = cu.MemAlloc(self.ctx, input_data)

        pooling_desc = cudnn.PoolingDescriptor()
        pooling_desc.set_2d((5, 3), (2, 1), (3, 2))

        if self.cudnn.version < 4000:
            output_shape = (5, 96, 22, 24)
        else:
            output_shape = cudnn.CUDNN.get_pooling_2d_forward_output_dim(
                pooling_desc, input_desc)
        self.assertEqual(len(output_shape), 4)
        logging.debug("Output shape is %s", output_shape)

        output_data = numpy.zeros(output_shape, dtype=numpy.float32)
        output_data[:] = numpy.nan
        output_desc = cudnn.TensorDescriptor()
        output_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                           *output_data.shape)
        output_buf = cu.MemAlloc(self.ctx, output_data)

        np_one = numpy.ones(1, dtype=numpy.float32)
        np_zero = numpy.zeros(1, dtype=numpy.float32)
        self.cudnn.pooling_forward(pooling_desc, np_one, input_desc, input_buf,
                                   np_zero, output_desc, output_buf)
        output_buf.to_host(output_data)
        self.assertEqual(numpy.count_nonzero(numpy.isnan(output_data)), 0)

        diff_desc = output_desc
        diff_buf = cu.MemAlloc(self.ctx, output_data)
        grad_desc = input_desc
        grad_data = input_data.copy()
        grad_data[:] = numpy.nan
        grad_buf = cu.MemAlloc(self.ctx, grad_data)

        self.cudnn.pooling_backward(pooling_desc, np_one, output_desc,
                                    output_buf, diff_desc, diff_buf,
                                    input_desc, input_buf, np_zero, grad_desc,
                                    grad_buf)
        grad_buf.to_host(grad_data)
        self.assertEqual(numpy.count_nonzero(numpy.isnan(grad_data)), 0)

        logging.debug("EXIT: test_pooling")
Exemple #9
0
 def test_mem_alloc(self):
     logging.debug("ENTER: test_mem_alloc")
     ctx = cu.Devices().create_some_context()
     self._test_alloc(lambda a: cu.MemAlloc(ctx, a))
     self._test_alloc(ctx.mem_alloc)
     logging.debug("MemAlloc succeeded")
     logging.debug("EXIT: test_mem_alloc")
Exemple #10
0
    def test_softmax(self):
        logging.debug("ENTER: test_softmax")

        x_arr = numpy.zeros((7, 37, 1, 1), dtype=numpy.float32)
        x_arr[:] = numpy.random.rand(x_arr.size).reshape(x_arr.shape)

        y_gold = x_arr.copy().reshape(x_arr.shape[0],
                                      x_arr.size // x_arr.shape[0])
        y_gold[:] = (y_gold.transpose() - y_gold.max(axis=1)).transpose()
        numpy.exp(y_gold, y_gold)
        y_gold[:] = (y_gold.transpose() / y_gold.sum(axis=1)).transpose()

        x_desc = cudnn.TensorDescriptor()
        x_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                      *x_arr.shape)
        x = cu.MemAlloc(self.ctx, x_arr)
        y_arr = numpy.zeros_like(x_arr)
        y = cu.MemAlloc(self.ctx, y_arr)

        np_one = numpy.ones(1, dtype=numpy.float32)
        np_zero = numpy.zeros(1, dtype=numpy.float32)

        self.cudnn.softmax_forward(np_one, x_desc, x, np_zero, x_desc, y)
        y.to_host(y_arr)

        max_diff = numpy.fabs(y_arr.ravel() - y_gold.ravel()).max()
        self.assertLess(max_diff, 1.0e-5)

        # Backpropagtion test
        dy_arr = numpy.zeros_like(y_gold)
        dy_arr[:] = numpy.random.rand(dy_arr.size).reshape(dy_arr.shape) + 0.01
        dy = cu.MemAlloc(self.ctx, dy_arr)

        dx_arr = numpy.zeros_like(dy_arr)
        dx = cu.MemAlloc(self.ctx, dx_arr)

        self.cudnn.softmax_backward(np_one, x_desc, y, x_desc, dy, np_zero,
                                    x_desc, dx)
        dx.to_host(dx_arr)
        self.assertEqual(numpy.count_nonzero(dx_arr), dx_arr.size)

        # TODO(a.kazantsev): add test for gradient correctness.

        logging.debug("EXIT: test_softmax")
Exemple #11
0
    def test_kernel(self):
        logging.debug("ENTER: test_kernel")
        cap = self.ctx.device.compute_capability
        if cap < (3, 5):
            logging.debug("Requires compute capability >= (3, 5), got %s", cap)
            logging.debug("EXIT: test_kernel")
            return
        with self.ctx:
            module = cu.Module(self.ctx,
                               source_file=("%s/cublas.cu" % self.path),
                               nvcc_options2=cu.Module.OPTIONS_CUBLAS,
                               compute_capability=(cap[0], 0) if cap >=
                               (6, 0) else cap)
            # minor version of compute has to be set to 0
            # to work on Pascal with CUDA 8.0
            logging.debug("Compiled")
            f = module.create_function("test")
            logging.debug("Got function")

            n = 256
            a = numpy.random.rand(n, n).astype(numpy.float32)
            b = numpy.random.rand(n, n).astype(numpy.float32)
            c = numpy.zeros_like(a)
            c_gold = numpy.dot(a.transpose(), b.transpose()).transpose()
            a_ = cu.MemAlloc(self.ctx, a)
            b_ = cu.MemAlloc(self.ctx, b)
            c_ = cu.MemAlloc(self.ctx, c)
            zero_ = cu.MemAlloc(self.ctx, numpy.zeros(1, dtype=numpy.float32))
            one_ = cu.MemAlloc(self.ctx, numpy.ones(1, dtype=numpy.float32))
            logging.debug("Allocated arrays")

            f.set_args(numpy.array([n], dtype=numpy.int64), one_, a_, b_,
                       zero_, c_)
            logging.debug("Set args")

            f((1, 1, 1), (1, 1, 1))
            logging.debug("Executed")

            c_.to_host(c)
            max_diff = numpy.fabs(c - c_gold).max()
            logging.debug("Maximum difference is %.6f", max_diff)
            self.assertLess(max_diff, 1.0e-3)
        logging.debug("EXIT: test_kernel")
Exemple #12
0
    def test_convolution_forward(self):
        logging.debug("ENTER: test_convolution_forward")

        conv_desc = cudnn.ConvolutionDescriptor()
        conv_desc.set_2d(5, 4, 2, 1)

        inp_data = numpy.zeros((100, 8, 104, 112), dtype=numpy.float32)
        inp_data[:] = 0.1
        inp_desc = cudnn.TensorDescriptor()
        inp_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                        *inp_data.shape)
        inp_buf = cu.MemAlloc(self.ctx, inp_data)

        filter_data = numpy.zeros((64, 8, 11, 7), dtype=numpy.float32)
        filter_data[:] = 0.3
        filter_desc = cudnn.FilterDescriptor()
        filter_desc.set_4d(cudnn.CUDNN_DATA_FLOAT, *filter_data.shape)
        filter_buf = cu.MemAlloc(self.ctx, filter_data)

        n, c, h, w = cudnn.CUDNN.get_convolution_2d_forward_output_dim(
            conv_desc, inp_desc, filter_desc)
        out_data = numpy.zeros((n, c, h, w), dtype=numpy.float32)
        out_desc = cudnn.TensorDescriptor()
        out_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                        *out_data.shape)
        out_buf = cu.MemAlloc(self.ctx, out_data)

        workspace = cu.MemAlloc(self.ctx, 512 * 1024 * 1024)
        algo = self.cudnn.get_convolution_forward_algorithm(
            inp_desc, filter_desc, conv_desc, out_desc,
            cudnn.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
            workspace.size)

        alpha = numpy.ones(1, dtype=numpy.float32)
        beta = numpy.zeros(1, dtype=numpy.float32)
        self.cudnn.convolution_forward(alpha, inp_desc, inp_buf, filter_desc,
                                       filter_buf, conv_desc, algo, workspace,
                                       workspace.size, beta, out_desc, out_buf)

        out_buf.to_host(out_data)
        self.assertEqual(numpy.count_nonzero(out_data), out_data.size)

        logging.debug("EXIT: test_convolution_forward")
Exemple #13
0
    def _test_good(self):
        ctx = cu.Devices().create_some_context()
        a = Container()
        a.ctx = ctx
        b = Container()
        b.mem = cu.MemAlloc(ctx, 4096)
        b.module = cu.Module(ctx, source="""
            __global__ void test(float *a) {
                a[blockIdx.x * blockDim.x + threadIdx.x] *= 1.1f;
            }""")
        b.blas = blas.CUBLAS(ctx)

        logging.debug("Remaining context count: %d", cu.Context.context_count)
        # self.assertEqual(cu.Context.context_count, 1)
        self.assertIsNotNone(ctx)  # to hold ctx up to this point
Exemple #14
0
 def test_memset(self):
     logging.debug("ENTER: test_memset")
     ctx = cu.Devices().create_some_context()
     mem = cu.MemAlloc(ctx, 4096)
     mem.memset32_async(123)
     mem.memset32_async(456, 1)
     mem.memset32_async(789, 2, 3)
     a = numpy.zeros(mem.size // 4, dtype=numpy.int32)
     mem.to_host(a)
     self.assertEqual(a[0], 123)
     self.assertEqual(a[1], 456)
     for i in range(2, 2 + 3):
         self.assertEqual(a[i], 789)
     for i in range(2 + 3, a.size):
         self.assertEqual(a[i], 456)
     logging.debug("EXIT: test_memset")
Exemple #15
0
 def _test_generate_normal(self, ctx, dtype, func, pass_size):
     a = numpy.zeros(65536, dtype=dtype)
     a_buf = cu.MemAlloc(ctx, a) if ctx is not None else numpy.zeros_like(a)
     mean = 1.0
     stddev = 2.0
     if pass_size:
         func(a_buf, a.size)
         func(a_buf, a.size, mean, stddev)
     else:
         func(a_buf)
         func(a_buf, mean=mean, stddev=stddev)
     if ctx is not None:
         a_buf.to_host(a)
     else:
         a[:] = a_buf[:]
     # TODO(a.kazantsev): add better test for correctness.
     self.assertGreater(numpy.count_nonzero(a), a.size - a.size // 512)
     return a
Exemple #16
0
 def _test_generate_uniform(self, ctx, dtype, func, pass_size):
     a = numpy.zeros(65536, dtype=dtype)
     a_buf = cu.MemAlloc(ctx, a) if ctx is not None else numpy.zeros_like(a)
     if pass_size:
         func(a_buf, a.size)
     else:
         func(a_buf)
     if ctx is not None:
         a_buf.to_host(a)
     else:
         a[:] = a_buf[:]
     # Simple test for correctness
     N = 20
     counts = [0 for _i in range(N)]
     for x in a:
         counts[int(x * N)] += 1
     for c in counts:
         self.assertLess(abs(c - a.size // N), a.size // N // 8)
     return a
Exemple #17
0
 def test_poisson(self):
     for ctx in (self.ctx, None):
         res = []
         for pass_size in (True, False):
             rng = curand.CURAND(ctx)
             rng.seed = 123
             a = numpy.zeros(65536, dtype=numpy.uint32)
             a_buf = (cu.MemAlloc(ctx, a)
                      if ctx is not None else numpy.zeros_like(a))
             if pass_size:
                 rng.generate_poisson(a_buf, a.size)
                 rng.generate_poisson(a_buf, a.size, 1.0)
             else:
                 rng.generate_poisson(a_buf)
                 rng.generate_poisson(a_buf, lam=1.0)
             if ctx is not None:
                 a_buf.to_host(a)
             else:
                 a[:] = a_buf[:]
             # TODO(a.kazantsev): add better test for correctness.
             self.assertGreater(numpy.count_nonzero(a), a.size // 2)
             res.append(a)
         for i in range(1):
             self.assertEqual(numpy.count_nonzero(res[i] - res[i + 1]), 0)
Exemple #18
0
    def _test_generate(self, ctx):
        rng = curand.CURAND(ctx)
        rng.seed = 123
        a = numpy.zeros(65536, dtype=numpy.int32)
        a_buf = cu.MemAlloc(ctx, a) if ctx is not None else numpy.zeros_like(a)
        rng.generate32(a_buf, a.size)
        if ctx is not None:
            a_buf.to_host(a)
        else:
            a[:] = a_buf[:]
        self.assertGreater(numpy.count_nonzero(a), a.size - a.size // 512)

        # Check that seed matters
        rng = curand.CURAND(ctx)
        rng.seed = 123
        if ctx is not None:
            a_buf.memset32_async()
        else:
            a_buf[:] = 0
        rng.generate32(a_buf, a.size)
        b = numpy.zeros_like(a)
        if ctx is not None:
            a_buf.to_host(b)
        else:
            b[:] = a_buf[:]
        self.assertEqual(numpy.count_nonzero(a - b), 0)

        rng = curand.CURAND(ctx)
        rng.seed = 456
        if ctx is not None:
            a_buf.memset32_async()
        else:
            a_buf[:] = 0
        rng.generate32(a_buf, a.size)
        if ctx is not None:
            a_buf.to_host(b)
        else:
            b[:] = a_buf[:]
        self.assertGreater(numpy.count_nonzero(a - b), a.size - a.size // 512)

        # Check that result will be the same when the size is not passed
        rng = curand.CURAND(ctx)
        rng.seed = 123
        if ctx is not None:
            a_buf.memset32_async()
        else:
            a_buf[:] = 0
        rng.generate32(a_buf)
        b = numpy.zeros_like(a)
        if ctx is not None:
            a_buf.to_host(b)
        else:
            b[:] = a_buf[:]
        self.assertEqual(numpy.count_nonzero(a - b), 0)

        # Check 64-bit version
        rng = curand.CURAND(ctx, curand.CURAND_RNG_QUASI_SOBOL64)
        try:
            rng.seed = 123
            self.assertTrue(
                False, "CURAND_RNG_QUASI_SOBOL64 should not support seed")
        except cu.CUDARuntimeError:
            pass
        rng.dimensions = 64
        if ctx is not None:
            a_buf.memset32_async()
        else:
            a_buf[:] = 0
        try:
            rng.generate32(a_buf, a.size)
            self.assertTrue(
                False,
                "CURAND_RNG_QUASI_SOBOL64 should not support generate32")
        except cu.CUDARuntimeError:
            pass
        a64 = numpy.zeros(a.size // 2, dtype=numpy.int64)
        rng.generate64(a_buf, a64.size)
        if ctx is not None:
            a_buf.to_host(a64)
        else:
            a64[:] = a_buf.view(numpy.int64)[:]
        self.assertGreater(numpy.count_nonzero(a64),
                           a64.size - a64.size // 256)

        # Check that result will be the same when the size is not passed
        rng = curand.CURAND(ctx, curand.CURAND_RNG_QUASI_SOBOL64)
        rng.dimensions = 64
        if ctx is not None:
            a_buf.memset32_async()
        else:
            a_buf[:] = 0
        rng.generate64(a_buf)
        b64 = numpy.zeros_like(a64)
        if ctx is not None:
            a_buf.to_host(b64)
        else:
            b64[:] = a_buf.view(numpy.int64)[:]
        self.assertEqual(numpy.count_nonzero(a64 - b64), 0)
Exemple #19
0
    def test_rnn(self):
        if self.cudnn.version < 5000:
            return
        logging.debug("ENTER: test_rnn")

        drop = cudnn.DropoutDescriptor()
        drop_states = cu.MemAlloc(self.ctx, self.cudnn.dropout_states_size)
        self.cudnn.set_dropout_descriptor(drop, 0.5, drop_states,
                                          drop_states.size, 1234)

        rnn = cudnn.RNNDescriptor()
        self.assertEqual(rnn.hidden_size, 0)
        self.assertEqual(rnn.num_layers, 0)
        self.assertIsNone(rnn.dropout_desc)
        self.assertEqual(rnn.input_mode, -1)
        self.assertEqual(rnn.direction, -1)
        self.assertEqual(rnn.mode, -1)
        self.assertEqual(rnn.data_type, -1)
        self.assertEqual(rnn.num_linear_layers, 0)

        batch_size = 4
        x = numpy.zeros(
            (batch_size, 32),  # minibatch, input size
            dtype=numpy.float32)
        numpy.random.seed(1234)
        x[:] = numpy.random.rand(x.size).reshape(x.shape) - 0.5
        x_desc = cudnn.TensorDescriptor()
        # Set input as 3-dimensional like in cudnn example:
        # minibatch, input_size, 1
        x_desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (x.shape[0], x.shape[1], 1))
        n_unroll = 16
        hidden_size = 64
        n_layers = 3

        def assert_values():
            self.assertEqual(rnn.hidden_size, hidden_size)
            self.assertEqual(rnn.num_layers, n_layers)
            self.assertIs(rnn.dropout_desc, drop)
            self.assertEqual(rnn.input_mode, cudnn.CUDNN_LINEAR_INPUT)
            self.assertEqual(rnn.direction, cudnn.CUDNN_UNIDIRECTIONAL)
            self.assertEqual(rnn.mode, cudnn.CUDNN_LSTM)
            self.assertEqual(rnn.data_type, cudnn.CUDNN_DATA_FLOAT)
            self.assertEqual(rnn.num_linear_layers, 8)

        # Short syntax
        rnn.set(hidden_size, n_layers, drop)
        assert_values()
        # Check num_linear_layers property
        for mode, n in ((cudnn.CUDNN_RNN_RELU, 2), (cudnn.CUDNN_RNN_TANH, 2),
                        (cudnn.CUDNN_GRU, 6)):
            rnn = cudnn.RNNDescriptor()
            rnn.set(hidden_size, n_layers, drop, mode=mode)
            self.assertEqual(rnn.num_linear_layers, n)

        # Full syntax
        rnn = cudnn.RNNDescriptor()
        rnn.set(hidden_size,
                n_layers,
                drop,
                input_mode=cudnn.CUDNN_LINEAR_INPUT,
                direction=cudnn.CUDNN_UNIDIRECTIONAL,
                mode=cudnn.CUDNN_LSTM,
                data_type=cudnn.CUDNN_DATA_FLOAT)
        assert_values()

        def get_sz(func):
            sz = func(rnn, (x_desc for _i in range(n_unroll)))
            self.assertIsInstance(sz, int)
            return sz

        sz_work = get_sz(self.cudnn.get_rnn_workspace_size)
        logging.debug("RNN workspace size for %s with %d unrolls is %d",
                      x.shape, n_unroll, sz_work)

        sz_train = get_sz(self.cudnn.get_rnn_training_reserve_size)
        logging.debug("RNN train size for %s with %d unrolls is %d", x.shape,
                      n_unroll, sz_train)

        sz_params = self.cudnn.get_rnn_params_size(rnn, x_desc)
        logging.debug("RNN params size for %s is %d", x.shape, sz_params)
        x_desc2 = cudnn.TensorDescriptor()
        x_desc2.set_nd(cudnn.CUDNN_DATA_DOUBLE, (x.shape[0], x.shape[1], 1))
        sz_params2 = self.cudnn.get_rnn_params_size(rnn, x_desc2,
                                                    cudnn.CUDNN_DATA_DOUBLE)
        self.assertEqual(sz_params2, sz_params * 2)

        params_desc = cudnn.FilterDescriptor()
        params_desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (sz_params >> 2, 1, 1))
        params = cu.MemAlloc(self.ctx, sz_params)
        params.memset32_async()
        w_desc = cudnn.FilterDescriptor()
        w = self.cudnn.get_rnn_lin_layer_matrix_params(rnn, 0, x_desc,
                                                       params_desc, params, 0,
                                                       w_desc)
        logging.debug("Got matrix 0 of dimensions: %s, fmt=%d, sz=%d",
                      w_desc.dims, w_desc.fmt, w.size)
        self.assertEqual(w.size, hidden_size * x.shape[1] * 4)

        b_desc = cudnn.FilterDescriptor()
        b = self.cudnn.get_rnn_lin_layer_bias_params(rnn, 0, x_desc,
                                                     params_desc, params, 0,
                                                     b_desc)
        logging.debug("Got bias 0 of dimensions: %s, fmt=%d, sz=%d",
                      b_desc.dims, b_desc.fmt, b.size)
        self.assertEqual(b.size, hidden_size * 4)

        workspace = cu.MemAlloc(self.ctx, sz_work)
        x_buf = cu.MemAlloc(self.ctx, x.nbytes * n_unroll)
        for i in range(n_unroll):  # will feed the same input
            x_buf.to_device(x, x.nbytes * i, x.nbytes)
        y_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_unroll)
        hx_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_layers)
        hx_buf.memset32_async()
        hy_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_layers)
        cx_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_layers)
        cx_buf.memset32_async()
        cy_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_layers)

        y_desc = cudnn.TensorDescriptor()
        y_desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (batch_size, hidden_size, 1))

        h_desc = cudnn.TensorDescriptor()
        h_desc.set_nd(cudnn.CUDNN_DATA_FLOAT,
                      (n_layers, batch_size, hidden_size))

        self.ctx.synchronize()
        logging.debug("Starting forward inference")
        for i in range(5):
            self.cudnn.rnn_forward_inference(
                rnn, (x_desc for _i in range(n_unroll)), x_buf, h_desc, hx_buf,
                h_desc, cx_buf, params_desc, params,
                (y_desc for _i in range(n_unroll)), y_buf, h_desc, hy_buf,
                h_desc, cy_buf, workspace, sz_work)
            if i == 0:
                self.ctx.synchronize()
                t0 = time.time()
        self.ctx.synchronize()
        logging.debug("Forward inference done in %.6f sec",
                      (time.time() - t0) / 4)

        train_space = cu.MemAlloc(self.ctx, sz_train)
        self.cudnn.rnn_forward_training(
            rnn, (x_desc for _i in range(n_unroll)), x_buf, h_desc, hx_buf,
            h_desc, cx_buf, params_desc, params,
            (y_desc for _i in range(n_unroll)), y_buf, h_desc, hy_buf, h_desc,
            cy_buf, workspace, sz_work, train_space, sz_train)
        self.ctx.synchronize()
        logging.debug("Forward training done")

        dy_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_unroll)
        dy_buf.from_device_async(y_buf)
        dhy_buf = cu.MemAlloc(self.ctx,
                              4 * hidden_size * batch_size * n_layers)
        dhy_buf.memset32_async()
        dcy_buf = cu.MemAlloc(self.ctx,
                              4 * hidden_size * batch_size * n_layers)
        dcy_buf.memset32_async()
        dx_buf = cu.MemAlloc(self.ctx, x_buf.size)
        dhx_buf = cu.MemAlloc(self.ctx,
                              4 * hidden_size * batch_size * n_layers)
        dcx_buf = cu.MemAlloc(self.ctx,
                              4 * hidden_size * batch_size * n_layers)
        self.ctx.synchronize()
        logging.debug("Starting backpropagation")
        for i in range(5):
            self.cudnn.rnn_backward_data(
                rnn, (y_desc for _i in range(n_unroll)), y_buf,
                (y_desc for _i in range(n_unroll)), dy_buf, h_desc, dhy_buf,
                h_desc, dcy_buf, params_desc, params, h_desc, hx_buf, h_desc,
                cx_buf, (x_desc
                         for _i in range(n_unroll)), dx_buf, h_desc, dhx_buf,
                h_desc, dcx_buf, workspace, sz_work, train_space, sz_train)
            if i == 0:
                self.ctx.synchronize()
                t0 = time.time()
        self.ctx.synchronize()
        logging.debug("Backpropagation done in %.6f sec",
                      (time.time() - t0) / 4)

        dw = cu.MemAlloc(self.ctx, params.size)
        logging.debug("Starting gradient computation")
        for i in range(5):
            self.cudnn.rnn_backward_weights(
                rnn, (x_desc for _i in range(n_unroll)), x_buf, h_desc, hx_buf,
                (y_desc for _i in range(n_unroll)), y_buf, workspace, sz_work,
                params_desc, dw, train_space, sz_train)
            if i == 0:
                self.ctx.synchronize()
                t0 = time.time()
        self.ctx.synchronize()
        logging.debug("Gradient computation done in %.6f sec",
                      (time.time() - t0) / 4)

        logging.debug("EXIT: test_rnn")
Exemple #20
0
    def test_dropout(self):
        if self.cudnn.version < 5000:
            return
        logging.debug("ENTER: test_dropout")

        drop_ss = self.cudnn.dropout_states_size
        self.assertIsInstance(drop_ss, int)
        logging.debug("Dropout states size is %d", drop_ss)

        input_data = numpy.zeros((5, 16, 32, 24), dtype=numpy.float32)
        numpy.random.seed(1234)
        input_data[:] = numpy.random.rand(input_data.size).reshape(
            input_data.shape) - 0.5
        input_desc = cudnn.TensorDescriptor()
        input_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                          *input_data.shape)

        drop_rss = input_desc.dropout_reserve_space_size
        self.assertIsInstance(drop_rss, int)
        logging.debug("Dropout reserve space size for %s is %d",
                      input_data.shape, drop_rss)

        drop = cudnn.DropoutDescriptor()
        self.assertIsNone(self.cudnn.dropout_desc)
        self.assertIsNone(self.cudnn.dropout_states)
        self.cudnn.set_dropout_descriptor(drop)  # with default parameters
        self.assertIs(self.cudnn.dropout_desc, drop)
        self.assertIsNone(self.cudnn.dropout_states)
        states = cu.MemAlloc(self.ctx, drop_ss)
        self.cudnn.set_dropout_descriptor(drop, 0.5, states, drop_ss, 1234)
        self.assertIs(self.cudnn.dropout_desc, drop)
        self.assertIs(self.cudnn.dropout_states, states)

        input_buf = cu.MemAlloc(self.ctx, input_data)
        output_buf = cu.MemAlloc(self.ctx, input_buf.size)
        reserve = cu.MemAlloc(self.ctx, drop_rss)

        self.cudnn.dropout_forward(drop, input_desc, input_buf, input_desc,
                                   output_buf, reserve, reserve.size)
        output_data = numpy.ones_like(input_data)
        output_buf.to_host(output_data)
        n_z = 0
        for i, y in numpy.ndenumerate(output_data):
            if not y:
                n_z += 1
                continue
            x = input_data[i]
            self.assertEqual(y, x * 2)
        self.assertGreater(n_z,
                           (input_data.size >> 1) - (input_data.size >> 5))

        err_data = numpy.ones_like(input_data)
        err_data[:] = numpy.random.rand(output_data.size).reshape(
            output_data.shape) - 0.5
        err_buf = cu.MemAlloc(self.ctx, err_data)
        self.cudnn.dropout_backward(drop, input_desc, err_buf, input_desc,
                                    input_buf, reserve, reserve.size)
        input_buf.to_host(input_data)
        n_z = 0
        for i, x in numpy.ndenumerate(input_data):
            if not output_data[i]:
                self.assertEqual(x, 0.0)
                n_z += 1
                continue
            y = err_data[i]
            if self.cudnn.version == 5004:
                self.assertEqual(x, y)  # strangely, it doesn't scale gradient
            else:
                self.assertEqual(x, y * 2)
        self.assertGreater(n_z,
                           (input_data.size >> 1) - (input_data.size >> 5))

        logging.debug("EXIT: test_dropout")
Exemple #21
0
    def test_lstm(self):
        if self.cudnn.version < 5000:
            return
        logging.debug("ENTER: test_lstm")

        drop = cudnn.DropoutDescriptor()
        drop_states = cu.MemAlloc(self.ctx, self.cudnn.dropout_states_size)
        self.cudnn.set_dropout_descriptor(drop, 0.0, drop_states,
                                          drop_states.size, 1234)

        rnn = cudnn.RNNDescriptor()
        self.assertEqual(rnn.hidden_size, 0)
        self.assertEqual(rnn.num_layers, 0)
        self.assertIsNone(rnn.dropout_desc)
        self.assertEqual(rnn.input_mode, -1)
        self.assertEqual(rnn.direction, -1)
        self.assertEqual(rnn.mode, -1)
        self.assertEqual(rnn.data_type, -1)
        self.assertEqual(rnn.num_linear_layers, 0)

        batch_size = 8
        x_arr = numpy.zeros(
            (batch_size, 16),  # minibatch, input size
            dtype=DTYPE)
        numpy.random.seed(1234)
        x_arr[:] = numpy.random.rand(x_arr.size).reshape(x_arr.shape) - 0.5
        x_desc = cudnn.TensorDescriptor()
        # Set input as 3-dimensional like in cudnn example.
        x_desc.set_nd(CUTYPE, (x_arr.shape[0], x_arr.shape[1], 1))
        n_unroll = 5
        hidden_size = 16
        n_layers = 3

        def assert_values():
            self.assertEqual(rnn.hidden_size, hidden_size)
            self.assertEqual(rnn.num_layers, n_layers)
            self.assertIs(rnn.dropout_desc, drop)
            self.assertEqual(rnn.input_mode, cudnn.CUDNN_LINEAR_INPUT)
            self.assertEqual(rnn.direction, cudnn.CUDNN_UNIDIRECTIONAL)
            self.assertEqual(rnn.mode, cudnn.CUDNN_LSTM)
            self.assertEqual(rnn.data_type, CUTYPE)
            self.assertEqual(rnn.num_linear_layers, 8)

        # Full syntax
        rnn = cudnn.RNNDescriptor()
        rnn.set(hidden_size,
                n_layers,
                drop,
                input_mode=cudnn.CUDNN_LINEAR_INPUT,
                direction=cudnn.CUDNN_UNIDIRECTIONAL,
                mode=cudnn.CUDNN_LSTM,
                data_type=CUTYPE)
        assert_values()

        x_descs = tuple(x_desc for _i in range(n_unroll))

        def get_sz(func):
            sz = func(rnn, x_descs)
            self.assertIsInstance(sz, int)
            return sz

        sz_work = get_sz(self.cudnn.get_rnn_workspace_size)
        logging.debug("RNN workspace size for %s with %d unrolls is %d",
                      x_arr.shape, n_unroll, sz_work)

        sz_train = get_sz(self.cudnn.get_rnn_training_reserve_size)
        logging.debug("RNN train size for %s with %d unrolls is %d",
                      x_arr.shape, n_unroll, sz_train)

        sz_weights = self.cudnn.get_rnn_params_size(rnn, x_desc)
        logging.debug("RNN weights size for %s is %d", x_arr.shape, sz_weights)
        sz_expected = ITEMSIZE * (
            4 * (x_arr.shape[1] + hidden_size + 2) * hidden_size + 4 *
            (hidden_size + hidden_size + 2) * hidden_size * (n_layers - 1))
        self.assertEqual(sz_weights, sz_expected)

        weights_desc = cudnn.FilterDescriptor()
        weights_desc.set_nd(CUTYPE, (sz_weights // ITEMSIZE, 1, 1))
        weights = cu.MemAlloc(self.ctx, sz_weights)
        weights_arr = numpy.random.rand(sz_weights // ITEMSIZE).astype(DTYPE)
        weights_arr -= 0.5
        weights_arr *= 0.1
        weights.to_device(weights_arr)
        w_desc = cudnn.FilterDescriptor()
        w = self.cudnn.get_rnn_lin_layer_matrix_params(rnn, 0, x_desc,
                                                       weights_desc, weights,
                                                       0, w_desc)
        logging.debug("Got matrix 0 of dimensions: %s, fmt=%d, sz=%d",
                      w_desc.dims, w_desc.fmt, w.size)
        self.assertEqual(w.size, hidden_size * x_arr.shape[1] * ITEMSIZE)

        b_desc = cudnn.FilterDescriptor()
        b = self.cudnn.get_rnn_lin_layer_bias_params(rnn, 0, x_desc,
                                                     weights_desc, weights, 0,
                                                     b_desc)
        logging.debug("Got bias 0 of dimensions: %s, fmt=%d, sz=%d",
                      b_desc.dims, b_desc.fmt, b.size)
        self.assertEqual(b.size, hidden_size * ITEMSIZE)

        work_buf = cu.MemAlloc(self.ctx, sz_work)
        work_buf.memset32_async()
        x = cu.MemAlloc(self.ctx, x_arr.nbytes * n_unroll)
        for i in range(n_unroll):  # will feed the same input
            x.to_device(x_arr, x_arr.nbytes * i, x_arr.nbytes)
        y_arr = numpy.zeros((n_unroll, batch_size, hidden_size), dtype=DTYPE)
        y = cu.MemAlloc(self.ctx, y_arr)
        hx_arr = numpy.zeros((n_layers, batch_size, hidden_size), dtype=DTYPE)
        hx_arr[:] = numpy.random.rand(hx_arr.size).reshape(hx_arr.shape)
        hx_arr -= 0.5
        hx = cu.MemAlloc(self.ctx, hx_arr)
        hy = cu.MemAlloc(self.ctx, hx.size)
        hy.memset32_async()
        cx_arr = numpy.zeros((n_layers, batch_size, hidden_size), dtype=DTYPE)
        cx_arr[:] = numpy.random.rand(cx_arr.size).reshape(cx_arr.shape)
        cx_arr -= 0.5
        cx = cu.MemAlloc(self.ctx, cx_arr)
        cy = cu.MemAlloc(self.ctx, cx.size)
        cy.memset32_async()

        y_desc = cudnn.TensorDescriptor()
        y_desc.set_nd(CUTYPE, (batch_size, hidden_size, 1))
        y_descs = tuple(y_desc for _i in range(n_unroll))

        h_desc = cudnn.TensorDescriptor()
        h_desc.set_nd(CUTYPE, (n_layers, batch_size, hidden_size))

        train_buf = cu.MemAlloc(self.ctx, sz_train)
        train_buf.memset32_async()
        self.cudnn.rnn_forward_training(rnn, x_descs, x, h_desc, hx, h_desc,
                                        cx, weights_desc, weights, y_descs, y,
                                        h_desc, hy, h_desc, cy, work_buf,
                                        sz_work, train_buf, sz_train)
        self.ctx.synchronize()
        logging.debug("Forward training done")

        y.to_host(y_arr)
        target = numpy.random.rand(y_arr.size).reshape(y_arr.shape).astype(
            y_arr.dtype) - 0.5
        dy_arr = y_arr - target

        dy = cu.MemAlloc(self.ctx, dy_arr)
        dhy = cu.MemAlloc(self.ctx, hx.size)
        dhy.memset32_async()
        dcy = cu.MemAlloc(self.ctx, cx.size)
        dcy.memset32_async()
        dx_arr = numpy.zeros_like(x_arr)
        dx = cu.MemAlloc(self.ctx, dx_arr)
        dhx_arr = numpy.zeros_like(hx_arr)
        dhx = cu.MemAlloc(self.ctx, dhx_arr)
        dcx_arr = numpy.zeros_like(cx_arr)
        dcx = cu.MemAlloc(self.ctx, dcx_arr)

        self.cudnn.rnn_backward_data(rnn, y_descs, y, y_descs, dy, h_desc, dhy,
                                     h_desc, dcy, weights_desc, weights,
                                     h_desc, hx, h_desc, cx, x_descs, dx,
                                     h_desc, dhx, h_desc, dcx, work_buf,
                                     sz_work, train_buf, sz_train)
        logging.debug("Backpropagation done")

        dx.to_host(dx_arr)
        dhx.to_host(dhx_arr)
        dcx.to_host(dcx_arr)

        def forward():
            x.to_device_async(x_arr)
            hx.to_device_async(hx_arr)
            cx.to_device_async(cx_arr)
            self.cudnn.rnn_forward_inference(rnn, x_descs, x, h_desc, hx,
                                             h_desc, cx, weights_desc, weights,
                                             y_descs, y, h_desc, hy, h_desc,
                                             cy, work_buf, sz_work)
            y.to_host(y_arr)

        numdiff = NumDiff()

        logging.debug("Checking dx...")
        numdiff.check_diff(x_arr, y_arr, target, dx_arr, forward)

        logging.debug("Checking dhx...")
        numdiff.check_diff(hx_arr, y_arr, target, dhx_arr, forward)

        logging.debug("Checking dcx...")
        numdiff.check_diff(cx_arr, y_arr, target, dcx_arr, forward)

        logging.debug("EXIT: test_lstm")
Exemple #22
0
 def _alloc_temp_buffer(self, size):
     # Allocate the buffer
     return cu.MemAlloc(self.context, size)
Exemple #23
0
 def cuda_create_devmem(self):
     self._devmem_ = cu.MemAlloc(self.device.context, self.plain.nbytes)
     self.devmem.to_device(self.mem)