Code example #1
0
    def test_convolution_backward_data(self):
        """Exercise convolution_backward_data, then (on cuDNN >= 4) the
        algorithm/workspace query API for the data-gradient pass.

        Passes if the computed input-gradient buffer is fully non-zero.
        """
        logging.debug("ENTER: test_convolution_backward_data")

        conv = cudnn.ConvolutionDescriptor()
        conv.set_2d(5, 5, 1, 1)

        # Destination: gradient w.r.t. the input tensor (NCHW float32).
        grad_in = numpy.zeros((100, 8, 96, 96), dtype=numpy.float32)
        grad_in_desc = cudnn.TensorDescriptor()
        grad_in_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                            *grad_in.shape)
        grad_in_buf = cu.MemAlloc(self.ctx, grad_in)

        # Filter bank filled with a non-zero constant.
        flt = numpy.zeros((64, 8, 11, 11), dtype=numpy.float32)
        flt[:] = 0.1
        flt_desc = cudnn.FilterDescriptor()
        flt_desc.set_4d(cudnn.CUDNN_DATA_FLOAT, *flt.shape)
        flt_buf = cu.MemAlloc(self.ctx, flt)

        # Incoming error signal (gradient w.r.t. the convolution output).
        err = numpy.zeros((100, 64, 96, 96), dtype=numpy.float32)
        err[:] = 0.1
        err_desc = cudnn.TensorDescriptor()
        err_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                        *err.shape)
        err_buf = cu.MemAlloc(self.ctx, err)

        one = numpy.ones(1, dtype=numpy.float32)
        zero = numpy.zeros(1, dtype=numpy.float32)
        # Simple form: no explicit algorithm or workspace supplied.
        self.cudnn.convolution_backward_data(one, flt_desc, flt_buf,
                                             err_desc, err_buf, conv,
                                             zero, grad_in_desc, grad_in_buf)

        grad_in_buf.to_host(grad_in)
        self.assertEqual(numpy.count_nonzero(grad_in), grad_in.size)

        if self.cudnn.version >= 4000:
            # Query the fastest algorithm and its workspace requirement.
            algo = self.cudnn.get_convolution_backward_data_algorithm(
                flt_desc, err_desc, conv, grad_in_desc)
            logging.debug("Fastest algo is %d", algo)
            sz = self.cudnn.get_convolution_backward_data_workspace_size(
                flt_desc, err_desc, conv, grad_in_desc, algo)
            logging.debug("Workspace size for it is %d", sz)
            # Re-query under an explicit workspace ceiling.
            algo = self.cudnn.get_convolution_backward_data_algorithm(
                flt_desc, err_desc, conv, grad_in_desc,
                cudnn.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
                512 * 1024 * 1024)
            logging.debug("With 512 Mb limit: %d", algo)
            scratch = cu.MemAlloc(self.ctx, 512 * 1024 * 1024)
            grad_in_buf.memset32_async()
            # Full form: explicit algorithm plus workspace buffer.
            self.cudnn.convolution_backward_data(one, flt_desc,
                                                 flt_buf, err_desc,
                                                 err_buf, conv, zero,
                                                 grad_in_desc, grad_in_buf,
                                                 algo, scratch, scratch.size)
            grad_in_buf.to_host(grad_in)
            self.assertEqual(numpy.count_nonzero(grad_in), grad_in.size)

        logging.debug("EXIT: test_convolution_backward_data")
Code example #2
0
 def _init_descriptors(self, include_out=False):
     """Create the convolution, input and filter descriptors shared by
     the convolution tests.

     Parameters:
         include_out: when True, also compute the forward output
             geometry and return a matching output descriptor.

     Returns:
         (conv, inp, filter_desc) or, with include_out,
         (conv, inp, filter_desc, out).
     """
     conv = cudnn.ConvolutionDescriptor()
     conv.set_2d(5, 4, 2, 1)
     inp = cudnn.TensorDescriptor()
     inp.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT, 100, 8,
                208, 224)
     # Renamed from "filter" to avoid shadowing the Python builtin.
     filter_desc = cudnn.FilterDescriptor()
     filter_desc.set_4d(cudnn.CUDNN_DATA_FLOAT, 64, 8, 11, 7)
     if not include_out:
         return conv, inp, filter_desc
     # Let cuDNN derive the output dimensions for these descriptors.
     n, c, h, w = cudnn.CUDNN.get_convolution_2d_forward_output_dim(
         conv, inp, filter_desc)
     out = cudnn.TensorDescriptor()
     out.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT, n, c, h, w)
     return conv, inp, filter_desc, out
Code example #3
0
    def test_convolution_forward(self):
        """Run a forward convolution end to end and verify that the
        output buffer is fully populated with non-zero values."""
        logging.debug("ENTER: test_convolution_forward")

        conv_desc = cudnn.ConvolutionDescriptor()
        conv_desc.set_2d(5, 4, 2, 1)

        # Source tensor (NCHW float32) filled with a non-zero constant.
        src = numpy.zeros((100, 8, 104, 112), dtype=numpy.float32)
        src[:] = 0.1
        src_desc = cudnn.TensorDescriptor()
        src_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                        *src.shape)
        src_buf = cu.MemAlloc(self.ctx, src)

        # Filter bank with non-zero weights.
        flt = numpy.zeros((64, 8, 11, 7), dtype=numpy.float32)
        flt[:] = 0.3
        flt_desc = cudnn.FilterDescriptor()
        flt_desc.set_4d(cudnn.CUDNN_DATA_FLOAT, *flt.shape)
        flt_buf = cu.MemAlloc(self.ctx, flt)

        # Let cuDNN derive the output geometry for these descriptors.
        n, c, h, w = cudnn.CUDNN.get_convolution_2d_forward_output_dim(
            conv_desc, src_desc, flt_desc)
        dst = numpy.zeros((n, c, h, w), dtype=numpy.float32)
        dst_desc = cudnn.TensorDescriptor()
        dst_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                        *dst.shape)
        dst_buf = cu.MemAlloc(self.ctx, dst)

        # Pick an algorithm constrained by an explicit workspace limit.
        scratch = cu.MemAlloc(self.ctx, 512 * 1024 * 1024)
        algo = self.cudnn.get_convolution_forward_algorithm(
            src_desc, flt_desc, conv_desc, dst_desc,
            cudnn.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
            scratch.size)

        one = numpy.ones(1, dtype=numpy.float32)
        zero = numpy.zeros(1, dtype=numpy.float32)
        self.cudnn.convolution_forward(one, src_desc, src_buf, flt_desc,
                                       flt_buf, conv_desc, algo, scratch,
                                       scratch.size, zero, dst_desc, dst_buf)

        dst_buf.to_host(dst)
        self.assertEqual(numpy.count_nonzero(dst), dst.size)

        logging.debug("EXIT: test_convolution_forward")
Code example #4
0
    def test_rnn(self):
        """End-to-end smoke test of the cuDNN v5 RNN API: descriptor
        setup, size queries, parameter access, forward inference and
        training, and the backward data/weights passes (LSTM, 3 layers,
        16 unrolls), with rough per-call timing in the debug log.

        Silently skipped when the installed cuDNN is older than v5.
        """
        if self.cudnn.version < 5000:
            return
        logging.debug("ENTER: test_rnn")

        # Dropout descriptor backed by a device buffer of RNG states.
        drop = cudnn.DropoutDescriptor()
        drop_states = cu.MemAlloc(self.ctx, self.cudnn.dropout_states_size)
        self.cudnn.set_dropout_descriptor(drop, 0.5, drop_states,
                                          drop_states.size, 1234)

        # A freshly created descriptor must report unset defaults.
        rnn = cudnn.RNNDescriptor()
        self.assertEqual(rnn.hidden_size, 0)
        self.assertEqual(rnn.num_layers, 0)
        self.assertIsNone(rnn.dropout_desc)
        self.assertEqual(rnn.input_mode, -1)
        self.assertEqual(rnn.direction, -1)
        self.assertEqual(rnn.mode, -1)
        self.assertEqual(rnn.data_type, -1)
        self.assertEqual(rnn.num_linear_layers, 0)

        batch_size = 4
        x = numpy.zeros(
            (batch_size, 32),  # minibatch, input size
            dtype=numpy.float32)
        numpy.random.seed(1234)
        x[:] = numpy.random.rand(x.size).reshape(x.shape) - 0.5
        x_desc = cudnn.TensorDescriptor()
        # Set input as 3-dimensional like in cudnn example:
        # minibatch, input_size, 1
        x_desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (x.shape[0], x.shape[1], 1))
        n_unroll = 16
        hidden_size = 64
        n_layers = 3

        def assert_values():
            # Verifies that rnn.set() populated every property
            # (LSTM is expected to report 8 linear layers).
            self.assertEqual(rnn.hidden_size, hidden_size)
            self.assertEqual(rnn.num_layers, n_layers)
            self.assertIs(rnn.dropout_desc, drop)
            self.assertEqual(rnn.input_mode, cudnn.CUDNN_LINEAR_INPUT)
            self.assertEqual(rnn.direction, cudnn.CUDNN_UNIDIRECTIONAL)
            self.assertEqual(rnn.mode, cudnn.CUDNN_LSTM)
            self.assertEqual(rnn.data_type, cudnn.CUDNN_DATA_FLOAT)
            self.assertEqual(rnn.num_linear_layers, 8)

        # Short syntax
        rnn.set(hidden_size, n_layers, drop)
        assert_values()
        # Check num_linear_layers property
        # (expected counts per cell type: ReLU/tanh RNN -> 2, GRU -> 6).
        for mode, n in ((cudnn.CUDNN_RNN_RELU, 2), (cudnn.CUDNN_RNN_TANH, 2),
                        (cudnn.CUDNN_GRU, 6)):
            rnn = cudnn.RNNDescriptor()
            rnn.set(hidden_size, n_layers, drop, mode=mode)
            self.assertEqual(rnn.num_linear_layers, n)

        # Full syntax
        rnn = cudnn.RNNDescriptor()
        rnn.set(hidden_size,
                n_layers,
                drop,
                input_mode=cudnn.CUDNN_LINEAR_INPUT,
                direction=cudnn.CUDNN_UNIDIRECTIONAL,
                mode=cudnn.CUDNN_LSTM,
                data_type=cudnn.CUDNN_DATA_FLOAT)
        assert_values()

        def get_sz(func):
            # The size queries take one input descriptor per unroll step;
            # a generator is passed deliberately to check iterables work.
            sz = func(rnn, (x_desc for _i in range(n_unroll)))
            self.assertIsInstance(sz, int)
            return sz

        sz_work = get_sz(self.cudnn.get_rnn_workspace_size)
        logging.debug("RNN workspace size for %s with %d unrolls is %d",
                      x.shape, n_unroll, sz_work)

        sz_train = get_sz(self.cudnn.get_rnn_training_reserve_size)
        logging.debug("RNN train size for %s with %d unrolls is %d", x.shape,
                      n_unroll, sz_train)

        sz_params = self.cudnn.get_rnn_params_size(rnn, x_desc)
        logging.debug("RNN params size for %s is %d", x.shape, sz_params)
        x_desc2 = cudnn.TensorDescriptor()
        x_desc2.set_nd(cudnn.CUDNN_DATA_DOUBLE, (x.shape[0], x.shape[1], 1))
        sz_params2 = self.cudnn.get_rnn_params_size(rnn, x_desc2,
                                                    cudnn.CUDNN_DATA_DOUBLE)
        # Double-precision parameters must occupy exactly twice the
        # float32 parameter byte size.
        self.assertEqual(sz_params2, sz_params * 2)

        params_desc = cudnn.FilterDescriptor()
        # sz_params >> 2 converts the byte size to a float32 element count.
        params_desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (sz_params >> 2, 1, 1))
        params = cu.MemAlloc(self.ctx, sz_params)
        params.memset32_async()
        w_desc = cudnn.FilterDescriptor()
        w = self.cudnn.get_rnn_lin_layer_matrix_params(rnn, 0, x_desc,
                                                       params_desc, params, 0,
                                                       w_desc)
        logging.debug("Got matrix 0 of dimensions: %s, fmt=%d, sz=%d",
                      w_desc.dims, w_desc.fmt, w.size)
        # Layer-0 input weight matrix: hidden_size x input_size float32
        # elements, 4 bytes each.
        self.assertEqual(w.size, hidden_size * x.shape[1] * 4)

        b_desc = cudnn.FilterDescriptor()
        b = self.cudnn.get_rnn_lin_layer_bias_params(rnn, 0, x_desc,
                                                     params_desc, params, 0,
                                                     b_desc)
        logging.debug("Got bias 0 of dimensions: %s, fmt=%d, sz=%d",
                      b_desc.dims, b_desc.fmt, b.size)
        self.assertEqual(b.size, hidden_size * 4)

        # Device buffers; the constant 4 below is sizeof(float32).
        workspace = cu.MemAlloc(self.ctx, sz_work)
        x_buf = cu.MemAlloc(self.ctx, x.nbytes * n_unroll)
        for i in range(n_unroll):  # will feed the same input
            x_buf.to_device(x, x.nbytes * i, x.nbytes)
        y_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_unroll)
        hx_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_layers)
        hx_buf.memset32_async()
        hy_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_layers)
        cx_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_layers)
        cx_buf.memset32_async()
        cy_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_layers)

        y_desc = cudnn.TensorDescriptor()
        y_desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (batch_size, hidden_size, 1))

        h_desc = cudnn.TensorDescriptor()
        h_desc.set_nd(cudnn.CUDNN_DATA_FLOAT,
                      (n_layers, batch_size, hidden_size))

        self.ctx.synchronize()
        logging.debug("Starting forward inference")
        # 5 iterations: the first warms up, the remaining 4 are timed.
        for i in range(5):
            self.cudnn.rnn_forward_inference(
                rnn, (x_desc for _i in range(n_unroll)), x_buf, h_desc, hx_buf,
                h_desc, cx_buf, params_desc, params,
                (y_desc for _i in range(n_unroll)), y_buf, h_desc, hy_buf,
                h_desc, cy_buf, workspace, sz_work)
            if i == 0:
                self.ctx.synchronize()
                t0 = time.time()
        self.ctx.synchronize()
        logging.debug("Forward inference done in %.6f sec",
                      (time.time() - t0) / 4)

        train_space = cu.MemAlloc(self.ctx, sz_train)
        self.cudnn.rnn_forward_training(
            rnn, (x_desc for _i in range(n_unroll)), x_buf, h_desc, hx_buf,
            h_desc, cx_buf, params_desc, params,
            (y_desc for _i in range(n_unroll)), y_buf, h_desc, hy_buf, h_desc,
            cy_buf, workspace, sz_work, train_space, sz_train)
        self.ctx.synchronize()
        logging.debug("Forward training done")

        # Use the forward output itself as the incoming error signal.
        dy_buf = cu.MemAlloc(self.ctx, 4 * hidden_size * batch_size * n_unroll)
        dy_buf.from_device_async(y_buf)
        dhy_buf = cu.MemAlloc(self.ctx,
                              4 * hidden_size * batch_size * n_layers)
        dhy_buf.memset32_async()
        dcy_buf = cu.MemAlloc(self.ctx,
                              4 * hidden_size * batch_size * n_layers)
        dcy_buf.memset32_async()
        dx_buf = cu.MemAlloc(self.ctx, x_buf.size)
        dhx_buf = cu.MemAlloc(self.ctx,
                              4 * hidden_size * batch_size * n_layers)
        dcx_buf = cu.MemAlloc(self.ctx,
                              4 * hidden_size * batch_size * n_layers)
        self.ctx.synchronize()
        logging.debug("Starting backpropagation")
        # Same warmup-then-time pattern as forward inference above.
        for i in range(5):
            self.cudnn.rnn_backward_data(
                rnn, (y_desc for _i in range(n_unroll)), y_buf,
                (y_desc for _i in range(n_unroll)), dy_buf, h_desc, dhy_buf,
                h_desc, dcy_buf, params_desc, params, h_desc, hx_buf, h_desc,
                cx_buf, (x_desc
                         for _i in range(n_unroll)), dx_buf, h_desc, dhx_buf,
                h_desc, dcx_buf, workspace, sz_work, train_space, sz_train)
            if i == 0:
                self.ctx.synchronize()
                t0 = time.time()
        self.ctx.synchronize()
        logging.debug("Backpropagation done in %.6f sec",
                      (time.time() - t0) / 4)

        dw = cu.MemAlloc(self.ctx, params.size)
        logging.debug("Starting gradient computation")
        for i in range(5):
            self.cudnn.rnn_backward_weights(
                rnn, (x_desc for _i in range(n_unroll)), x_buf, h_desc, hx_buf,
                (y_desc for _i in range(n_unroll)), y_buf, workspace, sz_work,
                params_desc, dw, train_space, sz_train)
            if i == 0:
                self.ctx.synchronize()
                t0 = time.time()
        self.ctx.synchronize()
        logging.debug("Gradient computation done in %.6f sec",
                      (time.time() - t0) / 4)

        logging.debug("EXIT: test_rnn")
Code example #5
0
    def test_filter_descriptor(self):
        """Validate FilterDescriptor set/get round-trips for both the
        4d and nd layouts, including format overrides and refreshing
        cached attributes from the underlying handle (cuDNN >= 5)."""
        logging.debug("ENTER: test_filter_descriptor")

        desc = cudnn.FilterDescriptor()
        self.assertIsNotNone(desc.handle)
        for data_type in (cudnn.CUDNN_DATA_DOUBLE, cudnn.CUDNN_DATA_FLOAT):
            desc.set_4d(data_type, 64, 3, 11, 12)
            self.assertEqual(desc.data_type, data_type)
            self.assertEqual(desc.fmt, cudnn.CUDNN_TENSOR_NCHW)
            self.assertEqual(desc.k, 64)
            self.assertEqual(desc.c, 3)
            self.assertEqual(desc.h, 11)
            self.assertEqual(desc.w, 12)

        # The getter API below requires cuDNN v5.
        if self.cudnn.version < 5000:
            logging.debug("EXIT: test_filter_descriptor")
            return

        # A fresh descriptor reports unset defaults.
        desc = cudnn.FilterDescriptor()
        self.assertEqual(desc.data_type, -1)
        self.assertEqual(desc.fmt, -1)
        self.assertEqual(desc.k, 0)
        self.assertEqual(desc.c, 0)
        self.assertEqual(desc.h, 0)
        self.assertEqual(desc.w, 0)

        def check_4d_attrs():
            # Expected state after set_4d(FLOAT, 10, 5, 32, 16).
            self.assertEqual(desc.data_type, cudnn.CUDNN_DATA_FLOAT)
            self.assertEqual(desc.fmt, cudnn.CUDNN_TENSOR_NCHW)
            self.assertEqual(desc.k, 10)
            self.assertEqual(desc.c, 5)
            self.assertEqual(desc.h, 32)
            self.assertEqual(desc.w, 16)

        desc.set_4d(cudnn.CUDNN_DATA_FLOAT, 10, 5, 32, 16)
        check_4d_attrs()

        # Clobber the cached attributes, then refresh from the handle.
        desc._data_type = -1
        desc._fmt = -1
        desc._k = 0
        desc._c = 0
        desc._h = 0
        desc._w = 0
        desc.get_4d()
        check_4d_attrs()

        # Explicit format override for the 4d setter.
        desc = cudnn.FilterDescriptor()
        desc.set_4d(cudnn.CUDNN_DATA_FLOAT, 10, 5, 32, 16,
                    cudnn.CUDNN_TENSOR_NHWC)
        self.assertEqual(desc.fmt, cudnn.CUDNN_TENSOR_NHWC)

        # nd variant: dims start empty on a fresh descriptor.
        desc = cudnn.FilterDescriptor()
        self.assertEqual(len(desc.dims), 0)

        desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (1, 2, 3))
        self.assertEqual(desc.fmt, cudnn.CUDNN_TENSOR_NCHW)
        self.assertEqual(desc.data_type, cudnn.CUDNN_DATA_FLOAT)
        self.assertEqual(desc.dims, (1, 2, 3))
        # Clobber and refresh via get_nd, as done for the 4d case.
        desc._data_type = -1
        desc._fmt = -1
        desc._dims = tuple()
        desc.get_nd(3)
        self.assertEqual(desc.fmt, cudnn.CUDNN_TENSOR_NCHW)
        self.assertEqual(desc.data_type, cudnn.CUDNN_DATA_FLOAT)
        self.assertEqual(desc.dims, (1, 2, 3))

        # Explicit format override for the nd setter.
        desc = cudnn.FilterDescriptor()
        desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (1, 2, 3),
                    cudnn.CUDNN_TENSOR_NHWC)
        self.assertEqual(desc.fmt, cudnn.CUDNN_TENSOR_NHWC)

        logging.debug("EXIT: test_filter_descriptor")
Code example #6
0
File: test_lstm.py  Project: ra2003/cuda4py
    def test_lstm(self):
        """Verify the analytic LSTM gradients (dx, dhx, dcx) produced by
        rnn_backward_data against numeric differentiation driven by
        repeated forward inference.

        Silently skipped when the installed cuDNN is older than v5.
        NOTE(review): DTYPE, CUTYPE, ITEMSIZE and NumDiff are defined
        elsewhere in this module — presumably the numpy dtype, matching
        cuDNN data-type constant, element byte size and a numeric-diff
        helper; confirm against the module header.
        """
        if self.cudnn.version < 5000:
            return
        logging.debug("ENTER: test_lstm")

        # Dropout probability 0.0 keeps the forward pass deterministic,
        # which the numeric gradient comparison below requires.
        drop = cudnn.DropoutDescriptor()
        drop_states = cu.MemAlloc(self.ctx, self.cudnn.dropout_states_size)
        self.cudnn.set_dropout_descriptor(drop, 0.0, drop_states,
                                          drop_states.size, 1234)

        # A freshly created descriptor must report unset defaults.
        rnn = cudnn.RNNDescriptor()
        self.assertEqual(rnn.hidden_size, 0)
        self.assertEqual(rnn.num_layers, 0)
        self.assertIsNone(rnn.dropout_desc)
        self.assertEqual(rnn.input_mode, -1)
        self.assertEqual(rnn.direction, -1)
        self.assertEqual(rnn.mode, -1)
        self.assertEqual(rnn.data_type, -1)
        self.assertEqual(rnn.num_linear_layers, 0)

        batch_size = 8
        x_arr = numpy.zeros(
            (batch_size, 16),  # minibatch, input size
            dtype=DTYPE)
        numpy.random.seed(1234)
        x_arr[:] = numpy.random.rand(x_arr.size).reshape(x_arr.shape) - 0.5
        x_desc = cudnn.TensorDescriptor()
        # Set input as 3-dimensional like in cudnn example.
        x_desc.set_nd(CUTYPE, (x_arr.shape[0], x_arr.shape[1], 1))
        n_unroll = 5
        hidden_size = 16
        n_layers = 3

        def assert_values():
            # Verifies that rnn.set() populated every property
            # (LSTM is expected to report 8 linear layers).
            self.assertEqual(rnn.hidden_size, hidden_size)
            self.assertEqual(rnn.num_layers, n_layers)
            self.assertIs(rnn.dropout_desc, drop)
            self.assertEqual(rnn.input_mode, cudnn.CUDNN_LINEAR_INPUT)
            self.assertEqual(rnn.direction, cudnn.CUDNN_UNIDIRECTIONAL)
            self.assertEqual(rnn.mode, cudnn.CUDNN_LSTM)
            self.assertEqual(rnn.data_type, CUTYPE)
            self.assertEqual(rnn.num_linear_layers, 8)

        # Full syntax
        rnn = cudnn.RNNDescriptor()
        rnn.set(hidden_size,
                n_layers,
                drop,
                input_mode=cudnn.CUDNN_LINEAR_INPUT,
                direction=cudnn.CUDNN_UNIDIRECTIONAL,
                mode=cudnn.CUDNN_LSTM,
                data_type=CUTYPE)
        assert_values()

        # One input descriptor per unroll step.
        x_descs = tuple(x_desc for _i in range(n_unroll))

        def get_sz(func):
            # Common wrapper for the workspace/reserve size queries.
            sz = func(rnn, x_descs)
            self.assertIsInstance(sz, int)
            return sz

        sz_work = get_sz(self.cudnn.get_rnn_workspace_size)
        logging.debug("RNN workspace size for %s with %d unrolls is %d",
                      x_arr.shape, n_unroll, sz_work)

        sz_train = get_sz(self.cudnn.get_rnn_training_reserve_size)
        logging.debug("RNN train size for %s with %d unrolls is %d",
                      x_arr.shape, n_unroll, sz_train)

        sz_weights = self.cudnn.get_rnn_params_size(rnn, x_desc)
        logging.debug("RNN weights size for %s is %d", x_arr.shape, sz_weights)
        # Expected LSTM parameter count: 4 gates per cell; layer 0 sees
        # (input + hidden) inputs, deeper layers (hidden + hidden); the
        # "+ 2" per gate presumably accounts for two bias vectors —
        # NOTE(review): confirm against cuDNN's parameter layout docs.
        sz_expected = ITEMSIZE * (
            4 * (x_arr.shape[1] + hidden_size + 2) * hidden_size + 4 *
            (hidden_size + hidden_size + 2) * hidden_size * (n_layers - 1))
        self.assertEqual(sz_weights, sz_expected)

        weights_desc = cudnn.FilterDescriptor()
        # sz_weights // ITEMSIZE converts bytes to element count.
        weights_desc.set_nd(CUTYPE, (sz_weights // ITEMSIZE, 1, 1))
        weights = cu.MemAlloc(self.ctx, sz_weights)
        # Small random weights in [-0.05, 0.05) keep the network in a
        # well-behaved regime for finite differences.
        weights_arr = numpy.random.rand(sz_weights // ITEMSIZE).astype(DTYPE)
        weights_arr -= 0.5
        weights_arr *= 0.1
        weights.to_device(weights_arr)
        w_desc = cudnn.FilterDescriptor()
        w = self.cudnn.get_rnn_lin_layer_matrix_params(rnn, 0, x_desc,
                                                       weights_desc, weights,
                                                       0, w_desc)
        logging.debug("Got matrix 0 of dimensions: %s, fmt=%d, sz=%d",
                      w_desc.dims, w_desc.fmt, w.size)
        # Layer-0 input weight matrix: hidden_size x input_size elements.
        self.assertEqual(w.size, hidden_size * x_arr.shape[1] * ITEMSIZE)

        b_desc = cudnn.FilterDescriptor()
        b = self.cudnn.get_rnn_lin_layer_bias_params(rnn, 0, x_desc,
                                                     weights_desc, weights, 0,
                                                     b_desc)
        logging.debug("Got bias 0 of dimensions: %s, fmt=%d, sz=%d",
                      b_desc.dims, b_desc.fmt, b.size)
        self.assertEqual(b.size, hidden_size * ITEMSIZE)

        work_buf = cu.MemAlloc(self.ctx, sz_work)
        work_buf.memset32_async()
        x = cu.MemAlloc(self.ctx, x_arr.nbytes * n_unroll)
        for i in range(n_unroll):  # will feed the same input
            x.to_device(x_arr, x_arr.nbytes * i, x_arr.nbytes)
        y_arr = numpy.zeros((n_unroll, batch_size, hidden_size), dtype=DTYPE)
        y = cu.MemAlloc(self.ctx, y_arr)
        # Random initial hidden and cell states in [-0.5, 0.5).
        hx_arr = numpy.zeros((n_layers, batch_size, hidden_size), dtype=DTYPE)
        hx_arr[:] = numpy.random.rand(hx_arr.size).reshape(hx_arr.shape)
        hx_arr -= 0.5
        hx = cu.MemAlloc(self.ctx, hx_arr)
        hy = cu.MemAlloc(self.ctx, hx.size)
        hy.memset32_async()
        cx_arr = numpy.zeros((n_layers, batch_size, hidden_size), dtype=DTYPE)
        cx_arr[:] = numpy.random.rand(cx_arr.size).reshape(cx_arr.shape)
        cx_arr -= 0.5
        cx = cu.MemAlloc(self.ctx, cx_arr)
        cy = cu.MemAlloc(self.ctx, cx.size)
        cy.memset32_async()

        y_desc = cudnn.TensorDescriptor()
        y_desc.set_nd(CUTYPE, (batch_size, hidden_size, 1))
        y_descs = tuple(y_desc for _i in range(n_unroll))

        h_desc = cudnn.TensorDescriptor()
        h_desc.set_nd(CUTYPE, (n_layers, batch_size, hidden_size))

        train_buf = cu.MemAlloc(self.ctx, sz_train)
        train_buf.memset32_async()
        self.cudnn.rnn_forward_training(rnn, x_descs, x, h_desc, hx, h_desc,
                                        cx, weights_desc, weights, y_descs, y,
                                        h_desc, hy, h_desc, cy, work_buf,
                                        sz_work, train_buf, sz_train)
        self.ctx.synchronize()
        logging.debug("Forward training done")

        # Error signal: difference between the output and a random target.
        y.to_host(y_arr)
        target = numpy.random.rand(y_arr.size).reshape(y_arr.shape).astype(
            y_arr.dtype) - 0.5
        dy_arr = y_arr - target

        dy = cu.MemAlloc(self.ctx, dy_arr)
        dhy = cu.MemAlloc(self.ctx, hx.size)
        dhy.memset32_async()
        dcy = cu.MemAlloc(self.ctx, cx.size)
        dcy.memset32_async()
        dx_arr = numpy.zeros_like(x_arr)
        dx = cu.MemAlloc(self.ctx, dx_arr)
        dhx_arr = numpy.zeros_like(hx_arr)
        dhx = cu.MemAlloc(self.ctx, dhx_arr)
        dcx_arr = numpy.zeros_like(cx_arr)
        dcx = cu.MemAlloc(self.ctx, dcx_arr)

        self.cudnn.rnn_backward_data(rnn, y_descs, y, y_descs, dy, h_desc, dhy,
                                     h_desc, dcy, weights_desc, weights,
                                     h_desc, hx, h_desc, cx, x_descs, dx,
                                     h_desc, dhx, h_desc, dcx, work_buf,
                                     sz_work, train_buf, sz_train)
        logging.debug("Backpropagation done")

        # Analytic gradients to be validated numerically below.
        dx.to_host(dx_arr)
        dhx.to_host(dhx_arr)
        dcx.to_host(dcx_arr)

        def forward():
            # Re-uploads the (possibly perturbed) host arrays and reruns
            # inference; serves as the function under numeric
            # differentiation.
            x.to_device_async(x_arr)
            hx.to_device_async(hx_arr)
            cx.to_device_async(cx_arr)
            self.cudnn.rnn_forward_inference(rnn, x_descs, x, h_desc, hx,
                                             h_desc, cx, weights_desc, weights,
                                             y_descs, y, h_desc, hy, h_desc,
                                             cy, work_buf, sz_work)
            y.to_host(y_arr)

        numdiff = NumDiff()

        logging.debug("Checking dx...")
        numdiff.check_diff(x_arr, y_arr, target, dx_arr, forward)

        logging.debug("Checking dhx...")
        numdiff.check_diff(hx_arr, y_arr, target, dhx_arr, forward)

        logging.debug("Checking dcx...")
        numdiff.check_diff(cx_arr, y_arr, target, dcx_arr, forward)

        logging.debug("EXIT: test_lstm")