def test_convolution_backward_data(self):
    """Exercise ``convolution_backward_data`` and expect a dense result.

    The destination (input-gradient) buffer starts zero-filled, while the
    filter and the back-propagated error are filled with 0.1, so every
    element read back after the call must be non-zero.

    When the reported cuDNN version is at least 4000, the algorithm and
    workspace-size queries are exercised as well and the convolution is
    repeated with an explicit algorithm and a 512 MiB workspace.
    """
    logging.debug("ENTER: test_convolution_backward_data")

    conv_desc = cudnn.ConvolutionDescriptor()
    conv_desc.set_2d(5, 5, 1, 1)

    # Destination of the backward pass; zero-filled on purpose.
    grad_host = numpy.zeros((100, 8, 96, 96), dtype=numpy.float32)
    grad_desc = cudnn.TensorDescriptor()
    grad_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                     *grad_host.shape)
    grad_dev = cu.MemAlloc(self.ctx, grad_host)

    kernel_host = numpy.full((64, 8, 11, 11), 0.1, dtype=numpy.float32)
    kernel_desc = cudnn.FilterDescriptor()
    kernel_desc.set_4d(cudnn.CUDNN_DATA_FLOAT, *kernel_host.shape)
    kernel_dev = cu.MemAlloc(self.ctx, kernel_host)

    err_host = numpy.full((100, 64, 96, 96), 0.1, dtype=numpy.float32)
    err_desc = cudnn.TensorDescriptor()
    err_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                    *err_host.shape)
    err_dev = cu.MemAlloc(self.ctx, err_host)

    scale_new = numpy.ones(1, dtype=numpy.float32)
    scale_old = numpy.zeros(1, dtype=numpy.float32)

    def expect_dense_result():
        # Read the device buffer back and require every element non-zero.
        grad_dev.to_host(grad_host)
        self.assertEqual(numpy.count_nonzero(grad_host), grad_host.size)

    self.cudnn.convolution_backward_data(
        scale_new, kernel_desc, kernel_dev, err_desc, err_dev,
        conv_desc, scale_old, grad_desc, grad_dev)
    expect_dense_result()

    if self.cudnn.version >= 4000:
        fastest = self.cudnn.get_convolution_backward_data_algorithm(
            kernel_desc, err_desc, conv_desc, grad_desc)
        logging.debug("Fastest algo is %d", fastest)

        needed = self.cudnn.get_convolution_backward_data_workspace_size(
            kernel_desc, err_desc, conv_desc, grad_desc, fastest)
        logging.debug("Workspace size for it is %d", needed)

        limit = 512 * 1024 * 1024
        limited = self.cudnn.get_convolution_backward_data_algorithm(
            kernel_desc, err_desc, conv_desc, grad_desc,
            cudnn.CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT,
            limit)
        logging.debug("With 512 Mb limit: %d", limited)

        scratch = cu.MemAlloc(self.ctx, limit)
        # Clear the destination so the second run is checked on its own.
        grad_dev.memset32_async()
        self.cudnn.convolution_backward_data(
            scale_new, kernel_desc, kernel_dev, err_desc, err_dev,
            conv_desc, scale_old, grad_desc, grad_dev,
            limited, scratch, scratch.size)
        expect_dense_result()

    logging.debug("EXIT: test_convolution_backward_data")
def _init_descriptors(self, include_out=False):
    """Build a fixed set of convolution-related descriptors.

    Parameters:
        include_out: when true, an output tensor descriptor sized by
            ``get_convolution_2d_forward_output_dim`` is appended.

    Returns:
        ``(conv, inp, filter)`` or, with ``include_out``,
        ``(conv, inp, filter, out)``.
    """
    conv_desc = cudnn.ConvolutionDescriptor()
    conv_desc.set_2d(5, 4, 2, 1)

    inp_desc = cudnn.TensorDescriptor()
    inp_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                    100, 8, 208, 224)

    # Named filter_desc rather than "filter" to avoid hiding the builtin.
    filter_desc = cudnn.FilterDescriptor()
    filter_desc.set_4d(cudnn.CUDNN_DATA_FLOAT, 64, 8, 11, 7)

    result = (conv_desc, inp_desc, filter_desc)
    if include_out:
        out_n, out_c, out_h, out_w = (
            cudnn.CUDNN.get_convolution_2d_forward_output_dim(
                conv_desc, inp_desc, filter_desc))
        out_desc = cudnn.TensorDescriptor()
        out_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                        out_n, out_c, out_h, out_w)
        result = result + (out_desc,)
    return result
def test_convolution_forward(self):
    """Exercise ``convolution_forward`` and expect a dense output.

    The input is filled with 0.1 and the filter with 0.3; the output buffer
    starts zero-filled, so every element read back after the call must be
    non-zero. The algorithm is chosen under a workspace limit equal to the
    size of the 512 MiB workspace that is allocated here.
    """
    logging.debug("ENTER: test_convolution_forward")

    conv_desc = cudnn.ConvolutionDescriptor()
    conv_desc.set_2d(5, 4, 2, 1)

    src_host = numpy.full((100, 8, 104, 112), 0.1, dtype=numpy.float32)
    src_desc = cudnn.TensorDescriptor()
    src_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                    *src_host.shape)
    src_dev = cu.MemAlloc(self.ctx, src_host)

    kernel_host = numpy.full((64, 8, 11, 7), 0.3, dtype=numpy.float32)
    kernel_desc = cudnn.FilterDescriptor()
    kernel_desc.set_4d(cudnn.CUDNN_DATA_FLOAT, *kernel_host.shape)
    kernel_dev = cu.MemAlloc(self.ctx, kernel_host)

    # Let the library report the output shape for this configuration.
    out_n, out_c, out_h, out_w = (
        cudnn.CUDNN.get_convolution_2d_forward_output_dim(
            conv_desc, src_desc, kernel_desc))
    dst_host = numpy.zeros((out_n, out_c, out_h, out_w),
                           dtype=numpy.float32)
    dst_desc = cudnn.TensorDescriptor()
    dst_desc.set_4d(cudnn.CUDNN_TENSOR_NCHW, cudnn.CUDNN_DATA_FLOAT,
                    *dst_host.shape)
    dst_dev = cu.MemAlloc(self.ctx, dst_host)

    scratch = cu.MemAlloc(self.ctx, 512 * 1024 * 1024)
    chosen = self.cudnn.get_convolution_forward_algorithm(
        src_desc, kernel_desc, conv_desc, dst_desc,
        cudnn.CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
        scratch.size)

    scale_new = numpy.ones(1, dtype=numpy.float32)
    scale_old = numpy.zeros(1, dtype=numpy.float32)
    self.cudnn.convolution_forward(
        scale_new, src_desc, src_dev, kernel_desc, kernel_dev,
        conv_desc, chosen, scratch, scratch.size,
        scale_old, dst_desc, dst_dev)

    dst_dev.to_host(dst_host)
    self.assertEqual(numpy.count_nonzero(dst_host), dst_host.size)

    logging.debug("EXIT: test_convolution_forward")
def test_rnn(self):
    """Smoke-test the RNN descriptor and the RNN forward/backward calls.

    Checks, in order:
      * default attribute values of a fresh ``RNNDescriptor``;
      * ``RNNDescriptor.set`` in short and full form, plus the
        ``num_linear_layers`` property for several modes;
      * workspace / training-reserve / parameter size queries;
      * linear-layer matrix and bias lookups for layer 0;
      * that forward inference, forward training, backward data and
        backward weights run, logging an averaged wall time for the
        repeated ones.

    No numerical results are verified here; the test only requires that
    the calls complete and that the queried sizes match expectations.

    The test is reported as skipped when the cuDNN version is below 5000.
    """
    if self.cudnn.version < 5000:
        # Report as skipped instead of silently passing with nothing run.
        self.skipTest("RNN tests require cuDNN version >= 5000")
    logging.debug("ENTER: test_rnn")

    drop = cudnn.DropoutDescriptor()
    drop_states = cu.MemAlloc(self.ctx, self.cudnn.dropout_states_size)
    self.cudnn.set_dropout_descriptor(drop, 0.5, drop_states,
                                      drop_states.size, 1234)

    # A freshly created descriptor must expose "unset" defaults.
    rnn = cudnn.RNNDescriptor()
    self.assertEqual(rnn.hidden_size, 0)
    self.assertEqual(rnn.num_layers, 0)
    self.assertIsNone(rnn.dropout_desc)
    self.assertEqual(rnn.input_mode, -1)
    self.assertEqual(rnn.direction, -1)
    self.assertEqual(rnn.mode, -1)
    self.assertEqual(rnn.data_type, -1)
    self.assertEqual(rnn.num_linear_layers, 0)

    batch_size = 4
    x = numpy.zeros(
        (batch_size, 32),  # minibatch, input size
        dtype=numpy.float32)
    numpy.random.seed(1234)
    x[:] = numpy.random.rand(x.size).reshape(x.shape) - 0.5
    x_desc = cudnn.TensorDescriptor()
    # Set input as 3-dimensional like in cudnn example:
    # minibatch, input_size, 1
    x_desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (x.shape[0], x.shape[1], 1))
    n_unroll = 16
    hidden_size = 64
    n_layers = 3

    def assert_values():
        # Reads whichever descriptor "rnn" is bound to at call time.
        self.assertEqual(rnn.hidden_size, hidden_size)
        self.assertEqual(rnn.num_layers, n_layers)
        self.assertIs(rnn.dropout_desc, drop)
        self.assertEqual(rnn.input_mode, cudnn.CUDNN_LINEAR_INPUT)
        self.assertEqual(rnn.direction, cudnn.CUDNN_UNIDIRECTIONAL)
        self.assertEqual(rnn.mode, cudnn.CUDNN_LSTM)
        self.assertEqual(rnn.data_type, cudnn.CUDNN_DATA_FLOAT)
        self.assertEqual(rnn.num_linear_layers, 8)

    # Short syntax
    rnn.set(hidden_size, n_layers, drop)
    assert_values()

    # Check num_linear_layers property
    for mode, n in ((cudnn.CUDNN_RNN_RELU, 2),
                    (cudnn.CUDNN_RNN_TANH, 2),
                    (cudnn.CUDNN_GRU, 6)):
        rnn = cudnn.RNNDescriptor()
        rnn.set(hidden_size, n_layers, drop, mode=mode)
        self.assertEqual(rnn.num_linear_layers, n)

    # Full syntax
    rnn = cudnn.RNNDescriptor()
    rnn.set(hidden_size, n_layers, drop,
            input_mode=cudnn.CUDNN_LINEAR_INPUT,
            direction=cudnn.CUDNN_UNIDIRECTIONAL,
            mode=cudnn.CUDNN_LSTM,
            data_type=cudnn.CUDNN_DATA_FLOAT)
    assert_values()

    def get_sz(func):
        # Descriptors are passed as a generator on purpose: one x_desc per
        # unrolled step.
        sz = func(rnn, (x_desc for _i in range(n_unroll)))
        self.assertIsInstance(sz, int)
        return sz

    sz_work = get_sz(self.cudnn.get_rnn_workspace_size)
    logging.debug("RNN workspace size for %s with %d unrolls is %d",
                  x.shape, n_unroll, sz_work)
    sz_train = get_sz(self.cudnn.get_rnn_training_reserve_size)
    logging.debug("RNN train size for %s with %d unrolls is %d",
                  x.shape, n_unroll, sz_train)

    sz_params = self.cudnn.get_rnn_params_size(rnn, x_desc)
    logging.debug("RNN params size for %s is %d", x.shape, sz_params)
    # The same query for double precision must report twice the size.
    x_desc2 = cudnn.TensorDescriptor()
    x_desc2.set_nd(cudnn.CUDNN_DATA_DOUBLE, (x.shape[0], x.shape[1], 1))
    sz_params2 = self.cudnn.get_rnn_params_size(rnn, x_desc2,
                                                cudnn.CUDNN_DATA_DOUBLE)
    self.assertEqual(sz_params2, sz_params * 2)

    params_desc = cudnn.FilterDescriptor()
    # sz_params is in bytes; >> 2 converts it to a float32 element count.
    params_desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (sz_params >> 2, 1, 1))
    params = cu.MemAlloc(self.ctx, sz_params)
    params.memset32_async()

    w_desc = cudnn.FilterDescriptor()
    w = self.cudnn.get_rnn_lin_layer_matrix_params(
        rnn, 0, x_desc, params_desc, params, 0, w_desc)
    logging.debug("Got matrix 0 of dimensions: %s, fmt=%d, sz=%d",
                  w_desc.dims, w_desc.fmt, w.size)
    self.assertEqual(w.size, hidden_size * x.shape[1] * 4)

    b_desc = cudnn.FilterDescriptor()
    b = self.cudnn.get_rnn_lin_layer_bias_params(
        rnn, 0, x_desc, params_desc, params, 0, b_desc)
    logging.debug("Got bias 0 of dimensions: %s, fmt=%d, sz=%d",
                  b_desc.dims, b_desc.fmt, b.size)
    self.assertEqual(b.size, hidden_size * 4)

    workspace = cu.MemAlloc(self.ctx, sz_work)
    x_buf = cu.MemAlloc(self.ctx, x.nbytes * n_unroll)
    for i in range(n_unroll):  # will feed the same input
        x_buf.to_device(x, x.nbytes * i, x.nbytes)
    # Buffer sizes below are in bytes: 4 bytes per float32 element.
    y_buf = cu.MemAlloc(self.ctx,
                        4 * hidden_size * batch_size * n_unroll)
    hx_buf = cu.MemAlloc(self.ctx,
                         4 * hidden_size * batch_size * n_layers)
    hx_buf.memset32_async()
    hy_buf = cu.MemAlloc(self.ctx,
                         4 * hidden_size * batch_size * n_layers)
    cx_buf = cu.MemAlloc(self.ctx,
                         4 * hidden_size * batch_size * n_layers)
    cx_buf.memset32_async()
    cy_buf = cu.MemAlloc(self.ctx,
                         4 * hidden_size * batch_size * n_layers)

    y_desc = cudnn.TensorDescriptor()
    y_desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (batch_size, hidden_size, 1))
    h_desc = cudnn.TensorDescriptor()
    h_desc.set_nd(cudnn.CUDNN_DATA_FLOAT,
                  (n_layers, batch_size, hidden_size))

    self.ctx.synchronize()
    logging.debug("Starting forward inference")
    for i in range(5):
        self.cudnn.rnn_forward_inference(
            rnn, (x_desc for _i in range(n_unroll)), x_buf,
            h_desc, hx_buf, h_desc, cx_buf, params_desc, params,
            (y_desc for _i in range(n_unroll)), y_buf,
            h_desc, hy_buf, h_desc, cy_buf, workspace, sz_work)
        if i == 0:
            # The first call is a warm-up; time the remaining four.
            self.ctx.synchronize()
            t0 = time.time()
    self.ctx.synchronize()
    logging.debug("Forward inference done in %.6f sec",
                  (time.time() - t0) / 4)

    train_space = cu.MemAlloc(self.ctx, sz_train)
    self.cudnn.rnn_forward_training(
        rnn, (x_desc for _i in range(n_unroll)), x_buf,
        h_desc, hx_buf, h_desc, cx_buf, params_desc, params,
        (y_desc for _i in range(n_unroll)), y_buf,
        h_desc, hy_buf, h_desc, cy_buf,
        workspace, sz_work, train_space, sz_train)
    self.ctx.synchronize()
    logging.debug("Forward training done")

    dy_buf = cu.MemAlloc(self.ctx,
                         4 * hidden_size * batch_size * n_unroll)
    # The forward output is reused as the incoming gradient.
    dy_buf.from_device_async(y_buf)
    dhy_buf = cu.MemAlloc(self.ctx,
                          4 * hidden_size * batch_size * n_layers)
    dhy_buf.memset32_async()
    dcy_buf = cu.MemAlloc(self.ctx,
                          4 * hidden_size * batch_size * n_layers)
    dcy_buf.memset32_async()
    dx_buf = cu.MemAlloc(self.ctx, x_buf.size)
    dhx_buf = cu.MemAlloc(self.ctx,
                          4 * hidden_size * batch_size * n_layers)
    dcx_buf = cu.MemAlloc(self.ctx,
                          4 * hidden_size * batch_size * n_layers)
    self.ctx.synchronize()
    logging.debug("Starting backpropagation")
    for i in range(5):
        self.cudnn.rnn_backward_data(
            rnn, (y_desc for _i in range(n_unroll)), y_buf,
            (y_desc for _i in range(n_unroll)), dy_buf,
            h_desc, dhy_buf, h_desc, dcy_buf, params_desc, params,
            h_desc, hx_buf, h_desc, cx_buf,
            (x_desc for _i in range(n_unroll)), dx_buf,
            h_desc, dhx_buf, h_desc, dcx_buf,
            workspace, sz_work, train_space, sz_train)
        if i == 0:
            self.ctx.synchronize()
            t0 = time.time()
    self.ctx.synchronize()
    logging.debug("Backpropagation done in %.6f sec",
                  (time.time() - t0) / 4)

    dw = cu.MemAlloc(self.ctx, params.size)
    logging.debug("Starting gradient computation")
    for i in range(5):
        self.cudnn.rnn_backward_weights(
            rnn, (x_desc for _i in range(n_unroll)), x_buf,
            h_desc, hx_buf, (y_desc for _i in range(n_unroll)), y_buf,
            workspace, sz_work, params_desc, dw, train_space, sz_train)
        if i == 0:
            self.ctx.synchronize()
            t0 = time.time()
    self.ctx.synchronize()
    logging.debug("Gradient computation done in %.6f sec",
                  (time.time() - t0) / 4)

    logging.debug("EXIT: test_rnn")
def test_filter_descriptor(self):
    """Check ``FilterDescriptor`` attributes after set/get calls.

    The 4-d setter is checked for both double and float data types. When
    the cuDNN version is at least 5000 the test additionally checks the
    default attribute values, ``get_4d`` / ``get_nd`` refreshing attributes
    that were overwritten by hand, and the explicit NHWC format argument.
    """
    logging.debug("ENTER: test_filter_descriptor")

    def expect_4d(desc, data_type, fmt, k, c, h, w):
        self.assertEqual(desc.data_type, data_type)
        self.assertEqual(desc.fmt, fmt)
        self.assertEqual(desc.k, k)
        self.assertEqual(desc.c, c)
        self.assertEqual(desc.h, h)
        self.assertEqual(desc.w, w)

    def expect_float_nchw_123(desc):
        self.assertEqual(desc.fmt, cudnn.CUDNN_TENSOR_NCHW)
        self.assertEqual(desc.data_type, cudnn.CUDNN_DATA_FLOAT)
        self.assertEqual(desc.dims, (1, 2, 3))

    desc = cudnn.FilterDescriptor()
    self.assertIsNotNone(desc.handle)
    for data_type in (cudnn.CUDNN_DATA_DOUBLE, cudnn.CUDNN_DATA_FLOAT):
        desc.set_4d(data_type, 64, 3, 11, 12)
        expect_4d(desc, data_type, cudnn.CUDNN_TENSOR_NCHW, 64, 3, 11, 12)

    if self.cudnn.version < 5000:
        logging.debug("EXIT: test_filter_descriptor")
        return

    # Defaults of an untouched descriptor.
    desc = cudnn.FilterDescriptor()
    expect_4d(desc, -1, -1, 0, 0, 0, 0)

    desc.set_4d(cudnn.CUDNN_DATA_FLOAT, 10, 5, 32, 16)
    expect_4d(desc, cudnn.CUDNN_DATA_FLOAT, cudnn.CUDNN_TENSOR_NCHW,
              10, 5, 32, 16)
    # Overwrite the stored attributes by hand; get_4d() must restore them.
    for attr, blank in (("_data_type", -1), ("_fmt", -1), ("_k", 0),
                        ("_c", 0), ("_h", 0), ("_w", 0)):
        setattr(desc, attr, blank)
    desc.get_4d()
    expect_4d(desc, cudnn.CUDNN_DATA_FLOAT, cudnn.CUDNN_TENSOR_NCHW,
              10, 5, 32, 16)

    desc = cudnn.FilterDescriptor()
    desc.set_4d(cudnn.CUDNN_DATA_FLOAT, 10, 5, 32, 16,
                cudnn.CUDNN_TENSOR_NHWC)
    self.assertEqual(desc.fmt, cudnn.CUDNN_TENSOR_NHWC)

    desc = cudnn.FilterDescriptor()
    self.assertEqual(len(desc.dims), 0)
    desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (1, 2, 3))
    expect_float_nchw_123(desc)
    # Same idea for the n-d form: overwrite, then re-read with get_nd().
    desc._data_type = -1
    desc._fmt = -1
    desc._dims = ()
    desc.get_nd(3)
    expect_float_nchw_123(desc)

    desc = cudnn.FilterDescriptor()
    desc.set_nd(cudnn.CUDNN_DATA_FLOAT, (1, 2, 3),
                cudnn.CUDNN_TENSOR_NHWC)
    self.assertEqual(desc.fmt, cudnn.CUDNN_TENSOR_NHWC)

    logging.debug("EXIT: test_filter_descriptor")
def test_lstm(self):
    """Check LSTM gradients from ``rnn_backward_data`` via ``NumDiff``.

    Runs forward training and backward data for a 3-layer LSTM unrolled
    over 5 steps with random weights, inputs and initial states, then hands
    the computed dx / dhx / dcx to ``NumDiff.check_diff`` together with a
    ``forward`` callback that re-runs inference.
    NOTE(review): ``NumDiff``, ``DTYPE``, ``CUTYPE`` and ``ITEMSIZE`` are
    defined elsewhere in the file; ``check_diff`` presumably compares
    against numerical differentiation -- confirm.

    The test is reported as skipped when the cuDNN version is below 5000.
    """
    if self.cudnn.version < 5000:
        # Report as skipped instead of silently passing with nothing run.
        self.skipTest("RNN tests require cuDNN version >= 5000")
    logging.debug("ENTER: test_lstm")

    drop = cudnn.DropoutDescriptor()
    drop_states = cu.MemAlloc(self.ctx, self.cudnn.dropout_states_size)
    # Dropout probability 0.0 is used for this gradient check.
    self.cudnn.set_dropout_descriptor(drop, 0.0, drop_states,
                                      drop_states.size, 1234)

    rnn = cudnn.RNNDescriptor()
    self.assertEqual(rnn.hidden_size, 0)
    self.assertEqual(rnn.num_layers, 0)
    self.assertIsNone(rnn.dropout_desc)
    self.assertEqual(rnn.input_mode, -1)
    self.assertEqual(rnn.direction, -1)
    self.assertEqual(rnn.mode, -1)
    self.assertEqual(rnn.data_type, -1)
    self.assertEqual(rnn.num_linear_layers, 0)

    batch_size = 8
    x_arr = numpy.zeros(
        (batch_size, 16),  # minibatch, input size
        dtype=DTYPE)
    numpy.random.seed(1234)
    x_arr[:] = numpy.random.rand(x_arr.size).reshape(x_arr.shape) - 0.5
    x_desc = cudnn.TensorDescriptor()
    # Set input as 3-dimensional like in cudnn example.
    x_desc.set_nd(CUTYPE, (x_arr.shape[0], x_arr.shape[1], 1))
    n_unroll = 5
    hidden_size = 16
    n_layers = 3

    def assert_values():
        self.assertEqual(rnn.hidden_size, hidden_size)
        self.assertEqual(rnn.num_layers, n_layers)
        self.assertIs(rnn.dropout_desc, drop)
        self.assertEqual(rnn.input_mode, cudnn.CUDNN_LINEAR_INPUT)
        self.assertEqual(rnn.direction, cudnn.CUDNN_UNIDIRECTIONAL)
        self.assertEqual(rnn.mode, cudnn.CUDNN_LSTM)
        self.assertEqual(rnn.data_type, CUTYPE)
        self.assertEqual(rnn.num_linear_layers, 8)

    # Full syntax
    rnn = cudnn.RNNDescriptor()
    rnn.set(hidden_size, n_layers, drop,
            input_mode=cudnn.CUDNN_LINEAR_INPUT,
            direction=cudnn.CUDNN_UNIDIRECTIONAL,
            mode=cudnn.CUDNN_LSTM,
            data_type=CUTYPE)
    assert_values()

    # One descriptor per unrolled step.
    x_descs = tuple(x_desc for _i in range(n_unroll))

    def get_sz(func):
        sz = func(rnn, x_descs)
        self.assertIsInstance(sz, int)
        return sz

    sz_work = get_sz(self.cudnn.get_rnn_workspace_size)
    logging.debug("RNN workspace size for %s with %d unrolls is %d",
                  x_arr.shape, n_unroll, sz_work)
    sz_train = get_sz(self.cudnn.get_rnn_training_reserve_size)
    logging.debug("RNN train size for %s with %d unrolls is %d",
                  x_arr.shape, n_unroll, sz_train)

    sz_weights = self.cudnn.get_rnn_params_size(rnn, x_desc)
    logging.debug("RNN weights size for %s is %d",
                  x_arr.shape, sz_weights)
    # Expected byte size: first layer term plus (n_layers - 1) terms whose
    # input width equals hidden_size.
    sz_expected = ITEMSIZE * (
        4 * (x_arr.shape[1] + hidden_size + 2) * hidden_size +
        4 * (hidden_size + hidden_size + 2) * hidden_size *
        (n_layers - 1))
    self.assertEqual(sz_weights, sz_expected)

    weights_desc = cudnn.FilterDescriptor()
    weights_desc.set_nd(CUTYPE, (sz_weights // ITEMSIZE, 1, 1))
    weights = cu.MemAlloc(self.ctx, sz_weights)
    weights_arr = numpy.random.rand(sz_weights // ITEMSIZE).astype(DTYPE)
    weights_arr -= 0.5
    weights_arr *= 0.1
    weights.to_device(weights_arr)

    w_desc = cudnn.FilterDescriptor()
    w = self.cudnn.get_rnn_lin_layer_matrix_params(
        rnn, 0, x_desc, weights_desc, weights, 0, w_desc)
    logging.debug("Got matrix 0 of dimensions: %s, fmt=%d, sz=%d",
                  w_desc.dims, w_desc.fmt, w.size)
    self.assertEqual(w.size, hidden_size * x_arr.shape[1] * ITEMSIZE)

    b_desc = cudnn.FilterDescriptor()
    b = self.cudnn.get_rnn_lin_layer_bias_params(
        rnn, 0, x_desc, weights_desc, weights, 0, b_desc)
    logging.debug("Got bias 0 of dimensions: %s, fmt=%d, sz=%d",
                  b_desc.dims, b_desc.fmt, b.size)
    self.assertEqual(b.size, hidden_size * ITEMSIZE)

    work_buf = cu.MemAlloc(self.ctx, sz_work)
    work_buf.memset32_async()
    x = cu.MemAlloc(self.ctx, x_arr.nbytes * n_unroll)
    for i in range(n_unroll):  # will feed the same input
        x.to_device(x_arr, x_arr.nbytes * i, x_arr.nbytes)
    y_arr = numpy.zeros((n_unroll, batch_size, hidden_size), dtype=DTYPE)
    y = cu.MemAlloc(self.ctx, y_arr)
    hx_arr = numpy.zeros((n_layers, batch_size, hidden_size), dtype=DTYPE)
    hx_arr[:] = numpy.random.rand(hx_arr.size).reshape(hx_arr.shape)
    hx_arr -= 0.5
    hx = cu.MemAlloc(self.ctx, hx_arr)
    hy = cu.MemAlloc(self.ctx, hx.size)
    hy.memset32_async()
    cx_arr = numpy.zeros((n_layers, batch_size, hidden_size), dtype=DTYPE)
    cx_arr[:] = numpy.random.rand(cx_arr.size).reshape(cx_arr.shape)
    cx_arr -= 0.5
    cx = cu.MemAlloc(self.ctx, cx_arr)
    cy = cu.MemAlloc(self.ctx, cx.size)
    cy.memset32_async()

    y_desc = cudnn.TensorDescriptor()
    y_desc.set_nd(CUTYPE, (batch_size, hidden_size, 1))
    y_descs = tuple(y_desc for _i in range(n_unroll))
    h_desc = cudnn.TensorDescriptor()
    h_desc.set_nd(CUTYPE, (n_layers, batch_size, hidden_size))

    train_buf = cu.MemAlloc(self.ctx, sz_train)
    train_buf.memset32_async()
    self.cudnn.rnn_forward_training(
        rnn, x_descs, x, h_desc, hx, h_desc, cx,
        weights_desc, weights, y_descs, y, h_desc, hy, h_desc, cy,
        work_buf, sz_work, train_buf, sz_train)
    self.ctx.synchronize()
    logging.debug("Forward training done")

    y.to_host(y_arr)
    target = numpy.random.rand(y_arr.size).reshape(y_arr.shape).astype(
        y_arr.dtype) - 0.5
    dy_arr = y_arr - target

    dy = cu.MemAlloc(self.ctx, dy_arr)
    dhy = cu.MemAlloc(self.ctx, hx.size)
    dhy.memset32_async()
    dcy = cu.MemAlloc(self.ctx, cx.size)
    dcy.memset32_async()
    dx_arr = numpy.zeros_like(x_arr)
    # rnn_backward_data receives one x descriptor per unrolled step, so the
    # dx buffer must be as large as x (n_unroll steps). Sizing it from
    # dx_arr (a single step) would let the library write past its end.
    dx = cu.MemAlloc(self.ctx, x.size)
    dhx_arr = numpy.zeros_like(hx_arr)
    dhx = cu.MemAlloc(self.ctx, dhx_arr)
    dcx_arr = numpy.zeros_like(cx_arr)
    dcx = cu.MemAlloc(self.ctx, dcx_arr)

    self.cudnn.rnn_backward_data(
        rnn, y_descs, y, y_descs, dy, h_desc, dhy, h_desc, dcy,
        weights_desc, weights, h_desc, hx, h_desc, cx,
        x_descs, dx, h_desc, dhx, h_desc, dcx,
        work_buf, sz_work, train_buf, sz_train)
    logging.debug("Backpropagation done")

    # Only the first step's gradient is read back: dx_arr has the size of
    # one step, matching forward() which rewrites only the first step of x.
    # NOTE(review): assumes to_host() copies dx_arr.nbytes from the start
    # of the buffer, as to_device_async(x_arr) does for x -- confirm.
    dx.to_host(dx_arr)
    dhx.to_host(dhx_arr)
    dcx.to_host(dcx_arr)

    def forward():
        # Upload the (possibly perturbed) host arrays, then recompute y.
        x.to_device_async(x_arr)
        hx.to_device_async(hx_arr)
        cx.to_device_async(cx_arr)
        self.cudnn.rnn_forward_inference(
            rnn, x_descs, x, h_desc, hx, h_desc, cx,
            weights_desc, weights, y_descs, y, h_desc, hy, h_desc, cy,
            work_buf, sz_work)
        y.to_host(y_arr)

    numdiff = NumDiff()

    logging.debug("Checking dx...")
    numdiff.check_diff(x_arr, y_arr, target, dx_arr, forward)

    logging.debug("Checking dhx...")
    numdiff.check_diff(hx_arr, y_arr, target, dhx_arr, forward)

    logging.debug("Checking dcx...")
    numdiff.check_diff(cx_arr, y_arr, target, dcx_arr, forward)

    logging.debug("EXIT: test_lstm")