def gpu_availability():
    """
    Detect GPU on the user's system

    :return: Whether at least one CUDA-compatible GPU is detected and usable
    :rtype: bool
    :History: 2018-Apr-25 - Written - Henry Leung (University of Toronto)
    """
    # assume if using tensorflow-gpu, then Nvidia GPU is available
    if is_built_with_cuda():
        return is_gpu_available()
    else:
        return is_built_with_cuda()
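# A minimal standalone sketch of the same check against TensorFlow's public
# API; tf.test.is_built_with_cuda() and tf.config.list_physical_devices()
# exist in current releases, while the helper name has_usable_gpu is
# illustrative rather than part of astroNN.
import tensorflow as tf

def has_usable_gpu() -> bool:
    # Only probe for a GPU when the installed build was compiled with CUDA support.
    if tf.test.is_built_with_cuda():
        return len(tf.config.list_physical_devices("GPU")) > 0
    return False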
def _testTypes(self, vals):
    for dtype in [np.float32, np.float64, np.int32, np.int64]:
        x = np.zeros(vals.shape).astype(dtype)
        y = vals.astype(dtype)
        var_value, op_value = self._initAssignFetch(x, y, use_gpu=False)
        self.assertAllEqual(y, var_value)
        self.assertAllEqual(y, op_value)
        var_value, op_value = self._initAssignAddFetch(x, y, use_gpu=False)
        self.assertAllEqual(x + y, var_value)
        self.assertAllEqual(x + y, op_value)
        var_value, op_value = self._initAssignSubFetch(x, y, use_gpu=False)
        self.assertAllEqual(x - y, var_value)
        self.assertAllEqual(x - y, op_value)
        if test.is_built_with_cuda() and dtype in [np.float32, np.float64]:
            var_value, op_value = self._initAssignFetch(x, y, use_gpu=True)
            self.assertAllEqual(y, var_value)
            self.assertAllEqual(y, op_value)
            var_value, op_value = self._initAssignAddFetch(x, y, use_gpu=True)
            self.assertAllEqual(x + y, var_value)
            self.assertAllEqual(x + y, op_value)
            var_value, op_value = self._initAssignSubFetch(x, y, use_gpu=False)
            self.assertAllEqual(x - y, var_value)
            self.assertAllEqual(x - y, op_value)
def testBuildInfo(self):
    self.assertEqual(build_info.build_info['is_rocm_build'],
                     test.is_built_with_rocm())
    self.assertEqual(build_info.build_info['is_cuda_build'],
                     test.is_built_with_cuda())
    self.assertEqual(build_info.build_info['is_tensorrt_build'],
                     is_tensorrt_enabled())
def gpu_memory_manage(ratio=None, log_device_placement=False):
    """
    Manage GPU memory usage and prevent TensorFlow from pre-occupying all of the video RAM

    :param ratio: Optional, ratio of GPU memory to pre-allocate to astroNN
    :type ratio: Union[NoneType, float]
    :param log_device_placement: whether or not to log the device placement
    :type log_device_placement: bool
    :History: 2017-Nov-25 - Written - Henry Leung (University of Toronto)
    """
    config = tf.compat.v1.ConfigProto()
    if ratio is None:
        config.gpu_options.allow_growth = True
    else:
        if is_built_with_cuda():
            if ratio <= 0. or ratio > 1.:
                print(f"Invalid ratio argument -> ratio: {ratio}, it has been reset to ratio=1.0")
                ratio = 1.
            config.gpu_options.per_process_gpu_memory_fraction = ratio
        elif isinstance(ratio, float):
            warnings.warn("You have set a GPU memory limit in the astroNN config file "
                          "but you are not using Tensorflow-GPU!")
    config.log_device_placement = log_device_placement
    if tf.compat.v1.get_default_session() is not None:
        warnings.warn("A Tensorflow session in use is detected, "
                      "astroNN will use that session to prevent overwriting it!")
    else:
        # Set global _SESSION for tensorflow to use with astroNN CPU/GPU settings
        tf.compat.v1.Session(config=config).__enter__()  # register it as the tensorflow default session
    return None
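# A hedged sketch of the TF 2.x equivalent of the allow_growth branch above,
# using tf.config.experimental.set_memory_growth (this call exists in current
# TensorFlow releases); it is shown for context and is not part of astroNN.
import tensorflow as tf

for gpu in tf.config.list_physical_devices("GPU"):
    # Same effect as config.gpu_options.allow_growth = True in ConfigProto
    tf.config.experimental.set_memory_growth(gpu, True)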
def testInvalidLabel(self):
    features = [[1., 1., 1., 1.], [1., 1., 1., 1.], [1., 2., 3., 4.],
                [1., 2., 3., 4.]]
    labels = [4, 3, 0, -1]

    if test.is_built_with_cuda() and test.is_gpu_available():
        with self.session(use_gpu=True) as sess:
            loss, backprop = (
                gen_nn_ops.sparse_softmax_cross_entropy_with_logits(
                    features, labels))
            tf_loss, tf_backprop = sess.run([loss, backprop])
            self.assertAllClose(
                [[np.nan] * 4, [0.25, 0.25, 0.25, -0.75],
                 [-0.968, 0.087, 0.237, 0.6439], [np.nan] * 4],
                tf_backprop,
                rtol=1e-3,
                atol=1e-3)
            self.assertAllClose(
                [np.nan, 1.3862, 3.4420, np.nan], tf_loss, rtol=1e-3, atol=1e-3)

    with self.session(use_gpu=False) as sess:
        loss, backprop = (
            gen_nn_ops.sparse_softmax_cross_entropy_with_logits(features, labels))
        with self.assertRaisesOpError("Received a label value of"):
            sess.run([loss, backprop])
def testInvalidLabel(self):
    features = [[1., 1., 1., 1.], [1., 1., 1., 1.], [1., 2., 3., 4.],
                [1., 2., 3., 4.]]
    labels = [4, 3, 0, -1]

    if test.is_built_with_cuda() and test.is_gpu_available():
        with self.session(use_gpu=True) as sess:
            loss, backprop = (
                gen_nn_ops.sparse_softmax_cross_entropy_with_logits(
                    features, labels))
            tf_loss, tf_backprop = self.evaluate([loss, backprop])
            self.assertAllClose(
                [[np.nan] * 4, [0.25, 0.25, 0.25, -0.75],
                 [-0.968, 0.087, 0.237, 0.6439], [np.nan] * 4],
                tf_backprop,
                rtol=1e-3,
                atol=1e-3)
            self.assertAllClose([np.nan, 1.3862, 3.4420, np.nan],
                                tf_loss,
                                rtol=1e-3,
                                atol=1e-3)

    with self.session(use_gpu=False) as sess:
        loss, backprop = (
            gen_nn_ops.sparse_softmax_cross_entropy_with_logits(
                features, labels))
        with self.assertRaisesOpError("Received a label value of"):
            self.evaluate([loss, backprop])
class CudnnRNNTestBasic(TensorFlowTestCase):

    @unittest.skipUnless(test.is_built_with_cuda(),
                         "Test only applicable when running on GPUs")
    def testLayerBasic(self):
        num_layers = 4
        num_units = 2
        batch_size = 8
        direction = CUDNN_RNN_UNIDIRECTION
        dir_count = 1

        with vs.variable_scope("main"):
            kernel_initializer = init_ops.constant_initializer(0.)
            bias_initializer = init_ops.constant_initializer(0.)
            inputs = random_ops.random_uniform(
                [num_layers * dir_count, batch_size, num_units],
                dtype=dtypes.float32)

            lstm = cudnn_rnn.CudnnLSTM(num_layers, num_units,
                                       direction=direction,
                                       kernel_initializer=kernel_initializer,
                                       bias_initializer=bias_initializer,
                                       name="awesome_lstm")

            # Build the layer
            outputs1, _ = lstm(inputs)
            # Reuse the layer
            outputs2, _ = lstm(inputs)

            total_sum1 = math_ops.reduce_sum(outputs1)
            total_sum2 = math_ops.reduce_sum(outputs2)

        with vs.variable_scope("main", reuse=True):
            lstm = cudnn_rnn.CudnnLSTM(num_layers, num_units,
                                       direction=direction,
                                       kernel_initializer=kernel_initializer,
                                       bias_initializer=bias_initializer,
                                       name="awesome_lstm")

            # Reuse the layer
            outputs3, _ = lstm(inputs)
            total_sum3 = math_ops.reduce_sum(outputs3)

        self.assertEqual(1, len(variables.trainable_variables()))
        self.assertEqual(
            1, len(ops.get_collection(ops.GraphKeys.SAVEABLE_OBJECTS)))
        self.assertEqual("main/awesome_lstm/opaque_kernel",
                         variables.trainable_variables()[0].op.name)

        with self.test_session(use_gpu=True) as sess:
            sess.run(variables.global_variables_initializer())
            (total_sum1_v, total_sum2_v, total_sum3_v) = sess.run(
                [total_sum1, total_sum2, total_sum3])
            self.assertEqual(0, total_sum1_v)
            self.assertEqual(0, total_sum2_v)
            self.assertEqual(0, total_sum3_v)
def testListLocalDevices(self):
    devices = device_lib.list_local_devices()
    self.assertGreater(len(devices), 0)
    self.assertEqual(devices[0].device_type, "CPU")

    # GPU test
    if test.is_built_with_cuda():
        self.assertGreater(len(devices), 1)
        self.assertTrue("GPU" in [d.device_type for d in devices])
def testBuildInfo(self):
    self.assertEqual(build_info.build_info['is_rocm_build'],
                     test.is_built_with_rocm())
    self.assertEqual(build_info.build_info['is_cuda_build'],
                     test.is_built_with_cuda())

    # TODO(b/173044576): make the test work for Windows.
    if platform.system() != 'Windows':
        # pylint: disable=g-import-not-at-top
        from tensorflow.compiler.tf2tensorrt._pywrap_py_utils import is_tensorrt_enabled
        self.assertEqual(build_info.build_info['is_tensorrt_build'],
                         is_tensorrt_enabled())
class CudnnRNNTestParamsSize(TensorFlowTestCase):

    def _TestOpaqueParamsSize(self, rnn_mode, num_layers, num_units, input_size,
                              direction):
        logging.info("Testing one lstm param size with config: %s", locals())
        dtype = dtypes.float32

        model = CudnnTestModel(
            rnn_mode,
            num_layers,
            num_units,
            input_size,
            dtype=dtype,
            direction=direction)
        rnn = model.rnn

        # Min param size estimate = sum(weights.size) + sum(biases.size)
        min_params_size = (
            np.sum(map(np.prod, rnn.canonical_weight_shapes)) +
            np.sum([sp[0] for sp in rnn.canonical_bias_shapes]))

        opaque_params = rnn.trainable_variables[0]
        with self.test_session(use_gpu=True, graph=ops.get_default_graph()):
            variables.global_variables_initializer().run()
            opaque_params_size_v = opaque_params.eval().size
            self.assertLessEqual(min_params_size, opaque_params_size_v)

    @unittest.skipUnless(test.is_built_with_cuda(),
                         "Test only applicable when running on GPUs")
    def testOpaqueParamsSize(self):
        test_configs = [[4, 200, 200], [4, 200, 300], [4, 200, 100],
                        [1, 100, 200], [2, 200, 100], [3, 200, 400]]
        directions = [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION]
        rnns = [CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_RELU, CUDNN_RNN_TANH]
        for (rnn, config, direction) in itertools.product(rnns, test_configs,
                                                          directions):
            num_layers, num_units, input_size = config
            with ops.Graph().as_default():
                self._TestOpaqueParamsSize(rnn, num_layers, num_units, input_size,
                                           direction)
class CudnnRNNTestParamsSize(TensorFlowTestCase):

    def _testOneLSTMParamsSize(self, num_layers, num_units, input_size,
                               direction):
        logging.info("Testing one lstm param size with config: %s", locals())
        min_params_size = _MinLSTMParamSize(num_layers, num_units, input_size,
                                            direction)
        model = _CreateModel(cudnn_rnn_ops.CUDNN_LSTM, num_layers, num_units,
                             input_size, direction=direction)
        params_size = model.params_size()
        with self.test_session(use_gpu=True,
                               graph=ops.get_default_graph()) as sess:
            params_size_v = sess.run(params_size)
            self.assertLessEqual(min_params_size, params_size_v)

    @unittest.skipUnless(test.is_built_with_cuda(),
                         "Test only applicable when running on GPUs")
    def testLSTMParamsSize(self):
        test_configs = [[4, 200, 200], [4, 200, 300], [4, 200, 100],
                        [1, 100, 200], [2, 200, 100], [3, 200, 400]]
        directions = [cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION,
                      cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION]
        for (config, direction) in itertools.product(test_configs, directions):
            num_layers, num_units, input_size = config
            with ops.Graph().as_default():
                self._testOneLSTMParamsSize(num_layers, num_units, input_size,
                                            direction)
class CudnnRNNTestTraining(TensorFlowTestCase):

    def _ComputeNumericGrad(self, sess, y, x, delta=1e-4, step=1):
        """Compute the numeric gradient of y wrt to x.

        Args:
          sess: The TF session constructed with a graph containing x and y.
          y: A scalar TF Tensor in the graph constructed in sess.
          x: A TF Tensor in the graph constructed in sess.
          delta: Gradient checker's small perturbation of x[i].
          step: Only compute numerical gradients for a subset of x values.
            I.e. dy/dx[i] is computed if i % step == 0.

        Returns:
          A Tensor of the same shape and dtype as x. If x[i] is not chosen to
          compute the numerical gradient dy/x[i], the corresponding value is
          set to 0.
        """
        x_data = sess.run(x)
        x_size = x_data.size
        x_shape = x_data.shape

        numeric_grad = np.zeros(x_size, dtype=x_data.dtype)

        for i in range(0, x_size, step):
            x_pos = x_data.copy()
            if x_size == 1:
                x_pos += delta
            else:
                x_pos.flat[i] += delta
            y_pos_feed_dict = dict([(x.name, x_pos)])
            y_pos = sess.run(y, feed_dict=y_pos_feed_dict)

            x_neg = x_data.copy()
            if x_size == 1:
                x_neg -= delta
            else:
                x_neg.flat[i] -= delta
            y_neg_feed_dict = dict([(x.name, x_neg)])
            y_neg = sess.run(y, feed_dict=y_neg_feed_dict)

            numeric_grad[i] = (y_pos - y_neg) / (2 * delta)
        return numeric_grad.reshape(x_shape)

    def _GetShape(self, sess, inputs):
        if not isinstance(inputs, collections.Iterable):
            return sess.run(array_ops.shape(inputs))
        else:
            return sess.run([array_ops.shape(x) for x in inputs])

    def _GradientCheckFp16(self, sess, y, xs, num_samples,
                           tolerance=1e-6, delta=1e-4):
        """Gradient check for Fp16.

        Fp16 numerical gradients end up being zeros. Use a new way to check
        gradients:

        Given a multi-variate function:
          y = f(x1, x2, ..., xn)
          delta_y = f(x1 + delta_x1, x2 + delta_x2, ..., xn + delta_xn) -
                    f(x1, x2, ..., xn)
                  = f'(x1) * delta_x1 + f'(x2) * delta_x2 + ... + f'(xn) * delta_xn
        where:
          delta_xi are very small disturbances.
          f'(xi) is the gradient of y w.r.t xi.

        The gradient check verifies that the expected delta_y calculated by the
        above equation is close to the actual delta_y.

        Args:
          sess: tf.Session object.
          y: output tensor.
          xs: a tensor or a list of input tensors.
          num_samples: number of test samples to run.
          tolerance: error tolerance.
          delta: the order of magnitude of the input disturbance applied to
            calculate the output change w.r.t inputs.
        """
        sym_grads = self._ComputeSymGrads(sess, y, xs)
        xs_shapes = self._GetShape(sess, xs)

        x_vals = [sess.run(x) for x in xs]
        for _ in range(num_samples):
            delta_xs = [delta * np.random.rand(*shape.tolist())
                        for shape in xs_shapes]

            feed_dict = {}
            for x, x_val, delta_x in zip(xs, x_vals, delta_xs):
                feed_dict[x] = x_val + delta_x
            actual_delta_y = (float(sess.run(y, feed_dict=feed_dict)) -
                              float(sess.run(y)))

            expected_delta_y = 0.
            for sym_grad, delta_x in zip(sym_grads, delta_xs):
                expected_delta_y += np.dot(
                    sym_grad.astype(np.float32).flatten(),
                    delta_x.astype(np.float32).flatten())
            self.assertAllClose(expected_delta_y, actual_delta_y,
                                atol=tolerance, rtol=tolerance)

    def _GradientCheck(self, sess, y, xs, tolerance=1e-6, delta=1e-4):
        sym_grads = self._ComputeSymGrads(sess, y, xs)

        num_grads = [self._ComputeNumericGrad(sess, y, x, delta) for x in xs]
        self.assertEqual(len(sym_grads), len(num_grads))
        for sym, num in zip(sym_grads, num_grads):
            self.assertFalse(np.any(np.isnan(sym)))
            self.assertFalse(np.any(np.isnan(num)))
            self.assertAllClose(sym, num, atol=tolerance, rtol=tolerance)

    def _ComputeSymGrads(self, sess, y, xs):
        sym_grads_t = gradients.gradients(y, xs)
        return sess.run(sym_grads_t)

    def _TestOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
                               batch_size, seq_length, dir_count, dropout, dtype,
                               delta, tolerance):
        # Gradient checking runs two forward ops with almost the same input.
        # Need to make sure the drop patterns across the two runs are the same.
        logging.info("Training test with config: %s", locals())
        old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
        os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)

        np.random.seed(1234)
        random_seed.set_random_seed(5678)
        has_input_c = (rnn_mode == CUDNN_LSTM)
        direction = (CUDNN_RNN_UNIDIRECTION
                     if dir_count == 1 else CUDNN_RNN_BIDIRECTION)
        model = CudnnTestModel(
            rnn_mode,
            num_layers,
            num_units,
            input_size,
            direction=direction,
            dropout=dropout,
            dtype=dtype,
            training=True,
            bias_initializer=init_ops.random_normal_initializer(
                mean=1., dtype=dtype))
        rnn = model.rnn
        params = rnn.trainable_variables[0]

        inputs = variables.Variable(
            random_ops.random_uniform([seq_length, batch_size, input_size],
                                      dtype=dtype),
            dtype=dtype)
        input_h = variables.Variable(
            random_ops.random_uniform(
                [num_layers * dir_count, batch_size, num_units], dtype=dtype),
            dtype=dtype)
        if has_input_c:
            input_c = variables.Variable(
                random_ops.random_uniform(
                    [num_layers * dir_count, batch_size, num_units], dtype=dtype),
                dtype=dtype)
            initial_state = (input_h, input_c)
        else:
            initial_state = (input_h,)
        total_sum = model.FProp(inputs, initial_state, training=True)

        with self.test_session(use_gpu=True,
                               graph=ops.get_default_graph()) as sess:
            sess.run(variables.global_variables_initializer())
            all_inputs = [inputs, params]
            for s in initial_state:
                all_inputs.append(s)

            if dtype == dtypes.float16:
                self._GradientCheckFp16(
                    sess, total_sum, all_inputs,
                    num_samples=FLAGS.grad_check_num_samples,
                    tolerance=tolerance, delta=delta)
            else:
                for _ in range(FLAGS.grad_check_num_samples):
                    # Each time choose a different set of inputs.
                    sess.run(variables.global_variables_initializer())
                    self._GradientCheck(
                        sess, total_sum, all_inputs,
                        tolerance=tolerance, delta=delta)
            os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state

    def _TestSimpleTrainingHelper(self, rnn_mode, test_configs):
        dropouts = [0, 0.5, 1.]
for config, dropout in itertools.product(test_configs, dropouts): dtype = config.get("dtype", dtypes.float32) delta = config.get("delta", 1e-4) tolerance = config.get("tolerance", 1e-6) dir_count = config.get("dir_count", 1) shape = config["shape"] with ops.Graph().as_default(): self._TestOneSimpleTraining(rnn_mode, shape["num_layers"], shape["num_units"], shape["input_size"], shape["batch_size"], shape["seq_length"], dir_count, dropout, dtype, delta, tolerance) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSimpleTrainingLSTMFp64(self): test_configs = [ { "dtype": dtypes.float64, "tolerance": 5e-6, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, }, }, ] self._TestSimpleTrainingHelper(CUDNN_LSTM, test_configs) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSimpleTrainingLSTMFp32(self): test_configs = [ { "dtype": dtypes.float32, "delta": 1e-4, "tolerance": 9e-2, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, }, }, ] self._TestSimpleTrainingHelper(CUDNN_LSTM, test_configs) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSimpleTrainingLSTMFp16(self): test_configs = [ { "dtype": dtypes.float16, "delta": 1e-3, "tolerance": 9e-2, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, }, }, { "dtype": dtypes.float16, "delta": 1e-2, "tolerance": 9e-2, "shape": { "num_layers": 2, "num_units": 6, "input_size": 8, "batch_size": 6, "seq_length": 4, }, }, ] self._TestSimpleTrainingHelper(CUDNN_LSTM, test_configs) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSimpleTrainingGRUFp64(self): test_configs = [ { "dtype": dtypes.float64, "tolerance": 5e-6, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, } }, ] self._TestSimpleTrainingHelper(CUDNN_GRU, test_configs) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSimpleTrainingGRUFp32(self): test_configs = [ { "dtype": dtypes.float32, "delta": 1e-3, "tolerance": 4e-3, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, }, }, ] self._TestSimpleTrainingHelper(CUDNN_GRU, test_configs) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSimpleTrainingGRUFp16(self): test_configs = [ { "dtype": dtypes.float16, "delta": 2e-3, "tolerance": 6e-2, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, }, }, ] self._TestSimpleTrainingHelper(CUDNN_GRU, test_configs) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSimpleTrainingRNNTanhFp64(self): test_configs = [ { "dtype": dtypes.float64, "tolerance": 5e-6, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, }, }, ] self._TestSimpleTrainingHelper(CUDNN_RNN_TANH, test_configs) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSimpleTrainingRNNTanhFp32(self): test_configs = [ { "dtype": dtypes.float32, "delta": 1e-3, "tolerance": 5e-3, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, }, }, ] self._TestSimpleTrainingHelper(CUDNN_RNN_TANH, test_configs) 
@unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSimpleTrainingRNNTanhFp16(self): test_configs = [ { "dtype": dtypes.float16, "delta": 1e-3, "tolerance": 5e-2, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, }, }, ] self._TestSimpleTrainingHelper(CUDNN_RNN_TANH, test_configs) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSimpleTrainingRNNReluFp64(self): test_configs = [ { "dtype": dtypes.float64, "tolerance": 5e-6, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, }, }, ] self._TestSimpleTrainingHelper(CUDNN_RNN_RELU, test_configs) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSimpleTrainingRNNReluFp32(self): test_configs = [ { "dtype": dtypes.float32, "delta": 1e-4, "tolerance": 3e-1, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, }, }, ] self._TestSimpleTrainingHelper(CUDNN_RNN_RELU, test_configs) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSimpleTrainingRNNReluFp16(self): test_configs = [ { "dtype": dtypes.float16, "delta": 1e-3, "tolerance": 7e-2, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, }, }, ] self._TestSimpleTrainingHelper(CUDNN_RNN_RELU, test_configs)
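# The directional-derivative check described in the _GradientCheckFp16
# docstring earlier in CudnnRNNTestTraining can be reproduced with a tiny
# NumPy-only sketch; the function f and the values below are illustrative
# and are not part of the test file.
import numpy as np

def f(x):
    return float(np.sum(x ** 2))          # scalar function y = f(x)

x = np.array([1.0, 2.0, 3.0])
sym_grad = 2 * x                           # analytic gradient f'(x)
delta_x = 1e-4 * np.random.rand(3)         # small disturbance of the inputs

# expected_delta_y = f'(x) . delta_x should match the actual change in y
actual_delta_y = f(x + delta_x) - f(x)
expected_delta_y = float(np.dot(sym_grad, delta_x))
assert np.isclose(actual_delta_y, expected_delta_y, rtol=1e-3)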
class CudnnRNNTestCompatibleRNNCells(TensorFlowTestCase): @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testCudnnCompatibleLSTM(self): self._TestCudnnCompatibleRnnCellsHelper(CUDNN_LSTM) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testCudnnCompatibleGRU(self): self._TestCudnnCompatibleRnnCellsHelper(CUDNN_GRU) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testCudnnCompatibleRNNTanh(self): self._TestCudnnCompatibleRnnCellsHelper(CUDNN_RNN_TANH) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testCudnnCompatibleRNNRelu(self): self._TestCudnnCompatibleRnnCellsHelper(CUDNN_RNN_RELU) def _TestCudnnCompatibleRnnCellsHelper(self, rnn_mode): configs = [ { "num_layers": 1, "seq_length": 3, "num_units": 4, "input_size": 5, "batch_size": 6, }, { "num_layers": 2, "seq_length": 8, "num_units": 4, "input_size": 8, "batch_size": 16, }, { "num_layers": 2, "seq_length": 3, "num_units": 4, "input_size": 5, "batch_size": 6, }, { "num_layers": 1, "seq_length": 2, "num_units": 2, "input_size": 4, "batch_size": 1, }, ] directions = [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION] for cfg, direction in zip(configs, directions): self._TestCudnnCompatibleRnnCells(cfg["num_layers"], cfg["seq_length"], cfg["num_units"], cfg["input_size"], cfg["batch_size"], rnn_mode, direction) def _TestCudnnCompatibleRnnCells(self, num_layers, seq_length, num_units, input_size, batch_size, rnn_mode, direction): dtype = dtypes.float32 # Train graph with ops.Graph().as_default() as g: model = CudnnTestModel( rnn_mode, num_layers, num_units, input_size, direction=direction, dtype=dtype, training=True) target_output = array_ops.placeholder(dtype=dtype) loss_op = losses.log_loss( labels=target_output, predictions=model.total_sum) optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1e-2) train_op = optimizer.minimize(loss_op) saver = saver_lib.Saver() # Train Cudnn model seed = 0 with self.test_session(use_gpu=True, graph=g) as sess: sess.run(variables.global_variables_initializer()) # Train 128 steps num_steps = 128 for _ in range(num_steps): inputs, _ = model.SynthesizeInput(seq_length, batch_size, seed) targets = np.random.rand() sess.run( train_op, feed_dict={ model.inputs: inputs, model.initial_state: model.ZeroState(batch_size), target_output: targets }) seed += 1 save_path = os.path.join(self.get_temp_dir(), ("cudnn-rnn-%s-test" % rnn_mode)) save_v = saver.save(sess, save_path) self.assertEqual(save_path, save_v) # Cudnn inference graph with ops.Graph().as_default() as g: model = CudnnTestModel( rnn_mode, num_layers, num_units, input_size, direction=direction, dtype=dtype, training=False) rnn = model.rnn saver = saver_lib.Saver() inference_input = np.random.rand(seq_length, batch_size, input_size).astype(np.float32) with self.test_session(use_gpu=True, graph=g) as sess: sess.run(variables.global_variables_initializer()) saver.restore(sess, save_path) # Cudnn inference cudnn_outputs_v, cudnn_output_states_v = model.Feed( sess, inference_input, return_sum=False) # Canonical RNN inference graph with ops.Graph().as_default() as g: cell_inputs = array_ops.placeholder( dtype, shape=[seq_length, batch_size, input_size]) if direction == CUDNN_RNN_UNIDIRECTION: # outputs is one tensor, states are num_layer tuples, each 2 tensors (outputs, states) = _CreateCudnnCompatibleCanonicalRNN(rnn, cell_inputs) if rnn_mode == 
CUDNN_LSTM: output_h = array_ops.stack([s.h for s in states]) output_c = array_ops.stack([s.c for s in states]) else: output_state = array_ops.stack([s for s in states]) else: # outputs is one tensor. # states is a tuple of 2 tuples: # each sub tuple is num_layer tuples, each with 2 tensors. (outputs, states) = _CreateCudnnCompatibleCanonicalRNN( rnn, cell_inputs, is_bidi=True) output_state_fw, output_state_bw = states if rnn_mode == CUDNN_LSTM: output_h, output_c = [], [] for s_fw, s_bw in zip(output_state_fw, output_state_bw): output_h.append(array_ops.stack([s_fw.h, s_bw.h])) output_c.append(array_ops.stack([s_fw.c, s_bw.c])) output_h = array_ops.concat(output_h, axis=0) output_c = array_ops.concat(output_c, axis=0) else: output_state = [] for s_fw, s_bw in zip(output_state_fw, output_state_bw): output_state.append(array_ops.stack([s_fw, s_bw])) output_state = array_ops.concat(output_state, axis=0) saver = saver_lib.Saver() with self.test_session(use_gpu=True, graph=g) as sess: saver.restore(sess, save_path) # BlockCell inference if rnn_mode == CUDNN_LSTM: outputs_v, output_h_v, output_c_v = sess.run( [outputs, output_h, output_c], feed_dict={cell_inputs: inference_input}) self.assertAllClose(cudnn_outputs_v, outputs_v) cudnn_output_h_v, cudnn_output_c_v = cudnn_output_states_v self.assertAllClose(cudnn_output_h_v, output_h_v) self.assertAllClose(cudnn_output_c_v, output_c_v) else: outputs_v, output_state_v = sess.run( [outputs, output_state], feed_dict={cell_inputs: inference_input}) self.assertAllClose(cudnn_outputs_v, outputs_v, atol=2e-5, rtol=2e-5) (cudnn_output_h_v,) = cudnn_output_states_v self.assertAllClose(cudnn_output_h_v, output_state_v, atol=2e-5, rtol=2e-5)
class CudnnRNNTestSaveRestore(TensorFlowTestCase): def _CompareWeights(self, lhs, rhs): self.assertEqual(len(lhs), len(rhs)) for lw, rw in zip(lhs, rhs): self.assertAllEqual(lw, rw) def _CompareBiases(self, lhs, rhs, rnn_mode, num_layers, direction): self.assertEqual(len(lhs), len(rhs)) if rnn_mode == CUDNN_LSTM: num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER elif rnn_mode == CUDNN_GRU: num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER elif rnn_mode == CUDNN_RNN_TANH: num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER else: num_params_per_layer = CUDNN_RNN_RELU_PARAMS_PER_LAYER num_dirs = 1 if direction == CUDNN_RNN_UNIDIRECTION else 2 num_params_per_layer *= num_dirs self.assertEqual(num_params_per_layer * num_layers, len(lhs)) for i in range(num_layers): layer_lhs = lhs[i * num_params_per_layer: (i+1) * num_params_per_layer] layer_rhs = rhs[i * num_params_per_layer: (i+1) * num_params_per_layer] if direction == CUDNN_RNN_UNIDIRECTION: self._CompareSingleLayerBiases(layer_lhs, layer_rhs) else: size = len(layer_lhs) fw_lhs, bw_lhs = layer_lhs[:size//2], layer_lhs[size//2:] fw_rhs, bw_rhs = layer_rhs[:size//2], layer_rhs[size//2:] self._CompareSingleLayerBiases(fw_lhs, fw_rhs) self._CompareSingleLayerBiases(bw_lhs, bw_rhs) def _CompareSingleLayerBiases(self, lhs, rhs): self.assertEqual(len(lhs), len(rhs)) lf_lhs, rt_lhs = lhs[:len(lhs)//2], lhs[len(lhs)//2:] lf_rhs, rt_rhs = rhs[:len(rhs)//2], rhs[len(rhs)//2:] self.assertEqual(len(lf_lhs), len(rt_lhs)) self.assertEqual(len(lf_rhs), len(rt_rhs)) sum_lhs, sum_rhs = [], [] for lf, rt in zip(lf_lhs, rt_lhs): sum_lhs.append(lf + rt) for lf, rt in zip(lf_rhs, rt_rhs): sum_rhs.append(lf + rt) self.assertEqual(len(sum_lhs), len(sum_rhs)) for lf, rt in zip(sum_lhs, sum_rhs): self.assertAllEqual(lf, rt) def _TestSaveRestoreVariable(self, rnn_mode, direction, dtype): input_size = 3 num_layers = 2 num_units = 7 with ops.Graph().as_default() as g: random_seed.set_random_seed(1234) model = CudnnTestModel( rnn_mode, num_layers, num_units, input_size, direction=direction, dtype=dtype) rnn = model.rnn save_path = os.path.join(self.get_temp_dir(), "save-restore-variable-test") saver = saver_lib.Saver() weights, biases = model.rnn.saveable._OpaqueParamsToCanonical() opaque_params = rnn.trainable_variables[0] # CudnnTestModel() creates CudnnOpaqueParamsSaveable that helps saver save # Cudnn vars in canonical format. reset_op = state_ops.assign( opaque_params, array_ops.zeros(array_ops.shape(opaque_params), dtype=dtype)) # Passing graph explicitly, otherwise an old sess would be reused. 
with self.test_session(use_gpu=True, graph=g) as sess: sess.run(variables.global_variables_initializer()) val = saver.save(sess, save_path) self.assertEqual(save_path, val) weights_v, biases_v = sess.run([weights, biases]) # Reset opaque param sess.run(reset_op) saver.restore(sess, save_path) weights_v_restored, biases_v_restored = sess.run([weights, biases]) self._CompareWeights(weights_v, weights_v_restored) self._CompareBiases(biases_v, biases_v_restored, rnn_mode, num_layers, direction) def _TestSaveRestoreTwoVariables(self, rnn_mode, direction, dtype): input_size = 3 num_layers = 2 num_units = 7 with ops.Graph().as_default() as g: random_seed.set_random_seed(1234) with vs.variable_scope("m1"): model1 = CudnnTestModel( rnn_mode, num_layers, num_units, input_size, direction=direction, dtype=dtype) with vs.variable_scope("m2"): model2 = CudnnTestModel( rnn_mode, num_layers, num_units, input_size, direction=direction, dtype=dtype) opaque_params = (model1.rnn.trainable_variables[0], model2.rnn.trainable_variables[0]) weights1, biases1 = model1.rnn.saveable._OpaqueParamsToCanonical() weights2, biases2 = model2.rnn.saveable._OpaqueParamsToCanonical() reset_params = [ state_ops.assign(params, array_ops.zeros_like(params, dtype=dtype)) for params in opaque_params ] reset_op = control_flow_ops.group(*reset_params) save_path = os.path.join(self.get_temp_dir(), "save-restore-variable-test2") saver = saver_lib.Saver() # Passing graph explicitly, otherwise an old sess would be reused. with self.test_session(use_gpu=True, graph=g) as sess: sess.run(variables.global_variables_initializer()) val = saver.save(sess, save_path) self.assertEqual(save_path, val) weights1_v, biases1_v = sess.run([weights1, biases1]) weights2_v, biases2_v = sess.run([weights2, biases2]) sess.run(reset_op) saver.restore(sess, save_path) weights1_v_restored, biases1_v_restored = sess.run([weights1, biases1]) weights2_v_restored, biases2_v_restored = sess.run([weights2, biases2]) self._CompareWeights(weights1_v, weights1_v_restored) self._CompareWeights(weights2_v, weights2_v_restored) self._CompareBiases(biases1_v, biases1_v_restored, rnn_mode, num_layers, direction) self._CompareBiases(biases2_v, biases2_v_restored, rnn_mode, num_layers, direction) def _TestSaveRestoreOutput(self, rnn_mode, direction, dtype): with ops.Graph().as_default() as g: num_layers = 2 num_units = 7 input_size = 7 seq_length = 8 batch_size = 4 model = CudnnTestModel( rnn_mode, num_layers, num_units, input_size, direction=direction, dtype=dtype, training=False) rnn = model.rnn save_path = os.path.join(self.get_temp_dir(), "save-restore-output-test") saver = saver_lib.Saver() # Only one opaque var in a cudnn layer. assert len(rnn.trainable_variables) == 1 reset_params = state_ops.assign( rnn.trainable_variables[0], array_ops.zeros( array_ops.shape(rnn.trainable_variables[0]), dtype=dtype)) # Passing graph explicitly, otherwise an old sess would be reused. 
with self.test_session(use_gpu=True, graph=g) as sess: sess.run(variables.global_variables_initializer()) inputs, initial_state = model.SynthesizeInput(seq_length, batch_size) total_sum_v = model.Feed(sess, inputs, initial_state) val = saver.save(sess, save_path) self.assertEqual(save_path, val) sess.run(reset_params) saver.restore(sess, save_path) total_sum_v_restored = model.Feed(sess, inputs, initial_state) self.assertAllClose(total_sum_v, total_sum_v_restored, atol=1e-5) def _TestSaveRestoreHelper(self, rnn_mode): directions = [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION] dtype_list = [dtypes.float16, dtypes.float32, dtypes.float64] for direction, dtype in itertools.product(directions, dtype_list): self._TestSaveRestoreVariable(rnn_mode, direction, dtype) self._TestSaveRestoreTwoVariables(rnn_mode, direction, dtype) self._TestSaveRestoreOutput(rnn_mode, direction, dtype) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSaveRestoreRepeatedlyCreateCustomSaveable(self): input_size = 3 num_layers = 2 num_units = 7 with ops.Graph().as_default(): random_seed.set_random_seed(1234) model = CudnnTestModel( CUDNN_LSTM, num_layers, num_units, input_size, direction=CUDNN_RNN_UNIDIRECTION, dtype=dtypes.float32) with self.assertRaisesRegexp(RuntimeError, "Cudnn saveable already created"): model.rnn._create_saveable() @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSaveRestoreLSTM(self): self._TestSaveRestoreHelper(CUDNN_LSTM) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSaveRestoreGRU(self): self._TestSaveRestoreHelper(CUDNN_GRU) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSaveRestoreRNNTanh(self): self._TestSaveRestoreHelper(CUDNN_RNN_TANH) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSaveRestoreRNNRelu(self): self._TestSaveRestoreHelper(CUDNN_RNN_RELU)
class CudnnRNNTestBasic(TensorFlowTestCase): @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testLayerBasic(self): num_layers = 4 num_units = 2 batch_size = 8 direction = CUDNN_RNN_UNIDIRECTION dir_count = 1 with vs.variable_scope("main"): kernel_initializer = init_ops.constant_initializer(0.) bias_initializer = init_ops.constant_initializer(0.) inputs = random_ops.random_uniform([ num_layers * dir_count, batch_size, num_units], dtype=dtypes.float32) lstm = cudnn_rnn.CudnnLSTM(num_layers, num_units, direction=direction, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, name="awesome_lstm") # Build the layer outputs1, _ = lstm(inputs) # Reuse the layer outputs2, _ = lstm(inputs) total_sum1 = math_ops.reduce_sum(outputs1) total_sum2 = math_ops.reduce_sum(outputs2) with vs.variable_scope("main", reuse=True): lstm = cudnn_rnn.CudnnLSTM(num_layers, num_units, direction=direction, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, name="awesome_lstm") # Reuse the layer outputs3, _ = lstm(inputs) total_sum3 = math_ops.reduce_sum(outputs3) self.assertEqual(1, len(variables.trainable_variables())) self.assertEqual(1, len(ops.get_collection(ops.GraphKeys.SAVEABLE_OBJECTS))) self.assertEqual("main/awesome_lstm/opaque_kernel", variables.trainable_variables()[0].op.name) with self.test_session(use_gpu=True) as sess: sess.run(variables.global_variables_initializer()) (total_sum1_v, total_sum2_v, total_sum3_v) = sess.run( [total_sum1, total_sum2, total_sum3]) self.assertEqual(0, total_sum1_v) self.assertEqual(0, total_sum2_v) self.assertEqual(0, total_sum3_v) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testOptimizersSupport(self): for opt in ("adagrad", "adam", "rmsprop", "momentum", "sgd"): self._TestOptimizerSupportHelper(opt) def _GetOptimizer(self, opt): if opt == "adagrad": return adagrad.AdagradOptimizer(learning_rate=1e-2) elif opt == "adam": return adam.AdamOptimizer(learning_rate=1e-2) elif opt == "rmsprop": return rmsprop.RMSPropOptimizer(learning_rate=1e-2) elif opt == "momentum": return momentum.MomentumOptimizer(learning_rate=1e-2, momentum=0.9) elif opt == "sgd": return gradient_descent.GradientDescentOptimizer(learning_rate=1e-2) else: raise ValueError("Unsupported optimizer: %s" % opt) def _TestOptimizerSupportHelper(self, opt): num_layers = 4 num_units = 2 batch_size = 8 direction = CUDNN_RNN_UNIDIRECTION dir_count = 1 with ops.Graph().as_default() as g: kernel_initializer = init_ops.constant_initializer(0.) bias_initializer = init_ops.constant_initializer(0.) 
inputs = random_ops.random_uniform([ num_layers * dir_count, batch_size, num_units], dtype=dtypes.float32) lstm = cudnn_rnn.CudnnLSTM(num_layers, num_units, direction=direction, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, name="awesome_lstm") outputs, _ = lstm(inputs) loss = math_ops.reduce_sum(outputs) optimizer = self._GetOptimizer(opt) train_op = optimizer.minimize(loss) with self.test_session(use_gpu=True, graph=g) as sess: sess.run(variables.global_variables_initializer()) sess.run(train_op) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSaveableGraphDeviceAssignment(self): num_layers = 4 num_units = 2 batch_size = 8 direction = CUDNN_RNN_UNIDIRECTION dir_count = 1 def DeviceFn(op): if op.type in ("Variable", "VariableV2"): return "/cpu:0" else: return "/gpu:0" with ops.Graph().as_default() as g: with ops.device(DeviceFn): with vs.variable_scope("main"): kernel_initializer = init_ops.constant_initializer(3.14) bias_initializer = init_ops.constant_initializer(1.59) inputs = random_ops.random_uniform( [num_layers * dir_count, batch_size, num_units], dtype=dtypes.float32) lstm = cudnn_rnn.CudnnLSTM(num_layers, num_units, direction=direction, kernel_initializer=kernel_initializer, bias_initializer=bias_initializer, name="awesome_lstm") outputs = lstm(inputs) # saver is created in the scope of DeviceFn. saver = saver_lib.Saver() with self.test_session(use_gpu=True, graph=g) as sess: save_path = os.path.join(self.get_temp_dir(), "test-saveable-device-assignment") sess.run(variables.global_variables_initializer()) saver.save(sess, save_path) saver.restore(sess, save_path) sess.run(outputs) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testDifferentShapesEager(self): # Checks that kernel caching does not cause sharing of temporary storage # across different input shapes when executing eagerly. with context.eager_mode(): with ops.device("gpu:0"): first_output, _ = cudnn_rnn.CudnnGRU(1, 100)( array_ops.zeros([28, 100, 28])) second_output, _ = cudnn_rnn.CudnnGRU(1, 100)( array_ops.zeros([28, 100, 100])) self.assertAllEqual([28, 100, 100], first_output.shape) self.assertAllEqual([28, 100, 100], second_output.shape) def _LossFunc(): first_output, _ = cudnn_rnn.CudnnGRU(1, 100)( array_ops.zeros([28, 100, 28])) second_output, _ = cudnn_rnn.CudnnGRU(1, 100)( array_ops.zeros([28, 100, 100])) return (math_ops.reduce_sum(first_output) + math_ops.reduce_sum(second_output)) backprop.implicit_grad(_LossFunc)() @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testDifferentShapesGraph(self): # Tests that a single kernel instance presented with multiple input shapes # does not crash with graph execution. 
with ops.device("gpu:0"): layer = cudnn_rnn.CudnnGRU(1, 100) layer(array_ops.zeros([28, 100, 100])) def _Cond(index, accumulation): del accumulation # unused return math_ops.less(index, 4) def _Body(index, accumulation): layer_input = accumulation[:, :, 10 * (1 + index % 2):] output, _ = layer(layer_input) return index + 1, accumulation + output original_input = array_ops.zeros([28, 100, 100]) _, accumulation = control_flow_ops.while_loop(_Cond, _Body, [0, original_input]) grad, = gradients.gradients( math_ops.reduce_sum(accumulation), (original_input,)) init_op = variables.global_variables_initializer() with self.test_session() as sess: sess.run(init_op) accumulation_eval, grad_eval = sess.run((accumulation, grad)) self.assertAllEqual([28, 100, 100], accumulation_eval.shape) self.assertAllEqual([28, 100, 100], grad_eval.shape)
class CudnnRNNTest(TensorFlowTestCase): def _CreateModel(self, rnn_mode, num_layers, num_units, input_size, input_mode="linear_input", dropout=0.): if rnn_mode == "lstm": model = cudnn_rnn_ops.CudnnLSTM(num_layers, num_units, input_size, dropout=dropout) elif rnn_mode == "gru": model = cudnn_rnn_ops.CudnnGRU(num_layers, num_units, input_size, dropout=dropout) elif rnn_mode == "rnn_tanh": model = cudnn_rnn_ops.CudnnRNNTanh(num_layers, num_units, input_size, dropout=dropout) elif rnn_mode == "rnn_relu": model = cudnn_rnn_ops.CudnnRNNRelu(num_layers, num_units, input_size, dropout=dropout) else: raise ValueError("Invalid rnn_mode: %s" % rnn_mode) return model def _create_params_savable(self, params, model): """Create a RNNParamsSaveable for the weight and bias parameters. Args: params: a Variable for weight and bias parameters. model: a CudnnRNN model. """ params_saveable = cudnn_rnn_ops.RNNParamsSaveable( model.params_to_canonical, model.canonical_to_params, [params]) ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, params_saveable) def _testSaveRestoreVariable(self, rnn_mode): model = self._CreateModel(rnn_mode, num_layers=2, num_units=7, input_size=3) random_seed.set_random_seed(1234) params_size_t = model.params_size() params = variables.Variable(random_ops.random_uniform([params_size_t]), validate_shape=False) self._create_params_savable(params, model) save_path = os.path.join(self.get_temp_dir(), "save-restore-variable-test") saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) with self.test_session(use_gpu=True) as sess: sess.run(variables.global_variables_initializer()) params_v = sess.run(params) val = saver.save(sess, save_path) self.assertEqual(save_path, val) with self.test_session(use_gpu=True) as sess: reset_params = state_ops.assign(params, array_ops.zeros([params_size_t])) sess.run(reset_params) saver.restore(sess, save_path) params_v_restored = sess.run(params) self.assertAllEqual(params_v, params_v_restored) def _testSaveRestoreOutput(self, rnn_mode): num_layers = 2 num_units = 7 input_size = 7 seq_length = 10 batch_size = 5 dir_count = 1 model = self._CreateModel(rnn_mode, num_layers, num_units, input_size) params_size_t = model.params_size() params = variables.Variable(array_ops.ones([params_size_t]), validate_shape=False) self._create_params_savable(params, model) save_path = os.path.join(self.get_temp_dir(), "save-restore-output-test") saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) has_input_c = (rnn_mode == "lstm") input_data = array_ops.ones([seq_length, batch_size, input_size]) input_h = array_ops.ones( [num_layers * dir_count, batch_size, num_units]) if has_input_c: input_c = array_ops.ones( [num_layers * dir_count, batch_size, num_units]) outputs = model(input_data=input_data, input_h=input_h, input_c=input_c, params=params, is_training=False) else: outputs = model(input_data=input_data, input_h=input_h, params=params, is_training=False) total_sum = sum(map(math_ops.reduce_sum, outputs)) with self.test_session(use_gpu=True) as sess: sess.run(variables.global_variables_initializer()) total_sum_v = sess.run(total_sum) val = saver.save(sess, save_path) self.assertEqual(save_path, val) with self.test_session(use_gpu=True) as sess: reset_params = state_ops.assign(params, array_ops.zeros([params_size_t])) sess.run(reset_params) saver.restore(sess, save_path) total_sum_v_restored = sess.run(total_sum) self.assertAllEqual(total_sum_v, total_sum_v_restored) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on 
GPUs") def testSaveRestore(self): rnn_modes = ["lstm", "gru", "rnn_tanh", "rnn_relu"] for rnn_mode in rnn_modes: self._testSaveRestoreVariable(rnn_mode) self._testSaveRestoreOutput(rnn_mode) def _MinLSTMParamSize(self, num_layers, num_units, input_size, input_mode="auto_select", direction="unidirection"): if direction != "unidirection": # TODO(zhengxq): support bidirection in parameter size estimate. raise ValueError("Only unidirection in parameter size estimate") first_layer_weights = 4 * num_units * (num_units + input_size) higher_layer_weights = 8 * (num_layers - 1) * num_units * num_units all_biases = 8 * num_layers * num_units return first_layer_weights + higher_layer_weights + all_biases def _testOneLSTMParamsSize(self, num_layers, num_units, input_size): min_params_size = self._MinLSTMParamSize(num_layers, num_units, input_size) model = self._CreateModel("lstm", num_layers, num_units, input_size) params_size = model.params_size() with self.test_session(use_gpu=True) as sess: params_size_v = sess.run(params_size) self.assertLessEqual(min_params_size, params_size_v) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testLSTMParamsSize(self): test_configs = [ [4, 200, 200], [4, 200, 300], [4, 200, 100], [1, 100, 200], [2, 200, 100], [3, 200, 400], ] with ops.Graph().as_default(): for (num_layers, num_units, input_size) in test_configs: self._testOneLSTMParamsSize(num_layers, num_units, input_size) def _testOneSimpleInference(self, rnn_mode, num_layers, num_units, input_size, batch_size, seq_length, dir_count, dropout, expected, tolerance): random_seed.set_random_seed(5678) model = self._CreateModel(rnn_mode, num_layers, num_units, input_size, input_mode="auto_select", dropout=dropout) has_input_c = (rnn_mode == "lstm") params_size_t = model.params_size() input_data = array_ops.ones([seq_length, batch_size, input_size]) input_h = array_ops.ones( [num_layers * dir_count, batch_size, num_units]) params = variables.Variable(array_ops.ones([params_size_t]), validate_shape=False) if has_input_c: input_c = array_ops.ones( [num_layers * dir_count, batch_size, num_units]) output, output_h, output_c = model(input_data=input_data, input_h=input_h, input_c=input_c, params=params, is_training=False) else: output, output_h = model(input_data=input_data, input_h=input_h, params=params, is_training=False) output_sum = math_ops.reduce_sum(output) output_h_sum = math_ops.reduce_sum(output_h) total_sum = output_sum + output_h_sum if has_input_c: output_c_sum = math_ops.reduce_sum(output_c) total_sum += output_c_sum with self.test_session(use_gpu=True) as sess: sess.run(variables.global_variables_initializer()) total_sum_v = sess.run([total_sum]) self.assertAllClose(total_sum_v[0], expected, atol=tolerance, rtol=tolerance) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSimpleInference(self): # Cudnn scales result for dropout during training, therefore dropout has no # impact for inference results. # (lstm, gru, rnn_tanh are saturated in the test. rnn_relu case is most # demonstrative of the dropout-invariant nature of CudnnRnn.) 
test_configs = [ { "rnn_mode": "lstm", "dropout": [0., 0.5, 1.], "expected": 231833.22, "tolerance": 1e-2, "shape": { "num_layers": 4, "num_units": 200, "input_size": 200, "batch_size": 20, "seq_length": 10, "dir_count": 1, }, }, { "rnn_mode": "gru", "dropout": [0., 0.5, 1.], "expected": 56000, "tolerance": 1e-2, "shape": { "num_layers": 4, "num_units": 200, "input_size": 200, "batch_size": 20, "seq_length": 10, "dir_count": 1, }, }, { "rnn_mode": "rnn_tanh", "dropout": [0., 0.5, 1.], "expected": 56000, "tolerance": 1e-2, "shape": { "num_layers": 4, "num_units": 200, "input_size": 200, "batch_size": 20, "seq_length": 10, "dir_count": 1, }, }, { "rnn_mode": "rnn_relu", "dropout": [0., 0.5, 1.], "expected": 130688, "tolerance": 1e-2, "shape": { "num_layers": 2, "num_units": 8, "input_size": 4, "batch_size": 4, "seq_length": 2, "dir_count": 1, }, }, ] with ops.Graph().as_default(): for config in test_configs: rnn_mode = config["rnn_mode"] dropout_list = config.get("dropout", [0.]) expected = config["expected"] tolerance = config["tolerance"] shape = config["shape"] for dropout in dropout_list: self._testOneSimpleInference( rnn_mode, shape["num_layers"], shape["num_units"], shape["input_size"], shape["batch_size"], shape["seq_length"], shape["dir_count"], dropout, expected, tolerance) def _testOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size, batch_size, seq_length, dir_count, dropout, tolerance): # Gradient checking runs two forward ops with almost the same input. Need to # make sure the drop patterns across the two runs are the same. old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False)) os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True) has_input_c = (rnn_mode == "lstm") random_seed.set_random_seed(1234) model = self._CreateModel(rnn_mode, num_layers, num_units, input_size, dropout=dropout) params_size_t = model.params_size() input_data = variables.Variable( random_ops.random_uniform([seq_length, batch_size, input_size])) input_h = variables.Variable( random_ops.random_uniform( [num_layers * dir_count, batch_size, num_units])) params = variables.Variable(random_ops.random_uniform([params_size_t]), validate_shape=False) if has_input_c: input_c = variables.Variable( random_ops.random_uniform( [num_layers * dir_count, batch_size, num_units])) output, output_h, output_c = model(input_data=input_data, input_h=input_h, input_c=input_c, params=params) else: output, output_h = model(input_data=input_data, input_h=input_h, params=params) output_sum = math_ops.reduce_sum(output) output_h_sum = math_ops.reduce_sum(output_h) total_sum = output_sum + output_h_sum if has_input_c: output_c_sum = math_ops.reduce_sum(output_c) total_sum += output_c_sum with self.test_session(use_gpu=True) as sess: params_size_v = sess.run(params_size_t) inputs_and_shapes = [ (input_data, [seq_length, batch_size, input_size]), (input_h, [num_layers * dir_count, batch_size, num_units]), (params, [params_size_v]), ] if has_input_c: inputs_and_shapes.append( (input_c, [num_layers * dir_count, batch_size, num_units ]), ) sess.run(variables.global_variables_initializer()) all_inputs = [entry[0] for entry in inputs_and_shapes] all_shapes = [entry[1] for entry in inputs_and_shapes] err = gradient_checker.compute_gradient_error( all_inputs, all_shapes, total_sum, [1]) self.assertLess(err, tolerance) os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSimpleTraining(self): test_configs 
= [ { "rnn_mode": "lstm", "dropout": [0., 0.5, 1.], "tolerance": 1e-2, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, "dir_count": 1, }, }, { "rnn_mode": "gru", "dropout": [0., 0.5, 1.], "tolerance": 4e-3, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, "dir_count": 1, }, }, { "rnn_mode": "rnn_tanh", "dropout": [0., 0.5, 1.], "tolerance": 5e-3, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, "dir_count": 1, }, }, { "rnn_mode": "rnn_relu", "dropout": [0., 0.5, 1.], "tolerance": 4e-1, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, "dir_count": 1, }, }, ] with ops.Graph().as_default(): for config in test_configs: rnn_mode = config["rnn_mode"] dropout_list = config.get("dropout", [0.]) tolerance = config["tolerance"] shape = config["shape"] for dropout in dropout_list: self._testOneSimpleTraining( rnn_mode, shape["num_layers"], shape["num_units"], shape["input_size"], shape["batch_size"], shape["seq_length"], shape["dir_count"], dropout, tolerance)
class SoftmaxTest(test.TestCase):

    def _npSoftmax(self, features, dim=-1, log=False):
        if dim is -1:
            dim = len(features.shape) - 1
        one_only_on_dim = list(features.shape)
        one_only_on_dim[dim] = 1
        is_fp16 = features.dtype == np.float16
        if is_fp16:
            # Do the compute in fp32 and cast the input back to fp32.
            features = features.astype(np.float32)
        e = np.exp(features - np.reshape(
            np.amax(features, axis=dim), one_only_on_dim))
        softmax = e / np.reshape(np.sum(e, axis=dim), one_only_on_dim)
        if log:
            res = np.log(softmax)
        else:
            res = softmax
        if is_fp16:
            res = res.astype(np.float16)
        return res

    def _testSoftmax(self, np_features, dim=-1, log=False, use_gpu=False):
        # A previous version of the code checked the op name rather than the op
        # type to distinguish between log and non-log. Use an arbitrary name to
        # catch this bug in future.
        name = "arbitrary"
        np_softmax = self._npSoftmax(np_features, dim=dim, log=log)
        with self.test_session(use_gpu=use_gpu):
            if log:
                tf_softmax = nn_ops.log_softmax(np_features, axis=dim, name=name)
            else:
                tf_softmax = nn_ops.softmax(np_features, axis=dim, name=name)
            out = tf_softmax.eval()
            self.assertAllCloseAccordingToType(np_softmax, out)
            self.assertShapeEqual(np_softmax, tf_softmax)
            if not log:
                # Bonus check: the softmaxes should add to one in dimension dim.
                sum_along_dim = np.sum(out, axis=dim)
                self.assertAllCloseAccordingToType(
                    np.ones(sum_along_dim.shape), sum_along_dim)

    def _testAll(self, features):
        self._testSoftmax(features, use_gpu=True)
        self._testSoftmax(features, log=True, use_gpu=True)
        self._testOverflow(use_gpu=True)

    def testNpSoftmax(self):
        features = [[1., 1., 1., 1.], [1., 2., 3., 4.]]
        # Batch 0: All exps are 1.  The expected result is
        # Softmaxes = [0.25, 0.25, 0.25, 0.25]
        # LogSoftmaxes = [-1.386294, -1.386294, -1.386294, -1.386294]
        #
        # Batch 1:
        # exps = [1., 2.718, 7.389, 20.085]
        # sum = 31.192
        # Softmaxes = exps / sum = [0.0320586, 0.08714432, 0.23688282, 0.64391426]
        # LogSoftmaxes = [-3.44019, -2.44019, -1.44019, -0.44019]
        np_sm = self._npSoftmax(np.array(features))
        self.assertAllClose(
            np.array([[0.25, 0.25, 0.25, 0.25],
                      [0.0320586, 0.08714432, 0.23688282, 0.64391426]]),
            np_sm,
            rtol=1.e-5,
            atol=1.e-5)
        np_lsm = self._npSoftmax(np.array(features), log=True)
        self.assertAllClose(
            np.array([[-1.386294, -1.386294, -1.386294, -1.386294],
                      [-3.4401897, -2.4401897, -1.4401897, -0.4401897]]),
            np_lsm,
            rtol=1.e-5,
            atol=1.e-5)

    def _testOverflow(self, use_gpu=False):
        if use_gpu:
            type = np.float32  # pylint: disable=redefined-builtin
        else:
            type = np.float64  # pylint: disable=redefined-builtin
        max = np.finfo(type).max  # pylint: disable=redefined-builtin
        features = np.array([[1., 1., 1., 1.], [max, 1., 2., 3.]]).astype(type)
        with self.test_session(use_gpu=use_gpu):
            tf_log_softmax = nn_ops.log_softmax(features)
            out = tf_log_softmax.eval()
            self.assertAllClose(
                np.array([[-1.386294, -1.386294, -1.386294, -1.386294],
                          [0, -max, -max, -max]]),
                out,
                rtol=1.e-5,
                atol=1.e-5)

    def testFloat(self):
        self._testAll(
            np.array([[1., 1., 1., 1.], [1., 2., 3., 4.]]).astype(np.float32))

    @unittest.skipUnless(test.is_built_with_cuda(),
                         "Test only applicable when running on GPUs")
    def testFloatGPU(self):
        if test.is_gpu_available(cuda_only=True):
            rows = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
            cols = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
            for row, col in zip(rows, cols):
                logging.info("Testing softmax float dtype in shape [%d, %d]",
                             row, col)
                data = np.random.rand(row, col)
                self._testAll(data.astype(np.float32))

    def testHalf(self):
        self._testAll(
            np.array([[1., 1., 1., 1.], [1., 2., 3., 4.]]).astype(np.float16))

    @unittest.skipUnless(test.is_built_with_cuda(),
                         "Test only applicable when running on GPUs")
    def testHalfGPU(self):
        if test.is_gpu_available(cuda_only=True):
            rows = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
            cols = [2**x + np.random.randint(0, 16) for x in range(1, 4)]
            for row, col in zip(rows, cols):
                logging.info("Testing softmax half dtype in shape [%d, %d]",
                             row, col)
                data = np.random.rand(row, col)
                self._testAll(data.astype(np.float16))

    def testDouble(self):
        self._testSoftmax(
            np.array([[1., 1., 1., 1.], [1., 2., 3., 4.]]).astype(np.float64))
        self._testOverflow()

    def test1DTensorAsInput(self):
        self._testSoftmax(
            np.array([3., 2., 3., 9.]).astype(np.float64), use_gpu=False)
        self._testOverflow(use_gpu=False)

    def test1DTensorAsInputNoReshape(self):
        with compat.forward_compatibility_horizon(2018, 8, 27):
            self._testSoftmax(
                np.array([3., 2., 3., 9.]).astype(np.float64), use_gpu=False)
            self._testOverflow(use_gpu=False)

    def test3DTensorAsInput(self):
        self._testSoftmax(
            np.array([[[1., 1., 1., 1.], [1., 2., 3., 4.]],
                      [[2., 3., 4., 5.], [6., 7., 8., 9.]],
                      [[5., 4., 3., 2.], [1., 2., 3., 4.]]]).astype(np.float32),
            use_gpu=False)
        self._testOverflow(use_gpu=False)

    def test3DTensorAsInputNoReshape(self):
        with compat.forward_compatibility_horizon(2018, 8, 27):
            self._testSoftmax(
                np.array([[[1., 1., 1., 1.], [1., 2., 3., 4.]],
                          [[2., 3., 4., 5.], [6., 7., 8., 9.]],
                          [[5., 4., 3., 2.], [1., 2., 3., 4.]]]).astype(np.float32),
                use_gpu=False)
            self._testOverflow(use_gpu=False)

    def testAlongFirstDimension(self):
        self._testSoftmax(
            np.array([[[1., 1., 1., 1.], [1., 2., 3., 4.]],
                      [[2., 3., 4., 5.], [6., 7., 8., 9.]],
                      [[5., 4., 3., 2.], [1., 2., 3., 4.]]]).astype(np.float32),
            dim=0,
            use_gpu=False)
        self._testOverflow(use_gpu=False)

    def testAlongSecondDimension(self):
        self._testSoftmax(
            np.array([[[1., 1., 1., 1.], [1., 2., 3., 4.]],
                      [[2., 3., 4., 5.], [6., 7., 8., 9.]],
                      [[5., 4., 3., 2.], [1., 2., 3., 4.]]]).astype(np.float32),
            dim=1,
            use_gpu=False)
        self._testOverflow(use_gpu=False)

    def testShapeInference(self):
        op = nn_ops.softmax([[[1., 1., 1., 1.], [1., 2., 3., 4.]],
                             [[2., 3., 4., 5.], [6., 7., 8., 9.]],
                             [[5., 4., 3., 2.], [1., 2., 3., 4.]]])
        self.assertEqual([3, 2, 4], op.get_shape())

    def testEmptyInput(self):
        with self.cached_session():
            x = array_ops.placeholder(dtypes.float32, shape=[0, 3])
            self.assertEqual(0, array_ops.size(x).eval())
            # reshape would raise if logits is empty
            with self.assertRaises(errors_impl.InvalidArgumentError):
                nn_ops.softmax(x, axis=0).eval()

    def testDimTooLarge(self):
        with self.cached_session():
            # Use placeholder to make sure we get runtime error instead of shape
            # inference error.
            dim = array_ops.placeholder_with_default(100, shape=[])
            with self.assertRaises(errors_impl.InvalidArgumentError):
                nn_ops.softmax([1., 2., 3., 4.], axis=dim).eval()

    def testLargeDims(self):
        # Make sure that we properly handle large inputs. See
        # https://github.com/tensorflow/tensorflow/issues/4425 for details
        for dims in [129, 256]:
            ones = np.random.rand(dims, dims).astype(np.float32)
            np_softmax = self._npSoftmax(ones)
            for use_gpu in [True, False]:
                with self.test_session(use_gpu=use_gpu) as sess:
                    x = array_ops.placeholder(dtypes.float32)
                    y = nn_ops.softmax(x)
                    tf_softmax = sess.run(y, feed_dict={x: ones})
                self.assertAllClose(tf_softmax, np_softmax)
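# The arithmetic spelled out in the testNpSoftmax comment above can be checked
# with a few lines of plain NumPy (a standalone sketch, not part of the test):
import numpy as np

x = np.array([1., 2., 3., 4.])
e = np.exp(x - x.max())        # subtract the max for numerical stability
softmax = e / e.sum()
print(softmax)                 # ~[0.0320586, 0.08714432, 0.23688282, 0.64391426]
print(np.log(softmax))         # ~[-3.44019, -2.44019, -1.44019, -0.44019]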
class CudnnRNNTest(TensorFlowTestCase): def _CreateModel(self, rnn_mode, num_layers, num_units, input_size, input_mode="linear_input", dropout=0.): if rnn_mode == cudnn_rnn_ops.CUDNN_LSTM: model = cudnn_rnn_ops.CudnnLSTM(num_layers, num_units, input_size, dropout=dropout) elif rnn_mode == cudnn_rnn_ops.CUDNN_GRU: model = cudnn_rnn_ops.CudnnGRU(num_layers, num_units, input_size, dropout=dropout) elif rnn_mode == cudnn_rnn_ops.CUDNN_RNN_TANH: model = cudnn_rnn_ops.CudnnRNNTanh(num_layers, num_units, input_size, dropout=dropout) elif rnn_mode == cudnn_rnn_ops.CUDNN_RNN_RELU: model = cudnn_rnn_ops.CudnnRNNRelu(num_layers, num_units, input_size, dropout=dropout) else: raise ValueError("Invalid rnn_mode: %s" % rnn_mode) return model def _create_params_savable(self, params, model): """Create a RNNParamsSaveable for the weight and bias parameters. Args: params: a Variable for weight and bias parameters. model: a CudnnRNN model. """ params_saveable = cudnn_rnn_ops.RNNParamsSaveable( model, model.params_to_canonical, model.canonical_to_params, [params], "rnn") ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, params_saveable) def _testSaveRestoreVariable(self, rnn_mode): # model = self._CreateModel(rnn_mode, num_layers=2, num_units=7, input_size=3) model = self._CreateModel(rnn_mode, num_layers=1, num_units=1, input_size=1) random_seed.set_random_seed(1234) params_size_t = model.params_size() params = variables.Variable(random_ops.random_uniform([params_size_t]), validate_shape=False) self._create_params_savable(params, model) save_path = os.path.join(self.get_temp_dir(), "save-restore-variable-test") saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) with self.test_session(use_gpu=True) as sess: sess.run(variables.global_variables_initializer()) params_v = sess.run(params) val = saver.save(sess, save_path) self.assertEqual(save_path, val) with self.test_session(use_gpu=True) as sess: reset_params = state_ops.assign(params, array_ops.zeros([params_size_t])) sess.run(reset_params) saver.restore(sess, save_path) params_v_restored = sess.run(params) self.assertAllEqual(params_v, params_v_restored) def _build_forward_cudnn_model(self, rnn_mode, num_layers, num_units, input_data, is_training=False): input_data_shape = input_data.get_shape().with_rank(3) batch_size = input_data_shape[1].value input_size = input_data_shape[2].value model = self._CreateModel(rnn_mode, num_layers, num_units, input_size) # Set zero init input states input_h = constant_op.constant(np.zeros( [num_layers, batch_size, num_units]), dtype=dtypes.float32) has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM) if has_input_c: input_c = constant_op.constant(np.zeros( [num_layers, batch_size, num_units]), dtype=dtypes.float32) # Set rnn params params_size_t = model.params_size() params = variables.Variable(random_ops.random_uniform([params_size_t]), validate_shape=False) args = { "input_data": input_data, "input_h": input_h, "params": params, "is_training": is_training } if has_input_c: args["input_c"] = input_c # Build cell output_tuple = model(**args) # Create savable objects for params self._create_params_savable(params, model) return output_tuple, model, params @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testCudnnCompatibleRnnCells(self): configs = [ { "num_layers": 1, "seq_length": 3, "num_units": 4, "input_size": 5, "batch_size": 6, }, { "num_layers": 2, "seq_length": 8, "num_units": 4, "input_size": 8, "batch_size": 16, }, { "num_layers": 2, "seq_length": 3, 
"num_units": 4, "input_size": 5, "batch_size": 6, }, { "num_layers": 1, "seq_length": 2, "num_units": 2, "input_size": 4, "batch_size": 1, }, ] for rnn, cfg, use_block_cell in itertools.product( (cudnn_rnn_ops.CUDNN_LSTM, ), configs, ( True, False, )): self._testCudnnCompatibleRnnCells( cfg["num_layers"], cfg["seq_length"], cfg["num_units"], cfg["input_size"], cfg["batch_size"], rnn, use_block_cell) # TODO(jamesqin): Add CudnnCompatibleGRUBlockCell. for rnn, cfg, use_block_cell in itertools.product( (cudnn_rnn_ops.CUDNN_GRU, ), configs, (False, )): self._testCudnnCompatibleRnnCells( cfg["num_layers"], cfg["seq_length"], cfg["num_units"], cfg["input_size"], cfg["batch_size"], rnn, use_block_cell) def _testCudnnCompatibleRnnCells(self, num_layers, seq_length, num_units, input_size, batch_size, rnn_mode, use_block_cell): has_state_c = rnn_mode == cudnn_rnn_ops.CUDNN_LSTM np.random.seed(0) # Train graph with ops.Graph().as_default(): random_seed.set_random_seed(299) input_data = array_ops.placeholder( dtypes.float32, shape=[seq_length, batch_size, input_size]) output_tuple, cudnn_model, cudnn_params = self._build_forward_cudnn_model( rnn_mode, num_layers, num_units, input_data, is_training=True) target_output = array_ops.placeholder(dtype=dtypes.float32, shape=None) total_sum = sum(map(math_ops.reduce_sum, output_tuple)) loss_op = losses.log_loss(labels=target_output, predictions=total_sum) optimizer = gradient_descent.GradientDescentOptimizer( learning_rate=1e-2) train_op = optimizer.minimize(loss_op) saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) # Train Cudnn model with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess: sess.run(variables.global_variables_initializer()) # Train 128 steps num_steps = 128 for _ in range(num_steps): inputs = np.random.rand(seq_length, batch_size, input_size).astype(np.float32) targets = np.random.rand() sess.run(train_op, feed_dict={ input_data: inputs, target_output: targets }) save_path = os.path.join(self.get_temp_dir(), ("cudnn-rnn-%s-test" % rnn_mode)) save_v = saver.save(sess, save_path) self.assertEqual(save_path, save_v) cudnn_params_v = sess.run(cudnn_params) # cuDNN inference graph with ops.Graph().as_default(): random_seed.set_random_seed(299) cudnn_inputs = array_ops.placeholder( dtypes.float32, shape=[seq_length, batch_size, input_size]) (cudnn_output_tuple, cudnn_model, cudnn_params) = self._build_forward_cudnn_model(rnn_mode, num_layers, num_units, cudnn_inputs, is_training=False) saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) inference_input = np.random.rand(seq_length, batch_size, input_size).astype(np.float32) with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess: sess.run(variables.global_variables_initializer()) saver.restore(sess, save_path) restored_cudnn_params_v = sess.run(cudnn_params) self.assertAllEqual(cudnn_params_v, restored_cudnn_params_v) # Cudnn inference cudnn_output = sess.run( cudnn_output_tuple, feed_dict={cudnn_inputs: inference_input}) # Canonical RNN inference graph with ops.Graph().as_default(): random_seed.set_random_seed(299) cell_inputs = array_ops.placeholder( dtypes.float32, shape=[seq_length, batch_size, input_size]) (output, states) = _create_cudnn_compatible_canonical_rnn( cudnn_model, cell_inputs, use_block_cell) saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess: saver.restore(sess, save_path) # BlockCell inference output_v, states_v = sess.run( [output, 
states], feed_dict={cell_inputs: inference_input}) # output across timestamps are packed into one tensor. self.assertAllClose(cudnn_output[0], output_v, atol=1e-6, rtol=1e-6) for i in range(num_layers): if has_state_c: # output_h self.assertAllClose(cudnn_output[1][i, :], states_v[i].h, atol=1e-6, rtol=1e-6) # output_c self.assertAllClose(cudnn_output[2][i, :], states_v[i].c, atol=1e-6, rtol=1e-6) else: self.assertAllClose(cudnn_output[1][i, :], states_v[i], atol=1e-6, rtol=1e-6) def _testSaveRestoreOutput(self, rnn_mode): num_layers = 2 num_units = 7 input_size = 7 seq_length = 10 batch_size = 5 dir_count = 1 model = self._CreateModel(rnn_mode, num_layers, num_units, input_size) params_size_t = model.params_size() params = variables.Variable(array_ops.ones([params_size_t]), validate_shape=False) self._create_params_savable(params, model) save_path = os.path.join(self.get_temp_dir(), "save-restore-output-test") saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM) input_data = array_ops.ones([seq_length, batch_size, input_size]) input_h = array_ops.ones( [num_layers * dir_count, batch_size, num_units]) if has_input_c: input_c = array_ops.ones( [num_layers * dir_count, batch_size, num_units]) outputs = model(input_data=input_data, input_h=input_h, input_c=input_c, params=params, is_training=False) else: outputs = model(input_data=input_data, input_h=input_h, params=params, is_training=False) total_sum = sum(map(math_ops.reduce_sum, outputs)) with self.test_session(use_gpu=True) as sess: sess.run(variables.global_variables_initializer()) total_sum_v = sess.run(total_sum) val = saver.save(sess, save_path) self.assertEqual(save_path, val) with self.test_session(use_gpu=True) as sess: reset_params = state_ops.assign(params, array_ops.zeros([params_size_t])) sess.run(reset_params) saver.restore(sess, save_path) total_sum_v_restored = sess.run(total_sum) self.assertAllEqual(total_sum_v, total_sum_v_restored) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSaveRestore(self): rnn_modes = [ cudnn_rnn_ops.CUDNN_LSTM, cudnn_rnn_ops.CUDNN_GRU, cudnn_rnn_ops.CUDNN_RNN_TANH, cudnn_rnn_ops.CUDNN_RNN_RELU ] for rnn_mode in rnn_modes: self._testSaveRestoreVariable(rnn_mode) self._testSaveRestoreOutput(rnn_mode) def _MinLSTMParamSize(self, num_layers, num_units, input_size, input_mode="auto_select", direction=cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION): if direction != cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION: # TODO(zhengxq): support bidirection in parameter size estimate. 
raise ValueError("Only unidirection in parameter size estimate") first_layer_weights = 4 * num_units * (num_units + input_size) higher_layer_weights = 8 * (num_layers - 1) * num_units * num_units all_biases = 8 * num_layers * num_units return first_layer_weights + higher_layer_weights + all_biases def _testOneLSTMParamsSize(self, num_layers, num_units, input_size): min_params_size = self._MinLSTMParamSize(num_layers, num_units, input_size) model = self._CreateModel(cudnn_rnn_ops.CUDNN_LSTM, num_layers, num_units, input_size) params_size = model.params_size() with self.test_session(use_gpu=True) as sess: params_size_v = sess.run(params_size) self.assertLessEqual(min_params_size, params_size_v) # @unittest.skipUnless(test.is_built_with_cuda(), # "Test only applicable when running on GPUs") def testLSTMParamsSize(self): test_configs = [ [4, 200, 200], [4, 200, 300], [4, 200, 100], [1, 100, 200], [2, 200, 100], [3, 200, 400], ] with ops.Graph().as_default(): for (num_layers, num_units, input_size) in test_configs: self._testOneLSTMParamsSize(num_layers, num_units, input_size) def _testOneSimpleInference(self, rnn_mode, num_layers, num_units, input_size, batch_size, seq_length, dir_count, dropout, expected, tolerance): random_seed.set_random_seed(5678) model = self._CreateModel(rnn_mode, num_layers, num_units, input_size, input_mode="auto_select", dropout=dropout) has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM) params_size_t = model.params_size() input_data = array_ops.ones([seq_length, batch_size, input_size]) input_h = array_ops.ones( [num_layers * dir_count, batch_size, num_units]) params = variables.Variable(array_ops.ones([params_size_t]), validate_shape=False) if has_input_c: input_c = array_ops.ones( [num_layers * dir_count, batch_size, num_units]) output, output_h, output_c = model(input_data=input_data, input_h=input_h, input_c=input_c, params=params, is_training=False) else: output, output_h = model(input_data=input_data, input_h=input_h, params=params, is_training=False) output_sum = math_ops.reduce_sum(output) output_h_sum = math_ops.reduce_sum(output_h) total_sum = output_sum + output_h_sum if has_input_c: output_c_sum = math_ops.reduce_sum(output_c) total_sum += output_c_sum with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess: sess.run(variables.global_variables_initializer()) total_sum_v = sess.run([total_sum]) self.assertAllClose(total_sum_v[0], expected, atol=tolerance, rtol=tolerance) # @unittest.skipUnless(test.is_built_with_cuda(), # "Test only applicable when running on GPUs") def testSimpleInference(self): # Cudnn scales result for dropout during training, therefore dropout has no # impact for inference results. # (lstm, gru, rnn_tanh are saturated in the test. rnn_relu case is most # demonstrative of the dropout-invariant nature of CudnnRnn.) 
test_configs = [ { "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM, "dropout": [0., 0.5, 1.], "expected": 231833.22, "tolerance": 1e-2, "shape": { "num_layers": 4, "num_units": 200, "input_size": 200, "batch_size": 20, "seq_length": 10, "dir_count": 1, }, }, { "rnn_mode": cudnn_rnn_ops.CUDNN_GRU, "dropout": [0., 0.5, 1.], "expected": 56000, "tolerance": 1e-2, "shape": { "num_layers": 4, "num_units": 200, "input_size": 200, "batch_size": 20, "seq_length": 10, "dir_count": 1, }, }, { "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH, "dropout": [0., 0.5, 1.], "expected": 56000, "tolerance": 1e-2, "shape": { "num_layers": 4, "num_units": 200, "input_size": 200, "batch_size": 20, "seq_length": 10, "dir_count": 1, }, }, { "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU, "dropout": [0., 0.5, 1.], "expected": 130688, "tolerance": 1e-2, "shape": { "num_layers": 2, "num_units": 8, "input_size": 4, "batch_size": 4, "seq_length": 2, "dir_count": 1, }, }, ] with ops.Graph().as_default(): for config in test_configs: rnn_mode = config["rnn_mode"] dropout_list = config.get("dropout", [0.]) expected = config["expected"] tolerance = config["tolerance"] shape = config["shape"] for dropout in dropout_list: self._testOneSimpleInference( rnn_mode, shape["num_layers"], shape["num_units"], shape["input_size"], shape["batch_size"], shape["seq_length"], shape["dir_count"], dropout, expected, tolerance) def _testOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size, batch_size, seq_length, dir_count, dropout, tolerance): # Gradient checking runs two forward ops with almost the same input. Need to # make sure the drop patterns across the two runs are the same. old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False)) os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True) has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM) random_seed.set_random_seed(1234) model = self._CreateModel(rnn_mode, num_layers, num_units, input_size, dropout=dropout) params_size_t = model.params_size() input_data = variables.Variable( random_ops.random_uniform([seq_length, batch_size, input_size])) input_h = variables.Variable( random_ops.random_uniform( [num_layers * dir_count, batch_size, num_units])) params = variables.Variable(random_ops.random_uniform([params_size_t]), validate_shape=False) if has_input_c: input_c = variables.Variable( random_ops.random_uniform( [num_layers * dir_count, batch_size, num_units])) output, output_h, output_c = model(input_data=input_data, input_h=input_h, input_c=input_c, params=params) else: output, output_h = model(input_data=input_data, input_h=input_h, params=params) output_sum = math_ops.reduce_sum(output) output_h_sum = math_ops.reduce_sum(output_h) total_sum = output_sum + output_h_sum if has_input_c: output_c_sum = math_ops.reduce_sum(output_c) total_sum += output_c_sum with self.test_session(use_gpu=True) as sess: params_size_v = sess.run(params_size_t) inputs_and_shapes = [ (input_data, [seq_length, batch_size, input_size]), (input_h, [num_layers * dir_count, batch_size, num_units]), (params, [params_size_v]), ] if has_input_c: inputs_and_shapes.append( (input_c, [num_layers * dir_count, batch_size, num_units ]), ) sess.run(variables.global_variables_initializer()) all_inputs = [entry[0] for entry in inputs_and_shapes] all_shapes = [entry[1] for entry in inputs_and_shapes] err = gradient_checker.compute_gradient_error( all_inputs, all_shapes, total_sum, [1]) self.assertLess(err, tolerance) os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state # 
@unittest.skipUnless(test.is_built_with_cuda(), # "Test only applicable when running on GPUs") def testSimpleTraining(self): test_configs = [ { "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM, "dropout": [0., 0.5, 1.], # "tolerance": 1e-2, "tolerance": 1.1e-2, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, "dir_count": 1, }, }, { "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH, "dropout": [0., 0.5, 1.], # "tolerance": 5e-3, "tolerance": 5.1e-3, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, "dir_count": 1, }, }, { "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU, "dropout": [0., 0.5, 1.], "tolerance": 4e-1, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, "dir_count": 1, }, }, { "rnn_mode": cudnn_rnn_ops.CUDNN_GRU, "dropout": [0., 0.5, 1.], "tolerance": 4e-3, "shape": { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, "dir_count": 1, }, }, ] ops.reset_default_graph() with ops.Graph().as_default(): for config in test_configs: rnn_mode = config["rnn_mode"] dropout_list = config.get("dropout", [0.]) tolerance = config["tolerance"] shape = config["shape"] for dropout in dropout_list: self._testOneSimpleTraining( rnn_mode, shape["num_layers"], shape["num_units"], shape["input_size"], shape["batch_size"], shape["seq_length"], shape["dir_count"], dropout, tolerance)
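# The lower bound computed by _MinLSTMParamSize above can be checked by hand.
# A small helper with the same unidirectional formula, evaluated for one of
# the configs exercised by testLSTMParamsSize:
def min_lstm_param_size(num_layers, num_units, input_size):
  # 4 gates per cell: input and recurrent weight matrices plus two bias sets.
  first_layer_weights = 4 * num_units * (num_units + input_size)
  higher_layer_weights = 8 * (num_layers - 1) * num_units * num_units
  all_biases = 8 * num_layers * num_units
  return first_layer_weights + higher_layer_weights + all_biases

print(min_lstm_param_size(4, 200, 200))  # 1286400 <= actual opaque param size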
class CudnnRNNTestBidirectional(TensorFlowTestCase): # TODO(jamesqin): Test multi-layer bi-Cudnn. @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSingleLayerBidirectionalLSTM(self): # start with 1 layer. test_configs = [{ "input_size": 1, "num_units": 1, "seq_length": 1, "batch_size": 1 }, { "input_size": 2, "num_units": 2, "seq_length": 2, "batch_size": 2 }, { "input_size": 8, "num_units": 4, "seq_length": 4, "batch_size": 4 }, { "input_size": 32, "num_units": 16, "seq_length": 16, "batch_size": 32 }] for config in test_configs: self._testSingleLayerBidirectionalLSTMHelper( config["input_size"], config["num_units"], config["seq_length"], config["batch_size"]) def _testSingleLayerBidirectionalLSTMHelper(self, input_size, num_units, seq_length, batch_size): # Only tests single layer bi-Cudnn LSTM. num_layers = 1 np.random.seed(1234) # canonical bidirectional lstm param_size = _MinLSTMParamSize( num_layers, num_units, input_size, direction=cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION) # np data input_data = np.random.randn(seq_length, batch_size, input_size).astype(np.float32) input_h = np.zeros( (num_layers * 2, batch_size, num_units)).astype(np.float32) input_c = np.zeros( (num_layers * 2, batch_size, num_units)).astype(np.float32) cudnn_params = np.random.randn(param_size).astype(np.float32) with ops.Graph().as_default(): # cudnn bidirectional lstm graph cudnn_params_t = variables.Variable(cudnn_params) input_data_t = constant_op.constant(input_data, dtype=dtypes.float32) input_h_t = constant_op.constant(input_h, dtype=dtypes.float32) input_c_t = constant_op.constant(input_c, dtype=dtypes.float32) cudnn_lstm = _CreateModel( "lstm", num_layers, num_units, input_size, direction=cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION) cudnn_output, cudnn_output_h, cudnn_output_c = cudnn_lstm( input_data=input_data_t, input_h=input_h_t, input_c=input_c_t, params=cudnn_params_t) # canonical bidirectional lstm cell_fw = rnn_cell_impl.LSTMCell(num_units, forget_bias=0.) cell_bw = rnn_cell_impl.LSTMCell(num_units, forget_bias=0.) 
outputs, output_state_fw, output_state_bw = static_bidirectional_rnn( cell_fw, cell_bw, array_ops.unstack(input_data), dtype=dtypes.float32) weights_list, biases_list = _TransformBidirectionalCudnnLSTMParams( cudnn_lstm, cudnn_params_t) assert len(weights_list) == 2 assert len(biases_list) == 2 with vs.variable_scope("", reuse=True): cell_fw_kernel = vs.get_variable( "bidirectional_rnn/fw/lstm_cell/kernel") cell_fw_bias = vs.get_variable( "bidirectional_rnn/fw/lstm_cell/bias") cell_bw_kernel = vs.get_variable( "bidirectional_rnn/bw/lstm_cell/kernel") cell_bw_bias = vs.get_variable( "bidirectional_rnn/bw/lstm_cell/bias") assign_fw_kernel = state_ops.assign(cell_fw_kernel, weights_list[0]) assign_fw_bias = state_ops.assign(cell_fw_bias, biases_list[0]) assign_bw_kernel = state_ops.assign(cell_bw_kernel, weights_list[1]) assign_bw_bias = state_ops.assign(cell_bw_bias, biases_list[1]) assign_ops = control_flow_ops.group(assign_fw_kernel, assign_fw_bias, assign_bw_kernel, assign_bw_bias) with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess: sess.run(variables.global_variables_initializer()) cu_out, cu_h, cu_c = sess.run( [cudnn_output, cudnn_output_h, cudnn_output_c]) sess.run(assign_ops) out, fwd_s, bak_s = sess.run( [outputs, output_state_fw, output_state_bw]) out = np.stack(out) fwd_h, fwd_c = fwd_s.h, fwd_s.c bak_h, bak_c = bak_s.h, bak_s.c h = np.concatenate((fwd_h, bak_h), axis=1) c = np.concatenate((fwd_c, bak_c), axis=1) cu_h = [np.array(x) for x in cu_h] cu_c = [np.array(x) for x in cu_c] cu_h = np.concatenate(cu_h, axis=1) cu_c = np.concatenate(cu_c, axis=1) self.assertAllClose(out, cu_out) self.assertAllClose(h, cu_h) self.assertAllClose(c, cu_c)
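# One detail worth noting in the helper above: the canonical LSTMCells are
# built with forget_bias=0. TF's default of 1.0 is an extra constant added to
# the forget-gate pre-activation, and cuDNN adds no such constant, so a
# nonzero default would make the two graphs diverge. A tiny NumPy illustration:
import numpy as np

def sigmoid(x):
  return 1. / (1. + np.exp(-x))

logits = np.array([-1., 0., 1.])
assert not np.allclose(sigmoid(logits), sigmoid(logits + 1.0))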
class CudnnRNNTestSaveRestore(TensorFlowTestCase): def _CompareWeights(self, lhs, rhs): self.assertEqual(len(lhs), len(rhs)) for lw, rw in zip(lhs, rhs): self.assertAllEqual(lw, rw) def _CompareBiases(self, lhs, rhs, rnn_mode, num_layers, direction): self.assertEqual(len(lhs), len(rhs)) if rnn_mode == CUDNN_LSTM: num_params_per_layer = CUDNN_LSTM_PARAMS_PER_LAYER elif rnn_mode == CUDNN_GRU: num_params_per_layer = CUDNN_GRU_PARAMS_PER_LAYER elif rnn_mode == CUDNN_RNN_TANH: num_params_per_layer = CUDNN_RNN_TANH_PARAMS_PER_LAYER else: num_params_per_layer = CUDNN_RNN_RELU_PARAMS_PER_LAYER num_dirs = 1 if direction == CUDNN_RNN_UNIDIRECTION else 2 num_params_per_layer *= num_dirs self.assertEqual(num_params_per_layer * num_layers, len(lhs)) for i in range(num_layers): layer_lhs = lhs[i * num_params_per_layer:(i + 1) * num_params_per_layer] layer_rhs = rhs[i * num_params_per_layer:(i + 1) * num_params_per_layer] if direction == CUDNN_RNN_UNIDIRECTION: self._CompareSingleLayerBiases(layer_lhs, layer_rhs) else: size = len(layer_lhs) fw_lhs, bw_lhs = layer_lhs[:size // 2], layer_lhs[size // 2:] fw_rhs, bw_rhs = layer_rhs[:size // 2], layer_rhs[size // 2:] self._CompareSingleLayerBiases(fw_lhs, fw_rhs) self._CompareSingleLayerBiases(bw_lhs, bw_rhs) def _CompareSingleLayerBiases(self, lhs, rhs): self.assertEqual(len(lhs), len(rhs)) lf_lhs, rt_lhs = lhs[:len(lhs) // 2], lhs[len(lhs) // 2:] lf_rhs, rt_rhs = rhs[:len(rhs) // 2], rhs[len(rhs) // 2:] self.assertEqual(len(lf_lhs), len(rt_lhs)) self.assertEqual(len(lf_rhs), len(rt_rhs)) sum_lhs, sum_rhs = [], [] for lf, rt in zip(lf_lhs, rt_lhs): sum_lhs.append(lf + rt) for lf, rt in zip(lf_rhs, rt_rhs): sum_rhs.append(lf + rt) self.assertEqual(len(sum_lhs), len(sum_rhs)) for lf, rt in zip(sum_lhs, sum_rhs): self.assertAllEqual(lf, rt) def _testSaveRestoreVariable(self, rnn_mode, direction, dtype): num_layers = 2 num_units = 7 input_size = 3 with ops.Graph().as_default(): model = _CreateModel(rnn_mode, num_layers=num_layers, num_units=num_units, input_size=input_size, direction=direction, dtype=dtype) random_seed.set_random_seed(1234) params_size_t = model.params_size() params = variables.VariableV1(random_ops.random_uniform( [params_size_t], dtype=dtype), dtype=dtype, validate_shape=False) saveable = _CreateParamsSavable(params, model) weights, biases = saveable.format_converter._opaque_to_cu_canonical( saveable._variables) reset_params = state_ops.assign(params, array_ops.zeros([params_size_t], dtype=dtype), validate_shape=False) save_path = os.path.join(self.get_temp_dir(), "save-restore-variable-test") saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) # Passing graph explicitly, otherwise an old sess would be reused. 
with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess: sess.run(variables.global_variables_initializer()) val = saver.save(sess, save_path) self.assertEqual(save_path, val) weights_v, biases_v = sess.run([weights, biases]) sess.run(reset_params) saver.restore(sess, save_path) weights_v_restored, biases_v_restored = sess.run( [weights, biases]) self._CompareWeights(weights_v, weights_v_restored) self._CompareBiases(biases_v, biases_v_restored, rnn_mode, num_layers, direction) def _testSaveRestoreTwoVariables(self, rnn_mode, direction, dtype): num_layers = 2 num_units = 7 input_size = 3 with ops.Graph().as_default(): model = _CreateModel(rnn_mode, num_layers=num_layers, num_units=num_units, input_size=input_size, direction=direction, dtype=dtype) random_seed.set_random_seed(1234) params_size_t = model.params_size() names = ["rnn_1", "rnn_2"] param_vars = [ variables.VariableV1(random_ops.random_uniform([params_size_t], dtype=dtype), dtype=dtype, validate_shape=False) for name in names ] saveables = [] for name, params in zip(names, param_vars): saveables.append( _CreateParamsSavable(params, model, name, name)) weights1, biases1 = saveables[ 0].format_converter._opaque_to_cu_canonical( saveables[0]._variables) weights2, biases2 = saveables[ 1].format_converter._opaque_to_cu_canonical( saveables[1]._variables) reset_params = [ state_ops.assign(params, array_ops.zeros([params_size_t], dtype=dtype), validate_shape=False) for params in param_vars ] save_path = os.path.join(self.get_temp_dir(), "save-restore-variable-test") saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) # Passing graph explicitly, otherwise an old sess would be reused. with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess: sess.run(variables.global_variables_initializer()) val = saver.save(sess, save_path) self.assertEqual(save_path, val) weights1_v, biases1_v = sess.run([weights1, biases1]) weights2_v, biases2_v = sess.run([weights2, biases2]) sess.run(reset_params) saver.restore(sess, save_path) weights1_v_restored, biases1_v_restored = sess.run( [weights1, biases1]) weights2_v_restored, biases2_v_restored = sess.run( [weights2, biases2]) self._CompareWeights(weights1_v, weights1_v_restored) self._CompareWeights(weights2_v, weights2_v_restored) self._CompareBiases(biases1_v, biases1_v_restored, rnn_mode, num_layers, direction) self._CompareBiases(biases2_v, biases2_v_restored, rnn_mode, num_layers, direction) def _testSaveRestoreOutput(self, rnn_mode, direction, dtype): with ops.Graph().as_default(): num_layers = 2 num_units = 7 input_size = 7 seq_length = 10 batch_size = 5 dir_count = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2 model = _CreateModel(rnn_mode, num_layers, num_units, input_size, direction=direction, dtype=dtype) params_size_t = model.params_size() params = variables.VariableV1(array_ops.ones([params_size_t], dtype=dtype), validate_shape=False, dtype=dtype) _CreateParamsSavable(params, model) save_path = os.path.join(self.get_temp_dir(), "save-restore-output-test") saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) np.random.seed(1234) has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM) input_data = constant_op.constant(np.random.randn( seq_length, batch_size, input_size), dtype=dtype) input_h = constant_op.constant(np.random.randn( num_layers * dir_count, batch_size, num_units), dtype=dtype) if has_input_c: input_c = constant_op.constant(np.random.randn( num_layers * dir_count, batch_size, num_units), dtype=dtype) outputs = 
model(input_data=input_data, input_h=input_h, input_c=input_c, params=params, is_training=False) else: outputs = model(input_data=input_data, input_h=input_h, params=params, is_training=False) total_sum = sum(map(math_ops.reduce_sum, outputs)) # Passing graph explicitly, otherwise an old sess would be reused. with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess: sess.run(variables.global_variables_initializer()) total_sum_v = sess.run(total_sum) val = saver.save(sess, save_path) self.assertEqual(save_path, val) # Passing graph explicitly, otherwise an old sess would be reused. with self.test_session(use_gpu=True, graph=ops.get_default_graph()) as sess: reset_params = state_ops.assign(params, array_ops.zeros( [params_size_t], dtype=dtype), validate_shape=False) sess.run(reset_params) saver.restore(sess, save_path) total_sum_v_restored = sess.run(total_sum) self.assertAllClose(total_sum_v, total_sum_v_restored, atol=1e-5) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSaveRestore(self): rnn_modes = [ cudnn_rnn_ops.CUDNN_LSTM, cudnn_rnn_ops.CUDNN_GRU, cudnn_rnn_ops.CUDNN_RNN_TANH, cudnn_rnn_ops.CUDNN_RNN_RELU ] directions = [ cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION, cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION ] dtype_list = [dtypes.float32, dtypes.float64] for rnn_mode, direction, dtype in itertools.product( rnn_modes, directions, dtype_list): self._testSaveRestoreVariable(rnn_mode, direction, dtype) self._testSaveRestoreTwoVariables(rnn_mode, direction, dtype) self._testSaveRestoreOutput(rnn_mode, direction, dtype)
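# Why _CompareSingleLayerBiases compares sums: cuDNN keeps two bias vectors
# per gate (an input-side and a recurrent-side one), and only their sum shows
# up in the cell's math, so the individual halves may legitimately differ
# after a canonical/opaque round trip. A NumPy sketch of that invariance:
import numpy as np

rng = np.random.default_rng(0)
b_in, b_rec = rng.normal(size=4), rng.normal(size=4)
shift = rng.normal(size=4)
# Moving mass between the halves leaves the effective (summed) bias unchanged.
assert np.allclose(b_in + b_rec, (b_in + shift) + (b_rec - shift))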
class CudnnParamsFormatConverterTest(TensorFlowTestCase, parameterized.TestCase): """Class for testing various format converters.""" def _test_lstm_helper(self, num_units, input_size, num_layers, direction): with self.session(use_gpu=True) as sess: random_seed.set_random_seed(0) np.random.seed(0) num_dirs = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2 format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM( num_layers, num_units, input_size, direction=direction) ws, bs = [], [] for _ in range(num_layers * num_dirs): w = constant_op.constant( np.random.rand(input_size + num_units, 4 * num_units), dtype=dtypes.float32) b = constant_op.constant( np.random.rand(4 * num_units), dtype=dtypes.float32) ws.append(w) bs.append(b) opaque_params = format_converter.tf_canonical_to_opaque(ws + bs) opaque_params_size = cudnn_rnn_ops.cudnn_rnn_opaque_params_size( cudnn_rnn_ops.CUDNN_LSTM, num_layers, num_units, input_size, direction=direction) ws_r, bs_r = format_converter.opaque_to_tf_canonical(opaque_params) # Test tf_canonical_to_opaque() followed by opaque_to_tf_canonical() # returns the original input. ws, ws_r, bs, bs_r = sess.run([ws, ws_r, bs, bs_r]) for w, w_r in zip(ws, ws_r): self.assertAllClose(w, w_r) for b, b_r in zip(bs, bs_r): self.assertAllClose(b, b_r) # Test opaque_params size lower bound opaque_params_size_v = sess.run(opaque_params_size) min_params_size = sum(x.size for x in ws) + np.sum(x.size for x in bs) logging.info("min_parm_size: %d vs actual_opaque_param_size: %d", min_params_size, opaque_params_size_v) self.assertLessEqual(min_params_size, opaque_params_size_v) @parameterized.named_parameters((c["testcase_name"], c["num_units"], c["input_size"], c["num_layers"]) for c in NAMED_RNN_TESTCASES) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def test_lstm(self, num_units, input_size, num_layers): if not context.context().num_gpus(): self.skipTest("No GPUs found") self._test_lstm_helper(num_units, input_size, num_layers, cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION) @parameterized.named_parameters((c["testcase_name"], c["num_units"], c["input_size"], c["num_layers"]) for c in NAMED_RNN_TESTCASES) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def test_lstm_bidi(self, num_units, input_size, num_layers): if not context.context().num_gpus(): self.skipTest("No GPUs found") self._test_lstm_helper(num_units, input_size, num_layers, cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION) def _test_gru_helper(self, num_units, input_size, num_layers, direction): with self.session(use_gpu=True) as sess: random_seed.set_random_seed(0) np.random.seed(0) num_dirs = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2 format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU( num_layers, num_units, input_size, direction=direction) ws, bs = [], [] for _ in range(num_layers * num_dirs): gate_kernel = constant_op.constant( np.random.rand(input_size + num_units, num_units * 2), dtype=dtypes.float32) gate_bias = constant_op.constant( np.random.rand(num_units * 2), dtype=dtypes.float32) candidate_inp_kernel = constant_op.constant( np.random.rand(input_size, num_units), dtype=dtypes.float32) candidate_inp_bias = constant_op.constant( np.random.rand(num_units), dtype=dtypes.float32) candidate_hid_kernel = constant_op.constant( np.random.rand(num_units, num_units), dtype=dtypes.float32) candidate_hid_bias = constant_op.constant( np.random.rand(num_units), dtype=dtypes.float32) ws.extend([gate_kernel, 
candidate_inp_kernel, candidate_hid_kernel]) bs.extend([gate_bias, candidate_inp_bias, candidate_hid_bias]) opaque_params = format_converter.tf_canonical_to_opaque(ws + bs) opaque_params_size = cudnn_rnn_ops.cudnn_rnn_opaque_params_size( cudnn_rnn_ops.CUDNN_GRU, num_layers, num_units, input_size, direction=direction) ws_r, bs_r = format_converter.opaque_to_tf_canonical(opaque_params) # Test tf_canonical_to_opaque() followed by opaque_to_tf_canonical() # returns the original input. ws, ws_r, bs, bs_r = sess.run([ws, ws_r, bs, bs_r]) for w, w_r in zip(ws, ws_r): self.assertAllClose(w, w_r) for b, b_r in zip(bs, bs_r): self.assertAllClose(b, b_r) # Test opaque_params size lower bound opaque_params_size_v = sess.run(opaque_params_size) min_params_size = sum(x.size for x in ws) + sum(x.size for x in bs) logging.info("min_parm_size: %d vs actual_opaque_param_size: %d", min_params_size, opaque_params_size_v) self.assertLessEqual(min_params_size, opaque_params_size_v) @parameterized.named_parameters((c["testcase_name"], c["num_units"], c["input_size"], c["num_layers"]) for c in NAMED_RNN_TESTCASES) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def test_gru(self, num_units, input_size, num_layers): if not context.context().num_gpus(): self.skipTest("No GPUs found") self._test_gru_helper(num_units, input_size, num_layers, cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION) @parameterized.named_parameters((c["testcase_name"], c["num_units"], c["input_size"], c["num_layers"]) for c in NAMED_RNN_TESTCASES) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def test_gru_bidi(self, num_units, input_size, num_layers): if not context.context().num_gpus(): self.skipTest("No GPUs found") self._test_gru_helper(num_units, input_size, num_layers, cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION)
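# For reference, the per-layer canonical parameter count implied by the GRU
# tensors constructed in _test_gru_helper above (reset/update gate kernel and
# bias, plus separate input and hidden candidate paths). Shapes follow the
# helper; the concrete numbers below are just a hypothetical small config.
def gru_canonical_param_count(num_units, input_size):
  gate_kernel = (input_size + num_units) * 2 * num_units
  gate_bias = 2 * num_units
  candidate_inp = input_size * num_units + num_units
  candidate_hid = num_units * num_units + num_units
  return gate_kernel + gate_bias + candidate_inp + candidate_hid

print(gru_canonical_param_count(num_units=3, input_size=2))  # 57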
class CudnnGRUTest(TensorFlowTestCase, parameterized.TestCase): def _test_training_helper(self, num_units, input_size, batch_size, time, num_layers, dtype, variable_seq_lengths, time_major, dynamic_shape_input=False, rtol=3e-6, atol=3e-6): with self.session(use_gpu=True) as sess: (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad, cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad) = RunGRU( sess, num_units, input_size, batch_size, time, num_layers, variable_seq_lengths=variable_seq_lengths, time_major=time_major, dynamic_shape_input=dynamic_shape_input) self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol) self.assertAllClose(h, cu_h, rtol=rtol, atol=atol) self.assertAllClose(hgrad, cu_hgrad, rtol=rtol, atol=atol) self.assertAllClose(inp_grad, cu_inp_grad, rtol=rtol, atol=atol) for bg, cu_bg in zip(bgrad, cu_bgrad): self.assertAllClose(bg, cu_bg, rtol=rtol, atol=atol) for wg, cu_wg in zip(wgrad, cu_wgrad): self.assertAllClose(wg, cu_wg, rtol=rtol, atol=atol) @parameterized.named_parameters( ExpandNamedTestCases( NAMED_RNN_TESTCASES, **{ "variable_seq_lengths": [True, False], "time_major": [True, False], "dynamic_shape_input": [True, False], })) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def test_training(self, num_units, input_size, batch_size, time, num_layers, variable_seq_lengths, time_major, dynamic_shape_input): if not context.context().num_gpus(): self.skipTest("No GPUs found") self._test_training_helper( num_units, input_size, batch_size, time, num_layers, dtypes.float32, variable_seq_lengths=variable_seq_lengths, time_major=time_major, dynamic_shape_input=dynamic_shape_input) @parameterized.named_parameters( ExpandNamedTestCases( NAMED_RNN_TESTCASES, **{ "variable_seq_lengths": [True, False], "time_major": [True, False], "dynamic_shape_input": [True, False], })) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def test_training_fp16(self, num_units, input_size, batch_size, time, num_layers, variable_seq_lengths, time_major, dynamic_shape_input): if not context.context().num_gpus(): self.skipTest("No GPUs found") self._test_training_helper( num_units, input_size, batch_size, time, num_layers, dtypes.float16, rtol=5e-3, atol=5e-4, variable_seq_lengths=variable_seq_lengths, time_major=time_major, dynamic_shape_input=dynamic_shape_input) @parameterized.named_parameters( ExpandNamedTestCases( NAMED_RNN_TESTCASES, **{ "variable_seq_lengths": [True, False], "time_major": [True, False], "dynamic_shape_input": [True, False], })) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def test_inference(self, num_units, input_size, batch_size, time, num_layers, variable_seq_lengths, time_major, dynamic_shape_input): if not context.context().num_gpus(): self.skipTest("No GPUs found") with self.session(use_gpu=True) as sess: (outputs, cu_outputs, h, cu_h) = RunGRU( sess, num_units, input_size, batch_size, time, num_layers, is_training=False, variable_seq_lengths=variable_seq_lengths, time_major=time_major, dynamic_shape_input=dynamic_shape_input) self.assertAllClose(outputs, cu_outputs) self.assertAllClose(h, cu_h) @parameterized.named_parameters( ExpandNamedTestCases( NAMED_RNN_TESTCASES, **{ "variable_seq_lengths": [True, False], "time_major": [True, False], "dynamic_shape_input": [True, False], })) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def test_inference_fp16(self, num_units, input_size, 
batch_size, time, num_layers, variable_seq_lengths, time_major, dynamic_shape_input): if not context.context().num_gpus(): self.skipTest("No GPUs found") with self.session(use_gpu=True) as sess: (outputs, cu_outputs, h, cu_h) = RunGRU( sess, num_units, input_size, batch_size, time, num_layers, is_training=False, dtype=dtypes.float16, variable_seq_lengths=variable_seq_lengths, time_major=time_major, dynamic_shape_input=dynamic_shape_input) rtol, atol = 5e-3, 5e-4 self.assertAllClose(outputs, cu_outputs, rtol=rtol, atol=atol) self.assertAllClose(h, cu_h, rtol=rtol, atol=atol) @parameterized.named_parameters( ExpandNamedTestCases( NAMED_RNN_TESTCASES, **{ "variable_seq_lengths": [True, False], "time_major": [True, False], "dynamic_shape_input": [True, False], })) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def test_inference_with_dropout(self, num_units, input_size, batch_size, time, num_layers, variable_seq_lengths, time_major, dynamic_shape_input): """Validates that dropout does not affect Cudnn Rnn inference.""" # Hand-picked dropouts are used below (0. and 1.) if not context.context().num_gpus(): self.skipTest("No GPUs found") with ops.Graph().as_default() as g: with self.session(use_gpu=True, graph=g) as sess: # 1st time w/o dropout. (_, cu_outputs, _, cu_h) = RunGRU( sess, num_units, input_size, batch_size, time, num_layers, is_training=False, dropout=0., variable_seq_lengths=variable_seq_lengths, time_major=time_major, dynamic_shape_input=dynamic_shape_input) with ops.Graph().as_default() as g: with self.session(use_gpu=True, graph=g) as sess: (_, cu_outputs2, _, cu_h2) = RunGRU( sess, num_units, input_size, batch_size, time, num_layers, is_training=False, dropout=1., variable_seq_lengths=variable_seq_lengths, time_major=time_major, dynamic_shape_input=dynamic_shape_input) self.assertAllClose(cu_outputs, cu_outputs2) self.assertAllClose(cu_h[0], cu_h2[0])
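# test_inference_with_dropout relies on dropout being a no-op at inference
# time. A common "inverted dropout" formulation (not necessarily cuDNN's exact
# kernel) makes that explicit: the 1/keep_prob scaling happens only during
# training, so the configured rate is irrelevant once is_training=False.
import numpy as np

def dropout(x, rate, training):
  if not training or rate == 0.:
    return x
  keep_prob = 1. - rate
  mask = (np.random.rand(*x.shape) < keep_prob).astype(x.dtype)
  return x * mask / keep_prob

x = np.ones((2, 3), dtype=np.float32)
assert np.allclose(dropout(x, rate=0.5, training=False), x)
assert np.allclose(dropout(x, rate=1.0, training=False), x)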
class CudnnRnnSaveRestoreTest(TensorFlowTestCase, parameterized.TestCase): """Class for testing various Cudnn Rnn SaveableObjects.""" def _create_opaque_param(self, rnn_mode, num_units, input_size, num_layers, direction, name=None): param_size_t = cudnn_rnn_ops.cudnn_rnn_opaque_params_size( rnn_mode, num_layers, num_units, input_size, direction=direction) init_val = random_ops.random_uniform([param_size_t]) return variable_scope.get_variable( name or "opaque_param", initializer=init_val, validate_shape=False) def _create_saveable(self, opaque_param, rnn_mode, num_units, input_size, num_layers, direction): if rnn_mode == CUDNN_LSTM: fn = cudnn_rnn_ops.CudnnLSTMSaveable elif rnn_mode == CUDNN_GRU: fn = cudnn_rnn_ops.CudnnGRUSaveable elif rnn_mode == CUDNN_RNN_TANH: fn = cudnn_rnn_ops.CudnnRNNTanhSaveable elif rnn_mode == CUDNN_RNN_RELU: fn = cudnn_rnn_ops.CudnnRNNReluSaveable saveable = fn( opaque_param, num_layers, num_units, input_size, direction=direction) return saveable def _compare_weights(self, lhs, rhs): self.assertLen(rhs, len(lhs)) for lw, rw in zip(lhs, rhs): self.assertAllEqual(lw, rw) def _compare_biases(self, lhs, rhs): self.assertLen(rhs, len(lhs)) for lf, rt in zip(lhs, rhs): self.assertAllEqual(lf, rt) @parameterized.named_parameters( ExpandNamedTestCases( NAMED_RNN_TESTCASES, "time", "batch_size", **{ "rnn_mode": [ CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_RELU, CUDNN_RNN_TANH ], "direction": [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION] })) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def test_save_restore_variable(self, rnn_mode, num_units, input_size, num_layers, direction): # Verify the restored opaque param, once converted to tf_canonical format, # is the same as the tf canonicals of the pre-restored param. if not context.context().num_gpus(): self.skipTest("No GPUs found") with self.session(use_gpu=True) as sess: opaque_param = self._create_opaque_param(rnn_mode, num_units, input_size, num_layers, direction) saveable = self._create_saveable(opaque_param, rnn_mode, num_units, input_size, num_layers, direction) ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable) weights_op, biases_op = saveable.format_converter.opaque_to_tf_canonical( saveable._variables) save_path = os.path.join(self.get_temp_dir(), "save_restore_var_test") saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) init_op = variables.global_variables_initializer() reset_op = state_ops.assign(opaque_param, array_ops.zeros_like(opaque_param)) sess.run(init_op) self.assertEqual(save_path, saver.save(sess, save_path)) # Get the tf canonical vals before reset-restore weights, biases = sess.run([weights_op, biases_op]) # Reset the opaque param value sess.run(reset_op) # Assert reset happened. weights_z, biases_z = sess.run([weights_op, biases_op]) for w in weights_z: self.assertAllClose(w, np.zeros_like(w)) for b in biases_z: self.assertAllClose(b, np.zeros_like(b)) # Restore opaque param value from checkpoint. 
saver.restore(sess, save_path) weights_r, biases_r = sess.run([weights_op, biases_op]) self._compare_weights(weights, weights_r) self._compare_biases(biases, biases_r) @parameterized.named_parameters( ExpandNamedTestCases( NAMED_RNN_TESTCASES, "time", "batch_size", **{ "rnn_mode": [ CUDNN_LSTM, CUDNN_GRU, CUDNN_RNN_RELU, CUDNN_RNN_TANH ], "direction": [CUDNN_RNN_UNIDIRECTION, CUDNN_RNN_BIDIRECTION] })) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def test_save_restore_multi_variables(self, rnn_mode, num_units, input_size, num_layers, direction): # Verify the restored opaque param, once converted to tf_canonical format, # is the same as the tf canonicals of the pre-restored param. if not context.context().num_gpus(): self.skipTest("No GPUs found") with self.session(use_gpu=True) as sess: opaque_params = [] saveables = [] num_opaque_params = 2 for i in range(num_opaque_params): opaque_params.append( self._create_opaque_param( rnn_mode, num_units, input_size, num_layers, direction, name="opaque_param_%d" % i)) saveable = self._create_saveable(opaque_params[i], rnn_mode, num_units, input_size, num_layers, direction) ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, saveable) saveables.append(saveable) weights_ops, biases_ops = [], [] for i in range(num_opaque_params): weights_op, biases_op = ( saveables[i].format_converter.opaque_to_tf_canonical( saveables[i]._variables)) weights_ops.append(weights_op) biases_ops.append(biases_op) save_path = os.path.join(self.get_temp_dir(), "save_restore_var_test") saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) init_op = variables.global_variables_initializer() reset_ops = [] for i in range(num_opaque_params): reset_ops.append( state_ops.assign(opaque_params[i], array_ops.zeros_like(opaque_params[i]))) sess.run(init_op) self.assertEqual(save_path, saver.save(sess, save_path)) # Get the tf canonical vals before reset-restore for i in range(num_opaque_params): weights, biases = sess.run([weights_ops[i], biases_ops[i]]) # Reset the opaque param value sess.run(reset_ops[i]) # Assert reset happened. weights_z, biases_z = sess.run([weights_ops[i], biases_ops[i]]) for w in weights_z: self.assertAllClose(w, np.zeros_like(w)) for b in biases_z: self.assertAllClose(b, np.zeros_like(b)) # Restore opaque param value from checkpoint. saver.restore(sess, save_path) weights_r, biases_r = sess.run([weights_ops[i], biases_ops[i]]) self._compare_weights(weights, weights_r) self._compare_biases(biases, biases_r)
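# The save -> zero -> restore -> compare pattern repeated in these tests,
# reduced to plain NumPy so the invariant being asserted is easy to see: the
# reset must actually change the live values, and the restore must bring back
# exactly what was saved.
import numpy as np

saved = np.arange(6, dtype=np.float32)
checkpoint = saved.copy()              # saver.save(...)
live = np.zeros_like(saved)            # sess.run(reset_op)
assert not np.allclose(live, saved)
live = checkpoint.copy()               # saver.restore(...)
assert np.allclose(live, saved)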
  def testBuildInfo(self):
    self.assertEqual(build_info.is_rocm_build, test.is_built_with_rocm())
    self.assertEqual(build_info.is_cuda_build, test.is_built_with_cuda())
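# The recurring guard pattern in this file, pulled out as a minimal,
# hypothetical test class: skip at collection time when the binary was not
# built with CUDA, and skip at run time when no CUDA GPU is actually visible.
import unittest

from tensorflow.python.platform import test


class GpuOnlyExampleTest(test.TestCase):

  @unittest.skipUnless(test.is_built_with_cuda(),
                       "Test only applicable when running on GPUs")
  def testRunsOnlyWithCuda(self):
    if not test.is_gpu_available(cuda_only=True):
      self.skipTest("No CUDA GPU found")
    # GPU-dependent assertions would go here.


if __name__ == "__main__":
  test.main()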
class CudnnRNNTest(TensorFlowTestCase): def _CreateModel(self, rnn_mode, num_layers, num_units, input_size): if rnn_mode == "lstm": model = cudnn_rnn_ops.CudnnLSTM(num_layers, num_units, input_size) elif rnn_mode == "gru": model = cudnn_rnn_ops.CudnnGRU(num_layers, num_units, input_size) elif rnn_mode == "rnn_tanh": model = cudnn_rnn_ops.CudnnRNNTanh(num_layers, num_units, input_size) elif rnn_mode == "rnn_relu": model = cudnn_rnn_ops.CudnnRNNRelu(num_layers, num_units, input_size) else: raise ValueError("Invalid rnn_mode: %s" % rnn_mode) return model def _create_params_savable(self, params, model): """Create a RNNParamsSaveable for the weight and bias parameters. Args: params: a Variable for weight and bias parameters. model: a CudnnRNN model. """ params_saveable = cudnn_rnn_ops.RNNParamsSaveable( model.params_to_canonical, model.canonical_to_params, params) ops.add_to_collection(ops.GraphKeys.SAVEABLE_OBJECTS, params_saveable) def _testSaveRestoreVariable(self, rnn_mode): model = self._CreateModel(rnn_mode, num_layers=2, num_units=7, input_size=3) random_seed.set_random_seed(1234) params_size_t = model.params_size() params = variables.Variable(random_ops.random_uniform([params_size_t]), validate_shape=False) self._create_params_savable(params, model) save_path = os.path.join(self.get_temp_dir(), "save-restore-variable-test") saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) with self.test_session(use_gpu=True) as sess: sess.run(variables.global_variables_initializer()) params_v = sess.run(params) val = saver.save(sess, save_path) self.assertEqual(save_path, val) with self.test_session(use_gpu=True) as sess: reset_params = state_ops.assign(params, array_ops.zeros([params_size_t])) sess.run(reset_params) saver.restore(sess, save_path) params_v_restored = sess.run(params) self.assertAllEqual(params_v, params_v_restored) def _testSaveRestoreOutput(self, rnn_mode): num_layers = 2 num_units = 7 input_size = 7 seq_length = 10 batch_size = 5 dir_count = 1 model = self._CreateModel(rnn_mode, num_layers, num_units, input_size) params_size_t = model.params_size() params = variables.Variable(array_ops.ones([params_size_t]), validate_shape=False) self._create_params_savable(params, model) save_path = os.path.join(self.get_temp_dir(), "save-restore-output-test") saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2) has_input_c = (rnn_mode == "lstm") input_data = array_ops.ones([seq_length, batch_size, input_size]) input_h = array_ops.ones( [num_layers * dir_count, batch_size, num_units]) if has_input_c: input_c = array_ops.ones( [num_layers * dir_count, batch_size, num_units]) outputs = model(input_data=input_data, input_h=input_h, input_c=input_c, params=params, is_training=False) else: outputs = model(input_data=input_data, input_h=input_h, params=params, is_training=False) total_sum = sum(map(math_ops.reduce_sum, outputs)) with self.test_session(use_gpu=True) as sess: sess.run(variables.global_variables_initializer()) total_sum_v = sess.run(total_sum) val = saver.save(sess, save_path) self.assertEqual(save_path, val) with self.test_session(use_gpu=True) as sess: reset_params = state_ops.assign(params, array_ops.zeros([params_size_t])) sess.run(reset_params) saver.restore(sess, save_path) total_sum_v_restored = sess.run(total_sum) self.assertAllEqual(total_sum_v, total_sum_v_restored) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSaveRestore(self): rnn_modes = ["lstm", "gru", "rnn_tanh", "rnn_relu"] for rnn_mode in rnn_modes: 
self._testSaveRestoreVariable(rnn_mode) self._testSaveRestoreOutput(rnn_mode) def _MinLSTMParamSize(self, num_layers, num_units, input_size, input_mode="auto_select", direction="unidirection"): if direction != "unidirection": # TODO(zhengxq): support bidirection in parameter size estimate. raise ValueError("Only unidirection in parameter size estimate") first_layer_weights = 4 * num_units * (num_units + input_size) higher_layer_weights = 8 * (num_layers - 1) * num_units * num_units all_biases = 8 * num_layers * num_units return first_layer_weights + higher_layer_weights + all_biases def _testOneLSTMParamsSize(self, num_layers, num_units, input_size): min_params_size = self._MinLSTMParamSize(num_layers, num_units, input_size) model = self._CreateModel("lstm", num_layers, num_units, input_size) params_size = model.params_size() with self.test_session(use_gpu=True) as sess: params_size_v = sess.run(params_size) self.assertLessEqual(min_params_size, params_size_v) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testLSTMParamsSize(self): test_configs = [ [4, 200, 200], [4, 200, 300], [4, 200, 100], [1, 100, 200], [2, 200, 100], [3, 200, 400], ] with ops.Graph().as_default(): for (num_layers, num_units, input_size) in test_configs: self._testOneLSTMParamsSize(num_layers, num_units, input_size) def _testOneSimpleInference(self, rnn_mode, num_layers, num_units, input_size, batch_size, seq_length, dir_count, expected, tolerance): model = self._CreateModel(rnn_mode, num_layers, num_units, input_size) has_input_c = (rnn_mode == "lstm") params_size_t = model.params_size() input_data = array_ops.ones([seq_length, batch_size, input_size]) input_h = array_ops.ones( [num_layers * dir_count, batch_size, num_units]) params = variables.Variable(array_ops.ones([params_size_t]), validate_shape=False) if has_input_c: input_c = array_ops.ones( [num_layers * dir_count, batch_size, num_units]) output, output_h, output_c = model(input_data=input_data, input_h=input_h, input_c=input_c, params=params, is_training=False) else: output, output_h = model(input_data=input_data, input_h=input_h, params=params, is_training=False) output_sum = math_ops.reduce_sum(output) output_h_sum = math_ops.reduce_sum(output_h) total_sum = output_sum + output_h_sum if has_input_c: output_c_sum = math_ops.reduce_sum(output_c) total_sum += output_c_sum with self.test_session(use_gpu=True) as sess: sess.run(variables.global_variables_initializer()) total_sum_v = sess.run([total_sum]) self.assertAllClose(total_sum_v[0], expected, atol=tolerance, rtol=tolerance) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSimpleInference(self): test_configs = [ [ "lstm", 231833.22, 1e-2, { "num_layers": 4, "num_units": 200, "input_size": 200, "batch_size": 20, "seq_length": 10, "dir_count": 1, }, ], [ "gru", 56000, 1e-2, { "num_layers": 4, "num_units": 200, "input_size": 200, "batch_size": 20, "seq_length": 10, "dir_count": 1, }, ], [ "rnn_tanh", 56000, 1e-2, { "num_layers": 4, "num_units": 200, "input_size": 200, "batch_size": 20, "seq_length": 10, "dir_count": 1, }, ], [ "rnn_relu", 130688, 1e-2, { "num_layers": 2, "num_units": 8, "input_size": 4, "batch_size": 4, "seq_length": 2, "dir_count": 1, }, ], ] with ops.Graph().as_default(): for config in test_configs: rnn_mode = config[0] expected = config[1] tolerance = config[2] shapes = config[3] self._testOneSimpleInference( rnn_mode, shapes["num_layers"], shapes["num_units"], shapes["input_size"], 
shapes["batch_size"], shapes["seq_length"], shapes["dir_count"], expected, tolerance) def _testOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size, batch_size, seq_length, dir_count, tolerance): has_input_c = (rnn_mode == "lstm") random_seed.set_random_seed(1234) model = self._CreateModel(rnn_mode, num_layers, num_units, input_size) params_size_t = model.params_size() input_data = variables.Variable( random_ops.random_uniform([seq_length, batch_size, input_size])) input_h = variables.Variable( random_ops.random_uniform( [num_layers * dir_count, batch_size, num_units])) params = variables.Variable(random_ops.random_uniform([params_size_t]), validate_shape=False) if has_input_c: input_c = variables.Variable( random_ops.random_uniform( [num_layers * dir_count, batch_size, num_units])) output, output_h, output_c = model(input_data=input_data, input_h=input_h, input_c=input_c, params=params) else: output, output_h = model(input_data=input_data, input_h=input_h, params=params) output_sum = math_ops.reduce_sum(output) output_h_sum = math_ops.reduce_sum(output_h) total_sum = output_sum + output_h_sum if has_input_c: output_c_sum = math_ops.reduce_sum(output_c) total_sum += output_c_sum with self.test_session(use_gpu=True) as sess: params_size_v = sess.run(params_size_t) inputs_and_shapes = [ (input_data, [seq_length, batch_size, input_size]), (input_h, [num_layers * dir_count, batch_size, num_units]), (params, [params_size_v]), ] if has_input_c: inputs_and_shapes.append( (input_c, [num_layers * dir_count, batch_size, num_units ]), ) sess.run(variables.global_variables_initializer()) all_inputs = [entry[0] for entry in inputs_and_shapes] all_shapes = [entry[1] for entry in inputs_and_shapes] err = gradient_checker.compute_gradient_error( all_inputs, all_shapes, total_sum, [1]) self.assertLess(err, tolerance) @unittest.skipUnless(test.is_built_with_cuda(), "Test only applicable when running on GPUs") def testSimpleTraining(self): test_configs = [ [ "lstm", 1e-2, { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, "dir_count": 1, }, ], [ "gru", 4e-3, { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, "dir_count": 1, }, ], [ "rnn_tanh", 5e-3, { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, "dir_count": 1, }, ], [ "rnn_relu", 3e-1, { "num_layers": 2, "num_units": 3, "input_size": 4, "batch_size": 3, "seq_length": 4, "dir_count": 1, }, ], ] with ops.Graph().as_default(): for config in test_configs: rnn_mode = config[0] tolerance = config[1] shape = config[2] self._testOneSimpleTraining(rnn_mode, shape["num_layers"], shape["num_units"], shape["input_size"], shape["batch_size"], shape["seq_length"], shape["dir_count"], tolerance)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Cuda op Python library."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os.path

from tensorflow.python.platform.test import is_built_with_cuda
from tensorflow.python.framework.load_library import load_op_library
from tensorflow.python.platform.resource_loader import get_data_files_path

if is_built_with_cuda():
  _cuda_op_module = load_op_library(
      os.path.join(get_data_files_path(), 'bf16cut_bp.so'))
  # The op is registered in C++ under the CamelCase name Bf16cutBp; from
  # Python it is invoked via the generated snake_case name bf16cut_bp.
  bf16cut_bp = _cuda_op_module.bf16cut_bp
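# A hedged variant of the loader above: expose None instead of failing when
# the installed TensorFlow has no CUDA support, so importing this module is
# always safe and callers can check for availability explicitly. The .so name
# and op attribute come from the snippet above; everything else is an
# illustrative assumption.
import os.path

from tensorflow.python.framework.load_library import load_op_library
from tensorflow.python.platform.resource_loader import get_data_files_path
from tensorflow.python.platform.test import is_built_with_cuda

bf16cut_bp = None
if is_built_with_cuda():
  _cuda_op_module = load_op_library(
      os.path.join(get_data_files_path(), 'bf16cut_bp.so'))
  bf16cut_bp = _cuda_op_module.bf16cut_bp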
class CudnnLayersTest(tf.test.TestCase):

  def test_stacked_bilstm(self):
    with tf.Graph().as_default():
      input_emb = tf.random_uniform([3, 5, 8])
      input_len = tf.constant([4, 5, 2])
      output_emb = cudnn_layers.stacked_bilstm(
          input_emb=input_emb,
          input_len=input_len,
          hidden_size=10,
          num_layers=3,
          dropout_ratio=0.2,
          mode=tf.estimator.ModeKeys.TRAIN)
      with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        actual_output_emb = sess.run(output_emb)
        self.assertAllEqual(actual_output_emb.shape, [3, 5, 10 * 2])

  @unittest.skipUnless(test.is_built_with_cuda(),
                       "Test only applicable when running on GPUs")
  def test_stacked_bilstm_compatibility(self):
    checkpoint_dir = tempfile.mkdtemp(prefix="checkpoint_dir")
    checkpoint_path = os.path.join(checkpoint_dir, "model.ckpt")
    hidden_size = 10
    num_layers = 3
    dropout_ratio = 0.0
    input_emb = np.random.uniform(size=[3, 5, 9]).astype(np.float32)
    input_len = [4, 5, 2]

    # Make sure we fail explicitly if the specified devices can't be used.
    config = tf.ConfigProto(allow_soft_placement=False,
                            log_device_placement=True)

    with tf.Graph().as_default():
      with tf.device("/gpu:0"):
        output_emb = cudnn_layers.stacked_bilstm(
            input_emb=input_emb,
            input_len=input_len,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout_ratio=dropout_ratio,
            mode=tf.estimator.ModeKeys.TRAIN,
            use_cudnn=True)
      saver = tf.train.Saver()
      with tf.Session(config=config) as sess:
        sess.run(tf.global_variables_initializer())
        gpu_output_emb = sess.run(output_emb)
        saver.save(sess, checkpoint_path)

    with tf.Graph().as_default():
      with tf.device("/cpu:0"):
        output_emb = cudnn_layers.stacked_bilstm(
            input_emb=input_emb,
            input_len=input_len,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout_ratio=dropout_ratio,
            mode=tf.estimator.ModeKeys.TRAIN,
            use_cudnn=False)
      saver = tf.train.Saver()
      with tf.Session(config=config) as sess:
        saver.restore(sess, checkpoint_path)
        cpu_output_emb = sess.run(output_emb)

    for c, g, l in zip(cpu_output_emb, gpu_output_emb, input_len):
      self.assertAllClose(c[:l], g[:l])
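The compatibility test above writes a checkpoint from the cuDNN (GPU) graph and reads it back into the canonical (CPU) graph. Below is a stripped-down sketch of that save-on-one-device / restore-on-another round trip using a plain variable, so it runs without cudnn_layers; names and paths are illustrative only.

import os
import tempfile
import tensorflow.compat.v1 as tf

tf.disable_eager_execution()

ckpt = os.path.join(tempfile.mkdtemp(), "model.ckpt")
# Soft placement here so the sketch also runs on CPU-only builds.
config = tf.ConfigProto(allow_soft_placement=True)

with tf.Graph().as_default():
  with tf.device("/gpu:0" if tf.test.is_gpu_available() else "/cpu:0"):
    v = tf.get_variable("v", shape=[2, 3], initializer=tf.ones_initializer())
  saver = tf.train.Saver()
  with tf.Session(config=config) as sess:
    sess.run(tf.global_variables_initializer())
    saver.save(sess, ckpt)

with tf.Graph().as_default():
  with tf.device("/cpu:0"):
    v = tf.get_variable("v", shape=[2, 3])
  saver = tf.train.Saver()
  with tf.Session(config=config) as sess:
    saver.restore(sess, ckpt)
    print(sess.run(v))  # values written on the first device are readable on CPU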
  def test_rocm_cuda_info_matches(self):
    build_info = sysconfig.get_build_info()
    self.assertEqual(build_info["is_rocm_build"], test.is_built_with_rocm())
    self.assertEqual(build_info["is_cuda_build"], test.is_built_with_cuda())
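The same build metadata is available to user code through the public tf.sysconfig.get_build_info() API. The exact keys beyond is_cuda_build / is_rocm_build vary between releases, so the .get() fallbacks below are defensive rather than guaranteed:

import tensorflow as tf

info = tf.sysconfig.get_build_info()
print("CUDA build:", info.get("is_cuda_build", False))
print("ROCm build:", info.get("is_rocm_build", False))
print("CUDA version:", info.get("cuda_version", "n/a"))
print("cuDNN version:", info.get("cudnn_version", "n/a"))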
class CudnnRNNTestInference(TensorFlowTestCase):

  def _testOneSimpleInference(self, rnn_mode, num_layers, num_units, input_size,
                              batch_size, seq_length, dir_count, dropout,
                              expected, tolerance):
    random_seed.set_random_seed(5678)
    model = _CreateModel(
        rnn_mode,
        num_layers,
        num_units,
        input_size,
        input_mode="auto_select",
        direction=(cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION
                   if dir_count == 1 else cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION),
        dropout=dropout)
    has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM)
    params_size_t = model.params_size()
    input_data = array_ops.ones([seq_length, batch_size, input_size])
    input_h = array_ops.ones([num_layers * dir_count, batch_size, num_units])
    params = variables.VariableV1(
        array_ops.ones([params_size_t]), validate_shape=False)
    if has_input_c:
      input_c = array_ops.ones([num_layers * dir_count, batch_size, num_units])
      output, output_h, output_c = model(
          input_data=input_data,
          input_h=input_h,
          input_c=input_c,
          params=params,
          is_training=False)
    else:
      output, output_h = model(
          input_data=input_data,
          input_h=input_h,
          params=params,
          is_training=False)
    output_sum = math_ops.reduce_sum(output)
    output_h_sum = math_ops.reduce_sum(output_h)
    total_sum = output_sum + output_h_sum
    if has_input_c:
      output_c_sum = math_ops.reduce_sum(output_c)
      total_sum += output_c_sum

    with self.test_session(use_gpu=True,
                           graph=ops.get_default_graph()) as sess:
      sess.run(variables.global_variables_initializer())
      total_sum_v = sess.run([total_sum])
      self.assertAllClose(total_sum_v[0], expected, atol=tolerance,
                          rtol=tolerance)

  @unittest.skipUnless(test.is_built_with_cuda(),
                       "Test only applicable when running on GPUs")
  def testSimpleInference(self):
    test_configs = [
        {
            "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM,
            "expected": 231833.22,
            "tolerance": 1e-2,
            "shape": {"num_layers": 4, "num_units": 200, "input_size": 200,
                      "batch_size": 20, "seq_length": 10, "dir_count": 1},
        },
        {
            "rnn_mode": cudnn_rnn_ops.CUDNN_GRU,
            "expected": 56000,
            "tolerance": 1e-2,
            "shape": {"num_layers": 4, "num_units": 200, "input_size": 200,
                      "batch_size": 20, "seq_length": 10, "dir_count": 1},
        },
        {
            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH,
            "expected": 56000,
            "tolerance": 1e-2,
            "shape": {"num_layers": 4, "num_units": 200, "input_size": 200,
                      "batch_size": 20, "seq_length": 10, "dir_count": 1},
        },
        {
            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU,
            "expected": 130688,
            "tolerance": 1e-2,
            "shape": {"num_layers": 2, "num_units": 8, "input_size": 4,
                      "batch_size": 4, "seq_length": 2, "dir_count": 1},
        },
    ]
    # Cudnn scales result for dropout during training, therefore dropout has no
    # impact for inference results.
    # (lstm, gru, rnn_tanh are saturated in the test. rnn_relu case is most
    # demonstrative of the dropout-invariant nature of CudnnRnn.)
    dropouts = [0., 0.5, 1.]
    for (config, dropout) in itertools.product(test_configs, dropouts):
      rnn_mode = config["rnn_mode"]
      expected = config["expected"]
      tolerance = config["tolerance"]
      shape = config["shape"]
      with ops.Graph().as_default():
        self._testOneSimpleInference(
            rnn_mode, shape["num_layers"], shape["num_units"],
            shape["input_size"], shape["batch_size"], shape["seq_length"],
            shape["dir_count"], dropout, expected, tolerance)
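The comment about dropout having no impact at inference follows from inverted dropout: during training, surviving activations are rescaled by 1/(1-p) so their expected value is unchanged, which makes the inference path a plain pass-through. A small NumPy illustration of that scaling convention (illustrative code, not the cuDNN implementation):

import numpy as np

def inverted_dropout(x, p, training):
    # Training: drop units with probability p and rescale survivors by 1/(1-p);
    # inference: return the input untouched, regardless of p.
    if not training or p == 0.0:
        return x
    if p >= 1.0:
        return np.zeros_like(x)
    mask = (np.random.rand(*x.shape) >= p).astype(x.dtype)
    return x * mask / (1.0 - p)

x = np.ones((2, 3), dtype=np.float32)
print(inverted_dropout(x, 0.5, training=False))  # identical to x for any p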
class CudnnRNNTestCompatibleRnnCells(TensorFlowTestCase):

  @unittest.skipUnless(test.is_built_with_cuda(),
                       "Test only applicable when running on GPUs")
  def testCudnnCompatibleRnnCells(self):
    configs = [
        {"num_layers": 1, "seq_length": 3, "num_units": 4, "input_size": 5,
         "batch_size": 6},
        {"num_layers": 2, "seq_length": 8, "num_units": 4, "input_size": 8,
         "batch_size": 16},
        {"num_layers": 2, "seq_length": 3, "num_units": 4, "input_size": 5,
         "batch_size": 6},
        {"num_layers": 1, "seq_length": 2, "num_units": 2, "input_size": 4,
         "batch_size": 1},
    ]
    for rnn, cfg in itertools.product((cudnn_rnn_ops.CUDNN_LSTM,), configs):
      self._testCudnnCompatibleRnnCells(cfg["num_layers"], cfg["seq_length"],
                                        cfg["num_units"], cfg["input_size"],
                                        cfg["batch_size"], rnn)
    # TODO(jamesqin): Add CudnnCompatibleGRUBlockCell.
    for rnn, cfg in itertools.product((cudnn_rnn_ops.CUDNN_GRU,), configs):
      self._testCudnnCompatibleRnnCells(cfg["num_layers"], cfg["seq_length"],
                                        cfg["num_units"], cfg["input_size"],
                                        cfg["batch_size"], rnn)

  def _testCudnnCompatibleRnnCells(self, num_layers, seq_length, num_units,
                                   input_size, batch_size, rnn_mode):
    has_state_c = rnn_mode == cudnn_rnn_ops.CUDNN_LSTM
    np.random.seed(0)
    # Train graph
    with ops.Graph().as_default():
      random_seed.set_random_seed(299)
      input_data = array_ops.placeholder(
          dtypes.float32, shape=[seq_length, batch_size, input_size])
      output_tuple, cudnn_model = _BuildCudnnForward(
          rnn_mode, num_layers, num_units, input_data, is_training=True)
      target_output = array_ops.placeholder(dtype=dtypes.float32, shape=None)
      total_sum = sum(map(math_ops.reduce_sum, output_tuple))

      loss_op = losses.log_loss(labels=target_output, predictions=total_sum)
      optimizer = gradient_descent.GradientDescentOptimizer(learning_rate=1e-2)
      train_op = optimizer.minimize(loss_op)

      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)

      # Train Cudnn model
      with self.test_session(use_gpu=True,
                             graph=ops.get_default_graph()) as sess:
        sess.run(variables.global_variables_initializer())
        # Train 128 steps
        num_steps = 128
        for _ in range(num_steps):
          inputs = np.random.rand(seq_length, batch_size,
                                  input_size).astype(np.float32)
          targets = np.random.rand()
          sess.run(train_op,
                   feed_dict={input_data: inputs, target_output: targets})

        save_path = os.path.join(self.get_temp_dir(),
                                 ("cudnn-rnn-%s-test" % rnn_mode))
        save_v = saver.save(sess, save_path)
        self.assertEqual(save_path, save_v)

    # cuDNN inference graph
    with ops.Graph().as_default():
      random_seed.set_random_seed(299)
      cudnn_inputs = array_ops.placeholder(
          dtypes.float32, shape=[seq_length, batch_size, input_size])
      (cudnn_output_tuple, cudnn_model) = _BuildCudnnForward(
          rnn_mode, num_layers, num_units, cudnn_inputs, is_training=False)
      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)

      inference_input = np.random.rand(seq_length, batch_size,
                                       input_size).astype(np.float32)
      with self.test_session(use_gpu=True,
                             graph=ops.get_default_graph()) as sess:
        sess.run(variables.global_variables_initializer())
        saver.restore(sess, save_path)
        # Cudnn inference
        cudnn_output = sess.run(
            cudnn_output_tuple, feed_dict={cudnn_inputs: inference_input})

    # Canonical RNN inference graph
    with ops.Graph().as_default():
      random_seed.set_random_seed(299)
      cell_inputs = array_ops.placeholder(
          dtypes.float32, shape=[seq_length, batch_size, input_size])
      (output, states) = _CreateCudnnCompatibleCanonicalRNN(
          cudnn_model, cell_inputs)
      saver = saver_lib.Saver(write_version=saver_pb2.SaverDef.V2)

      with self.test_session(use_gpu=True,
                             graph=ops.get_default_graph()) as sess:
        saver.restore(sess, save_path)
        # BlockCell inference
        output_v, states_v = sess.run(
            [output, states], feed_dict={cell_inputs: inference_input})

        # Outputs across time steps are packed into one tensor.
        self.assertAllClose(cudnn_output[0], output_v, atol=1e-6, rtol=1e-6)
        for i in range(num_layers):
          if has_state_c:
            # output_h
            self.assertAllClose(
                cudnn_output[1][i, :], states_v[i].h, atol=1e-6, rtol=1e-6)
            # output_c
            self.assertAllClose(
                cudnn_output[2][i, :], states_v[i].c, atol=1e-6, rtol=1e-6)
          else:
            self.assertAllClose(
                cudnn_output[1][i, :], states_v[i], atol=1e-6, rtol=1e-6)
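Outside of tests, this compatibility property is what lets a checkpoint trained with the cuDNN implementation be served with canonical cells. A rough sketch using the TF 1.x contrib API; the module path is version-dependent (contrib was removed in TF 2.x) and should be treated as an assumption:

import tensorflow.compat.v1 as tf
from tensorflow.contrib import cudnn_rnn  # TF 1.x only

num_layers, num_units = 2, 4
# Canonical cells that share the CudnnLSTM variable layout, so a checkpoint
# written by the cuDNN implementation can be restored into this graph.
cells = [cudnn_rnn.CudnnCompatibleLSTMCell(num_units) for _ in range(num_layers)]
multi_cell = tf.nn.rnn_cell.MultiRNNCell(cells)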
class CudnnRNNTestTraining(TensorFlowTestCase):

  def _testOneSimpleTraining(self, rnn_mode, num_layers, num_units, input_size,
                             batch_size, seq_length, dir_count, dropout, dtype,
                             delta, tolerance):
    # Gradient checking runs two forward ops with almost the same input. Need to
    # make sure the drop patterns across the two runs are the same.
    logging.info("Training test with config: %s", locals())
    old_env_state = os.environ.get("TF_CUDNN_RESET_RND_GEN_STATE", str(False))
    os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = str(True)
    has_input_c = (rnn_mode == cudnn_rnn_ops.CUDNN_LSTM)
    random_seed.set_random_seed(5678)
    direction = (cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION if dir_count == 1 else
                 cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION)
    model = _CreateModel(
        rnn_mode,
        num_layers,
        num_units,
        input_size,
        direction=direction,
        dtype=dtype,
        dropout=dropout)
    params_size_t = model.params_size()
    input_data = variables.VariableV1(
        random_ops.random_uniform([seq_length, batch_size, input_size],
                                  dtype=dtype),
        dtype=dtype)
    input_h = variables.VariableV1(
        random_ops.random_uniform(
            [num_layers * dir_count, batch_size, num_units], dtype=dtype),
        dtype=dtype)
    params = variables.VariableV1(
        random_ops.random_uniform([params_size_t], dtype=dtype),
        validate_shape=False,
        dtype=dtype)
    if has_input_c:
      input_c = variables.VariableV1(
          random_ops.random_uniform(
              [num_layers * dir_count, batch_size, num_units], dtype=dtype),
          dtype=dtype)
      output, output_h, output_c = model(
          input_data=input_data,
          input_h=input_h,
          input_c=input_c,
          params=params)
    else:
      output, output_h = model(
          input_data=input_data, input_h=input_h, params=params)
    output_sum = math_ops.reduce_sum(output)
    output_h_sum = math_ops.reduce_sum(output_h)
    total_sum = output_sum + output_h_sum
    if has_input_c:
      output_c_sum = math_ops.reduce_sum(output_c)
      total_sum += output_c_sum

    with self.test_session(use_gpu=True,
                           graph=ops.get_default_graph()) as sess:
      params_size_v = sess.run(params_size_t)
      inputs_and_shapes = [
          (input_data, [seq_length, batch_size, input_size]),
          (input_h, [num_layers * dir_count, batch_size, num_units]),
          (params, [params_size_v]),
      ]
      if has_input_c:
        inputs_and_shapes.append(
            (input_c, [num_layers * dir_count, batch_size, num_units]))
      sess.run(variables.global_variables_initializer())
      all_inputs = [entry[0] for entry in inputs_and_shapes]
      all_shapes = [entry[1] for entry in inputs_and_shapes]

      err = gradient_checker.compute_gradient_error(
          all_inputs, all_shapes, total_sum, [1], delta=delta)

      self.assertLess(err, tolerance)
    os.environ["TF_CUDNN_RESET_RND_GEN_STATE"] = old_env_state

  @unittest.skipUnless(test.is_built_with_cuda(),
                       "Test only applicable when running on GPUs")
  def DISABLED_testSimpleTraining(self):
    # TODO(jamesqin): fix b/117989214
    test_configs = [
        {
            "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM,
            "dtype": dtypes.float64,
            "delta": 1e-4,
            "tolerance": 5e-6,
            "shape": {"num_layers": 2, "num_units": 3, "input_size": 4,
                      "batch_size": 3, "seq_length": 4, "dir_count": 1},
        },
        {
            "rnn_mode": cudnn_rnn_ops.CUDNN_GRU,
            "dtype": dtypes.float64,
            "delta": 1e-4,
            "tolerance": 5e-6,
            "shape": {"num_layers": 2, "num_units": 3, "input_size": 4,
                      "batch_size": 3, "seq_length": 4, "dir_count": 1},
        },
        {
            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH,
            "dtype": dtypes.float64,
            "delta": 1e-4,
            "tolerance": 5e-6,
            "shape": {"num_layers": 2, "num_units": 3, "input_size": 4,
                      "batch_size": 3, "seq_length": 4, "dir_count": 1},
        },
        {
            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU,
            "dtype": dtypes.float64,
            "delta": 1e-4,
            "tolerance": 5e-6,
            "shape": {"num_layers": 2, "num_units": 3, "input_size": 4,
                      "batch_size": 3, "seq_length": 4, "dir_count": 1},
        },
        {
            "rnn_mode": cudnn_rnn_ops.CUDNN_LSTM,
            "dtype": dtypes.float32,
            "tolerance": 1.5e-2,
            "shape": {"num_layers": 2, "num_units": 3, "input_size": 4,
                      "batch_size": 3, "seq_length": 4},
        },
        {
            "rnn_mode": cudnn_rnn_ops.CUDNN_GRU,
            "dtype": dtypes.float32,
            "tolerance": 4e-3,
            "shape": {"num_layers": 2, "num_units": 3, "input_size": 4,
                      "batch_size": 3, "seq_length": 4},
        },
        {
            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_TANH,
            "dtype": dtypes.float32,
            "tolerance": 5e-3,
            "shape": {"num_layers": 2, "num_units": 3, "input_size": 4,
                      "batch_size": 3, "seq_length": 4},
        },
        {
            "rnn_mode": cudnn_rnn_ops.CUDNN_RNN_RELU,
            "dtype": dtypes.float32,
            "tolerance": 5e-1,
            "shape": {"num_layers": 2, "num_units": 3, "input_size": 4,
                      "batch_size": 3, "seq_length": 4},
        },
    ]
    dropouts = [0., 0.5, 1.]
    dir_counts = [1]
    for config, dropout, dir_count in itertools.product(
        test_configs, dropouts, dir_counts):
      rnn_mode = config["rnn_mode"]
      dtype = config.get("dtype", dtypes.float32)
      delta = config.get("delta", 1e-3)
      tolerance = config["tolerance"]
      shape = config["shape"]

      with ops.Graph().as_default():
        self._testOneSimpleTraining(rnn_mode, shape["num_layers"],
                                    shape["num_units"], shape["input_size"],
                                    shape["batch_size"], shape["seq_length"],
                                    dir_count, dropout, dtype, delta, tolerance)
  def testBuildInfo(self):
    self.assertEqual(build_info.is_cuda_build, test.is_built_with_cuda())
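This snippet reads is_cuda_build as a module attribute, whereas newer TensorFlow releases expose the same information as a build_info dict on that module. A defensive reader that tolerates both layouts (illustrative only):

from tensorflow.python.platform import build_info


def cuda_build_flag():
    """Reads the CUDA-build flag across TF versions that expose it differently."""
    if hasattr(build_info, "build_info"):  # newer releases: a dict of build metadata
        return build_info.build_info.get("is_cuda_build", False)
    return getattr(build_info, "is_cuda_build", False)  # older releases: module attribute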