def _test_lstm_helper(self, num_units, input_size, num_layers, direction):
    """Round-trips LSTM params through the cudnn opaque format.

    Checks that tf_canonical_to_opaque() followed by opaque_to_tf_canonical()
    reproduces the original weights and biases, and that the reported opaque
    param buffer size is at least as large as the canonical params it holds.

    Args:
      num_units: int, LSTM hidden size.
      input_size: int, input feature dimension.
      num_layers: int, number of stacked LSTM layers.
      direction: cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION or
        cudnn_rnn_ops.CUDNN_RNN_BIDIRECTION.
    """
    with self.session(use_gpu=True) as sess:
      random_seed.set_random_seed(0)
      np.random.seed(0)

      num_dirs = 1 if direction == cudnn_rnn_ops.CUDNN_RNN_UNIDIRECTION else 2
      format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
          num_layers, num_units, input_size, direction=direction)

      # One (kernel, bias) pair per layer per direction, in cudnn's expected
      # canonical layout: kernel is [input_size + num_units, 4 * num_units].
      ws, bs = [], []
      for _ in range(num_layers * num_dirs):
        w = constant_op.constant(
            np.random.rand(input_size + num_units, 4 * num_units),
            dtype=dtypes.float32)
        b = constant_op.constant(
            np.random.rand(4 * num_units), dtype=dtypes.float32)
        ws.append(w)
        bs.append(b)

      opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)
      opaque_params_size = cudnn_rnn_ops.cudnn_rnn_opaque_params_size(
          cudnn_rnn_ops.CUDNN_LSTM,
          num_layers,
          num_units,
          input_size,
          direction=direction)
      ws_r, bs_r = format_converter.opaque_to_tf_canonical(opaque_params)

      # Test tf_canonical_to_opaque() followed by opaque_to_tf_canonical()
      # returns the original input.
      ws, ws_r, bs, bs_r = sess.run([ws, ws_r, bs, bs_r])
      for w, w_r in zip(ws, ws_r):
        self.assertAllClose(w, w_r)
      for b, b_r in zip(bs, bs_r):
        self.assertAllClose(b, b_r)

      # Test opaque_params size lower bound. The opaque buffer may carry
      # extra cudnn-internal state, so it only needs to be >= the canonical
      # element count. Use builtin sum() for both terms: np.sum() over a
      # generator is deprecated/unreliable across NumPy versions.
      opaque_params_size_v = sess.run(opaque_params_size)
      min_params_size = sum(x.size for x in ws) + sum(x.size for x in bs)
      logging.info("min_params_size: %d vs actual_opaque_param_size: %d",
                   min_params_size, opaque_params_size_v)
      self.assertLessEqual(min_params_size, opaque_params_size_v)
def RunLSTM(sess,
            num_units,
            input_size,
            batch_size,
            time,
            num_layers=1,
            variable_seq_lengths=False,
            time_major=True,
            dynamic_shape_input=False,
            is_training=True,
            dropout=0.,
            num_dirs=1,
            dtype=dtypes.float32):
  """Builds and runs a canonical TF LSTM and a cudnn LSTM side by side.

  Constructs an `rnn.dynamic_rnn` LSTM and a `cudnn_rnn_ops._cudnn_rnn` LSTM
  sharing the same (converted) parameters, runs both in `sess`, and returns
  the paired results so callers can compare them.

  Args:
    sess: an active tf.Session used to run the graph.
    num_units: int, LSTM hidden size.
    input_size: int, input feature dimension.
    batch_size: int, batch size.
    time: int, max sequence length.
    num_layers: int, must be 1 (multi-layer not yet supported here).
    variable_seq_lengths: if True, use random per-example sequence lengths.
    time_major: if True, inputs are [time, batch, input]; else
      [batch, time, input].
    dynamic_shape_input: if True, feed the cudnn path through a fully dynamic
      placeholder instead of a statically-shaped variable.
    is_training: if True, also build and run gradients.
    dropout: float, must be 0. when is_training is True, since cudnn dropout
      would make gradients incomparable.
    num_dirs: int, must be 1 (multi-direction not yet supported here).
    dtype: tf DType of inputs/params.

  Returns:
    When is_training:
      (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad,
       cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad,
       cu_bgrad)
    Otherwise:
      (outputs, cu_outputs, state_tuple, cu_state_tuple)

  Raises:
    ValueError: if is_training is True and dropout is non-zero.
  """
  # TODO(jamesqin): add multi-layer tests.
  # TODO(jamesqin): add multi-dir tests
  assert num_layers == 1
  assert num_dirs == 1
  if is_training and not np.isclose(dropout, 0):
    raise ValueError("dropout must be 0. when testing training.")

  # Set graph-level random seed and numpy random seed.
  random_seed.set_random_seed(0)
  np.random.seed(0)

  shape = ([time, batch_size, input_size]
           if time_major else [batch_size, time, input_size])
  inputs_np = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
  inputs_static = variable_scope.get_variable(
      "inputs", initializer=inputs_np, dtype=dtype)
  inputs_dynamic = array_ops.placeholder(
      dtype, shape=[None, None, None], name="inputs")
  # Only the cudnn path consumes `inputs`; the canonical rnn.dynamic_rnn path
  # below always uses the statically-shaped variable.
  inputs = inputs_dynamic if dynamic_shape_input else inputs_static

  initial_h_op = variable_scope.get_variable(
      "initial_h_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)
  initial_c_op = variable_scope.get_variable(
      "initial_c_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)

  if variable_seq_lengths:
    lengths_v = np.random.randint(low=1, high=time + 1, size=batch_size)
    lengths_v[0] = time  # make sure the max sequence has 'time' elems
    lengths = ops.convert_to_tensor(lengths_v.astype(np.int32))
  else:
    lengths = None

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, dtype=dtype, seed=19980904)
  with variable_scope.variable_scope("test", initializer=initializer):
    w = variable_scope.get_variable(
        "rnn/lstm_cell/kernel",
        shape=[input_size + num_units, num_units * 4],
        dtype=dtype)
    b = variable_scope.get_variable(
        "rnn/lstm_cell/bias", shape=[num_units * 4], dtype=dtype)

    # canonical lstm. must set forget_bias to 0. to align with cudnn lstm.
    cell = rnn_cell_impl.LSTMCell(num_units, forget_bias=0., reuse=True)
    outputs_op, state_tuple_op = rnn.dynamic_rnn(
        cell,
        inputs_static,
        sequence_length=lengths,
        initial_state=rnn_cell_impl.LSTMStateTuple(
            h=initial_h_op, c=initial_c_op),
        dtype=dtype,
        time_major=time_major,
        scope=None)

  # Convert to cudnn opaque param.
  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
      num_layers, num_units, input_size)
  opaque_params = format_converter.tf_canonical_to_opaque([w, b])

  # cudnn expects a leading num_layers*num_dirs dimension on the states.
  cu_initial_h_op = array_ops.expand_dims(
      initial_h_op, axis=(0 if time_major else 1))
  cu_initial_c_op = array_ops.expand_dims(
      initial_c_op, axis=(0 if time_major else 1))
  cu_outputs_op, cu_h_op, cu_c_op = cudnn_rnn_ops._cudnn_rnn(
      inputs,
      cu_initial_h_op,
      cu_initial_c_op,
      opaque_params,
      sequence_lengths=lengths,
      time_major=time_major,
      dropout=dropout,
      is_training=is_training,
      rnn_mode=cudnn_rnn_ops.CUDNN_LSTM)
  # Remove the trivial 1st dimension.
  cu_state_tuple_op = rnn_cell_impl.LSTMStateTuple(
      c=array_ops.squeeze(cu_c_op, axis=0 if time_major else 1),
      h=array_ops.squeeze(cu_h_op, axis=0 if time_major else 1))

  if is_training:
    (inp_grad_op, hgrad_op,
     cgrad_op, wgrad_op, bgrad_op) = gradients_impl.gradients(
         outputs_op, [inputs_static, initial_h_op, initial_c_op, w, b])

    (cu_inp_grad_op, cu_hgrad_op,
     cu_cgrad_op, opaque_grad_op) = gradients_impl.gradients(
         cu_outputs_op,
         [inputs, cu_initial_h_op, cu_initial_c_op, opaque_params])
    # Remove the trivial 1st dimension
    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0 if time_major else 1)
    # Remove the trivial 1st dimension
    cu_cgrad_op = array_ops.squeeze(cu_cgrad_op, axis=0 if time_major else 1)

    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
        opaque_grad_op)
    cu_wgrad_op = cu_wgrad_op[0]
    cu_bgrad_op = cu_bgrad_op[0]
    # cudnn lstm has 2 biases each gate. When converting to tf canonical
    # format, the two biases are summed into one. Thus here bias gradient
    # should be halved when comparing with tf lstm.
    cu_bgrad_op *= 0.5

  init_op = variables.global_variables_initializer()
  sess.run(init_op)

  if is_training:
    outputs, state_tuple, inp_grad, state_grad, wgrad, bgrad = sess.run([
        outputs_op, state_tuple_op, inp_grad_op, (hgrad_op, cgrad_op),
        wgrad_op, bgrad_op
    ])
    (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad,
     cu_bgrad) = sess.run(
         [
             cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op,
             (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op
         ],
         feed_dict={inputs: inputs_np} if dynamic_shape_input else None)

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
    logging.vlog(1, "inp_grad: %s" % inp_grad)
    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
    logging.vlog(1, "state_grad: %s" % str(state_grad))
    logging.vlog(1, "cu_state_grad: %s" % str(cu_state_grad))
    logging.vlog(1, "wgrad: %s" % str(wgrad))
    logging.vlog(1, "bgrad: %s" % str(bgrad))
    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
    return (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad,
            cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad,
            cu_bgrad)
  else:
    outputs, state_tuple = sess.run([outputs_op, state_tuple_op])
    cu_outputs, cu_state_tuple = sess.run([cu_outputs_op, cu_state_tuple_op],
                                          feed_dict=({
                                              inputs: inputs_np
                                          } if dynamic_shape_input else None))

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
  return outputs, cu_outputs, state_tuple, cu_state_tuple