def _create_cudnn_compatible_canonical_rnn(cudnn_model,
                                           inputs,
                                           use_block_cell,
                                           scope="rnn"):
  model = cudnn_model.rnn_mode
  if model not in (cudnn_rnn_ops.CUDNN_LSTM, cudnn_rnn_ops.CUDNN_GRU):
    raise ValueError("%s is not supported!" % model)
  if model == cudnn_rnn_ops.CUDNN_GRU and use_block_cell:
    raise ValueError("gru is not supported when using block cell!")

  num_units = cudnn_model.num_units
  num_layers = cudnn_model.num_layers
  # To reuse cuDNN-trained models, must use cudnn compatible rnn cells.
  if use_block_cell:
    single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleLSTMBlockCell(num_units)
  else:
    if model == cudnn_rnn_ops.CUDNN_LSTM:
      single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleLSTMCell(num_units)
    else:
      single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units)
  cell = rnn_cell_impl.MultiRNNCell([single_cell() for _ in range(num_layers)])
  return rnn_lib.dynamic_rnn(
      cell, inputs, dtype=dtypes.float32, time_major=True, scope=scope)
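# Hedged usage sketch (assumptions, not from the original source): the block
# cell path applies only to LSTM models, so `cudnn_model` is assumed to be an
# LSTM-mode model and `inputs` a time-major float32 tensor.
#
#   outputs, state = _create_cudnn_compatible_canonical_rnn(
#       cudnn_model, inputs, use_block_cell=True)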
def _CreateCudnnCompatibleCanonicalRNN(cudnn_model, inputs, scope=None):
  model = cudnn_model.rnn_mode
  if model not in (cudnn_rnn_ops.CUDNN_LSTM, cudnn_rnn_ops.CUDNN_GRU):
    raise ValueError("%s is not supported!" % model)

  num_units = cudnn_model.num_units
  num_layers = cudnn_model.num_layers
  # To reuse cuDNN-trained models, must use cudnn compatible rnn cells.
  if model == cudnn_rnn_ops.CUDNN_LSTM:
    single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleLSTMCell(num_units)
  else:
    single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units)
  cell = rnn_cell_impl.MultiRNNCell([single_cell() for _ in range(num_layers)])
  return rnn_lib.dynamic_rnn(
      cell, inputs, dtype=dtypes.float32, time_major=True, scope=scope)
def _CreateCudnnCompatibleCanonicalRNN(rnn, inputs, is_bidi=False, scope=None):
  mode = rnn.rnn_mode
  num_units = rnn.num_units
  num_layers = rnn.num_layers

  # To reuse cuDNN-trained models, must use cudnn compatible rnn cells.
  if mode == CUDNN_LSTM:
    single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleLSTMCell(num_units)
  elif mode == CUDNN_GRU:
    single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units)
  elif mode == CUDNN_RNN_TANH:
    single_cell = (lambda: rnn_cell_impl.BasicRNNCell(num_units, math_ops.tanh))
  elif mode == CUDNN_RNN_RELU:
    single_cell = (
        lambda: rnn_cell_impl.BasicRNNCell(num_units, gen_nn_ops.relu))
  else:
    raise ValueError("%s is not supported!" % mode)

  if not is_bidi:
    cell = rnn_cell_impl.MultiRNNCell(
        [single_cell() for _ in range(num_layers)])
    return rnn_lib.dynamic_rnn(
        cell, inputs, dtype=dtypes.float32, time_major=True, scope=scope)
  else:
    cells_fw = [single_cell() for _ in range(num_layers)]
    cells_bw = [single_cell() for _ in range(num_layers)]
    (outputs, output_state_fw,
     output_state_bw) = contrib_rnn_lib.stack_bidirectional_dynamic_rnn(
         cells_fw,
         cells_bw,
         inputs,
         dtype=dtypes.float32,
         time_major=True,
         scope=scope)
    return outputs, (output_state_fw, output_state_bw)
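# A minimal usage sketch, not from the original source: build a cuDNN GRU and
# its cudnn-compatible canonical counterpart side by side, so that weights
# trained with the cuDNN model can be restored into the canonical graph. The
# import path and the layer sizes here are assumptions for illustration.
def _example_build_equivalent_gru(inputs):
  from tensorflow.contrib.cudnn_rnn.python.layers import cudnn_rnn

  # `inputs` is assumed to be a time-major float32 tensor of shape
  # [time, batch_size, input_size].
  cudnn_model = cudnn_rnn.CudnnGRU(num_layers=2, num_units=128)
  cu_outputs, cu_state = cudnn_model(inputs)
  # The canonical graph uses CudnnCompatible* cells (see above), so
  # cuDNN-trained checkpoints load into it unchanged.
  outputs, state = _CreateCudnnCompatibleCanonicalRNN(cudnn_model, inputs)
  return (cu_outputs, cu_state), (outputs, state)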
def __call__(self, is_train, scope=None):
  # `is_train` and `scope` are ignored; this factory always returns a fresh
  # cuDNN-compatible GRU cell sized to `self.num_units`.
  return cudnn_rnn_ops.CudnnCompatibleGRUCell(self.num_units)
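# Hedged usage sketch (assumptions, not from the original source): a factory
# with the __call__ signature above can be invoked once per layer to build a
# cudnn-compatible stack. `factory` and `num_layers` are illustrative names.
#
#   cells = [factory(is_train=True) for _ in range(num_layers)]
#   stack = rnn_cell_impl.MultiRNNCell(cells)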
def RunGRU(sess,
           num_units,
           input_size,
           batch_size,
           time,
           num_layers=1,
           is_training=True,
           variable_seq_lengths=False,
           time_major=True,
           dynamic_shape_input=False,
           dropout=0.,
           num_dirs=1,
           dtype=dtypes.float32):
  # TODO(jamesqin): add multi-layer tests.
  # TODO(jamesqin): add multi-dir tests.
  assert num_layers == 1
  assert num_dirs == 1
  if is_training and not np.isclose(dropout, 0):
    raise ValueError("dropout must be 0. when testing training.")

  # Set graph-level random seed and numpy random seed.
  random_seed.set_random_seed(0)
  np.random.seed(0)

  shape = ([time, batch_size, input_size]
           if time_major else [batch_size, time, input_size])
  inputs_np = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
  inputs_static = variable_scope.get_variable(
      "inputs", initializer=inputs_np, dtype=dtype)
  inputs_dynamic = array_ops.placeholder(
      dtype, shape=[None, None, None], name="inputs")
  inputs = inputs_dynamic if dynamic_shape_input else inputs_static
  initial_h_op = variable_scope.get_variable(
      "initial_h_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)

  if variable_seq_lengths:
    lengths_v = np.random.randint(low=1, high=time + 1, size=batch_size)
    lengths_v[0] = time  # make sure the max sequence has 'time' elems
    lengths = ops.convert_to_tensor(lengths_v.astype(np.int32))
  else:
    lengths = None

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, dtype=dtype, seed=19980904)
  with variable_scope.variable_scope("test", initializer=initializer):
    gate_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/gates/kernel",
        shape=[input_size + num_units, num_units * 2],
        dtype=dtype)
    gate_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/gates/bias",
        shape=[num_units * 2],
        dtype=dtype)
    candidate_inp_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/kernel",
        shape=[input_size, num_units],
        dtype=dtype)
    candidate_inp_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/bias",
        shape=[num_units],
        dtype=dtype)
    candidate_hid_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/kernel",
        shape=[num_units, num_units],
        dtype=dtype)
    candidate_hid_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/bias",
        shape=[num_units],
        dtype=dtype)

    cell = cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units, reuse=True)
    outputs_op, h_op = rnn.dynamic_rnn(
        cell,
        inputs_static,
        sequence_length=lengths,
        initial_state=initial_h_op,
        dtype=dtype,
        time_major=time_major,
        scope=None)

  ws = [gate_kernel, candidate_inp_kernel, candidate_hid_kernel]
  bs = [gate_bias, candidate_inp_bias, candidate_hid_bias]
  # Convert to cudnn opaque param.
  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU(
      num_layers, num_units, input_size)
  opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)

  cu_initial_h_op = array_ops.expand_dims(
      initial_h_op, axis=(0 if time_major else 1))
  cu_outputs_op, cu_h_op, _ = cudnn_rnn_ops._cudnn_rnn(
      inputs,
      cu_initial_h_op,
      array_ops.zeros_like(cu_initial_h_op),  # not used
      opaque_params,
      sequence_lengths=lengths,
      time_major=time_major,
      dropout=dropout,
      is_training=is_training,
      rnn_mode=cudnn_rnn_ops.CUDNN_GRU)

  if is_training:
    (inp_grad_op, hgrad_op, gk_grad_op, cik_grad_op, chk_grad_op, gb_grad_op,
     cib_grad_op, chb_grad_op) = gradients_impl.gradients(
         outputs_op, [inputs_static, initial_h_op] + ws + bs)
    (cu_inp_grad_op, cu_hgrad_op, opaque_grad_op) = gradients_impl.gradients(
        cu_outputs_op, [inputs, cu_initial_h_op, opaque_params])
    # Remove the trivial 1st dimension.
    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0 if time_major else 1)

    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
        opaque_grad_op)
    (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op) = cu_wgrad_op
    (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op) = cu_bgrad_op
    # cudnn gru has 2 biases for reset and update gates. When converting to tf
    # canonical format, the two biases are summed into one. Thus here the
    # relevant bias gradient should be halved before comparing with tf gru.
    cu_gb_grad_op *= 0.5

  init_op = variables.global_variables_initializer()
  sess.run(init_op)

  if is_training:
    outputs, h, inp_grad, hgrad, wgrad, bgrad = sess.run([
        outputs_op, h_op, inp_grad_op, hgrad_op,
        (gk_grad_op, cik_grad_op, chk_grad_op),
        (gb_grad_op, cib_grad_op, chb_grad_op)
    ])
    (cu_outputs, cu_h, cu_inp_grad, cu_hgrad, cu_wgrad, cu_bgrad) = sess.run(
        [
            cu_outputs_op, cu_h_op, cu_inp_grad_op, cu_hgrad_op,
            (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op),
            (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op)
        ],
        feed_dict={inputs: inputs_np} if dynamic_shape_input else None)
    # Remove the trivial 1st dimension.
    cu_h = np.squeeze(cu_h, axis=0 if time_major else 1)

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "h: %s" % h)
    logging.vlog(1, "cu_h: %s" % cu_h)
    logging.vlog(1, "inp_grad: %s" % inp_grad)
    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
    logging.vlog(1, "hgrad: %s" % hgrad)
    logging.vlog(1, "cu_hgrad: %s" % cu_hgrad)
    logging.vlog(1, "wgrad: %s" % str(wgrad))
    logging.vlog(1, "bgrad: %s" % str(bgrad))
    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
    return (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad,
            cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad)
  else:
    outputs, h = sess.run([outputs_op, h_op])
    cu_outputs, cu_h = sess.run(
        [cu_outputs_op, cu_h_op],
        feed_dict=({inputs: inputs_np} if dynamic_shape_input else None))
    # Remove the trivial 1st dimension.
    cu_h = np.squeeze(cu_h, axis=0 if time_major else 1)

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "h: %s" % h)
    logging.vlog(1, "cu_h: %s" % cu_h)
    return outputs, cu_outputs, h, cu_h
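# Hedged usage sketch (assumptions, not from the original source): a typical
# test method would run RunGRU inside a GPU session and assert that the
# canonical and cuDNN results agree. The sizes below are illustrative.
#
#   with self.session(use_gpu=True) as sess:
#     outputs, cu_outputs, h, cu_h = RunGRU(
#         sess, num_units=2, input_size=4, batch_size=8, time=4,
#         is_training=False)
#     self.assertAllClose(outputs, cu_outputs)
#     self.assertAllClose(h, cu_h)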