def _forward(self, inputs, h, c, opaque_params, training):
  """Runs the underlying cudnn RNN kernel and repacks its results.

  Thin wrapper around `cudnn_rnn_ops._cudnn_rnn` that forwards this layer's
  configured rnn mode, input mode, direction, dropout and seed, then groups
  the hidden and cell outputs into a single state tuple.

  Args:
    inputs: input tensor fed to the cudnn RNN op.
    h: initial hidden state.
    c: initial cell state.
    opaque_params: the cudnn opaque parameter blob.
    training: whether the op should run in training mode.

  Returns:
    A pair `(output, (output_h, output_c))`.
  """
  rnn_results = cudnn_rnn_ops._cudnn_rnn(  # pylint:disable=protected-access
      inputs, h, c, opaque_params, training, self._rnn_mode,
      input_mode=self._input_mode, direction=self._direction,
      dropout=self._dropout, seed=self._seed)
  output, new_h, new_c = rnn_results
  return output, (new_h, new_c)
def _forward(self, inputs, h, c, opaque_params, training):
  """Executes the cudnn RNN op and returns `(output, state_tuple)`.

  NOTE(review): this definition is textually identical to the `_forward`
  immediately above. If both live in the same class, the second binding
  silently replaces the first — confirm against the enclosing class
  headers (not visible in this chunk) whether one copy should be removed.
  """
  out, out_h, out_c = cudnn_rnn_ops._cudnn_rnn(  # pylint:disable=protected-access
      inputs,
      h,
      c,
      opaque_params,
      training,
      self._rnn_mode,
      input_mode=self._input_mode,
      direction=self._direction,
      dropout=self._dropout,
      seed=self._seed)
  state = (out_h, out_c)
  return out, state
def RunLSTM(sess,
            num_units,
            input_size,
            batch_size,
            time,
            num_layers=1,
            variable_seq_lengths=False,
            time_major=True,
            dynamic_shape_input=False,
            is_training=True,
            dropout=0.,
            num_dirs=1,
            dtype=dtypes.float32):
  """Builds and runs a canonical TF LSTM and its cudnn counterpart.

  Both graphs share the same numpy-initialized inputs, initial states and
  (format-converted) weights, so the caller can compare their outputs and,
  when training, their gradients.

  Args:
    sess: TF session used to run both graphs.
    num_units: number of LSTM units.
    input_size: per-timestep input width.
    batch_size: batch size.
    time: max sequence length.
    num_layers: must be 1 (multi-layer not supported yet; see TODOs).
    variable_seq_lengths: if True, use random per-example sequence lengths
      (the first example is pinned to the full length `time`).
    time_major: if True tensors are [time, batch, ...], else
      [batch, time, ...].
    dynamic_shape_input: if True, feed the cudnn graph through a fully
      dynamic-shaped placeholder instead of the static-shape variable.
    is_training: if True, also build and run gradients.
    dropout: cudnn dropout probability; must be 0. when `is_training` so
      results stay deterministic and comparable.
    num_dirs: must be 1 (multi-dir not supported yet). Default was the bool
      `True`; changed to `1`, which is backward-compatible (True == 1).
    dtype: TF floating-point dtype for all tensors.

  Returns:
    When training: (outputs, cu_outputs, state_tuple, cu_state_tuple,
    inp_grad, cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad,
    cu_wgrad, cu_bgrad). Otherwise: (outputs, cu_outputs, state_tuple,
    cu_state_tuple).

  Raises:
    ValueError: if `is_training` and `dropout` is non-zero.
  """
  # TODO(jamesqin): add multi-layer tests.
  # TODO(jamesqin): add multi-dir tests
  assert num_layers == 1
  assert num_dirs == 1
  if is_training and not np.isclose(dropout, 0):
    # Non-zero dropout would make the cudnn result stochastic, so the two
    # graphs could not be compared. (Message fixed: it used to read
    # "dropout can not be 0.", the opposite of the condition checked.)
    raise ValueError("dropout must be 0. when testing training.")

  # set graph level random seed and numpy random seed.
  random_seed.set_random_seed(0)
  np.random.seed(0)

  shape = ([time, batch_size, input_size]
           if time_major else [batch_size, time, input_size])
  inputs_np = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
  inputs_static = variable_scope.get_variable(
      "inputs", initializer=inputs_np, dtype=dtype)
  inputs_dynamic = array_ops.placeholder(
      dtype, shape=[None, None, None], name="inputs")
  inputs = inputs_dynamic if dynamic_shape_input else inputs_static
  initial_h_op = variable_scope.get_variable(
      "initial_h_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)
  initial_c_op = variable_scope.get_variable(
      "initial_c_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)

  if variable_seq_lengths:
    lengths_v = np.random.randint(low=1, high=time + 1, size=batch_size)
    lengths_v[0] = time  # make sure the max sequence has 'time' elems
    lengths = ops.convert_to_tensor(lengths_v.astype(np.int32))
  else:
    lengths = None

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, dtype=dtype, seed=19980904)
  with variable_scope.variable_scope("test", initializer=initializer):
    w = variable_scope.get_variable(
        "rnn/lstm_cell/kernel",
        shape=[input_size + num_units, num_units * 4],
        dtype=dtype)
    b = variable_scope.get_variable(
        "rnn/lstm_cell/bias", shape=[num_units * 4], dtype=dtype)

    # canonical lstm. must set forget_bias to 0. to align with cudnn lstm.
    cell = rnn_cell_impl.LSTMCell(num_units, forget_bias=0., reuse=True)
    outputs_op, state_tuple_op = rnn.dynamic_rnn(
        cell,
        inputs_static,
        sequence_length=lengths,
        initial_state=rnn_cell_impl.LSTMStateTuple(
            h=initial_h_op, c=initial_c_op),
        dtype=dtype,
        time_major=time_major,
        scope=None)

  # Convert to cudnn opaque param.
  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterLSTM(
      num_layers, num_units, input_size)
  opaque_params = format_converter.tf_canonical_to_opaque([w, b])

  cu_initial_h_op = array_ops.expand_dims(
      initial_h_op, axis=(0 if time_major else 1))
  cu_initial_c_op = array_ops.expand_dims(
      initial_c_op, axis=(0 if time_major else 1))
  cu_outputs_op, cu_h_op, cu_c_op = cudnn_rnn_ops._cudnn_rnn(  # pylint:disable=protected-access
      inputs,
      cu_initial_h_op,
      cu_initial_c_op,
      opaque_params,
      sequence_lengths=lengths,
      time_major=time_major,
      dropout=dropout,
      is_training=is_training,
      rnn_mode=cudnn_rnn_ops.CUDNN_LSTM)
  # Remove the trivial 1st dimension.
  cu_state_tuple_op = rnn_cell_impl.LSTMStateTuple(
      c=array_ops.squeeze(cu_c_op, axis=0 if time_major else 1),
      h=array_ops.squeeze(cu_h_op, axis=0 if time_major else 1))

  if is_training:
    (inp_grad_op, hgrad_op, cgrad_op, wgrad_op,
     bgrad_op) = gradients_impl.gradients(
         outputs_op, [inputs_static, initial_h_op, initial_c_op, w, b])

    (cu_inp_grad_op, cu_hgrad_op, cu_cgrad_op,
     opaque_grad_op) = gradients_impl.gradients(
         cu_outputs_op,
         [inputs, cu_initial_h_op, cu_initial_c_op, opaque_params])
    # Remove the trivial 1st dimension
    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0 if time_major else 1)
    # Remove the trivial 1st dimension
    cu_cgrad_op = array_ops.squeeze(cu_cgrad_op, axis=0 if time_major else 1)

    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
        opaque_grad_op)
    cu_wgrad_op = cu_wgrad_op[0]
    cu_bgrad_op = cu_bgrad_op[0]
    # cudnn lstm has 2 biases each gate. When converting to tf canonical
    # format, the two biases are summed into one. Thus here bias gradient
    # should be halved when comparing with tf lstm.
    cu_bgrad_op *= 0.5

  init_op = variables.global_variables_initializer()
  sess.run(init_op)

  if is_training:
    outputs, state_tuple, inp_grad, state_grad, wgrad, bgrad = sess.run([
        outputs_op, state_tuple_op, inp_grad_op, (hgrad_op, cgrad_op),
        wgrad_op, bgrad_op
    ])
    (cu_outputs, cu_state_tuple, cu_inp_grad, cu_state_grad, cu_wgrad,
     cu_bgrad) = sess.run(
         [
             cu_outputs_op, cu_state_tuple_op, cu_inp_grad_op,
             (cu_hgrad_op, cu_cgrad_op), cu_wgrad_op, cu_bgrad_op
         ],
         feed_dict={inputs: inputs_np} if dynamic_shape_input else None)

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
    logging.vlog(1, "inp_grad: %s" % inp_grad)
    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
    logging.vlog(1, "state_grad: %s" % str(state_grad))
    logging.vlog(1, "cu_state_grad: %s" % str(cu_state_grad))
    logging.vlog(1, "wgrad: %s" % str(wgrad))
    logging.vlog(1, "bgrad: %s" % str(bgrad))
    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
    return (outputs, cu_outputs, state_tuple, cu_state_tuple, inp_grad,
            cu_inp_grad, state_grad, cu_state_grad, wgrad, bgrad, cu_wgrad,
            cu_bgrad)
  else:
    outputs, state_tuple = sess.run([outputs_op, state_tuple_op])
    cu_outputs, cu_state_tuple = sess.run(
        [cu_outputs_op, cu_state_tuple_op],
        feed_dict=({inputs: inputs_np} if dynamic_shape_input else None))

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "state_tuple: %s" % str(state_tuple))
    logging.vlog(1, "cu_state_tuple: %s" % str(cu_state_tuple))
    return outputs, cu_outputs, state_tuple, cu_state_tuple
def RunGRU(sess,
           num_units,
           input_size,
           batch_size,
           time,
           num_layers=1,
           is_training=True,
           variable_seq_lengths=False,
           time_major=True,
           dynamic_shape_input=False,
           dropout=0.,
           num_dirs=1,
           dtype=dtypes.float32):
  """Builds and runs a cudnn-compatible TF GRU and its cudnn counterpart.

  Both graphs share the same numpy-initialized inputs, initial state and
  (format-converted) weights, so the caller can compare their outputs and,
  when training, their gradients.

  Args:
    sess: TF session used to run both graphs.
    num_units: number of GRU units.
    input_size: per-timestep input width.
    batch_size: batch size.
    time: max sequence length.
    num_layers: must be 1 (multi-layer not supported yet; see TODOs).
    is_training: if True, also build and run gradients.
    variable_seq_lengths: if True, use random per-example sequence lengths
      (the first example is pinned to the full length `time`).
    time_major: if True tensors are [time, batch, ...], else
      [batch, time, ...].
    dynamic_shape_input: if True, feed the cudnn graph through a fully
      dynamic-shaped placeholder instead of the static-shape variable.
    dropout: cudnn dropout probability; must be 0. when `is_training` so
      results stay deterministic and comparable.
    num_dirs: must be 1 (multi-dir not supported yet). Default was the bool
      `True`; changed to `1`, which is backward-compatible (True == 1).
    dtype: TF floating-point dtype for all tensors.

  Returns:
    When training: (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad,
    hgrad, cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad). Otherwise:
    (outputs, cu_outputs, h, cu_h).

  Raises:
    ValueError: if `is_training` and `dropout` is non-zero.
  """
  # TODO(jamesqin): add multi-layer tests.
  # TODO(jamesqin): add multi-dir tests
  assert num_layers == 1
  assert num_dirs == 1
  if is_training and not np.isclose(dropout, 0):
    # Non-zero dropout would make the cudnn result stochastic, so the two
    # graphs could not be compared. (Message fixed: it used to read
    # "dropout can not be 0.", the opposite of the condition checked.)
    raise ValueError("dropout must be 0. when testing training.")

  # set graph level random seed and numpy random seed.
  random_seed.set_random_seed(0)
  np.random.seed(0)

  shape = ([time, batch_size, input_size]
           if time_major else [batch_size, time, input_size])
  inputs_np = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
  inputs_static = variable_scope.get_variable(
      "inputs", initializer=inputs_np, dtype=dtype)
  inputs_dynamic = array_ops.placeholder(
      dtype, shape=[None, None, None], name="inputs")
  inputs = inputs_dynamic if dynamic_shape_input else inputs_static
  initial_h_op = variable_scope.get_variable(
      "initial_h_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)

  if variable_seq_lengths:
    lengths_v = np.random.randint(low=1, high=time + 1, size=batch_size)
    lengths_v[0] = time  # make sure the max sequence has 'time' elems
    lengths = ops.convert_to_tensor(lengths_v.astype(np.int32))
  else:
    lengths = None

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, dtype=dtype, seed=19980904)
  with variable_scope.variable_scope("test", initializer=initializer):
    gate_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/gates/kernel",
        shape=[input_size + num_units, num_units * 2],
        dtype=dtype)
    gate_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/gates/bias",
        shape=[num_units * 2],
        dtype=dtype)
    candidate_inp_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/kernel",
        shape=[input_size, num_units],
        dtype=dtype)
    candidate_inp_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/bias",
        shape=[num_units],
        dtype=dtype)
    candidate_hid_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/kernel",
        shape=[num_units, num_units],
        dtype=dtype)
    candidate_hid_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/bias",
        shape=[num_units],
        dtype=dtype)

    cell = cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units, reuse=True)
    outputs_op, h_op = rnn.dynamic_rnn(
        cell,
        inputs_static,
        sequence_length=lengths,
        initial_state=initial_h_op,
        dtype=dtype,
        time_major=time_major,
        scope=None)

  ws = [gate_kernel, candidate_inp_kernel, candidate_hid_kernel]
  bs = [gate_bias, candidate_inp_bias, candidate_hid_bias]
  # Convert to cudnn opaque param.
  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU(
      num_layers, num_units, input_size)
  opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)

  cu_initial_h_op = array_ops.expand_dims(
      initial_h_op, axis=(0 if time_major else 1))
  cu_outputs_op, cu_h_op, _ = cudnn_rnn_ops._cudnn_rnn(  # pylint:disable=protected-access
      inputs,
      cu_initial_h_op,
      array_ops.zeros_like(cu_initial_h_op),  # not used
      opaque_params,
      sequence_lengths=lengths,
      time_major=time_major,
      dropout=dropout,
      is_training=is_training,
      rnn_mode=cudnn_rnn_ops.CUDNN_GRU)

  if is_training:
    (inp_grad_op, hgrad_op, gk_grad_op, cik_grad_op, chk_grad_op, gb_grad_op,
     cib_grad_op, chb_grad_op) = gradients_impl.gradients(
         outputs_op, [inputs_static, initial_h_op] + ws + bs)

    (cu_inp_grad_op, cu_hgrad_op, opaque_grad_op) = gradients_impl.gradients(
        cu_outputs_op, [inputs, cu_initial_h_op, opaque_params])
    # Remove the trivial 1st dimension
    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0 if time_major else 1)

    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
        opaque_grad_op)
    (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op) = cu_wgrad_op
    (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op) = cu_bgrad_op
    # cudnn gru has 2 biases for reset and update gates. When converting to tf
    # canonical format, the two biases are summed into one. Thus here relevant
    # bias gradient should be halved before comparing with tf gru.
    cu_gb_grad_op *= 0.5

  init_op = variables.global_variables_initializer()
  sess.run(init_op)

  if is_training:
    outputs, h, inp_grad, hgrad, wgrad, bgrad = sess.run([
        outputs_op, h_op, inp_grad_op, hgrad_op,
        (gk_grad_op, cik_grad_op, chk_grad_op),
        (gb_grad_op, cib_grad_op, chb_grad_op)
    ])
    (cu_outputs, cu_h, cu_inp_grad, cu_hgrad, cu_wgrad, cu_bgrad) = sess.run(
        [
            cu_outputs_op, cu_h_op, cu_inp_grad_op, cu_hgrad_op,
            (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op),
            (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op)
        ],
        feed_dict={inputs: inputs_np} if dynamic_shape_input else None)
    # Remove the trivial 1st dimension
    cu_h = np.squeeze(cu_h, axis=0 if time_major else 1)

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "h: %s" % h)
    # Bug fix: this used to log `h` under the "cu_h" label.
    logging.vlog(1, "cu_h: %s" % cu_h)
    logging.vlog(1, "inp_grad: %s" % inp_grad)
    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
    logging.vlog(1, "hgrad: %s" % hgrad)
    logging.vlog(1, "cu_hgrad: %s" % cu_hgrad)
    logging.vlog(1, "wgrad: %s" % str(wgrad))
    logging.vlog(1, "bgrad: %s" % str(bgrad))
    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
    return (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad,
            cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad)
  else:
    outputs, h = sess.run([outputs_op, h_op])
    cu_outputs, cu_h = sess.run(
        [cu_outputs_op, cu_h_op],
        feed_dict=({inputs: inputs_np} if dynamic_shape_input else None))
    # Remove the trivial 1st dimension.
    cu_h = np.squeeze(cu_h, axis=0 if time_major else 1)

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "h: %s" % h)
    # Bug fix: this used to log `h` under the "cu_h" label.
    logging.vlog(1, "cu_h: %s" % cu_h)
    return outputs, cu_outputs, h, cu_h