Example #1
def _create_cudnn_compatible_canonical_rnn(cudnn_model,
                                           inputs,
                                           use_block_cell,
                                           scope="rnn"):
    model = cudnn_model.rnn_mode
    if model not in (cudnn_rnn_ops.CUDNN_LSTM, cudnn_rnn_ops.CUDNN_GRU):
        raise ValueError("%s is not supported!" % model)
    if model == cudnn_rnn_ops.CUDNN_GRU and use_block_cell:
        raise ValueError("gru is not supported when using block cell!")

    num_units = cudnn_model.num_units
    num_layers = cudnn_model.num_layers
    # To reuse cuDNN-trained models, must use cudnn compatible rnn cells.
    if use_block_cell:
        single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleLSTMBlockCell(
            num_units)
    else:
        if model == cudnn_rnn_ops.CUDNN_LSTM:
            single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleLSTMCell(
                num_units)
        else:
            single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleGRUCell(
                num_units)
    cell = rnn_cell_impl.MultiRNNCell(
        [single_cell() for _ in range(num_layers)])
    return rnn_lib.dynamic_rnn(cell,
                               inputs,
                               dtype=dtypes.float32,
                               time_major=True,
                               scope=scope)
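The helper above rebuilds the cuDNN model's layer stack with platform-independent cells, which is what lets weights trained with CudnnLSTM/CudnnGRU be reused in an ordinary dynamic_rnn graph. A minimal usage sketch follows; cudnn_model and the time-major float32 inputs tensor are assumed to come from the surrounding test, and all names besides the helper itself are illustrative.

# Hypothetical call of the helper above (not part of the original example).
outputs, final_state = _create_cudnn_compatible_canonical_rnn(
    cudnn_model, inputs, use_block_cell=False, scope="rnn")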
Example #2
def _CreateCudnnCompatibleCanonicalRNN(cudnn_model, inputs, scope=None):
    model = cudnn_model.rnn_mode
    if model not in (cudnn_rnn_ops.CUDNN_LSTM, cudnn_rnn_ops.CUDNN_GRU):
        raise ValueError("%s is not supported!" % model)

    num_units = cudnn_model.num_units
    num_layers = cudnn_model.num_layers
    # To reuse cuDNN-trained models, must use cudnn compatible rnn cells.
    if model == cudnn_rnn_ops.CUDNN_LSTM:
        single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleLSTMCell(num_units)
    else:
        single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units)
    cell = rnn_cell_impl.MultiRNNCell(
        [single_cell() for _ in range(num_layers)])
    return rnn_lib.dynamic_rnn(cell,
                               inputs,
                               dtype=dtypes.float32,
                               time_major=True,
                               scope=scope)
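As in Example #1, the "reuse cuDNN-trained models" comment refers to the checkpoint round-trip: the canonical graph built here exposes the same kinds of variables that the cuDNN layer's saveable writes, so a standard Saver can restore them, provided the variable scopes match the names in the checkpoint. A hedged sketch, assuming TF 1.x APIs and an illustrative checkpoint directory:

# Hypothetical restore sketch; "ckpt_dir" and the tf alias are assumptions.
import tensorflow as tf

outputs, state = _CreateCudnnCompatibleCanonicalRNN(cudnn_model, inputs)
saver = tf.train.Saver()
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint("ckpt_dir"))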
Example #3
def _CreateCudnnCompatibleCanonicalRNN(rnn, inputs, is_bidi=False, scope=None):
    mode = rnn.rnn_mode
    num_units = rnn.num_units
    num_layers = rnn.num_layers

    # To reuse cuDNN-trained models, must use cudnn compatible rnn cells.
    if mode == CUDNN_LSTM:
        single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleLSTMCell(num_units)
    elif mode == CUDNN_GRU:
        single_cell = lambda: cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units)
    elif mode == CUDNN_RNN_TANH:
        single_cell = (
            lambda: rnn_cell_impl.BasicRNNCell(num_units, math_ops.tanh))
    elif mode == CUDNN_RNN_RELU:
        single_cell = (
            lambda: rnn_cell_impl.BasicRNNCell(num_units, gen_nn_ops.relu))
    else:
        raise ValueError("%s is not supported!" % mode)

    if not is_bidi:
        cell = rnn_cell_impl.MultiRNNCell(
            [single_cell() for _ in range(num_layers)])
        return rnn_lib.dynamic_rnn(cell,
                                   inputs,
                                   dtype=dtypes.float32,
                                   time_major=True,
                                   scope=scope)
    else:
        cells_fw = [single_cell() for _ in range(num_layers)]
        cells_bw = [single_cell() for _ in range(num_layers)]

        (outputs, output_state_fw,
         output_state_bw) = contrib_rnn_lib.stack_bidirectional_dynamic_rnn(
             cells_fw,
             cells_bw,
             inputs,
             dtype=dtypes.float32,
             time_major=True,
             scope=scope)
        return outputs, (output_state_fw, output_state_bw)
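For the bidirectional path the helper returns the stacked outputs together with the per-direction final states. A hypothetical call with illustrative names:

# Hypothetical bidirectional usage; rnn and inputs come from the caller.
outputs, (state_fw, state_bw) = _CreateCudnnCompatibleCanonicalRNN(
    rnn, inputs, is_bidi=True, scope="rnn")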
Example #4
def __call__(self, is_train, scope=None):
    return cudnn_rnn_ops.CudnnCompatibleGRUCell(self.num_units)
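This __call__ appears to come from a cell-factory wrapper: calling the object returns a fresh CudnnCompatibleGRUCell of the configured width, and is_train/scope are accepted only for interface compatibility. A self-contained sketch of such a wrapper (the class name and constructor are assumptions, not part of the original source):

class GRUCellFactory(object):  # hypothetical enclosing class
    def __init__(self, num_units):
        self.num_units = num_units

    def __call__(self, is_train, scope=None):
        # is_train and scope are unused; kept for interface compatibility.
        return cudnn_rnn_ops.CudnnCompatibleGRUCell(self.num_units)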
Example #5
def RunGRU(sess,
           num_units,
           input_size,
           batch_size,
           time,
           num_layers=1,
           is_training=True,
           variable_seq_lengths=False,
           time_major=True,
           dynamic_shape_input=False,
           dropout=0.,
           num_dirs=1,
           dtype=dtypes.float32):
  # TODO(jamesqin): add multi-layer tests.
  # TODO(jamesqin): add multi-dir tests
  assert num_layers == 1
  assert num_dirs == 1
  if is_training and not np.isclose(dropout, 0):
    raise ValueError("dropout must be 0 when testing the training path.")

  # set graph level random seed and numpy random seed.
  random_seed.set_random_seed(0)
  np.random.seed(0)

  shape = ([time, batch_size, input_size]
           if time_major else [batch_size, time, input_size])
  inputs_np = np.random.rand(*shape).astype(dtype.as_numpy_dtype)
  inputs_static = variable_scope.get_variable(
      "inputs", initializer=inputs_np, dtype=dtype)
  inputs_dynamic = array_ops.placeholder(
      dtype, shape=[None, None, None], name="inputs")
  inputs = inputs_dynamic if dynamic_shape_input else inputs_static
  initial_h_op = variable_scope.get_variable(
      "initial_h_op",
      initializer=np.random.rand(batch_size,
                                 num_units).astype(dtype.as_numpy_dtype),
      dtype=dtype)

  if variable_seq_lengths:
    lengths_v = np.random.randint(low=1, high=time + 1, size=batch_size)
    lengths_v[0] = time  # make sure the max sequence has 'time' elems
    lengths = ops.convert_to_tensor(lengths_v.astype(np.int32))
  else:
    lengths = None

  initializer = init_ops.random_uniform_initializer(
      -0.01, 0.01, dtype=dtype, seed=19980904)
  with variable_scope.variable_scope("test", initializer=initializer):
    gate_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/gates/kernel",
        shape=[input_size + num_units, num_units * 2],
        dtype=dtype)
    gate_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/gates/bias",
        shape=[num_units * 2],
        dtype=dtype)
    candidate_inp_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/kernel",
        shape=[input_size, num_units],
        dtype=dtype)
    candidate_inp_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/input_projection/bias",
        shape=[num_units],
        dtype=dtype)
    candidate_hid_kernel = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/kernel",
        shape=[num_units, num_units],
        dtype=dtype)
    candidate_hid_bias = variable_scope.get_variable(
        "rnn/cudnn_compatible_gru_cell/candidate/hidden_projection/bias",
        shape=[num_units],
        dtype=dtype)

    cell = cudnn_rnn_ops.CudnnCompatibleGRUCell(num_units, reuse=True)
    outputs_op, h_op = rnn.dynamic_rnn(
        cell,
        inputs_static,
        sequence_length=lengths,
        initial_state=initial_h_op,
        dtype=dtype,
        time_major=time_major,
        scope=None)

  ws = [gate_kernel, candidate_inp_kernel, candidate_hid_kernel]
  bs = [gate_bias, candidate_inp_bias, candidate_hid_bias]
  # Convert to cudnn opaque param.
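  # The opaque param is the single flat buffer that the cuDNN kernels consume;
  # the converter packs the per-gate TF weights and biases into it, and
  # opaque_to_tf_canonical (used for the gradients below) unpacks it again.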
  format_converter = cudnn_rnn_ops.CudnnParamsFormatConverterGRU(
      num_layers, num_units, input_size)
  opaque_params = format_converter.tf_canonical_to_opaque(ws + bs)


  cu_initial_h_op = array_ops.expand_dims(
      initial_h_op, axis=(0 if time_major else 1))
  cu_outputs_op, cu_h_op, _ = cudnn_rnn_ops._cudnn_rnn(
      inputs,
      cu_initial_h_op,
      array_ops.zeros_like(cu_initial_h_op),  # not used
      opaque_params,
      sequence_lengths=lengths,
      time_major=time_major,
      dropout=dropout,
      is_training=is_training,
      rnn_mode=cudnn_rnn_ops.CUDNN_GRU)

  if is_training:
    (inp_grad_op, hgrad_op, gk_grad_op, cik_grad_op, chk_grad_op, gb_grad_op,
     cib_grad_op, chb_grad_op) = gradients_impl.gradients(
         outputs_op, [inputs_static, initial_h_op] + ws + bs)

    (cu_inp_grad_op, cu_hgrad_op, opaque_grad_op) = gradients_impl.gradients(
        cu_outputs_op, [inputs, cu_initial_h_op, opaque_params])
    # Remove the trivial 1st dimension
    cu_hgrad_op = array_ops.squeeze(cu_hgrad_op, axis=0 if time_major else 1)

    cu_wgrad_op, cu_bgrad_op = format_converter.opaque_to_tf_canonical(
        opaque_grad_op)
    (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op) = cu_wgrad_op
    (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op) = cu_bgrad_op
    # cuDNN GRU keeps two biases for the reset/update gates (one on the input
    # projection, one on the recurrent projection), while the TF canonical
    # format sums them into a single gates bias. Both cuDNN biases receive the
    # same gradient as that single TF bias, so the converted gradient is twice
    # the TF one and is halved before comparing with the TF GRU.
    cu_gb_grad_op *= 0.5

  init_op = variables.global_variables_initializer()
  sess.run(init_op)

  if is_training:
    outputs, h, inp_grad, hgrad, wgrad, bgrad = sess.run([
        outputs_op, h_op, inp_grad_op, hgrad_op,
        (gk_grad_op, cik_grad_op, chk_grad_op),
        (gb_grad_op, cib_grad_op, chb_grad_op)
    ])
    (cu_outputs, cu_h, cu_inp_grad, cu_hgrad, cu_wgrad, cu_bgrad) = sess.run(
        [
            cu_outputs_op, cu_h_op, cu_inp_grad_op, cu_hgrad_op,
            (cu_gk_grad_op, cu_cik_grad_op, cu_chk_grad_op),
            (cu_gb_grad_op, cu_cib_grad_op, cu_chb_grad_op)
        ],
        feed_dict={inputs: inputs_np} if dynamic_shape_input else None)
    # Remove the trivial 1st dimension
    cu_h = np.squeeze(cu_h, axis=0 if time_major else 1)

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "h: %s" % h)
    logging.vlog(1, "cu_h: %s" % cu_h)
    logging.vlog(1, "inp_grad: %s" % inp_grad)
    logging.vlog(1, "cu_inp_grad: %s" % cu_inp_grad)
    logging.vlog(1, "hgrad: %s" % hgrad)
    logging.vlog(1, "cu_hgrad: %s" % cu_hgrad)
    logging.vlog(1, "wgrad: %s" % str(wgrad))
    logging.vlog(1, "bgrad: %s" % str(bgrad))
    logging.vlog(1, "cu_wgrad: %s" % str(cu_wgrad))
    logging.vlog(1, "cu_bgrad: %s" % str(cu_bgrad))
    return (outputs, cu_outputs, h, cu_h, inp_grad, cu_inp_grad, hgrad,
            cu_hgrad, wgrad, bgrad, cu_wgrad, cu_bgrad)
  else:
    outputs, h = sess.run([outputs_op, h_op])
    cu_outputs, cu_h = sess.run([cu_outputs_op, cu_h_op],
                                feed_dict=({
                                    inputs: inputs_np
                                } if dynamic_shape_input else None))
    # Remove the trivial 1st dimension.
    cu_h = np.squeeze(cu_h, axis=0 if time_major else 1)

    logging.vlog(1, "outputs: %s" % outputs)
    logging.vlog(1, "cu_outputs: %s" % cu_outputs)
    logging.vlog(1, "h: %s" % h)
    logging.vlog(1, "cu_h: %s" % cu_h)
  return outputs, cu_outputs, h, cu_h
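A typical driver compares the TF-canonical and cuDNN results returned by RunGRU elementwise. A hypothetical harness is sketched below; it assumes a CUDA-enabled device for the _cudnn_rnn path, and the session import, shapes, and tolerances are illustrative rather than taken from the original test.

# Hypothetical driver for RunGRU; the scaffolding below is an assumption.
from tensorflow.python.client import session as session_lib

with ops.Graph().as_default(), session_lib.Session() as sess:
  outputs, cu_outputs, h, cu_h = RunGRU(
      sess, num_units=4, input_size=3, batch_size=2, time=5,
      is_training=False)
np.testing.assert_allclose(outputs, cu_outputs, rtol=1e-5, atol=1e-5)
np.testing.assert_allclose(h, cu_h, rtol=1e-5, atol=1e-5)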