Example #1
def ndlstm_base_unrolled(inputs, noutput, scope=None, reverse=False):
  """Run an LSTM, either forward or backward.

  This is a 1D LSTM implementation using unrolling and the TensorFlow
  LSTM op.

  Args:
    inputs: input sequence (length, batch_size, ninput)
    noutput: depth of output
    scope: optional scope name
    reverse: run LSTM in reverse

  Returns:
    Output sequence (length, batch_size, noutput)

  """
  with variable_scope.variable_scope(scope, "SeqLstmUnrolled", [inputs]):
    length, batch_size, _ = _shape(inputs)
    lstm_cell = core_rnn_cell_impl.BasicLSTMCell(noutput, state_is_tuple=False)
    state = array_ops.zeros([batch_size, lstm_cell.state_size])
    output_u = []
    inputs_u = array_ops.unstack(inputs)
    if reverse:
      inputs_u = list(reversed(inputs_u))
    for i in xrange(length):
      if i > 0:
        variable_scope.get_variable_scope().reuse_variables()
      output, state = lstm_cell(inputs_u[i], state)
      output_u += [output]
    if reverse:
      output_u = list(reversed(output_u))
    outputs = array_ops.stack(output_u)
    return outputs
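
A minimal usage sketch for the helper above (a sketch only: it assumes TF 1.x graph mode and that the module's own imports, e.g. core_rnn_cell_impl, are in place; the shapes are illustrative):

import tensorflow as tf

seq = tf.placeholder(tf.float32, [20, 8, 32])    # (length, batch_size, ninput)
fwd = ndlstm_base_unrolled(seq, 64, scope="fwd")                # forward pass
bwd = ndlstm_base_unrolled(seq, 64, scope="bwd", reverse=True)  # backward pass
bidi = tf.concat([fwd, bwd], axis=2)   # bidirectional features, depth 128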
Example #2
  def __call__(self, inputs, state, scope=None):
    """Run the cell on embedded inputs."""
    with vs.variable_scope(scope or type(self).__name__):  # "EmbeddingWrapper2"
      with ops.device("/cpu:0"):
        if self._initializer:
          initializer = self._initializer
        elif vs.get_variable_scope().initializer:
          initializer = vs.get_variable_scope().initializer
        else:
          # Default initializer for embeddings should have variance=1.
          sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
          initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)
        embeddings = []
        for i in xrange(len(self._embedding_classes)):
          embeddings.append(vs.get_variable(
              "embedding" + str(i),
              [self._embedding_classes[i], self._embedding_sizes[i]],
              initializer=initializer))
        embedded = []
        for i in xrange(len(self._embedding_classes)):
          embedded.append(embedding_ops.embedding_lookup(
              embeddings[i], array_ops.reshape(inputs[i], [-1])))

        finalEmbedded = tf.concat(1, embedded)

    return self._cell(finalEmbedded, state)
Example #3
def rnn_decoder(decoder_inputs, initial_state, cell, softmax_w, softmax_b,
                scope=None):
  # Currently only supports Mean Squared Error. Cross entropy could be
  # supported by changing the linear activation to an argmax over the logits.
  with variable_scope.variable_scope(scope or "rnn_decoder"):
    state_train = initial_state
    state_valid = initial_state
    outputs_train = []
    outputs_valid = []
    for i, inp in enumerate(decoder_inputs):
      if i > 0:
        variable_scope.get_variable_scope().reuse_variables()
      output_train, state_train = cell(inp, state_train)
      outputs_train.append(output_train)
      if i > 0:
        # From the second step on, the train and valid decoder inputs differ:
        # train uses the true decoder input, while valid feeds back the output
        # of the previous step.
        output_valid, state_valid = cell(tf.matmul(outputs_valid[-1],
            softmax_w) + softmax_b, state_valid)
      else:
        # For the first decoder step, the train and valid decoder inputs are
        # the same, since both are fed decoder_inputs[0].
        output_valid, state_valid = output_train, state_train
      outputs_valid.append(output_valid)
  return outputs_train, state_train, outputs_valid, state_valid
Example #4
 def testReturnsExistingConcatenatedValueIfReuse(self):
   with variable_scope.variable_scope(
       "scope0", partitioner=axis0_into2_partitioner):
     v_concat = variable_scope.get_variable("name0", shape=(3, 1, 1))
     variable_scope.get_variable_scope().reuse_variables()
     v_concat_2 = variable_scope.get_variable("name0", shape=(3, 1, 1))
     self.assertEqual(v_concat, v_concat_2)
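
The reuse idiom this test exercises reduces to a few lines; a minimal sketch, assuming the TF 1.x variable-scope API:

import tensorflow as tf

with tf.variable_scope("shared"):
  a = tf.get_variable("w", shape=[3])        # creates shared/w
  tf.get_variable_scope().reuse_variables()  # scope is now in reuse mode
  b = tf.get_variable("w", shape=[3])        # returns the existing variable
assert a is b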
Example #5
 def testAtrousFullyConvolutionalValues(self):
   """Verify dense feature extraction with atrous convolution."""
   nominal_stride = 32
   for output_stride in [4, 8, 16, 32, None]:
     with arg_scope(resnet_utils.resnet_arg_scope()):
       with ops.Graph().as_default():
         with self.test_session() as sess:
           random_seed.set_random_seed(0)
           inputs = create_test_input(2, 81, 81, 3)
           # Dense feature extraction followed by subsampling.
           output, _ = self._resnet_small(
               inputs,
               None,
               is_training=False,
               global_pool=False,
               output_stride=output_stride)
           if output_stride is None:
             factor = 1
           else:
             factor = nominal_stride // output_stride
           output = resnet_utils.subsample(output, factor)
           # Make the two networks use the same weights.
           variable_scope.get_variable_scope().reuse_variables()
           # Feature extraction at the nominal network rate.
           expected, _ = self._resnet_small(
               inputs, None, is_training=False, global_pool=False)
           sess.run(variables.global_variables_initializer())
           self.assertAllClose(
               output.eval(), expected.eval(), atol=1e-4, rtol=1e-4)
Example #6
  def __call__(self, inputs, state, scope=None):
    """Run the cell on embedded inputs."""
    with _checked_scope(self, scope or "embedding_wrapper", reuse=self._reuse):
      with ops.device("/cpu:0"):
        if self._initializer:
          initializer = self._initializer
        elif vs.get_variable_scope().initializer:
          initializer = vs.get_variable_scope().initializer
        else:
          # Default initializer for embeddings should have variance=1.
          sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
          initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)

        if type(state) is tuple:
          data_type = state[0].dtype
        else:
          data_type = state.dtype

        embedding = vs.get_variable(
            "embedding", [self._embedding_classes, self._embedding_size],
            initializer=initializer,
            dtype=data_type)
        embedded = embedding_ops.embedding_lookup(
            embedding, array_ops.reshape(inputs, [-1]))
    return self._cell(embedded, state)
Example #7
  def __call__(self, inputs, state, scope=None):
    """Run the cell on embedded inputs."""
    with vs.variable_scope(scope or type(self).__name__):  # "EmbeddingWrapper"
      with ops.device("/cpu:0"):
        if self._embedding:
          embedding = self._embedding
        else:
          if self._initializer:
            initializer = self._initializer
          elif vs.get_variable_scope().initializer:
            initializer = vs.get_variable_scope().initializer
          else:
            # Default initializer for embeddings should have variance=1.
            sqrt3 = math.sqrt(3)  # Uniform(-sqrt(3), sqrt(3)) has variance=1.
            initializer = init_ops.random_uniform_initializer(-sqrt3, sqrt3)
          embedding = vs.get_variable("embedding", [self._embedding_classes,
                                                    self._cell.input_size],
                                      initializer=initializer)
        embedded = embedding_ops.embedding_lookup(
            embedding, array_ops.reshape(inputs, [-1]))

        """print (embedded)
        print ("{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{{}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}}")"""

    return self._cell(embedded, state)
Example #8
def rnn_decoder(decoder_inputs, initial_state, cell, scope=None):
  """RNN Decoder that creates training and sampling sub-graphs.

  Args:
    decoder_inputs: Inputs for decoder, list of tensors.
      This is used only in training sub-graph.
    initial_state: Initial state for the decoder.
    cell: RNN cell to use for decoder.
    scope: Scope to use, if None new will be produced.

  Returns:
    List of tensors for outputs and states for training and sampling sub-graphs.
  """
  with vs.variable_scope(scope or "dnn_decoder"):
    states, sampling_states = [initial_state], [initial_state]
    outputs, sampling_outputs = [], []
    with ops.op_scope([decoder_inputs, initial_state], "training"):
      for i, inp in enumerate(decoder_inputs):
        if i > 0:
          vs.get_variable_scope().reuse_variables()
        output, new_state = cell(inp, states[-1])
        outputs.append(output)
        states.append(new_state)
    with ops.op_scope([initial_state], "sampling"):
      for i, _ in enumerate(decoder_inputs):
        if i == 0:
          sampling_outputs.append(outputs[i])
          sampling_states.append(states[i])
        else:
          sampling_output, sampling_state = cell(sampling_outputs[-1],
                                                 sampling_states[-1])
          sampling_outputs.append(sampling_output)
          sampling_states.append(sampling_state)
  return outputs, states, sampling_outputs, sampling_states
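
A sketch of wiring this decoder up (hypothetical shapes; assumes the TF 1.x rnn_cell API and the rnn_decoder defined above):

import tensorflow as tf

cell = tf.nn.rnn_cell.GRUCell(16)
dec_inputs = [tf.placeholder(tf.float32, [None, 16]) for _ in range(5)]
init_state = cell.zero_state(tf.shape(dec_inputs[0])[0], tf.float32)
# Training outputs use the true inputs; sampling outputs feed back predictions.
outputs, states, s_outputs, s_states = rnn_decoder(dec_inputs, init_state, cell)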
Example #9
  def _TestCreateOrGetQuantizationStep(self, use_resource):
    g = ops.Graph()
    with session.Session(graph=g) as sess:
      variable_scope.get_variable_scope().set_use_resource(use_resource)
      quantization_step_tensor = common.CreateOrGetQuantizationStep()

      # Check that operations are added to the graph.
      num_nodes = len(g.get_operations())
      self.assertGreater(num_nodes, 0)

      # Check that getting the quantization step doesn't change the graph.
      get_quantization_step_tensor = common.CreateOrGetQuantizationStep()
      self.assertEqual(quantization_step_tensor, get_quantization_step_tensor)
      self.assertEqual(num_nodes, len(g.get_operations()))

      # Ensure that running the graph increments the quantization step.
      sess.run(variables.global_variables_initializer())
      step_val = sess.run(quantization_step_tensor)
      self.assertEqual(step_val, 1)

      # Ensure that even running a graph that depends on the quantization step
      # multiple times only executes it once.
      a = quantization_step_tensor + 1
      b = a + quantization_step_tensor
      _, step_val = sess.run([b, quantization_step_tensor])
      self.assertEqual(step_val, 2)
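
The set_use_resource toggle these tests flip can be exercised directly; a minimal sketch (TF 1.x):

import tensorflow as tf

tf.get_variable_scope().set_use_resource(True)
step = tf.get_variable("step", shape=[], dtype=tf.int64,
                       initializer=tf.zeros_initializer())
# step is now a ResourceVariable, with explicit read/write ops rather than
# reference edges in the graph.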
Example #10
  def testBasicLSTMCellStateTupleType(self):
    with self.test_session():
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
        x = array_ops.zeros([1, 2])
        m0 = (array_ops.zeros([1, 2]),) * 2
        m1 = (array_ops.zeros([1, 2]),) * 2
        cell = rnn_cell_impl.MultiRNNCell(
            [rnn_cell_impl.BasicLSTMCell(2) for _ in range(2)],
            state_is_tuple=True)
        self.assertTrue(isinstance(cell.state_size, tuple))
        self.assertTrue(
            isinstance(cell.state_size[0], rnn_cell_impl.LSTMStateTuple))
        self.assertTrue(
            isinstance(cell.state_size[1], rnn_cell_impl.LSTMStateTuple))

        # Pass in regular tuples
        _, (out_m0, out_m1) = cell(x, (m0, m1))
        self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple))
        self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))

        # Pass in LSTMStateTuples
        variable_scope.get_variable_scope().reuse_variables()
        zero_state = cell.zero_state(1, dtypes.float32)
        self.assertTrue(isinstance(zero_state, tuple))
        self.assertTrue(isinstance(zero_state[0], rnn_cell_impl.LSTMStateTuple))
        self.assertTrue(isinstance(zero_state[1], rnn_cell_impl.LSTMStateTuple))
        _, (out_m0, out_m1) = cell(x, zero_state)
        self.assertTrue(isinstance(out_m0, rnn_cell_impl.LSTMStateTuple))
        self.assertTrue(isinstance(out_m1, rnn_cell_impl.LSTMStateTuple))
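
For reference, a compact sketch of the tuple-state layout the assertions above check (TF 1.x API):

import tensorflow as tf

cell = tf.nn.rnn_cell.MultiRNNCell(
    [tf.nn.rnn_cell.BasicLSTMCell(2) for _ in range(2)], state_is_tuple=True)
state = cell.zero_state(batch_size=1, dtype=tf.float32)
# state is a 2-tuple, one entry per layer; each entry is an
# LSTMStateTuple(c=..., h=...) holding that layer's cell and hidden states.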
Example #11
  def testResidualWrapperWithSlice(self):
    with self.test_session() as sess:
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
        x = array_ops.zeros([1, 5])
        m = array_ops.zeros([1, 3])
        base_cell = rnn_cell_impl.GRUCell(3)
        g, m_new = base_cell(x, m)
        variable_scope.get_variable_scope().reuse_variables()

        def residual_with_slice_fn(inp, out):
          inp_sliced = array_ops.slice(inp, [0, 0], [-1, 3])
          return inp_sliced + out

        g_res, m_new_res = rnn_cell_impl.ResidualWrapper(
            base_cell, residual_with_slice_fn)(x, m)
        sess.run([variables_lib.global_variables_initializer()])
        res_g, res_g_res, res_m_new, res_m_new_res = sess.run(
            [g, g_res, m_new, m_new_res], {
                x: np.array([[1., 1., 1., 1., 1.]]),
                m: np.array([[0.1, 0.1, 0.1]])
            })
        # Residual connections
        self.assertAllClose(res_g_res, res_g + [1., 1., 1.])
        # States are left untouched
        self.assertAllClose(res_m_new, res_m_new_res)
Example #12
def sequence_to_final(inputs, noutput, scope=None, name=None, reverse=False):
  """Run an LSTM across all steps and returns only the final state.

  Args:
    inputs: (length, batch_size, depth) tensor
    noutput: size of output vector
    scope: optional scope name
    name: optional name for output tensor
    reverse: run in reverse

  Returns:
    Batch of size (batch_size, noutput).
  """
  with variable_scope.variable_scope(scope, "SequenceToFinal", [inputs]):
    length, batch_size, _ = _shape(inputs)
    lstm = core_rnn_cell_impl.BasicLSTMCell(noutput, state_is_tuple=False)
    state = array_ops.zeros([batch_size, lstm.state_size])
    inputs_u = array_ops.unstack(inputs)
    if reverse:
      inputs_u = list(reversed(inputs_u))
    for i in xrange(length):
      if i > 0:
        variable_scope.get_variable_scope().reuse_variables()
      output, state = lstm(inputs_u[i], state)
    outputs = array_ops.reshape(output, [batch_size, noutput], name=name)
    return outputs
Example #13
  def embed(self, func, embedding_classes, embedding_size, inputs, dtype=None, scope=None,
            keep_prob=1.0, initializer=None):
    embedder_cell = func(self._cell, embedding_classes, embedding_size, initializer=initializer)

    # Like rnn(..) in rnn.py, but we call only the Embedder, not the RNN cell
    outputs = []
    with vs.variable_scope(scope or "Embedder") as varscope:
      if varscope.caching_device is None:
        varscope.set_caching_device(lambda op: op.device)

      for time, input_ in enumerate(inputs):
        if time > 0: vs.get_variable_scope().reuse_variables()
        embedding = embedder_cell(input_, scope)
        if keep_prob < 1:
          embedding = tf.nn.dropout(embedding, keep_prob)

        # annotation = C~_t = tanh ( E(x_t) + b_c)
        b_c = tf.get_variable("annotation_b", [embedding_size])
        annotation = tanh(tf.nn.bias_add(embedding, b_c))

        # weighted annotation = i_t * C~_t
        # i = sigmoid ( E(x_t) + b_i)
        b_i = tf.get_variable("input_b", [embedding_size])
        i = sigmoid(tf.nn.bias_add(embedding, b_i))
        w_annotation = i * annotation
        outputs.append(w_annotation)

      # return empty state, will be initialized by decoder
      batch_size = array_ops.shape(inputs[0])[0]
      state = self._cell.zero_state(batch_size, dtype)
      return (outputs, state)
Example #14
  def testResidualWrapper(self):
    with self.test_session() as sess:
      with variable_scope.variable_scope(
          "root", initializer=init_ops.constant_initializer(0.5)):
        x = array_ops.zeros([1, 3])
        m = array_ops.zeros([1, 3])
        base_cell = rnn_cell_impl.GRUCell(3)
        g, m_new = base_cell(x, m)
        variable_scope.get_variable_scope().reuse_variables()
        wrapper_object = rnn_cell_impl.ResidualWrapper(base_cell)
        (name, dep), = wrapper_object._checkpoint_dependencies
        wrapper_object.get_config()  # Should not throw an error
        self.assertIs(dep, base_cell)
        self.assertEqual("cell", name)

        g_res, m_new_res = wrapper_object(x, m)
        sess.run([variables_lib.global_variables_initializer()])
        res = sess.run([g, g_res, m_new, m_new_res], {
            x: np.array([[1., 1., 1.]]),
            m: np.array([[0.1, 0.1, 0.1]])
        })
        # Residual connections
        self.assertAllClose(res[1], res[0] + [1., 1., 1.])
        # States are left untouched
        self.assertAllClose(res[2], res[3])
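
What ResidualWrapper does can be written out in a few lines. A simplified sketch of its call semantics, not the library source:

def residual_call(cell, inputs, state, residual_fn=None):
  # Run the wrapped cell, then add the input back onto its output.
  outputs, new_state = cell(inputs, state)
  if residual_fn is None:
    residual_fn = lambda inp, out: inp + out  # default: plain addition
  return residual_fn(inputs, outputs), new_state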
Example #15
def _create_slot_var(primary, val, scope, validate_shape, shape, dtype):
  """Helper function for creating a slot variable."""

  # TODO(lukaszkaiser): Consider allowing partitioners to be set in the current
  # scope.
  current_partitioner = variable_scope.get_variable_scope().partitioner
  variable_scope.get_variable_scope().set_partitioner(None)
  # When init from val instead of callable initializer, the shape is expected to
  # be None, not <unknown> or any fully defined shape.
  shape = shape if callable(val) else None
  slot = variable_scope.get_variable(
      scope, initializer=val, trainable=False,
      use_resource=resource_variable_ops.is_resource_variable(primary),
      shape=shape, dtype=dtype,
      validate_shape=validate_shape)
  variable_scope.get_variable_scope().set_partitioner(current_partitioner)

  # pylint: disable=protected-access
  if isinstance(primary, variables.Variable) and primary._save_slice_info:
    # Primary is a partitioned variable, so we need to also indicate that
    # the slot is a partitioned variable.  Slots have the same partitioning
    # as their primaries.
    # For example, when using AdamOptimizer in a linear model, slot.name
    # here can be "linear//weights/Adam:0", while primary.op.name is
    # "linear//weight". We want 'Adam' as the real_slot_name, so we
    # strip "'linear//weight' + '/'" and the ':0' suffix.
    real_slot_name = slot.name[len(primary.op.name + "/"):-2]
    slice_info = primary._save_slice_info
    slot._set_save_slice_info(variables.Variable.SaveSliceInfo(
        slice_info.full_name + "/" + real_slot_name,
        slice_info.full_shape[:],
        slice_info.var_offset[:],
        slice_info.var_shape[:]))
  # pylint: enable=protected-access
  return slot
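
The partitioner save/clear/restore idiom at the top of _create_slot_var, in isolation; a sketch with a try/finally added for safety (the original skips it):

import tensorflow as tf

scope = tf.get_variable_scope()
saved_partitioner = scope.partitioner
scope.set_partitioner(None)  # slot variables must not be partitioned
try:
  slot = tf.get_variable("slot", initializer=tf.zeros([4]), trainable=False)
finally:
  scope.set_partitioner(saved_partitioner)  # restore the caller's setting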
Example #16
def tied_rnn_seq2seq(encoder_inputs, decoder_inputs, cell,
                     loop_function=None, dtype=dtypes.float32, scope=None):
  """RNN sequence-to-sequence model with tied encoder and decoder parameters.

  This model first runs an RNN to encode encoder_inputs into a state vector, and
  then runs decoder, initialized with the last encoder state, on decoder_inputs.
  Encoder and decoder use the same RNN cell and share parameters.

  Args:
    encoder_inputs: A list of 2D Tensors [batch_size x cell.input_size].
    decoder_inputs: A list of 2D Tensors [batch_size x cell.input_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    loop_function: If not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol), see rnn_decoder for details.
    dtype: The dtype of the initial state of the rnn cell (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "tied_rnn_seq2seq".

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors with
        shape [batch_size x cell.output_size] containing the generated outputs.
      state: The state of each decoder cell in each time-step. This is a list
        with length len(decoder_inputs) -- one item for each time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].
  """
  with variable_scope.variable_scope("combined_tied_rnn_seq2seq"):
    scope = scope or "tied_rnn_seq2seq"
    _, enc_state = rnn.rnn(
        cell, encoder_inputs, dtype=dtype, scope=scope)
    variable_scope.get_variable_scope().reuse_variables()
    return rnn_decoder(decoder_inputs, enc_state, cell,
                       loop_function=loop_function, scope=scope)
Example #17
def beam_rnn_decoder(decoder_inputs, initial_state, cell, loop_function=None,
                     scope=None, output_projection=None, beam_size=10):
  """RNN decoder for the sequence-to-sequence model.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor with shape [batch_size x cell.state_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    loop_function: If not None, this function will be applied to the i-th output
      in order to generate the i+1-st input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/abs/1506.03099.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x input_size].
    scope: VariableScope for the created subgraph; defaults to "rnn_decoder".

  Returns:
    A tuple of the form (outputs, state, beam_path, beam_symbols), where:
      outputs: A list of the same length as decoder_inputs of 1D Tensors
        containing the argmax of the projected output at each step.
      state: The state of each cell at the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].
      beam_path: 2D Tensor of shape [-1, beam_size] with the beam paths.
      beam_symbols: 2D Tensor of shape [-1, beam_size] with the beam symbols.
  """
  with variable_scope.variable_scope(scope or "rnn_decoder"):
    state = initial_state
    outputs = []
    prev = None
    log_beam_probs, beam_path, beam_symbols = [], [], []
    state_size = int(initial_state.get_shape().with_rank(2)[1])

    for i, inp in enumerate(decoder_inputs):
      if loop_function is not None and prev is not None:
        with variable_scope.variable_scope("loop_function", reuse=True):
          inp = loop_function(prev, i,log_beam_probs, beam_path, beam_symbols)
      if i > 0:
        variable_scope.get_variable_scope().reuse_variables()

      input_size = inp.get_shape().with_rank(2)[1]
      x = inp
      output, state = cell(x, state)

      if loop_function is not None:
        prev = output
      if i == 0:
        states = []
        for kk in range(beam_size):
          states.append(state)
        state = tf.reshape(tf.concat(0, states), [-1, state_size])

      outputs.append(tf.argmax(nn_ops.xw_plus_b(
          output, output_projection[0], output_projection[1]), dimension=1))
  return (outputs, state,
          tf.reshape(tf.concat(0, beam_path), [-1, beam_size]),
          tf.reshape(tf.concat(0, beam_symbols), [-1, beam_size]))
Example #18
 def testBowEncodersSharingEmbeddingsSharedScope(self):
   with self.cached_session() as sess:
     docs = [[0, 1], [2, 3]]
     enc_1 = encoders.bow_encoder(docs, 4, 3, scope='bow')
     variable_scope.get_variable_scope().reuse_variables()
     enc_2 = encoders.bow_encoder(docs, 4, 3, scope='bow')
     sess.run(variables.global_variables_initializer())
     avg_1, avg_2 = sess.run([enc_1, enc_2])
     self.assertAllEqual(avg_1, avg_2)
Example #19
 def decoder(cell, dec_outputs, states, scope):
     outputs = []
     with variable_scope.variable_scope(scope):
         for i in range(len(states)):
             if i > 0:
                 variable_scope.get_variable_scope().reuse_variables()
             outs, _ = seq2seq.rnn_decoder(dec_outputs, states[i], cell)
             outputs.extend(outs)
     return outputs
Example #20
 def encoder(cell, inputs, n_steps, batch_size=1, dtype=tf.float32, scope=None):
     states = []
     with variable_scope.variable_scope(scope):
         init_state = cell.zero_state(batch_size, dtype)
         for i in range(0, len(inputs), n_steps):
             if i > 0:
                 variable_scope.get_variable_scope().reuse_variables()
             _, state = rnn(cell, inputs[i: i + n_steps], init_state, dtype)
             states.append(state)
     return states
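
A sketch of calling this chunked encoder (hypothetical shapes; assumes the snippet's rnn import resolves to the TF 1.x static rnn; note that scope must be a concrete name, since variable_scope(None) is rejected):

import tensorflow as tf

cell = tf.nn.rnn_cell.GRUCell(8)
steps = [tf.placeholder(tf.float32, [1, 4]) for _ in range(12)]
# 12 steps in windows of 4 -> 3 final states, all sharing the same weights.
states = encoder(cell, steps, n_steps=4, scope="enc")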
Example #21
def my_rnn(alphabetEnc, cell, inputs, initial_state=None, dtype=None,
        sequence_length=None, scope=None):

  if not isinstance(cell, rnn_cell.RNNCell):
    raise TypeError("cell must be an instance of RNNCell")
  if not isinstance(inputs, list):
    raise TypeError("inputs must be a list")
  if not inputs:
    raise ValueError("inputs must not be empty")

  outputs = []
  with vs.variable_scope(scope or "RNN"):
    fixed_batch_size = inputs[0].get_shape().with_rank_at_least(1)[0]
    if fixed_batch_size.value:
      batch_size = fixed_batch_size.value
    else:
      batch_size = array_ops.shape(inputs[0])[0]
    if initial_state is not None:
      state = initial_state
    else:
      if not dtype:
        raise ValueError("If no initial_state is provided, dtype must be.")
      state = cell.zero_state(batch_size, dtype)

    if sequence_length is not None:
      sequence_length = math_ops.to_int32(sequence_length)

    if sequence_length is not None:  # Prepare variables
      zero_output = array_ops.zeros(
          array_ops.pack([batch_size, cell.output_size]), inputs[0].dtype)
      zero_output.set_shape(
          tensor_shape.TensorShape([fixed_batch_size.value, cell.output_size]))

      min_sequence_length = math_ops.reduce_min(sequence_length)
      max_sequence_length = math_ops.reduce_max(sequence_length)

    for time, input_ in enumerate(inputs):
      if time > 0: vs.get_variable_scope().reuse_variables()
      # pylint: disable=cell-var-from-loop
      call_cell = lambda: cell([input_, alphabetEnc[time]], state)
      # pylint: enable=cell-var-from-loop
      if sequence_length is not None:
        (output, state) = _rnn_step(
            time, sequence_length, min_sequence_length, max_sequence_length,
            zero_output, state, call_cell)
      else:
        (output, state) = call_cell()

      outputs.append(output)

    return (outputs, state)
Example #22
  def testModelWithBucketsScopeAndLoss(self):
    """Test that variable scope reuse is not reset after model_with_buckets."""
    classes = 10
    buckets = [(4, 4), (8, 8)]

    with self.test_session():
      # Here comes a sample Seq2Seq model using GRU cells.
      def SampleGRUSeq2Seq(enc_inp, dec_inp, weights, per_example_loss):
        """Example sequence-to-sequence model that uses GRU cells."""

        def GRUSeq2Seq(enc_inp, dec_inp):
          cell = core_rnn_cell_impl.MultiRNNCell(
              [core_rnn_cell_impl.GRUCell(24) for _ in range(2)],
              state_is_tuple=True)
          return seq2seq_lib.embedding_attention_seq2seq(
              enc_inp,
              dec_inp,
              cell,
              num_encoder_symbols=classes,
              num_decoder_symbols=classes,
              embedding_size=24)

        targets = [dec_inp[i + 1] for i in range(len(dec_inp) - 1)] + [0]
        return seq2seq_lib.model_with_buckets(
            enc_inp,
            dec_inp,
            targets,
            weights,
            buckets,
            GRUSeq2Seq,
            per_example_loss=per_example_loss)

      # Now we construct the copy model.
      inp = [
          array_ops.placeholder(
              dtypes.int32, shape=[None]) for _ in range(8)
      ]
      out = [
          array_ops.placeholder(
              dtypes.int32, shape=[None]) for _ in range(8)
      ]
      weights = [
          array_ops.ones_like(
              inp[0], dtype=dtypes.float32) for _ in range(8)
      ]
      with variable_scope.variable_scope("root"):
        _, losses1 = SampleGRUSeq2Seq(inp, out, weights, per_example_loss=False)
        # Now check that we did not accidentally set reuse.
        self.assertEqual(False, variable_scope.get_variable_scope().reuse)
        # Construct one more model with per-example loss.
        variable_scope.get_variable_scope().reuse_variables()
        _, losses2 = SampleGRUSeq2Seq(inp, out, weights, per_example_loss=True)
        # The first loss is a scalar, the second one a 1-dimensional tensor.
        self.assertEqual([], losses1[0].get_shape().as_list())
        self.assertEqual([None], losses2[0].get_shape().as_list())
Example #23
 def testExceptions(self):
   with self.test_session():
     x = constant_op.constant(self.dtype([0.1, 0.2]))
     wrapped_fn, _ = variable_utils.externalize_variables_as_args(
         test_fn,
         [x],
         possible_ancestor_vars=[],
         assert_variable_override=True)
     varscope_ops.get_variable_scope().reuse_variables()
     with self.assertRaisesRegexp(ValueError, r"not found"):
       wrapped_fn(self.dtype(2))
Example #24
  def testBasic(self):
    for i, dtype in enumerate(self.float_types):
      with self.cached_session(), self.test_scope():
        variable_scope.get_variable_scope().set_use_resource(True)
        # Initialize variables for numpy implementation.
        m0, v0, m1, v1 = 0.0, 0.0, 0.0, 0.0
        var0_np = np.array([1.0, 2.0], dtype=dtype)
        grads0_np = np.array([0.1, 0.1], dtype=dtype)
        var1_np = np.array([3.0, 4.0], dtype=dtype)
        grads1_np = np.array([0.01, 0.01], dtype=dtype)

        var0 = resource_variable_ops.ResourceVariable(
            var0_np, name="var0_%d" % i)
        var1 = resource_variable_ops.ResourceVariable(
            var1_np, name="var1_%d" % i)
        grads0 = constant_op.constant(grads0_np)
        grads1 = constant_op.constant(grads1_np)

        opt = adamax.AdaMaxOptimizer()
        update = opt.apply_gradients(zip([grads0, grads1], [var0, var1]))
        opt_variables = opt.variables()
        beta1_power = opt._get_beta_accumulators()
        self.assertTrue(beta1_power is not None)
        self.assertIn(beta1_power, opt_variables)

        with ops.Graph().as_default():
          # Shouldn't return non-slot variables from other graphs.
          self.assertEqual(0, len(opt.variables()))

        variables.global_variables_initializer().run()
        # Fetch params to validate initial values
        self.assertAllClose([1.0, 2.0], self.evaluate(var0))
        self.assertAllClose([3.0, 4.0], self.evaluate(var1))

        beta1_power = opt._get_beta_accumulators()

        # Run 3 steps of AdaMax
        for t in range(1, 4):
          update.run()

          self.assertAllCloseAccordingToType(0.9**(t + 1),
                                             self.evaluate(beta1_power))

          var0_np, m0, v0 = adamax_update_numpy(var0_np, grads0_np, t, m0, v0)
          var1_np, m1, v1 = adamax_update_numpy(var1_np, grads1_np, t, m1, v1)

          # Validate updated params
          self.assertAllCloseAccordingToType(
              var0_np, self.evaluate(var0), rtol=1e-2)
          self.assertAllCloseAccordingToType(
              var1_np, self.evaluate(var1), rtol=1e-2)
          self.assertEqual("var0_%d/AdaMax:0" % (i,),
                           opt.get_slot(var=var0, name="m").name)
Example #25
 def _maybe_get_unique(name):
   """Get name for a unique variable, if not `reuse=True`."""
   if variable_scope.get_variable_scope().reuse:
     return name
   vs_vars = [x.op.name for x in
              variable_scope.get_variable_scope().global_variables()]
   full_name = variable_scope.get_variable_scope().name + "/" + name
   if full_name not in vs_vars: return name
   idx = 1
   while full_name + ("_%d" % idx) in vs_vars:
     idx += 1
   return name + ("_%d" % idx)
Example #26
def _propagate(dim_indices, conf, cells, c_prev, m_prev, new_output, new_state,
               first_call):
  """Propagates through all the cells in dim_indices dimensions.
  """
  if len(dim_indices) == 0:
    return

  # Because of the way RNNCells are implemented, we take the last dimension
  # (H_{N-1}) out and feed it as the state of the RNN cell
  # (in `last_dim_output`).
  # The inputs of the cell (H_0 to H_{N-2}) are concatenated into `cell_inputs`.
  if conf.num_dims > 1:
    ls_cell_inputs = [None] * (conf.num_dims - 1)
    for d in conf.dims[:-1]:
      ls_cell_inputs[d.idx] = new_output[d.idx] if new_output[
          d.idx] is not None else m_prev[d.idx]
    cell_inputs = array_ops.concat(ls_cell_inputs, 1)
  else:
    cell_inputs = array_ops.zeros([m_prev[0].get_shape().as_list()[0], 0],
                                  m_prev[0].dtype)

  last_dim_output = new_output[-1] if new_output[-1] is not None else m_prev[-1]

  for i in dim_indices:
    d = conf.dims[i]
    if d.non_recurrent_fn:
      linear_args = array_ops.concat(
          [cell_inputs, last_dim_output],
          1) if conf.num_dims > 1 else last_dim_output
      with vs.variable_scope('non_recurrent' if conf.tied else
                             'non_recurrent/cell_{}'.format(i)):
        if conf.tied and not (first_call and i == dim_indices[0]):
          vs.get_variable_scope().reuse_variables()
        new_output[d.idx] = layers.legacy_fully_connected(
            linear_args,
            num_output_units=conf.num_units,
            activation_fn=d.non_recurrent_fn,
            weight_init=vs.get_variable_scope().initializer or
            layers.initializers.xavier_initializer)
    else:
      if c_prev[i] is not None:
        cell_state = array_ops.concat([c_prev[i], last_dim_output], 1)
      else:
        # for GRU/RNN, the state is just the previous output
        cell_state = last_dim_output

      with vs.variable_scope('recurrent' if conf.tied else
                             'recurrent/cell_{}'.format(i)):
        if conf.tied and not (first_call and i == dim_indices[0]):
          vs.get_variable_scope().reuse_variables()
        cell = cells[i]
        new_output[d.idx], new_state[d.idx] = cell(cell_inputs, cell_state)
Example #27
def dialog_attention_seq2seq(encoder_inputs, decoder_inputs, cell, vocab_size,
                             num_heads=1, output_projection=None,
                             feed_previous=False, dtype=dtypes.float32,
                             scope=None, initial_state_attention=False):
    if len(encoder_inputs) != len(decoder_inputs):
        raise ValueError("encoder_inputs and decoder_inputs must have "
                         "the same length")

    with variable_scope.variable_scope(scope or "dialog_attention_seq2seq"):

        encoder_cell = rnn_cell.EmbeddingWrapper(cell, vocab_size)
        outputs = []

        fixed_batch_size = encoder_inputs[0][0].get_shape().with_rank_at_least(1)[0]
        if fixed_batch_size.value:
          batch_size = fixed_batch_size.value
        else:
          batch_size = array_ops.shape(encoder_inputs[0][0])[0]

        drnn_state = cell.zero_state(batch_size, dtype)

        for i in range(0, len(encoder_inputs)):
            if i > 0: variable_scope.get_variable_scope().reuse_variables()

            encoder_outputs, encoder_state = rnn.rnn(
                encoder_cell, encoder_inputs[i], dtype=dtype)

            # First calculate a concatenation of encoder outputs to put attention on.
            top_states = [array_ops.reshape(e, [-1, 1, cell.output_size])
                          for e in encoder_outputs]
            attention_states = array_ops.concat(1, top_states)

            with variable_scope.variable_scope("DRNN"):
                drnn_out, drnn_state = cell(encoder_state, drnn_state)

            # Decoder.
            output_size = None
            if output_projection is None:
                cell = rnn_cell.OutputProjectionWrapper(cell, vocab_size)
                output_size = vocab_size

            answer_output, answer_state = embedding_attention_decoder(
                decoder_inputs[i], drnn_state, attention_states, cell,
                vocab_size, num_heads=num_heads, output_size=output_size,
                output_projection=output_projection, feed_previous=feed_previous,
                initial_state_attention=initial_state_attention)

            outputs.append(answer_output)
            with variable_scope.variable_scope("DRNN", reuse=True):
                drnn_out, drnn_state = cell(answer_state, drnn_state)

        return outputs, drnn_state
Example #28
  def test_unique_name_and_reuse(self):
    tmpl1 = template.make_template(
        "_", variable_scoped_function, unique_name_="s1")
    v1 = tmpl1()
    v2 = tmpl1()

    variable_scope.get_variable_scope().reuse_variables()
    tmpl2 = template.make_template(
        "_", variable_scoped_function, unique_name_="s1")
    v3 = tmpl2()

    self.assertEqual(v1, v2)
    self.assertEqual(v1, v3)
    self.assertEqual("s1/dummy:0", v1.name)
Example #29
  def _TestQuantize_AtrousConvWithBatchNorm(
      self, activation, activation_op_name, with_bypass, delay,
      fused_batch_norm, use_resource):
    """Tests quantization: inputs -> atrous conv with batch norm -> Activation.

    Args:
      activation: Callable that returns an Operation, a factory method for the
        Activation.
      activation_op_name: String, name of the Activation operation.
      with_bypass: Bool, when true there is an extra connection added from
        inputs to just before Activation.
      delay: Int (optional), delay in number of steps until quantization starts.
      fused_batch_norm: Bool, when true use FusedBatchNorm.
      use_resource: Bool, when true uses resource variables.
    """
    graph = ops.Graph()
    with graph.as_default():
      variable_scope.get_variable_scope().set_use_resource(use_resource)
      batch_size, height, width, depth = 5, 128, 128, 3
      inputs = array_ops.zeros((batch_size, height, width, depth))
      dilation_rate = 2
      scope = 'test/test2' if with_bypass else 'test'
      node = separable_conv2d(
          inputs,
          None, [3, 3],
          rate=dilation_rate,
          depth_multiplier=1.0,
          padding='SAME',
          weights_initializer=self._WeightInit(0.09),
          activation_fn=None,
          normalizer_fn=batch_norm,
          normalizer_params=self._BatchNormParams(fused_batch_norm),
          scope=scope)

      # Manually add a bypass (optional) and an activation.
      if with_bypass:
        node = math_ops.add(inputs, node, name='test/Add')

      node = activation(node, name='test/' + activation_op_name)

      update_barrier = control_flow_ops.no_op(name='update_barrier')
      with ops.control_dependencies([update_barrier]):
        array_ops.identity(node, name='control_dependency')

      fold_batch_norms.FoldBatchNorms(graph, is_training=True)
      quantize.Quantize(graph, True, quant_delay=delay)

      self._AssertCorrectQuantizedGraphWithBatchNorm(
          graph, scope, 'DepthwiseConv2dNative', activation_op_name,
          with_bypass, delay, use_resource)
Example #30
  def build(self, input_shape):
    input_shape = tensor_shape.TensorShape(input_shape)
    if not input_shape.ndims:
      raise ValueError('Input has undefined rank:', input_shape)
    ndim = len(input_shape)
    if self.axis < 0:
      axis = ndim + self.axis
    else:
      axis = self.axis
    if axis < 0 or axis >= ndim:
      raise ValueError('Value of `axis` argument ' + str(self.axis) +
                       ' is out of range for input with rank ' + str(ndim))
    param_dim = input_shape[axis]
    if not param_dim.value:
      raise ValueError('Input has undefined `axis` dimension. Input shape: ',
                       input_shape)

    if self.center:
      self.beta = vs.get_variable('beta',
                                  shape=(param_dim,),
                                  initializer=self.beta_initializer,
                                  regularizer=self.beta_regularizer,
                                  trainable=True)
    else:
      self.beta = None
    if self.scale:
      self.gamma = vs.get_variable('gamma',
                                   shape=(param_dim,),
                                   initializer=self.gamma_initializer,
                                   regularizer=self.gamma_regularizer,
                                   trainable=True)
    else:
      self.gamma = None

    # Disable variable partitioning when creating the moving mean and variance
    partitioner = vs.get_variable_scope().partitioner
    try:
      vs.get_variable_scope().set_partitioner(None)
      self.moving_mean = vs.get_variable(
          'moving_mean',
          shape=(param_dim,),
          initializer=self.moving_mean_initializer,
          trainable=False)
      self.moving_variance = vs.get_variable(
          'moving_variance',
          shape=(param_dim,),
          initializer=self.moving_variance_initializer,
          trainable=False)
    finally:
      vs.get_variable_scope().set_partitioner(partitioner)
Example #31
    def train_mode(self, input_dim, input_states, input_mask, concept_dim, concept_states, concept_mask,
            init_state, decoder_inputs, decoder_refs, decoder_feats, decoder_wids, decoder_cids, loss_weights, mode_gen='ce_train'):
        '''
        input_dim: int-valued
        input_states: [batch_size, passage_len, input_dim]
        input_mask: [batch_size, passage_len] int32
        concept_dim: int-valued
        concept_states: [batch_size, concept_len, concept_dim]
        concept_mask: [batch_size, concept_len] int32
        init_state: tuple of [batch_size, gen_hidden_size]
        decoder_inputs: [batch_size, max_dec_steps]
        decoder_refs: [batch_size, max_dec_steps]
        decoder_feats: [batch_size, max_dec_steps, feat_num]
        decoder_wids: [batch_size, max_dec_steps]
        decoder_cids: [batch_size, max_dec_steps]
        '''
        options = self.options

        batch_size = tf.shape(input_states)[0]
        input_len = tf.shape(input_states)[1]
        concept_len = tf.shape(concept_states)[1]


        decoder_inputs = tf.unstack(decoder_inputs, axis=1) # max_dec_steps * [batch_size]
        decoder_refs_unstack = tf.unstack(decoder_refs, axis=1) # max_dec_steps * [batch_size]
        decoder_feats = tf.unstack(decoder_feats, axis=1) # max_dec_steps * [batch_size, feat_num]
        decoder_wids = tf.unstack(decoder_wids, axis=1) # max_dec_steps * [batch_size]
        decoder_cids = tf.unstack(decoder_cids, axis=1) # max_dec_steps * [batch_size]

        # initialize all the variables
        state_t_1 = init_state
        context_input_t_1 = tf.zeros([batch_size, input_dim])
        context_concept_t_1 = tf.zeros([batch_size, concept_dim])

        # store variables from each time-step
        vocab_scores = []
        sampled_words = []
        with variable_scope.variable_scope("attention_decoder"):
            wordidx_t = decoder_inputs[0] # [batch_size] int32
            featidx_t = decoder_feats[0] # [batch_size, feat_num] int32
            wid_t = decoder_wids[0]
            cid_t = decoder_cids[0]
            for i in range(options.max_answer_len):
                if mode_gen in ('ce_train', 'loss',):
                    wordidx_t = decoder_inputs[i]
                    featidx_t = decoder_feats[i]
                    wid_t = decoder_wids[i]
                    cid_t = decoder_cids[i]
                word_t = _embedding_lookup(wordidx_t, self.action_embedding)
                if i > 0:
                    variable_scope.get_variable_scope().reuse_variables()

                (state_t, context_input_t, context_concept_t, output_t) = \
                        self.one_step_decoder(state_t_1, context_input_t_1, context_concept_t_1, word_t, featidx_t, wid_t, cid_t,
                                input_states, input_mask, concept_states, concept_mask)

                vocab_scores.append(output_t)

                state_t_1 = state_t
                context_input_t_1 = context_input_t
                context_concept_t_1 = context_concept_t

                if mode_gen == 'greedy':
                    # TODO update featidx_t
                    wordidx_t = tf.argmax(output_t, 1) # [batch_size]
                    wordidx_t = tf.reshape(wordidx_t, [-1]) # [batch_size]
                elif mode_gen == 'sample':
                    # TODO update featidx_t
                    log_score_t = tf.log(output_t) # [batch_size, vsize]
                    wordidx_t = tf.multinomial(log_score_t, 1) # [batch_size, 1]
                    wordidx_t = tf.reshape(wordidx_t, [-1]) # [batch_size]
                elif mode_gen in ('ce_train', 'loss',):
                    wordidx_t = tf.argmax(output_t, axis=1) # [batch]
                else:
                    assert False, 'unknown generating mode %s' % mode_gen
                sampled_words.append(wordidx_t)

        if sampled_words:
            sampled_words = tf.stack(sampled_words, axis=1) # [batch_size, max_dec_steps]

        vocab_scores = tf.stack(vocab_scores, axis=1) # [batch_size, max_dec_steps, vocab]

        # calculating loss
        self.loss = None
        if mode_gen in ('ce_train', 'loss', ):
            xent = _CE_loss(vocab_scores, decoder_refs, loss_weights) # [batch_size]
            if mode_gen == 'loss': xent *= self.placeholders.reward # multiply with rewards
            self.loss = tf.reduce_mean(xent)

        # accuracy is calculated only under 'ce_train', where true answer is given
        if mode_gen == 'ce_train':
            accuracy = _mask_and_accuracy(vocab_scores, decoder_refs, loss_weights)
            return accuracy, self.loss, sampled_words
        else:
            return None, self.loss, sampled_words
Example #32
def embedding_lookup(
    params,
    ids,
    partition_strategy=None,  # pylint: disable=unused-argument
    name=None,
    validate_indices=None,  # pylint: disable=unused-argument
    max_norm=None,
    return_trainable=False,
):
  """Provides a dynamic version of embedding_lookup
      similar with tf.nn.embedding_lookup.

    Ids are flattened to a 1d tensor before being passed to embedding_lookup
    then, they are unflattend to match the original ids shape plus an extra
    leading dimension of the size of the embeddings.

    Args:
      params: A dynamic_embedding.Variable instance.
      ids: A tensor with any shape as same dtype of params.key_dtype.
      partition_strategy: No used, for API compatiblity with `nn.emedding_lookup`.
      name: A name for the operation. Name is optional in graph mode and required
        in eager mode.
      validate_indices: No used, just for compatible with nn.embedding_lookup .
      max_norm: If not `None`, each embedding is clipped if its l2-norm is larger
        than this value.
      return_trainable: optional, If True, also return TrainableWrapper
    Returns:
      A tensor with shape [shape of ids] + [dim],
        dim is equal to the value dim of params.
        containing the values from the params tensor(s) for keys in ids.
      trainable_wrap:
        A TrainableWrapper object used to fill the Optimizers `var_list`
          Only provided if `return_trainable` is True.
    """
  if isinstance(params, (list, tuple)) and len(params) > 1:
    raise ValueError("Only one params is allowed.")
  if isinstance(params, (list, tuple)):
    params = params[0]
  if not isinstance(params, de.Variable):
    raise TypeError("params should be a Variable instance.")
  if params.key_dtype != ids.dtype:
    raise TypeError(
        "params.key_dtype should be same with ids.dtype: {} vs. {}".format(
            params.key_dtype, ids.dtype))
  if context.executing_eagerly() and (name is None):
    raise ValueError(
        'Must specify a name for dynamic_embedding.embedding_lookup when running eagerly.'
    )

  scope = variable_scope.get_variable_scope()
  full_name = scope.name + "/" if scope.name else ""
  full_name += (name + "/") if name else "embedding_lookup/"
  with ops.name_scope(full_name):
    ids = ops.convert_to_tensor(ids, name="ids")
    if ids.get_shape().is_fully_defined():
      # use static shape
      initial_shape = [ids.get_shape().num_elements(), params.dim]
      embeddings_shape = ids.get_shape().concatenate([params.dim])
    else:
      # use dynamic shape
      initial_shape = (1, params.dim)
      embeddings_shape = array_ops.concat([array_ops.shape(ids), [params.dim]],
                                          axis=0)
    initial_value = array_ops.zeros(shape=initial_shape,
                                    dtype=params.value_dtype)
    if (isinstance(initial_value, ops.Tensor)
        and hasattr(initial_value, "graph")
        and initial_value.graph.building_function):

      def initial_value():
        return array_ops.zeros(initial_shape, dtype=params.value_dtype)

    with ops.colocate_with(None, ignore_existing=True):
      collections = [ops.GraphKeys.LOCAL_VARIABLES]
      if params.trainable:
        collections += [ops.GraphKeys.TRAINABLE_VARIABLES]

      def _create_trainable(trainable_name):
        return de.TrainableWrapper(params,
                                   ids,
                                   max_norm=max_norm,
                                   initial_value=initial_value,
                                   dtype=params.value_dtype,
                                   trainable=params.trainable,
                                   collections=collections,
                                   model_mode=ModelMode.CURRENT_SETTING,
                                   name=trainable_name)

      with ops.colocate_with(ids, ignore_existing=True):
        if context.executing_eagerly():
          trainable_ = params._trainable_store.get(name, None)
          if trainable_ is None:
            trainable_ = _create_trainable(name)
            params._trainable_store[name] = trainable_
          else:
            trainable_._reset_ids(ids)
        else:
          trainable_ = _create_trainable(name)
          params._trainable_store[name] = trainable_

    embeddings = array_ops.identity(trainable_)
    embeddings = array_ops.reshape(embeddings, shape=embeddings_shape)

  return (embeddings, trainable_) if return_trainable else embeddings
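
The flatten, lookup, reshape contract described in the docstring, shown with the stock embedding op rather than the dynamic_embedding machinery; a sketch (TF 1.x):

import tensorflow as tf

table = tf.get_variable("table", [100, 8])    # 100 keys, dim = 8
ids = tf.placeholder(tf.int32, [None, 5, 3])  # ids of arbitrary shape
flat = tf.reshape(ids, [-1])                  # flatten to 1-d
emb = tf.nn.embedding_lookup(table, flat)     # (num_ids, 8)
out_shape = tf.concat([tf.shape(ids), [8]], axis=0)
emb = tf.reshape(emb, out_shape)              # [shape of ids] + [dim]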
Example #33
def attention_decoder(decoder_inputs,
                      initial_state,
                      encoder_states,
                      enc_padding_mask,
                      cell,
                      initial_state_attention=False,
                      pointer_gen=True,
                      use_coverage=False,
                      prev_coverage=None):
    """
  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    encoder_states: 3D Tensor [batch_size x attn_length x attn_size].
    enc_padding_mask: 2D Tensor [batch_size x attn_length] containing 1s and
      0s; indicates which of the encoder locations are padding (0) or a real
      token (1).
    cell: rnn_cell.RNNCell defining the cell function and size.
    initial_state_attention:
      Note that this attention decoder passes each decoder input through a
      linear layer with the previous step's context vector to get a modified
      version of the input. If initial_state_attention is False, on the first
      decoder step the "previous context vector" is just a zero vector. If
      initial_state_attention is True, we use initial_state to (re)calculate
      the previous step's context vector. We set this to False for train/eval
      mode (because we call attention_decoder once for all decoder steps) and
      True for decode mode (because we call attention_decoder once for each
      decoder step).
    pointer_gen: boolean. If True, calculate the generation probability p_gen
      for each decoder step.
    use_coverage: boolean. If True, use the coverage mechanism.
    prev_coverage:
      If not None, a tensor with shape (batch_size, attn_length). The previous
      step's coverage vector. This is only not None in decode mode when using
      coverage.

  Returns:
    outputs: A list of the same length as decoder_inputs of 2D Tensors of
      shape [batch_size x cell.output_size]. The output vectors.
    state: The final state of the decoder. A tensor of shape
      [batch_size x cell.state_size].
    attn_dists: A list containing tensors of shape (batch_size, attn_length).
      The attention distributions for each decoder step.
    p_gens: List of length input_size, containing tensors of shape
      [batch_size, 1]. The values of p_gen for each decoder step. Empty list
      if pointer_gen=False.
    coverage: Coverage vector on the last step computed. None if
      use_coverage=False.
  """
    with variable_scope.variable_scope("attention_decoder") as scope:
        # If either of these lines fails, the batch size or the attention
        # length isn't statically defined.
        batch_size = encoder_states.get_shape()[0].value
        attn_size = encoder_states.get_shape()[2].value

        # Reshape encoder_states (need to insert a dim)
        encoder_states = tf.expand_dims(
            encoder_states,
            axis=2)  # now is shape (batch_size, attn_len, 1, attn_size)

        # To calculate attention, we calculate
        #   v^T tanh(W_h h_i + W_s s_t + b_attn)
        # where h_i is an encoder state, and s_t a decoder state.
        # attn_vec_size is the length of the vectors v, b_attn, (W_h h_i) and (W_s s_t).
        # We set it to be equal to the size of the encoder states.
        attention_vec_size = attn_size

        # Get the weight matrix W_h and apply it to each encoder state to get (W_h h_i), the encoder features
        W_h = variable_scope.get_variable(
            "W_h", [1, 1, attn_size, attention_vec_size])
        encoder_features = nn_ops.conv2d(
            encoder_states, W_h, [1, 1, 1, 1],
            "SAME")  # shape (batch_size,attn_length,1,attention_vec_size)

        # Get the weight vectors v and w_c (w_c is for coverage)
        v = variable_scope.get_variable("v", [attention_vec_size])
        if use_coverage:
            with variable_scope.variable_scope("coverage"):
                w_c = variable_scope.get_variable(
                    "w_c", [1, 1, 1, attention_vec_size])

        if prev_coverage is not None:  # for beam search mode with coverage
            # reshape from (batch_size, attn_length) to (batch_size, attn_len, 1, 1)
            prev_coverage = tf.expand_dims(tf.expand_dims(prev_coverage, 2), 3)

        def attention(decoder_state, coverage=None):
            """Calculate the context vector and attention distribution from the decoder state.

      Args:
        decoder_state: state of the decoder
        coverage: Optional. Previous timestep's coverage vector, shape (batch_size, attn_len, 1, 1).

      Returns:
        context_vector: weighted sum of encoder_states
        attn_dist: attention distribution
        coverage: new coverage vector. shape (batch_size, attn_len, 1, 1)
      """
            with variable_scope.variable_scope("Attention"):
                # Pass the decoder state through a linear layer (this is W_s s_t + b_attn in the paper)
                decoder_features = linear(
                    decoder_state, attention_vec_size,
                    True)  # shape (batch_size, attention_vec_size)
                decoder_features = tf.expand_dims(
                    tf.expand_dims(decoder_features, 1),
                    1)  # reshape to (batch_size, 1, 1, attention_vec_size)

                def masked_attention(e):
                    """Take softmax of e then apply enc_padding_mask and re-normalize"""
                    attn_dist = nn_ops.softmax(
                        e)  # take softmax. shape (batch_size, attn_length)
                    attn_dist *= enc_padding_mask  # apply mask
                    masked_sums = tf.reduce_sum(attn_dist,
                                                axis=1)  # shape (batch_size)
                    return attn_dist / tf.reshape(masked_sums,
                                                  [-1, 1])  # re-normalize

                if use_coverage and coverage is not None:  # non-first step of coverage
                    # Multiply coverage vector by w_c to get coverage_features.
                    coverage_features = nn_ops.conv2d(
                        coverage, w_c, [1, 1, 1, 1], "SAME"
                    )  # c has shape (batch_size, attn_length, 1, attention_vec_size)

                    # Calculate v^T tanh(W_h h_i + W_s s_t + w_c c_i^t + b_attn)
                    e = math_ops.reduce_sum(
                        v * math_ops.tanh(encoder_features + decoder_features +
                                          coverage_features),
                        [2, 3])  # shape (batch_size,attn_length)

                    # Calculate attention distribution
                    attn_dist = masked_attention(e)

                    # Update coverage vector
                    coverage += array_ops.reshape(attn_dist,
                                                  [batch_size, -1, 1, 1])
                else:
                    # Calculate v^T tanh(W_h h_i + W_s s_t + b_attn)
                    e = math_ops.reduce_sum(
                        v * math_ops.tanh(encoder_features + decoder_features),
                        [2, 3])  # calculate e

                    # Calculate attention distribution
                    attn_dist = masked_attention(e)

                    if use_coverage:  # first step of training
                        coverage = tf.expand_dims(tf.expand_dims(attn_dist, 2),
                                                  2)  # initialize coverage

                # Calculate the context vector from attn_dist and encoder_states
                context_vector = math_ops.reduce_sum(
                    array_ops.reshape(attn_dist, [batch_size, -1, 1, 1]) *
                    encoder_states, [1, 2])  # shape (batch_size, attn_size).
                context_vector = array_ops.reshape(context_vector,
                                                   [-1, attn_size])

            return context_vector, attn_dist, coverage

        outputs = []
        attn_dists = []
        p_gens = []
        state = initial_state
        coverage = prev_coverage  # initialize coverage to None or whatever was passed in
        context_vector = array_ops.zeros([batch_size, attn_size])
        context_vector.set_shape([
            None, attn_size
        ])  # Ensure the second shape of attention vectors is set.
        if initial_state_attention:  # true in decode mode
            # Re-calculate the context vector from the previous step so that we can pass it through a linear layer with this step's input to get a modified version of the input
            context_vector, _, coverage = attention(
                initial_state, coverage
            )  # in decode mode, this is what updates the coverage vector
        for i, inp in enumerate(decoder_inputs):
            tf.logging.info("Adding attention_decoder timestep %i of %i", i,
                            len(decoder_inputs))
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()

            # Merge input and previous attentions into one vector x of the same size as inp
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" %
                                 inp.name)
            x = linear([inp] + [context_vector], input_size, True)

            # Run the decoder RNN cell. cell_output = decoder state
            cell_output, state = cell(x, state)

            # Run the attention mechanism.
            if i == 0 and initial_state_attention:  # always true in decode mode
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True
                ):  # you need this because you've already run the initial attention(...) call
                    context_vector, attn_dist, _ = attention(
                        state, coverage)  # don't allow coverage to update
            else:
                context_vector, attn_dist, coverage = attention(
                    state, coverage)
            attn_dists.append(attn_dist)

            # Calculate p_gen
            if pointer_gen:
                with tf.variable_scope('calculate_pgen'):
                    p_gen = linear([context_vector, state.c, state.h, x], 1,
                                   True)  # Tensor shape (batch_size, 1)
                    p_gen = tf.sigmoid(p_gen)
                    p_gens.append(p_gen)

            # Concatenate the cell_output (= decoder state) and the context vector, and pass them through a linear layer
            # This is V[s_t, h*_t] + b in the paper
            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + [context_vector],
                                cell.output_size, True)
            outputs.append(output)

        # If using coverage, reshape it
        if coverage is not None:
            coverage = array_ops.reshape(coverage, [batch_size, -1])

        return outputs, state, attn_dists, p_gens, coverage
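For reference, a minimal NumPy sketch of the masked_attention step above (the function and variable names here are illustrative, not part of the original code): softmax the raw scores, zero out the padded positions with the mask, then re-normalize so each row sums to 1.

import numpy as np

def masked_attention_np(e, enc_padding_mask):
    """e, enc_padding_mask: float arrays of shape (batch_size, attn_length)."""
    e = e - e.max(axis=1, keepdims=True)                     # stabilized softmax
    attn_dist = np.exp(e)
    attn_dist /= attn_dist.sum(axis=1, keepdims=True)
    attn_dist *= enc_padding_mask                            # zero out padding
    return attn_dist / attn_dist.sum(axis=1, keepdims=True)  # re-normalize

e = np.random.randn(2, 5)
mask = np.array([[1., 1., 1., 0., 0.], [1., 1., 1., 1., 1.]])
dist = masked_attention_np(e, mask)
assert np.allclose(dist.sum(axis=1), 1.0)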
Ejemplo n.º 34
0
def seq2seq(feed_previous=False,
            input_dim=1,
            output_dim=1,
            input_length=120,
            output_length=48,
            hidden_dim=64,
            stacked_layers=2,
            GRADIENT_CLIPPING=2.5):

    tf.reset_default_graph()
    global_step = tf.Variable(
        initial_value=0,
        name="global_step",
        trainable=False,
        collections=[tf.GraphKeys.GLOBAL_STEP, tf.GraphKeys.GLOBAL_VARIABLES])

    weights = {
        'out': tf.get_variable('Weights_out',
                               shape=[hidden_dim, output_dim],
                               dtype=tf.float32,
                               initializer=tf.truncated_normal_initializer()),
    }
    biases = {
        'out': tf.get_variable('Biases_out',
                               shape=[output_dim],
                               dtype=tf.float32,
                               initializer=tf.constant_initializer(0.)),
    }

    with tf.variable_scope('Seq2seq'):
        encoder_input = [
            tf.placeholder(tf.float32,
                           shape=(None, input_dim),
                           name="input_{}".format(t))
            for t in range(input_length)
        ]

        target_sequence = [
            tf.placeholder(tf.float32,
                           shape=(None, output_dim),
                           name="y".format(t)) for t in range(output_length)
        ]

        decoder_input = [
            tf.zeros_like(target_sequence[0], dtype=tf.float32, name="GO")
        ] + target_sequence[:-1]

        with tf.variable_scope('LSTMCell'):
            cells = []
            for i in range(stacked_layers):
                with tf.variable_scope('RNN_{}'.format(i)):
                    cells.append(tf.contrib.rnn.LSTMCell(hidden_dim))
            cell = tf.contrib.rnn.MultiRNNCell(cells)

        with variable_scope.variable_scope('basic_rnn_seq2seq'):
            encoder_cell = copy.deepcopy(cell)
            _, encoder_state = rnn.static_rnn(encoder_cell,
                                              encoder_input,
                                              dtype=dtypes.float32)

            with variable_scope.variable_scope('rnn_decoder'):
                state = encoder_state
                outputs = []
                for i, input_ in enumerate(decoder_input):
                    if i > 0:
                        variable_scope.get_variable_scope().reuse_variables()
                    output, state = cell(input_, state)
                    outputs.append(output)

            reshaped = [
                tf.matmul(i, weights['out']) + biases['out'] for i in outputs
            ]
            return encoder_input, target_sequence, reshaped, global_step
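A hypothetical training sketch for this graph, assuming TensorFlow 1.x; the MSE loss and Adam optimizer here are illustrative additions, not part of the example:

import numpy as np

enc_in, targets, preds, step = seq2seq(input_length=120, output_length=48)
loss = tf.reduce_mean([tf.reduce_mean(tf.square(p - t))
                       for p, t in zip(preds, targets)])
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss, global_step=step)
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Feed one random batch of 32 sequences (teacher forcing via targets).
    feed = {ph: np.random.randn(32, 1) for ph in enc_in}
    feed.update({ph: np.random.randn(32, 1) for ph in targets})
    _, batch_loss = sess.run([train_op, loss], feed_dict=feed)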
def safe_embedding_lookup_sparse(
    embedding_weights,
    sparse_ids,
    sparse_weights=None,
    combiner="mean",
    default_id=None,
    name="safe_embedding_lookup_sparse",
    partition_strategy=None,  # not used
    max_norm=None,
    return_trainable=False,
):
  """Provides a dynamic version of `tf.nn.safe_embedding_lookup_sparse`.

    Lookup embedding results, accounting for empty features and invalid weights.

    All IDs are treated as valid, including non-positive IDs.
    Invalid weights (<= 0) are pruned from the input weights, as are any IDs
    with a non-positive weight. For an entry with no features, the embedding
    vector for `default_id` is returned, or the 0-vector if `default_id` is not
    supplied.

    The ids and weights may be multi-dimensional. Embeddings are always aggregated
    along the last dimension.

    Args:
      embedding_weights: A single `dynamic_embedding.Variable` instance
        representing the complete embedding tensor.
      sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
        ids. `d_0` is typically batch size.
      sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
        float weights corresponding to `sparse_ids`, or `None` if all weights
        are assumed to be 1.0.
      combiner: A string specifying how to combine embedding results for each
        entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean" the
        default.
      default_id: The id to use for an entry with no features.
      name: A name for this operation. Name is optional in graph mode and required
        in eager mode.
      partition_strategy: A string specifying the partitioning strategy. Currently
        `"div"` and `"mod"` are supported. Default is `"div"`.
      max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
        combining.

    Returns:
      combined_embeddings:
        A dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.
      trainable_wrap:
        A TrainableWrapper object used to fill the Optimizer's `var_list`.
        Only provided if `return_trainable` is True.

    Raises:
      ValueError: if `embedding_weights` is empty.
  """
  if embedding_weights is None:
    raise ValueError("Missing embedding_weights %s." % embedding_weights)

  if embedding_weights.key_dtype != sparse_ids.dtype:
    raise TypeError(
        "embedding_weights.key_dtype should be same with sparse_ids.dtype: "
        "{} vs. {}".format(embedding_weights.key_dtype, sparse_ids.dtype))

  weights_dtype = sparse_weights.dtype if sparse_weights is not None else None
  if weights_dtype and embedding_weights.value_dtype != weights_dtype:
    raise TypeError(
        "embedding_weights.value_dtype should be same with sparse_weights.dtype"
        ": {} vs. {}".format(embedding_weights.value_dtype, weights_dtype))

  scope = variable_scope.get_variable_scope()
  full_name = scope.name + "/" + name if scope.name else name
  with ops.name_scope(full_name + "/"):
    # Reshape higher-rank sparse ids and weights to linear segment ids.
    original_shape = sparse_ids.dense_shape
    original_rank_dim = tensor_shape.dimension_value(
        sparse_ids.dense_shape.get_shape()[0])
    original_rank = (array_ops.size(original_shape)
                     if original_rank_dim is None else original_rank_dim)
    sparse_ids = de.math.sparse_reshape(
        sparse_ids,
        [
            math_ops.reduce_prod(
                array_ops.slice(original_shape, [0], [original_rank - 1])),
            array_ops.gather(original_shape, original_rank - 1),
        ],
    )
    if sparse_weights is not None:
      sparse_weights = sparse_tensor.SparseTensor(sparse_ids.indices,
                                                  sparse_weights.values,
                                                  sparse_ids.dense_shape)

    # Prune invalid weights.
    if combiner != "sum":
      sparse_ids, sparse_weights = _prune_invalid_weights(
          sparse_ids, sparse_weights)

    # Fill in dummy values for empty features, if necessary.
    sparse_ids, is_row_empty = de.math.sparse_fill_empty_rows(
        sparse_ids, default_id or 0)
    if sparse_weights is not None:
      sparse_weights, _ = de.math.sparse_fill_empty_rows(sparse_weights, 1.0)

    result, trainable_ = embedding_lookup_sparse(
        embedding_weights,
        sparse_ids,
        sparse_weights,
        combiner=combiner,
        partition_strategy=partition_strategy,
        name=name + "/embedding_lookup_sparse",
        max_norm=max_norm,
        return_trainable=True,
    )

    if default_id is None:
      # Broadcast is_row_empty to the same shape as embedding_lookup_result,
      # for use in Select.
      is_row_empty = array_ops.tile(
          array_ops.reshape(is_row_empty, [-1, 1]),
          array_ops.stack([1, array_ops.shape(result)[1]]),
      )

      result = array_ops.where(is_row_empty,
                               array_ops.zeros_like(result),
                               result,
                               name="where")

    # Reshape back from linear ids into the higher-dimensional dense result.
    final_result = array_ops.reshape(
        result,
        array_ops.concat(
            [
                array_ops.slice(
                    math_ops.cast(original_shape, dtypes.int32),
                    [0],
                    [original_rank - 1],
                ),
                array_ops.slice(array_ops.shape(result), [1], [-1]),
            ],
            0,
        ),
    )
    final_result.set_shape(
        tensor_shape.unknown_shape(
            (tensor_shape.Dimension(original_rank_dim) - 1).value).concatenate(
                result.get_shape()[1:]))
    return (final_result, trainable_) if return_trainable else final_result
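A NumPy sketch of the empty-row handling above (names and shapes are illustrative; a "mean" combiner is assumed): rows with no features are filled with a dummy id so the lookup is well-defined, and when `default_id` is None their embeddings are overwritten with zeros afterwards.

import numpy as np

emb_table = np.random.randn(10, 4)          # 10 ids, dim-4 embeddings
row_ids = [[1, 3], [], [7]]                 # per-row feature ids; row 1 is empty
is_row_empty = np.array([len(r) == 0 for r in row_ids])
filled = [r if r else [0] for r in row_ids]                     # dummy id 0
result = np.stack([emb_table[r].mean(axis=0) for r in filled])  # "mean" combiner
result[is_row_empty] = 0.0                  # default_id is None -> 0-vector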
Ejemplo n.º 36
0
def variable(name,
             shape=None,
             dtype=None,
             initializer=None,
             regularizer=None,
             trainable=True,
             collections=None,
             caching_device=None,
             device=None,
             partitioner=None,
             custom_getter=None,
             use_resource=None,
             synchronization=variables.VariableSynchronization.AUTO,
             aggregation=variables.VariableAggregation.NONE):
    """Gets an existing variable with these parameters or creates a new one.

  Args:
    name: the name of the new or existing variable.
    shape: shape of the new or existing variable.
    dtype: type of the new or existing variable (defaults to `DT_FLOAT`).
    initializer: initializer for the variable if one is created.
    regularizer: a (Tensor -> Tensor or None) function; the result of applying
      it on a newly created variable will be added to the collection
      GraphKeys.REGULARIZATION_LOSSES and can be used for regularization.
    trainable: If `True` also add the variable to the graph collection
      `GraphKeys.TRAINABLE_VARIABLES` (see `tf.Variable`).
    collections: A list of collection names to which the Variable will be added.
      If None, it defaults to `tf.GraphKeys.GLOBAL_VARIABLES`.
    caching_device: Optional device string or function describing where the
      Variable should be cached for reading.  Defaults to the Variable's device.
    device: Optional device to place the variable. It can be a string or a
      function that is called to get the device for the variable.
    partitioner: Optional callable that accepts a fully defined `TensorShape`
      and dtype of the `Variable` to be created, and returns a list of
      partitions for each axis (currently only one axis can be partitioned).
    custom_getter: Callable that allows overwriting the internal get_variable
      method and has to have the same signature.
    use_resource: If `True` use a ResourceVariable instead of a Variable.
    synchronization: Indicates when a distributed variable will be aggregated.
      Accepted values are constants defined in the class
      `tf.VariableSynchronization`. By default the synchronization is set to
      `AUTO` and the current `DistributionStrategy` chooses when to synchronize.
      If `synchronization` is set to `ON_READ`, `trainable` must not be set to
      `True`.
    aggregation: Indicates how a distributed variable will be aggregated.
      Accepted values are constants defined in the class
      `tf.VariableAggregation`.

  Returns:
    The created or existing variable.
  """
    collections = list(collections if collections is not None else
                       [ops.GraphKeys.GLOBAL_VARIABLES])

    # Remove duplicates
    collections = list(set(collections))
    getter = variable_scope.get_variable
    if custom_getter is not None:
        getter = functools.partial(
            custom_getter, reuse=variable_scope.get_variable_scope().reuse)
    with ops.device(device or ''):
        return getter(name,
                      shape=shape,
                      dtype=dtype,
                      initializer=initializer,
                      regularizer=regularizer,
                      trainable=trainable,
                      collections=collections,
                      caching_device=caching_device,
                      partitioner=partitioner,
                      use_resource=use_resource,
                      synchronization=synchronization,
                      aggregation=aggregation)
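A hypothetical call, assuming the surrounding module's imports and TF 1.x (the name, shape, initializer, and device here are illustrative):

w = variable("w",
             shape=[128, 64],
             initializer=tf.truncated_normal_initializer(stddev=0.02),
             device="/cpu:0")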
Ejemplo n.º 37
0
    def call(self, inputs, state):
        num_proj = self._num_units if self._num_proj is None else self._num_proj
        sigmoid = math_ops.sigmoid

        if self._state_is_tuple:
            (c_prev, h_prev) = state
        else:
            c_prev = array_ops.slice(state, [0, 0], [-1, self._num_units])
            h_prev = array_ops.slice(state, [0, self._num_units],
                                     [-1, num_proj])

        x = inputs[0]
        dtype = x.dtype
        input_size = x.get_shape().with_rank(2)[1]
        if input_size.value is None:
            raise ValueError(
                "Could not infer input size from get_shape()")  # [-1]?
        scope = vs.get_variable_scope()
        with vs.variable_scope(scope,
                               reuse=tf.AUTO_REUSE,
                               initializer=self._initializer) as unit_scope:
            # i = input_gate, j = new_input, f = forget_gate, o = output_gate
            lstm_matrix = _linear([x, h_prev], 4 * self._num_units, bias=True)
            i, j, f, o = array_ops.split(value=lstm_matrix,
                                         num_or_size_splits=4,
                                         axis=1)
            with vs.variable_scope("label_layer",
                                   reuse=tf.AUTO_REUSE) as label_scope:
                """ compute the output: probabilities """
                yy = tf.eye(self._num_labs)
                wy = vs.get_variable("wy", [self._num_labs, self._num_units])
                li = math_ops.matmul(yy, wy)

                if self._gate == "input":
                    logits = i + li
                elif self._gate == "output":
                    logits = o + li
                elif self._gate == "forget":
                    logits = f + li
                elif self._gate == "combine":
                    logits = lstm_matrix + array_ops.tile(li, [1, 4])

                yb = vs.get_variable("yb", [1, self._num_labs],
                                     initializer=init_ops.constant_initializer(
                                         0.0, dtype=dtype))
                logits = tf.reduce_sum(tf.log(1 + tf.exp(logits)), axis=1)
                logits += yb
                # TODO: check whether this should be probs = probs + tf.log(yb)
                output = logits - tf.reduce_max(logits)
                """ compute next state """

                if len(inputs) == 1:
                    y = softmax(output)
                elif len(inputs) == 2:
                    y = inputs[1]

                i_ = math_ops.matmul(y, wy)
            """ what should be added """
            if self._gate == "input":
                i += i_
            elif self._gate == "output":
                o += i_
            elif self._gate == "forget":
                f += i_
            elif self._gate == "combine":
                i += i_
                j += i_
                o += i_
                f += i_
            elif self._gate == "average":
                print("TODO")

            if self._use_peepholes:
                raise ValueError("Not supported yet")

            c = (sigmoid(f + self._forget_bias) * c_prev +
                 sigmoid(i) * self._activation(j))

            if self._cell_clip is not None:
                raise ValueError("Not supported yet")

            m = sigmoid(o) * self._activation(c)

            new_state = (LSTMStateTuple(c, m) if self._state_is_tuple else
                         array_ops.concat([c, m], 1))

        return output, new_state
Ejemplo n.º 38
0
    def __init__(self,
                 args,
                 output_size,
                 build_bias,
                 bias_initializer=None,
                 kernel_initializer=None):

        self._build_bias = build_bias

        if args is None or (nest.is_sequence(args) and not args):
            raise ValueError("`args` must be specified")

        if not nest.is_sequence(args):
            args = [args]
            self._is_sequence = False
        else:
            self._is_sequence = True

        # Calculate the total size of arguments on dimension 1.
        total_arg_size = 0
        shapes = [a.get_shape() for a in args]
        for shape in shapes:
            if shape.ndims != 2:
                raise ValueError("linear is expecting 2D arguments: %s" %
                                 shapes)
            if shape[1].value is None:
                raise ValueError(
                    "linear expects shape[1] to be provided for shape %s, "
                    "but saw %s" % (shape, shape[1]))
            else:
                total_arg_size += shape[1].value

        dtype = [a.dtype for a in args][0]

        scope = vs.get_variable_scope()
        with vs.variable_scope(scope) as outer_scope:
            self._weights = vs.get_variable(_WEIGHTS_VARIABLE_NAME,
                                            [total_arg_size, output_size],
                                            dtype=dtype,
                                            initializer=kernel_initializer)
            if build_bias:
                with vs.variable_scope(outer_scope) as inner_scope:
                    inner_scope.set_partitioner(None)
                    if bias_initializer is None:
                        bias_initializer = init_ops.constant_initializer(
                            0.0, dtype=dtype)
                    self._biases = vs.get_variable(
                        _BIAS_VARIABLE_NAME, [output_size],
                        dtype=dtype,
                        initializer=bias_initializer)
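What the single weight matrix built above is used for, as a NumPy sketch (shapes are illustrative): the argument tensors are concatenated along axis 1 and multiplied by the [total_arg_size, output_size] matrix, then the bias is added.

import numpy as np

a = np.random.randn(8, 16)                 # batch x n1
b = np.random.randn(8, 32)                 # batch x n2
W = np.random.randn(16 + 32, 64)           # total_arg_size x output_size
bias = np.zeros(64)
out = np.concatenate([a, b], axis=1) @ W + bias   # shape (8, 64)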
Ejemplo n.º 39
0
    def call(self, inputs, state):
        """Run one step of G-LSTM.
    Args:
      inputs: input Tensor, 2D, [batch x num_units].
      state: this must be a tuple of state Tensors, both `2-D`,
      with column sizes `c_state` and `m_state`.
    Returns:
      A tuple containing:
      - A `2-D, [batch x output_dim]`, Tensor representing the output of the
        G-LSTM after reading `inputs` when previous state was `state`.
        Here output_dim is:
           num_proj if num_proj was set,
           num_units otherwise.
      - LSTMStateTuple representing the new state of G-LSTM cell
        after reading `inputs` when the previous state was `state`.
    Raises:
      ValueError: If input size cannot be inferred from inputs via
        static shape inference.
    """
        (c_prev, m_prev) = state

        self._batch_size = inputs.shape[0].value or array_ops.shape(inputs)[0]
        input_size = inputs.shape[-1].value or array_ops.shape(inputs)[-1]
        dtype = inputs.dtype
        scope = vs.get_variable_scope()
        with vs.variable_scope(scope, initializer=self._initializer):
            i_parts = []
            j_parts = []
            f_parts = []
            o_parts = []

            for group_id in range(self._number_of_groups):
                with vs.variable_scope("group%d" % group_id):
                    x_g_id = array_ops.concat(
                        [
                            self._get_input_for_group(
                                inputs, group_id,
                                int(input_size / self._number_of_groups)),
                            #self._group_shape[0]), # this is only correct if inputs dim = num_units!!!
                            self._get_input_for_group(
                                m_prev, group_id,
                                int(self._output_size /
                                    self._number_of_groups))
                        ],
                        axis=1)
                    #self._group_shape[0])], axis=1)
                    if self._linear1[group_id] is None:
                        self._linear1[group_id] = _Linear(
                            x_g_id, 4 * self._group_shape[1], False)
                    R_k = self._linear1[group_id](x_g_id)  # pylint: disable=invalid-name
                    i_k, j_k, f_k, o_k = array_ops.split(R_k, 4, 1)

                i_parts.append(i_k)
                j_parts.append(j_k)
                f_parts.append(f_k)
                o_parts.append(o_k)

            bi = vs.get_variable(name="bias_i",
                                 shape=[self._num_units],
                                 dtype=dtype,
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))
            bj = vs.get_variable(name="bias_j",
                                 shape=[self._num_units],
                                 dtype=dtype,
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))
            bf = vs.get_variable(name="bias_f",
                                 shape=[self._num_units],
                                 dtype=dtype,
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))
            bo = vs.get_variable(name="bias_o",
                                 shape=[self._num_units],
                                 dtype=dtype,
                                 initializer=init_ops.constant_initializer(
                                     0.0, dtype=dtype))

            i = nn_ops.bias_add(array_ops.concat(i_parts, axis=1), bi)
            j = nn_ops.bias_add(array_ops.concat(j_parts, axis=1), bj)
            f = nn_ops.bias_add(array_ops.concat(f_parts, axis=1), bf)
            o = nn_ops.bias_add(array_ops.concat(o_parts, axis=1), bo)

        c = (math_ops.sigmoid(f + self._forget_bias) * c_prev +
             math_ops.sigmoid(i) * math_ops.tanh(j))
        m = math_ops.sigmoid(o) * self._activation(c)

        if self._num_proj is not None:
            with vs.variable_scope("projection"):
                if self._linear2 is None:
                    self._linear2 = _Linear(m, self._num_proj, False)
                m = self._linear2(m)

        new_state = rnn_cell_impl.LSTMStateTuple(c, m)
        return m, new_state
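The final state update above is the standard peephole-free LSTM recurrence; a NumPy sketch for reference, with tanh standing in for self._activation (names are illustrative):

import numpy as np

def lstm_update(i, j, f, o, c_prev, forget_bias=1.0):
    # i = input gate, j = new input, f = forget gate, o = output gate
    sigmoid = lambda z: 1.0 / (1.0 + np.exp(-z))
    c = sigmoid(f + forget_bias) * c_prev + sigmoid(i) * np.tanh(j)
    m = sigmoid(o) * np.tanh(c)
    return c, m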
Ejemplo n.º 40
0
def beam_attention_decoder(decoder_inputs,
                           initial_state,
                           attention_states,
                           cell,
                           embedding,
                           output_size=None,
                           num_heads=1,
                           loop_function=None,
                           dtype=None,
                           scope=None,
                           initial_state_attention=False,
                           output_projection=None,
                           beam_size=10):
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError("With less than 1 heads, use a non-attention decoder.")
    if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError("Shape[1] and [2] of attention_states must be known: %s"
                         % attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder", dtype=dtype) as scope:
        dtype = scope.dtype
        # batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        if attn_length is None:
            attn_length = array_ops.shape(attention_states)[1]
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(attention_states, [-1, attn_length, 1, attn_size])
        hidden_features = []
        v = []
        attention_vec_size = attn_size  # Size of query vectors for attention.
        for a in xrange(num_heads):
            k = variable_scope.get_variable("AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
            hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
            v.append(variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size]))

        state = []
        # Tile the encoder's final hidden state beam_size times, because the
        # batch size during decoding is beam_size. initial_state is a list with
        # one element per RNN layer; each element is an LSTMStateTuple holding
        # the two hidden states c and h, so we tile c and h separately and then
        # recombine them into an LSTMStateTuple.
        for layers in initial_state:
            c = [layers.c] * beam_size
            h = [layers.h] * beam_size
            c = tf.concat(c, 0)
            h = tf.concat(h, 0)
            state.append(rnn_cell_impl.LSTMStateTuple(c, h))
        state = tuple(state)
        # state_size = int(initial_state.get_shape().with_rank(2)[1])
        # states = []
        # for kk in range(beam_size):
        #     states.append(initial_state)
        # state = tf.concat(states, 0)
        # state = initial_state

        def attention(query):
            ds = []  # Results of attention reads will be stored here.
            if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                query_list = nest.flatten(query)
                for q in query_list:  # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(query_list, 1)
            for a in xrange(num_heads):
                with variable_scope.variable_scope("Attention_%d" % a):
                    y = Linear(query, attention_vec_size, True)(query)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = math_ops.reduce_sum(v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
                    a = nn_ops.softmax(s)
                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        prev = None
        # The attention vectors must also be tiled to beam_size along the batch axis.
        batch_attn_size = array_ops.stack([beam_size, attn_size])
        attns = [array_ops.zeros(batch_attn_size, dtype=dtype) for _ in xrange(num_heads)]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])
        if initial_state_attention:
            attns = attention(initial_state)

        log_beam_probs, beam_path, beam_symbols = [], [], []
        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if i == 0:
                # At i=0 the input is a tensor with batch_size=beam_size whose
                # elements all hold the same value: the <GO> token.
                inp = tf.nn.embedding_lookup(embedding, tf.constant(1, dtype=tf.int32, shape=[beam_size]))

            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    inp = loop_function(prev, i, log_beam_probs, beam_path, beam_symbols)
            # Merge input and previous attentions into one vector of the right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" % inp.name)
            inputs = [inp] + attns
            x = Linear(inputs, input_size, True)(inputs)

            # Run the RNN.
            cell_output, state = cell(x, state)
            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                with variable_scope.variable_scope(variable_scope.get_variable_scope(), reuse=True):
                    attns = attention(state)
            else:
                attns = attention(state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                inputs = [cell_output] + attns
                output = Linear(inputs, output_size, True)(inputs)
            if loop_function is not None:
                prev = output
            outputs.append(tf.argmax(nn_ops.xw_plus_b(output, output_projection[0], output_projection[1]), axis=1))

    return outputs, state, tf.reshape(tf.concat(beam_path, 0), [-1, beam_size]), tf.reshape(tf.concat(beam_symbols, 0),
                                                                                            [-1, beam_size])
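A small NumPy sketch of the beam-size state tiling in the loop over initial_state above (shapes are illustrative): each layer's c and h are repeated beam_size times along the batch axis before being re-wrapped in an LSTMStateTuple.

import numpy as np

beam_size = 3
c = np.random.randn(1, 4)                          # one layer's cell state
h = np.random.randn(1, 4)                          # one layer's hidden state
c_beam = np.concatenate([c] * beam_size, axis=0)   # shape (3, 4)
h_beam = np.concatenate([h] * beam_size, axis=0)   # shape (3, 4)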
Ejemplo n.º 41
0
def dynamic_distraction_m2_decoder(decoder_inputs,
                      initial_state,
                      distract_initial_state,
                      attention_states,
                      attention_states_query,
                      cell1,cell2,
                      distraction_cell,
                      output_size=None,
                      num_heads=1,
                      loop_function=None,
                      dtype=None,
                      scope=None,
                      initial_state_attention=False):
  """RNN decoder with attention for the sequence-to-sequence model.

  In this context "attention" means that, during decoding, the RNN can look up
  information in the additional tensor attention_states, and it does this by
  focusing on a few entries from the tensor. This model has proven to yield
  especially good results in a number of sequence-to-sequence tasks. This
  implementation is based on http://arxiv.org/abs/1412.7449 (see below for
  details). It is recommended for complex sequence-to-sequence tasks.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell1, cell2: rnn_cell.RNNCell instances defining the cell functions and sizes.
    output_size: Size of the output vectors; if None, we use cell.output_size.
    num_heads: Number of attention heads that read from attention_states.
    loop_function: If not None, this function will be applied to i-th output
      in order to generate i+1-th input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/abs/1506.03099.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x output_size]. These represent the generated outputs.
        Output i is computed from input i (which is either the i-th element
        of decoder_inputs or loop_function(output {i-1}, i)) as follows.
        First, we run the cell on a combination of the input and previous
        attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      state: The state of each decoder cell at the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, shapes
      of attention_states are not set, or input size cannot be inferred
      from the input.
  """
  if not decoder_inputs:
    raise ValueError("Must provide at least 1 input to attention decoder.")
  if num_heads < 1:
    raise ValueError("With less than 1 heads, use a non-attention decoder.")
  if attention_states.get_shape()[2].value is None:
    raise ValueError("Shape[2] of attention_states must be known: %s"
                     % attention_states.get_shape())
  if output_size is None:
    output_size = cell1.output_size

  with variable_scope.variable_scope(
      scope or "dynamic_distraction_m2_decoder", dtype=dtype) as scope:
    dtype = scope.dtype

    batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
    attn_length_state = attention_states.get_shape()[1].value
    attn_length_query = attention_states_query.get_shape()[1].value

    dim_1 = initial_state.get_shape()[1].value
    dim_2 = cell1.output_size
    project_initial_state_W = variable_scope.get_variable("Initial_State_W", [dim_1, dim_2])
    project_initial_state_B = variable_scope.get_variable("Initial_State_Bias", [dim_2])

    print ("Preksha " + scope.name)
    if attn_length_state is None:
      attn_length_state = array_ops.shape(attention_states)[1]

    if attn_length_query is None:
      attn_length_query = array_ops.shape(attention_states_query)[1]

    attn_size_state = attention_states.get_shape()[2].value
    attn_size_query = attention_states_query.get_shape()[2].value
    b_a = variable_scope.get_variable("b_a", [1, attn_size_state])

    # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
    hidden_states = array_ops.reshape(
        attention_states, [-1, attn_length_state, 1, attn_size_state])

    hidden_states_query = array_ops.reshape(
        attention_states_query, [-1, attn_length_query, 1, attn_size_query])

    hidden_features_states = []
    hidden_features_query  = []

    v_state = []
    attention_vec_size_state  = attn_size_state  # Size of query vectors for attention.
    
    for a in xrange(num_heads):
      k = variable_scope.get_variable("AttnW_State_%d" % a,
                                      [1, 1, attn_size_state, attention_vec_size_state])

      hidden_features_states.append(nn_ops.conv2d(hidden_states, k, [1, 1, 1, 1], "SAME"))
      
      v_state.append(
          variable_scope.get_variable("AttnV_State_%d" % a, [attention_vec_size_state]))


    v_query = []
    attention_vec_size_query  = attn_size_query  # Size of query vectors for attention.

    for a in xrange(num_heads):
      k = variable_scope.get_variable("AttnW_Query_%d" %a, 
                                      [1, 1, attn_size_query, attention_vec_size_query])

      hidden_features_query.append(nn_ops.conv2d(hidden_states_query, k, [1, 1, 1, 1], "SAME"))
      
      v_query.append(
          variable_scope.get_variable("AttnV_Query_%d" % a, [attention_vec_size_query]))


    state_1 = math_ops.matmul(initial_state, project_initial_state_W) + project_initial_state_B
    state_2 = state_1


    prev_states = []

    for i in range(attn_length_state):
      prev_states.append(array_ops.zeros([batch_size]))

    def attention(query, prev_states, b_a):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      if nest.is_sequence(query):  # If the query is a tuple, flatten it.
        query_list = nest.flatten(query)
        for q in query_list:  # Check that ndims == 2 if specified.
          ndims = q.get_shape().ndims
          if ndims:
            assert ndims == 2
        query = array_ops.concat(1, query_list)
      for a in xrange(num_heads):
        with variable_scope.variable_scope("Attention_%d" % a):
          y = linear(query, attention_vec_size_state, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size_state])
          # Attention mask is a softmax of v^T * tanh(...).


          temp = hidden_features_states[a] + y
          new_states = array_ops.squeeze(temp, [2])

          new_states_list = array_ops.unpack(new_states, axis=1)
          #print(temp.get_shape(), new_states.get_shape(), len(new_states_list), new_states_list[0].get_shape())
          distract_states_list = []
          for i, _ in enumerate(new_states_list):
              temp = array_ops.reshape(prev_states[i], [-1, 1])
              t1 = math_ops.matmul(temp, b_a)
              print ("b_a size and prev_states size", temp.get_shape(), prev_states[i].get_shape(), b_a.get_shape(), t1.get_shape())
              distract_states_list.append(new_states_list[i] - t1)

          distract_states = array_ops.pack(distract_states_list, axis=1)

          print (len(distract_states_list), distract_states.get_shape())
          s = math_ops.reduce_sum(
              v_state[a] * math_ops.tanh(distract_states), [2])

          print(s.get_shape())
          a = nn_ops.softmax(s)
          prev_states = array_ops.pack(prev_states,  axis=1)
          prev_states = prev_states + a
          
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(a, [-1, attn_length_state, 1, 1]) * hidden_states,
              [1, 2])
          ds.append(array_ops.reshape(d, [-1, attn_size_state]))
      return ds, array_ops.unpack(prev_states, axis=1)

    def attention_query(query):
      """Put attention masks on hidden using hidden_features and query."""
      ds = []  # Results of attention reads will be stored here.
      if nest.is_sequence(query):  # If the query is a tuple, flatten it.
        query_list = nest.flatten(query)
        for q in query_list:  # Check that ndims == 2 if specified.
          ndims = q.get_shape().ndims
          if ndims:
            assert ndims == 2
        query = array_ops.concat(1, query_list)
      for a in xrange(num_heads):
        with variable_scope.variable_scope("Attention_Query_%d" % a):
          y = linear(query, attention_vec_size_query, True)
          y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size_query])
          # Attention mask is a softmax of v^T * tanh(...).
          s = math_ops.reduce_sum(
              v_query[a] * math_ops.tanh(hidden_features_query[a] + y), [2, 3])
          a = nn_ops.softmax(s)
          # Now calculate the attention-weighted vector d.
          d = math_ops.reduce_sum(
              array_ops.reshape(a, [-1, attn_length_query, 1, 1]) * hidden_states_query,
              [1, 2])
          ds.append(array_ops.reshape(d, [-1, attn_size_query]))


      return ds[0]


    outputs = []
    ctx_vec = []
    prev = None

    batch_attn_size_state = array_ops.pack([batch_size, attn_size_state])
    batch_attn_size_query = array_ops.pack([batch_size, attn_size_query])


    attns_state = [array_ops.zeros(batch_attn_size_state, dtype=dtype)
             for _ in xrange(num_heads)]

    attns_query = [array_ops.zeros(batch_attn_size_query, dtype=dtype)
             for _ in xrange(num_heads)]

    for a in attns_state:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size_state])


    for a in attns_query:  # Ensure the second shape of attention vectors is set.
      a.set_shape([None, attn_size_query])


    acc_ctx = array_ops.zeros([batch_size, attn_size_state])

    if initial_state_attention:
      attns_query = attention_query(initial_state)
      list_of_queries = [initial_state, attns_query]
      attns_state, prev_states = attention(list_of_queries, prev_states, b_a)

    for i, inp in enumerate(decoder_inputs):
      if i > 0:
        variable_scope.get_variable_scope().reuse_variables()
      # If loop_function is set, we use it instead of decoder_inputs.
      if loop_function is not None and prev is not None:
        with variable_scope.variable_scope("loop_function", reuse=True):
          inp = loop_function(prev, i)
      # Merge input and previous attentions into one vector of the right size.
      input_size = inp.get_shape().with_rank(2)[1]
      if input_size.value is None:
        raise ValueError("Could not infer input size from input: %s" % inp.name)
      

      with variable_scope.variable_scope("Cell2"):
        input_2 = linear([state_1] + [inp], input_size, True)
        output_2, state_2 = cell2(input_2, state_2)
      

      # Run the RNN.
      #print (x.get_shape())
      
      # Run the attention mechanism.

      if i == 0 and initial_state_attention:
        with variable_scope.variable_scope(variable_scope.get_variable_scope(),
                                           reuse=True):
          attns_query = attention_query(output_2)
          list_of_queries = [output_2, attns_query]
          attns_state, prev_states = attention(list_of_queries, prev_states, b_a)
      else:
        attns_query = attention_query(output_2)
        list_of_queries = [output_2, attns_query]
        attns_state, prev_states = attention(list_of_queries, prev_states, b_a)


      with variable_scope.variable_scope("AttnOutputProjection"):

        W = variable_scope.get_variable("W", [1,attn_size_state])
        U = variable_scope.get_variable("U", [1,attn_size_state])

        new_ctx = math_ops.mul(W, attns_state[0]) - math_ops.mul(U, acc_ctx)
        new_ctx = math_ops.tanh(new_ctx)

        acc_ctx = acc_ctx + new_ctx

        with variable_scope.variable_scope("Cell1"):
          input_1 = linear([output_2] + [new_ctx], input_size, True)
          output_1, state_1 = cell1(input_1, state_1)

        output = math_ops.tanh(linear([inp] + [output_1] + [new_ctx], output_size, True))
        #x_shape = variable_scope.get_variable(name = 'x_shape',shape=cell_output.get_shape())
        if loop_function is not None:
          prev = output
        outputs.append(output)
        ctx_vec.append(new_ctx)
  return outputs, state_1, ctx_vec
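A NumPy sketch of the distraction-style context update inside AttnOutputProjection above (shapes are illustrative): the new context is computed elementwise against the accumulated context, which pushes the decoder away from regions it has already attended to, and the accumulator is then updated.

import numpy as np

attn_size = 6
W = np.random.randn(1, attn_size)
U = np.random.randn(1, attn_size)
attn = np.random.randn(2, attn_size)        # current attention read (batch 2)
acc_ctx = np.zeros((2, attn_size))          # accumulated context
new_ctx = np.tanh(W * attn - U * acc_ctx)   # elementwise gating, as in the code
acc_ctx = acc_ctx + new_ctx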
Ejemplo n.º 42
0
def attention_decoder(decoder_inputs,
                      sequence_length,
                      initial_state,
                      attention_matrix,
                      cell,
                      output_size=None,
                      loop_function=None,
                      dtype=dtypes.float32,
                      scope=None,
                      initial_state_attention=False):
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if not attention_matrix.get_shape()[1:].is_fully_defined():
        raise ValueError("Shape of attention matrix must be known: %s" %
                         attention_matrix.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder"):
        #batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        # Temporarily avoid EmbeddingWrapper and seq2seq badness
        # TODO(lukaszkaiser): remove EmbeddingWrapper
        if decoder_inputs[0].get_shape().ndims != 1:
            (fixed_batch_size,
             input_size) = decoder_inputs[0].get_shape().with_rank(2)
            if input_size.value is None:
                raise ValueError(
                    "Input size (second dimension of inputs[0]) must be accessible via "
                    "shape inference, but saw value None.")
        else:
            fixed_batch_size = decoder_inputs[0].get_shape(
            ).with_rank_at_least(1)[0]

        if fixed_batch_size.value:
            batch_size = fixed_batch_size.value
        else:
            batch_size = array_ops.shape(decoder_inputs[0])[0]

        if sequence_length is not None:
            sequence_length = math_ops.to_int32(sequence_length)
            zero_output = array_ops.zeros(
                array_ops.pack([batch_size, cell.output_size]),
                decoder_inputs[0].dtype)
            zero_output.set_shape(
                tensor_shape.TensorShape(
                    [fixed_batch_size.value, cell.output_size]))
            min_sequence_length = math_ops.reduce_min(sequence_length)
            max_sequence_length = math_ops.reduce_max(sequence_length)

        # ATTENTION COMPUTATION

        attn_size = attention_matrix.get_shape()[-1].value
        batch_attn_size = array_ops.pack([batch_size, attn_size])

        def _attention(query, states):
            """Put attention masks on hidden using hidden_features and query."""
            v = variable_scope.get_variable("AttnV", [attn_size])
            k = variable_scope.get_variable("AttnW",
                                            [1, 1, attn_size, attn_size])

            # attn is v^T * tanh(W1*h_t + U*q)

            # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
            attn_length = states.get_shape()[1].value
            hidden = array_ops.reshape(states, [-1, attn_length, 1, attn_size])
            hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")

            y = rnn_cell._linear(query, attn_size, True)
            y = array_ops.reshape(y, [-1, 1, 1, attn_size])
            # Attention mask is a softmax of v^T * tanh(...).
            s = math_ops.reduce_sum(v * math_ops.tanh(hidden_features + y),
                                    [2, 3])
            a = nn_ops.softmax(s)
            # Now calculate the attention-weighted vector d.
            d = math_ops.reduce_sum(
                array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden, [1, 2])
            d = array_ops.reshape(d, [-1, attn_size])
            return d

        def attention(query):
            outer_states = tf.unpack(attention_matrix, axis=1)

            inner_states = []
            for i, states in enumerate(outer_states):
                with variable_scope.variable_scope("Attention_outer",
                                                   reuse=i > 0):
                    inner_states.append(_attention(query, states))

            with variable_scope.variable_scope("Attention_inner"):
                return _attention(query, tf.pack(inner_states, 1))

        state = cell.zero_state(
            batch_size, dtype) if initial_state is None else initial_state
        outputs = []
        prev = None

        attns = array_ops.zeros(batch_attn_size, dtype=dtype)
        attns.set_shape([None, attn_size])

        if initial_state_attention:
            attns = attention(initial_state)
        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    inp = loop_function(prev, i)
            # Merge input and previous attentions into one vector of the right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" %
                                 inp.name)
            x = rnn_cell._linear([inp] + [attns], input_size, True)

            if sequence_length is not None:
                call_cell = lambda: cell(x, state)
                cell_output, state = _rnn_step(i, sequence_length,
                                               min_sequence_length,
                                               max_sequence_length,
                                               zero_output, state,
                                               call_cell, cell.state_size)
            else:
                cell_output, state = cell(x, state)

            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True):
                    attns = attention(state)
            else:
                attns = attention(state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = rnn_cell._linear([cell_output] + [attns], output_size,
                                          True)
            if loop_function is not None:
                prev = output
            outputs.append(output)

    return outputs, state
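A simplified NumPy sketch of the two-level attention above, for one batch element: attend within each outer block, then attend over the per-block summaries. The scoring function is reduced to tanh(states + query) for brevity, so the learned v and W of _attention are omitted.

import numpy as np

def attend(query, states):
    # states: (length, attn_size); query: (attn_size,)
    s = np.tanh(states + query).sum(axis=1)        # simplified scores
    a = np.exp(s - s.max())
    a /= a.sum()                                   # softmax
    return a @ states                              # weighted sum

outer = np.random.randn(4, 7, 16)                  # 4 blocks x 7 steps x dim 16
query = np.random.randn(16)
inner = np.stack([attend(query, block) for block in outer])  # (4, 16)
context = attend(query, inner)                     # attend over block summaries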
Ejemplo n.º 43
0
def _linear(args,
            output_size,
            bias,
            weight_name=_WEIGHTS_VARIABLE_NAME,
            bias_name=_BIAS_VARIABLE_NAME,
            bias_initializer=None,
            kernel_initializer=None):
    """Linear map: sum_i(args[i] * W[i]), where W[i] is a variable.

  Args:
    args: a 2D Tensor or a list of 2D, batch x n, Tensors.
    output_size: int, second dimension of W[i].
    bias: boolean, whether to add a bias term or not.
    bias_initializer: starting value to initialize the bias
      (default is all zeros).
    kernel_initializer: starting value to initialize the weight.

  Returns:
    A 2D Tensor with shape [batch x output_size] equal to
    sum_i(args[i] * W[i]), where W[i]s are newly created matrices.

  Raises:
    ValueError: if some of the arguments have an unspecified or wrong shape.
  """
    if args is None or (nest.is_sequence(args) and not args):
        raise ValueError("`args` must be specified")
    if not nest.is_sequence(args):
        args = [args]

    # Calculate the total size of arguments on dimension 1.
    total_arg_size = 0
    shapes = [a.get_shape() for a in args]
    for shape in shapes:
        if shape.ndims != 2:
            raise ValueError("linear is expecting 2D arguments: %s" % shapes)
        if shape[1].value is None:
            raise ValueError(
                "linear expects shape[1] to be provided for shape %s, "
                "but saw %s" % (shape, shape[1]))
        else:
            total_arg_size += shape[1].value

    dtype = [a.dtype for a in args][0]

    # Now the computation.
    scope = vs.get_variable_scope()
    with vs.variable_scope(scope) as outer_scope:
        weights = vs.get_variable(weight_name, [total_arg_size, output_size],
                                  dtype=dtype,
                                  initializer=kernel_initializer)

        # If args is a single tensor, matmul it with the weights directly;
        # if args is a list of tensors, concat them along axis 1 and then matmul.
        if len(args) == 1:
            res = math_ops.matmul(args[0], weights)
        else:
            res = math_ops.matmul(array_ops.concat(args, 1), weights)
        if not bias:
            return res
        with vs.variable_scope(outer_scope) as inner_scope:
            inner_scope.set_partitioner(None)
            if bias_initializer is None:
                bias_initializer = init_ops.constant_initializer(0.0,
                                                                 dtype=dtype)
            biases = vs.get_variable(bias_name, [output_size],
                                     dtype=dtype,
                                     initializer=bias_initializer)
        return nn_ops.bias_add(res, biases)
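A quick NumPy check of the identity the docstring states, namely that concat(args, 1) @ W equals sum_i(args[i] * W[i]) when W is split row-wise (shapes are illustrative):

import numpy as np

a1, a2 = np.random.randn(3, 4), np.random.randn(3, 5)
W = np.random.randn(4 + 5, 2)
lhs = np.concatenate([a1, a2], axis=1) @ W
rhs = a1 @ W[:4] + a2 @ W[4:]      # W split into per-argument blocks
assert np.allclose(lhs, rhs)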
Ejemplo n.º 44
0
    def __init__(self, dist, coord, replica_id, devices, variable_creator_fn,
                 fn, caching_scope, args, kwargs):
        super(_MirroredReplicaThread, self).__init__()
        self.coord = coord
        self.distribution = dist
        self.devices = devices
        self.replica_id = replica_id
        self.replica_id_in_sync_group = (
            dist.extended._get_replica_id_in_sync_group(replica_id))  # pylint: disable=protected-access

        self.variable_creator_fn = variable_creator_fn
        # State needed to run and return the results of `fn`.
        self.main_fn = fn
        self.main_args = args
        self.main_kwargs = kwargs
        self.main_result = None
        self.done = False
        # State needed to run the next merge_call() (if any) requested via
        # ReplicaContext.
        self.merge_fn = None
        self.merge_args = None
        self.merge_kwargs = None
        self.merge_result = None
        self.captured_name_scope = None
        self.captured_var_scope = None
        try:
            self.caching_scope_entered = caching_scope.new_cache_scope_count
            self.caching_scope_exited = caching_scope.cache_scope_exited_count
        except AttributeError:
            self.caching_scope_entered = None
            self.caching_scope_exited = None

        # We use a thread.Event for the main thread to signal when this
        # thread should start running (`should_run`), and another for
        # this thread to transfer control back to the main thread
        # (`has_paused`, either when it gets to a
        # `get_replica_context().merge_call` or when `fn` returns). In
        # either case the event starts cleared and is signaled by calling
        # set(). The receiving thread waits for the signal by calling
        # wait() and then immediately clearing the event using clear().
        self.should_run = threading.Event()
        self.has_paused = threading.Event()
        # These fields have to do with inheriting various contexts from the
        # parent thread:
        context.ensure_initialized()
        ctx = context.context()
        self.in_eager = ctx.executing_eagerly()
        self.record_thread_local_summary_state()
        self.record_thread_local_eager_context_state()
        self.context_device_policy = (
            pywrap_tfe.TFE_ContextGetDevicePlacementPolicy(ctx._context_handle)
        )  # pylint: disable=protected-access
        self.graph = ops.get_default_graph()
        with ops.init_scope():
            self._init_in_eager = context.executing_eagerly()
            self._init_graph = ops.get_default_graph()
        self._variable_creator_stack = self.graph._variable_creator_stack[:]  # pylint: disable=protected-access
        self._var_scope = variable_scope.get_variable_scope()
        # Adding a "/" at end lets us re-enter this scope later.
        self._name_scope = self.graph.get_name_scope()
        if self._name_scope:
            self._name_scope += "/"
        if self.replica_id > 0:
            if not self._name_scope:
                self._name_scope = ""
            self._name_scope += "replica_%d/" % self.replica_id
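The `should_run`/`has_paused` pair described in the comment above is a plain `threading.Event` handshake. A self-contained sketch of the same pattern (the names mirror the fields here; this is not TF API):

```python
import threading

class PausableWorker(threading.Thread):
    """Mirrors the should_run/has_paused handshake used above."""

    def __init__(self):
        super().__init__()
        self.should_run = threading.Event()
        self.has_paused = threading.Event()
        self.result = None

    def run(self):
        self.should_run.wait()        # block until the main thread signals
        self.should_run.clear()       # immediately re-arm the event
        self.result = sum(range(10))  # stand-in for running `fn`
        self.has_paused.set()         # hand control back to the main thread

worker = PausableWorker()
worker.start()
worker.should_run.set()   # main thread: let the worker proceed
worker.has_paused.wait()  # main thread: wait until it pauses/finishes
print(worker.result)      # 45
```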
Ejemplo n.º 45
0
def func_graph_from_py_func(name,
                            python_func,
                            args,
                            kwargs,
                            signature=None,
                            func_graph=None,
                            autograph=False,
                            add_control_dependencies=True,
                            arg_names=None,
                            op_return_value=None):
    """Returns a `FuncGraph` generated from `python_func`.

  Args:
    name: an identifier for the function.
    python_func: the Python function to trace.
    args: the positional args with which the Python function should be called;
      ignored if a signature is provided.
    kwargs: the keyword args with which the Python function should be called;
      ignored if a signature is provided.
    signature: a possibly nested sequence of `TensorSpecs` specifying the shapes
      and dtypes of the arguments. When a signature is provided, `args` and
      `kwargs` are ignored, and `python_func` is traced with Tensors conforming
      to `signature`. If `None`, the shapes and dtypes are inferred from the
      inputs.
    func_graph: Optional. An instance of FuncGraph. If provided, we will use
      this graph; otherwise a new one is built and returned.
    autograph: whether to use autograph to compile `python_func`.
      See https://www.tensorflow.org/guide/autograph for more information.
    add_control_dependencies: If True, automatically adds control dependencies
      to ensure program order matches execution order and stateful ops always
      execute.
    arg_names: Optional list of argument names, used to give input placeholders
      recognizable names.
    op_return_value: Optional. A Tensor. If set and `python_func` returns
      Operations, those return values will be replaced with this value. If not
      set, returning an Operation triggers an error.

  Returns:
    A FuncGraph.

  Raises:
    TypeError: If any of `python_func`'s return values is neither `None` nor a
      `Tensor`.
  """
    if op_return_value is not None:
        assert isinstance(op_return_value, ops.Tensor), op_return_value
    if func_graph is None:
        func_graph = FuncGraph(name)
    assert isinstance(func_graph, FuncGraph)
    if add_control_dependencies:
        control_manager = AutomaticControlDependencies
    else:
        control_manager = ops.NullContextmanager
    with func_graph.as_default(), control_manager() as a:
        current_scope = variable_scope.get_variable_scope()
        default_use_resource = current_scope.use_resource
        current_scope.set_use_resource(True)

        if signature is not None:
            args = signature
            kwargs = {}

        # Creates and names placeholders for all arguments.
        func_args = _get_defun_inputs_from_args(args, arg_names)
        func_kwargs = _get_defun_inputs_from_kwargs(kwargs)

        # Note: `nest.flatten` sorts by keys, as does `_deterministic_dict_values`.
        # Variables to help check whether mutation happens in calling the function
        # Copy the recursive list, tuple and map structure, but not base objects
        func_args_before = nest.pack_sequence_as(func_args,
                                                 nest.flatten(func_args))
        func_kwargs_before = nest.pack_sequence_as(func_kwargs,
                                                   nest.flatten(func_kwargs))

        def convert(x):
            """Converts a function output to a Tensor."""
            if x is None:
                return None
            if op_return_value is not None and isinstance(x, ops.Operation):
                # TODO(b/79881896): we currently can't capture external control deps, so
                # this won't work if x needs to be captured (i.e. if python_func returns
                # captured Operations).
                with ops.control_dependencies([x]):
                    x = array_ops.identity(op_return_value)
            elif not isinstance(x, tensor_array_ops.TensorArray):
                try:
                    x = ops.convert_to_tensor_or_indexed_slices(x)
                except (ValueError, TypeError):
                    raise TypeError(
                        "To be compatible with tf.contrib.eager.defun, Python functions "
                        "must return zero or more Tensors; in compilation of %s, found "
                        "return value of type %s, which is not a Tensor." %
                        (str(python_func), type(x)))
            if add_control_dependencies:
                x = a.mark_as_return(x)
            return x

        this_tape = tape.push_new_tape()
        try:
            if autograph:
                from tensorflow.python import autograph  # pylint: disable=g-import-not-at-top
                _, original_func = tf_decorator.unwrap(python_func)

                def wrapper(*args, **kwargs):
                    return autograph.converted_call(
                        original_func, None,
                        autograph.ConversionOptions(
                            verbose=autograph.Verbosity.BRIEF,
                            recursive=True,
                            strip_decorators=(def_function.function, ),
                            optional_features=(),
                        ), *args, **kwargs)

                # Wrapping around a decorator allows checks like tf_inspect.getargspec
                # to be accurate.
                converted_func = tf_decorator.make_decorator(
                    original_func, wrapper)
                tf_decorator.rewrap(python_func, original_func, converted_func)

            func_outputs = python_func(*func_args, **func_kwargs)

            # invariant: `func_outputs` contains only Tensors, IndexedSlices,
            # SparseTensors, TensorArrays and `None`s.
            func_outputs = nest.map_structure(convert, func_outputs)

            check_mutation(func_args_before, func_args)
            check_mutation(func_kwargs_before, func_kwargs)
        finally:
            tape.pop_tape(this_tape)
            current_scope.set_use_resource(default_use_resource)

        # Variables in `func_args`, `func_kwargs` should be explicit inputs
        # to the function, not captured inputs.
        tape_variables = this_tape.watched_variables()
        arg_variables = set()
        inputs = []
        for arg in nest.flatten(func_args) + nest.flatten(func_kwargs):
            if isinstance(arg, resource_variable_ops.ResourceVariable):
                # Even if an argument variable was not used in the function, we've
                # already manually captured the resource Tensor when creating argument
                # placeholders.
                resource_placeholder = func_graph.captures.pop(arg.handle)
                arg_variables.add(arg)
                inputs.append(resource_placeholder)
            elif isinstance(arg, ops.Tensor):
                inputs.append(arg)
        variables = [v for v in tape_variables if v not in arg_variables]
        func_graph.inputs = inputs + list(func_graph.captures.values())

        func_graph.structured_outputs = func_outputs
        # Returning a closed-over tensor does not trigger convert_to_tensor.
        func_graph.outputs.extend(
            func_graph.capture(x)
            for x in flatten(func_graph.structured_outputs) if x is not None)

        func_graph.variables = variables

    # Register any other functions defined in the graph.
    with ops.init_scope():
        if context.executing_eagerly():
            for f in func_graph._functions.values():  # pylint: disable=protected-access
                # TODO(ashankar): What about the gradient registry?
                context.add_function(f._c_func.func)  # pylint: disable=protected-access

    return func_graph
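In practice `tf.function`/`defun` invoke this tracer for you, but it can be driven directly. A sketch under the assumption that the routine above is importable as-is (it lives in TF's internal, non-public modules, so treat availability as an assumption):

```python
import tensorflow as tf

def double(x):
    return 2.0 * x

# Trace `double` into a FuncGraph using the routine above; the positional
# tensor argument becomes a placeholder input of the resulting graph.
g = func_graph_from_py_func("double", double, args=(tf.constant(3.0),),
                            kwargs={})
print(g.inputs)   # placeholder(s) created for `args`
print(g.outputs)  # the traced `2.0 * x` tensor
```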
Ejemplo n.º 46
0
def foldl(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
          swap_memory=False, name=None):
  """foldl on the list of tensors unpacked from `elems` on dimension 0.

  This foldl operator repeatedly applies the callable `fn` to a sequence
  of elements from first to last. The elements are made of the tensors
  unpacked from `elems` on dimension 0. The callable fn takes two tensors as
  arguments. The first argument is the accumulated value computed from the
  preceding invocation of fn. If `initializer` is None, `elems` must contain
  at least one element, and its first element is used as the initializer.

  Suppose that `elems` is unpacked into `values`, a list of tensors. The shape
  of the result tensor is `fn(initializer, values[0]).shape`.

  This method also allows multi-arity `elems` and output of `fn`.  If `elems`
  is a (possibly nested) list or tuple of tensors, then each of these tensors
  must have a matching first (unpack) dimension.  The signature of `fn` may
  match the structure of `elems`.  That is, if `elems` is
  `(t1, [t2, t3, [t4, t5]])`, then an appropriate signature for `fn` is:
  `fn = lambda (t1, [t2, t3, [t4, t5]]):`.

  Args:
    fn: The callable to be performed.
    elems: A tensor or (possibly nested) sequence of tensors, each of which
      will be unpacked along their first dimension.  The nested sequence
      of the resulting slices will be the first argument to `fn`.
    initializer: (optional) A tensor or (possibly nested) sequence of tensors,
      as the initial value for the accumulator.
    parallel_iterations: (optional) The number of iterations allowed to run
      in parallel.
    back_prop: (optional) True enables support for back propagation.
    swap_memory: (optional) True enables GPU-CPU memory swapping.
    name: (optional) Name prefix for the returned tensors.

  Returns:
    A tensor or (possibly nested) sequence of tensors, resulting from applying
    `fn` consecutively to the list of tensors unpacked from `elems`, from first
    to last.

  Raises:
    TypeError: if `fn` is not callable.

  Example:
    ```python
    elems = tf.constant([1, 2, 3, 4, 5, 6])
    sum = foldl(lambda a, x: a + x, elems)
    # sum == 21
    ```
  """
  if not callable(fn):
    raise TypeError("fn must be callable.")

  def create_ta(elem):
    return tensor_array_ops.TensorArray(
        dtype=elem.dtype, size=n, dynamic_size=False,
        infer_shape=True).unstack(elem)

  in_graph_mode = not context.executing_eagerly()
  with ops.name_scope(name, "foldl", [elems]):
    # TODO(akshayka): Remove the in_graph_mode check once caching devices are
    # supported in Eager
    if in_graph_mode:
      # Any get_variable calls in fn will cache the first call locally
      # and not issue repeated network I/O requests for each iteration.
      varscope = vs.get_variable_scope()
      varscope_caching_device_was_none = False
      if varscope.caching_device is None:
        # TODO(ebrevdo): Change to using colocate_with here and in other
        # methods.
        varscope.set_caching_device(lambda op: op.device)
        varscope_caching_device_was_none = True

    # Convert elems to tensor array. n may be known statically.
    elems_flat = [
        ops.convert_to_tensor(elem, name="elem") for elem in nest.flatten(elems)
    ]
    n = (tensor_shape.dimension_value(elems_flat[0].shape[0])
         or array_ops.shape(elems_flat[0])[0])

    elems_ta = nest.map_structure(create_ta, elems)

    if initializer is None:
      a = nest.map_structure(lambda elem: elem.read(0), elems_ta)
      i = constant_op.constant(1)
    else:
      a = initializer
      i = constant_op.constant(0)

    def compute(i, a):
      elem_i = nest.map_structure(lambda elem: elem.read(i), elems_ta)
      a = fn(a, elem_i)
      return [i + 1, a]

    _, r_a = control_flow_ops.while_loop(
        lambda i, a: i < n, compute, [i, a],
        parallel_iterations=parallel_iterations,
        back_prop=back_prop,
        swap_memory=swap_memory,
        maximum_iterations=n)

    # TODO(akshayka): Remove the in_graph_mode check once caching devices are
    # supported in Eager
    if in_graph_mode and varscope_caching_device_was_none:
      varscope.set_caching_device(None)

    return r_a
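The docstring's example covers the single-tensor case; the multi-arity form it describes can be exercised like this (a sketch using the `foldl` defined above):

```python
import tensorflow as tf

a = tf.constant([1, 2, 3])
b = tf.constant([10, 20, 30])
# `elems` is a tuple, so `fn` receives a matching (x, y) slice each step;
# the accumulator collects x * y: 1*10 + 2*20 + 3*30 == 140.
dot = foldl(lambda acc, xy: acc + xy[0] * xy[1], (a, b),
            initializer=tf.constant(0))
```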
Ejemplo n.º 47
0
def model_with_buckets(encoder_inputs,
                       decoder_inputs,
                       targets,
                       weights,
                       buckets,
                       seq2seq,
                       softmax_loss_function=None,
                       per_example_loss=False,
                       name=None):
    """Create a sequence-to-sequence model with support for bucketing.

  The seq2seq argument is a function that defines a sequence-to-sequence model,
  e.g., seq2seq = lambda x, y: basic_rnn_seq2seq(
      x, y, core_rnn_cell.GRUCell(24))

  Args:
    encoder_inputs: A list of Tensors to feed the encoder; first seq2seq input.
    decoder_inputs: A list of Tensors to feed the decoder; second seq2seq input.
    targets: A list of 1D batch-sized int32 Tensors (desired output sequence).
    weights: List of 1D batch-sized float-Tensors to weight the targets.
    buckets: A list of pairs of (input size, output size) for each bucket.
    seq2seq: A sequence-to-sequence model function; it takes two inputs that
      agree with encoder_inputs and decoder_inputs, and returns a pair
      consisting of outputs and states (as, e.g., basic_rnn_seq2seq).
    softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch
      to be used instead of the standard softmax (the default if this is None).
    per_example_loss: Boolean. If set, the returned loss will be a batch-sized
      tensor of losses for each sequence in the batch. If unset, it will be
      a scalar with the averaged loss from all examples.
    name: Optional name for this operation, defaults to "model_with_buckets".

  Returns:
    A tuple of the form (outputs, losses), where:
      outputs: The outputs for each bucket. Its j'th element consists of a list
        of 2D Tensors. The shape of output tensors can be either
        [batch_size x output_size] or [batch_size x num_decoder_symbols]
        depending on the seq2seq model used.
      losses: List of scalar Tensors, representing losses for each bucket, or,
        if per_example_loss is set, a list of 1D batch-sized float Tensors.

  Raises:
    ValueError: If the length of encoder_inputs, targets, or weights is smaller
      than the largest (last) bucket.
  """
    if len(encoder_inputs) < buckets[-1][0]:
        raise ValueError(
            "Length of encoder_inputs (%d) must be at least that of la"
            "st bucket (%d)." % (len(encoder_inputs), buckets[-1][0]))
    if len(targets) < buckets[-1][1]:
        raise ValueError("Length of targets (%d) must be at least that of last"
                         "bucket (%d)." % (len(targets), buckets[-1][1]))
    if len(weights) < buckets[-1][1]:
        raise ValueError("Length of weights (%d) must be at least that of last"
                         "bucket (%d)." % (len(weights), buckets[-1][1]))

    all_inputs = encoder_inputs + decoder_inputs + targets + weights
    losses = []
    outputs = []
    with ops.name_scope(name, "model_with_buckets", all_inputs):
        for j, bucket in enumerate(buckets):
            with variable_scope.variable_scope(
                    variable_scope.get_variable_scope(),
                    reuse=True if j > 0 else None):
                bucket_outputs, _ = seq2seq(encoder_inputs[:bucket[0]],
                                            decoder_inputs[:bucket[1]])
                outputs.append(bucket_outputs)
                if per_example_loss:
                    losses.append(
                        sequence_loss_by_example(
                            outputs[-1],
                            targets[:bucket[1]],
                            weights[:bucket[1]],
                            softmax_loss_function=softmax_loss_function))
                else:
                    losses.append(
                        sequence_loss(
                            outputs[-1],
                            targets[:bucket[1]],
                            weights[:bucket[1]],
                            softmax_loss_function=softmax_loss_function))

    return outputs, losses
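A sketch of wiring this up, following the docstring's own `basic_rnn_seq2seq`/`GRUCell` example (the contrib import paths and the toy shapes are assumptions):

```python
import tensorflow as tf
from tensorflow.contrib.legacy_seq2seq import basic_rnn_seq2seq  # assumed path

buckets = [(5, 8), (10, 12)]
enc = [tf.placeholder(tf.float32, [None, 16]) for _ in range(buckets[-1][0])]
dec = [tf.placeholder(tf.float32, [None, 16]) for _ in range(buckets[-1][1])]
targets = [tf.placeholder(tf.int32, [None]) for _ in range(buckets[-1][1])]
weights = [tf.placeholder(tf.float32, [None]) for _ in range(buckets[-1][1])]

seq2seq_f = lambda x, y: basic_rnn_seq2seq(x, y, tf.contrib.rnn.GRUCell(24))
outputs, losses = model_with_buckets(enc, dec, targets, weights,
                                     buckets, seq2seq_f)
# losses[j] is the scalar loss for bucket j; parameters are shared across
# buckets through the reuse=True variable scope above.
```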
Ejemplo n.º 48
0
def actrgn_attention_decoder(decoder_inputs,
                             initial_state,
                             attention_states,
                             cell,
                             output_size=None,
                             num_heads=1,
                             loop_function=None,
                             dtype=None,
                             scope=None,
                             initial_state_attention=False):
    """RNN decoder with attention for the sequence-to-sequence model.

  In this context "attention" means that, during decoding, the RNN can look up
  information in the additional tensor attention_states, and it does this by
  focusing on a few entries from the tensor. This model has proven to yield
  especially good results in a number of sequence-to-sequence tasks. This
  implementation is based on http://arxiv.org/abs/1412.7449 (see below for
  details). It is recommended for complex sequence-to-sequence tasks.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor [batch_size x cell.state_size].
    attention_states: 3D Tensor [batch_size x attn_length x attn_size].
    cell: core_rnn_cell.RNNCell defining the cell function and size.
    output_size: Size of the output vectors; if None, we use cell.output_size.
    num_heads: Number of attention heads that read from attention_states.
    loop_function: If not None, this function will be applied to the i-th output
      in order to generate the (i+1)-st input, and decoder_inputs will be
      ignored, except for the first element ("GO" symbol). This can be used for
      decoding, but also for training to emulate http://arxiv.org/abs/1506.03099.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x input_size].
    dtype: The dtype to use for the RNN initial state (default: tf.float32).
    scope: VariableScope for the created subgraph; default: "attention_decoder".
    initial_state_attention: If False (default), initial attentions are zero.
      If True, initialize the attentions from the initial state and attention
      states -- useful when we wish to resume decoding from a previously
      stored decoder state and attention states.

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of
        shape [batch_size x output_size]. These represent the generated outputs.
        Output i is computed from input i (which is either the i-th element
        of decoder_inputs or loop_function(output {i-1}, i)) as follows.
        First, we run the cell on a combination of the input and previous
        attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      state: The state of each decoder cell at the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].

  Raises:
    ValueError: when num_heads is not positive, there are no inputs, shapes
      of attention_states are not set, or input size cannot be inferred
      from the input.
  """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError(
            "With less than 1 heads, use a non-attention decoder.")
    if attention_states.get_shape()[2].value is None:
        raise ValueError("Shape[2] of attention_states must be known: %s" %
                         attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder",
                                       dtype=dtype) as scope:
        dtype = scope.dtype

        batch_size = array_ops.shape(decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        if attn_length is None:
            attn_length = array_ops.shape(attention_states)[1]
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])
        hidden_features = []
        v = []
        attention_vec_size = attn_size  # Size of query vectors for attention.
        for a in xrange(num_heads):
            k = variable_scope.get_variable("AttnW_%d" % a,
                                            [1, 1, attn_size, attention_vec_size])
            hidden_features.append(nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
            v.append(
                variable_scope.get_variable("AttnV_%d" % a, [attention_vec_size]))

        state = initial_state

        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            ds = []  # Results of attention reads will be stored here.
            if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                query_list = nest.flatten(query)
                for q in query_list:  # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(query_list, 1)
            for a in xrange(num_heads):
                with variable_scope.variable_scope("Attention_%d" % a):
                    y = linear(query, attention_vec_size, True)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
                    a = nn_ops.softmax(s)
                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(
                        array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        prev = None
        batch_attn_size = array_ops.stack([batch_size, attn_size])
        attns = [
            array_ops.zeros(batch_attn_size, dtype=dtype)
            for _ in xrange(num_heads)
        ]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])
        if initial_state_attention:
            attns = attention(initial_state)
        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function", reuse=True):
                    inp = loop_function(prev, i)
            # Merge input and previous attentions into one vector of the right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" %
                                 inp.name)
            x = linear([inp] + attns, input_size, True)
            # Run the RNN.
            cell_output, state = cell(x, state)
            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True):
                    attns = attention(state)
            else:
                attns = attention(state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + attns, output_size, True)
            if loop_function is not None:
                prev = output
            outputs.append(output)

    return outputs, state
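Per head, the `attention()` closure above computes exactly the docstring formula `softmax(v^T * tanh(W*h + U*q))` followed by a weighted sum. A NumPy sketch with toy shapes (illustrative only; the real code implements `W*h` as a 1-by-1 convolution):

```python
import numpy as np

batch, attn_len, attn_size = 2, 7, 4
hidden = np.random.randn(batch, attn_len, attn_size)  # attention_states
W = np.random.randn(attn_size, attn_size)             # role of AttnW_a
Uq = np.random.randn(batch, attn_size)                # linear(query, attn_size)
v = np.random.randn(attn_size)                        # AttnV_a

s = np.tanh(hidden @ W + Uq[:, None, :]) @ v          # [batch, attn_len]
a = np.exp(s) / np.exp(s).sum(axis=1, keepdims=True)  # softmax over positions
d = (a[:, :, None] * hidden).sum(axis=1)              # [batch, attn_size]
```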
Ejemplo n.º 49
0
def rnn(cell, inputs, initial_state=None, dtype=None, sequence_length=None, scope=None):
  """Creates a recurrent neural network specified by RNNCell "cell".

  The simplest form of RNN network generated is:
    state = cell.zero_state(...)
    outputs = []
    for input_ in inputs:
      output, state = cell(input_, state)
      outputs.append(output)
    return (outputs, state)

  However, a few other options are available:

  An initial state can be provided.
  If the sequence_length vector is provided, dynamic calculation is performed.
  This method of calculation does not compute the RNN steps past the maximum
  sequence length of the minibatch (thus saving computational time),
  and properly propagates the state at an example's sequence length
  to the final state output.

  The dynamic calculation performed is, at time t for batch row b,
    (output, state)(b, t) =
      (t >= sequence_length(b))
        ? (zeros(cell.output_size), states(b, sequence_length(b) - 1))
        : cell(input(b, t), state(b, t - 1))

  Args:
    cell: An instance of RNNCell.
    inputs: A length T list of inputs, each a tensor of shape
      [batch_size, cell.input_size].
    initial_state: (optional) An initial state for the RNN.  This must be
      a tensor of appropriate type and shape [batch_size x cell.state_size].
    dtype: (optional) The data type for the initial state.  Required if
      initial_state is not provided.
    sequence_length: Specifies the length of each sequence in inputs.
      An int32 or int64 vector (tensor) size [batch_size].  Values in [0, T).
    scope: VariableScope for the created subgraph; defaults to "RNN".

  Returns:
    A pair (outputs, state) where:
      outputs is a length T list of outputs (one for each input)
      state is the final state

  Raises:
    TypeError: If "cell" is not an instance of RNNCell.
    ValueError: If inputs is None or an empty list, or if the input depth
      cannot be inferred from inputs via shape inference.
  """

  if not isinstance(cell, BaseCell):
    raise TypeError("cell must be an instance of RNNCell")
  if not isinstance(inputs, list):
    raise TypeError("inputs must be a list")
  if not inputs:
    raise ValueError("inputs must not be empty")

  outputs = []
  # Create a new scope in which the caching device is either
  # determined by the parent scope, or is set to place the cached
  # Variable using the same placement as for the rest of the RNN.
  with vs.variable_scope(scope or "RNN") as varscope:
    if varscope.caching_device is None:
      varscope.set_caching_device(lambda op: op.device)

    # Temporarily avoid EmbeddingWrapper and seq2seq badness
    # TODO(lukaszkaiser): remove EmbeddingWrapper
    if inputs[0].get_shape().ndims != 1:
      (fixed_batch_size, input_size) = inputs[0].get_shape().with_rank(2)
      if input_size.value is None:
        raise ValueError(
            "Input size (second dimension of inputs[0]) must be accessible via "
            "shape inference, but saw value None.")
    else:
      fixed_batch_size = inputs[0].get_shape().with_rank_at_least(1)[0]

    if fixed_batch_size.value:
      batch_size = fixed_batch_size.value
    else:
      batch_size = array_ops.shape(inputs[0])[0]
    if initial_state is not None:
      state = initial_state
    else:
      if not dtype:
        raise ValueError("If no initial_state is provided, dtype must be.")
      state = cell.zero_state(batch_size, dtype)

    if sequence_length is not None:
      sequence_length = math_ops.to_int32(sequence_length)

    if sequence_length is not None:  # Prepare variables
      zero_output = array_ops.zeros(
          array_ops.pack([batch_size, cell.output_size]), inputs[0].dtype)
      zero_output.set_shape(
          tensor_shape.TensorShape([fixed_batch_size.value, cell.output_size]))
      min_sequence_length = math_ops.reduce_min(sequence_length)
      max_sequence_length = math_ops.reduce_max(sequence_length)

    for time, input_ in enumerate(inputs):
      if time > 0: vs.get_variable_scope().reuse_variables()
      # pylint: disable=cell-var-from-loop
      call_cell = lambda: cell(input_, state)
      # pylint: enable=cell-var-from-loop
      if sequence_length is not None:
        (output, state) = _rnn_step(
            time, sequence_length, min_sequence_length, max_sequence_length,
            zero_output, state, call_cell)
      else:
        (output, state) = call_cell()

      outputs.append(output)

    return (outputs, state)
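A sketch of driving this unrolled `rnn` (the cell class is an assumption; the `BaseCell` alias checked above is defined outside this excerpt):

```python
import tensorflow as tf  # TF 1.x graph mode assumed

T = 5
inputs = [tf.placeholder(tf.float32, [None, 8]) for _ in range(T)]
seq_len = tf.placeholder(tf.int32, [None])

cell = tf.nn.rnn_cell.BasicLSTMCell(16)
outputs, final_state = rnn(cell, inputs, dtype=tf.float32,
                           sequence_length=seq_len)
# outputs is a length-5 list of [None, 16] tensors; steps past an example's
# sequence_length emit zeros, as the docstring describes.
```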
Ejemplo n.º 50
0
def beam_rnn_decoder(decoder_inputs,
                     initial_state,
                     cell,
                     loop_function=None,
                     scope=None,
                     output_projection=None,
                     beam_size=10):
    """RNN decoder for the sequence-to-sequence model.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor with shape [batch_size x cell.state_size].
    cell: rnn_cell.RNNCell defining the cell function and size.
    loop_function: If not None, this function will be applied to the i-th output
      in order to generate the i+1-st input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/abs/1506.03099.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x input_size].
    scope: VariableScope for the created subgraph; defaults to "rnn_decoder".
    output_projection: None, or a pair (W, B) of output projection weights and
      biases (used by the supplied loop_function, if any).
    beam_size: Width of the beam used during decoding (default 10).

  Returns:
    A tuple of the form (beam_path, beam_symbols, state), where:
      beam_path: Backpointers chosen by the beam search at each step, stacked
        along axis 1 (batch-major).
      beam_symbols: Symbols emitted by the beam search at each step, stacked
        along axis 1 (batch-major).
      state: The final cell state, reshaped from [batch * beam, state_size]
        to [batch, beam, state_size].
  """

    with variable_scope.variable_scope(scope or "rnn_decoder"):
        state = initial_state
        prev = None
        log_beam_probs, beam_path, beam_symbols = [], [], []
        path_lengths, is_finished_beam = None, None
        for i, inp in enumerate(decoder_inputs):
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    inp, state, path_lengths, is_finished_beam = loop_function(
                        i, prev, state, log_beam_probs, beam_path,
                        beam_symbols, path_lengths, is_finished_beam)

            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            output, state = cell(inp, state)
            if loop_function is not None:
                prev = output
    # From time-major to batch-major.
    beam_path = tf.stack(beam_path, axis=1)
    beam_symbols = tf.stack(beam_symbols, axis=1)
    # [batch*beam, state] -> [batch, beam, state]
    state = tf.reshape(state, [
        -1,
        beam_path.get_shape().as_list()[-1],
        state.get_shape().as_list()[-1]
    ])
    return beam_path, beam_symbols, state
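The closing reshape recovers a beam axis from the flattened `[batch * beam, state]` layout; a toy NumPy illustration of that regrouping:

```python
import numpy as np

batch, beam, state_size = 2, 3, 4
flat = np.arange(batch * beam * state_size).reshape(batch * beam, state_size)
# Rows are ordered [b0k0, b0k1, b0k2, b1k0, ...], so a plain reshape to
# [batch, beam, state] regroups each example's beams together.
state = flat.reshape(batch, beam, state_size)
assert (state[1, 2] == flat[1 * beam + 2]).all()
```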
Ejemplo n.º 51
0
def scan(fn, elems, initializer=None, parallel_iterations=10, back_prop=True,
         swap_memory=False, infer_shape=True, reverse=False, name=None):
  """scan on the list of tensors unpacked from `elems` on dimension 0.

  The simplest version of `scan` repeatedly applies the callable `fn` to a
  sequence of elements from first to last. The elements are made of the tensors
  unpacked from `elems` on dimension 0. The callable fn takes two tensors as
  arguments. The first argument is the accumulated value computed from the
  preceding invocation of fn. If `initializer` is None, `elems` must contain
  at least one element, and its first element is used as the initializer.

  Suppose that `elems` is unpacked into `values`, a list of tensors. The shape
  of the result tensor is `[len(values)] + fn(initializer, values[0]).shape`.
  If `reverse=True`, it's `fn(initializer, values[-1]).shape`.

  This method also allows multi-arity `elems` and accumulator.  If `elems`
  is a (possibly nested) list or tuple of tensors, then each of these tensors
  must have a matching first (unpack) dimension.  The second argument of
  `fn` must match the structure of `elems`.

  If no `initializer` is provided, the output structure and dtypes of `fn`
  are assumed to be the same as its input; and in this case, the first
  argument of `fn` must match the structure of `elems`.

  If an `initializer` is provided, then the output of `fn` must have the same
  structure as `initializer`; and the first argument of `fn` must match
  this structure.

  For example, if `elems` is `(t1, [t2, t3])` and `initializer` is
  `[i1, i2]` then an appropriate signature for `fn` in `python2` is:
  `fn = lambda (acc_p1, acc_p2), (t1, [t2, t3]):` and `fn` must return a list,
  `[acc_n1, acc_n2]`.  An alternative correct signature for `fn`, and the
   one that works in `python3`, is:
  `fn = lambda a, t:`, where `a` and `t` correspond to the input tuples.

  Args:
    fn: The callable to be performed.  It accepts two arguments.  The first
      will have the same structure as `initializer` if one is provided,
      otherwise it will have the same structure as `elems`.  The second
      will have the same (possibly nested) structure as `elems`.  Its output
      must have the same structure as `initializer` if one is provided,
      otherwise it must have the same structure as `elems`.
    elems: A tensor or (possibly nested) sequence of tensors, each of which
      will be unpacked along their first dimension.  The nested sequence
      of the resulting slices will be the first argument to `fn`.
    initializer: (optional) A tensor or (possibly nested) sequence of tensors,
      initial value for the accumulator, and the expected output type of `fn`.
    parallel_iterations: (optional) The number of iterations allowed to run
      in parallel.
    back_prop: (optional) True enables support for back propagation.
    swap_memory: (optional) True enables GPU-CPU memory swapping.
    infer_shape: (optional) False disables tests for consistent output shapes.
    reverse: (optional) True scans the tensor last to first (instead of first
      to last).
    name: (optional) Name prefix for the returned tensors.

  Returns:
    A tensor or (possibly nested) sequence of tensors.  Each tensor packs the
    results of applying `fn` to tensors unpacked from `elems` along the first
    dimension, and the previous accumulator value(s), from first to last (or
    last to first, if `reverse=True`).

  Raises:
    TypeError: if `fn` is not callable or the structure of the output of
      `fn` and `initializer` do not match.
    ValueError: if the lengths of the output of `fn` and `initializer`
      do not match.

  Examples:
    ```python
    elems = np.array([1, 2, 3, 4, 5, 6])
    sum = scan(lambda a, x: a + x, elems)
    # sum == [1, 3, 6, 10, 15, 21]
    sum = scan(lambda a, x: a + x, elems, reverse=True)
    # sum == [21, 20, 18, 15, 11, 6]
    ```

    ```python
    elems = np.array([1, 2, 3, 4, 5, 6])
    initializer = np.array(0)
    sum_one = scan(
        lambda a, x: x[0] - x[1] + a, (elems + 1, elems), initializer)
    # sum_one == [1, 2, 3, 4, 5, 6]
    ```

    ```python
    elems = np.array([1, 0, 0, 0, 0, 0])
    initializer = (np.array(0), np.array(1))
    fibonaccis = scan(lambda a, _: (a[1], a[0] + a[1]), elems, initializer)
    # fibonaccis == ([1, 1, 2, 3, 5, 8], [1, 2, 3, 5, 8, 13])
    ```
  """
  if not callable(fn):
    raise TypeError("fn must be callable.")

  input_is_sequence = nest.is_sequence(elems)
  input_flatten = lambda x: nest.flatten(x) if input_is_sequence else [x]
  def input_pack(x):
    return nest.pack_sequence_as(elems, x) if input_is_sequence else x[0]

  if initializer is None:
    output_is_sequence = input_is_sequence
    output_flatten = input_flatten
    output_pack = input_pack
  else:
    output_is_sequence = nest.is_sequence(initializer)
    output_flatten = lambda x: nest.flatten(x) if output_is_sequence else [x]
    def output_pack(x):
      return (nest.pack_sequence_as(initializer, x)
              if output_is_sequence else x[0])

  elems_flat = input_flatten(elems)

  in_graph_mode = not context.executing_eagerly()
  with ops.name_scope(name, "scan", elems_flat):
    # TODO(akshayka): Remove the in_graph_mode check once caching devices are
    # supported in Eager
    if in_graph_mode:
      # Any get_variable calls in fn will cache the first call locally
      # and not issue repeated network I/O requests for each iteration.
      varscope = vs.get_variable_scope()
      varscope_caching_device_was_none = False
      if varscope.caching_device is None:
        # TODO(ebrevdo): Change to using colocate_with here and in other
        # methods.
        varscope.set_caching_device(lambda op: op.device)
        varscope_caching_device_was_none = True

    # Convert the flattened elems to tensors.
    elems_flat = [
        ops.convert_to_tensor(elem, name="elem") for elem in elems_flat]

    # n may be known statically.
    n = (tensor_shape.dimension_value(elems_flat[0].shape[0])
         or array_ops.shape(elems_flat[0])[0])

    # TensorArrays are always flat
    elems_ta = [
        tensor_array_ops.TensorArray(dtype=elem.dtype, size=n,
                                     dynamic_size=False,
                                     infer_shape=True)
        for elem in elems_flat]
    # Unpack elements
    elems_ta = [
        elem_ta.unstack(elem) for elem_ta, elem in zip(elems_ta, elems_flat)]

    if initializer is None:
      a_flat = [elem.read(n - 1 if reverse else 0) for elem in elems_ta]
      i = constant_op.constant(1)
    else:
      initializer_flat = output_flatten(initializer)
      a_flat = [ops.convert_to_tensor(init) for init in initializer_flat]
      i = constant_op.constant(0)

    # Create a tensor array to store the intermediate values.
    accs_ta = [
        tensor_array_ops.TensorArray(
            dtype=init.dtype, size=n,
            element_shape=init.shape if infer_shape else None,
            dynamic_size=False,
            infer_shape=infer_shape)
        for init in a_flat]

    if initializer is None:
      accs_ta = [acc_ta.write(n - 1 if reverse else 0, a)
                 for (acc_ta, a) in zip(accs_ta, a_flat)]

    def compute(i, a_flat, tas):
      """The loop body of scan.

      Args:
        i: the loop counter.
        a_flat: the accumulator value(s), flattened.
        tas: the output accumulator TensorArray(s), flattened.

      Returns:
        [i + 1, a_flat, tas]: the updated counter + new accumulator values +
          updated TensorArrays

      Raises:
        TypeError: if initializer and fn() output structure do not match
        ValueError: if initializer and fn() output lengths do not match
      """
      packed_elems = input_pack([elem_ta.read(i) for elem_ta in elems_ta])
      packed_a = output_pack(a_flat)
      a_out = fn(packed_a, packed_elems)
      nest.assert_same_structure(
          elems if initializer is None else initializer, a_out)
      flat_a_out = output_flatten(a_out)
      tas = [ta.write(i, value) for (ta, value) in zip(tas, flat_a_out)]
      if reverse:
        next_i = i - 1
      else:
        next_i = i + 1
      return (next_i, flat_a_out, tas)

    if reverse:
      initial_i = n - 1 - i
      condition = lambda i, _1, _2: i >= 0
    else:
      initial_i = i
      condition = lambda i, _1, _2: i < n
    _, _, r_a = control_flow_ops.while_loop(
        condition, compute, (initial_i, a_flat, accs_ta),
        parallel_iterations=parallel_iterations,
        back_prop=back_prop, swap_memory=swap_memory,
        maximum_iterations=n)

    results_flat = [r.stack() for r in r_a]

    n_static = tensor_shape.Dimension(tensor_shape.dimension_value(
        elems_flat[0].get_shape().with_rank_at_least(1)[0]))
    for elem in elems_flat[1:]:
      n_static.merge_with(tensor_shape.Dimension(tensor_shape.dimension_value(
          elem.get_shape().with_rank_at_least(1)[0])))
    for r in results_flat:
      r.set_shape(tensor_shape.TensorShape(n_static).concatenate(
          r.get_shape()[1:]))

    # TODO(akshayka): Remove the in_graph_mode check once caching devices are
    # supported in Eager
    if in_graph_mode and varscope_caching_device_was_none:
      varscope.set_caching_device(None)

    return output_pack(results_flat)
Ejemplo n.º 52
0
    def build(self, input_shape):
        """Create variables of the Cudnn RNN.

    It can be called manually before `__call__()` or automatically through
    `__call__()`. In the former case, subsequent `__call__()`s will skip
    creating variables.
    Args:
      input_shape: network input tensor shape, a python list or a TensorShape
        object with 3 dimensions.
    Raises:
      ValueError: if input_shape has wrong dimension or unknown 3rd dimension.
    """
        if self.built:
            return

        input_shape = tensor_shape.TensorShape(input_shape)
        if input_shape.ndims != 3:
            raise ValueError("Expecting input_shape with 3 dims, got %d" %
                             input_shape.ndims)
        if input_shape[-1].value is None:
            raise ValueError("The last dimension of the inputs to `CudnnRNN` "
                             "should be defined. Found `None`.")
        self._input_size = input_shape[-1].value
        self.input_spec = base_layer.InputSpec(ndim=3,
                                               axes={-1: self._input_size})

        self._set_scope(None)

        # Not using the base class `add_variable()` since it calls
        # `tf.get_variable()` with a callable initializer, whereas here we
        # initialize with a tensor. The difference is mandated to support
        # forward compatibility with Cudnn.
        with vs.variable_scope(self._scope,
                               reuse=self.built,
                               custom_getter=self._update_trainable_weights):
            if self._kernel_initializer is None:
                self._kernel_initializer = init_ops.glorot_uniform_initializer(
                    seed=self._seed, dtype=self._plain_dtype)
            if self._bias_initializer is None:
                self._bias_initializer = init_ops.constant_initializer(
                    0.0, dtype=self._plain_dtype)

            weights = [
                self._kernel_initializer(sp, dtype=self._plain_dtype)
                for sp in self.canonical_weight_shapes
            ]
            biases = [
                self._bias_initializer(sp, dtype=self._plain_dtype)
                for sp in self.canonical_bias_shapes
            ]
            opaque_params_t = self._canonical_to_opaque(weights, biases)

            if vs.get_variable_scope().partitioner is not None:
                logging.warn(
                    "Partitioner is not supported for Cudnn RNN layer variables, using "
                    "it will create forward-compatibility issues with future "
                    "CUDA/CuDNN generations.")
            # Initialize opaque params with a tensor.
            self.kernel = vs.get_variable("opaque_kernel",
                                          dtype=self._plain_dtype,
                                          initializer=opaque_params_t,
                                          validate_shape=False)
        # Create the saveable in the outer scope of the cudnn subgraph, so that
        # an alternative subgraph with platform-independent rnn cells can load
        # the checkpoints directly.
        if not (self.built or vs.get_variable_scope().reuse is True):
            self._create_saveable()
        self.built = True
Ejemplo n.º 53
0
def attention_isf_decoder(decoder_inputs,
                          initial_state,
                          attention_states,
                          isf_scores,
                          idf_scores,
                          locisf_scores,
                          cell,
                          output_size=None,
                          num_heads=1,
                          loop_function=None,
                          dtype=None,
                          scope=None,
                          initial_state_attention=False):
    """
  isf_scores: np array with ISF scores (not a tensor) (normalized or not)
  """

    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if num_heads < 1:
        raise ValueError(
            "With less than 1 heads, use a non-attention decoder.")
    if not attention_states.get_shape()[1:2].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_ifsscore_decoder"):

        batch_size = array_ops.shape(
            decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        if attn_length is None:
            attn_length = array_ops.shape(attention_states)[1]
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])
        hidden_features = []
        v = []
        attention_vec_size = attn_size  # Size of query vectors for attention.
        for a in range(num_heads):
            k = variable_scope.get_variable(
                "AttnW_%d" % a, [1, 1, attn_size, attention_vec_size])
            hidden_features.append(
                nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME"))
            v.append(
                variable_scope.get_variable("AttnV_%d" % a,
                                            [attention_vec_size]))

        state = initial_state

        def attention(query):
            """Put attention masks on hidden using hidden_features and query."""
            ds = []  # Results of attention reads will be stored here.
            if nest.is_sequence(query):  # If the query is a tuple, flatten it.
                query_list = nest.flatten(query)
                for q in query_list:  # Check that ndims == 2 if specified.
                    ndims = q.get_shape().ndims
                    if ndims:
                        assert ndims == 2
                query = array_ops.concat(1, query_list)
            for a in range(num_heads):
                with variable_scope.variable_scope("Attention_%d" % a):
                    y = linear(query, attention_vec_size, True)
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # Attention mask is a softmax of v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v[a] * math_ops.tanh(hidden_features[a] + y), [2, 3])
                    a = nn_ops.softmax(s)
                    # Now calculate the attention-weighted vector d.
                    d = math_ops.reduce_sum(
                        array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        prev = None
        batch_attn_size = array_ops.pack([batch_size, attn_size])
        attns = [
            array_ops.zeros(batch_attn_size, dtype=dtype)
            for _ in range(num_heads)
        ]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])
        if initial_state_attention:
            attns = attention(initial_state)
        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    inp = loop_function(prev, i)
            # Merge input and previous attentions into one vector of the right size.
            input_size = inp.get_shape().with_rank(2)[1]
            if input_size.value is None:
                raise ValueError("Could not infer input size from input: %s" %
                                 inp.name)

            #h_isf = tf.mul(isf_scores[i],inp)
            #extra_feats = [h_isf]
            extra_feats = []
            if FLAGS.use_locisf:
                extra_feats.append(locisf_scores[i])
            if FLAGS.use_isf:
                extra_feats.append(isf_scores[i])
            if FLAGS.use_idf:
                extra_feats.append(idf_scores[i])

            x = linear([inp] + attns + extra_feats, input_size, True)
            # Run the RNN.
            cell_output, state = cell(x, state)
            # Run the attention mechanism.
            if i == 0 and initial_state_attention:
                with variable_scope.variable_scope(
                        variable_scope.get_variable_scope(), reuse=True):
                    attns = attention(state)
            else:
                attns = attention(state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([cell_output] + attns + extra_feats,
                                output_size, True)
            if loop_function is not None:
                prev = output
            outputs.append(output)

    return outputs, state
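The `extra_feats` handling above simply widens the per-step projection input by concatenating score columns before the matmul inside `linear`; schematically (NumPy, toy shapes):

```python
import numpy as np

batch, input_size, attn_size = 2, 8, 4
inp = np.random.randn(batch, input_size)
attn = np.random.randn(batch, attn_size)
isf = np.random.randn(batch, 1)     # one ISF score column per example
locisf = np.random.randn(batch, 1)  # one loc-ISF score column per example

# linear([inp] + attns + extra_feats, ...) concatenates on axis 1, so the
# kernel sees input_size + attn_size + 2 columns in this configuration.
x_in = np.concatenate([inp, attn, isf, locisf], axis=1)
assert x_in.shape == (batch, input_size + attn_size + 2)
```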
Ejemplo n.º 54
0
def model_with_buckets(encoder_inputs,
                       encoder_mask,
                       decoder_inputs,
                       targets,
                       weights,
                       buckets,
                       seq2seq,
                       softmax_loss_function=None,
                       per_example_loss=False,
                       name=None):
    """Create a sequence-to-sequence model with support for bucketing.

    Args:
      encoder_inputs: A list of Tensors to feed the encoder; first seq2seq input.
      encoder_mask: the mask of encoder inputs that label where are PADs.
      decoder_inputs: A list of Tensors to feed the decoder; second seq2seq input.
      targets: A list of 1D batch-sized int32 Tensors (desired output sequence).
      weights: List of 1D batch-sized float-Tensors to weight the targets.
      buckets: A list of pairs of (input size, output size) for each bucket.
      seq2seq: A sequence-to-sequence model function
      softmax_loss_function: Function (inputs-batch, labels-batch) -> loss-batch
        to be used instead of the standard softmax (the default if this is None).
      per_example_loss: Boolean. If set, the returned loss will be a batch-sized
        tensor of losses for each sequence in the batch. If unset, it will be
        a scalar with the averaged loss from all examples.
      name: Optional name for this operation, defaults to "model_with_buckets".

    Returns:
      A tuple of the form (outputs, losses, symbols), where:
        outputs: The outputs for each bucket. Its j'th element consists of a list
          of 2D Tensors of shape [batch_size x num_decoder_symbols] (jth outputs).
        losses: List of scalar Tensors, representing losses for each bucket, or,
          if per_example_loss is set, a list of 1D batch-sized float Tensors.
        symbols: The final translation results obtained from beam search.

    Raises:
      ValueError: If the length of encoder_inputs, targets, or weights is smaller
        than the largest (last) bucket.
    """
    if len(encoder_inputs) < buckets[-1][0]:
        raise ValueError(
            "Length of encoder_inputs (%d) must be at least that of la"
            "st bucket (%d)." % (len(encoder_inputs), buckets[-1][0]))
    if len(targets) < buckets[-1][1]:
        raise ValueError("Length of targets (%d) must be at least that of last"
                         "bucket (%d)." % (len(targets), buckets[-1][1]))
    if len(weights) < buckets[-1][1]:
        raise ValueError("Length of weights (%d) must be at least that of last"
                         "bucket (%d)." % (len(weights), buckets[-1][1]))

    all_inputs = encoder_inputs + decoder_inputs + targets + weights
    losses = []
    outputs = []
    symbols = []  # to save the output of beam search
    with ops.name_scope(name, "model_with_buckets", all_inputs):
        for j, bucket in enumerate(buckets):
            with variable_scope.variable_scope(
                    variable_scope.get_variable_scope(),
                    reuse=True if j > 0 else None):
                bucket_outputs, _, bucket_symbols = seq2seq(
                    encoder_inputs[:bucket[0]], encoder_mask,
                    decoder_inputs[:bucket[1]])
                outputs.append(bucket_outputs)
                symbols.append(bucket_symbols)
                if per_example_loss:
                    losses.append(
                        sequence_loss_by_example(
                            outputs[-1],
                            targets[:bucket[1]],
                            weights[:bucket[1]],
                            softmax_loss_function=softmax_loss_function))
                else:
                    losses.append(
                        sequence_loss(
                            outputs[-1],
                            targets[:bucket[1]],
                            weights[:bucket[1]],
                            softmax_loss_function=softmax_loss_function))

    return outputs, losses, symbols
Ejemplo n.º 55
0
    def time_aware_multihead_attention(
        self,
        queries,
        keys,
        key_length,
        query_length,
        t_querys,
        t_keys,
        t_querys_length,
        t_keys_length,
        num_units=None,
        num_heads=8,
        dropout_rate=0,
        is_training=True,
        scope="multihead_attention",
        reuse=None,
    ):
        '''Applies multihead attention.

        Args:
          queries: A 3d tensor with shape of [N, T_q, C_q].
          query_length: A 1d tensor with shape of [N].
          keys: A 3d tensor with shape of [N, T_k, C_k].
          key_length: A 1d tensor with shape of [N].
          t_querys: Timestamps associated with the queries, used by the time
            decay gate below.
          t_keys: Timestamps associated with the keys.
          t_querys_length: Static length of the query timestamp axis.
          t_keys_length: Static length of the key timestamp axis.
          num_units: A scalar. Attention size.
          num_heads: An int. Number of heads.
          dropout_rate: A floating point number.
          is_training: Boolean. Controller of the dropout mechanism.
          scope: Optional scope for `variable_scope`.
          reuse: Boolean, whether to reuse the weights of a previous layer
            by the same name.

        Returns:
          A 3d tensor with shape of (N, T_q, C).
        '''
        # Set the fallback for num_units before the projections use it.
        if num_units is None:
            num_units = queries.get_shape().as_list()[-1]

        # Linear projections, C = # dim or column, T_x = # vectors or actions
        Q = tf.layers.dense(queries, num_units,
                            activation=tf.nn.relu)  # (N, T_q, C)
        K = tf.layers.dense(keys, num_units,
                            activation=tf.nn.relu)  # (N, T_k, C)
        V = tf.layers.dense(keys, num_units,
                            activation=tf.nn.relu)  # (N, T_k, C)

        with tf.variable_scope(scope, reuse=reuse):

            # time decay gate
            with variable_scope.variable_scope(
                    variable_scope.get_variable_scope(), reuse=None):
                    time_input_w = variable_scope.get_variable(
                        "_time_input_w",
                        shape=[num_units, num_units],
                        dtype=queries.dtype)
                    time_input_w1 = variable_scope.get_variable(
                        "_time_input_w1",
                        shape=[t_querys_length, t_keys_length],
                        dtype=queries.dtype)
                    time_input_b1 = variable_scope.get_variable(
                        "_time_input_b1",
                        shape=[t_querys_length, t_keys_length],
                        dtype=queries.dtype)
                    time_output_w1 = variable_scope.get_variable(
                        "time_output_w1",
                        shape=[t_querys_length, t_keys_length],
                        dtype=queries.dtype)
                    time_output_w2 = variable_scope.get_variable(
                        "time_output_w2",
                        shape=[t_querys_length, t_keys_length],
                        dtype=queries.dtype)
                    time_output_w3 = variable_scope.get_variable(
                        "time_output_w3",
                        shape=[t_querys_length, t_keys_length],
                        dtype=queries.dtype)
                    time_output_b = variable_scope.get_variable(
                        "time_output_b",
                        shape=[t_querys_length, t_keys_length],
                        dtype=queries.dtype)

            time_query_key = math_ops.tensordot(Q, time_input_w, [[2], [0]])
            time_query_key = tf.matmul(time_query_key,
                                       keys,
                                       transpose_b=True,
                                       name='2')
            time_query_key = tf.nn.tanh(time_query_key)
            t_querys = tf.stack([t_querys] * t_keys_length, axis=2)
            t_keys = tf.stack([t_keys] * t_querys_length, axis=1)

            # elapsed-time feature: log(|t_query - t_key| + 1), shape (N, T_q, T_k)
            decay = tf.log(tf.add(tf.abs(tf.subtract(t_querys, t_keys)), 1))
            decay = tf.nn.tanh(decay * time_input_w1 + time_input_b1)

            # combine the time decay and the content interaction into one gate
            decay_gate = time_output_w1 * decay + time_output_w2 * time_query_key + time_output_b

            # Split and concat
            Q_ = tf.concat(tf.split(Q, num_heads, axis=2),
                           axis=0)  # (h*N, T_q, C/h)
            K_ = tf.concat(tf.split(K, num_heads, axis=2),
                           axis=0)  # (h*N, T_k, C/h)
            V_ = tf.concat(tf.split(V, num_heads, axis=2),
                           axis=0)  # (h*N, T_k, C/h)
            decay_gate_ = tf.concat([decay_gate] * num_heads,
                                    axis=0)  # (h*N, T_q, T_k)

            # Multiplication: query-key score matrix.
            # Each big score matrix is then split into h score matrices of the
            # same size w.r.t. different parts of the feature.
            outputs = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]),
                                name='3')  # (h*N, T_q, T_k)
            outputs *= tf.nn.sigmoid(decay_gate_)

            # Scale
            outputs = outputs / (K_.get_shape().as_list()[-1]**0.5)

            # Key Masking
            key_masks = tf.sequence_mask(key_length,
                                         tf.shape(keys)[1])  # (N, T_k)
            key_masks = tf.tile(key_masks, [num_heads, 1])  # (h*N, T_k)
            key_masks = tf.tile(tf.expand_dims(
                key_masks, 1), [1, tf.shape(queries)[1], 1])  # (h*N, T_q, T_k)

            paddings = tf.ones_like(outputs) * (-2**32 + 1)
            outputs = tf.where(key_masks, outputs, paddings)  # (h*N, T_q, T_k)

            # Causality (future blinding): not useful here, removed.

            # Activation
            outputs = tf.nn.softmax(outputs)  # (h*N, T_q, T_k)

            # Query Masking
            query_masks = tf.sequence_mask(query_length,
                                           tf.shape(queries)[1],
                                           dtype=tf.float32)  # (N, T_q)
            query_masks = tf.tile(query_masks, [num_heads, 1])  # (h*N, T_q)
            query_masks = tf.tile(tf.expand_dims(query_masks, -1),
                                  [1, 1, tf.shape(keys)[1]])  # (h*N, T_q, T_k)
            outputs *= query_masks  # broadcasting. (N, T_q, C)

            # Attention vector
            att_vec = outputs

            # Dropouts
            #outputs = tf.layers.dropout(outputs, rate=dropout_rate, training=tf.convert_to_tensor(is_training))

            # Weighted sum
            outputs = tf.matmul(outputs, V_, name='4')  # ( h*N, T_q, C/h)

            # Restore shape
            outputs = tf.concat(tf.split(outputs, num_heads, axis=0),
                                axis=2)  # (N, T_q, C)

            # Residual connection
            outputs += queries

            # Normalize
            outputs = self.normalize(outputs)  # (N, T_q, C)

        return outputs, att_vec
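To make the gating math above easier to follow, here is a small NumPy sketch of the time-decay gate for a single example, stripped of heads, masking, and training machinery. Shapes and the random weights are illustrative; the real code uses the trainable TF variables declared above.

```python
# Minimal NumPy sketch of the time-decay gate (illustrative shapes/weights).
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def time_decay_gate(Q, K, t_q, t_k, W, W1, b1, Wo1, Wo2, bo):
    # Content interaction: tanh(Q W K^T), shape (T_q, T_k).
    qk = np.tanh(Q @ W @ K.T)
    # Elapsed-time feature: log(|t_q - t_k| + 1), shape (T_q, T_k).
    dt = np.log(np.abs(t_q[:, None] - t_k[None, :]) + 1.0)
    decay = np.tanh(dt * W1 + b1)
    # Elementwise gate combining time decay and content interaction; the
    # attention logits get multiplied by the sigmoid of this gate.
    return sigmoid(Wo1 * decay + Wo2 * qk + bo)

T_q, T_k, C = 4, 6, 8
rng = np.random.RandomState(0)
gate = time_decay_gate(
    rng.randn(T_q, C), rng.randn(T_k, C),            # Q, K
    rng.rand(T_q) * 100.0, rng.rand(T_k) * 100.0,    # timestamps
    rng.randn(C, C) * 0.1,                           # W  (content)
    rng.randn(T_q, T_k) * 0.1, np.zeros((T_q, T_k)), # W1, b1 (time)
    rng.randn(T_q, T_k) * 0.1, rng.randn(T_q, T_k) * 0.1,  # Wo1, Wo2
    np.zeros((T_q, T_k)))                            # bo
print(gate.shape)  # (4, 6)
```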
Ejemplo n.º 56
0
def attention_decoder(encoder_mask,
                      decoder_inputs,
                      initial_state,
                      attention_states,
                      cell,
                      beam_size,
                      output_size=None,
                      num_layers=1,
                      loop_function=None,
                      dtype=dtypes.float32,
                      scope=None,
                      initial_state_attention=False):
    """RNN decoder with attention for the sequence-to-sequence model.

    In this context "attention" means that, during decoding, the RNN can look up
    information in the additional tensor attention_states, and it does this by
    focusing on a few entries from the tensor. This model has proven to yield
    especially good results in a number of sequence-to-sequence tasks. This
    implementation is based on http://arxiv.org/abs/1409.0473 (see below for
    details).

    Args:
      encoder_mask: the mask of encoder inputs [batch_size x attn_length].
      decoder_inputs: A list of 2D Tensors [batch_size x input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      cell: rnn_cell.RNNCell defining the cell function and size.
      beam_size: the beam size used in beam search.
      output_size: Size of the output vectors; if None, we use cell.output_size.
      loop_function: When decoding, this function will be applied to the i-th
        output in order to generate the (i+1)-th input. The generation is done
        by beam search.
      dtype: The dtype to use for the RNN initial state (default: tf.float32).
      scope: VariableScope for the created subgraph; default: "attention_decoder".
      initial_state_attention: If False (default), initial attentions are zero.
        If True, initialize the attentions from the initial state.

    Returns:
      A tuple of the form (outputs, state, symbols), where:
        outputs: A list of the same length as decoder_inputs of 2D Tensors of
          shape [batch_size x output_size]. These represent the generated outputs.
          Output i is computed from input i (which is either the i-th element
          of decoder_inputs or loop_function(output_{i-1}, i)).
        state: The state of each decoder cell at the final time-step.
          It is a 2D Tensor of shape [batch_size x cell.state_size].
        symbols: When training, it is []; when decoding, it is the best translation
          generated by beam search.

    Raises:
      ValueError: when shapes of attention_states are not set,
        or input size cannot be inferred from the input.
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if not attention_states.get_shape()[1:3].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())
    if output_size is None:
        output_size = cell.output_size

    with variable_scope.variable_scope(scope or "attention_decoder"):
        batch_size = array_ops.shape(
            decoder_inputs[0])[0]  # Needed for reshaping.
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value
        state_size = initial_state.get_shape()[1].value
        attention_vec_size = attn_size // 2  # Size of query vectors for attention.

        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        # compute the initial hidden state of decoder
        initial_state = math_ops.tanh(
            linear(initial_state,
                   state_size,
                   False,
                   weight_initializer=init_ops.random_normal_initializer(
                       0, 0.01, seed=SEED)))

        with variable_scope.variable_scope(scope or "attention"):
            k = variable_scope.get_variable(
                "AttnW", [1, 1, attn_size, attention_vec_size],
                initializer=init_ops.random_normal_initializer(0,
                                                               0.001,
                                                               seed=SEED))
            hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
            v = variable_scope.get_variable(
                "AttnV", [attention_vec_size],
                initializer=init_ops.constant_initializer(0.0))

        def attention(query, scope=None):
            """Put attention masks on hidden using hidden_features and query."""
            with variable_scope.variable_scope(scope or "attention"):
                ds = []  # Results of attention reads will be stored here.
                if nest.is_sequence(
                        query):  # If the query is a tuple, flatten it.
                    query_list = nest.flatten(query)
                    for q in query_list:  # Check that ndims == 2 if specified.
                        ndims = q.get_shape().ndims
                        if ndims:
                            assert ndims == 2
                    query = array_ops.concat(query_list, 1)

                with variable_scope.variable_scope("AttnU"):
                    y = linear(
                        query,
                        attention_vec_size,
                        False,
                        weight_initializer=init_ops.random_normal_initializer(
                            0, 0.001, seed=SEED))
                    y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                    # the additive attention is computed by v^T * tanh(...).
                    s = math_ops.reduce_sum(
                        v * math_ops.tanh(hidden_features + y), [2, 3])
                    s = array_ops.transpose(
                        array_ops.transpose(s) - math_ops.reduce_max(s, [1]))
                    # softmax with mask
                    s = math_ops.exp(s)
                    s = math_ops.to_float(encoder_mask) * s
                    a = array_ops.transpose(
                        array_ops.transpose(s) / math_ops.reduce_sum(s, [1]))
                    d = math_ops.reduce_sum(
                        array_ops.reshape(a, [-1, attn_length, 1, 1]) * hidden,
                        [1, 2])
                    ds.append(array_ops.reshape(d, [-1, attn_size]))
            return ds

        outputs = []
        output = None
        state = initial_state
        out_state = array_ops.split(state, num_layers, 1)[-1]
        prev = None
        symbols = []
        prev_probs = [0]
        batch_attn_size = array_ops.stack([batch_size, attn_size])
        attns = [array_ops.zeros(batch_attn_size, dtype=dtype)]
        for a in attns:  # Ensure the second shape of attention vectors is set.
            a.set_shape([None, attn_size])

        for i, inp in enumerate(decoder_inputs):
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()
            # If loop_function is set, we use it instead of decoder_inputs.
            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    inp, prev_probs, index, prev_symbol = loop_function(
                        prev, prev_probs, beam_size, i)
                    out_state = array_ops.gather(out_state,
                                                 index)  # update prev state
                    state = array_ops.gather(state, index)  # update prev state
                    attns = [array_ops.gather(attn, index)
                             for attn in attns]  # update prev attens
                    for j, output in enumerate(outputs):
                        outputs[j] = array_ops.gather(
                            output, index)  # update prev outputs
                    for j, symbol in enumerate(symbols):
                        symbols[j] = array_ops.gather(
                            symbol, index)  # update prev symbols
                    symbols.append(prev_symbol)

            # Run the attention mechanism.
            if i > 0 or (i == 0 and initial_state_attention):
                attns = attention(out_state, scope="attention")

            # Run the RNN.
            cinp = array_ops.concat(
                [inp, attns[0]],
                1)  # concatenate next input and the context vector
            out_state, state = cell(cinp, state)

            with variable_scope.variable_scope("AttnOutputProjection"):
                output = linear([out_state] + [cinp], output_size, False)
                output = array_ops.reshape(output, [-1, output_size // 2, 2])
                output = math_ops.reduce_max(output, 2)  # maxout

            if loop_function is not None:
                prev = output
            outputs.append(output)

        if loop_function is not None:
            # handle the last symbol
            inp, prev_probs, index, prev_symbol = loop_function(
                prev, prev_probs, beam_size, i + 1)
            out_state = array_ops.gather(out_state, index)  # update prev state
            state = array_ops.gather(state, index)  # update prev state
            for j, output in enumerate(outputs):
                outputs[j] = array_ops.gather(output,
                                              index)  # update prev outputs
            for j, symbol in enumerate(symbols):
                symbols[j] = array_ops.gather(symbol,
                                              index)  # update prev symbols
            symbols.append(prev_symbol)

            # output the best result of beam search
            for k, symbol in enumerate(symbols):
                symbols[k] = array_ops.gather(symbol, 0)
            out_state = array_ops.expand_dims(array_ops.gather(out_state, 0),
                                              0)
            state = array_ops.expand_dims(array_ops.gather(state, 0), 0)
            for j, output in enumerate(outputs):
                outputs[j] = array_ops.expand_dims(array_ops.gather(output, 0),
                                                   0)  # update prev outputs
    return outputs, state, symbols
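The attention() helper above implements a masked softmax by hand: subtract the row max for numerical stability, exponentiate, zero out padded positions with encoder_mask, then renormalize. A minimal NumPy sketch of just that step, with illustrative values:

```python
# NumPy sketch of the masked softmax inside attention() above.
import numpy as np

def masked_softmax(scores, mask):
    # scores: (batch, attn_length); mask: (batch, attn_length) of 0/1.
    s = scores - scores.max(axis=1, keepdims=True)   # stability shift
    e = np.exp(s) * mask                 # padded positions contribute nothing
    return e / e.sum(axis=1, keepdims=True)

scores = np.array([[2.0, 1.0, 0.5], [0.1, 3.0, 0.0]])
mask = np.array([[1.0, 1.0, 0.0], [1.0, 1.0, 1.0]])
print(masked_softmax(scores, mask))      # rows sum to 1; masked entries are 0
```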
def embedding_lookup_sparse(
    params,
    sp_ids,
    sp_weights,
    partition_strategy=None,  # not used
    name="embedding_lookup_sparse",
    combiner="mean",
    max_norm=None,
    return_trainable=False,
):
  """Provides a dynamic version of embedding_lookup_sparse
      similar with tf.nn.embedding_lookup_sparse.

    This op assumes that there is at least one id for each row in the dense tensor
    represented by sp_ids (i.e. there are no rows with empty features), and that
    all the indices of sp_ids are in canonical row-major order.

    It also assumes that all id values lie in the range [0, p0), where p0
    is the sum of the size of params along dimension 0.

    Args:
      params: A single `dynamic_embedding.Variable` instance representing
        the complete embedding tensor.
      sp_ids: N x M `SparseTensor` of int64 ids where N is typically batch size
        and M is arbitrary.
      sp_weights: either a `SparseTensor` of float / double weights, or `None` to
        indicate all weights should be taken to be 1. If specified, `sp_weights`
        must have exactly the same shape and indices as `sp_ids`.
      partition_strategy: Not used.
      name: a name for the operation. Name is optional in graph mode and required
        in eager mode.
      combiner: A string specifying the reduction op. Currently "mean", "sqrtn"
        and "sum" are supported. "sum" computes the weighted sum of the embedding
        results for each row. "mean" is the weighted sum divided by the total
        weight. "sqrtn" is the weighted sum divided by the square root of the sum
        of the squares of the weights.
      max_norm: If not `None`, each embedding is clipped if its l2-norm is larger
        than this value, before combining.
      return_trainable: optional. If True, also return the TrainableWrapper
        created by `dynamic_embedding.embedding_lookup`.

    Returns:
      combined_embeddings: A dense tensor representing the combined embeddings
        for the sparse ids. For each row in the dense tensor represented by
        `sp_ids`, the op looks up the embeddings for all ids in that row,
        multiplies them by the corresponding weight, and combines these embeddings
        as specified.

        In other words, if

          `shape(combined params) = [+infinity, dim]`

        and

          `shape(sp_ids) = shape(sp_weights) = [d0, d1, ..., dn]`

        then

          `shape(output) = [d0, dim]`.

        For instance, if params dim=20, and sp_ids / sp_weights are

          ```python
          [0, 0]: id 1, weight 2.0
          [0, 1]: id 3, weight 0.5
          [1, 0]: id 0, weight 1.0
          [2, 3]: id 1, weight 3.0
          ```

        with `combiner`="mean", then the output will be a 3x20 matrix where

          ```python
          output[0, :] = (params[1, :] * 2.0 + params[3, :] * 0.5) / (2.0 + 0.5)
          output[1, :] = (params[0, :] * 1.0) / 1.0
          output[2, :] = (params[1, :] * 3.0) / 3.0
          ```
      trainable_wrap:
        A TrainableWrapper object used to fill the Optimizer's `var_list`.
        Only provided if `return_trainable` is True.
    Raises:
      TypeError: If `sp_ids` is not a `SparseTensor`, or if `sp_weights` is
        neither `None` nor `SparseTensor`.
      ValueError: If `combiner` is not one of {"mean", "sqrtn", "sum"}.
    """
  if combiner not in ("mean", "sqrtn", "sum"):
    raise ValueError("combiner must be one of 'mean', 'sqrtn' or 'sum'")

  if not isinstance(sp_ids, sparse_tensor.SparseTensor):
    raise TypeError("sp_ids must be SparseTensor")

  ignore_weights = sp_weights is None
  if not ignore_weights:
    if not isinstance(sp_weights, sparse_tensor.SparseTensor):
      raise TypeError("sp_weights must be either None or SparseTensor")

  scope = variable_scope.get_variable_scope()
  full_name = scope.name + "/" + name if scope.name else name
  with ops.name_scope(full_name + "/"):
    segment_ids = sp_ids.indices[:, 0]
    if segment_ids.dtype != dtypes.int32:
      segment_ids = math_ops.cast(segment_ids, dtypes.int32)

    ids = sp_ids.values
    ids, idx = array_ops.unique(ids)

    embeddings, trainable_ = embedding_lookup(
        params,
        ids,
        name=name + "/embedding_lookup",
        partition_strategy=partition_strategy,
        max_norm=max_norm,
        return_trainable=True,
    )
    if embeddings.dtype in (dtypes.float16, dtypes.bfloat16):
      embeddings = math_ops.cast(embeddings, dtypes.float32)
    if not ignore_weights:
      weights = sp_weights.values
      if weights.dtype != embeddings.dtype:
        weights = math_ops.cast(weights, embeddings.dtype)

      embeddings = array_ops.gather(embeddings, idx)

      # Reshape weights to allow broadcast
      ones = array_ops.fill(
          array_ops.expand_dims(array_ops.rank(embeddings) - 1, 0), 1)
      bcast_weights_shape = array_ops.concat([array_ops.shape(weights), ones],
                                             0)

      orig_weights_shape = weights.get_shape()
      weights = array_ops.reshape(weights, bcast_weights_shape)

      # Set the weight shape, since after reshaping to bcast_weights_shape,
      # the shape becomes None.
      if embeddings.get_shape().ndims is not None:
        weights.set_shape(
            orig_weights_shape.concatenate(
                [1 for _ in range(embeddings.get_shape().ndims - 1)]))

      embeddings *= weights

      if combiner == "sum":
        embeddings = math_ops.segment_sum(embeddings, segment_ids, name=name)
      elif combiner == "mean":
        embeddings = math_ops.segment_sum(embeddings, segment_ids)
        weight_sum = math_ops.segment_sum(weights, segment_ids)
        embeddings = math_ops.div(embeddings, weight_sum, name=name)
      elif combiner == "sqrtn":
        embeddings = math_ops.segment_sum(embeddings, segment_ids)
        weights_squared = math_ops.pow(weights, 2)
        weight_sum = math_ops.segment_sum(weights_squared, segment_ids)
        weight_sum_sqrt = math_ops.sqrt(weight_sum)
        embeddings = math_ops.div(embeddings, weight_sum_sqrt, name=name)
      else:
        assert False, "Unrecognized combiner"
    else:
      assert idx is not None
      if combiner == "sum":
        embeddings = de.math.sparse_segment_sum(embeddings,
                                                idx,
                                                segment_ids,
                                                name=name)
      elif combiner == "mean":
        embeddings = math_ops.sparse_segment_mean(embeddings,
                                                  idx,
                                                  segment_ids,
                                                  name=name)
      elif combiner == "sqrtn":
        embeddings = math_ops.sparse_segment_sqrt_n(embeddings,
                                                    idx,
                                                    segment_ids,
                                                    name=name)
      else:
        assert False, "Unrecognized combiner"

    return (embeddings, trainable_) if return_trainable else embeddings
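A hypothetical call sketch matching the docstring example above. The construction of `params` (a dynamic_embedding variable) is not shown in this example, so that line is left commented; sp_ids and sp_weights mirror the id/weight table from the docstring.

```python
# Hypothetical usage sketch for embedding_lookup_sparse (assumed setup).
import tensorflow as tf

sp_ids = tf.SparseTensor(
    indices=[[0, 0], [0, 1], [1, 0], [2, 3]],
    values=tf.constant([1, 3, 0, 1], dtype=tf.int64),
    dense_shape=[3, 4])
sp_weights = tf.SparseTensor(
    indices=sp_ids.indices,
    values=tf.constant([2.0, 0.5, 1.0, 3.0]),
    dense_shape=sp_ids.dense_shape)

# params = ...  # a dynamic_embedding.Variable with dim=20, created elsewhere
# combined = embedding_lookup_sparse(params, sp_ids, sp_weights,
#                                    combiner="mean")
# combined then has shape [3, 20], as in the docstring example.
```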
Ejemplo n.º 58
0
def pointer_decoder(decoder_inputs,
                    initial_state,
                    attention_states,
                    ori_encoder_inputs,
                    cell,
                    feed_prev=False,
                    dtype=dtypes.float32,
                    scope=None):
    """RNN decoder with pointer net for the sequence-to-sequence model.
    Args:
      decoder_inputs: a list of 2D Tensors [batch_size x cell.input_size].
      initial_state: 2D Tensor [batch_size x cell.state_size].
      attention_states: 3D Tensor [batch_size x attn_length x attn_size].
      ori_encoder_inputs: the raw encoder inputs; used to build the next decoder
        input when feed_prev is True.
      cell: rnn_cell.RNNCell defining the cell function and size.
      feed_prev: Boolean; if True, the next decoder input is an attention-weighted
        sum of ori_encoder_inputs instead of the provided decoder input.
      dtype: The dtype to use for the RNN initial state (default: tf.float32).
      scope: VariableScope for the created subgraph; default: "pointer_decoder".
    Returns:
      outputs: A list of the same length as decoder_inputs of 2D Tensors of shape
        [batch_size x output_size]. These represent the generated outputs.
        Output i is computed from input i (the i-th element of decoder_inputs).
        First, we run the cell
        on a combination of the input and previous attention masks:
          cell_output, new_state = cell(linear(input, prev_attn), prev_state).
        Then, we calculate new attention masks:
          new_attn = softmax(V^T * tanh(W * attention_states + U * new_state))
        and then we calculate the output:
          output = linear(cell_output, new_attn).
      states: The state of each decoder cell in each time-step. This is a list
        with length len(decoder_inputs) -- one item for each time-step.
        Each item is a 2D Tensor of shape [batch_size x cell.state_size].
    """
    if not decoder_inputs:
        raise ValueError("Must provide at least 1 input to attention decoder.")
    if not attention_states.get_shape()[1:3].is_fully_defined():
        raise ValueError(
            "Shape[1] and [2] of attention_states must be known: %s" %
            attention_states.get_shape())

    with vs.variable_scope(scope or "point_decoder"):
        batch_size = array_ops.shape(
            decoder_inputs[0])[0]  # Needed for reshaping.
        input_size = decoder_inputs[0].get_shape()[1].value
        attn_length = attention_states.get_shape()[1].value
        attn_size = attention_states.get_shape()[2].value

        # To calculate W1 * h_t we use a 1-by-1 convolution, need to reshape before.
        hidden = array_ops.reshape(attention_states,
                                   [-1, attn_length, 1, attn_size])

        attention_vec_size = attn_size  # Size of query vectors for attention.
        k = vs.get_variable("AttnW", [1, 1, attn_size, attention_vec_size])
        hidden_features = nn_ops.conv2d(hidden, k, [1, 1, 1, 1], "SAME")
        v = vs.get_variable("AttnV", [attention_vec_size])

        states = [initial_state]

        def attention(query):
            """Point on hidden using hidden_features and query."""
            with vs.variable_scope("Attention"):
                y = core_rnn_cell_impl._linear(query, attention_vec_size, True)
                y = array_ops.reshape(y, [-1, 1, 1, attention_vec_size])
                # Attention mask is a softmax of v^T * tanh(...).
                s = math_ops.reduce_sum(v * math_ops.tanh(hidden_features + y),
                                        [2, 3])
                return s

        outputs = []
        prev = None
        batch_attn_size = array_ops.stack([batch_size, attn_size])
        attns = array_ops.zeros(batch_attn_size, dtype=dtype)

        attns.set_shape([None, attn_size])
        inps = []
        for i in range(len(decoder_inputs)):
            if i > 0:
                vs.get_variable_scope().reuse_variables()
            inp = decoder_inputs[i]

            if feed_prev and i > 0:
                inp = tf.stack(ori_encoder_inputs)
                inp = tf.transpose(inp, perm=[1, 0, 2])
                inp = tf.reshape(inp, [-1, attn_length, input_size])
                inp = tf.reduce_sum(
                    inp *
                    tf.reshape(tf.nn.softmax(output), [-1, attn_length, 1]), 1)
                inp = tf.stop_gradient(inp)
                inps.append(inp)

            # Use the same inputs in inference, ordered internally.

            # Merge input and previous attentions into one vector of the right size.

            x = core_rnn_cell_impl._linear([inp, attns], cell.output_size,
                                           True)

            # Run the RNN.
            cell_output, new_state = cell(x, states[-1])
            states.append(new_state)
            # Run the attention mechanism.
            output = attention(new_state)

            outputs.append(output)

    return outputs, states, inps
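When feed_prev is True, the loop above softmaxes the previous pointer scores over encoder positions and uses them to average the raw encoder inputs into the next decoder input. A NumPy sketch of that blending step, with illustrative shapes:

```python
# NumPy sketch of the feed_prev blending in pointer_decoder above.
import numpy as np

def blend_inputs(encoder_inputs, pointer_scores):
    # encoder_inputs: (batch, attn_length, input_size)
    # pointer_scores: (batch, attn_length), unnormalized, as attention() returns
    e = np.exp(pointer_scores - pointer_scores.max(axis=1, keepdims=True))
    attn = e / e.sum(axis=1, keepdims=True)
    # Weighted average over encoder positions -> (batch, input_size).
    return (encoder_inputs * attn[:, :, None]).sum(axis=1)

enc = np.random.randn(2, 5, 3)
scores = np.random.randn(2, 5)
print(blend_inputs(enc, scores).shape)  # (2, 3)
```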
Ejemplo n.º 59
0
def _GetBatchNormParams(graph, context, has_scaling):
    """Extracts relevant tensors for folding batch norms.

  Args:
    graph: Graph to inspect.
    context: The scope under which we look for batch norm params.
    has_scaling: Bool that specifies if scaling is done as part of batch norm.

  Returns:
    _BatchNormMatch containing all required batch norm parameters.
  """
    gamma_tensor = None
    batch_mean_tensor = None
    batch_variance_tensor = None
    moving_mean_tensor = None
    moving_variance_tensor = None
    batch_epsilon = None
    bn_decay_mean_tensor = None
    bn_decay_var_tensor = None

    # TODO(raghuramank): This code relies on string matching and needs to be
    # updated if unfused batch norm continues to be widely used.
    # Matching variable names is brittle and relies on scoping
    # conventions. Fused batch norm folding is more robust. Support for unfused
    # batch norms will be deprecated as we move forward. Fused batch norms allow
    # for faster training and should be used whenever possible.
    # context contains part of the names of the tensors we are interested in:
    # For MobilenetV1, the context has repetitions:
    # MobilenetV1/MobilenetV1/Conv2d_3_depthwise
    # when the moving_mean tensor has the name:
    # MobilenetV1/Conv2d_3_depthwise/BatchNorm/moving_mean/read
    # To pick the correct variable name, it is necessary to ignore the repeating
    # header.

    # For MobilenetV2, this problem does not exist:
    # The context is: MobilenetV2/expanded_conv_3/depthwise
    # and the names of the tensors start with a single MobilenetV2
    # The moving mean for example, has the name:
    # MobilenetV2/expanded_conv_3/depthwise/BatchNorm/moving_mean/read
    # We identify the best match for an op by checking for
    # 1. The suffix of the op is exactly matched
    # 2. Maximum number of matches with the context. The matching
    # score is given by the number of parts of context (split by /) that
    # are present in the parts of the tensor name (again split by /).
    # For example: scope= MobilenetV2/MobilenetV2/expanded_conv_3 and
    # op.name =  MobilenetV2/expanded_conv_3/depthwise/BatchNorm/moving_mean/read
    # will have 2 matches; a scope with a different conv layer will have one match.

    op_suffix_mean = 'BatchNorm/moments/Squeeze'
    op_suffix_variance = 'BatchNorm/moments/Squeeze_1'
    op_suffix_epsilon = 'BatchNorm/batchnorm_1/add/y'
    op_suffix_bn_decay_mean = 'BatchNorm/AssignMovingAvg/decay'
    op_suffix_bn_decay_var = 'BatchNorm/AssignMovingAvg_1/decay'

    if variable_scope.get_variable_scope().use_resource:
        op_suffix_gamma = 'BatchNorm/gamma/Read/ReadVariableOp'
        op_suffix_moving_variance = (
            'BatchNorm/moving_variance/Read/ReadVariableOp')
        op_suffix_moving_mean = ('BatchNorm/moving_mean/Read/ReadVariableOp')
    else:
        op_suffix_gamma = 'BatchNorm/gamma'
        op_suffix_moving_variance = 'BatchNorm/moving_variance/read'
        op_suffix_moving_mean = 'BatchNorm/moving_mean/read'
    # Parse through list of ops to find relevant ops

    batch_mean_tensor = _FindMatchingTensor(graph, op_suffix_mean, context)
    batch_variance_tensor = _FindMatchingTensor(graph, op_suffix_variance,
                                                context)
    moving_mean_tensor = _FindMatchingTensor(graph, op_suffix_moving_mean,
                                             context)
    moving_variance_tensor = _FindMatchingTensor(graph,
                                                 op_suffix_moving_variance,
                                                 context)
    batch_epsilon = _FindMatchingTensor(graph, op_suffix_epsilon, context)
    bn_decay_mean_tensor = _FindMatchingTensor(graph, op_suffix_bn_decay_mean,
                                               context)
    bn_decay_var_tensor = _FindMatchingTensor(graph, op_suffix_bn_decay_var,
                                              context)
    if batch_mean_tensor is None and moving_mean_tensor is None:
        raise ValueError('Error folding unfused batch norms')
    if has_scaling:
        gamma_tensor = _FindMatchingTensor(graph, op_suffix_gamma, context)
    else:
        gamma_tensor = array_ops.ones(moving_mean_tensor.shape)

    return _BatchNormMatch(layer_op=None,
                           bn_op=None,
                           output_tensor=None,
                           input_tensor=None,
                           weight_tensor=None,
                           gamma_tensor=gamma_tensor,
                           beta_tensor=None,
                           mean_tensor=batch_mean_tensor,
                           variance_tensor=batch_variance_tensor,
                           moving_mean_tensor=moving_mean_tensor,
                           moving_variance_tensor=moving_variance_tensor,
                           bn_decay_mean_tensor=bn_decay_mean_tensor,
                           bn_decay_var_tensor=bn_decay_var_tensor,
                           batch_epsilon=batch_epsilon,
                           batch_to_space_op=None)
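The matching heuristic described in the comments above (exact suffix match first, then the candidate sharing the most '/'-separated parts with the context wins) can be sketched as follows. _FindMatchingTensor itself is not shown in this example, so the function below is only an illustration of the scoring rule:

```python
# Illustrative sketch of the suffix-and-context matching heuristic.
def match_score(context, op_name, suffix):
    # Require an exact suffix match, then count context parts present
    # in the op name (both split by '/').
    if not op_name.endswith(suffix):
        return -1
    ctx_parts = set(context.split('/'))
    return sum(1 for part in op_name.split('/') if part in ctx_parts)

ops = [
    'MobilenetV2/expanded_conv_3/depthwise/BatchNorm/moving_mean/read',
    'MobilenetV2/expanded_conv_4/depthwise/BatchNorm/moving_mean/read',
]
context = 'MobilenetV2/MobilenetV2/expanded_conv_3'
best = max(ops, key=lambda name: match_score(context, name,
                                             'BatchNorm/moving_mean/read'))
print(best)  # the expanded_conv_3 tensor wins (2 matches vs 1)
```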
Ejemplo n.º 60
0
def actrgn_rnn_decoder(decoder_inputs,
                       initial_state,
                       initial_attn_output,
                       cell,
                       attn_dim,
                       lstm_dim,
                       loop_function=None,
                       scope=None):
    """RNN decoder for the sequence-to-sequence model.

  Args:
    decoder_inputs: A list of 2D Tensors [batch_size x input_size].
    initial_state: 2D Tensor with shape [batch_size x cell.state_size].
    initial_attn_output: 2D Tensor [batch_size x lstm_dim], the initial output
      used to query the attention at the first step.
    cell: core_rnn_cell.RNNCell defining the cell function and size.
    attn_dim: An int, the dimensionality of the attention space.
    lstm_dim: An int, the dimensionality of the cell output.
    loop_function: If not None, this function will be applied to the i-th output
      in order to generate the i+1-st input, and decoder_inputs will be ignored,
      except for the first element ("GO" symbol). This can be used for decoding,
      but also for training to emulate http://arxiv.org/abs/1506.03099.
      Signature -- loop_function(prev, i) = next
        * prev is a 2D Tensor of shape [batch_size x output_size],
        * i is an integer, the step number (when advanced control is needed),
        * next is a 2D Tensor of shape [batch_size x input_size].
    scope: VariableScope for the created subgraph; defaults to "rnn_decoder".

  Returns:
    A tuple of the form (outputs, state), where:
      outputs: A list of the same length as decoder_inputs of 2D Tensors with
        shape [batch_size x output_size] containing generated outputs.
      state: The state of each cell at the final time-step.
        It is a 2D Tensor of shape [batch_size x cell.state_size].
        (Note that in some cases, like basic RNN cell or GRU cell, outputs and
         states can be the same. They are different for LSTM cells though.)
  """
    with variable_scope.variable_scope(scope or "actrgn_rnn_decoder"):
        state = initial_state
        output = initial_attn_output

        outputs = []
        prev = None
        w_l = variable_scope.get_variable(name='lstm_to_attn_w',
                                          shape=[lstm_dim, attn_dim],
                                          dtype=tf.float32)
        b_l = variable_scope.get_variable(name='lstm_to_attn_b',
                                          shape=[attn_dim],
                                          dtype=tf.float32)
        w_i = variable_scope.get_variable(name='ip_to_attn_w',
                                          shape=[attn_dim, attn_dim],
                                          dtype=tf.float32)
        b_i = variable_scope.get_variable(name='ip_to_attn_b',
                                          shape=[attn_dim],
                                          dtype=tf.float32)
        w_f = variable_scope.get_variable(name='attn_to_prob_w',
                                          shape=[attn_dim, 1],
                                          dtype=tf.float32)
        b_f = variable_scope.get_variable(name='attn_to_prob_b',
                                          shape=[1],
                                          dtype=tf.float32)

        for i, inp in enumerate(decoder_inputs):

            if loop_function is not None and prev is not None:
                with variable_scope.variable_scope("loop_function",
                                                   reuse=True):
                    inp = loop_function(prev, i)
            if i > 0:
                variable_scope.get_variable_scope().reuse_variables()

            attn_state = tf.matmul(output, w_l) + b_l

            context_state = tf.matmul(
                inp, tf.tile(tf.expand_dims(w_i, 0),
                             [int(inp.shape[0]), 1, 1])) + b_i

            context_state = context_state + tf.expand_dims(attn_state, 1)
            context_state = tf.tanh(context_state)

            # softmax over the n candidate inputs (squeeze the size-1 axis
            # first, otherwise the softmax would normalize a length-1 axis)
            attn_prob = tf.nn.softmax(
                tf.squeeze(
                    tf.matmul(
                        context_state,
                        tf.tile(tf.expand_dims(w_f, 0),
                                [int(context_state.shape[0]), 1, 1])) + b_f,
                    axis=2))

            inp_rnn = tf.reduce_sum(
                tf.multiply(inp, tf.expand_dims(attn_prob, 2)), 1)

            output, state = cell(inp_rnn, state)

            outputs.append(output)
            if loop_function is not None:
                prev = output
    return outputs, state
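As a closing illustration, here is a NumPy sketch of the per-step attention inside the loop above: the previous cell output queries each row of the 3-D step input, and the softmaxed scores reduce it to a single vector for the cell. Shapes follow the variables declared above; the random weights are purely illustrative.

```python
# NumPy sketch of one attention step in actrgn_rnn_decoder (illustrative).
import numpy as np

def softmax(x, axis=-1):
    e = np.exp(x - x.max(axis=axis, keepdims=True))
    return e / e.sum(axis=axis, keepdims=True)

def attend(inp, prev_output, w_l, b_l, w_i, b_i, w_f, b_f):
    # inp: (batch, n, attn_dim); prev_output: (batch, lstm_dim)
    attn_state = prev_output @ w_l + b_l                  # (batch, attn_dim)
    context = np.tanh(inp @ w_i + b_i + attn_state[:, None, :])
    prob = softmax((context @ w_f + b_f).squeeze(-1), 1)  # (batch, n)
    return (inp * prob[:, :, None]).sum(axis=1)           # (batch, attn_dim)

batch, n, attn_dim, lstm_dim = 2, 5, 4, 6
rng = np.random.RandomState(0)
out = attend(rng.randn(batch, n, attn_dim), rng.randn(batch, lstm_dim),
             rng.randn(lstm_dim, attn_dim), np.zeros(attn_dim),
             rng.randn(attn_dim, attn_dim), np.zeros(attn_dim),
             rng.randn(attn_dim, 1), np.zeros(1))
print(out.shape)  # (2, 4)
```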