Example 1
def embedding_postprocessor(input_tensor,
                            use_positional_embeddings=True,
                            positional_embedding_name='positional_embeddings',
                            initializer_range=0.02,
                            max_positional_embeddings=512,
                            dropout_prob=0.1):
  """Perform positional embeddings on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, embedding_size].
    use_positional_embeddings: bool. Whether to add positional embeddings for
      the position of each token in the sequence.
    positional_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_positional_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output tensor.
    
  Returns:
    float Tensor with the same shape as 'input_tensor'.
  """
  input_shape = ft.get_shape_list(input_tensor, expected_rank=3)
  seq_length = input_shape[1]
  width = input_shape[2]

  output = input_tensor
  if use_positional_embeddings:
    assert_op = tf.assert_less_equal(seq_length, max_positional_embeddings)
    with tf.control_dependencies([assert_op]):
      full_positional_embeddings = tf.get_variable(
        name=positional_embedding_name,
        shape=[max_positional_embeddings, width],
        initializer=ft.create_initialzer(initializer_range=initializer_range))
    
    positional_embeddings = tf.slice(full_positional_embeddings, [0, 0], [seq_length, -1])  # [seq_length, width]
    positional_embeddings = tf.expand_dims(positional_embeddings, [0])  # [1, seq_length, width]

    output = input_tensor + positional_embeddings
  
  output = ft.layer_norm_and_dropout(output, dropout_prob)
  return output
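For reference, here is a minimal NumPy sketch (not part of the module) of what the positional-embedding branch above computes: slice the first seq_length rows of the embedding table and broadcast-add them over the batch. All shapes and names are illustrative.

import numpy as np

batch_size, seq_length, width = 2, 5, 8
max_positional_embeddings = 512

input_tensor = np.random.randn(batch_size, seq_length, width)
full_positional_embeddings = np.random.randn(max_positional_embeddings, width)

# Equivalent of tf.slice(..., [0, 0], [seq_length, -1]) followed by expand_dims:
positional_embeddings = full_positional_embeddings[:seq_length]   # [seq_length, width]
output = input_tensor + positional_embeddings[None, :, :]         # broadcast over batch

print(output.shape)  # (2, 5, 8)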
Example 2
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size,
                     initializer_range,
                     word_embedding_name='word_embeddings',
                     use_one_hot_embeddings=False):
  """Looks up words embeddings for id tensor.

  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True, use the one-hot method for word
      embeddings. If False, use 'tf.gather()'.
  
  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  """
  
  embedding_table = tf.get_variable(
    name=word_embedding_name,
    shape=[vocab_size, embedding_size],
    initializer=ft.create_initialzer(initializer_range=initializer_range))
  
  if use_one_hot_embeddings:
    input_shape = ft.get_shape_list(input_ids, expected_rank=2)
    input_ids_squeeze = tf.reshape(input_ids, [-1])
    one_hot_input_ids = tf.one_hot(input_ids_squeeze, depth=vocab_size)
    output = tf.matmul(one_hot_input_ids, embedding_table)
    output = tf.reshape(output, [input_shape[0], input_shape[1], -1])
  else:
    output = tf.nn.embedding_lookup(embedding_table, input_ids)
  
  return output, embedding_table
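As a sanity check, a small NumPy sketch (illustrative, not part of the module) showing that the two lookup paths above, the one-hot matmul and the direct gather, produce the same [batch_size, seq_length, embedding_size] output.

import numpy as np

vocab_size, embedding_size = 10, 4
embedding_table = np.random.randn(vocab_size, embedding_size)
input_ids = np.array([[1, 3, 5], [2, 2, 0]])             # [batch_size, seq_length]

one_hot = np.eye(vocab_size)[input_ids.reshape(-1)]      # [batch*seq, vocab_size]
via_matmul = (one_hot @ embedding_table).reshape(2, 3, embedding_size)
via_gather = embedding_table[input_ids]                  # same as tf.nn.embedding_lookup

print(np.allclose(via_matmul, via_gather))               # True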
Example 3
    def build_graph(self,
                    encoder_state,
                    initializer_range,
                    seq_length_decoder_input_data=None,
                    decoder_input_data=None,
                    scope=None):
        with tf.variable_scope(scope, default_name='decoder'):
            output_layer = tf.layers.Dense(
                self.tgt_vocab_size,
                name='decoder_output',
                kernel_initializer=ft.create_initialzer(
                    initializer_range=initializer_range))
            maximum_iterations = 0
            if self.max_len_infer is not None:
                maximum_iterations = self.max_len_infer
            else:
                decoding_length_factor = 5.0
                max_encoder_length = tf.reduce_max(
                    seq_length_decoder_input_data)
                maximum_iterations = tf.to_int32(
                    tf.round(
                        tf.to_float(max_encoder_length) *
                        decoding_length_factor))

            with tf.variable_scope('rnn_decoder') as scope:
                cells, decoder_initial_state = None, None
                cells = lego.create_cell_list_for_RNN(
                    unit_type=self.unit_type,
                    num_units=self.num_units,
                    dropout=self.dropout,
                    forget_bias=self.forget_bias)
                decoder_initial_state = encoder_state
                logits, sample_id, final_context_state = None, None, None

                if self.is_training:
                    target_input = decoder_input_data
                    decoder_emb_inp = tf.nn.embedding_lookup(
                        self.embedding_table, target_input)

                    helper = tf.contrib.seq2seq.TrainingHelper(
                        decoder_emb_inp, seq_length_decoder_input_data)
                    my_decoder = tf.contrib.seq2seq.BasicDecoder(
                        cells, helper, decoder_initial_state)

                    outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(
                        my_decoder, swap_memory=True, scope=scope)
                    self.logits = output_layer(outputs.rnn_output)
                    self.sample_id = tf.argmax(self.logits, axis=-1)
                else:
                    start_tokens = tf.fill([self.batch_size], self.tgt_sos_id)
                    end_token = self.tgt_eos_id

                    helper = tf.contrib.seq2seq.GreedyEmbeddingHelper(
                        self.embedding_table, start_tokens, end_token)
                    my_decoder = tf.contrib.seq2seq.BasicDecoder(
                        cells,
                        helper,
                        decoder_initial_state,
                        output_layer=output_layer)

                    outputs, final_context_state, _ = tf.contrib.seq2seq.dynamic_decode(
                        my_decoder,
                        maximum_iterations=maximum_iterations,
                        swap_memory=True,
                        scope=scope)
                    self.logits = outputs.rnn_output
                    self.sample_id = outputs.sample_id

        self.ppl_seq, self.ppl = ft.get_ppl(self.logits)
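The decoding-length logic above can be summarized in a small standalone sketch; NumPy stands in for tf.reduce_max / tf.round / tf.to_int32, and the function name is hypothetical.

import numpy as np

def max_decode_iterations(source_lengths, max_len_infer=None, decoding_length_factor=5.0):
    # Use the configured hard limit if present; otherwise cap decoding at
    # decoding_length_factor times the longest source sequence.
    if max_len_infer is not None:
        return int(max_len_infer)
    max_encoder_length = int(np.max(source_lengths))
    return int(round(max_encoder_length * decoding_length_factor))

print(max_decode_iterations([7, 12, 9]))       # 60
print(max_decode_iterations([7, 12, 9], 30))   # 30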
Example 4
def transformer_model(input_tensor,
                      attention_mask,
                      hidden_size,
                      num_hidden_layers,
                      num_attention_heads,
                      intermediate_size,
                      intermediate_act_fn,
                      hidden_dropout_prob,
                      attention_dropout_prob,
                      initializer_range,
                      do_return_all_layers=False):
  """Multi-headed, multi-layer Transformer from 'Attention is All you need'.
  
  This is almost an exact implementation of the original Transformer encoder.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
      seq_length], with 1 for positions that can be attended to and 0 for
      positions that should not be.
    hidden_size: int. Hidden size of the Transformer.
    num_hidden_layers: int. Number of layers (blocks) in the Transformer.
    num_attention_heads: int. Number of attention heads in the Transformer.
    intermediate_size: int. The size of the "intermediate" (a.k.a., feed
      forward) layer.
    intermediate_act_fn: function. The non-linear activation function to apply
      to the output of the intermediate/feed-forward layer.
    hidden_dropout_prob: float. Dropout probability for the hidden layers.
    attention_dropout_prob: float. Dropout probability of the attention
      probabilities.
    initializer_range: float. Range of the initializer (stddev of truncated
      normal).
    do_return_all_layers: Whether to also return all layers or just the final
      layer.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size], the final
    hidden layer of the Transformer. If do_return_all_layers is True, a list
    with one such Tensor per layer is returned instead.
  """
  if hidden_size % num_attention_heads != 0:
    raise ValueError(
      'The hidden size ({}) is not a multiple of the number of attention '
      'heads ({}).'.format(hidden_size, num_attention_heads))
  
  attention_head_size = int(hidden_size / num_attention_heads)
  input_shape = ft.get_shape_list(input_tensor, expected_rank=3)
  batch_size = input_shape[0]
  seq_length = input_shape[1]
  input_width = input_shape[2]

  if input_width != hidden_size:
    raise ValueError('The width of the input tensor ({}) != hidden size ({}).'.format(input_width, hidden_size))

  prev_output = input_tensor
  all_layers_outputs = []
  for layer_idx in range(num_hidden_layers):
    with tf.variable_scope('layer_{}'.format(layer_idx)):
      layer_input = prev_output

      with tf.variable_scope('attention'):
        with tf.variable_scope('self'):
          # [b, s, n * a]
          attention_head = attention_layer(input_tensor=layer_input,
                                           attention_mask=attention_mask,
                                           num_attention_heads=num_attention_heads,
                                           size_per_head=attention_head_size,
                                           query_act=None,
                                           key_act=None,
                                           value_act=None,
                                           attention_dropout_prob=attention_dropout_prob,
                                           initializer_range=initializer_range,
                                           batch_size=batch_size,
                                           seq_length=seq_length)
        
        with tf.variable_scope('output'):
          # [b, s, h]
          attention_output = tf.layers.dense(
            attention_head,
            hidden_size,
            kernel_initializer=ft.create_initialzer(initializer_range=initializer_range))
          attention_output = ft.dropout(attention_output, hidden_dropout_prob)
          attention_output = ft.layer_norm(attention_output + layer_input)

      with tf.variable_scope('intermediate'):
        # [b, s, i]
        intermediate_output = tf.layers.dense(
          attention_output,
          intermediate_size,
          activation=intermediate_act_fn,
          kernel_initializer=ft.create_initialzer(initializer_range=initializer_range))
      
      with tf.variable_scope('output'):
        # [b, s, h]
        layer_output = tf.layers.dense(
          intermediate_output,
          hidden_size,
          kernel_initializer=ft.create_initialzer(initializer_range=initializer_range))
        layer_output = ft.dropout(layer_output, hidden_dropout_prob)
        layer_output = ft.layer_norm(layer_output + attention_output)
        prev_output = layer_output
        all_layers_outputs.append(prev_output)
  
  if do_return_all_layers:
    return all_layers_outputs
  else:
    return all_layers_outputs[-1]
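The per-layer dataflow above (self-attention, residual + LayerNorm, feed-forward, residual + LayerNorm, i.e. a post-LN block) can be sketched in plain NumPy; layer_norm below is a stand-in for ft.layer_norm, and the callables passed in are hypothetical placeholders.

import numpy as np

def layer_norm(x, eps=1e-6):
    mu = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    return (x - mu) / np.sqrt(var + eps)

def transformer_layer(x, self_attention, feed_forward):
    # Attention block: attend, add residual, normalize (post-LN).
    attention_output = layer_norm(self_attention(x) + x)
    # Feed-forward block: transform, add residual, normalize.
    return layer_norm(feed_forward(attention_output) + attention_output)

x = np.random.randn(2, 5, 8)
out = transformer_layer(x, self_attention=lambda t: t, feed_forward=lambda t: t)
print(out.shape)  # (2, 5, 8)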
Example 5
def attention_layer(input_tensor,
                    attention_mask,
                    num_attention_heads,
                    size_per_head,
                    query_act,
                    key_act,
                    value_act,
                    attention_dropout_prob,
                    initializer_range,
                    batch_size,
                    seq_length):
  """Performs multi-headed attention from `from_tensor` to `to_tensor`.

  This is an implementation of multi-headed attention based on "Attention
  is all you Need". If `from_tensor` and `to_tensor` are the same, then
  this is self-attention. Each timestep in `from_tensor` attends to the
  corresponding sequence in `to_tensor`, and returns a fixed-with vector.

  This function first projects `from_tensor` into a "query" tensor and
  `to_tensor` into "key" and "value" tensors. These are (effectively) a list
  of tensors of length `num_attention_heads`, where each tensor is of shape
  [batch_size, seq_length, size_per_head].

  Then, the query and key tensors are dot-producted and scaled. These are
  softmaxed to obtain attention probabilities. The value tensors are then
  interpolated by these probabilities, then concatenated back to a single
  tensor and returned.

  In practice, the multi-headed attention are done with transposes and
  reshapes rather than actual separate tensors.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, width].
    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
      seq_length]. The values should be 1 or 0. The attention scores will
      effectively be set to -infinity for any positions in the mask that are 0,
      and will be unchanged for positions that are 1.
    num_attention_heads: int. Number of attention heads.
    size_per_head: int. Size of each attention head.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    attention_dropout_prob: (optional) float. Dropout probability of the
      attention probabilities.
    initializer_range: float. Range of the weight initializer.
    batch_size: int. Batch size of `input_tensor`.
    seq_length: int. Sequence length of `input_tensor`.

  Returns:
    float Tensor of shape [batch_size, seq_length,
      num_attention_heads * size_per_head].

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
  """
  
  def transpose_for_scores(input_tensor, batch_size, seq_length, num_attention_heads, width):
    output_tensor = tf.reshape(
      input_tensor, [batch_size, seq_length, num_attention_heads, width])
    
    output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
    return output_tensor

  # [b, s, n * a]
  query_layer = tf.layers.dense(
    input_tensor,
    num_attention_heads * size_per_head,
    activation=query_act,
    name='query',
    kernel_initializer=ft.create_initialzer(initializer_range=initializer_range))

  key_layer = tf.layers.dense(
    input_tensor,
    num_attention_heads * size_per_head,
    activation=key_act,
    name='key',
    kernel_initializer=ft.create_initialzer(initializer_range=initializer_range))
  
  value_layer = tf.layers.dense(
    input_tensor,
    num_attention_heads * size_per_head,
    activation=value_act,
    name='value',
    kernel_initializer=ft.create_initialzer(initializer_range=initializer_range))
  
  # [b, n, s, a]
  query_layer = transpose_for_scores(query_layer, batch_size, seq_length, 
                                     num_attention_heads, size_per_head)
  
  key_layer = transpose_for_scores(key_layer, batch_size, seq_length, 
                                   num_attention_heads, size_per_head)
  
  # [b, n, s, s]
  attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
  attention_scores = tf.multiply(attention_scores, 1.0 / math.sqrt(float(size_per_head)))

  if attention_mask is not None:
    # [b, 1, s, s]
    attention_mask = tf.expand_dims(attention_mask, axis=[1])
    adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
    attention_scores += adder

  attention_prob = tf.nn.softmax(attention_scores)
  attention_prob = ft.dropout(attention_prob, attention_dropout_prob)

  # [b, n, s, a]
  value_layer = transpose_for_scores(value_layer, batch_size, seq_length,
                                     num_attention_heads, size_per_head)
  
  # [b, n, s, a]
  context_layer = tf.matmul(attention_prob, value_layer)
  # [b, s, n, a]
  context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
  # [b, s, n * a]
  context_layer = tf.reshape(context_layer, [batch_size, seq_length, num_attention_heads * size_per_head])

  return context_layer
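A minimal single-head NumPy sketch of the scaled dot-product attention computed above, without dropout; the additive -10000.0 mask trick mirrors the TF code, and all names here are illustrative.

import numpy as np

def scaled_dot_product_attention(q, k, v, mask=None):
    # q, k, v: [seq_length, size_per_head]; mask: [seq_length, seq_length] of 1/0.
    scores = q @ k.T / np.sqrt(q.shape[-1])
    if mask is not None:
        scores += (1.0 - mask) * -10000.0            # push masked positions toward -inf
    probs = np.exp(scores - scores.max(axis=-1, keepdims=True))
    probs /= probs.sum(axis=-1, keepdims=True)       # softmax over the key axis
    return probs @ v                                 # probability-weighted sum of values

q = k = v = np.random.randn(4, 8)
print(scaled_dot_product_attention(q, k, v).shape)   # (4, 8)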