Example 1
def vae(state, num_units, scope='vae'):
    """VAE implementation, Hard Coding.
    
    The formula to calculate the vae loss:
      vae_loss = (-0.5 * tf.reduce_sum(1.0 + vae_vb - tf.square(vae_mean) - tf.exp(vae_vb)) / batch_size) * 0.001  
    """
    shape = ft.get_shape_list(state, expected_rank=2)

    with tf.variable_scope(scope):
        vae_mean = tf.layers.dense(state,
                                   num_units,
                                   activation=tf.nn.tanh,
                                   name='vae_mean',
                                   kernel_initializer=ft.create_initializer())

        vae_vb = tf.layers.dense(state,
                                 num_units,
                                 activation=tf.nn.tanh,
                                 name='vae_vb',
                                 kernel_initializer=ft.create_initializer())

        eps = tf.random_normal([shape[0], num_units],
                               0.0,
                               1.0,
                               dtype=tf.float32)

        # Reparameterization trick: vae_vb is the log-variance, so sigma = sqrt(exp(vae_vb)).
        z = vae_mean + tf.sqrt(tf.exp(vae_vb)) * eps

    return z, vae_mean, vae_vb
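
A minimal usage sketch (assuming TF 1.x and that `ft` is the repo's helper module); the batch size of 32, 256-unit encoder state, and 64-dimensional latent are illustrative only:

import tensorflow as tf

encoder_state = tf.placeholder(tf.float32, [32, 256], name='encoder_state')  # illustrative shape
z, vae_mean, vae_vb = vae(encoder_state, num_units=64)

# KL term from the docstring, scaled by 0.001
batch_size = 32.0
vae_loss = (-0.5 * tf.reduce_sum(
    1.0 + vae_vb - tf.square(vae_mean) - tf.exp(vae_vb)) / batch_size) * 0.001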
Example 2
def embedding_postprocessor(input_tensor,
                            use_positional_embeddings=True,
                            positional_embedding_name='positional_embeddings',
                            initializer_range=0.02,
                            max_positional_embeddings=512,
                            dropout_prob=0.1):
    """Perform positional embeddings on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, embedding_size].
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output tensor.
    
  Returns:
    float tensor with same sahpe as 'input_tensor'.
  """
    input_shape = ft.get_shape_list(input_tensor, expected_rank=3)
    seq_length = input_shape[1]
    width = input_shape[2]

    if use_positional_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_positional_embeddings)
        with tf.control_dependencies([assert_op]):
            full_positional_embeddings = tf.get_variable(
                name=positional_embedding_name,
                shape=[max_positional_embeddings, width],
                initializer=ft.create_initializer(
                    initializer_range=initializer_range))

        positional_embeddings = tf.slice(
            full_positional_embeddings, [0, 0],
            [seq_length, -1])  # [seq_length, width]
        positional_embeddings = tf.expand_dims(positional_embeddings,
                                               [0])  # [1, seq_length, width]

        output = input_tensor + positional_embeddings
    else:
        # Without positional embeddings, pass the word embeddings through unchanged.
        output = input_tensor

    output = ft.layer_norm_and_dropout(output, dropout_prob)
    return output
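
A minimal sketch of applying the postprocessor to an already looked-up embedding tensor (assuming TF 1.x and the repo's `ft` helpers; all shapes below are illustrative):

import tensorflow as tf

word_embeddings = tf.placeholder(tf.float32, [32, 128, 768], name='word_embeddings')  # illustrative
embeddings = embedding_postprocessor(
    word_embeddings,
    use_positional_embeddings=True,
    max_positional_embeddings=512,
    dropout_prob=0.1)  # -> [32, 128, 768], layer-normalized with dropout applied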
Example 3
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size,
                     initializer_range,
                     word_embedding_name='word_embeddings',
                     use_one_hot_embeddings=False):
    """Looks up words embeddings for id tensor.

  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialization range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True, use the one-hot method for word
      embeddings. If False, use 'tf.gather()'.
  
  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  """

    embedding_table = tf.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        initializer=ft.create_initializer(initializer_range=initializer_range))

    if use_one_hot_embeddings:
        input_shape = ft.get_shape_list(input_ids, expected_rank=2)
        input_ids_squeeze = tf.reshape(input_ids, [-1])
        one_hot_input_ids = tf.one_hot(input_ids_squeeze, depth=vocab_size)
        output = tf.matmul(one_hot_input_ids, embedding_table)
        output = tf.reshape(output, [input_shape[0], input_shape[1], -1])
    else:
        output = tf.nn.embedding_lookup(embedding_table, input_ids)

    return output, embedding_table
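
A standalone sketch of the one-hot lookup path (assuming TF 1.x and the repo's `ft` helpers; the vocabulary size and shapes are illustrative), which replaces the gather with a matmul against the embedding table:

import tensorflow as tf

input_ids = tf.placeholder(tf.int32, [32, 128], name='input_ids')  # illustrative shape
output, embedding_table = embedding_lookup(
    input_ids,
    vocab_size=30522,
    embedding_size=768,
    initializer_range=0.02,
    use_one_hot_embeddings=True)  # -> [32, 128, 768], computed via one-hot matmul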
Example 4
def transformer_model(input_tensor,
                      attention_mask,
                      hidden_size,
                      num_hidden_layers,
                      num_attention_heads,
                      intermediate_size,
                      intermediate_act_fn,
                      hidden_dropout_prob,
                      attention_dropout_prob,
                      initializer_range,
                      do_return_all_layers=False):
    """Multi-headed, multi-layer Transformer from 'Attention is All you need'.
  
  This is almost an exact implementation of the original Transformer encoder.

  Args:
  input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
  attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
    seq_length], with 1 for positions that can be attended to and 0 for
    positions that should not be.
  hidden_size: int. Hidden size of the Transformer.
  num_hidden_layers: int. Number of layers (blocks) in the Transformer.
  num_attention_heads: int. Number of attention heads in the Transformer.
  intermediate_size: int. The size of the "intermediate" (a.k.a., feed
    forward) layer.
  intermediate_act_fn: function. The non-linear activation function to apply
    to the output of the intermediate/feed-forward layer.
  hidden_dropout_prob: float. Dropout probability for the hidden layers.
  attention_dropout_prob: float. Dropout probability of the attention
    probabilities.
  initializer_range: float. Range of the initializer (stddev of truncated
    normal).
  do_return_all_layers: bool. Whether to return all layer outputs or just the
    final layer.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size], the final
    hidden layer of the Transformer, or a list of such Tensors (one per layer)
    if do_return_all_layers is True.
  """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            'The hidden size ({}) is not a multiple of the number of attention '
            'heads ({}).'.format(hidden_size, num_attention_heads))

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = ft.get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    if input_width != hidden_size:
        raise ValueError(
            'The width of the input tensor ({}) != hidden size ({}).'.format(
                input_width, hidden_size))

    prev_output = input_tensor
    all_layers_outputs = []
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope('layer_{}'.format(layer_idx)):
            layer_input = prev_output

            with tf.variable_scope('attention'):
                with tf.variable_scope('self'):
                    # [b, s, n * a]
                    attention_head = attention_layer(
                        input_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        query_act=None,
                        key_act=None,
                        value_act=None,
                        attention_dropout_prob=attention_dropout_prob,
                        initializer_range=initializer_range,
                        batch_size=batch_size,
                        seq_length=seq_length)

                with tf.variable_scope('output'):
                    # [b, s, h]
                    attention_output = tf.layers.dense(
                        attention_head,
                        hidden_size,
                        kernel_initializer=ft.create_initializer(
                            initializer_range=initializer_range))
                    attention_output = ft.dropout(attention_output,
                                                  hidden_dropout_prob)
                    attention_output = ft.layer_norm(attention_output +
                                                     layer_input)

            with tf.variable_scope('intermediate'):
                # [b, s, i]
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=ft.create_initializer(
                        initializer_range=initializer_range))

            with tf.variable_scope('output'):
                # [b, s, h]
                layer_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=ft.create_initializer(
                        initializer_range=initializer_range))
                layer_output = ft.dropout(layer_output, hidden_dropout_prob)
                layer_output = ft.layer_norm(layer_output + attention_output)
                prev_output = layer_output
                all_layers_outputs.append(prev_output)

    if do_return_all_layers:
        return all_layers_outputs
    else:
        return all_layers_outputs[-1]
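
A minimal sketch of building the full encoder stack (assuming TF 1.x and the repo's `ft` helpers; the BERT-base-like sizes are illustrative, and tf.nn.relu stands in for the usual gelu activation to keep the sketch self-contained):

import tensorflow as tf

embeddings = tf.placeholder(tf.float32, [32, 128, 768], name='embeddings')          # illustrative
attention_mask = tf.placeholder(tf.float32, [32, 128, 128], name='attention_mask')  # 1/0 mask

sequence_output = transformer_model(
    input_tensor=embeddings,
    attention_mask=attention_mask,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    intermediate_act_fn=tf.nn.relu,
    hidden_dropout_prob=0.1,
    attention_dropout_prob=0.1,
    initializer_range=0.02)  # -> [32, 128, 768]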
Example 5
def attention_layer(input_tensor, attention_mask, num_attention_heads,
                    size_per_head, query_act, key_act, value_act,
                    attention_dropout_prob, initializer_range, batch_size,
                    seq_length):
    """Performs multi-headed attention from `from_tensor` to `to_tensor`.

  This is an implementation of multi-headed attention based on "Attention
  is all you Need". If `from_tensor` and `to_tensor` are the same, then
  this is self-attention. Each timestep in `from_tensor` attends to the
  corresponding sequence in `to_tensor`, and returns a fixed-with vector.

  This function first projects `input_tensor` into "query", "key", and "value"
  tensors. These are (effectively) a list
  of tensors of length `num_attention_heads`, where each tensor is of shape
  [batch_size, seq_length, size_per_head].

  Then, the query and key tensors are dot-producted and scaled. These are
  softmaxed to obtain attention probabilities. The value tensors are then
  interpolated by these probabilities, then concatenated back to a single
  tensor and returned.

  In practice, the multi-headed attention is done with transposes and
  reshapes rather than actual separate tensors.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, width].
    attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
      seq_length]. The values should be 1 or 0. The attention scores will
      effectively be set to -infinity for any positions in the mask that are 0,
      and will be unchanged for positions that are 1.
    num_attention_heads: int. Number of attention heads.
    size_per_head: int. Size of each attention head.
    query_act: (optional) Activation function for the query transform.
    key_act: (optional) Activation function for the key transform.
    value_act: (optional) Activation function for the value transform.
    attention_dropout_prob: float. Dropout probability of the attention
      probabilities.
    initializer_range: float. Range of the weight initializer.
    batch_size: int. Batch size of `input_tensor`.
    seq_length: int. Sequence length of `input_tensor`.

  Returns:
    float Tensor of shape [batch_size, seq_length,
      num_attention_heads * size_per_head].

  Raises:
    ValueError: Any of the arguments or tensor shapes are invalid.
  """
    def transpose_for_scores(input_tensor, batch_size, seq_length,
                             num_attention_heads, width):
        output_tensor = tf.reshape(
            input_tensor, [batch_size, seq_length, num_attention_heads, width])

        output_tensor = tf.transpose(output_tensor, [0, 2, 1, 3])
        return output_tensor

    # [b, s, n * a]
    query_layer = tf.layers.dense(input_tensor,
                                  num_attention_heads * size_per_head,
                                  activation=query_act,
                                  name='query',
                                  kernel_initializer=ft.create_initializer(
                                      initializer_range=initializer_range))

    key_layer = tf.layers.dense(input_tensor,
                                num_attention_heads * size_per_head,
                                activation=key_act,
                                name='key',
                                kernel_initializer=ft.create_initializer(
                                    initializer_range=initializer_range))

    value_layer = tf.layers.dense(input_tensor,
                                  num_attention_heads * size_per_head,
                                  activation=value_act,
                                  name='value',
                                  kernel_initializer=ft.create_initializer(
                                      initializer_range=initializer_range))

    # [b, n, s, a]
    query_layer = transpose_for_scores(query_layer, batch_size, seq_length,
                                       num_attention_heads, size_per_head)

    key_layer = transpose_for_scores(key_layer, batch_size, seq_length,
                                     num_attention_heads, size_per_head)

    # [b, n, s, s]
    attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
    attention_scores = tf.multiply(attention_scores,
                                   1.0 / math.sqrt(float(size_per_head)))

    if attention_mask is not None:
        # [b, 1, s, s]
        attention_mask = tf.expand_dims(attention_mask, axis=[1])
        adder = (1.0 - tf.cast(attention_mask, tf.float32)) * -10000.0
        attention_scores += adder

    attention_prob = tf.nn.softmax(attention_scores)
    attention_prob = ft.dropout(attention_prob, attention_dropout_prob)

    # [b, n, s, a]
    value_layer = transpose_for_scores(value_layer, batch_size, seq_length,
                                       num_attention_heads, size_per_head)

    # [b, n, s, a]
    context_layer = tf.matmul(attention_prob, value_layer)
    # [b, s, n, a]
    context_layer = tf.transpose(context_layer, [0, 2, 1, 3])
    # [b, s, n * a]
    context_layer = tf.reshape(
        context_layer,
        [batch_size, seq_length, num_attention_heads * size_per_head])

    return context_layer
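
A standalone sketch of a single self-attention call (assuming TF 1.x and the repo's `ft` helpers; the shapes and head counts are illustrative):

import tensorflow as tf

layer_input = tf.placeholder(tf.float32, [32, 128, 768], name='layer_input')   # illustrative
mask = tf.placeholder(tf.int32, [32, 128, 128], name='attention_mask')         # 1/0 mask

context = attention_layer(
    input_tensor=layer_input,
    attention_mask=mask,
    num_attention_heads=12,
    size_per_head=64,
    query_act=None,
    key_act=None,
    value_act=None,
    attention_dropout_prob=0.1,
    initializer_range=0.02,
    batch_size=32,
    seq_length=128)  # -> [32, 128, 12 * 64]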
Example 6
  def model_fn(features, labels, mode, params):
    # log feature names and shapes
    for name in sorted(features.keys()):
      tf.logging.info(' name = {}, shape = {}'.format(name, features[name].shape))

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # get data
    input_data = features['input_data']
    input_mask = features['input_mask']
    if mode == tf.estimator.ModeKeys.TRAIN:
      sentiment_labels = features['sentiment_labels']
      sentiment_mask_indices = features['sentiment_mask_indices']
      true_length_from_data = features['true_length']

    # build model
    model = BertEncoder(
      config=cg.BertEncoderConfig,
      is_training=is_training,
      input_ids=input_data,
      input_mask=input_mask)
    
    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    if init_checkpoint:
      (assignment_map, initialized_variable_names
      ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    # [cls] output -> [b, h]
    cls_output = model.get_cls_output()
    # sequence_output -> [b, s, h]; drop [CLS], because the sentiment mask indices are not shifted
    sequence_output = model.get_sequence_output()[:, 1:, :]

    # project from hidden_size down to num_classes
    with tf.variable_scope('final_output'):
      # [b, num_classes]
      output_logits = tf.layers.dense(
        cls_output,
        cg.BertEncoderConfig.num_classes,
        name='final_output',
        kernel_initializer=ft.create_initializer(initializer_range=cg.BertEncoderConfig.initializer_range))

    if mode == tf.estimator.ModeKeys.PREDICT:
      output_softmax = tf.nn.softmax(output_logits, axis=-1)
      output_result = tf.argmax(output_softmax, axis=-1)
      predictions = {'predict': output_result}
      output_spec = tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
      if mode == tf.estimator.ModeKeys.TRAIN:
        # masked_output -> [b * x, h]
        masked_output = gather_indexs(sequence_output, sentiment_mask_indices)
        
        # get output for word polarity prediction
        with tf.variable_scope('sentiment_project'):
          # [b * x, 2]
          output_sentiment = tf.layers.dense(
            masked_output,
            2,
            name='final_output',
            kernel_initializer=ft.create_initializer(initializer_range=cg.BertEncoderConfig.initializer_range))
        # output_sentiment_probs = tf.nn.softmax(output_sentiment, axis=-1)

        batch_size = tf.cast(ft.get_shape_list(labels, expected_rank=1)[0], dtype=tf.float32)
        # cross-entropy loss
        cls_loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=labels,
          logits=output_logits)) / batch_size

        # token-level sentiment loss (the mse_loss name comes from the regression variant below)
        # # Regression Model
        true_sequence = get_true_sequence(true_length_from_data)
        # mse_loss = calculate_mse_loss(
        #   output_sentiment, sentiment_labels, true_sequence)

        # # Classification Model
        true_label_flatten = tf.reshape(sentiment_labels, [-1])
        mse_loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=true_label_flatten,
          logits=output_sentiment) * true_sequence) / tf.reduce_sum(true_sequence)

        loss = cls_loss + mse_loss
        # loss = cls_loss

        learning_rate = tf.train.polynomial_decay(cg.learning_rate,
                                  tf.train.get_or_create_global_step(),
                                  cg.train_steps,
                                  end_learning_rate=cg.lr_limit,
                                  power=1.0,
                                  cycle=False)

        lr = tf.maximum(tf.constant(cg.lr_limit), learning_rate)
        optimizer = tf.train.AdamOptimizer(lr, name='optimizer')
        tvars = tf.trainable_variables()
        gradients = tf.gradients(loss, tvars, colocate_gradients_with_ops=cg.colocate_gradients_with_ops)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        train_op = optimizer.apply_gradients(zip(clipped_gradients, tvars), global_step=tf.train.get_global_step())

        current_steps = tf.train.get_or_create_global_step()
        logging_hook = tf.train.LoggingTensorHook(
          {'step' : current_steps, 'loss' : loss, 'cls_loss' : cls_loss, 'mse_loss': mse_loss, 'lr' : lr}, 
          every_n_iter=cg.print_info_interval)

        output_spec = tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op, training_hooks=[logging_hook])
      elif mode == tf.estimator.ModeKeys.EVAL:
        # TODO
        raise NotImplementedError
    
    return output_spec
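
The two-space indentation indicates that this `model_fn` is defined inside a builder closure that supplies `init_checkpoint`. A hypothetical wiring sketch (the builder name, checkpoint path, and model_dir are assumptions, not the repo's actual values):

import tensorflow as tf

def model_fn_builder(init_checkpoint):
  # The model_fn shown above would be defined here, closing over init_checkpoint.
  def model_fn(features, labels, mode, params):
    ...
  return model_fn

estimator = tf.estimator.Estimator(
  model_fn=model_fn_builder(init_checkpoint='pretrained/bert_model.ckpt'),  # hypothetical path
  model_dir='models/sentiment')                                             # hypothetical path
# estimator.train(input_fn=train_input_fn, max_steps=cg.train_steps)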