Example 1
    def __init__(self,
                 config,
                 is_training,
                 input_ids,
                 input_mask=None,
                 token_type_ids=None,
                 use_one_hot_embeddings=False,
                 scope=None):
        """Constructor for BertEncoder.
    
    Args:
      token_type_ids: maybe for positional embedding for encoder.
    
    TODO
    """
        # Create a copy of the config to avoid modifying the original configuration.
        config = copy.deepcopy(config)
        if not is_training:
            config.hidden_dropout_prob = 0.0
            config.attention_probs_dropout_prob = 0.0

        input_shape = ft.get_shape_list(input_ids, expected_rank=2)
        batch_size = input_shape[0]
        seq_length = input_shape[1]

        if input_mask is None:
            input_mask = tf.ones(shape=[batch_size, seq_length],
                                 dtype=tf.int32)

        if token_type_ids is None:
            token_type_ids = tf.zeros(shape=[batch_size, seq_length],
                                      dtype=tf.int32)

        self.build_graph(config, input_ids, input_mask, token_type_ids,
                         use_one_hot_embeddings)
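
A usage sketch, mirroring how the encoder is constructed in the model_fn examples further down (cg.BertEncoderConfig and the accessor methods are taken from those examples):

model = BertEncoder(
    config=cg.BertEncoderConfig,
    is_training=True,
    input_ids=input_ids,      # int32 [batch_size, seq_length]
    input_mask=input_mask)    # optional; defaults to all ones
embedding_table = model.get_embedding_table()
sequence_output = model.get_sequence_output()   # [batch_size, seq_length, hidden_size]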
Example 2
def vae(state, num_units, scope='vae'):
    """VAE implementation, Hard Coding.
    
    The formula to calculate the vae loss:
      vae_loss = (-0.5 * tf.reduce_sum(1.0 + vae_vb - tf.square(vae_mean) - tf.exp(vae_vb)) / batch_size) * 0.001  
    """
    shape = ft.get_shape_list(state, expected_rank=2)

    with tf.variable_scope(scope):
        vae_mean = tf.layers.dense(state,
                                   num_units,
                                   activation=tf.nn.tanh,
                                   name='vae_mean',
                                   kernel_initializer=ft.create_initializer())

        vae_vb = tf.layers.dense(state,
                                 num_units,
                                 activation=tf.nn.tanh,
                                 name='vae_vb',
                                 kernel_initializer=ft.create_initializer())

        eps = tf.random_normal([shape[0], num_units],
                               0.0,
                               1.0,
                               dtype=tf.float32)

        z = vae_mean + tf.sqrt(tf.exp(vae_vb)) * eps

    return z, vae_mean, vae_vb
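
The loss from the docstring, written out for the tensors returned above; batch_size is assumed to be available (e.g. shape[0] from ft.get_shape_list):

# KL term as given in the docstring, scaled by 0.001.
vae_loss = (-0.5 * tf.reduce_sum(
    1.0 + vae_vb - tf.square(vae_mean) - tf.exp(vae_vb)) / batch_size) * 0.001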
Example 3
    def __init__(self,
                 config,
                 is_training,
                 encoder_state,
                 embedding_table,
                 decoder_intput_data=None,
                 seq_length_decoder_input_data=None,
                 scope=None):
        """Constructor for the Decoder; works on a deep copy of `config`."""
        config = copy.deepcopy(config)
        self.is_training = is_training
        self.embedding_table = embedding_table

        input_shape = ft.get_shape_list(encoder_state, expected_rank=2)
        self.batch_size = input_shape[0]

        self.tgt_vocab_size = config.tgt_vocab_size
        self.unit_type = config.unit_type
        self.num_units = config.num_units
        self.forget_bias = config.forget_bias

        if not is_training:
            self.dropout = 0.0
        else:
            self.dropout = config.dropout

        initializer_range = config.initializer_range

        self.tgt_sos_id = tf.constant(config.sos_id, dtype=tf.int32)
        self.tgt_eos_id = tf.constant(config.eos_id, dtype=tf.int32)
        self.max_len_infer = config.max_len_infer

        self.build_graph(encoder_state, initializer_range,
                         seq_length_decoder_input_data, decoder_intput_data,
                         scope)
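
Usage sketch, mirroring the call in the model_fn example below (get_decoder_output is taken from that example):

decoder_model = Decoder(
    config=cg.DecoderConfig,
    is_training=is_training,
    encoder_state=encoder_output,              # [batch_size, hidden_size]
    embedding_table=embedding_table,
    decoder_intput_data=input_y,               # decoder input ids (training only)
    seq_length_decoder_input_data=seq_length)  # decoder input lengths (training only)
logits, sample_id, ppl_seq, ppl = decoder_model.get_decoder_output()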
Example 4
def embedding_postprocessor(input_tensor,
                            use_positional_embeddings=True,
                            positional_embedding_name='positional_embeddings',
                            initializer_range=0.02,
                            max_positional_embeddings=512,
                            dropout_prob=0.1):
    """Perform positional embeddings on a word embedding tensor.

  Args:
    input_tensor: float Tensor of shape [batch_size, seq_length, embedding_size].
    use_position_embeddings: bool. Whether to add position embeddings for the
      position of each token in the sequence.
    position_embedding_name: string. The name of the embedding table variable
      for positional embeddings.
    initializer_range: float. Range of the weight initialization.
    max_position_embeddings: int. Maximum sequence length that might ever be
      used with this model. This can be longer than the sequence length of
      input_tensor, but cannot be shorter.
    dropout_prob: float. Dropout probability applied to the final output tensor.
    
  Returns:
    float tensor with same sahpe as 'input_tensor'.
  """
    input_shape = ft.get_shape_list(input_tensor, expected_rank=3)
    seq_length = input_shape[1]
    width = input_shape[2]

    # Ensure `output` is defined even when positional embeddings are disabled.
    output = input_tensor

    if use_positional_embeddings:
        assert_op = tf.assert_less_equal(seq_length, max_positional_embeddings)
        with tf.control_dependencies([assert_op]):
            full_positional_embeddings = tf.get_variable(
                name=positional_embedding_name,
                shape=[max_positional_embeddings, width],
                initializer=ft.create_initializer(
                    initializer_range=initializer_range))

        positional_embeddings = tf.slice(
            full_positional_embeddings, [0, 0],
            [seq_length, -1])  # [seq_length, width]
        positional_embeddings = tf.expand_dims(positional_embeddings, 0)  # [1, seq_length, width]

        output = input_tensor + positional_embeddings

    output = ft.layer_norm_and_dropout(output, dropout_prob)
    return output
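
A minimal usage sketch; word_embeddings stands for any [batch_size, seq_length, embedding_size] tensor (for example the output of embedding_lookup shown later):

embedding_output = embedding_postprocessor(
    word_embeddings,
    use_positional_embeddings=True,
    max_positional_embeddings=512,   # must be >= seq_length
    dropout_prob=0.1)
# embedding_output has the same shape as word_embeddings.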
Example 5
def gather_indexs(sequence_output, sentiment_mask_indices):
  """Gathers the hidden vectors at `sentiment_mask_indices` from `sequence_output`."""
  shape = ft.get_shape_list(sequence_output, expected_rank=3)
  batch_size = shape[0]
  seq_length = shape[1]
  width = shape[2]

  # [b, 1]
  flat_offsets = tf.reshape(
      tf.range(0, batch_size, dtype=tf.int32) * seq_length, [-1, 1])
  # [b * x]
  flat_positions = tf.reshape(flat_offsets + sentiment_mask_indices, [-1])
  # [b * s, w]
  flat_sequence_output = tf.reshape(sequence_output, [batch_size * seq_length, width])
  # [b * x, w]
  output_tensor = tf.gather(flat_sequence_output, flat_positions)

  return output_tensor
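
A small worked example of the index arithmetic above, with illustrative values (batch_size=2, seq_length=4, two masked positions per example):

# sentiment_mask_indices = [[1, 3],
#                           [0, 2]]
# flat_offsets   = [[0], [4]]     # row i of the batch starts at i * seq_length
# flat_positions = [1, 3, 4, 6]   # indices into the flattened [8, width] tensor
# output_tensor  -> rows 1 and 3 of example 0 and rows 0 and 2 of example 1, shape [4, width]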
Example 6
def calculate_mse_loss(model_output, true_label, true_sequence):
  """This is used for calculating the mse loss.
  
  Args:
    model_output: (batch_size * seq_length, mask_padding_size).
    true_label: (batch, seq_length, mask_padding_size).
    true_sequence: (batch * seq_length * mask_padding_size).

  Returns:
    mse_loss: tf.float32.
  """
  batch_size = tf.cast(ft.get_shape_list(model_output, expected_rank=2)[0], dtype=tf.float32)
  # flatten the tensor
  model_output_flatten = tf.reshape(model_output, [-1])
  true_label_flatten = tf.reshape(true_label, [-1])
  # Number of unmasked positions; the loss below zeroes out masked positions but
  # is normalized by batch_size, so `length` is currently unused.
  length = tf.reduce_sum(true_sequence)

  mse_loss = tf.reduce_sum(
    tf.pow((model_output_flatten - true_label_flatten), 2) * true_sequence) / batch_size

  return mse_loss
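
A usage sketch with the shapes from the docstring (the tensor names are placeholders):

# model_output:  [batch_size * seq_length, mask_padding_size] float predictions
# true_label:    [batch_size, seq_length, mask_padding_size]  float targets
# true_sequence: flat 0/1 float mask, one entry per prediction element
mse_loss = calculate_mse_loss(model_output, true_label, true_sequence)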
Example 7
def embedding_lookup(input_ids,
                     vocab_size,
                     embedding_size,
                     initializer_range,
                     word_embedding_name='word_embeddings',
                     use_one_hot_embeddings=False):
    """Looks up words embeddings for id tensor.

  Args:
    input_ids: int32 Tensor of shape [batch_size, seq_length] containing word ids.
    vocab_size: int. Size of the embedding vocabulary.
    embedding_size: int. Width of the word embeddings.
    initializer_range: float. Embedding initialation range.
    word_embedding_name: string. Name of the embedding table.
    use_one_hot_embeddings: bool. If True. use one-hot method for word embedding.
      If False, use 'tf.gather()'.
  
  Returns:
    float Tensor of shape [batch_size, seq_length, embedding_size].
  """

    embedding_table = tf.get_variable(
        name=word_embedding_name,
        shape=[vocab_size, embedding_size],
        initializer=ft.create_initializer(initializer_range=initializer_range))

    if use_one_hot_embeddings:
        input_shape = ft.get_shape_list(input_ids, expected_rank=2)
        input_ids_squeeze = tf.reshape(input_ids, [-1])
        one_hot_input_ids = tf.one_hot(input_ids_squeeze, depth=vocab_size)
        output = tf.matmul(one_hot_input_ids, embedding_table)
        output = tf.reshape(output, [input_shape[0], input_shape[1], -1])
    else:
        output = tf.nn.embedding_lookup(embedding_table, input_ids)

    return output, embedding_table
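
Usage sketch; the vocab_size and embedding_size values are illustrative:

word_embeddings, embedding_table = embedding_lookup(
    input_ids=input_ids,           # int32 [batch_size, seq_length]
    vocab_size=30000,              # illustrative
    embedding_size=768,            # illustrative
    initializer_range=0.02,
    use_one_hot_embeddings=False)  # True multiplies a one-hot matrix by the table
                                   # instead of using tf.nn.embedding_lookup
# word_embeddings: [batch_size, seq_length, embedding_size]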
Example 8
def transformer_model(input_tensor,
                      attention_mask,
                      hidden_size,
                      num_hidden_layers,
                      num_attention_heads,
                      intermediate_size,
                      intermediate_act_fn,
                      hidden_dropout_prob,
                      attention_dropout_prob,
                      initializer_range,
                      do_return_all_layers=False):
    """Multi-headed, multi-layer Transformer from 'Attention is All you need'.
  
  This is almost an exact implementation of the original Transformer encoder.

  Args:
  input_tensor: float Tensor of shape [batch_size, seq_length, hidden_size].
  attention_mask: (optional) int32 Tensor of shape [batch_size, seq_length,
    seq_length], with 1 for positions that can be attended to and 0 in
    positions that should not be.
  hidden_size: int. Hidden size of the Transformer.
  num_hidden_layers: int. Number of layers (blocks) in the Transformer.
  num_attention_heads: int. Number of attention heads in the Transformer.
  intermediate_size: int. The size of the "intermediate" (a.k.a., feed
    forward) layer.
  intermediate_act_fn: function. The non-linear activation function to apply
    to the output of the intermediate/feed-forward layer.
  hidden_dropout_prob: float. Dropout probability for the hidden layers.
  attention_probs_dropout_prob: float. Dropout probability of the attention
    probabilities.
  initializer_range: float. Range of the initializer (stddev of truncated
    normal).
  do_return_all_layers: Whether to also return all layers or just the final
    layer.

  Returns:
    float Tensor of shape [batch_size, seq_length, hidden_size], the final
    hidden layer of the Transformer.
  """
    if hidden_size % num_attention_heads != 0:
        raise ValueError(
            'The hidden size ({}) is not a multiple of the number of attention '
            'heads ({}).'.format(hidden_size, num_attention_heads))

    attention_head_size = int(hidden_size / num_attention_heads)
    input_shape = ft.get_shape_list(input_tensor, expected_rank=3)
    batch_size = input_shape[0]
    seq_length = input_shape[1]
    input_width = input_shape[2]

    if input_width != hidden_size:
        raise ValueError(
            'The width of the input tensor ({}) != hidden size ({}).'.format(
                input_width, hidden_size))

    prev_output = input_tensor
    all_layers_outputs = []
    for layer_idx in range(num_hidden_layers):
        with tf.variable_scope('layer_{}'.format(layer_idx)):
            layer_input = prev_output

            with tf.variable_scope('attention'):
                with tf.variable_scope('self'):
                    # [b, s, n * a]
                    attention_head = attention_layer(
                        input_tensor=layer_input,
                        attention_mask=attention_mask,
                        num_attention_heads=num_attention_heads,
                        size_per_head=attention_head_size,
                        query_act=None,
                        key_act=None,
                        value_act=None,
                        attention_dropout_prob=attention_dropout_prob,
                        initializer_range=initializer_range,
                        batch_size=batch_size,
                        seq_length=seq_length)

                with tf.variable_scope('output'):
                    # [b, s, h]
                    attention_output = tf.layers.dense(
                        attention_head,
                        hidden_size,
                        kernel_initializer=ft.create_initializer(
                            initializer_range=initializer_range))
                    attention_output = ft.dropout(attention_output,
                                                  hidden_dropout_prob)
                    attention_output = ft.layer_norm(attention_output +
                                                     layer_input)

            with tf.variable_scope('intermediate'):
                # [b, s, i]; the feed-forward layer consumes the attention
                # sublayer output (after projection, residual and layer norm).
                intermediate_output = tf.layers.dense(
                    attention_output,
                    intermediate_size,
                    activation=intermediate_act_fn,
                    kernel_initializer=ft.create_initializer(
                        initializer_range=initializer_range))

            with tf.variable_scope('output'):
                # [b, s, h]
                layer_output = tf.layers.dense(
                    intermediate_output,
                    hidden_size,
                    kernel_initializer=ft.create_initializer(
                        initializer_range=initializer_range))
                layer_output = ft.dropout(layer_output, hidden_dropout_prob)
                layer_output = ft.layer_norm(layer_output + attention_output)
                prev_output = layer_output
                all_layers_outputs.append(prev_output)

    if do_return_all_layers:
        return all_layers_outputs
    else:
        return all_layers_outputs[-1]
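
A minimal invocation sketch with illustrative hyperparameters; attention_mask is assumed to be a [batch_size, seq_length, seq_length] tensor built elsewhere:

all_layers = transformer_model(
    input_tensor=embedding_output,     # [batch_size, seq_length, hidden_size]
    attention_mask=attention_mask,     # [batch_size, seq_length, seq_length]
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    intermediate_act_fn=tf.nn.relu,
    hidden_dropout_prob=0.1,
    attention_dropout_prob=0.1,
    initializer_range=0.02,
    do_return_all_layers=True)
final_layer = all_layers[-1]           # [batch_size, seq_length, hidden_size]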
Example 9
  def model_fn(features, labels, mode, params):
    # Log the feature names and shapes.
    _info('*** Features ***')
    for name in sorted(features.keys()):
      tf.logging.info(' name = {}, shape = {}'.format(name, features[name].shape))

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # get data
    input_x = features['input_x']
    input_mask = features['input_mask']
    if is_training:
      input_y = features['input_y']
      seq_length = features['seq_length']
    else:
      input_y = None
      seq_length = None

    # build encoder
    model = BertEncoder(
      config=cg.BertEncoderConfig,
      is_training=is_training,
      input_ids=input_x,
      input_mask=input_mask)
    embedding_table = model.get_embedding_table()
    encoder_output = tf.reduce_sum(model.get_sequence_output(), axis=1)

    # build decoder
    decoder_model = Decoder(
      config=cg.DecoderConfig,
      is_training=is_training,
      encoder_state=encoder_output,
      embedding_table=embedding_table,
      decoder_intput_data=input_y,
      seq_length_decoder_input_data=seq_length)
    logits, sample_id, ppl_seq, ppl = decoder_model.get_decoder_output()

    if mode == tf.estimator.ModeKeys.PREDICT:
      predictions = {'sample_id': sample_id, 'ppls': ppl_seq}
      output_spec = tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
      if mode == tf.estimator.ModeKeys.TRAIN:
        max_time = ft.get_shape_list(labels, expected_rank=2)[1]
        target_weights = tf.sequence_mask(seq_length, max_time, dtype=logits.dtype)
        batch_size = tf.cast(ft.get_shape_list(labels, expected_rank=2)[0], tf.float32)

        loss = tf.reduce_sum(
          tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels, logits=logits) * target_weights) / batch_size

        learning_rate = tf.train.polynomial_decay(cg.learning_rate,
                                          tf.train.get_or_create_global_step(),
                                          cg.train_steps / 100,
                                          end_learning_rate=1e-4,
                                          power=1.0,
                                          cycle=False)

        lr = tf.maximum(tf.constant(cg.lr_limit), learning_rate)
        optimizer = tf.train.AdamOptimizer(lr, name='optimizer')
        tvars = tf.trainable_variables()
        gradients = tf.gradients(loss, tvars, colocate_gradients_with_ops=cg.colocate_gradients_with_ops)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        train_op = optimizer.apply_gradients(zip(clipped_gradients, tvars), global_step=tf.train.get_global_step())


        # The LoggingTensorHook prints these tensors every `print_info_interval`
        # steps (each step processes one batch), whereas the EstimatorSpec itself
        # only reports at checkpoint-save time.
        logging_hook = tf.train.LoggingTensorHook({'loss' : loss, 'ppl': ppl, 'lr': lr}, every_n_iter=cg.print_info_interval)

        output_spec = tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op, training_hooks=[logging_hook])
      elif mode == tf.estimator.ModeKeys.EVAL:
        # TODO
        raise NotImplementedError
    
    return output_spec
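
For reference, a minimal sketch of how a model_fn like this is wired into an Estimator; model_dir and train_input_fn here are assumptions, not part of these examples:

estimator = tf.estimator.Estimator(
    model_fn=model_fn,
    model_dir='path/to/model_dir')   # hypothetical directory

# train_input_fn is a hypothetical input_fn yielding the feature dict
# ('input_x', 'input_mask', 'input_y', 'seq_length') and the labels tensor.
estimator.train(input_fn=train_input_fn, max_steps=cg.train_steps)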
Example 10
  def model_fn(features, labels, mode, params):
    # Log the feature names and shapes.
    for name in sorted(features.keys()):
      tf.logging.info(' name = {}, shape = {}'.format(name, features[name].shape))

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    # get data
    input_data = features['input_data']
    input_mask = features['input_mask']
    if mode == tf.estimator.ModeKeys.TRAIN:
      sentiment_labels = features['sentiment_labels']
      sentiment_mask_indices = features['sentiment_mask_indices']
      true_length_from_data = features['true_length']

    # build model
    model = BertEncoder(
      config=cg.BertEncoderConfig,
      is_training=is_training,
      input_ids=input_data,
      input_mask=input_mask)
    
    tvars = tf.trainable_variables()
    initialized_variable_names = {}
    if init_checkpoint:
      (assignment_map, initialized_variable_names
      ) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
      tf.train.init_from_checkpoint(init_checkpoint, assignment_map)

    tf.logging.info("**** Trainable Variables ****")
    for var in tvars:
      init_string = ""
      if var.name in initialized_variable_names:
        init_string = ", *INIT_FROM_CKPT*"
      tf.logging.info("  name = %s, shape = %s%s", var.name, var.shape,
                      init_string)

    # [cls] output -> [b, h]
    cls_output = model.get_cls_output()
    # sequence_output -> [b, s, h]; drop the [CLS] position because the mask indices are not shifted
    sequence_output = model.get_sequence_output()[:, 1:, :]

    # project the hidden size to the num_classes
    with tf.variable_scope('final_output'):
      # [b, num_classes]
      output_logits = tf.layers.dense(
        cls_output,
        cg.BertEncoderConfig.num_classes,
        name='final_output',
        kernel_initializer=ft.create_initializer(initializer_range=cg.BertEncoderConfig.initializer_range))

    if mode == tf.estimator.ModeKeys.PREDICT:
      output_softmax = tf.nn.softmax(output_logits, axis=-1)
      output_result = tf.argmax(output_softmax, axis=-1)
      predictions = {'predict': output_result}
      output_spec = tf.estimator.EstimatorSpec(mode, predictions=predictions)
    else:
      if mode == tf.estimator.ModeKeys.TRAIN:
        # masked_output -> [b * x, h]
        masked_output = gather_indexs(sequence_output, sentiment_mask_indices)
        
        # get output for word polarity prediction
        with tf.variable_scope('sentiment_project'):
          # [b * x, 2]
          output_sentiment = tf.layers.dense(
            masked_output,
            2,
            name='final_output',
            kernel_initializer=ft.create_initializer(initializer_range=cg.BertEncoderConfig.initializer_range))
        # output_sentiment_probs = tf.nn.softmax(output_sentiment, axis=-1)

        batch_size = tf.cast(ft.get_shape_list(labels, expected_rank=1)[0], dtype=tf.float32)
        # cross-entropy loss
        cls_loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=labels,
          logits=output_logits)) / batch_size

        # Token-level sentiment loss.
        # `true_sequence` is a flat 0/1 mask over the valid masked positions.
        true_sequence = get_true_sequence(true_length_from_data)

        # Regression variant (kept for reference):
        # mse_loss = calculate_mse_loss(
        #   output_sentiment, sentiment_labels, true_sequence)

        # Classification variant: masked cross-entropy over the valid positions
        # (the variable keeps the `mse_loss` name).
        true_label_flatten = tf.reshape(sentiment_labels, [-1])
        mse_loss = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(
          labels=true_label_flatten,
          logits=output_sentiment) * true_sequence) / tf.reduce_sum(true_sequence)

        loss = cls_loss + mse_loss
        # loss = cls_loss

        learning_rate = tf.train.polynomial_decay(cg.learning_rate,
                                  tf.train.get_or_create_global_step(),
                                  cg.train_steps,
                                  end_learning_rate=cg.lr_limit,
                                  power=1.0,
                                  cycle=False)

        lr = tf.maximum(tf.constant(cg.lr_limit), learning_rate)
        optimizer = tf.train.AdamOptimizer(lr, name='optimizer')
        tvars = tf.trainable_variables()
        gradients = tf.gradients(loss, tvars, colocate_gradients_with_ops=cg.colocate_gradients_with_ops)
        clipped_gradients, _ = tf.clip_by_global_norm(gradients, 5.0)
        train_op = optimizer.apply_gradients(zip(clipped_gradients, tvars), global_step=tf.train.get_global_step())

        current_steps = tf.train.get_or_create_global_step()
        logging_hook = tf.train.LoggingTensorHook(
          {'step' : current_steps, 'loss' : loss, 'cls_loss' : cls_loss, 'mse_loss': mse_loss, 'lr' : lr}, 
          every_n_iter=cg.print_info_interval)

        output_spec = tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op, training_hooks=[logging_hook])
      elif mode == tf.estimator.ModeKeys.EVAL:
        # TODO
        raise NotImplementedError
    
    return output_spec
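
get_true_sequence is not defined in these examples. A plausible sketch, consistent with how true_sequence is consumed above (a flat 0/1 float mask over the valid positions); this is an assumption, not the original implementation:

def get_true_sequence(true_length, max_length=None):
  """Hypothetical helper: flat 0/1 float mask built from per-example lengths."""
  # true_length: int32 [batch_size], number of valid positions per example.
  # max_length:  padded length per example (e.g. the mask padding size);
  #              if None, tf.sequence_mask uses the batch maximum.
  mask = tf.sequence_mask(true_length, maxlen=max_length, dtype=tf.float32)
  return tf.reshape(mask, [-1])  # [batch_size * max_length]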