Example #1
    def __init__(self, params):
        """Constructor for BertModel.

    Args:
      params: `BigBirdConfig` dictionary.
    """
        self.params = copy.deepcopy(params)
        self.scope = params["scope"]

        with tf.compat.v1.variable_scope(self.scope,
                                         reuse=tf.compat.v1.AUTO_REUSE) as vs:
            self.embeder = utils.EmbeddingLayer(
                vocab_size=self.params["vocab_size"],
                emb_dim=self.params["hidden_size"],
                initializer=utils.create_initializer(
                    self.params["initializer_range"]),
                scale_emb=self.params["rescale_embedding"],
                use_token_type=True,
                num_token_types=self.params["type_vocab_size"],
                use_position_embeddings=True,
                max_position_embeddings=self.params["max_position_embeddings"],
                dropout_prob=self.params["hidden_dropout_prob"])
            self.encoder = encoder.EncoderStack(self.params)
            self.pooler = tf.compat.v1.layers.Dense(
                units=self.params["hidden_size"],
                activation=tf.tanh,
                kernel_initializer=utils.create_initializer(
                    self.params["initializer_range"]),
                name="pooler/dense")
            super(BertModel, self).__init__(name=self.scope, _scope=vs)
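
For orientation, a hedged usage sketch follows: the config keys mirror those read by the constructor above, the module path is assumed from the public BigBird repository, and the call signature follows Example #6; treat it as an illustration rather than the canonical API.

# Hypothetical usage sketch; config keys inferred from the constructor above.
from bigbird.core import modeling  # module path assumed from the public BigBird repo

bert_config = {
    "scope": "bert",
    "vocab_size": 50358,
    "hidden_size": 768,
    "initializer_range": 0.02,
    "rescale_embedding": False,
    "type_vocab_size": 2,
    "max_position_embeddings": 4096,
    "hidden_dropout_prob": 0.1,
    # ... plus the EncoderStack keys (attention_type, num_hidden_layers, etc.)
}
model = modeling.BertModel(bert_config)
# As in Example #6, the model is then called on int32 token ids:
# sequence_output, pooled_output = model(input_ids, training=True,
#                                        token_type_ids=segment_ids)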
Example #2
  def __init__(self,
               attention_type,
               hidden_size=768,
               intermediate_size=3072,
               intermediate_act_fn=utils.gelu,
               attention_probs_dropout_prob=0.0,
               hidden_dropout_prob=0.1,
               initializer_range=0.02,
               num_attention_heads=12,
               num_rand_blocks=3,
               block_size=64,
               use_bias=True,
               seed=None,
               name=None):
    """Constructor of an encoder layer of a transformer in Pegasus style.

    Args:
      attention_type: Type of attention, needs to be one of ['original_full',
        'simulated_sparse', 'block_sparse'].
      hidden_size: (optional) int. Size of hidden dimension.
      intermediate_size: (optional) int. Size of intermediate dimension.
      intermediate_act_fn: (optional) Activation function for intermediate layer.
      attention_probs_dropout_prob: (optional) float. Dropout probability of the
        attention probabilities.
      hidden_dropout_prob: (optional) float. Dropout probability of the
        attention.
      initializer_range: (optional) float. Range of the weight initializer.
      num_attention_heads: (optional) int. Number of attention heads.
      num_rand_blocks: (optional) int. Number of random chunks per row.
      block_size: (optional) int. Size of block in sequence.
      use_bias: (optional) bool. Whether key/query/value uses a bias vector.
      seed: (optional) int. Random seed for generating random mask.
      name: The name scope of this layer.
    """
    super(PrenormEncoderLayer, self).__init__(name=name)
    self.hidden_dropout_prob = hidden_dropout_prob

    # Attention layer
    attention_head_size = hidden_size // num_attention_heads
    self.attn_layer = attention.MultiHeadedAttentionLayer(
        attention_type, num_attention_heads, num_rand_blocks,
        attention_head_size, initializer_range, block_size, block_size,
        attention_probs_dropout_prob, use_bias, seed, name="self")

    # Dense layers
    self.projection_layer = utils.Dense3dProjLayer(
        num_attention_heads, attention_head_size,
        utils.create_initializer(initializer_range), None, "dense", use_bias)
    self.expand_layer = utils.Dense2dLayer(
        intermediate_size, utils.create_initializer(initializer_range),
        intermediate_act_fn, "dense")
    self.contract_layer = utils.Dense2dLayer(
        hidden_size, utils.create_initializer(initializer_range),
        None, "dense")

    # Normalization layer
    self.first_layer_norm = utils.NormLayer()
    self.second_layer_norm = utils.NormLayer()
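
With the defaults above, the per-head size works out to hidden_size // num_attention_heads = 768 // 12 = 64, and that value is passed to the attention layer along with block_size for both the from- and to-blocks. A minimal instantiation sketch, using only the names defined in this example:

# Sketch: default geometry of PrenormEncoderLayer (values from the signature above).
hidden_size = 768
num_attention_heads = 12
attention_head_size = hidden_size // num_attention_heads  # 64

layer = PrenormEncoderLayer(
    attention_type="block_sparse",   # or 'original_full' / 'simulated_sparse'
    hidden_size=hidden_size,
    num_attention_heads=num_attention_heads,
    num_rand_blocks=3,
    block_size=64,
    name="encoder_layer_0")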
Example #3
    def __init__(self, params):
        """Constructor for BertModel.

    Args:
      params: `BigBirdConfig` dictionary.
    """
        self.params = copy.deepcopy(params)
        self.scope = params["scope"]
        super(BertModel, self).__init__(name=self.scope)

        # validate params
        self.pad = lambda x: x
        if params["max_encoder_length"] <= 512:
            logging.info("Switching to full attention for short sequences")
            self.params["attention_type"] = "original_full"
        if self.params["attention_type"] == "simulated_sparse" or self.params[
                "attention_type"] == "block_sparse":
            if params["max_encoder_length"] % params["block_size"]:
                logging.info(
                    "Expand max_encoder_length to next multiple of block_size")
                self.params["max_encoder_length"] = (
                    params["max_encoder_length"] // params["block_size"] +
                    1) * params["block_size"]
                pad_size = self.params["max_encoder_length"] - params[
                    "max_encoder_length"]
                paddings = [[0, 0], [0, pad_size]]
                self.pad = lambda x: tf.pad(x, paddings)

        with tf.compat.v1.variable_scope(self.scope,
                                         reuse=tf.compat.v1.AUTO_REUSE):
            self.embeder = utils.EmbeddingLayer(
                vocab_size=self.params["vocab_size"],
                emb_dim=self.params["hidden_size"],
                initializer=utils.create_initializer(
                    self.params["initializer_range"]),
                scale_emb=self.params["rescale_embedding"],
                use_token_type=True,
                num_token_types=self.params["type_vocab_size"],
                use_position_embeddings=True,
                max_position_embeddings=self.params["max_position_embeddings"],
                dropout_prob=self.params["hidden_dropout_prob"])
            self.encoder = encoder.EncoderStack(self.params)
            self.pooler = utils.SimpleDenseLayer(
                input_size=self.params["hidden_size"],
                output_size=self.params["hidden_size"],
                initializer=utils.create_initializer(
                    self.params["initializer_range"]),
                activation=tf.tanh,
                name="pooler/dense")
Example #4
  def __init__(self, params, input_ids, target_ids=None, training=None):
    """Constructor for TransformerModel.

    Args:
      params: `BigBirdConfig` dictionary.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      target_ids: (optional) int32 Tensor of target token ids for the decoder.
      training: Boolean indicating whether the call is training or inference.
    """
    self.params = copy.deepcopy(params)
    self.scope = params["scope"]

    with tf.compat.v1.variable_scope(
        self.scope, reuse=tf.compat.v1.AUTO_REUSE) as vs:
      self.embeder = utils.EmbeddingLayer(
          vocab_size=self.params["vocab_size"],
          emb_dim=self.params["hidden_size"],
          initializer=utils.create_initializer(
              self.params["initializer_range"]),
          scale_emb=self.params["rescale_embedding"],
          use_token_type=False,
          num_token_types=None,
          use_position_embeddings=True,
          max_position_embeddings=self.params["max_position_embeddings"],
          dropout_prob=self.params["hidden_dropout_prob"])

      # encoder
      # Run the inputs through the encoder layer to map the symbol
      # representations to continuous representations.
      self.encoder = encoder.EncoderStack(self.params)
      self.encoder_output, encoder_mask = self._encode(input_ids, training)

      # decoder
      self.decoder = decoder.DecoderStack(self.params)
      self.predictions = self._decode_and_predict(
          target_ids, self.encoder_output, encoder_mask, training)

      super(TransformerModel, self).__init__(name=self.scope, _scope=vs)
Example #5
    def __init__(self, params):
        """Constructor for TransformerModel.

    Args:
      params: `BigBirdConfig` dictionary.
    """
        self.params = copy.deepcopy(params)
        self.scope = params["scope"]

        with tf.compat.v1.variable_scope(self.scope,
                                         reuse=tf.compat.v1.AUTO_REUSE) as vs:
            self.embeder = utils.EmbeddingLayer(
                vocab_size=self.params["vocab_size"],
                emb_dim=self.params["hidden_size"],
                initializer=utils.create_initializer(
                    self.params["initializer_range"]),
                scale_emb=self.params["rescale_embedding"],
                use_token_type=False,
                num_token_types=None,
                use_position_embeddings=True,
                max_position_embeddings=self.params["max_position_embeddings"],
                dropout_prob=self.params["hidden_dropout_prob"])
            self.encoder = encoder.EncoderStack(self.params)
            self.decoder = decoder.DecoderStack(self.params)
            super(TransformerModel, self).__init__(name=self.scope, _scope=vs)
Example #6
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        model = modeling.BertModel(bert_config)
        masked_lm = MaskedLMLayer(bert_config["hidden_size"],
                                  bert_config["vocab_size"],
                                  model.embeder,
                                  initializer=utils.create_initializer(
                                      bert_config["initializer_range"]),
                                  activation_fn=utils.get_activation(
                                      bert_config["hidden_act"]))
        next_sentence = NSPLayer(bert_config["hidden_size"],
                                 initializer=utils.create_initializer(
                                     bert_config["initializer_range"]))

        sequence_output, pooled_output = model(
            features["input_ids"],
            training=is_training,
            token_type_ids=features.get("segment_ids"))

        masked_lm_loss, masked_lm_log_probs = masked_lm(
            sequence_output,
            label_ids=features.get("masked_lm_ids"),
            label_weights=features.get("masked_lm_weights"),
            masked_lm_positions=features.get("masked_lm_positions"))

        next_sentence_loss, next_sentence_log_probs = next_sentence(
            pooled_output, features.get("next_sentence_labels"))

        total_loss = masked_lm_loss
        if bert_config["use_nsp"]:
            total_loss += next_sentence_loss

        tvars = tf.compat.v1.trainable_variables()
        utils.log_variables(tvars, bert_config["ckpt_var_list"])

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:

            learning_rate = optimization.get_linear_warmup_linear_decay_lr(
                init_lr=bert_config["learning_rate"],
                num_train_steps=bert_config["num_train_steps"],
                num_warmup_steps=bert_config["num_warmup_steps"])

            optimizer = optimization.get_optimizer(bert_config, learning_rate)

            global_step = tf.compat.v1.train.get_global_step()

            gradients = optimizer.compute_gradients(total_loss, tvars)
            train_op = optimizer.apply_gradients(gradients,
                                                 global_step=global_step)

            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                host_call=utils.add_scalars_to_summary(
                    bert_config["output_dir"],
                    {"learning_rate": learning_rate}))

        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(masked_lm_loss_value, masked_lm_log_probs,
                          masked_lm_ids, masked_lm_weights,
                          next_sentence_loss_value, next_sentence_log_probs,
                          next_sentence_labels):
                """Computes the loss and accuracy of the model."""
                masked_lm_predictions = tf.argmax(masked_lm_log_probs,
                                                  axis=-1,
                                                  output_type=tf.int32)
                masked_lm_accuracy = tf.compat.v1.metrics.accuracy(
                    labels=masked_lm_ids,
                    predictions=masked_lm_predictions,
                    weights=masked_lm_weights)
                masked_lm_mean_loss = tf.compat.v1.metrics.mean(
                    values=masked_lm_loss_value)

                next_sentence_predictions = tf.argmax(next_sentence_log_probs,
                                                      axis=-1,
                                                      output_type=tf.int32)
                next_sentence_accuracy = tf.compat.v1.metrics.accuracy(
                    labels=next_sentence_labels,
                    predictions=next_sentence_predictions)
                next_sentence_mean_loss = tf.compat.v1.metrics.mean(
                    values=next_sentence_loss_value)

                return {
                    "masked_lm_accuracy": masked_lm_accuracy,
                    "masked_lm_loss": masked_lm_mean_loss,
                    "next_sentence_accuracy": next_sentence_accuracy,
                    "next_sentence_loss": next_sentence_mean_loss,
                }

            eval_metrics = (metric_fn, [
                masked_lm_loss, masked_lm_log_probs, features["masked_lm_ids"],
                features["masked_lm_weights"], next_sentence_loss,
                next_sentence_log_probs, features["next_sentence_labels"]
            ])
            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                mode=mode, loss=total_loss, eval_metrics=eval_metrics)
        else:

            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                predictions={
                    "log-probabilities": masked_lm_log_probs,
                    "seq-embeddings": sequence_output
                })

        return output_spec
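
This model_fn is typically returned from a builder and handed to a TPUEstimator; a minimal wiring sketch follows, assuming bert_config also carries batch sizes and that model_fn is the closure defined above (the run configuration shown here is an assumption, not taken from the original file).

# Hedged sketch: feeding the model_fn above to a TPUEstimator.
import tensorflow as tf

run_config = tf.compat.v1.estimator.tpu.RunConfig(
    model_dir=bert_config["output_dir"],
    tpu_config=tf.compat.v1.estimator.tpu.TPUConfig(iterations_per_loop=1000))

estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
    model_fn=model_fn,
    config=run_config,
    use_tpu=False,  # set True and add a cluster resolver to run_config for TPUs
    train_batch_size=bert_config.get("train_batch_size", 32),
    eval_batch_size=bert_config.get("eval_batch_size", 8))

# estimator.train(input_fn=train_input_fn,
#                 max_steps=bert_config["num_train_steps"])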
Example #7
  def __init__(self,
               hidden_size=768,
               intermediate_size=3072,
               intermediate_act_fn=utils.gelu,
               attention_probs_dropout_prob=0.0,
               hidden_dropout_prob=0.1,
               initializer_range=0.02,
               num_attention_heads=12,
               use_bias=True,
               name=None):
    """Constructor of a decoder layer of a transformer in Pegasus style.

    Args:
      hidden_size: (optional) int. Size of hidden dimension.
      intermediate_size: (optional) int. Size of intermediate dimension.
      intermediate_act_fn: (optional) Activation function for intermediate layer.
      attention_probs_dropout_prob: (optional) float. Dropout probability of the
        attention probabilities.
      hidden_dropout_prob: (optional) float. Dropout probability of the
        attention.
      initializer_range: (optional) float. Range of the weight initializer.
      num_attention_heads: (optional) int. Number of attention heads.
      use_bias: (optional) bool. Whether key/query/value uses a bias vector.
      name: The name scope of this layer.
    """
    super(PrenormDecoderLayer, self).__init__(name=name)
    self.hidden_dropout_prob = hidden_dropout_prob

    # Attention layers
    attention_head_size = hidden_size // num_attention_heads
    self.self_attn_layer = attention.MultiHeadedAttentionLayer(
        "original_full", use_bias=use_bias, name="self",
        num_attention_heads=num_attention_heads,
        size_per_head=attention_head_size,
        initializer_range=initializer_range,
        attention_probs_dropout_prob=attention_probs_dropout_prob)
    self.cross_attn_layer = attention.MultiHeadedAttentionLayer(
        "original_full", use_bias=use_bias, name="encdec",
        num_attention_heads=num_attention_heads,
        size_per_head=attention_head_size,
        initializer_range=initializer_range,
        attention_probs_dropout_prob=attention_probs_dropout_prob)

    # Dense layers
    self.self_proj_layer = utils.Dense3dProjLayer(
        num_attention_heads, attention_head_size,
        utils.create_initializer(initializer_range), None, "dense", use_bias)
    self.cross_proj_layer = utils.Dense3dProjLayer(
        num_attention_heads, attention_head_size,
        utils.create_initializer(initializer_range), None, "dense", use_bias)
    self.expand_layer = utils.Dense2dLayer(
        intermediate_size, utils.create_initializer(initializer_range),
        intermediate_act_fn, "dense")
    self.contract_layer = utils.Dense2dLayer(
        hidden_size, utils.create_initializer(initializer_range),
        None, "dense")

    # Normalization layer
    self.first_layer_norm = utils.NormLayer()
    self.second_layer_norm = utils.NormLayer()
    self.third_layer_norm = utils.NormLayer()
Example #8
  def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
    """The `model_fn` for TPUEstimator."""

    if isinstance(features, dict):
      if not labels and "labels" in features:
        labels = features["labels"]
      features = features["input_ids"]

    is_training = (mode == tf.estimator.ModeKeys.TRAIN)

    model = modeling.BertModel(bert_config)
    headl = ClassifierLossLayer(
        bert_config["num_labels"], bert_config["hidden_dropout_prob"],
        utils.create_initializer(bert_config["initializer_range"]),
        name=bert_config["scope"]+"/classifier")

    _, pooled_output = model(features, training=is_training)
    total_loss, log_probs = headl(pooled_output, labels, is_training)

    tvars = tf.compat.v1.trainable_variables()
    utils.log_variables(tvars, bert_config["ckpt_var_list"])

    output_spec = None
    if mode == tf.estimator.ModeKeys.TRAIN:

      learning_rate = optimization.get_linear_warmup_linear_decay_lr(
          init_lr=bert_config["learning_rate"],
          num_train_steps=bert_config["num_train_steps"],
          num_warmup_steps=bert_config["num_warmup_steps"])

      optimizer = optimization.get_optimizer(bert_config, learning_rate)

      global_step = tf.compat.v1.train.get_or_create_global_step()

      gradients = optimizer.compute_gradients(total_loss, tvars)
      train_op = optimizer.apply_gradients(gradients, global_step=global_step)

      output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          train_op=train_op,
          host_call=utils.add_scalars_to_summary(
              bert_config["output_dir"], {"learning_rate": learning_rate}))

    elif mode == tf.estimator.ModeKeys.EVAL:

      def metric_fn(loss_value, label_ids, log_probs):
        loss = tf.compat.v1.metrics.mean(values=loss_value)

        predictions = tf.argmax(log_probs, axis=-1, output_type=tf.int32)
        accuracy = tf.compat.v1.metrics.accuracy(
            labels=label_ids, predictions=predictions)
        p1, p1_op = tf.compat.v1.metrics.precision_at_k(
            labels=tf.cast(label_ids, tf.int64), predictions=log_probs, k=1)
        r1, r1_op = tf.compat.v1.metrics.recall_at_k(
            labels=tf.cast(label_ids, tf.int64), predictions=log_probs, k=1)
        f11 = tf.math.divide_no_nan(2*p1*r1, p1+r1)

        metric_dict = {
            "P@1": (p1, p1_op),
            "R@1": (r1, r1_op),
            "f1@1": (f11, tf.no_op()),
            "classification_accuracy": accuracy,
            "classification_loss": loss,
        }

        return metric_dict

      eval_metrics = (metric_fn,
                      [tf.expand_dims(total_loss, 0), labels, log_probs])
      output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
          mode=mode,
          loss=total_loss,
          eval_metrics=eval_metrics)
    else:
      output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
          mode=mode,
          predictions={"log-probabilities": log_probs})

    return output_spec
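
The eval branch above derives F1@1 as the harmonic mean of the streaming precision and recall values, with divide_no_nan guarding the p + r == 0 case; the computation in isolation:

# Sketch: the harmonic-mean F1 used in metric_fn above.
import tensorflow as tf

p1 = tf.constant(0.8)
r1 = tf.constant(0.6)
f11 = tf.math.divide_no_nan(2 * p1 * r1, p1 + r1)  # ~0.686; 0.0 when p1 + r1 == 0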
Example #9
  def __init__(self,
               attention_type,
               hidden_size=768,
               intermediate_size=3072,
               intermediate_act_fn=utils.gelu,
               attention_probs_dropout_prob=0.0,
               hidden_dropout_prob=0.1,
               initializer_range=0.02,
               num_attention_heads=12,
               num_rand_blocks=3,
               block_size=64,
               use_bias=True,
               seed=None,
               name=None):
    """Constructor of an encoder layer of a transformer in BERT style.

    Args:
      attention_type: Type of attention, needs to be one of ['original_full',
        'simulated_sparse', 'block_sparse'].
      hidden_size: (optional) int. Size of hidden dimension.
      intermediate_size: (optional) int. Size of intermediate dimension.
      intermediate_act_fn: (optional) Activation function for intermediate layer.
      attention_probs_dropout_prob: (optional) float. Dropout probability of the
        attention probabilities.
      hidden_dropout_prob: (optional) float. Dropout probability of the
        attention.
      initializer_range: (optional) float. Range of the weight initializer.
      num_attention_heads: (optional) int. Number of attention heads.
      num_rand_blocks: (optional) int. Number of random chunks per row.
      block_size: (optional) int. Size of block in sequence.
      use_bias: (optional) bool. Whether key/query/value uses a bias vector.
      seed: (optional) int. Random seed for generating random mask.
      name: The name scope of this layer.
    """
    super(PostnormEncoderLayer, self).__init__(name=name)
    self.hidden_dropout_prob = hidden_dropout_prob

    # Attention layer
    attention_head_size = hidden_size // num_attention_heads  # per-head size for the 12-way multi-head attention
    self.attn_layer = attention.MultiHeadedAttentionLayer(
        attention_type, num_attention_heads, num_rand_blocks,  # e.g. block_sparse, 12, 3
        attention_head_size, initializer_range, block_size, block_size,  # e.g. 64, 0.01, 16, 16
        attention_probs_dropout_prob, use_bias, seed, name="self")  # e.g. 0.01, True; seed grows 0..11 with the encoder layer index

    # Dense layers: refine the attention output into features by 1) projecting -> 2) expanding -> 3) contracting
    # 1) layer that projects the attention output
    self.projection_layer = utils.Dense3dProjLayer(
        num_attention_heads, attention_head_size, # 12, 64
        utils.create_initializer(initializer_range), None, "dense", use_bias)
    # 2) define the expansion layer
    self.expand_layer = utils.Dense2dLayer(
        intermediate_size, utils.create_initializer(initializer_range),
        intermediate_act_fn, "dense")
    # 3) define the contraction layer
    self.contract_layer = utils.Dense2dLayer(  # final layer that extracts the output features
        hidden_size, utils.create_initializer(initializer_range),
        None, "dense")

    # Normalization layer
    self.first_layer_norm = utils.NormLayer()
    self.second_layer_norm = utils.NormLayer()
Example #10
    def __init__(self,
                 attention_type,
                 num_attention_heads=1,
                 num_rand_blocks=3,
                 size_per_head=512,
                 initializer_range=0.02,
                 from_block_size=64,
                 to_block_size=64,
                 attention_probs_dropout_prob=0.0,
                 use_bias=True,
                 seed=None,
                 query_act=None,
                 key_act=None,
                 value_act=None,
                 name=None,
                 **kwargs):
        """Constructor for a multi-headed attention layer.

    Args:
      attention_type: Type of attention, needs to be one of ['original_full',
        'simulated_sparse', 'block_sparse'].
      num_attention_heads: (optional) int. Number of attention heads.
      num_rand_blocks: (optional) int. Number of random chunks per row.
      size_per_head: (optional) int. Size of each attention head.
      initializer_range: (optional) float. Range of the weight initializer.
      from_block_size: (optional) int. Size of block in from sequence.
      to_block_size: (optional) int. Size of block in to sequence.
      attention_probs_dropout_prob: (optional) float. Dropout probability of the
        attention probabilities.
      use_bias: Whether the layer uses a bias vector.
      seed: (optional) int. Random seed for generating random mask.
      query_act: (optional) Activation function for the query transform.
      key_act: (optional) Activation function for the key transform.
      value_act: (optional) Activation function for the value transform.
      name: The name scope of this layer.
      **kwargs: others
    """
        super(MultiHeadedAttentionLayer, self).__init__(name=name, **kwargs)
        self.query_layer = utils.Dense3dLayer(
            num_attention_heads,
            size_per_head,
            utils.create_initializer(initializer_range),
            query_act,
            "query",
            head_first=True,
            use_bias=use_bias)

        self.key_layer = utils.Dense3dLayer(
            num_attention_heads,
            size_per_head,
            utils.create_initializer(initializer_range),
            key_act,
            "key",
            head_first=True,
            use_bias=use_bias)

        self.value_layer = utils.Dense3dLayer(
            num_attention_heads,
            size_per_head,
            utils.create_initializer(initializer_range),
            value_act,
            "value",
            head_first=True,
            use_bias=use_bias)

        def attn_impl(query, key, value, attention_mask, band_mask, from_mask,
                      to_mask, from_blocked_mask, to_blocked_mask, batch_size,
                      from_seq_length, to_seq_length, training):
            if attention_type == "original_full":
                logging.info("**** Using original full attention ****")
                attn_fn = original_full_attention(
                    query, key, value, attention_mask, size_per_head,
                    attention_probs_dropout_prob if training else 0.0)
            elif attention_type == "simulated_sparse":
                logging.info("**** Using simulated sparse attention ****")
                attn_fn = bigbird_simulated_attention(
                    query, key, value, attention_mask, num_attention_heads,
                    num_rand_blocks, size_per_head, from_seq_length,
                    to_seq_length, from_block_size, to_block_size, seed)
            elif attention_type == "block_sparse":
                logging.info("**** Using block sparse attention ****")
                attn_fn = bigbird_block_sparse_attention(
                    query, key, value, band_mask, from_mask, to_mask,
                    from_blocked_mask, to_blocked_mask, num_attention_heads,
                    num_rand_blocks, size_per_head, batch_size,
                    from_seq_length, to_seq_length, from_block_size,
                    to_block_size, seed)
            else:
                raise NotImplementedError(
                    "Attention type {} is not implemented".format(
                        attention_type))
            return attn_fn

        self.attn_impl = attn_impl
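
The constructor closes over attention_type and the block/seed hyperparameters, so the stored attn_impl dispatches to one of the three attention functions at call time. A hedged construction sketch, mirroring the positional call used inside Example #2's encoder layer:

# Hedged sketch: building the layer as Example #2 does, but with keyword arguments.
attn_layer = MultiHeadedAttentionLayer(
    "block_sparse",                  # attention_type
    num_attention_heads=12,
    num_rand_blocks=3,
    size_per_head=64,
    initializer_range=0.02,
    from_block_size=64,
    to_block_size=64,
    attention_probs_dropout_prob=0.0,
    use_bias=True,
    seed=0,
    name="self")
# attn_layer.attn_impl(...) then routes to bigbird_block_sparse_attention
# with the hyperparameters captured above.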
Example #11
  def __init__(self, params,
               input_ids,
               token_type_ids=None,
               training=None):

    """Constructor for BertModel.

    Args:
      params: `BigBirdConfig` dictionary.
      input_ids: int32 Tensor of shape [batch_size, seq_length].
      token_type_ids: (optional) int32 Tensor of shape [batch_size, seq_length].
      training: Boolean indicating whether the call is training or inference.

    """
    self.params = copy.deepcopy(params)
    self.scope = params["scope"]

    with tf.compat.v1.variable_scope(
        self.scope, reuse=tf.compat.v1.AUTO_REUSE) as vs:

        # For the token-type embeddings: if no token_type_ids were provided, use type 0 everywhere
        if token_type_ids is None:
            token_type_ids = tf.zeros_like(input_ids, dtype=tf.int32)

        # Build the input_mask from input_ids (1 for real tokens, 0 for padding)
        input_mask = tf.where(input_ids > 0,
                              tf.ones_like(input_ids), tf.zeros_like(input_ids))
        # 1) embedding process
        # 1-1 define the embedding layer
        self.embeder = utils.EmbeddingLayer(
          vocab_size=self.params["vocab_size"], # 50358
          emb_dim=self.params["hidden_size"], # 768
          initializer=utils.create_initializer(
              self.params["initializer_range"]), #초기화 0.02 truncated_normal_initializer 사
          scale_emb=self.params["rescale_embedding"], # false
          use_token_type=True,
          num_token_types=self.params["type_vocab_size"], # 2
          use_position_embeddings=True,  # apply position embeddings
          max_position_embeddings=self.params["max_position_embeddings"],  # 4096
          dropout_prob=self.params["hidden_dropout_prob"])  # 10% dropout

        # 1-2 apply the embedding layer: token + token_type + position embeddings
        embedding_output = self.embeder.operation(input_ids,
                                        self.params["max_encoder_length"],
                                        token_type_ids=token_type_ids,
                                        training=training)


        # 2) encoder
        # 2-1 define the encoder stack
        self.encoder = encoder.EncoderStack(self.params)

        # 2-2 run the encoder computation (sparse attention)
        self.sequence_output = self.encoder.operation(embedding_output, input_mask, training)

        # 3) define the pooling layer

        # The "pooler" converts the encoded sequence tensor of shape
        # [batch_size, seq_length, hidden_size] to a tensor of shape
        # [batch_size, hidden_size]. This is necessary for segment-level
        # (or segment-pair-level) classification tasks where we need a fixed
        # dimensional representation of the segment.
        first_token_tensor = self.sequence_output[:, 0, :]  # take the encoder output for the [CLS] token, e.g. (4, 768)

        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token. We assume that this has been pre-trained
        # final dense computation over the 768 hidden dims
        self.pooler = tf.compat.v1.layers.Dense(
          units=self.params["hidden_size"],
          activation=tf.tanh,
          kernel_initializer=utils.create_initializer(
              self.params["initializer_range"]),
          name="pooler/dense") # 결과 -> (4, 786)

        self.pooled_output = self.pooler(first_token_tensor)

        super(BertModel, self).__init__(name=self.scope, _scope=vs)
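
The pooling step above reduces the [batch_size, seq_length, hidden_size] encoder output to [batch_size, hidden_size] by taking the first ([CLS]) position and passing it through a tanh dense layer; a minimal shape check follows, where tf.keras.layers.Dense stands in for the pooler purely for illustration.

# Sketch of the pooling shape transformation described in the comments above.
import tensorflow as tf

sequence_output = tf.zeros([4, 4096, 768])      # [batch_size, seq_length, hidden_size]
first_token_tensor = sequence_output[:, 0, :]   # [4, 768]
pooler = tf.keras.layers.Dense(units=768, activation=tf.tanh)  # stand-in for the pooler above
pooled_output = pooler(first_token_tensor)      # [4, 768]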
Example #12
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        # Define the BigBird model
        model = modeling.BertModel(bert_config,
                                   features["input_ids"],
                                   training=is_training,
                                   token_type_ids=features.get("segment_ids"))
        # Get the attention (sequence) features and the pooled feature for the [CLS] token
        sequence_output, pooled_output = model.get_output_feature()

        masked_lm = MaskedLMLayer(  # define the masked-LM output head
            bert_config["hidden_size"],
            bert_config["vocab_size"],
            model.embeder,
            input_tensor=sequence_output,
            label_ids=features.get("masked_lm_ids"),
            label_weights=features.get("masked_lm_weights"),
            masked_lm_positions=features.get("masked_lm_positions"),
            initializer=utils.create_initializer(
                bert_config["initializer_range"]),
            activation_fn=utils.get_activation(bert_config["hidden_act"]))

        masked_lm_loss, masked_lm_log_probs = masked_lm.get_mlm_loss()

        total_loss = masked_lm_loss

        tvars = tf.compat.v1.trainable_variables()
        utils.LogVariable(tvars, bert_config["ckpt_var_list"])

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            # optimization
            opt_model = optimization.LinearWarmupLinearDecay(  # load the learning-rate schedule
                init_lr=bert_config["learning_rate"],
                num_train_steps=bert_config["num_train_steps"],
                num_warmup_steps=bert_config["num_warmup_steps"])
            learning_rate = opt_model.get_learning_rate()  # get the learning rate

            optimizer = optimization.Optimizer(bert_config, learning_rate)
            optimizer = optimizer.get_optimizer()

            global_step = tf.compat.v1.train.get_global_step()

            gradients = optimizer.compute_gradients(total_loss, tvars)
            train_op = optimizer.apply_gradients(gradients,
                                                 global_step=global_step)
            logging_hook = [
                tf.compat.v1.train.LoggingTensorHook(
                    {"loss is -> ": total_loss}, every_n_iter=256),
                tf.compat.v1.train.LoggingTensorHook(
                    {"global step -> ": global_step}, every_n_iter=256),
                tf.compat.v1.train.LoggingTensorHook(
                    {"learning rate -> ": learning_rate}, every_n_iter=256)
            ]

            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                training_hooks=logging_hook,
                host_call=utils.add_scalars_to_summary(
                    bert_config["output_dir"],
                    {"learning_rate": learning_rate}))

        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(masked_lm_loss_value, masked_lm_log_probs,
                          masked_lm_ids, masked_lm_weights):

                masked_lm_predictions = tf.argmax(masked_lm_log_probs,
                                                  axis=-1,
                                                  output_type=tf.int32)
                masked_lm_accuracy = tf.compat.v1.metrics.accuracy(
                    labels=masked_lm_ids,
                    predictions=masked_lm_predictions,
                    weights=masked_lm_weights)
                masked_lm_mean_loss = tf.compat.v1.metrics.mean(
                    values=masked_lm_loss_value)

                return {
                    "masked_lm_accuracy": masked_lm_accuracy,
                    "masked_lm_loss": masked_lm_mean_loss,
                }

            eval_metrics = (metric_fn, [
                masked_lm_loss, masked_lm_log_probs, features["masked_lm_ids"],
                features["masked_lm_weights"]
            ])

            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                mode=mode, loss=total_loss, eval_metrics=eval_metrics)
        else:
            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                predictions={
                    "log-probabilities": masked_lm_log_probs,
                    "seq-embeddings": sequence_output
                })

        return output_spec