Example #1
  def __init__(self, params):
    name = "encoder"
    super(EncoderStack, self).__init__(name=name)
    self.params = params

    if params["norm_type"] == "prenorm": # select the layer-norm type
      encoder_class = PrenormEncoderLayer
    elif params["norm_type"] == "postnorm": # default: use the postnorm encoder
      encoder_class = PostnormEncoderLayer
    else:
      raise NotImplementedError(
          "Norm type {} is not implemented".format(params["norm_type"]))

    # Encoder layers
    self.encoder_layers = [
        encoder_class(  # pylint: disable=g-complex-comprehension
            self.params["attention_type"], # block_sparse attention type 설정
            self.params["hidden_size"], # 768
            self.params["intermediate_size"], # intermediate_size
            utils.get_activation(self.params["hidden_act"]), # gelu activation function
            self.params["attention_probs_dropout_prob"], # 0.1
            self.params["hidden_dropout_prob"], # 0.1
            self.params["initializer_range"], # 0.02
            self.params["num_attention_heads"], # num_attention_heads
            self.params["num_rand_blocks"], # rand block : 3
            self.params["block_size"], # 16
            self.params["use_bias"], # True
            seed=layer_idx,
            name="layer_%d" % layer_idx)
        for layer_idx in range(self.params["num_hidden_layers"]) # 개 encoder 12개를 list에 담음
    ]

    # Normalization layer
    self.layer_norm = utils.NormLayer()
Example #2
    def __init__(self, params):
        name = "encoder"
        super(EncoderStack, self).__init__(name=name)
        self.params = params

        if params["norm_type"] == "prenorm":
            encoder_class = PrenormEncoderLayer
        elif params["norm_type"] == "postnorm":
            encoder_class = PostnormEncoderLayer
        else:
            raise NotImplementedError("Norm type {} is not implemented".format(
                params["norm_type"]))

        # Encoder layers
        self.encoder_layers = [
            encoder_class(  # pylint: disable=g-complex-comprehension
                self.params["attention_type"],
                self.params["hidden_size"],
                self.params["intermediate_size"],
                utils.get_activation(self.params["hidden_act"]),
                self.params["attention_probs_dropout_prob"],
                self.params["hidden_dropout_prob"],
                self.params["initializer_range"],
                self.params["num_attention_heads"],
                self.params["num_rand_blocks"],
                self.params["block_size"],
                self.params["use_bias"],
                seed=layer_idx,
                name="layer_%d" % layer_idx)
            for layer_idx in range(self.params["num_hidden_layers"])
        ]

        # Normalization layer
        self.layer_norm = utils.NormLayer()
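A minimal usage sketch for the stack above. The keys match those read in __init__, and the values mirror the inline comments in Example #1 where given; intermediate_size and num_attention_heads are hypothetical BERT-base defaults, not taken from the examples:

params = {
    "norm_type": "postnorm",
    "attention_type": "block_sparse",
    "hidden_size": 768,
    "intermediate_size": 3072,   # assumption: 4 * hidden_size
    "hidden_act": "gelu",
    "attention_probs_dropout_prob": 0.1,
    "hidden_dropout_prob": 0.1,
    "initializer_range": 0.02,
    "num_attention_heads": 12,   # assumption: BERT-base default
    "num_rand_blocks": 3,
    "block_size": 16,
    "use_bias": True,
    "num_hidden_layers": 12,
}
encoder = EncoderStack(params)  # builds 12 PostnormEncoderLayer instances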
Example #3
    def __init__(self, params):
        if params["couple_encoder_decoder"]:
            name = "encoder"
            super(DecoderStack, self).__init__(name=name)
        else:
            name = "decoder"
            super(DecoderStack, self).__init__(name=name)

        self.params = params

        if params["norm_type"] == "prenorm":
            decoder_class = PrenormDecoderLayer
        elif params["norm_type"] == "postnorm":
            decoder_class = PostnormDecoderLayer
        else:
            raise NotImplementedError("Norm type {} is not implemented".format(
                params["norm_type"]))

        if params["use_gradient_checkpointing"]:
            decoder_class = add_gradient_recomputation(decoder_class)

        if self.params.get("num_decoder_layers", None) is not None:
            num_hidden_layers = self.params["num_decoder_layers"]
        else:
            num_hidden_layers = self.params["num_hidden_layers"]

        with tf.compat.v1.variable_scope(name):
            # Decoder layers
            self.decoder_layers = [
                decoder_class(  # pylint: disable=g-complex-comprehension
                    self.params["hidden_size"],
                    self.params["intermediate_size"],
                    utils.get_activation(self.params["hidden_act"]),
                    self.params["attention_probs_dropout_prob"],
                    self.params["hidden_dropout_prob"],
                    self.params["initializer_range"],
                    self.params["num_attention_heads"],
                    self.params["use_bias"],
                    name="layer_%d" % layer_idx)
                for layer_idx in range(num_hidden_layers)
            ]

            # Normalization layer
            self.layer_norm = utils.NormLayer(self.params["hidden_size"])
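Unlike Example #4 below, this variant can wrap the layer class with add_gradient_recomputation. The helper itself is not shown; a minimal sketch of one way it could work, assuming it recomputes activations in the backward pass via tf.recompute_grad (the real helper may differ):

import tensorflow as tf

def add_gradient_recomputation(layer_class):
    """Hypothetical sketch: trade compute for memory by recomputing the
    layer's activations during the backward pass."""
    class RecomputedLayer(layer_class):
        def call(self, *args, **kwargs):
            def inner(*tensors):
                return super(RecomputedLayer, self).call(*tensors, **kwargs)
            return tf.recompute_grad(inner)(*args)
    return RecomputedLayer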
Example #4
  def __init__(self, params):
    if params["couple_encoder_decoder"]: # encoder 같다
      name = "encoder"
      with tf.compat.v1.variable_scope(
          name, reuse=tf.compat.v1.AUTO_REUSE) as scope:
        super(DecoderStack, self).__init__(name=name, _scope=scope)
    else:
      name = "decoder"
      super(DecoderStack, self).__init__(name=name)

    self.params = params

    if params["norm_type"] == "prenorm": # decoder prenorm 선택
      decoder_class = PrenormDecoderLayer
    elif params["norm_type"] == "postnorm":
      decoder_class = PostnormDecoderLayer
    else:
      raise NotImplementedError(
          "Norm type {} is not implemented".format(params["norm_type"]))

    if self.params.get("num_decoder_layers", None) is not None: # an explicit decoder layer count was set
      num_hidden_layers = self.params["num_decoder_layers"]
    else:
      num_hidden_layers = self.params["num_hidden_layers"]# 하지 않았다면 기존 number layer 사

    # Decoder layers
    self.decoder_layers = [
        decoder_class(  # pylint: disable=g-complex-comprehension
            self.params["hidden_size"],
            self.params["intermediate_size"],
            utils.get_activation(self.params["hidden_act"]),
            self.params["attention_probs_dropout_prob"],
            self.params["hidden_dropout_prob"],
            self.params["initializer_range"],
            self.params["num_attention_heads"],
            self.params["use_bias"],
            name="layer_%d" % layer_idx)
        for layer_idx in range(num_hidden_layers)
    ]

    # Normalization layer
    self.layer_norm = utils.NormLayer()
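A usage sketch for the decoder stack, reusing the hypothetical params dict from the sketch after Example #2; when num_decoder_layers is present it overrides num_hidden_layers:

decoder_params = dict(
    params,                         # hypothetical dict from the earlier sketch
    couple_encoder_decoder=False,   # True would reuse the "encoder" scope
    norm_type="prenorm",            # selects PrenormDecoderLayer
    num_decoder_layers=6,           # overrides num_hidden_layers (12)
)
decoder = DecoderStack(decoder_params)  # builds 6 PrenormDecoderLayer instances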
Example #5
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        model = modeling.BertModel(bert_config)
        masked_lm = MaskedLMLayer(bert_config["hidden_size"],
                                  bert_config["vocab_size"],
                                  model.embeder,
                                  initializer=utils.create_initializer(
                                      bert_config["initializer_range"]),
                                  activation_fn=utils.get_activation(
                                      bert_config["hidden_act"]))
        next_sentence = NSPLayer(bert_config["hidden_size"],
                                 initializer=utils.create_initializer(
                                     bert_config["initializer_range"]))

        sequence_output, pooled_output = model(
            features["input_ids"],
            training=is_training,
            token_type_ids=features.get("segment_ids"))

        masked_lm_loss, masked_lm_log_probs = masked_lm(
            sequence_output,
            label_ids=features.get("masked_lm_ids"),
            label_weights=features.get("masked_lm_weights"),
            masked_lm_positions=features.get("masked_lm_positions"))

        next_sentence_loss, next_sentence_log_probs = next_sentence(
            pooled_output, features.get("next_sentence_labels"))

        total_loss = masked_lm_loss
        if bert_config["use_nsp"]:
            total_loss += next_sentence_loss

        tvars = tf.compat.v1.trainable_variables()
        utils.log_variables(tvars, bert_config["ckpt_var_list"])

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:

            learning_rate = optimization.get_linear_warmup_linear_decay_lr(
                init_lr=bert_config["learning_rate"],
                num_train_steps=bert_config["num_train_steps"],
                num_warmup_steps=bert_config["num_warmup_steps"])

            optimizer = optimization.get_optimizer(bert_config, learning_rate)

            global_step = tf.compat.v1.train.get_global_step()

            gradients = optimizer.compute_gradients(total_loss, tvars)
            train_op = optimizer.apply_gradients(gradients,
                                                 global_step=global_step)

            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                host_call=utils.add_scalars_to_summary(
                    bert_config["output_dir"],
                    {"learning_rate": learning_rate}))

        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(masked_lm_loss_value, masked_lm_log_probs,
                          masked_lm_ids, masked_lm_weights,
                          next_sentence_loss_value, next_sentence_log_probs,
                          next_sentence_labels):
                """Computes the loss and accuracy of the model."""
                masked_lm_predictions = tf.argmax(masked_lm_log_probs,
                                                  axis=-1,
                                                  output_type=tf.int32)
                masked_lm_accuracy = tf.compat.v1.metrics.accuracy(
                    labels=masked_lm_ids,
                    predictions=masked_lm_predictions,
                    weights=masked_lm_weights)
                masked_lm_mean_loss = tf.compat.v1.metrics.mean(
                    values=masked_lm_loss_value)

                next_sentence_predictions = tf.argmax(next_sentence_log_probs,
                                                      axis=-1,
                                                      output_type=tf.int32)
                next_sentence_accuracy = tf.compat.v1.metrics.accuracy(
                    labels=next_sentence_labels,
                    predictions=next_sentence_predictions)
                next_sentence_mean_loss = tf.compat.v1.metrics.mean(
                    values=next_sentence_loss_value)

                return {
                    "masked_lm_accuracy": masked_lm_accuracy,
                    "masked_lm_loss": masked_lm_mean_loss,
                    "next_sentence_accuracy": next_sentence_accuracy,
                    "next_sentence_loss": next_sentence_mean_loss,
                }

            eval_metrics = (metric_fn, [
                masked_lm_loss, masked_lm_log_probs, features["masked_lm_ids"],
                features["masked_lm_weights"], next_sentence_loss,
                next_sentence_log_probs, features["next_sentence_labels"]
            ])
            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                mode=mode, loss=total_loss, eval_metrics=eval_metrics)
        else:

            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                predictions={
                    "log-probabilities": masked_lm_log_probs,
                    "seq-embeddings": sequence_output
                })

        return output_spec
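A sketch of how a model_fn like this is typically handed to TPUEstimator; the run-config values and train_input_fn are placeholders, not taken from the example:

run_config = tf.compat.v1.estimator.tpu.RunConfig(
    model_dir=bert_config["output_dir"],
    tpu_config=tf.compat.v1.estimator.tpu.TPUConfig(iterations_per_loop=1000))
estimator = tf.compat.v1.estimator.tpu.TPUEstimator(
    model_fn=model_fn,
    config=run_config,
    use_tpu=False,           # False runs the same graph on CPU/GPU
    train_batch_size=32,
    eval_batch_size=32)
estimator.train(input_fn=train_input_fn,  # hypothetical input pipeline
                max_steps=bert_config["num_train_steps"])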
Example #6
    def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        # Define the BigBird model
        model = modeling.BertModel(bert_config,
                                   features["input_ids"],
                                   training=is_training,
                                   token_type_ids=features.get("segment_ids"))
        # Get the sequence (attention) output and the pooled CLS-token feature
        sequence_output, pooled_output = model.get_output_feature()

        masked_lm = MaskedLMLayer(  # layer that computes the masked-LM output
            bert_config["hidden_size"],
            bert_config["vocab_size"],
            model.embeder,
            input_tensor=sequence_output,
            label_ids=features.get("masked_lm_ids"),
            label_weights=features.get("masked_lm_weights"),
            masked_lm_positions=features.get("masked_lm_positions"),
            initializer=utils.create_initializer(
                bert_config["initializer_range"]),
            activation_fn=utils.get_activation(bert_config["hidden_act"]))

        masked_lm_loss, masked_lm_log_probs = masked_lm.get_mlm_loss()

        total_loss = masked_lm_loss

        tvars = tf.compat.v1.trainable_variables()
        utils.LogVariable(tvars, bert_config["ckpt_var_list"])

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            # Set up the optimization schedule
            opt_model = optimization.LinearWarmupLinearDecay(  # build the LR schedule
                init_lr=bert_config["learning_rate"],
                num_train_steps=bert_config["num_train_steps"],
                num_warmup_steps=bert_config["num_warmup_steps"])
            learning_rate = opt_model.get_learning_rate()  # get the learning rate

            optimizer = optimization.Optimizer(bert_config, learning_rate)
            optimizer = optimizer.get_optimizer()

            global_step = tf.compat.v1.train.get_global_step()

            gradients = optimizer.compute_gradients(total_loss, tvars)
            train_op = optimizer.apply_gradients(gradients,
                                                 global_step=global_step)
            logging_hook = [
                tf.compat.v1.train.LoggingTensorHook(
                    {"loss is -> ": total_loss}, every_n_iter=256),
                tf.compat.v1.train.LoggingTensorHook(
                    {"global step -> ": global_step}, every_n_iter=256),
                tf.compat.v1.train.LoggingTensorHook(
                    {"learning rate -> ": learning_rate}, every_n_iter=256)
            ]

            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                loss=total_loss,
                train_op=train_op,
                training_hooks=logging_hook,
                host_call=utils.add_scalars_to_summary(
                    bert_config["output_dir"],
                    {"learning_rate": learning_rate}))

        elif mode == tf.estimator.ModeKeys.EVAL:

            def metric_fn(masked_lm_loss_value, masked_lm_log_probs,
                          masked_lm_ids, masked_lm_weights):

                masked_lm_predictions = tf.argmax(masked_lm_log_probs,
                                                  axis=-1,
                                                  output_type=tf.int32)
                masked_lm_accuracy = tf.compat.v1.metrics.accuracy(
                    labels=masked_lm_ids,
                    predictions=masked_lm_predictions,
                    weights=masked_lm_weights)
                masked_lm_mean_loss = tf.compat.v1.metrics.mean(
                    values=masked_lm_loss_value)

                return {
                    "masked_lm_accuracy": masked_lm_accuracy,
                    "masked_lm_loss": masked_lm_mean_loss,
                }

            eval_metrics = (metric_fn, [
                masked_lm_loss, masked_lm_log_probs, features["masked_lm_ids"],
                features["masked_lm_weights"]
            ])

            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                mode=mode, loss=total_loss, eval_metrics=eval_metrics)
        else:
            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                mode=mode,
                predictions={
                    "log-probabilities": masked_lm_log_probs,
                    "seq-embeddings": sequence_output
                })

        return output_spec