Code example #1
def create_mobilebert_pretrainer(bert_config):
    """Creates a BertPretrainerV2 that wraps MobileBERTEncoder model."""
    mobilebert_encoder = networks.MobileBERTEncoder(
        word_vocab_size=bert_config.vocab_size,
        word_embed_size=bert_config.embedding_size,
        type_vocab_size=bert_config.type_vocab_size,
        max_sequence_length=bert_config.max_position_embeddings,
        num_blocks=bert_config.num_hidden_layers,
        hidden_size=bert_config.hidden_size,
        num_attention_heads=bert_config.num_attention_heads,
        intermediate_size=bert_config.intermediate_size,
        intermediate_act_fn=tf_utils.get_activation(bert_config.hidden_act),
        hidden_dropout_prob=bert_config.hidden_dropout_prob,
        attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
        intra_bottleneck_size=bert_config.intra_bottleneck_size,
        initializer_range=bert_config.initializer_range,
        use_bottleneck_attention=bert_config.use_bottleneck_attention,
        key_query_shared_bottleneck=bert_config.key_query_shared_bottleneck,
        num_feedforward_networks=bert_config.num_feedforward_networks,
        normalization_type=bert_config.normalization_type,
        classifier_activation=bert_config.classifier_activation)

    masked_lm = layers.MobileBertMaskedLM(
        embedding_table=mobilebert_encoder.get_embedding_table(),
        activation=tf_utils.get_activation(bert_config.hidden_act),
        initializer=tf.keras.initializers.TruncatedNormal(
            stddev=bert_config.initializer_range),
        name="cls/predictions")

    pretrainer = models.BertPretrainerV2(encoder_network=mobilebert_encoder,
                                         customized_masked_lm=masked_lm)
    # Makes sure the pretrainer variables are created.
    _ = pretrainer(pretrainer.inputs)
    return pretrainer
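A minimal usage sketch for the helper above, assuming `bert_config` is a MobileBERT-style configuration object that exposes the attributes read inside `create_mobilebert_pretrainer`; the `checkpoint_items` pattern is the one used in examples #2 and #7 below.

import tensorflow as tf

# Hypothetical usage sketch; `bert_config` must be supplied by the surrounding
# project (e.g. parsed from a JSON config file).
pretrainer = create_mobilebert_pretrainer(bert_config)

# `checkpoint_items` exposes the encoder and the masked-LM head as separately
# restorable sub-objects, as in the other examples on this page.
checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
checkpoint.save("/tmp/mobilebert_pretrainer/ckpt")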
Code example #2
    def test_copy_pooler_dense_to_encoder(self):
        encoder_config = encoders.EncoderConfig(
            type="bert",
            bert=encoders.BertEncoderConfig(hidden_size=24,
                                            intermediate_size=48,
                                            num_layers=2))
        cls_heads = [
            layers.ClassificationHead(inner_dim=24,
                                      num_classes=2,
                                      name="next_sentence")
        ]
        encoder = encoders.build_encoder(encoder_config)
        pretrainer = models.BertPretrainerV2(
            encoder_network=encoder,
            classification_heads=cls_heads,
            mlm_activation=tf_utils.get_activation(
                encoder_config.get().hidden_activation))
        # Makes sure the pretrainer variables are created.
        _ = pretrainer(pretrainer.inputs)
        checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
        model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
        checkpoint.save(os.path.join(model_checkpoint_dir, "test"))

        vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
            self.get_temp_dir(), use_sp_model=True)
        export_path = os.path.join(self.get_temp_dir(), "hub")
        export_tfhub_lib.export_model(
            export_path=export_path,
            encoder_config=encoder_config,
            model_checkpoint_path=tf.train.latest_checkpoint(
                model_checkpoint_dir),
            with_mlm=True,
            copy_pooler_dense_to_encoder=True,
            vocab_file=vocab_file,
            sp_model_file=sp_model_file,
            do_lower_case=True)
        # Restores a hub KerasLayer.
        hub_layer = hub.KerasLayer(export_path, trainable=True)
        dummy_ids = np.zeros((2, 10), dtype=np.int32)
        input_dict = dict(input_word_ids=dummy_ids,
                          input_mask=dummy_ids,
                          input_type_ids=dummy_ids)
        hub_pooled_output = hub_layer(input_dict)["pooled_output"]
        encoder_outputs = encoder(input_dict)
        # Verify that hub_layer's pooled_output is the same as the output of the
        # next-sentence-prediction head's dense layer.
        pretrained_pooled_output = cls_heads[0].dense(
            (encoder_outputs["sequence_output"][:, 0, :]))
        self.assertAllClose(hub_pooled_output, pretrained_pooled_output)
        # But the encoder's pooled_output and hub_layer's pooled_output differ.
        encoder_pooled_output = encoder_outputs["pooled_output"]
        self.assertNotAllClose(hub_pooled_output, encoder_pooled_output)
Code example #3
  def build_model(self, params=None):
    config = params or self.task_config.model
    encoder_cfg = config.encoder
    encoder_network = self._build_encoder(encoder_cfg)
    cls_heads = [
        layers.ClassificationHead(**cfg.as_dict()) for cfg in config.cls_heads
    ] if config.cls_heads else []
    return models.BertPretrainerV2(
        mlm_activation=tf_utils.get_activation(config.mlm_activation),
        mlm_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=config.mlm_initializer_range),
        encoder_network=encoder_network,
        classification_heads=cls_heads)
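For reference, a hedged sketch of what a `config.cls_heads` entry expands into via `**cfg.as_dict()`; the field names follow the `bert.ClsHeadConfig` usage in example #7, and the surrounding experiment config is assumed.

# Sketch only: each ClsHeadConfig's fields map onto the constructor arguments
# of layers.ClassificationHead.
cls_heads_cfg = [
    bert.ClsHeadConfig(
        inner_dim=256, num_classes=2, dropout_rate=0.1, name='next_sentence')
]
cls_heads = [
    layers.ClassificationHead(**cfg.as_dict()) for cfg in cls_heads_cfg
]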
Code example #4
def _create_bert_pretrainer_model(cfg):
    """Creates a BERT keras core model from BERT configuration.

  Args:
    cfg: A `BertConfig` to create the core model.

  Returns:
    A BertPretrainerV2 model.
  """
    bert_encoder = _create_bert_model(cfg)
    pretrainer = models.BertPretrainerV2(
        encoder_network=bert_encoder,
        mlm_activation=tf_utils.get_activation(cfg.hidden_act),
        mlm_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=cfg.initializer_range))
    return pretrainer
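A hedged usage sketch for the converter helper above; `configs.BertConfig.from_json_file` is the legacy config loader that usually accompanies it, and the JSON path is a placeholder.

# Sketch only; the config path is a placeholder.
cfg = configs.BertConfig.from_json_file("bert_config.json")
pretrainer = _create_bert_pretrainer_model(cfg)
# Run a forward pass so all variables are created before saving or restoring,
# mirroring examples #1 and #5.
_ = pretrainer(pretrainer.inputs)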
Code example #5
def _create_pretrainer_model(cfg):
    """Creates a pretrainer with AlbertEncoder from ALBERT configuration.

  Args:
    cfg: A `BertConfig` to create the core model.

  Returns:
    A BertPretrainerV2 model.
  """
    albert_encoder = _create_albert_model(cfg)
    pretrainer = models.BertPretrainerV2(
        encoder_network=albert_encoder,
        mlm_activation=tf_utils.get_activation(cfg.hidden_act),
        mlm_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=cfg.initializer_range))
    # Makes sure masked_lm layer's variables in pretrainer are created.
    _ = pretrainer(pretrainer.inputs)
    return pretrainer
Code example #6
File: distillation.py  Project: ykate1998/models
  def _build_pretrainer(self, pretrainer_cfg: bert.PretrainerConfig, name: str):
    """Builds pretrainer from config and encoder."""
    encoder = encoders.build_encoder(pretrainer_cfg.encoder)
    if pretrainer_cfg.cls_heads:
      cls_heads = [
          layers.ClassificationHead(**cfg.as_dict())
          for cfg in pretrainer_cfg.cls_heads
      ]
    else:
      cls_heads = []

    masked_lm = layers.MobileBertMaskedLM(
        embedding_table=encoder.get_embedding_table(),
        activation=tf_utils.get_activation(pretrainer_cfg.mlm_activation),
        initializer=tf.keras.initializers.TruncatedNormal(
            stddev=pretrainer_cfg.mlm_initializer_range),
        name='cls/predictions')

    pretrainer = models.BertPretrainerV2(
        encoder_network=encoder,
        classification_heads=cls_heads,
        customized_masked_lm=masked_lm,
        name=name)
    return pretrainer
Code example #7
    def prepare_config(self, teacher_block_num, student_block_num,
                       transfer_teacher_layers):
        # Use a small model for testing.
        task_config = distillation.BertDistillationTaskConfig(
            teacher_model=bert.PretrainerConfig(
                encoder=encoders.EncoderConfig(
                    type='mobilebert',
                    mobilebert=encoders.MobileBertEncoderConfig(
                        num_blocks=teacher_block_num)),
                cls_heads=[
                    bert.ClsHeadConfig(
                        inner_dim=256,
                        num_classes=2,
                        dropout_rate=0.1,
                        name='next_sentence')
                ],
                mlm_activation='gelu'),
            student_model=bert.PretrainerConfig(
                encoder=encoders.EncoderConfig(
                    type='mobilebert',
                    mobilebert=encoders.MobileBertEncoderConfig(
                        num_blocks=student_block_num)),
                cls_heads=[
                    bert.ClsHeadConfig(
                        inner_dim=256,
                        num_classes=2,
                        dropout_rate=0.1,
                        name='next_sentence')
                ],
                mlm_activation='relu'),
            train_data=pretrain_dataloader.BertPretrainDataConfig(
                input_path='dummy',
                max_predictions_per_seq=76,
                seq_length=512,
                global_batch_size=10),
            validation_data=pretrain_dataloader.BertPretrainDataConfig(
                input_path='dummy',
                max_predictions_per_seq=76,
                seq_length=512,
                global_batch_size=10))

        # Set only one step for each stage.
        progressive_config = distillation.BertDistillationProgressiveConfig()
        progressive_config.layer_wise_distill_config.transfer_teacher_layers = (
            transfer_teacher_layers)
        progressive_config.layer_wise_distill_config.num_steps = 1
        progressive_config.pretrain_distill_config.num_steps = 1

        optimization_config = optimization.OptimizationConfig(
            optimizer=optimization.OptimizerConfig(
                type='lamb',
                lamb=optimization.LAMBConfig(weight_decay_rate=0.0001,
                                             exclude_from_weight_decay=[
                                                 'LayerNorm', 'layer_norm',
                                                 'bias', 'no_norm'
                                             ])),
            learning_rate=optimization.LrConfig(
                type='polynomial',
                polynomial=optimization.PolynomialLrConfig(
                    initial_learning_rate=1.5e-3,
                    decay_steps=10000,
                    end_learning_rate=1.5e-3)),
            warmup=optimization.WarmupConfig(
                type='linear',
                linear=optimization.LinearWarmupConfig(
                    warmup_learning_rate=0)))

        exp_config = cfg.ExperimentConfig(
            task=task_config,
            trainer=prog_trainer_lib.ProgressiveTrainerConfig(
                progressive=progressive_config,
                optimizer_config=optimization_config))

        # Create a teacher model checkpoint.
        teacher_encoder = encoders.build_encoder(
            task_config.teacher_model.encoder)
        pretrainer_config = task_config.teacher_model
        if pretrainer_config.cls_heads:
            teacher_cls_heads = [
                layers.ClassificationHead(**cfg.as_dict())
                for cfg in pretrainer_config.cls_heads
            ]
        else:
            teacher_cls_heads = []

        masked_lm = layers.MobileBertMaskedLM(
            embedding_table=teacher_encoder.get_embedding_table(),
            activation=tf_utils.get_activation(
                pretrainer_config.mlm_activation),
            initializer=tf.keras.initializers.TruncatedNormal(
                stddev=pretrainer_config.mlm_initializer_range),
            name='cls/predictions')
        teacher_pretrainer = models.BertPretrainerV2(
            encoder_network=teacher_encoder,
            classification_heads=teacher_cls_heads,
            customized_masked_lm=masked_lm)

        # The model variables will be created after the forward call.
        _ = teacher_pretrainer(teacher_pretrainer.inputs)
        teacher_pretrainer_ckpt = tf.train.Checkpoint(
            **teacher_pretrainer.checkpoint_items)
        teacher_ckpt_path = os.path.join(self.get_temp_dir(),
                                         'teacher_model.ckpt')
        teacher_pretrainer_ckpt.save(teacher_ckpt_path)
        exp_config.task.teacher_model_init_checkpoint = self.get_temp_dir()

        return exp_config
Code example #8
File: export_tfhub_lib.py  Project: nibeh/models
def _create_model(
    *,
    bert_config: Optional[configs.BertConfig] = None,
    encoder_config: Optional[encoders.EncoderConfig] = None,
    with_mlm: bool,
) -> Tuple[tf.keras.Model, tf.keras.Model]:
    """Creates the model to export and the model to restore the checkpoint.

  Args:
    bert_config: A legacy `BertConfig` to create a `BertEncoder` object.
      Exactly one of encoder_config and bert_config must be set.
    encoder_config: An `EncoderConfig` to create an encoder of the configured
      type (`BertEncoder` or other).
    with_mlm: A bool to control the second component of the result.
      If True, will create a `BertPretrainerV2` object; otherwise, will
      create a `BertEncoder` object.

  Returns:
    A Tuple of (1) a Keras model that will be exported, (2) a `BertPretrainerV2`
    object or `BertEncoder` object depending on the value of `with_mlm`
    argument, which contains the first model and will be used for restoring
    weights from the checkpoint.
  """
    if (bert_config is not None) == (encoder_config is not None):
        raise ValueError("Exactly one of `bert_config` and `encoder_config` "
                         "can be specified, but got %s and %s" %
                         (bert_config, encoder_config))

    if bert_config is not None:
        encoder = get_bert_encoder(bert_config)
    else:
        encoder = encoders.build_encoder(encoder_config)

    # Convert from list of named inputs to dict of inputs keyed by name.
    # Only the latter accepts a dict of inputs after restoring from SavedModel.
    encoder_inputs_dict = {x.name: x for x in encoder.inputs}
    encoder_output_dict = encoder(encoder_inputs_dict)
    # For interchangeability with other text representations,
    # add "default" as an alias for BERT's whole-input reptesentations.
    encoder_output_dict["default"] = encoder_output_dict["pooled_output"]
    core_model = tf.keras.Model(inputs=encoder_inputs_dict,
                                outputs=encoder_output_dict)

    if with_mlm:
        if bert_config is not None:
            hidden_act = bert_config.hidden_act
        else:
            assert encoder_config is not None
            hidden_act = encoder_config.get().hidden_activation

        pretrainer = models.BertPretrainerV2(
            encoder_network=encoder,
            mlm_activation=tf_utils.get_activation(hidden_act))

        pretrainer_inputs_dict = {x.name: x for x in pretrainer.inputs}
        pretrainer_output_dict = pretrainer(pretrainer_inputs_dict)
        mlm_model = tf.keras.Model(inputs=pretrainer_inputs_dict,
                                   outputs=pretrainer_output_dict)
        # Set `_auto_track_sub_layers` to False, so that the additional weights
        # from `mlm` sub-object will not be included in the core model.
        # TODO(b/169210253): Use a public API when available.
        core_model._auto_track_sub_layers = False  # pylint: disable=protected-access
        core_model.mlm = mlm_model
        return core_model, pretrainer
    else:
        return core_model, encoder
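A hedged calling sketch for `_create_model`, using a toy-sized legacy `BertConfig` (the constructor arguments shown are the standard ones) and the dummy-input pattern from example #2.

import numpy as np

# Sketch only: toy sizes for illustration; a real export uses a full config.
bert_config = configs.BertConfig(
    vocab_size=100, hidden_size=16, num_hidden_layers=2,
    num_attention_heads=2, intermediate_size=32)
core_model, pretrainer = _create_model(bert_config=bert_config, with_mlm=True)

dummy_ids = np.zeros((2, 10), dtype=np.int32)
outputs = core_model(dict(input_word_ids=dummy_ids,
                          input_mask=dummy_ids,
                          input_type_ids=dummy_ids))
# "default" aliases "pooled_output", as set up inside _create_model above.
print(outputs["default"].shape)  # (2, 16)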
Code example #9
def get_insertion_model(bert_config,
                        seq_length,
                        max_predictions_per_seq,
                        is_training=True):
  """Returns a Felix MLM insertion model.

  Args:
      bert_config: Configuration that defines the core BERT model.
      seq_length: Maximum sequence length of the training data.
      max_predictions_per_seq: Maximum number of masked tokens in sequence.
      is_training: Whether the model will be trained or used for inference.

  Returns:
      A Felix MLM insertion model, as well as the core BERT submodel from which
      to save weights after training.
  """
  input_word_ids = tf.keras.layers.Input(
      shape=(seq_length,), name='input_word_ids', dtype=tf.int32)
  input_mask = tf.keras.layers.Input(
      shape=(seq_length,), name='input_mask', dtype=tf.int32)
  input_type_ids = tf.keras.layers.Input(
      shape=(seq_length,), name='input_type_ids', dtype=tf.int32)
  masked_lm_positions = tf.keras.layers.Input(
      shape=(max_predictions_per_seq,),
      name='masked_lm_positions',
      dtype=tf.int32)

  bert_encoder = networks.BertEncoder(
      vocab_size=bert_config.vocab_size,
      hidden_size=bert_config.hidden_size,
      num_layers=bert_config.num_hidden_layers,
      num_attention_heads=bert_config.num_attention_heads,
      intermediate_size=bert_config.intermediate_size,
      activation=activations.gelu,
      dropout_rate=bert_config.hidden_dropout_prob,
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
      sequence_length=seq_length,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range))

  pretrainer_model = models.BertPretrainerV2(
      encoder_network=bert_encoder,
      mlm_initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range))

  felix_inputs = [
      input_word_ids,
      input_mask,
      input_type_ids,
      masked_lm_positions,
  ]
  outputs = pretrainer_model(felix_inputs)
  if is_training:
    masked_lm_ids = tf.keras.layers.Input(
        shape=(max_predictions_per_seq,), name='masked_lm_ids', dtype=tf.int32)
    masked_lm_weights = tf.keras.layers.Input(
        shape=(max_predictions_per_seq,),
        name='masked_lm_weights',
        dtype=tf.int32)
    output_loss = BertPretrainLossAndMetricLayer()(outputs['mlm_logits'],
                                                   masked_lm_ids,
                                                   masked_lm_weights)
    felix_inputs.append(masked_lm_ids)
    felix_inputs.append(masked_lm_weights)
    keras_model = tf.keras.Model(inputs=felix_inputs, outputs=output_loss)
  else:
    keras_model = tf.keras.Model(
        inputs=felix_inputs, outputs=outputs['mlm_logits'])

  return keras_model, bert_encoder
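A hedged sketch of calling `get_insertion_model` with a toy-sized configuration. `BertConfig` stands in for whatever BERT config class the Felix code imports, and the sketch assumes the modeling-library version this code was written against (note the `sequence_length` argument to `BertEncoder` above).

# Sketch only: toy sizes for illustration; a real run loads a full BERT config.
bert_config = BertConfig(
    vocab_size=100, hidden_size=16, num_hidden_layers=2,
    num_attention_heads=2, intermediate_size=32, max_position_embeddings=64)
insertion_model, bert_encoder = get_insertion_model(
    bert_config, seq_length=64, max_predictions_per_seq=8, is_training=False)
insertion_model.summary()  # inference graph: token inputs -> mlm_logits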