def create_mobilebert_pretrainer(bert_config):
  """Builds a BertPretrainerV2 wrapping a MobileBERTEncoder.

  Args:
    bert_config: A MobileBERT config object supplying vocabulary, embedding,
      transformer-block, bottleneck, and initializer hyperparameters.

  Returns:
    A `models.BertPretrainerV2` whose variables have already been created by
    a forward pass over `pretrainer.inputs`.
  """
  encoder = networks.MobileBERTEncoder(
      word_vocab_size=bert_config.vocab_size,
      word_embed_size=bert_config.embedding_size,
      type_vocab_size=bert_config.type_vocab_size,
      max_sequence_length=bert_config.max_position_embeddings,
      num_blocks=bert_config.num_hidden_layers,
      hidden_size=bert_config.hidden_size,
      num_attention_heads=bert_config.num_attention_heads,
      intermediate_size=bert_config.intermediate_size,
      intermediate_act_fn=tf_utils.get_activation(bert_config.hidden_act),
      hidden_dropout_prob=bert_config.hidden_dropout_prob,
      attention_probs_dropout_prob=bert_config.attention_probs_dropout_prob,
      intra_bottleneck_size=bert_config.intra_bottleneck_size,
      initializer_range=bert_config.initializer_range,
      use_bottleneck_attention=bert_config.use_bottleneck_attention,
      key_query_shared_bottleneck=bert_config.key_query_shared_bottleneck,
      num_feedforward_networks=bert_config.num_feedforward_networks,
      normalization_type=bert_config.normalization_type,
      classifier_activation=bert_config.classifier_activation)

  # MobileBERT needs its own masked-LM head (tied to the encoder's embedding
  # table) rather than the pretrainer's default one.
  lm_head = layers.MobileBertMaskedLM(
      embedding_table=encoder.get_embedding_table(),
      activation=tf_utils.get_activation(bert_config.hidden_act),
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range),
      name="cls/predictions")

  pretrainer = models.BertPretrainerV2(
      encoder_network=encoder, customized_masked_lm=lm_head)
  # One forward pass so that every pretrainer variable is created.
  _ = pretrainer(pretrainer.inputs)
  return pretrainer
def test_copy_pooler_dense_to_encoder(self):
  """Checks that exporting with `copy_pooler_dense_to_encoder=True` replaces
  the exported pooled_output with the next-sentence head's dense projection.
  """
  # Small BERT encoder plus one classification head named "next_sentence";
  # the head's dense layer is what should be copied into the exported pooler.
  encoder_config = encoders.EncoderConfig(
      type="bert",
      bert=encoders.BertEncoderConfig(hidden_size=24,
                                      intermediate_size=48,
                                      num_layers=2))
  cls_heads = [
      layers.ClassificationHead(
          inner_dim=24, num_classes=2, name="next_sentence")
  ]
  encoder = encoders.build_encoder(encoder_config)
  pretrainer = models.BertPretrainerV2(
      encoder_network=encoder,
      classification_heads=cls_heads,
      mlm_activation=tf_utils.get_activation(
          encoder_config.get().hidden_activation))
  # Makes sure the pretrainer variables are created.
  _ = pretrainer(pretrainer.inputs)
  # Save a checkpoint of the pretrainer so the exporter has weights to read.
  checkpoint = tf.train.Checkpoint(**pretrainer.checkpoint_items)
  model_checkpoint_dir = os.path.join(self.get_temp_dir(), "checkpoint")
  checkpoint.save(os.path.join(model_checkpoint_dir, "test"))

  # Export to a TF-Hub SavedModel, asking for the pooler-dense copy.
  vocab_file, sp_model_file = _get_vocab_or_sp_model_dummy(
      self.get_temp_dir(), use_sp_model=True)
  export_path = os.path.join(self.get_temp_dir(), "hub")
  export_tfhub_lib.export_model(
      export_path=export_path,
      encoder_config=encoder_config,
      model_checkpoint_path=tf.train.latest_checkpoint(model_checkpoint_dir),
      with_mlm=True,
      copy_pooler_dense_to_encoder=True,
      vocab_file=vocab_file,
      sp_model_file=sp_model_file,
      do_lower_case=True)

  # Restores a hub KerasLayer.
  hub_layer = hub.KerasLayer(export_path, trainable=True)
  dummy_ids = np.zeros((2, 10), dtype=np.int32)
  input_dict = dict(
      input_word_ids=dummy_ids,
      input_mask=dummy_ids,
      input_type_ids=dummy_ids)
  hub_pooled_output = hub_layer(input_dict)["pooled_output"]
  encoder_outputs = encoder(input_dict)
  # Verify that hub_layer's pooled_output is the same as the output of next
  # sentence prediction's dense layer (applied to the [CLS] position).
  pretrained_pooled_output = cls_heads[0].dense(
      (encoder_outputs["sequence_output"][:, 0, :]))
  self.assertAllClose(hub_pooled_output, pretrained_pooled_output)
  # But the pooled_output between encoder and hub_layer are not the same.
  encoder_pooled_output = encoder_outputs["pooled_output"]
  self.assertNotAllClose(hub_pooled_output, encoder_pooled_output)
def build_model(self, params=None):
  """Builds a BertPretrainerV2 from `params` or the task's model config.

  Args:
    params: Optional model config; falls back to `self.task_config.model`.

  Returns:
    A `models.BertPretrainerV2` with an encoder and optional classification
    heads built from the config.
  """
  model_cfg = params or self.task_config.model
  if model_cfg.cls_heads:
    heads = [
        layers.ClassificationHead(**head_cfg.as_dict())
        for head_cfg in model_cfg.cls_heads
    ]
  else:
    heads = []
  encoder_network = self._build_encoder(model_cfg.encoder)
  return models.BertPretrainerV2(
      mlm_activation=tf_utils.get_activation(model_cfg.mlm_activation),
      mlm_initializer=tf.keras.initializers.TruncatedNormal(
          stddev=model_cfg.mlm_initializer_range),
      encoder_network=encoder_network,
      classification_heads=heads)
def _create_bert_pretrainer_model(cfg):
  """Creates a BERT keras core model from BERT configuration.

  Args:
    cfg: A `BertConfig` to create the core model.

  Returns:
    A BertPretrainerV2 model.
  """
  encoder = _create_bert_model(cfg)
  mlm_init = tf.keras.initializers.TruncatedNormal(
      stddev=cfg.initializer_range)
  return models.BertPretrainerV2(
      encoder_network=encoder,
      mlm_activation=tf_utils.get_activation(cfg.hidden_act),
      mlm_initializer=mlm_init)
def _create_pretrainer_model(cfg):
  """Creates a pretrainer with AlbertEncoder from ALBERT configuration.

  Args:
    cfg: A `BertConfig` to create the core model.

  Returns:
    A BertPretrainerV2 model.
  """
  encoder = _create_albert_model(cfg)
  mlm_init = tf.keras.initializers.TruncatedNormal(
      stddev=cfg.initializer_range)
  pretrainer = models.BertPretrainerV2(
      encoder_network=encoder,
      mlm_activation=tf_utils.get_activation(cfg.hidden_act),
      mlm_initializer=mlm_init)
  # A forward pass forces creation of the masked-LM layer's variables.
  _ = pretrainer(pretrainer.inputs)
  return pretrainer
def _build_pretrainer(self, pretrainer_cfg: bert.PretrainerConfig, name: str):
  """Builds a BertPretrainerV2 (MobileBERT variant) from config.

  Args:
    pretrainer_cfg: Pretrainer config providing the encoder spec, optional
      classification-head configs, and masked-LM hyperparameters.
    name: Name for the returned pretrainer model.

  Returns:
    A `models.BertPretrainerV2` with a MobileBERT masked-LM head tied to the
    encoder's embedding table.
  """
  encoder = encoders.build_encoder(pretrainer_cfg.encoder)
  # An absent/empty cls_heads config yields no classification heads.
  head_cfgs = pretrainer_cfg.cls_heads or []
  cls_heads = [
      layers.ClassificationHead(**head_cfg.as_dict()) for head_cfg in head_cfgs
  ]
  masked_lm = layers.MobileBertMaskedLM(
      embedding_table=encoder.get_embedding_table(),
      activation=tf_utils.get_activation(pretrainer_cfg.mlm_activation),
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=pretrainer_cfg.mlm_initializer_range),
      name='cls/predictions')
  return models.BertPretrainerV2(
      encoder_network=encoder,
      classification_heads=cls_heads,
      customized_masked_lm=masked_lm,
      name=name)
def prepare_config(self, teacher_block_num, student_block_num,
                   transfer_teacher_layers):
  """Builds a small distillation ExperimentConfig plus a teacher checkpoint.

  Args:
    teacher_block_num: Number of MobileBERT blocks in the teacher encoder.
    student_block_num: Number of MobileBERT blocks in the student encoder.
    transfer_teacher_layers: Teacher layers to transfer during layer-wise
      distillation.

  Returns:
    A `cfg.ExperimentConfig` whose task points at a freshly saved teacher
    checkpoint under the test temp dir.
  """
  # using small model for testing
  task_config = distillation.BertDistillationTaskConfig(
      teacher_model=bert.PretrainerConfig(
          encoder=encoders.EncoderConfig(
              type='mobilebert',
              mobilebert=encoders.MobileBertEncoderConfig(
                  num_blocks=teacher_block_num)),
          cls_heads=[
              bert.ClsHeadConfig(
                  inner_dim=256,
                  num_classes=2,
                  dropout_rate=0.1,
                  name='next_sentence')
          ],
          mlm_activation='gelu'),
      # Student mirrors the teacher except for block count and activation.
      student_model=bert.PretrainerConfig(
          encoder=encoders.EncoderConfig(
              type='mobilebert',
              mobilebert=encoders.MobileBertEncoderConfig(
                  num_blocks=student_block_num)),
          cls_heads=[
              bert.ClsHeadConfig(
                  inner_dim=256,
                  num_classes=2,
                  dropout_rate=0.1,
                  name='next_sentence')
          ],
          mlm_activation='relu'),
      train_data=pretrain_dataloader.BertPretrainDataConfig(
          input_path='dummy',
          max_predictions_per_seq=76,
          seq_length=512,
          global_batch_size=10),
      validation_data=pretrain_dataloader.BertPretrainDataConfig(
          input_path='dummy',
          max_predictions_per_seq=76,
          seq_length=512,
          global_batch_size=10))

  # set only 1 step for each stage
  progressive_config = distillation.BertDistillationProgressiveConfig()
  progressive_config.layer_wise_distill_config.transfer_teacher_layers = (
      transfer_teacher_layers)
  progressive_config.layer_wise_distill_config.num_steps = 1
  progressive_config.pretrain_distill_config.num_steps = 1

  optimization_config = optimization.OptimizationConfig(
      optimizer=optimization.OptimizerConfig(
          type='lamb',
          lamb=optimization.LAMBConfig(
              weight_decay_rate=0.0001,
              exclude_from_weight_decay=[
                  'LayerNorm', 'layer_norm', 'bias', 'no_norm'
              ])),
      # Constant LR: initial and end learning rate are equal.
      learning_rate=optimization.LrConfig(
          type='polynomial',
          polynomial=optimization.PolynomialLrConfig(
              initial_learning_rate=1.5e-3,
              decay_steps=10000,
              end_learning_rate=1.5e-3)),
      warmup=optimization.WarmupConfig(
          type='linear',
          linear=optimization.LinearWarmupConfig(warmup_learning_rate=0)))

  exp_config = cfg.ExperimentConfig(
      task=task_config,
      trainer=prog_trainer_lib.ProgressiveTrainerConfig(
          progressive=progressive_config,
          optimizer_config=optimization_config))

  # Create a teacher model checkpoint.
  teacher_encoder = encoders.build_encoder(task_config.teacher_model.encoder)
  pretrainer_config = task_config.teacher_model
  if pretrainer_config.cls_heads:
    # NOTE(review): the comprehension variable `cfg` shadows the `cfg` config
    # module used above; safe because comprehension scope is local, but a
    # different name would be clearer.
    teacher_cls_heads = [
        layers.ClassificationHead(**cfg.as_dict())
        for cfg in pretrainer_config.cls_heads
    ]
  else:
    teacher_cls_heads = []

  # MobileBERT uses a custom masked-LM head tied to the embedding table.
  masked_lm = layers.MobileBertMaskedLM(
      embedding_table=teacher_encoder.get_embedding_table(),
      activation=tf_utils.get_activation(pretrainer_config.mlm_activation),
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=pretrainer_config.mlm_initializer_range),
      name='cls/predictions')
  teacher_pretrainer = models.BertPretrainerV2(
      encoder_network=teacher_encoder,
      classification_heads=teacher_cls_heads,
      customized_masked_lm=masked_lm)

  # The model variables will be created after the forward call.
  _ = teacher_pretrainer(teacher_pretrainer.inputs)
  teacher_pretrainer_ckpt = tf.train.Checkpoint(
      **teacher_pretrainer.checkpoint_items)
  teacher_ckpt_path = os.path.join(self.get_temp_dir(), 'teacher_model.ckpt')
  teacher_pretrainer_ckpt.save(teacher_ckpt_path)
  # Point the experiment at the directory containing the saved checkpoint.
  exp_config.task.teacher_model_init_checkpoint = self.get_temp_dir()
  return exp_config
def _create_model(
    *,
    bert_config: Optional[configs.BertConfig] = None,
    encoder_config: Optional[encoders.EncoderConfig] = None,
    with_mlm: bool,
) -> Tuple[tf.keras.Model, tf.keras.Model]:
  """Creates the model to export and the model to restore the checkpoint.

  Args:
    bert_config: A legacy `BertConfig` to create a `BertEncoder` object.
      Exactly one of encoder_config and bert_config must be set.
    encoder_config: An `EncoderConfig` to create an encoder of the configured
      type (`BertEncoder` or other).
    with_mlm: A bool to control the second component of the result. If True,
      will create a `BertPretrainerV2` object; otherwise, will create a
      `BertEncoder` object.

  Returns:
    A Tuple of (1) a Keras model that will be exported, (2) a
    `BertPretrainerV2` object or `BertEncoder` object depending on the value
    of `with_mlm` argument, which contains the first model and will be used
    for restoring weights from the checkpoint.

  Raises:
    ValueError: If both or neither of `bert_config` and `encoder_config` are
      set.
  """
  if (bert_config is not None) == (encoder_config is not None):
    raise ValueError("Exactly one of `bert_config` and `encoder_config` "
                     "can be specified, but got %s and %s" %
                     (bert_config, encoder_config))

  if bert_config is None:
    encoder = encoders.build_encoder(encoder_config)
  else:
    encoder = get_bert_encoder(bert_config)

  # Feed the encoder a dict of inputs keyed by name rather than a positional
  # list; only the dict form still accepts inputs after a SavedModel restore.
  inputs_by_name = {tensor.name: tensor for tensor in encoder.inputs}
  outputs_by_name = encoder(inputs_by_name)
  # Alias the pooled output as "default" so the export is interchangeable
  # with other text-representation modules.
  outputs_by_name["default"] = outputs_by_name["pooled_output"]
  core_model = tf.keras.Model(inputs=inputs_by_name, outputs=outputs_by_name)

  if not with_mlm:
    return core_model, encoder

  if bert_config is not None:
    hidden_act = bert_config.hidden_act
  else:
    assert encoder_config is not None
    hidden_act = encoder_config.get().hidden_activation
  pretrainer = models.BertPretrainerV2(
      encoder_network=encoder,
      mlm_activation=tf_utils.get_activation(hidden_act))
  mlm_inputs = {tensor.name: tensor for tensor in pretrainer.inputs}
  mlm_outputs = pretrainer(mlm_inputs)
  mlm_model = tf.keras.Model(inputs=mlm_inputs, outputs=mlm_outputs)

  # Turn off automatic sub-layer tracking so that attaching `mlm` below does
  # not pull the masked-LM weights into the core model.
  # TODO(b/169210253): Use a public API when available.
  core_model._auto_track_sub_layers = False  # pylint: disable=protected-access
  core_model.mlm = mlm_model
  return core_model, pretrainer
def get_insertion_model(bert_config, seq_length, max_predictions_per_seq,
                        is_training=True):
  """Returns a Felix MLM insertion model.

  Args:
    bert_config: Configuration that defines the core BERT model.
    seq_length: Maximum sequence length of the training data.
    max_predictions_per_seq: Maximum number of masked tokens in sequence.
    is_training: Will the model be trained or is it inference time.

  Returns:
    Felix MLM insertion model as well as core BERT submodel from which to
    save weights after training.
  """

  def _int_input(length, name):
    # All model inputs are integer id/position/weight tensors.
    return tf.keras.layers.Input(
        shape=(length,), name=name, dtype=tf.int32)

  input_word_ids = _int_input(seq_length, 'input_word_ids')
  input_mask = _int_input(seq_length, 'input_mask')
  input_type_ids = _int_input(seq_length, 'input_type_ids')
  masked_lm_positions = _int_input(max_predictions_per_seq,
                                   'masked_lm_positions')

  bert_encoder = networks.BertEncoder(
      vocab_size=bert_config.vocab_size,
      hidden_size=bert_config.hidden_size,
      num_layers=bert_config.num_hidden_layers,
      num_attention_heads=bert_config.num_attention_heads,
      intermediate_size=bert_config.intermediate_size,
      activation=activations.gelu,
      dropout_rate=bert_config.hidden_dropout_prob,
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
      sequence_length=seq_length,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range))
  pretrainer_model = models.BertPretrainerV2(
      encoder_network=bert_encoder,
      mlm_initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range))

  felix_inputs = [
      input_word_ids,
      input_mask,
      input_type_ids,
      masked_lm_positions,
  ]
  outputs = pretrainer_model(felix_inputs)

  if not is_training:
    # Inference: expose only the MLM logits.
    keras_model = tf.keras.Model(
        inputs=felix_inputs, outputs=outputs['mlm_logits'])
    return keras_model, bert_encoder

  # Training: add label inputs and emit the pretraining loss instead.
  # NOTE(review): masked_lm_weights is declared int32 here although example
  # weights are conventionally float — confirm the loss layer casts/handles
  # this as intended.
  masked_lm_ids = _int_input(max_predictions_per_seq, 'masked_lm_ids')
  masked_lm_weights = _int_input(max_predictions_per_seq, 'masked_lm_weights')
  output_loss = BertPretrainLossAndMetricLayer()(outputs['mlm_logits'],
                                                 masked_lm_ids,
                                                 masked_lm_weights)
  felix_inputs.append(masked_lm_ids)
  felix_inputs.append(masked_lm_weights)
  keras_model = tf.keras.Model(inputs=felix_inputs, outputs=output_loss)
  return keras_model, bert_encoder