def _create_albert_model(cfg):
  """Creates an ALBERT keras core model from an ALBERT configuration.

  Args:
    cfg: An `AlbertConfig` to create the core model.

  Returns:
    A keras model.
  """
  albert_encoder = networks.AlbertTransformerEncoder(
      vocab_size=cfg.vocab_size,
      hidden_size=cfg.hidden_size,
      embedding_width=cfg.embedding_size,
      num_layers=cfg.num_hidden_layers,
      num_attention_heads=cfg.num_attention_heads,
      intermediate_size=cfg.intermediate_size,
      activation=activations.gelu,
      dropout_rate=cfg.hidden_dropout_prob,
      attention_dropout_rate=cfg.attention_probs_dropout_prob,
      max_sequence_length=cfg.max_position_embeddings,
      type_vocab_size=cfg.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=cfg.initializer_range))
  return albert_encoder
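# Usage sketch (an assumption, not part of the original file): builds a small
# ALBERT encoder from an in-memory config. The field values are hypothetical
# and assume `albert_configs.AlbertConfig` (as imported in later revisions of
# this file) accepts the standard BERT config keyword arguments plus
# `embedding_size`.
def _example_create_albert_encoder():
  cfg = albert_configs.AlbertConfig(
      vocab_size=30000,
      embedding_size=128,
      hidden_size=256,
      num_hidden_layers=2,
      num_attention_heads=4,
      intermediate_size=1024)
  # The returned keras model maps (word_ids, mask, type_ids) inputs to
  # (sequence_output, pooled_output).
  return _create_albert_model(cfg)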
def get_transformer_encoder(bert_config, sequence_length):
  """Gets a 'TransformerEncoder' object.

  Args:
    bert_config: A 'modeling.BertConfig' or 'modeling.AlbertConfig' object.
    sequence_length: Maximum sequence length of the training data.

  Returns:
    A networks.TransformerEncoder object.
  """
  kwargs = dict(
      vocab_size=bert_config.vocab_size,
      hidden_size=bert_config.hidden_size,
      num_layers=bert_config.num_hidden_layers,
      num_attention_heads=bert_config.num_attention_heads,
      intermediate_size=bert_config.intermediate_size,
      activation=tf_utils.get_activation(bert_config.hidden_act),
      dropout_rate=bert_config.hidden_dropout_prob,
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
      sequence_length=sequence_length,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range))
  if isinstance(bert_config, albert_configs.AlbertConfig):
    kwargs['embedding_width'] = bert_config.embedding_size
    return networks.AlbertTransformerEncoder(**kwargs)
  else:
    assert isinstance(bert_config, configs.BertConfig)
    return networks.TransformerEncoder(**kwargs)
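# Usage sketch (an assumption, not part of the original file): exercises the
# plain-BERT branch above. The config values are hypothetical and assume
# `configs.BertConfig` follows the original BERT repo's constructor signature.
def _example_bert_encoder():
  bert_config = configs.BertConfig(
      vocab_size=30522,
      hidden_size=128,
      num_hidden_layers=2,
      num_attention_heads=2,
      intermediate_size=512)
  # Returns a networks.TransformerEncoder sized for 128-token inputs.
  return get_transformer_encoder(bert_config, sequence_length=128)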
def get_transformer_encoder(bert_config,
                            sequence_length,
                            transformer_encoder_cls=None):
  """Gets a 'TransformerEncoder' object.

  Args:
    bert_config: A 'modeling.BertConfig' or 'modeling.AlbertConfig' object.
    sequence_length: Maximum sequence length of the training data.
    transformer_encoder_cls: An EncoderScaffold class. If it is None, uses the
      default BERT encoder implementation.

  Returns:
    A networks.TransformerEncoder object.
  """
  if transformer_encoder_cls is not None:
    # TODO(hongkuny): evaluate if it is better to put cfg definition in gin.
    embedding_cfg = dict(
        vocab_size=bert_config.vocab_size,
        type_vocab_size=bert_config.type_vocab_size,
        hidden_size=bert_config.hidden_size,
        seq_length=sequence_length,
        max_seq_length=bert_config.max_position_embeddings,
        initializer=tf.keras.initializers.TruncatedNormal(
            stddev=bert_config.initializer_range),
        dropout_rate=bert_config.hidden_dropout_prob,
    )
    hidden_cfg = dict(
        num_attention_heads=bert_config.num_attention_heads,
        intermediate_size=bert_config.intermediate_size,
        intermediate_activation=tf_utils.get_activation(
            bert_config.hidden_act),
        dropout_rate=bert_config.hidden_dropout_prob,
        attention_dropout_rate=bert_config.attention_probs_dropout_prob,
    )
    kwargs = dict(
        embedding_cfg=embedding_cfg,
        hidden_cfg=hidden_cfg,
        num_hidden_instances=bert_config.num_hidden_layers,
    )
    # Relies on gin configuration to define the Transformer encoder arguments.
    return transformer_encoder_cls(**kwargs)

  kwargs = dict(
      vocab_size=bert_config.vocab_size,
      hidden_size=bert_config.hidden_size,
      num_layers=bert_config.num_hidden_layers,
      num_attention_heads=bert_config.num_attention_heads,
      intermediate_size=bert_config.intermediate_size,
      activation=tf_utils.get_activation(bert_config.hidden_act),
      dropout_rate=bert_config.hidden_dropout_prob,
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
      sequence_length=sequence_length,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range))
  if isinstance(bert_config, albert_configs.AlbertConfig):
    kwargs['embedding_width'] = bert_config.embedding_size
    return networks.AlbertTransformerEncoder(**kwargs)
  else:
    assert isinstance(bert_config, configs.BertConfig)
    return networks.TransformerEncoder(**kwargs)
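# Hypothetical sketch (not in the original file) of the contract that
# `transformer_encoder_cls` must satisfy in this revision: the class is
# invoked with `embedding_cfg`, `hidden_cfg`, and `num_hidden_instances`
# keyword arguments, and any remaining constructor arguments are expected to
# come from gin bindings, as the comment above notes. A stand-in class is
# used here instead of the real networks.EncoderScaffold.
class _StubScaffoldEncoder:

  def __init__(self, embedding_cfg, hidden_cfg, num_hidden_instances):
    self.embedding_cfg = embedding_cfg
    self.hidden_cfg = hidden_cfg
    self.num_hidden_instances = num_hidden_instances


def _example_scaffold_encoder(bert_config):
  # Routes through the `transformer_encoder_cls` branch of the function above.
  return get_transformer_encoder(
      bert_config,
      sequence_length=128,
      transformer_encoder_cls=_StubScaffoldEncoder)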
      num_attention_heads=bert_config.num_attention_heads,
      intermediate_size=bert_config.intermediate_size,
      activation=tf_utils.get_activation(bert_config.hidden_act),
      dropout_rate=bert_config.hidden_dropout_prob,
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      embedding_width=bert_config.embedding_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range))
  if isinstance(bert_config, albert_configs.AlbertConfig):
    return networks.AlbertTransformerEncoder(**kwargs)
  else:
    assert isinstance(bert_config, configs.BertConfig)
    kwargs['output_range'] = output_range
    return networks.TransformerEncoder(**kwargs)


def pretrain_model(bert_config,
                   seq_length,
                   max_predictions_per_seq,
                   initializer=None,
                   use_next_sentence_label=True,
                   return_core_pretrainer_model=False):
  """Returns model to be used for pre-training.

  Args:
def get_transformer_encoder(bert_config,
                            sequence_length=None,
                            transformer_encoder_cls=None,
                            output_range=None):
  """Gets a 'TransformerEncoder' object.

  Args:
    bert_config: A 'modeling.BertConfig' or 'modeling.AlbertConfig' object.
    sequence_length: [Deprecated].
    transformer_encoder_cls: An EncoderScaffold class. If it is None, uses the
      default BERT encoder implementation.
    output_range: The sequence output range, [0, output_range). Default setting
      is to return the entire sequence output.

  Returns:
    An encoder object.
  """
  del sequence_length
  if transformer_encoder_cls is not None:
    # TODO(hongkuny): evaluate if it is better to put cfg definition in gin.
    embedding_cfg = dict(
        vocab_size=bert_config.vocab_size,
        type_vocab_size=bert_config.type_vocab_size,
        hidden_size=bert_config.hidden_size,
        max_seq_length=bert_config.max_position_embeddings,
        initializer=tf.keras.initializers.TruncatedNormal(
            stddev=bert_config.initializer_range),
        dropout_rate=bert_config.hidden_dropout_prob,
    )
    hidden_cfg = dict(
        num_attention_heads=bert_config.num_attention_heads,
        intermediate_size=bert_config.intermediate_size,
        intermediate_activation=tf_utils.get_activation(
            bert_config.hidden_act),
        dropout_rate=bert_config.hidden_dropout_prob,
        attention_dropout_rate=bert_config.attention_probs_dropout_prob,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=bert_config.initializer_range),
    )
    kwargs = dict(
        embedding_cfg=embedding_cfg,
        hidden_cfg=hidden_cfg,
        num_hidden_instances=bert_config.num_hidden_layers,
        pooled_output_dim=bert_config.hidden_size,
        pooler_layer_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=bert_config.initializer_range))
    # Relies on gin configuration to define the Transformer encoder arguments.
    return transformer_encoder_cls(**kwargs)

  kwargs = dict(
      vocab_size=bert_config.vocab_size,
      hidden_size=bert_config.hidden_size,
      num_layers=bert_config.num_hidden_layers,
      num_attention_heads=bert_config.num_attention_heads,
      intermediate_size=bert_config.intermediate_size,
      activation=tf_utils.get_activation(bert_config.hidden_act),
      dropout_rate=bert_config.hidden_dropout_prob,
      attention_dropout_rate=bert_config.attention_probs_dropout_prob,
      max_sequence_length=bert_config.max_position_embeddings,
      type_vocab_size=bert_config.type_vocab_size,
      embedding_width=bert_config.embedding_size,
      initializer=tf.keras.initializers.TruncatedNormal(
          stddev=bert_config.initializer_range))
  if isinstance(bert_config, albert_configs.AlbertConfig):
    return networks.AlbertTransformerEncoder(**kwargs)
  else:
    assert isinstance(bert_config, configs.BertConfig)
    kwargs['output_range'] = output_range
    return networks.BertEncoder(**kwargs)
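# Usage sketch (an assumption, not part of the original file): requests only
# the first position of the sequence output via `output_range=1`, for tasks
# that read just the leading token. The config values are hypothetical and
# assume `configs.BertConfig` exposes an `embedding_size` attribute, which
# this revision reads unconditionally.
def _example_first_token_encoder():
  bert_config = configs.BertConfig(
      vocab_size=30522,
      hidden_size=128,
      num_hidden_layers=2,
      num_attention_heads=2,
      intermediate_size=512)
  # `sequence_length` is deprecated in this revision and ignored.
  return get_transformer_encoder(bert_config, output_range=1)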