def __init__(self, embedding_size, layer_norm_epsilon, hidden_act, **kwargs):
    super(MLMLayer, self).__init__(**kwargs)
    self.embedding_size = embedding_size
    self.layer_norm_epsilon = layer_norm_epsilon
    # Project token embeddings back to `embedding_size`, apply the model's
    # hidden activation, then layer-normalize (standard MLM transform head).
    self.dense1 = tf.keras.layers.Dense(embedding_size)
    self.act = get_activation(hidden_act)
    self._extra_norm = tf.keras.layers.LayerNormalization(
        axis=-1, epsilon=layer_norm_epsilon, dtype=tf.float32
    )
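# A minimal usage sketch for the MLM head above (not part of the original file).
# It only exercises the sub-layers created in __init__ (`dense1`, `act`,
# `_extra_norm`); the import path and the concrete sizes are assumptions.
#
#   import tensorflow as tf
#   from tf_transformers.layers import MLMLayer  # hypothetical import path
#
#   mlm = MLMLayer(embedding_size=768, layer_norm_epsilon=1e-12, hidden_act="gelu")
#   token_embeddings = tf.random.uniform((2, 128, 768))   # (batch, seq_len, hidden)
#   transformed = mlm.act(mlm.dense1(token_embeddings))   # dense projection + activation
#   transformed = mlm._extra_norm(transformed)            # layer norm, shape (2, 128, 768)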
def __init__(
    self,
    config,
    mask_mode="causal",
    name=None,
    use_dropout=False,
    is_training=None,
    batch_size=None,
    sequence_length=None,
    use_type_embeddings=False,
    use_positonal_embeddings=True,
    pipeline_mode=None,
    is_decoder=False,
    cross_attention_inside_encoder=False,
    share_attention_layers=True,
    share_encoder_embeddings=False,
    encoder_embedding_layer=None,
    encoder_type_embedding_layer=None,
    encoder_positional_embedding_layer=None,
    use_mlm_layer=False,
    return_all_layer_token_embeddings=True,
    **kwargs,
):
    """
    Args:
        config: dict
        mask_mode: str, `causal` by default for GPT2. BERT uses `user_defined`
            masking for PADDED or MLM; the mode can be overridden.
        name: str, name of the model.
        use_dropout: bool, strictly optional. Dropout can be disabled even while
            training by setting this to False. If `is_training` is False,
            `use_dropout` is automatically set to False.
        batch_size: int, `batch_size` can be None or any int.
        sequence_length: int, `sequence_length` can be None or any int.
        use_type_embeddings: bool, BERT has type embeddings by default, GPT2 doesn't.
        use_positonal_embeddings: bool, T5 doesn't have positional embeddings.
        bidirectional: used for relative positional embeddings (can be inferred
            from `mask_mode`).
        is_decoder: bool, if True the model runs in decoder mode (as in Seq2Seq).
        use_mlm_layer: bool, whether to add an MLM layer on top.
        share_encoder_embeddings: bool, when `is_decoder` is True the decoder
            usually re-uses the encoder's embedding layers (word_embeddings,
            positional_embeddings, type_embeddings). If True, share them from
            the encoder; if False, initialize fresh embeddings.
        cross_attention_inside_encoder: bool, add encoder-decoder cross attention
            inside each layer.
    """
    # Because saved_model causes some serialization problems here
    # self.config = config
    self.vocab_size = config["vocab_size"]
    self.type_vocab_size = config["type_vocab_size"]
    self.num_hidden_layers = config["num_hidden_layers"]
    self.num_attention_heads = config["num_attention_heads"]
    self.attention_head_size = config["attention_head_size"]
    self.max_position_embeddings = config["max_position_embeddings"]
    self.intermediate_size = config["intermediate_size"]
    self.embedding_size = config["embedding_size"]
    self.initializer_range = config["initializer_range"]
    self.hidden_act = config["hidden_act"]
    self.hidden_dropout_prob = config["hidden_dropout_prob"]
    self.attention_probs_dropout_prob = config["attention_probs_dropout_prob"]
    self.intermediate_act = config["intermediate_act"]
    self.layer_norm_epsilon = config["layer_norm_epsilon"]

    # Get activations and initializers
    self.activation = get_activation(self.hidden_act)
    self.intermediate_activation = get_activation(self.intermediate_act)
    initializer = tf.keras.initializers.TruncatedNormal(stddev=self.initializer_range)
    self.initializer = tf.keras.initializers.get(initializer)

    self.mask_mode = mask_mode
    # Using `self.name` would conflict with the Keras `name` property
    self.model_name = name
    self.pipeline_mode = pipeline_mode
    self.is_decoder = is_decoder

    # self._self_setattr_tracking = False
    self.use_dropout = use_dropout
    self.is_training = is_training
    self.batch_size = batch_size
    self.sequence_length = sequence_length
    self.use_type_embeddings = use_type_embeddings
    self.use_positonal_embeddings = use_positonal_embeddings
    self.share_encoder_embeddings = share_encoder_embeddings
    self.share_attention_layers = share_attention_layers
    self.use_mlm_layer = use_mlm_layer
    self.cross_attention_inside_encoder = cross_attention_inside_encoder
    self.return_all_layer_token_embeddings = return_all_layer_token_embeddings

    if not name.startswith("tf_transformers"):
        kwargs["name"] = "tf_transformers/" + self.model_name
    else:
        kwargs["name"] = self.model_name
    self.validate_and_set_inputs()

    super(GPT2Encoder, self).__init__(is_training=self.is_training, use_dropout=self.use_dropout, **kwargs)
    self._config_dict = {
        "initializer": tf.keras.initializers.serialize(initializer),
        "is_training": self.is_training,
        "use_dropout": self.use_dropout,
        "batch_size": self.batch_size,
        "sequence_length": self.sequence_length,
        "name": kwargs["name"],
        "use_type_embeddings": self.use_type_embeddings,
        "use_positonal_embeddings": self.use_positonal_embeddings,
        "is_decoder": self.is_decoder,
        "share_encoder_embeddings": self.share_encoder_embeddings,
        "share_attention_layers": self.share_attention_layers,
        "cross_attention_inside_encoder": cross_attention_inside_encoder,
        "return_all_layer_token_embeddings": self.return_all_layer_token_embeddings,
    }
    # Update config dict with the passed config
    self._config_dict.update(config)

    # Create embedding layers
    self._embedding_layer, self._type_embeddings, self._position_embedding_layer = self.get_embedding_layers()
    if self.is_decoder:
        # If embeddings have to be shared from the encoder
        if self.share_encoder_embeddings:
            self._embedding_layer = encoder_embedding_layer
            self._type_embeddings = encoder_type_embedding_layer
            self._position_embedding_layer = encoder_positional_embedding_layer

    # Embedding Norm
    self._embedding_norm = tf.keras.layers.LayerNormalization(
        name="embeddings/layer_norm",
        axis=-1,
        epsilon=self.layer_norm_epsilon,
        dtype=tf.float32,
    )
    # Embedding dropout Layer
    self._embedding_dropout = tf.keras.layers.Dropout(rate=self.hidden_dropout_prob)

    # Transformer Layers
    self._transformer_layers = []
    for i in range(self.num_hidden_layers):
        layer = TransformerGPT2(
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            intermediate_activation=self.activation,
            dropout_rate=self.hidden_dropout_prob,
            attention_dropout_rate=self.attention_probs_dropout_prob,
            kernel_initializer=self.initializer,
            is_training=self.is_training,
            use_dropout=self.use_dropout,
            is_decoder=is_decoder,
            share_attention_layers=share_attention_layers,
            layer_norm_epsilon=self.layer_norm_epsilon,
            cross_attention_inside_encoder=self.cross_attention_inside_encoder,
            name="transformer/layer_%d" % i,
        )
        self._transformer_layers.append(layer)

    if self.use_mlm_layer:
        self.mlm_layer = MLMLayer(
            self.embedding_size,
            self.layer_norm_epsilon,
            self.hidden_act,
            name="mlm_layer",
        )
        self._last_logits_bias = self.add_weight(
            "tf_transformers/last_logits_bias",
            shape=(self.vocab_size,),
            dtype=tf.float32,
            trainable=True,
        )

    # Last Layer Normalization (only in GPT2)
    self._last_layer_norm = GPT2LayerNormalization(
        name="ln_f/layer_norm",
        axis=-1,
        epsilon=self.layer_norm_epsilon,
        dtype=tf.float32,
    )
    self.call_fn = self.get_call_method()
    # Initialize model
    self.model_inputs, self.model_outputs = self.get_model(initialize_only=True)
    logging.info("Initialized Variables")
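# A minimal construction sketch for the GPT2Encoder __init__ above (not part of
# the original file). The config keys mirror exactly the ones read in __init__;
# the concrete values and the import path are assumptions, not the library's
# published defaults.
#
#   from tf_transformers.models import GPT2Encoder  # hypothetical import path
#
#   config = {
#       "vocab_size": 50257,
#       "type_vocab_size": 1,
#       "num_hidden_layers": 12,
#       "num_attention_heads": 12,
#       "attention_head_size": 64,
#       "max_position_embeddings": 1024,
#       "intermediate_size": 3072,
#       "embedding_size": 768,
#       "initializer_range": 0.02,
#       "hidden_act": "gelu",
#       "hidden_dropout_prob": 0.1,
#       "attention_probs_dropout_prob": 0.1,
#       "intermediate_act": "gelu",
#       "layer_norm_epsilon": 1e-5,
#   }
#   encoder = GPT2Encoder(config=config, name="gpt2", mask_mode="causal", is_training=True)
#   # __init__ builds the Keras model eagerly, so inputs/outputs are available right away:
#   inputs, outputs = encoder.model_inputs, encoder.model_outputs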
def __init__(
    self,
    config,
    mask_mode="user_defined",
    name=None,
    use_dropout=False,
    is_training=None,
    batch_size=None,
    sequence_length=None,
    use_type_embeddings=True,
    use_positonal_embeddings=True,
    pipeline_mode=None,
    is_decoder=False,
    initialize_embeddings=False,
    model_dir=None,
    **kwargs,
):
    """
    Args:
        config: dict
        mask_mode: str, `user_defined` by default. BERT uses masking for PADDED
            or MLM; the mode can be overridden.
        name: str, name of the model.
        use_dropout: bool, strictly optional. Dropout can be disabled even while
            training by setting this to False. If `is_training` is False,
            `use_dropout` is automatically set to False.
        batch_size: int, `batch_size` can be None or any int.
        sequence_length: int, `sequence_length` can be None or any int.
        use_type_embeddings: bool, BERT has type embeddings by default, GPT2 doesn't.
        use_positonal_embeddings: bool, T5 doesn't have positional embeddings.
        bidirectional: used for relative positional embeddings (can be inferred
            from `mask_mode`).
        is_decoder: bool, if True the model runs in decoder mode (as in Seq2Seq).
        initialize_embeddings: bool, when `is_decoder` is True the decoder usually
            re-uses the encoder's embedding layers. Set this to True if you still
            want to initialize fresh embeddings.
    """
    # Because saved_model causes some serialization problems here
    # self.config = config
    self.vocab_size = config["vocab_size"]
    self.type_vocab_size = config["type_vocab_size"]
    self.num_hidden_layers = config["num_hidden_layers"]
    self.num_attention_heads = config["num_attention_heads"]
    self.max_position_embeddings = config["max_position_embeddings"]
    self.intermediate_size = config["intermediate_size"]
    self.embedding_size = config["embedding_size"]
    self.initializer_range = config["initializer_range"]
    self.hidden_act = config["hidden_act"]
    self.hidden_dropout_prob = config["hidden_dropout_prob"]
    self.attention_probs_dropout_prob = config["attention_probs_dropout_prob"]
    self.intermediate_act = config["intermediate_act"]
    self.layer_norm_epsilon = config["layer_norm_epsilon"]

    # Get activations and initializers
    self.activation = get_activation(self.hidden_act)
    self.intermediate_activation = get_activation(self.intermediate_act)
    initializer = tf.keras.initializers.TruncatedNormal(stddev=self.initializer_range)
    self.initializer = tf.keras.initializers.get(initializer)

    self.mask_mode = mask_mode
    # Using `self.name` would conflict with the Keras `name` property
    self.model_name = name
    self.pipeline_mode = pipeline_mode
    self.is_decoder = is_decoder
    self.model_dir = model_dir

    if self.mask_mode not in ["user_defined", "causal", "prefix"]:
        raise ValueError(
            "Unknown mask_mode `{}` provided. Supported modes are `{}`".format(
                self.mask_mode, ["user_defined", "causal", "prefix"]
            )
        )
    if self.model_name is None:
        raise ValueError("`name` cannot be None. Please provide a meaningful name")
    if is_training is None:
        raise ValueError("`is_training` cannot be None. Please provide `True` or `False`")
    if self.mask_mode is None:
        raise ValueError("`mask_mode` cannot be None. Please provide `['user_defined', 'causal', 'prefix']`")

    # self._self_setattr_tracking = False
    self.use_dropout = use_dropout
    self.is_training = is_training
    self.batch_size = batch_size
    self.sequence_length = sequence_length
    self.use_type_embeddings = use_type_embeddings
    self.use_positonal_embeddings = use_positonal_embeddings
    self.initialize_embeddings = initialize_embeddings

    # If `is_training` is False and `pipeline_mode` is None, the model is used for inference.
    # We forcefully set `is_training` back to True with `use_dropout` as False.
    # For encoder-decoder models these are the encoder-mode params; the same mode
    # also applies to classification, QA, etc.
    if self.is_training:
        if self.pipeline_mode is not None:
            raise ValueError(
                "When `is_training` is True, `pipeline_mode` should be None, "
                "but got `pipeline_mode` as {}".format(self.pipeline_mode)
            )
    if self.is_training is False:
        if self.pipeline_mode is None:
            logging.info(
                "Overriding `is_training` from False to True with `use_dropout` as False; "
                "this has no effect on your inference pipeline"
            )
            self.is_training = True
            self.use_dropout = False

    # Decoder Mode
    if self.is_decoder:
        # Decoder will never have a prefix mask_mode for the time being
        if self.mask_mode == "prefix":
            raise ValueError(
                "As you are in Decoder Mode (`is_decoder` is True), `{}` mask_mode "
                "doesn't make sense. For the decoder, `mask_mode` should be "
                "`causal` or `user_defined`".format(self.mask_mode)
            )
        # If predict pipeline
        if self.is_training is False:
            # Auto-regressive setting should only support causal mode
            if self.pipeline_mode == "auto-regressive":
                if self.mask_mode != "causal":
                    raise ValueError(
                        "As you are in Decoder Mode and the auto-regressive pipeline "
                        "(`is_decoder` is True), `{}` mask_mode doesn't make sense. "
                        "For the decoder, `mask_mode` should be `causal`".format(self.mask_mode)
                    )

    if not name.startswith("tf_transformers"):
        kwargs["name"] = "tf_transformers/" + self.model_name
    else:
        kwargs["name"] = self.model_name

    super(UNILMEncoder, self).__init__(is_training=self.is_training, use_dropout=self.use_dropout, **kwargs)
    self._config_dict = {
        "initializer": tf.keras.initializers.serialize(initializer),
        "is_training": self.is_training,
        "use_dropout": self.use_dropout,
        "batch_size": self.batch_size,
        "sequence_length": self.sequence_length,
        "name": kwargs["name"],
        "use_type_embeddings": self.use_type_embeddings,
        "use_positonal_embeddings": self.use_positonal_embeddings,
        "is_decoder": self.is_decoder,
        "initialize_embeddings": self.initialize_embeddings,
        "layer_norm_epsilon": self.layer_norm_epsilon,
    }
    # Update config dict with the passed config
    self._config_dict.update(config)

    if self.is_decoder:
        if self.initialize_embeddings:
            # Word Embedding Layer
            self._embedding_layer = OnDeviceEmbedding(
                vocab_size=self.vocab_size,
                embedding_width=self.embedding_size,
                initializer=initializer,
                name="word_embeddings",
            )
            if self.use_type_embeddings:
                # Type Embeddings
                self._type_embeddings = OnDeviceEmbedding(
                    vocab_size=self.type_vocab_size,
                    embedding_width=self.embedding_size,
                    initializer=initializer,
                    name="type_embeddings",
                )
            if self.use_positonal_embeddings:
                # Positional Embedding
                self._position_embedding_layer = SimplePositionEmbedding(
                    initializer=initializer,
                    max_sequence_length=self.max_position_embeddings,
                    embedding_width=self.embedding_size,
                    name="positional_embeddings",
                )
    else:
        # Word Embedding Layer
        self._embedding_layer = OnDeviceEmbedding(
            vocab_size=self.vocab_size,
            embedding_width=self.embedding_size,
            initializer=initializer,
            name="word_embeddings",
        )
        if self.use_type_embeddings:
            # Type Embeddings
            self._type_embeddings = OnDeviceEmbedding(
                vocab_size=self.type_vocab_size,
                embedding_width=self.embedding_size,
                initializer=initializer,
                name="type_embeddings",
            )
        if self.use_positonal_embeddings:
            # Positional Embedding
            self._position_embedding_layer = SimplePositionEmbedding(
                initializer=initializer,
                max_sequence_length=self.max_position_embeddings,
                embedding_width=self.embedding_size,
                name="positional_embeddings",
            )

    # Embedding Norm
    self._embedding_norm = tf.keras.layers.LayerNormalization(
        name="embeddings/layer_norm",
        axis=-1,
        epsilon=self.layer_norm_epsilon,
        dtype=tf.float32,
    )
    # Embedding dropout Layer
    self._embedding_dropout = tf.keras.layers.Dropout(rate=self.hidden_dropout_prob)

    # Transformer Layers
    self._transformer_layers = []
    for i in range(self.num_hidden_layers):
        layer = TransformerBERT(
            num_attention_heads=self.num_attention_heads,
            intermediate_size=self.intermediate_size,
            intermediate_activation=self.activation,
            dropout_rate=self.hidden_dropout_prob,
            attention_dropout_rate=self.attention_probs_dropout_prob,
            kernel_initializer=self.initializer,
            is_training=self.is_training,
            use_dropout=self.use_dropout,
            is_decoder=is_decoder,
            layer_norm_epsilon=self.layer_norm_epsilon,
            name="transformer/layer_%d" % i,
        )
        self._transformer_layers.append(layer)

    self._pooler_layer = tf.keras.layers.Dense(
        units=self.embedding_size,
        activation="tanh",
        kernel_initializer=self.initializer,
        name="pooler_transform",
    )
    self.mlm_layer = MLMLayer(
        self.embedding_size,
        self.layer_norm_epsilon,
        self.hidden_act,
        name="mlm_layer",
    )
    self._last_logits_bias = self.add_weight(
        "tf_transformers/last_logits_bias",
        shape=(self.vocab_size,),
        dtype=tf.float32,
        trainable=True,
    )

    # Initialize model
    self.model_inputs = self.get_model(initialize_only=True)
    logging.info("Initialized Variables")

    if self.model_dir:
        self.load_model(self, self.model_dir)
        logging.info("Loaded Variables from {}".format(self.model_dir))