def __init__(self, embedding_size, layer_norm_epsilon, hidden_act, **kwargs):
    super(MLMLayer, self).__init__(**kwargs)
    self.embedding_size = embedding_size
    self.layer_norm_epsilon = layer_norm_epsilon
    # Dense projection to the embedding size
    self.dense1 = tf.keras.layers.Dense(embedding_size)
    # Activation resolved from its string name (e.g. "gelu") via get_activation
    self.act = get_activation(hidden_act)
    # Layer norm over the last axis, kept in float32 for numerical stability
    self._extra_norm = tf.keras.layers.LayerNormalization(axis=-1, epsilon=layer_norm_epsilon, dtype=tf.float32)
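
# A minimal usage sketch for the layer above, assuming the full MLMLayer class and
# `get_activation` are importable (e.g. from tf_transformers); sizes and values are illustrative.
import tensorflow as tf

mlm_head = MLMLayer(embedding_size=768, layer_norm_epsilon=1e-12, hidden_act="gelu", name="mlm_layer")
token_embeddings = tf.random.uniform((2, 128, 768))  # (batch_size, sequence_length, embedding_size)
# The sub-layers created in __init__ can be applied as dense -> activation -> layer norm
hidden = mlm_head.act(mlm_head.dense1(token_embeddings))
hidden = mlm_head._extra_norm(hidden)                 # shape stays (2, 128, 768)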
Example 2
    def __init__(
        self,
        config,
        mask_mode="causal",
        name=None,
        use_dropout=False,
        is_training=None,
        batch_size=None,
        sequence_length=None,
        use_type_embeddings=False,
        use_positonal_embeddings=True,
        pipeline_mode=None,
        is_decoder=False,
        cross_attention_inside_encoder=False,
        share_attention_layers=True,
        share_encoder_embeddings=False,
        encoder_embedding_layer=None,
        encoder_type_embedding_layer=None,
        encoder_positional_embedding_layer=None,
        use_mlm_layer=False,
        return_all_layer_token_embeddings=True,
        **kwargs,
    ):
        """
        Args:
            config: dict
            mask_mode: str, `user_defined` BERT by default uses masking for PADDED or MLM. But can be overridden . # noqa
            name: str, Name of the model
            use_dropout: bool, It is strictly optional. Sometimes,
                        while training you can set `use_dropout` to False.
                         If `is_training` is False, `use_dropout` will be automatically set to False. # noqa
            batch_size: int, `batch_size` can be None or any int
            sequence_length: int, `sequence_length` can be None or any int
            use_type_embeddings: bool, By default BERT has type_embeddings, GPT2 don't.
            use_positonal_embeddings: bool, T5 don't have postional embeddings
            bidirectional: use in relative postional embedding (we can infer it based on mask_mode)
            is_decoder: bool, if True it will become decoder mode (as in Seq2Seq)
            use_mlm_layer: bool ( To use MLM layer or not )
            share_encoder_embeddings: bool, When is_decoder = True, most cases,
                            it will re-use the embedding layer from Encoder.
                            So. if you still want to initialize , set this to False.
                            If True, share embedding layers from encoder
                            (word_embeddings, positional_embeddings, type_embeddings)
            cross_attention_inside_encoder: bool, Encoder Decoder Cross attention in each layer
        """
        # Not storing `config` directly because saved_model causes serialization problems here
        # self.config              = config
        self.vocab_size = config["vocab_size"]
        self.type_vocab_size = config["type_vocab_size"]
        self.num_hidden_layers = config["num_hidden_layers"]
        self.num_attention_heads = config["num_attention_heads"]
        self.attention_head_size = config["attention_head_size"]
        self.max_position_embeddings = config["max_position_embeddings"]
        self.intermediate_size = config["intermediate_size"]
        self.embedding_size = config["embedding_size"]
        self.initializer_range = config["initializer_range"]
        self.hidden_act = config["hidden_act"]
        self.hidden_dropout_prob = config["hidden_dropout_prob"]
        self.attention_probs_dropout_prob = config["attention_probs_dropout_prob"]
        self.intermediate_act = config["intermediate_act"]
        self.layer_norm_epsilon = config["layer_norm_epsilon"]

        # Get activation and initializers
        self.activation = get_activation(self.hidden_act)
        self.intermediate_activation = get_activation(self.intermediate_act)
        initializer = tf.keras.initializers.TruncatedNormal(stddev=self.initializer_range)
        self.initializer = tf.keras.initializers.get(initializer)
        self.mask_mode = mask_mode
        # Using `self.name` would conflict with the Keras `name` property
        self.model_name = name
        self.pipeline_mode = pipeline_mode
        self.is_decoder = is_decoder

        # self._self_setattr_tracking = False
        self.mask_mode = mask_mode
        self.use_dropout = use_dropout
        self.is_training = is_training
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.use_type_embeddings = use_type_embeddings
        self.use_positonal_embeddings = use_positonal_embeddings
        self.share_encoder_embeddings = share_encoder_embeddings
        self.share_attention_layers = share_attention_layers
        self.use_mlm_layer = use_mlm_layer
        self.cross_attention_inside_encoder = cross_attention_inside_encoder
        self.return_all_layer_token_embeddings = return_all_layer_token_embeddings

        if not name.startswith("tf_transformers"):
            kwargs["name"] = "tf_transformers/" + self.model_name
        else:
            kwargs["name"] = self.model_name
        self.validate_and_set_inputs()

        super(GPT2Encoder, self).__init__(is_training=self.is_training, use_dropout=self.use_dropout, **kwargs)
        self._config_dict = {
            "initializer": tf.keras.initializers.serialize(initializer),
            "is_training": self.is_training,
            "use_dropout": self.use_dropout,
            "batch_size": self.batch_size,
            "sequence_length": self.sequence_length,
            "name": kwargs["name"],
            "use_type_embeddings": self.use_type_embeddings,
            "use_positonal_embeddings": self.use_positonal_embeddings,
            "is_decoder": self.is_decoder,
            "share_encoder_embeddings": self.share_encoder_embeddings,
            "share_attention_layers": self.share_attention_layers,
            "cross_attention_inside_encoder": cross_attention_inside_encoder,
            "return_all_layer_token_embeddings": self.return_all_layer_token_embeddings,
        }

        # Update config dict with passed config
        self._config_dict.update(config)

        # Get embedding layers
        self._embedding_layer, self._type_embeddings, self._position_embedding_layer = self.get_embedding_layers()
        if self.is_decoder:
            # If embeddings are to be shared from the encoder
            if self.share_encoder_embeddings:
                self._embedding_layer = encoder_embedding_layer
                self._type_embeddings = encoder_type_embedding_layer
                self._position_embedding_layer = encoder_positional_embedding_layer

        # Embedding Norm
        self._embedding_norm = tf.keras.layers.LayerNormalization(
            name="embeddings/layer_norm",
            axis=-1,
            epsilon=self.layer_norm_epsilon,
            dtype=tf.float32,
        )

        # Embedding dropout Layer
        self._embedding_dropout = tf.keras.layers.Dropout(rate=self.hidden_dropout_prob)

        # Transformer Layer
        self._transformer_layers = []
        for i in range(self.num_hidden_layers):
            layer = TransformerGPT2(
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                intermediate_activation=self.activation,
                dropout_rate=self.hidden_dropout_prob,
                attention_dropout_rate=self.attention_probs_dropout_prob,
                kernel_initializer=self.initializer,
                is_training=self.is_training,
                use_dropout=self.use_dropout,
                is_decoder=is_decoder,
                share_attention_layers=share_attention_layers,
                layer_norm_epsilon=self.layer_norm_epsilon,
                cross_attention_inside_encoder=self.cross_attention_inside_encoder,
                name="transformer/layer_%d" % i,
            )
            self._transformer_layers.append(layer)

        if self.use_mlm_layer:
            self.mlm_layer = MLMLayer(
                self.embedding_size,
                self.layer_norm_epsilon,
                self.hidden_act,
                name="mlm_layer",
            )

            self._last_logits_bias = self.add_weight(
                "tf_transformers/last_logits_bias",
                shape=(self.vocab_size,),
                dtype=tf.float32,
                trainable=True,
            )
        # Last Layer Normalization (only in GPT2)
        self._last_layer_norm = GPT2LayerNormalization(
            name="ln_f/layer_norm",
            axis=-1,
            epsilon=self.layer_norm_epsilon,
            dtype=tf.float32,
        )
        self.call_fn = self.get_call_method()
        # Initialize model
        self.model_inputs, self.model_outputs = self.get_model(initialize_only=True)
        logging.info("Initialized Variables")
    def __init__(
        self,
        config,
        mask_mode="user_defined",
        name=None,
        use_dropout=False,
        is_training=None,
        batch_size=None,
        sequence_length=None,
        use_type_embeddings=True,
        use_positonal_embeddings=True,
        pipeline_mode=None,
        is_decoder=False,
        initialize_embeddings=False,
        model_dir=None,
        **kwargs,
    ):
        """
        Args:
            config: dict
            mask_mode: str, `user_defined` BERT by default uses masking for PADDED or MLM. But can be overridden . # noqa
            name: str, Name of the model
            use_dropout: bool, It is strictly optional. Sometimes, while training you can set `use_dropout` to False. # noqa
                         If `is_training` is False, `use_dropout` will be automatically set to False. # noqa
            batch_size: int, `batch_size` can be None or any int
            sequence_length: int, `sequence_length` can be None or any int
            use_type_embeddings: bool, By default BERT has type_embeddings, GPT2 don't.
            use_positonal_embeddings: bool, T5 don't have postional embeddings
            bidirectional: use in relative postional embedding (we can infer it based on mask_mode)
            is_decoder: bool, if True it will become decoder mode (as in Seq2Seq)
            initialize_embeddings: bool, When is_decoder = True, most cases, it will re-use the embedding layer from Encoder. # noqa
                            So. if you still want to initialize , set this to True # noqa
        """
        # Not storing `config` directly because saved_model causes serialization problems here
        # self.config              = config
        self.vocab_size = config["vocab_size"]
        self.type_vocab_size = config["type_vocab_size"]
        self.num_hidden_layers = config["num_hidden_layers"]
        self.num_attention_heads = config["num_attention_heads"]
        self.max_position_embeddings = config["max_position_embeddings"]
        self.intermediate_size = config["intermediate_size"]
        self.embedding_size = config["embedding_size"]
        self.initializer_range = config["initializer_range"]
        self.hidden_act = config["hidden_act"]
        self.hidden_dropout_prob = config["hidden_dropout_prob"]
        self.attention_probs_dropout_prob = config[
            "attention_probs_dropout_prob"]
        self.intermediate_act = config["intermediate_act"]
        self.layer_norm_epsilon = config["layer_norm_epsilon"]

        # Get activation and initializers
        self.activation = get_activation(self.hidden_act)
        self.intermediate_activation = get_activation(self.intermediate_act)
        initializer = tf.keras.initializers.TruncatedNormal(
            stddev=self.initializer_range)
        self.initializer = tf.keras.initializers.get(initializer)
        self.mask_mode = mask_mode
        # Using `self.name` would conflict with the Keras `name` property
        self.model_name = name
        self.pipeline_mode = pipeline_mode
        self.is_decoder = is_decoder
        self.model_dir = model_dir

        if self.mask_mode not in ["user_defined", "causal", "prefix"]:
            raise ValueError(
                "Unknown mask_mode `{}` provided. Supported modes are `{}`".format(
                    self.mask_mode, ["user_defined", "causal", "prefix"]))
        if self.model_name is None:
            raise ValueError(
                "`name` cannot be None. Please provide a meaningful name")
        if is_training is None:
            raise ValueError(
                "`is_training` cannot be None. Please provide a `True` or `False`"
            )
        if self.mask_mode is None:
            raise ValueError(
                "`mask_mode` cannot be None. Please provide `['user_defined', 'causal', 'prefix']`"
            )

        # self._self_setattr_tracking = False
        self.mask_mode = mask_mode
        self.use_dropout = use_dropout
        self.is_training = is_training
        self.batch_size = batch_size
        self.sequence_length = sequence_length
        self.use_type_embeddings = use_type_embeddings
        self.use_positonal_embeddings = use_positonal_embeddings
        self.initialize_embeddings = initialize_embeddings

        # If `is_training` is False and `pipeline_mode` is None, we are using the model for inference.
        # In that case we forcefully set `is_training` back to True with `use_dropout` set to False.
        # For encoder-decoder models these are the encoder-mode parameters; the same mode also
        # applies to classification, QA, etc.

        if self.is_training:
            if self.pipeline_mode is not None:
                raise ValueError(
                    "When `is_training` is True, `pipeline_mode` should be None, \
                     but got `pipeline_mode` {}".format(self.pipeline_mode))

        if self.is_training is False:
            if self.pipeline_mode is None:
                logging.info(
                    "Overriding `is_training` from False to True \
                     with `use_dropout` set to False; this has no effect on your inference pipeline"
                )
                self.is_training = True
                self.use_dropout = False

        # Decoder Mode
        if self.is_decoder:
            # The decoder will never have a prefix model, for the time being
            if self.mask_mode == "prefix":
                raise ValueError(
                    "As you are in decoder mode (`is_decoder` is True), mask_mode `{}` \
                     doesn't make sense. For the decoder, `mask_mode` \
                     should be `causal` or `user_defined`".format(
                        self.mask_mode))
            # If predict pipeline
            if self.is_training is False:
                # The auto-regressive setting only supports causal masking
                if self.pipeline_mode == "auto-regressive":
                    if self.mask_mode != "causal":
                        raise ValueError(
                            "As you are in decoder mode with an auto-regressive \
                              pipeline (`is_decoder` is True), mask_mode `{}` \
                              doesn't make sense. Here `mask_mode` should be `causal`".format(
                                self.mask_mode)
                        )

        if not name.startswith("tf_transformers"):
            kwargs["name"] = "tf_transformers/" + self.model_name
        else:
            kwargs["name"] = self.model_name
        super(UNILMEncoder, self).__init__(is_training=self.is_training,
                                           use_dropout=self.use_dropout,
                                           **kwargs)
        self._config_dict = {
            "initializer": tf.keras.initializers.serialize(initializer),
            "is_training": self.is_training,
            "use_dropout": self.use_dropout,
            "batch_size": self.batch_size,
            "sequence_length": self.sequence_length,
            "name": kwargs["name"],
            "use_type_embeddings": self.use_type_embeddings,
            "use_positonal_embeddings": self.use_positonal_embeddings,
            "is_decoder": self.is_decoder,
            "initialize_embeddings": self.initialize_embeddings,
            "layer_norm_epsilon": self.layer_norm_epsilon,
        }

        # Update config dict with passed config
        self._config_dict.update(config)

        if self.is_decoder:
            if self.initialize_embeddings:
                # Word Embedding Layer
                self._embedding_layer = OnDeviceEmbedding(
                    vocab_size=self.vocab_size,
                    embedding_width=self.embedding_size,
                    initializer=initializer,
                    name="word_embeddings",
                )
            if self.use_type_embeddings:
                # Type Embeddings
                self._type_embeddings = OnDeviceEmbedding(
                    vocab_size=self.type_vocab_size,
                    embedding_width=self.embedding_size,
                    initializer=initializer,
                    name="type_embeddings",
                )
            if self.use_positonal_embeddings:
                # Positional Embedding
                self._position_embedding_layer = SimplePositionEmbedding(
                    initializer=initializer,
                    max_sequence_length=self.max_position_embeddings,
                    embedding_width=self.embedding_size,
                    name="positional_embeddings",
                )

        else:
            # Word Embedding Layer
            self._embedding_layer = OnDeviceEmbedding(
                vocab_size=self.vocab_size,
                embedding_width=self.embedding_size,
                initializer=initializer,
                name="word_embeddings",
            )

            if self.use_type_embeddings:
                # Type Embeddings
                self._type_embeddings = OnDeviceEmbedding(
                    vocab_size=self.type_vocab_size,
                    embedding_width=self.embedding_size,
                    initializer=initializer,
                    name="type_embeddings",
                )
            if self.use_positonal_embeddings:
                # Positional Embedding
                self._position_embedding_layer = SimplePositionEmbedding(
                    initializer=initializer,
                    max_sequence_length=self.max_position_embeddings,
                    embedding_width=self.embedding_size,
                    name="positional_embeddings",
                )

        # Embedding Norm
        self._embedding_norm = tf.keras.layers.LayerNormalization(
            name="embeddings/layer_norm",
            axis=-1,
            epsilon=self.layer_norm_epsilon,
            dtype=tf.float32,
        )

        # Embedding dropout Layer
        self._embedding_dropout = tf.keras.layers.Dropout(
            rate=self.hidden_dropout_prob)

        # Transformer Layer
        self._transformer_layers = []
        for i in range(self.num_hidden_layers):
            layer = TransformerBERT(
                num_attention_heads=self.num_attention_heads,
                intermediate_size=self.intermediate_size,
                intermediate_activation=self.activation,
                dropout_rate=self.hidden_dropout_prob,
                attention_dropout_rate=self.attention_probs_dropout_prob,
                kernel_initializer=self.initializer,
                is_training=self.is_training,
                use_dropout=self.use_dropout,
                is_decoder=is_decoder,
                layer_norm_epsilon=self.layer_norm_epsilon,
                name="transformer/layer_%d" % i,
            )
            self._transformer_layers.append(layer)

        self._pooler_layer = tf.keras.layers.Dense(
            units=self.embedding_size,
            activation="tanh",
            kernel_initializer=self.initializer,
            name="pooler_transform",
        )

        self.mlm_layer = MLMLayer(
            self.embedding_size,
            self.layer_norm_epsilon,
            self.hidden_act,
            name="mlm_layer",
        )

        self._last_logits_bias = self.add_weight(
            "tf_transformers/last_logits_bias",
            shape=(self.vocab_size, ),
            dtype=tf.float32,
            trainable=True,
        )

        # Initialize model
        self.model_inputs = self.get_model(initialize_only=True)
        logging.info("Initialized Variables")

        if self.model_dir:
            self.load_model(self, self.model_dir)
            logging.info("Loaded Variables from {}".format(self.model_dir))