Example #1
 def build(self, input_shape):
     """ Builds the transformer encoder layer. """
     params = self.get_config()
     for _ in range(params["num_layers"]):
         self._stacking_layers.append([
             build_transformer_component(
                 {
                     "base_layer.class":
                     MultiHeadSelfAttention.__name__,
                     "base_layer.params":
                     dict(num_heads=params["num_attention_heads"],
                          num_units=params["hidden_size"],
                          attention_dropout_rate=params[
                              "attention_dropout_rate"],
                          attention_type=params["attention_type"],
                          name="self_attention")
                 },
                 dropout_rate=params["layer_postprocess_dropout_rate"]),
             build_transformer_component(
                 {
                     "base_layer.class":
                     TransformerFFN.__name__,
                     "base_layer.params":
                     dict(filter_size=params["filter_size"],
                          output_size=params["hidden_size"],
                          dropout_rate=params["ffn_dropout_rate"],
                          activation=params["ffn_activation"],
                          name="ffn")
                 },
                 dropout_rate=params["layer_postprocess_dropout_rate"])
         ])
     self._output_norm_layer = tf.keras.layers.LayerNormalization(
         epsilon=1e-6, dtype="float32", name="output_ln")
     super(TransformerEncoder, self).build(input_shape)
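
The helper build_transformer_component is not defined in this example. As a rough sketch only, assuming it wraps the configured base layer with layer normalization, dropout and a residual connection, and resolves "base_layer.class" through some registry (the registry, the wrapper class and the _sketch suffix below are assumptions, not part of this snippet), it could look like this:

import tensorflow as tf


class PreNormResidualWrapper(tf.keras.layers.Layer):
    """Hypothetical wrapper: LayerNorm -> base layer -> dropout -> residual add."""

    def __init__(self, base_layer, dropout_rate, epsilon=1e-6, **kwargs):
        super().__init__(**kwargs)
        self._base_layer = base_layer
        self._norm = tf.keras.layers.LayerNormalization(epsilon=epsilon, dtype="float32")
        self._dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, inputs, training=None):
        # y = x + Dropout(BaseLayer(LayerNorm(x)))
        y = self._base_layer(self._norm(inputs))
        return inputs + self._dropout(y, training=training)


def build_transformer_component_sketch(layer_args, dropout_rate, epsilon=1e-6):
    # Hypothetical registry: the real helper presumably maps "base_layer.class"
    # to a layer class through the library's own registration mechanism.
    registry = {"Dense": lambda p: tf.keras.layers.Dense(**p)}
    base = registry[layer_args["base_layer.class"]](layer_args["base_layer.params"])
    return PreNormResidualWrapper(base, dropout_rate=dropout_rate, epsilon=epsilon)

Under that reading, each entry appended to self._stacking_layers is a list of such wrapped sub-layers, one per Transformer sub-block (self-attention and FFN here; convolution or cross-attention in the decoders below).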
 def build(self, input_shape):
     """ Builds the transformer decoder layer. """
     params = self.get_config()
     for lid in range(params["num_layers"]):
         self._stacking_layers.append([
             build_transformer_component(
                 {
                     "base_layer.class":
                     LightConvolutionLayer.__name__,
                     "base_layer.params":
                     dict(kernel_size=params["conv_kernel_size_list"][lid],
                          num_heads=params["num_conv_heads"],
                          conv_type=params["conv_type"],
                          conv_dim=params["conv_hidden_size"],
                          use_glu=params["glu_after_proj"],
                          weight_dropout_rate=params[
                              "conv_weight_dropout_rate"],
                          name="light_conv")
                 },
                 dropout_rate=params["layer_postprocess_dropout_rate"],
                 epsilon=params["layer_postprocess_epsilon"]),
             build_transformer_component(
                 {
                     "base_layer.class":
                     MultiHeadAttention.__name__,
                     "base_layer.params":
                     dict(num_heads=params["num_attention_heads"],
                          num_units=input_shape[-1],
                          attention_dropout_rate=params[
                              "attention_dropout_rate"],
                          attention_type=params["attention_type"],
                          name="encdec_attention")
                 },
                 dropout_rate=params["layer_postprocess_dropout_rate"],
                 epsilon=params["layer_postprocess_epsilon"]),
             build_transformer_component(
                 {
                     "base_layer.class":
                     TransformerFFN.__name__,
                     "base_layer.params":
                     dict(filter_size=params["filter_size"],
                          output_size=input_shape[-1],
                          dropout_rate=params["ffn_dropout_rate"],
                          activation=params["ffn_activation"],
                          name="ffn")
                 },
                 dropout_rate=params["layer_postprocess_dropout_rate"],
                 epsilon=params["layer_postprocess_epsilon"])
         ])
     self._output_norm_layer = tf.keras.layers.LayerNormalization(
         epsilon=params["layer_postprocess_epsilon"],
         dtype="float32",
         name="output_ln")
     super(LightConvolutionDecoder, self).build(input_shape)
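
The keys read from self.get_config() above imply a configuration of roughly the following shape. The key names come from the build() method itself; every value is a placeholder chosen for illustration, not a default of any library, and conv_kernel_size_list must contain at least num_layers entries because it is indexed by lid:

# Illustrative values only; the key names are the ones consumed by build() above.
example_lightconv_decoder_config = {
    "num_layers": 6,
    "conv_kernel_size_list": [3, 7, 15, 31, 31, 31],   # one kernel size per layer
    "num_conv_heads": 8,
    "conv_type": "lightweight",        # assumed value
    "conv_hidden_size": 512,
    "glu_after_proj": True,
    "conv_weight_dropout_rate": 0.1,
    "num_attention_heads": 8,
    "attention_dropout_rate": 0.1,
    "attention_type": "dot_product",   # assumed value
    "filter_size": 2048,
    "ffn_dropout_rate": 0.1,
    "ffn_activation": "relu",
    "layer_postprocess_dropout_rate": 0.1,
    "layer_postprocess_epsilon": 1e-6,
}

Note that "hidden_size" does not appear here: unlike the encoder, this decoder sizes its attention and FFN outputs from input_shape[-1].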
 def build(self, input_shape):
     """ Builds the transformer decoder layer. """
     params = self.get_config()
     for _ in range(params["num_layers"]):
         self._stacking_layers.append([
             build_transformer_component({
                 "base_layer.class": MultiHeadSelfAttention.__name__,
                 "base_layer.params": dict(
                     num_heads=params["num_attention_heads"],
                     num_units=params["hidden_size"],
                     attention_dropout_rate=params["attention_dropout_rate"],
                     attention_type=params["attention_type"],
                     name="self_attention"
                 )},
                 dropout_rate=params["layer_postprocess_dropout_rate"],
                 epsilon=params["layer_postprocess_epsilon"],
                 pre_norm=(not params["post_normalize"])),
             (build_transformer_component({
                 "base_layer.class": MultiHeadAttention.__name__,
                 "base_layer.params": dict(
                     num_heads=params["num_attention_heads"],
                     num_units=params["hidden_size"],
                     attention_dropout_rate=params["attention_dropout_rate"],
                     attention_type=params["attention_type"],
                     name="encdec_attention")},
                 dropout_rate=params["layer_postprocess_dropout_rate"],
                 epsilon=params["layer_postprocess_epsilon"],
                 pre_norm=(not params["post_normalize"]))
              if self._with_encoder_decoder_attention else None),
             build_transformer_component({
                 "base_layer.class": TransformerFFN.__name__,
                 "base_layer.params": dict(
                     filter_size=params["filter_size"],
                     output_size=params["hidden_size"],
                     dropout_rate=params["ffn_dropout_rate"],
                     activation=params["ffn_activation"],
                     name="ffn")},
                 dropout_rate=params["layer_postprocess_dropout_rate"],
                 epsilon=params["layer_postprocess_epsilon"],
                 pre_norm=(not params["post_normalize"]))])
     if not params["post_normalize"]:
         self._output_norm_layer = tf.keras.layers.LayerNormalization(
             epsilon=params["layer_postprocess_epsilon"],
             dtype="float32", name="output_ln")
         self.add_activation_quantizer(name="output_ln", activation="act")
     super(TransformerDecoder, self).build(input_shape)
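
The pre_norm=(not params["post_normalize"]) flag and the conditional output_ln at the end go together. A minimal, self-contained sketch of the two residual schemes (my own illustration, not code from this example) shows why a final LayerNormalization is only needed in the pre-norm case:

import tensorflow as tf


def sublayer_residual(x, fn, norm, dropout, pre_norm, training=None):
    """Sketch of the two residual schemes selected by the pre_norm flag."""
    if pre_norm:
        # Pre-norm: normalize the input, apply the sub-layer, add the residual.
        # The stack output stays un-normalized, hence the extra output_ln above.
        return x + dropout(fn(norm(x)), training=training)
    # Post-norm: add the residual first, then normalize; the last sub-layer
    # already emits normalized activations, so no final LayerNorm is needed.
    return norm(x + dropout(fn(x), training=training))


# Toy usage with matching dimensions (values are arbitrary).
norm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
drop = tf.keras.layers.Dropout(0.1)
ffn = tf.keras.layers.Dense(512)
x = tf.random.normal([2, 10, 512])
y_pre = sublayer_residual(x, ffn, norm, drop, pre_norm=True)
y_post = sublayer_residual(x, ffn, norm, drop, pre_norm=False)

This matches the condition in the code: output_ln and its activation quantizer are built only when post_normalize is false.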