Example 1
 def __init__(self,
              model_dimension,
              num_heads,
              intermediate_size,
              initializer_stddev=0.02,
              activation_dropout_rate=0.0,
              attention_dropout_rate=0.0,
              **kwargs):
     super(FunnelTransformerEncoder, self).__init__(**kwargs)
     self.model_dimension = model_dimension
     self.parameters.initializer = tf.keras.initializers.TruncatedNormal(
         stddev=initializer_stddev)
     self.self_attn = FunnelAttention(
         model_dimension,
         num_heads,
         attention_dropout_rate=attention_dropout_rate,
         parameters=self.parameters)
     self.prx = dense_layers.BaseQDenseVarLen(model_dimension,
                                              activation=None,
                                              parameters=self.parameters)
     self.upprx = dense_layers.BaseQDenseVarLen(intermediate_size,
                                                parameters=self.parameters)
     self.downprx = dense_layers.BaseQDenseVarLen(
         model_dimension, activation=None, parameters=self.parameters)
     self.activation_dropout_rate = activation_dropout_rate
     self.ln1 = normalization_layers.LayerNormalization(**kwargs)
     self.ln2 = normalization_layers.LayerNormalization(**kwargs)
     self.q1 = quantization_layers.ActivationQuantization(**kwargs)
     self.q2 = quantization_layers.ActivationQuantization(**kwargs)
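
This constructor never assigns self.parameters itself, so, as the sublayer calls imply, a parameters object presumably arrives through **kwargs and is stored by the base layer class. A minimal instantiation sketch under that assumption; the dimensions and dropout rate are illustrative, and mode stands in for whatever train/eval value base_layers.Parameters accepts:

# Hypothetical usage; values are illustrative, not taken from the source.
params = base_layers.Parameters(mode, quantize=True, regularizer_scale=1e-4)
encoder_block = FunnelTransformerEncoder(
    model_dimension=128,
    num_heads=4,
    intermediate_size=512,
    attention_dropout_rate=0.1,
    parameters=params)  # assumed to be consumed by the base class via **kwargs
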
Example 2
    def __init__(self, config, mode):
        super(Encoder, self).__init__()

        def _get_params(varname, default_value=None):
            value = config[varname] if varname in config else default_value
            default = "" if varname in config else " (default)"
            logging.info("%s = %s%s", varname, value, default)
            setattr(self, varname, value)

        _get_params("labels")
        _get_params("quantize", True)
        _get_params("embedding_regularizer_scale", 35e-3)
        _get_params("embedding_size", 64)
        _get_params("unigram_channels", 0)
        _get_params("bigram_channels", 0)
        _get_params("trigram_channels", 0)
        _get_params("fourgram_channels", 0)
        _get_params("fivegram_channels", 0)
        _get_params("skip1bigram_channels", 0)
        _get_params("skip2bigram_channels", 0)
        _get_params("network_regularizer_scale", 1e-4)
        _get_params("keep_prob", 0.5)
        self.num_classes = len(self.labels)

        self.parameters = base_layers.Parameters(
            mode,
            quantize=self.quantize,
            regularizer_scale=self.embedding_regularizer_scale)
        self.values_fc = dense_layers.BaseQDenseVarLen(
            units=self.embedding_size, rank=3, parameters=self.parameters)
        self.attention_fc = dense_layers.BaseQDenseVarLen(
            units=self.embedding_size, rank=3, parameters=self.parameters)
        self.dropout = tf.keras.layers.Dropout(rate=(1 - self.keep_prob))

        self.parameters = copy.copy(self.parameters)
        self.parameters.regularizer_scale = self.network_regularizer_scale
        self.attention_pool_layers = []
        self._add_attention_pool_layer(self.unigram_channels, 1)
        self._add_attention_pool_layer(self.bigram_channels, 2)
        self._add_attention_pool_layer(self.trigram_channels, 3)
        self._add_attention_pool_layer(self.fourgram_channels, 4)
        self._add_attention_pool_layer(self.fivegram_channels, 5)
        self._add_attention_pool_layer(self.skip1bigram_channels, None, 1)
        self._add_attention_pool_layer(self.skip2bigram_channels, None, 2)

        self.concat_quantizer = quantization_layers.ConcatQuantization(
            axis=1, parameters=self.parameters)
        self.final_fc = dense_layers.BaseQDense(units=self.num_classes,
                                                rank=2,
                                                parameters=self.parameters,
                                                activation=None)
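
A hedged sketch of a config dict for this constructor, covering the keys read through _get_params above. Every value is an illustrative assumption rather than a default or recommendation from the source; keys left out fall back to the defaults shown, and only "labels" has no default:

# Illustrative config; only "labels" must be supplied.
config = {
    "labels": ["negative", "positive"],
    "quantize": True,
    "embedding_size": 64,
    "unigram_channels": 0,
    "bigram_channels": 64,
    "trigram_channels": 64,
    "fourgram_channels": 0,
    "fivegram_channels": 0,
    "skip1bigram_channels": 0,
    "skip2bigram_channels": 64,
    "keep_prob": 0.5,
}
encoder = Encoder(config, mode)  # mode: whatever base_layers.Parameters expects
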
Example 3
 def __init__(self,
              model_dimension,
              num_heads,
              attention_dropout_rate=0.0,
              **kwargs):
     self.model_dimension = model_dimension
     self.num_heads = num_heads
     self.filters = model_dimension // num_heads
     self.q_dense_layer = dense_layers.BaseQDenseVarLen(
         units=model_dimension, activation=None, **kwargs)
     self.kv_dense_layer = dense_layers.BaseQDenseVarLen(
         units=model_dimension * 2, activation=None, **kwargs)
     self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
     self.attention_dropout_rate = attention_dropout_rate
     self.qconcat = quantization_layers.ConcatQuantization(axis=1, **kwargs)
     super(FunnelAttention, self).__init__(**kwargs)
Example 4
 def __init__(self, scalar=True, **kwargs):
   self.scalar = scalar
   # Attention logits should not have an activation after the linear layer, so
   # they can be positive or negative. This would allow the attention
   # distribution to be anything the network likes. Using a relu activation
   # biases the attention distribution towards a uniform distribution, and
   # this gets better results for attention pooling: though some outputs are
   # emphasized for the classification decision, all other outputs keep a
   # non-zero probability of influencing the class, which seems to result in
   # better backprop.
   self.attention = dense_layers.BaseQDenseVarLen(units=1, rank=3, **kwargs)
   self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
   super(AttentionPooling, self).__init__(**kwargs)
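
A small standalone illustration of the effect described in the comment above: clamping logits with relu before the softmax collapses every negative logit to zero, so those positions share equal weight and the resulting attention distribution flattens towards uniform. Plain TensorFlow, independent of the layers used here:

import tensorflow as tf

logits = tf.constant([[3.0, -2.0, 0.5, -4.0]])
# Signed logits: sharply peaked, negative scores stay strongly suppressed.
print(tf.nn.softmax(logits))
# Relu'd logits: all negatives become 0 and receive equal weight, so the
# distribution is noticeably flatter.
print(tf.nn.softmax(tf.nn.relu(logits)))
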
Example 5
 def __init__(self,
              model_dimension,
              num_heads,
              attention_dropout_rate=0.0,
              **kwargs):
     self.model_dimension = model_dimension
     self.num_heads = num_heads
     self.filters = model_dimension // num_heads
     self.dense_layers = [
         dense_layers.BaseQDenseVarLen(units=self.filters,
                                       activation=None,
                                       **kwargs)
         for i in range(num_heads * 3)
     ]
     self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
     self.attention_dropout_rate = attention_dropout_rate
     self.qconcat = quantization_layers.ConcatQuantization(axis=2, **kwargs)
     super(SelfAttention, self).__init__(**kwargs)
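
For orientation, the arithmetic in this constructor with illustrative numbers (not values from the source): each head works in filters = model_dimension // num_heads dimensions, and num_heads * 3 per-head projections are built, presumably one query, key and value projection per head.

model_dimension, num_heads = 128, 4      # illustrative
filters = model_dimension // num_heads   # 32 dimensions per head
num_projections = num_heads * 3          # 12 layers, assumed Q/K/V per head
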
Example 6
    def __init__(self, config, mode, **kwargs):
        super(Encoder, self).__init__(**kwargs)

        def _get_params(varname, default_value=None):
            value = config[varname] if varname in config else default_value
            default = "" if varname in config else " (default)"
            logging.info("%s = %s%s", varname, value, default)
            setattr(self, varname, value)

        _get_params("projection_bottleneck_size")
        _get_params("qrnn_state_size")
        _get_params("qrnn_kernel_width", 3)
        _get_params("qrnn_zoneout_probability")
        _get_params("number_qrnn_layers")
        _get_params("labels", [])
        _get_params("regularizer_scale")
        _get_params("quantize")

        self.num_classes = len(self.labels)
        self.parameters = base_layers.Parameters(
            mode,
            quantize=self.quantize,
            regularizer_scale=self.regularizer_scale)

        self.bottleneck_layer = dense_layers.BaseQDenseVarLen(
            units=self.projection_bottleneck_size,
            rank=3,
            parameters=self.parameters)

        self.qrnn_stack = qrnn_layers.QRNNBidirectionalStack(
            parameters=self.parameters,
            zoneout_probability=self.qrnn_zoneout_probability,
            kwidth=self.qrnn_kernel_width,
            state_size=self.qrnn_state_size,
            num_layers=self.number_qrnn_layers)

        self.attention_pool = misc_layers.AttentionPooling(
            parameters=self.parameters)

        if self.num_classes:
            self.final_fc = dense_layers.BaseQDense(units=self.num_classes,
                                                    rank=2,
                                                    parameters=self.parameters,
                                                    activation=None)
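
As with the earlier Encoder, a hedged config sketch for this constructor; values are illustrative assumptions. qrnn_kernel_width and labels have defaults above; the remaining keys have none and would otherwise end up as None:

# Illustrative config for the QRNN encoder.
config = {
    "projection_bottleneck_size": 64,
    "qrnn_state_size": 128,
    "qrnn_kernel_width": 3,
    "qrnn_zoneout_probability": 0.5,
    "number_qrnn_layers": 3,
    "labels": ["negative", "positive"],
    "regularizer_scale": 1e-4,
    "quantize": True,
}
encoder = Encoder(config, mode)  # mode: whatever base_layers.Parameters expects
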
Example 7
 def __init__(self,
              model_dimension,
              max_time_step,
              attention_dropout_rate=0.0,
              beam_size=1,
              **kwargs):
     self.model_dimension = model_dimension
     self.max_time_step = max_time_step
     self.beam_size = beam_size
     self.causal_mask = tf.expand_dims(
         tf.linalg.band_part(tf.ones([max_time_step, max_time_step]), -1,
                             0), 0)
     self.dense_layers = dense_layers.BaseQDenseVarLen(
         units=model_dimension,
         activation=None,
         normalize=False,
         bias=False,
         rank=3,
         **kwargs)
     self.qoutput = quantization_layers.ActivationQuantization(**kwargs)
     super(DecoderUniformAttention, self).__init__(**kwargs)
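
The causal_mask built above is just a lower-triangular matrix of ones with a leading batch dimension; a standalone check with plain TensorFlow, using max_time_step=4 for illustration:

import tensorflow as tf

max_time_step = 4  # illustrative
mask = tf.expand_dims(
    tf.linalg.band_part(tf.ones([max_time_step, max_time_step]), -1, 0), 0)
print(mask[0])
# [[1. 0. 0. 0.]
#  [1. 1. 0. 0.]
#  [1. 1. 1. 0.]
#  [1. 1. 1. 1.]]
# Row t keeps only positions <= t, so each decoding step can attend to itself
# and to earlier steps but never to future ones.
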
Example 8
 def __init__(self,
              model_dimension,
              num_heads,
              attention_dropout_rate=0.0,
              cached_kv=False,
              **kwargs):
     self.model_dimension = model_dimension
     self.num_heads = num_heads
     self.filters = model_dimension // num_heads
     self.cached_kv = cached_kv
     self.q_dense_layers = dense_layers.BaseQDense(units=model_dimension,
                                                   activation=None,
                                                   normalize=False,
                                                   bias=False,
                                                   **kwargs)
     self.kv_dense_layers = dense_layers.BaseQDenseVarLen(
         units=model_dimension * 2, activation=None, **kwargs)
     self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
     self.attention_dropout_rate = attention_dropout_rate
     self.qconcat = quantization_layers.ConcatQuantization(axis=1, **kwargs)
     super(DecoderMultiheadAttention, self).__init__(**kwargs)
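
Note that kv_dense_layers projects to model_dimension * 2 units, which is presumably split into a key half and a value half in the call method (not shown here). A hedged sketch of that split with plain TensorFlow and illustrative shapes:

import tensorflow as tf

model_dimension = 128                                # illustrative
kv = tf.random.normal([2, 10, model_dimension * 2])  # [batch, time, 2 * dim]
k, v = tf.split(kv, num_or_size_splits=2, axis=-1)   # assumed key/value halves
print(k.shape, v.shape)                              # (2, 10, 128) (2, 10, 128)
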
Example 9
    def __init__(self, config, mode):

        super(Model, self).__init__()

        def _get_params(varname, default_value=None):
            value = config[varname] if varname in config else default_value
            default = "" if varname in config else " (default)"
            logging.info("%s = %s%s", varname, value, default)
            setattr(self, varname, value)

        _get_params("intermediate_size")
        _get_params("max_dec_time_step")
        _get_params("max_enc_time_step")
        _get_params("embedding_size")
        _get_params("vocabulary_size")
        _get_params("num_layers")
        _get_params("labels")
        _get_params("regularizer_scale")
        _get_params("num_heads")
        _get_params("model_dimension")
        _get_params("beam_size", 1)
        _get_params("quantize", True)
        _get_params("cached_kv", False)
        _get_params("attention_dropout_rate", 0.0)
        _get_params("activation_dropout_rate", 0.0)
        # If set, a separate dense layer is used to generate the logits instead of
        # re-using the input embedding table.
        _get_params("use_output_layer", False)
        self.parameters = base_layers.Parameters(mode, self.quantize,
                                                 self.regularizer_scale)
        # Activation/Normalization enabled on input bottleneck as there is no
        # temporal information.
        self.input_bottleneck = dense_layers.BaseQDenseVarLen(
            self.model_dimension, rank=3, parameters=self.parameters)
        self.output_bottleneck = dense_layers.BaseQDense(
            self.embedding_size,
            normalize=False,
            activation=None,
            bias=False,
            parameters=self.parameters)

        self.embedding = embedding_layers.EmbeddingFullyConnected(
            shape=[self.vocabulary_size, self.embedding_size],
            initializer=tf.random_uniform_initializer(-math.sqrt(3),
                                                      math.sqrt(3)),
            parameters=self.parameters)
        if self.use_output_layer:
            self.output_layer = dense_layers.BaseQDense(
                self.vocabulary_size,
                activation=None,
                normalize=False,
                bias=False,
                parameters=self.parameters)
        self.positional_embedding = embedding_layers.EmbeddingLayer(
            shape=[self.max_dec_time_step, self.model_dimension],
            initializer=tf.random_uniform_initializer(-math.sqrt(3),
                                                      math.sqrt(3)),
            parameters=self.parameters)
        self.ln = normalization_layers.LayerNormalization(
            parameters=self.parameters)
        self.qact = quantization_layers.ActivationQuantization(
            parameters=self.parameters)
        # Scales the weights for computing logits.
        self.logits_fc_weights_scale_factor = None
        self.logits_fc_bias = self.add_weight(
            "logits_fc_bias",
            shape=[self.vocabulary_size],
            initializer=tf.constant_initializer(0),
            dtype="float32")
        # Optional bias which can be used to mask logits output.
        self.output_bias = None
        self.transformer_uniform_attn_decoder = TransformerUniformAttnDecoderStack(
            parameters=self.parameters,
            num_layers=self.num_layers,
            intermediate_size=self.intermediate_size,
            embedding_size=self.embedding_size,
            max_time_step=self.max_dec_time_step,
            num_heads=self.num_heads,
            model_dimension=self.model_dimension,
            vocabulary_size=self.vocabulary_size,
            beam_size=self.beam_size,
            cached_kv=self.cached_kv,
            attention_dropout_rate=self.attention_dropout_rate,
            activation_dropout_rate=self.activation_dropout_rate)
        # Beam search output.
        self.finished_seq = None
        self.finished_scores = None
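
A hedged config sketch for this decoder model, covering the keys read above. Values are illustrative assumptions; keys with defaults (beam_size, quantize, cached_kv, the dropout rates, use_output_layer) can be omitted, and the rest have no default and would otherwise be None:

# Illustrative config; values are assumptions, not defaults from the source.
config = {
    "intermediate_size": 512,
    "max_dec_time_step": 64,
    "max_enc_time_step": 128,
    "embedding_size": 64,
    "vocabulary_size": 8000,
    "num_layers": 2,
    "labels": [],
    "regularizer_scale": 1e-4,
    "num_heads": 4,
    "model_dimension": 128,
    "use_output_layer": False,
}
model = Model(config, mode)  # mode: whatever base_layers.Parameters expects
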
Example 10
    def __init__(self, config, mode, **kwargs):
        super(Encoder, self).__init__(**kwargs)

        def _get_params(varname, default_value=None):
            value = config.get(varname, default_value)
            default = "" if varname in config else " (default)"
            logging.info("%s = %s%s", varname, value, default)
            setattr(self, varname, value)

        _get_params("feature_size")
        _get_params("bottleneck_size", self.feature_size)
        _get_params("qrnn_state_size")
        _get_params("qrnn_kernel_width", 3)
        _get_params("qrnn_zoneout_probability")
        _get_params("number_qrnn_layers")
        _get_params("labels", [])
        _get_params("regularizer_scale")
        _get_params("quantize")
        _get_params("gbst_max_token_len", 128)
        _get_params("gbst_downsample_rate", 1)
        _get_params("gbst_max_subword_block_width", 4)
        _get_params("gbst_conv_kernel_size", 5)
        _get_params("gbst_block_mixing_mode")
        _get_params("gbst_add_block_pos_embed", False)
        _get_params("attn_pool_output", True)

        self.num_classes = len(config.get("labels", []))

        self.parameters = base_layers.Parameters(
            mode,
            quantize=self.quantize,
            regularizer_scale=self.regularizer_scale)
        # Including 3 additional special token ids (0=padding, 1=EOS, 2=UNK).
        self.vocabulary_size = 259
        self.embedding = embedding_layers.EmbeddingLayer(
            shape=[self.vocabulary_size, self.feature_size],
            parameters=self.parameters)

        self.bottleneck_layer = dense_layers.BaseQDenseVarLen(
            units=self.bottleneck_size, rank=3, parameters=self.parameters)

        self.gbst_layer = misc_layers.GBSTLayerV2(
            feature_size=self.bottleneck_size,
            max_seq_len=self.gbst_max_token_len,
            downsample_rate=self.gbst_downsample_rate,
            max_subword_block_width=self.gbst_max_subword_block_width,
            conv_kernel_size=self.gbst_conv_kernel_size,
            block_mixing_mode=self.gbst_block_mixing_mode,
            add_block_pos_embed=self.gbst_add_block_pos_embed,
            parameters=self.parameters)

        self.qrnn_stack = qrnn_layers.QRNNBidirectionalStack(
            parameters=self.parameters,
            zoneout_probability=self.qrnn_zoneout_probability,
            kwidth=self.qrnn_kernel_width,
            state_size=self.qrnn_state_size,
            num_layers=self.number_qrnn_layers)
        self.attention_pool = misc_layers.AttentionPooling(
            parameters=self.parameters)

        if self.num_classes:
            self.final_fc = dense_layers.BaseQDense(units=self.num_classes,
                                                    rank=2,
                                                    parameters=self.parameters,
                                                    activation=None)
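
A hedged config sketch for this byte-level encoder; values are illustrative assumptions. The gbst_* keys left out fall back to the defaults above, while feature_size, qrnn_state_size, qrnn_zoneout_probability, number_qrnn_layers, regularizer_scale, quantize and gbst_block_mixing_mode have no default:

# Illustrative config; omitted keys use the defaults listed in _get_params.
config = {
    "feature_size": 128,
    "qrnn_state_size": 128,
    "qrnn_zoneout_probability": 0.5,
    "number_qrnn_layers": 2,
    "labels": ["negative", "positive"],
    "regularizer_scale": 1e-4,
    "quantize": True,
    "gbst_downsample_rate": 2,
    "gbst_block_mixing_mode": None,
}
encoder = Encoder(config, mode)  # mode: whatever base_layers.Parameters expects
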
Example 11
  def __init__(self, config, mode, **kwargs):
    super(Encoder, self).__init__(**kwargs)

    def _get_params(varname, default_value=None):
      value = config[varname] if varname in config else default_value
      default = "" if varname in config else " (default)"
      logging.info("%s = %s%s", varname, value, default)
      setattr(self, varname, value)

    _get_params("labels", [])
    _get_params("regularizer_scale")
    _get_params("quantize")
    _get_params("feature_size")
    _get_params("bottleneck_size")

    self.max_seq_len = config.get("max_seq_len", 128)
    self.gbst_max_token_len = config.get("gbst_max_token_len", 128)
    # Including 3 additional special token ids (0=padding, 1=EOS, 2=UNK).
    self.vocabulary_size = config.get("vocabulary_size", 259)
    self.parameters = base_layers.Parameters(
        mode, quantize=self.quantize, regularizer_scale=self.regularizer_scale)

    self.embedding = embedding_layers.EmbeddingLayer(
        shape=[self.vocabulary_size, self.feature_size],
        parameters=self.parameters)
    self.gbst_downsample_rate = config.get("gbst_downsample_rate", 1)
    self.positional_embedding = embedding_layers.EmbeddingLayer(
        shape=[self.gbst_max_token_len, self.feature_size],
        parameters=self.parameters)
    self.ln = normalization_layers.LayerNormalization(
        parameters=self.parameters)
    self.qact = quantization_layers.ActivationQuantization(
        parameters=self.parameters)

    self.bottleneck_layer = None
    gbst_size = self.feature_size
    if self.bottleneck_size != self.feature_size:
      self.bottleneck_layer = dense_layers.BaseQDenseVarLen(
          self.bottleneck_size,
          rank=3,
          normalize=False,
          activation=None,
          parameters=self.parameters)
      gbst_size = self.bottleneck_size

    self.gbst_max_subword_block_width = config.get(
        "gbst_max_subword_block_width", 5)
    self.gbst_conv_kernel_size = config.get("gbst_conv_kernel_size", 5)
    self.gbst_block_mixing_mode = config.get("gbst_block_mixing_mode", None)
    self.gbst_layer = misc_layers.GBSTLayerV2(
        feature_size=gbst_size,
        max_seq_len=self.gbst_max_token_len,
        downsample_rate=self.gbst_downsample_rate,
        max_subword_block_width=self.gbst_max_subword_block_width,
        conv_kernel_size=self.gbst_conv_kernel_size,
        block_mixing_mode=self.gbst_block_mixing_mode,
        parameters=self.parameters)

    self.pool_windows = config.get("pool_windows", None)
    if self.pool_windows:
      self.transformer_encoder_layer = transformer_encoder.FunnelTransformerModel(
          config, mode)
    else:
      self.transformer_encoder_layer = transformer_encoder.ModelWithEmbeddings(
          config, mode)
    self.attention_pool = misc_layers.AttentionPooling(
        parameters=self.parameters)
    self.num_classes = len(self.labels)
    if self.num_classes:
      self.final_fc = dense_layers.BaseQDense(
          units=self.num_classes,
          rank=2,
          parameters=self.parameters,
          activation=None)
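
Finally, a hedged config sketch for this variant. The keys read through _get_params without a default (regularizer_scale, quantize, feature_size, bottleneck_size) end up as None when missing, which the later layer constructions would not survive, so in practice they need to be set; the config.get keys all have usable defaults, and the nested transformer_encoder model presumably reads further keys of its own that are not shown here. The bottleneck projection is only built when bottleneck_size differs from feature_size:

# Illustrative config; values are assumptions, not defaults from the source.
config = {
    "labels": ["negative", "positive"],
    "regularizer_scale": 1e-4,
    "quantize": True,
    "feature_size": 128,
    "bottleneck_size": 64,   # != feature_size, so the bottleneck layer is built
    "gbst_downsample_rate": 2,
}
encoder = Encoder(config, mode)  # mode: whatever base_layers.Parameters expects
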