def __init__(self,
             model_dimension,
             num_heads,
             intermediate_size,
             initializer_stddev=0.02,
             activation_dropout_rate=0.0,
             attention_dropout_rate=0.0,
             **kwargs):
  super(FunnelTransformerEncoder, self).__init__(**kwargs)
  self.model_dimension = model_dimension
  self.parameters.initializer = tf.keras.initializers.TruncatedNormal(
      stddev=initializer_stddev)
  self.self_attn = FunnelAttention(
      model_dimension,
      num_heads,
      attention_dropout_rate=attention_dropout_rate,
      parameters=self.parameters)
  self.prx = dense_layers.BaseQDenseVarLen(
      model_dimension, activation=None, parameters=self.parameters)
  # Position-wise feed-forward: up-project to intermediate_size, then back
  # down to model_dimension.
  self.upprx = dense_layers.BaseQDenseVarLen(
      intermediate_size, parameters=self.parameters)
  self.downprx = dense_layers.BaseQDenseVarLen(
      model_dimension, activation=None, parameters=self.parameters)
  self.activation_dropout_rate = activation_dropout_rate
  # Layer normalization and activation quantization for the two sub-blocks.
  self.ln1 = normalization_layers.LayerNormalization(**kwargs)
  self.ln2 = normalization_layers.LayerNormalization(**kwargs)
  self.q1 = quantization_layers.ActivationQuantization(**kwargs)
  self.q2 = quantization_layers.ActivationQuantization(**kwargs)
def __init__(self, config, mode):
  super(Encoder, self).__init__()

  def _get_params(varname, default_value=None):
    value = config[varname] if varname in config else default_value
    default = "" if varname in config else " (default)"
    logging.info("%s = %s%s", varname, value, default)
    setattr(self, varname, value)

  _get_params("labels")
  _get_params("quantize", True)
  _get_params("embedding_regularizer_scale", 35e-3)
  _get_params("embedding_size", 64)
  _get_params("unigram_channels", 0)
  _get_params("bigram_channels", 0)
  _get_params("trigram_channels", 0)
  _get_params("fourgram_channels", 0)
  _get_params("fivegram_channels", 0)
  _get_params("skip1bigram_channels", 0)
  _get_params("skip2bigram_channels", 0)
  _get_params("network_regularizer_scale", 1e-4)
  _get_params("keep_prob", 0.5)
  self.num_classes = len(self.labels)
  self.parameters = base_layers.Parameters(
      mode,
      quantize=self.quantize,
      regularizer_scale=self.embedding_regularizer_scale)
  self.values_fc = dense_layers.BaseQDenseVarLen(
      units=self.embedding_size, rank=3, parameters=self.parameters)
  self.attention_fc = dense_layers.BaseQDenseVarLen(
      units=self.embedding_size, rank=3, parameters=self.parameters)
  self.dropout = tf.keras.layers.Dropout(rate=(1 - self.keep_prob))
  # Switch to the network regularizer scale for the remaining layers.
  self.parameters = copy.copy(self.parameters)
  self.parameters.regularizer_scale = self.network_regularizer_scale
  self.attention_pool_layers = []
  self._add_attention_pool_layer(self.unigram_channels, 1)
  self._add_attention_pool_layer(self.bigram_channels, 2)
  self._add_attention_pool_layer(self.trigram_channels, 3)
  self._add_attention_pool_layer(self.fourgram_channels, 4)
  self._add_attention_pool_layer(self.fivegram_channels, 5)
  self._add_attention_pool_layer(self.skip1bigram_channels, None, 1)
  self._add_attention_pool_layer(self.skip2bigram_channels, None, 2)
  self.concat_quantizer = quantization_layers.ConcatQuantization(
      axis=1, parameters=self.parameters)
  self.final_fc = dense_layers.BaseQDense(
      units=self.num_classes,
      rank=2,
      parameters=self.parameters,
      activation=None)
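# Illustrative usage sketch (not from the original source): the config keys
# below mirror the _get_params calls above, with made-up values for a
# three-class classifier; `mode` is assumed to be one of the mode constants
# defined in base_layers.
#
#   config = {
#       "labels": ["negative", "neutral", "positive"],
#       "quantize": True,
#       "embedding_size": 64,
#       "unigram_channels": 64,
#       "bigram_channels": 64,
#       "trigram_channels": 64,
#       "keep_prob": 0.5,
#   }
#   encoder = Encoder(config, mode)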
def __init__(self,
             model_dimension,
             num_heads,
             attention_dropout_rate=0.0,
             **kwargs):
  self.model_dimension = model_dimension
  self.num_heads = num_heads
  self.filters = model_dimension // num_heads
  # Query projection and a fused key/value projection.
  self.q_dense_layer = dense_layers.BaseQDenseVarLen(
      units=model_dimension, activation=None, **kwargs)
  self.kv_dense_layer = dense_layers.BaseQDenseVarLen(
      units=model_dimension * 2, activation=None, **kwargs)
  self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
  self.attention_dropout_rate = attention_dropout_rate
  self.qconcat = quantization_layers.ConcatQuantization(axis=1, **kwargs)
  super(FunnelAttention, self).__init__(**kwargs)
def __init__(self, scalar=True, **kwargs):
  self.scalar = scalar
  # Attention logits should not have an activation after the linear layer so
  # that they can be positive or negative, which lets the attention
  # distribution be anything the network likes. A relu activation biases the
  # attention distribution towards uniform; leaving it out gives better
  # results for attention pooling. Though some outputs are emphasized for the
  # classification decision, all other outputs keep a non-zero probability of
  # influencing the class, which seems to result in better backprop.
  self.attention = dense_layers.BaseQDenseVarLen(units=1, rank=3, **kwargs)
  self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
  super(AttentionPooling, self).__init__(**kwargs)
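# Worked example (illustrative, not from the original source) of why a relu on
# the attention logits flattens the pooling distribution: clipping negative
# logits to zero pushes the softmax towards uniform.
#
#   softmax([2.0, -1.0, -3.0])        ~= [0.947, 0.047, 0.006]   (linear logits)
#   softmax(relu([2.0, -1.0, -3.0]))  =  softmax([2.0, 0.0, 0.0])
#                                     ~= [0.787, 0.107, 0.107]   (relu logits)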
def __init__(self,
             model_dimension,
             num_heads,
             attention_dropout_rate=0.0,
             **kwargs):
  self.model_dimension = model_dimension
  self.num_heads = num_heads
  self.filters = model_dimension // num_heads
  # One projection of size `filters` per head for each of query, key and value.
  self.dense_layers = [
      dense_layers.BaseQDenseVarLen(
          units=self.filters, activation=None, **kwargs)
      for _ in range(num_heads * 3)
  ]
  self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
  self.attention_dropout_rate = attention_dropout_rate
  self.qconcat = quantization_layers.ConcatQuantization(axis=2, **kwargs)
  super(SelfAttention, self).__init__(**kwargs)
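# Shape sketch (illustrative): with model_dimension=128 and num_heads=4, each
# head projects to filters = 128 // 4 = 32, and the list above holds
# 4 * 3 = 12 projections, i.e. a query, key and value projection per head.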
def __init__(self, config, mode, **kwargs): super(Encoder, self).__init__(**kwargs) def _get_params(varname, default_value=None): value = config[varname] if varname in config else default_value default = "" if varname in config else " (default)" logging.info("%s = %s%s", varname, value, default) setattr(self, varname, value) _get_params("projection_bottleneck_size") _get_params("qrnn_state_size") _get_params("qrnn_kernel_width", 3) _get_params("qrnn_zoneout_probability") _get_params("number_qrnn_layers") _get_params("labels", []) _get_params("regularizer_scale") _get_params("quantize") self.num_classes = len(self.labels) self.parameters = base_layers.Parameters( mode, quantize=self.quantize, regularizer_scale=self.regularizer_scale) self.bottleneck_layer = dense_layers.BaseQDenseVarLen( units=self.projection_bottleneck_size, rank=3, parameters=self.parameters) self.qrnn_stack = qrnn_layers.QRNNBidirectionalStack( parameters=self.parameters, zoneout_probability=self.qrnn_zoneout_probability, kwidth=self.qrnn_kernel_width, state_size=self.qrnn_state_size, num_layers=self.number_qrnn_layers) self.attention_pool = misc_layers.AttentionPooling( parameters=self.parameters) if self.num_classes: self.final_fc = dense_layers.BaseQDense(units=self.num_classes, rank=2, parameters=self.parameters, activation=None)
def __init__(self,
             model_dimension,
             max_time_step,
             attention_dropout_rate=0.0,
             beam_size=1,
             **kwargs):
  self.model_dimension = model_dimension
  self.max_time_step = max_time_step
  self.beam_size = beam_size
  # Lower-triangular causal mask of shape [1, max_time_step, max_time_step].
  self.causal_mask = tf.expand_dims(
      tf.linalg.band_part(tf.ones([max_time_step, max_time_step]), -1, 0), 0)
  self.dense_layers = dense_layers.BaseQDenseVarLen(
      units=model_dimension,
      activation=None,
      normalize=False,
      bias=False,
      rank=3,
      **kwargs)
  self.qoutput = quantization_layers.ActivationQuantization(**kwargs)
  super(DecoderUniformAttention, self).__init__(**kwargs)
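# Worked example (illustrative) of the causal mask for max_time_step=3:
# tf.linalg.band_part(tf.ones([3, 3]), -1, 0) keeps only the lower triangle,
#
#   [[1., 0., 0.],
#    [1., 1., 0.],
#    [1., 1., 1.]]
#
# and tf.expand_dims(..., 0) adds a leading batch dimension, giving a mask of
# shape [1, 3, 3] so that position t can only attend to positions <= t.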
def __init__(self,
             model_dimension,
             num_heads,
             attention_dropout_rate=0.0,
             cached_kv=False,
             **kwargs):
  self.model_dimension = model_dimension
  self.num_heads = num_heads
  self.filters = model_dimension // num_heads
  self.cached_kv = cached_kv
  # Query projection and a fused key/value projection.
  self.q_dense_layers = dense_layers.BaseQDense(
      units=model_dimension,
      activation=None,
      normalize=False,
      bias=False,
      **kwargs)
  self.kv_dense_layers = dense_layers.BaseQDenseVarLen(
      units=model_dimension * 2, activation=None, **kwargs)
  self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
  self.attention_dropout_rate = attention_dropout_rate
  self.qconcat = quantization_layers.ConcatQuantization(axis=1, **kwargs)
  super(DecoderMultiheadAttention, self).__init__(**kwargs)
def __init__(self, config, mode):
  super(Model, self).__init__()

  def _get_params(varname, default_value=None):
    value = config[varname] if varname in config else default_value
    default = "" if varname in config else " (default)"
    logging.info("%s = %s%s", varname, value, default)
    setattr(self, varname, value)

  _get_params("intermediate_size")
  _get_params("max_dec_time_step")
  _get_params("max_enc_time_step")
  _get_params("embedding_size")
  _get_params("vocabulary_size")
  _get_params("num_layers")
  _get_params("labels")
  _get_params("regularizer_scale")
  _get_params("num_heads")
  _get_params("model_dimension")
  _get_params("beam_size", 1)
  _get_params("quantize", True)
  _get_params("cached_kv", False)
  _get_params("attention_dropout_rate", 0.0)
  _get_params("activation_dropout_rate", 0.0)
  # If set, a separate dense layer is used to generate the logits instead of
  # re-using the input embedding table.
  _get_params("use_output_layer", False)
  self.parameters = base_layers.Parameters(mode, self.quantize,
                                           self.regularizer_scale)
  # Activation/Normalization enabled on input bottleneck as there is no
  # temporal information.
  self.input_bottleneck = dense_layers.BaseQDenseVarLen(
      self.model_dimension, rank=3, parameters=self.parameters)
  self.output_bottleneck = dense_layers.BaseQDense(
      self.embedding_size,
      normalize=False,
      activation=None,
      bias=False,
      parameters=self.parameters)
  self.embedding = embedding_layers.EmbeddingFullyConnected(
      shape=[self.vocabulary_size, self.embedding_size],
      initializer=tf.random_uniform_initializer(-math.sqrt(3), math.sqrt(3)),
      parameters=self.parameters)
  if self.use_output_layer:
    self.output_layer = dense_layers.BaseQDense(
        self.vocabulary_size,
        activation=None,
        normalize=False,
        bias=False,
        parameters=self.parameters)
  self.positional_embedding = embedding_layers.EmbeddingLayer(
      shape=[self.max_dec_time_step, self.model_dimension],
      initializer=tf.random_uniform_initializer(-math.sqrt(3), math.sqrt(3)),
      parameters=self.parameters)
  self.ln = normalization_layers.LayerNormalization(
      parameters=self.parameters)
  self.qact = quantization_layers.ActivationQuantization(
      parameters=self.parameters)
  # Scales the weights for computing logits.
  self.logits_fc_weights_scale_factor = None
  self.logits_fc_bias = self.add_weight(
      "logits_fc_bias",
      shape=[self.vocabulary_size],
      initializer=tf.constant_initializer(0),
      dtype="float32")
  # Optional bias which can be used to mask logits output.
  self.output_bias = None
  self.transformer_uniform_attn_decoder = TransformerUniformAttnDecoderStack(
      parameters=self.parameters,
      num_layers=self.num_layers,
      intermediate_size=self.intermediate_size,
      embedding_size=self.embedding_size,
      max_time_step=self.max_dec_time_step,
      num_heads=self.num_heads,
      model_dimension=self.model_dimension,
      vocabulary_size=self.vocabulary_size,
      beam_size=self.beam_size,
      cached_kv=self.cached_kv,
      attention_dropout_rate=self.attention_dropout_rate,
      activation_dropout_rate=self.activation_dropout_rate)
  # Beam search output.
  self.finished_seq = None
  self.finished_scores = None
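# Illustrative usage sketch (values are made up): a minimal config covering the
# keys read above; `mode` is assumed to be a mode constant from base_layers.
#
#   config = {
#       "intermediate_size": 512,
#       "max_dec_time_step": 64,
#       "max_enc_time_step": 64,
#       "embedding_size": 128,
#       "vocabulary_size": 8000,
#       "num_layers": 2,
#       "labels": [],
#       "regularizer_scale": 1e-4,
#       "num_heads": 4,
#       "model_dimension": 128,
#   }
#   model = Model(config, mode)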
def __init__(self, config, mode, **kwargs):
  super(Encoder, self).__init__(**kwargs)

  def _get_params(varname, default_value=None):
    value = config.get(varname, default_value)
    default = "" if varname in config else " (default)"
    logging.info("%s = %s%s", varname, value, default)
    setattr(self, varname, value)

  _get_params("feature_size")
  _get_params("bottleneck_size", self.feature_size)
  _get_params("qrnn_state_size")
  _get_params("qrnn_kernel_width", 3)
  _get_params("qrnn_zoneout_probability")
  _get_params("number_qrnn_layers")
  _get_params("labels", [])
  _get_params("regularizer_scale")
  _get_params("quantize")
  _get_params("gbst_max_token_len", 128)
  _get_params("gbst_downsample_rate", 1)
  _get_params("gbst_max_subword_block_width", 4)
  _get_params("gbst_conv_kernel_size", 5)
  _get_params("gbst_block_mixing_mode")
  _get_params("gbst_add_block_pos_embed", False)
  _get_params("attn_pool_output", True)
  self.num_classes = len(config.get("labels", []))
  self.parameters = base_layers.Parameters(
      mode, quantize=self.quantize, regularizer_scale=self.regularizer_scale)
  # Including 3 additional special token ids (0=padding, 1=EOS, 2=UNK).
  self.vocabulary_size = 259
  self.embedding = embedding_layers.EmbeddingLayer(
      shape=[self.vocabulary_size, self.feature_size],
      parameters=self.parameters)
  self.bottleneck_layer = dense_layers.BaseQDenseVarLen(
      units=self.bottleneck_size, rank=3, parameters=self.parameters)
  self.gbst_layer = misc_layers.GBSTLayerV2(
      feature_size=self.bottleneck_size,
      max_seq_len=self.gbst_max_token_len,
      downsample_rate=self.gbst_downsample_rate,
      max_subword_block_width=self.gbst_max_subword_block_width,
      conv_kernel_size=self.gbst_conv_kernel_size,
      block_mixing_mode=self.gbst_block_mixing_mode,
      add_block_pos_embed=self.gbst_add_block_pos_embed,
      parameters=self.parameters)
  self.qrnn_stack = qrnn_layers.QRNNBidirectionalStack(
      parameters=self.parameters,
      zoneout_probability=self.qrnn_zoneout_probability,
      kwidth=self.qrnn_kernel_width,
      state_size=self.qrnn_state_size,
      num_layers=self.number_qrnn_layers)
  self.attention_pool = misc_layers.AttentionPooling(
      parameters=self.parameters)
  if self.num_classes:
    self.final_fc = dense_layers.BaseQDense(
        units=self.num_classes,
        rank=2,
        parameters=self.parameters,
        activation=None)
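# Illustrative usage sketch (values are made up): a byte-level config for this
# GBST + QRNN encoder; only keys read above are shown and `mode` is assumed to
# be a mode constant from base_layers.
#
#   config = {
#       "feature_size": 128,
#       "qrnn_state_size": 128,
#       "qrnn_zoneout_probability": 1e-2,
#       "number_qrnn_layers": 3,
#       "labels": ["negative", "positive"],
#       "regularizer_scale": 1e-4,
#       "quantize": True,
#       "gbst_max_token_len": 128,
#       "gbst_downsample_rate": 2,
#       "gbst_block_mixing_mode": None,
#   }
#   encoder = Encoder(config, mode)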
def __init__(self, config, mode, **kwargs): super(Encoder, self).__init__(**kwargs) def _get_params(varname, default_value=None): value = config[varname] if varname in config else default_value default = "" if varname in config else " (default)" logging.info("%s = %s%s", varname, value, default) setattr(self, varname, value) _get_params("labels", []) _get_params("regularizer_scale") _get_params("quantize") _get_params("feature_size") _get_params("bottleneck_size") self.max_seq_len = config.get("max_seq_len", 128) self.gbst_max_token_len = config.get("gbst_max_token_len", 128) # Including 3 additional special token ids (0=padding, 1=EOS, 2=UNK). self.vocabulary_size = config.get("vocabulary_size", 259) self.parameters = base_layers.Parameters( mode, quantize=self.quantize, regularizer_scale=self.regularizer_scale) self.embedding = embedding_layers.EmbeddingLayer( shape=[self.vocabulary_size, self.feature_size], parameters=self.parameters) self.gbst_downsample_rate = config.get("gbst_downsample_rate", 1) self.positional_embedding = embedding_layers.EmbeddingLayer( shape=[self.gbst_max_token_len, self.feature_size], parameters=self.parameters) self.ln = normalization_layers.LayerNormalization( parameters=self.parameters) self.qact = quantization_layers.ActivationQuantization( parameters=self.parameters) self.bottleneck_layer = None gbst_size = self.feature_size if self.bottleneck_size != self.feature_size: self.bottleneck_layer = dense_layers.BaseQDenseVarLen( self.bottleneck_size, rank=3, normalize=False, activation=None, parameters=self.parameters) gbst_size = self.bottleneck_size self.gbst_max_subword_block_width = config.get( "gbst_max_subword_block_width", 5) self.gbst_conv_kernel_size = config.get("gbst_conv_kernel_size", 5) self.gbst_block_mixing_mode = config.get("gbst_block_mixing_mode", None) self.gbst_layer = misc_layers.GBSTLayerV2( feature_size=gbst_size, max_seq_len=self.gbst_max_token_len, downsample_rate=self.gbst_downsample_rate, max_subword_block_width=self.gbst_max_subword_block_width, conv_kernel_size=self.gbst_conv_kernel_size, block_mixing_mode=self.gbst_block_mixing_mode, parameters=self.parameters) self.pool_windows = config.get("pool_windows", None) if self.pool_windows: self.transformer_encoder_layer = transformer_encoder.FunnelTransformerModel( config, mode) else: self.transformer_encoder_layer = transformer_encoder.ModelWithEmbeddings( config, mode) self.attention_pool = misc_layers.AttentionPooling( parameters=self.parameters) self.num_classes = len(self.labels) if self.num_classes: self.final_fc = dense_layers.BaseQDense( units=self.num_classes, rank=2, parameters=self.parameters, activation=None)