def __init__(self, num_layers, max_time_step, vocabulary_size,
             embedding_size, model_dimension, num_heads, intermediate_size,
             **kwargs):
  """Builds a stack of quantized transformer encoder layers.

  Args:
    num_layers: Number of `TransformerEncoder` layers in the stack.
    max_time_step: Maximum sequence length; sizes the positional table.
    vocabulary_size: Number of rows in the token embedding table.
    embedding_size: Width of both embedding tables.
    model_dimension: Hidden dimension of each encoder layer.
    num_heads: Attention head count per encoder layer.
    intermediate_size: Feed-forward width inside each encoder layer.
    **kwargs: Forwarded to every sub-layer and to the base class.
  """
  self.max_time_step = max_time_step
  self.vocabulary_size = vocabulary_size
  self.embedding_size = embedding_size
  # Pop the dropout rates so they are consumed here and not forwarded
  # a second time through **kwargs below.
  activation_dropout_rate = kwargs.pop('activation_dropout_rate', 0.0)
  attention_dropout_rate = kwargs.pop('attention_dropout_rate', 0.0)
  self.layers = [
      TransformerEncoder(
          model_dimension=model_dimension,
          num_heads=num_heads,
          intermediate_size=intermediate_size,
          activation_dropout_rate=activation_dropout_rate,
          attention_dropout_rate=attention_dropout_rate,
          **kwargs) for _ in range(num_layers)
  ]
  self.embedding = embedding_layers.EmbeddingLayer(
      shape=[self.vocabulary_size, self.embedding_size], **kwargs)
  self.positional_embedding = embedding_layers.EmbeddingLayer(
      shape=[self.max_time_step, self.embedding_size], **kwargs)
  self.ln = normalization_layers.LayerNormalization(**kwargs)
  self.qact = quantization_layers.ActivationQuantization(**kwargs)
  super(TransformerEncoderStack, self).__init__(**kwargs)
def __init__(self, config, mode):
  """Quantized transformer decoder model.

  Args:
    config: Dict of hyperparameters; see the `_get_params` calls below for
      the recognized keys and their defaults.
    mode: Mode constant passed to `base_layers.Parameters` (presumably
      train/eval/predict — confirm against base_layers).
  """
  super(Model, self).__init__()

  def _get_params(varname, default_value=None):
    # Single dict lookup via .get (was `config[varname] if varname in
    # config`), matching the helper used by the encoders in this file.
    value = config.get(varname, default_value)
    default = "" if varname in config else " (default)"
    logging.info("%s = %s%s", varname, value, default)
    setattr(self, varname, value)

  _get_params("intermediate_size")
  _get_params("max_dec_time_step")
  _get_params("max_enc_time_step")
  _get_params("embedding_size")
  _get_params("vocabulary_size")
  _get_params("num_layers")
  _get_params("labels")
  _get_params("regularizer_scale")
  _get_params("num_heads")
  _get_params("model_dimension")
  _get_params("beam_size", 1)
  _get_params("quantize", True)
  _get_params("cached_kv", False)
  _get_params("attention_dropout_rate", 0.0)
  _get_params("activation_dropout_rate", 0.0)
  # If set, a separate dense layer is used to generate the logits instead of
  # re-using the input embedding table.
  _get_params("use_output_layer", False)
  self.parameters = base_layers.Parameters(mode, self.quantize,
                                           self.regularizer_scale)
  # Activation/Normalization enabled on input bottleneck as there is no
  # temporal information.
  self.input_bottleneck = dense_layers.BaseQDenseVarLen(
      self.model_dimension, rank=3, parameters=self.parameters)
  self.output_bottleneck = dense_layers.BaseQDense(
      self.embedding_size,
      normalize=False,
      activation=None,
      bias=False,
      parameters=self.parameters)
  self.embedding = embedding_layers.EmbeddingFullyConnected(
      shape=[self.vocabulary_size, self.embedding_size],
      initializer=tf.random_uniform_initializer(-math.sqrt(3), math.sqrt(3)),
      parameters=self.parameters)
  if self.use_output_layer:
    self.output_layer = dense_layers.BaseQDense(
        self.vocabulary_size,
        activation=None,
        normalize=False,
        bias=False,
        parameters=self.parameters)
  self.positional_embedding = embedding_layers.EmbeddingLayer(
      shape=[self.max_dec_time_step, self.model_dimension],
      initializer=tf.random_uniform_initializer(-math.sqrt(3), math.sqrt(3)),
      parameters=self.parameters)
  self.ln = normalization_layers.LayerNormalization(
      parameters=self.parameters)
  self.qact = quantization_layers.ActivationQuantization(
      parameters=self.parameters)
  # Scales the weights for computing logits.
  self.logits_fc_weights_scale_factor = None
  self.logits_fc_bias = self.add_weight(
      "logits_fc_bias",
      shape=[self.vocabulary_size],
      initializer=tf.constant_initializer(0),
      dtype="float32")
  # Optional bias which can be used to mask logits output.
  self.output_bias = None
  self.transformer_uniform_attn_decoder = TransformerUniformAttnDecoderStack(
      parameters=self.parameters,
      num_layers=self.num_layers,
      intermediate_size=self.intermediate_size,
      embedding_size=self.embedding_size,
      max_time_step=self.max_dec_time_step,
      num_heads=self.num_heads,
      model_dimension=self.model_dimension,
      vocabulary_size=self.vocabulary_size,
      beam_size=self.beam_size,
      cached_kv=self.cached_kv,
      attention_dropout_rate=self.attention_dropout_rate,
      activation_dropout_rate=self.activation_dropout_rate)
  # Beam search output.
  self.finished_seq = None
  self.finished_scores = None
def __init__(self, config, mode, **kwargs):
  """Byte-level GBST + bidirectional-QRNN encoder.

  Args:
    config: Dict of hyperparameters; recognized keys and defaults are
      listed in the `_configure` calls below.
    mode: Mode constant passed to `base_layers.Parameters`.
    **kwargs: Forwarded to the base class.
  """
  super(Encoder, self).__init__(**kwargs)

  def _configure(name, default=None):
    # Read `name` from config (or fall back to `default`), log the choice
    # and store it as an attribute.
    value = config.get(name, default)
    suffix = "" if name in config else " (default)"
    logging.info("%s = %s%s", name, value, suffix)
    setattr(self, name, value)

  _configure("feature_size")
  # The bottleneck width defaults to the embedding feature size, so
  # feature_size must be configured first.
  _configure("bottleneck_size", self.feature_size)
  for name, default in (
      ("qrnn_state_size", None),
      ("qrnn_kernel_width", 3),
      ("qrnn_zoneout_probability", None),
      ("number_qrnn_layers", None),
      ("labels", []),
      ("regularizer_scale", None),
      ("quantize", None),
      ("gbst_max_token_len", 128),
      ("gbst_downsample_rate", 1),
      ("gbst_max_subword_block_width", 4),
      ("gbst_conv_kernel_size", 5),
      ("gbst_block_mixing_mode", None),
      ("gbst_add_block_pos_embed", False),
      ("attn_pool_output", True),
  ):
    _configure(name, default)
  self.num_classes = len(config.get("labels", []))
  self.parameters = base_layers.Parameters(
      mode, quantize=self.quantize, regularizer_scale=self.regularizer_scale)
  # Including 3 additional special token ids (0=padding, 1=EOS, 2=UNK).
  self.vocabulary_size = 259
  self.embedding = embedding_layers.EmbeddingLayer(
      shape=[self.vocabulary_size, self.feature_size],
      parameters=self.parameters)
  self.bottleneck_layer = dense_layers.BaseQDenseVarLen(
      units=self.bottleneck_size, rank=3, parameters=self.parameters)
  self.gbst_layer = misc_layers.GBSTLayerV2(
      feature_size=self.bottleneck_size,
      max_seq_len=self.gbst_max_token_len,
      downsample_rate=self.gbst_downsample_rate,
      max_subword_block_width=self.gbst_max_subword_block_width,
      conv_kernel_size=self.gbst_conv_kernel_size,
      block_mixing_mode=self.gbst_block_mixing_mode,
      add_block_pos_embed=self.gbst_add_block_pos_embed,
      parameters=self.parameters)
  self.qrnn_stack = qrnn_layers.QRNNBidirectionalStack(
      parameters=self.parameters,
      zoneout_probability=self.qrnn_zoneout_probability,
      kwidth=self.qrnn_kernel_width,
      state_size=self.qrnn_state_size,
      num_layers=self.number_qrnn_layers)
  self.attention_pool = misc_layers.AttentionPooling(
      parameters=self.parameters)
  # Classification head is only built when labels are configured.
  if self.num_classes:
    self.final_fc = dense_layers.BaseQDense(
        units=self.num_classes,
        rank=2,
        parameters=self.parameters,
        activation=None)
def __init__(self, config, mode, **kwargs):
  """Byte-level GBST + transformer encoder.

  Args:
    config: Dict of hyperparameters; recognized keys and defaults appear in
      the `_get_params` and `config.get` calls below.
    mode: Mode constant passed to `base_layers.Parameters`.
    **kwargs: Forwarded to the base class.
  """
  super(Encoder, self).__init__(**kwargs)

  def _get_params(varname, default_value=None):
    # Single dict lookup via .get (was `config[varname] if varname in
    # config`), consistent with the direct config.get calls below.
    value = config.get(varname, default_value)
    default = "" if varname in config else " (default)"
    logging.info("%s = %s%s", varname, value, default)
    setattr(self, varname, value)

  _get_params("labels", [])
  _get_params("regularizer_scale")
  _get_params("quantize")
  _get_params("feature_size")
  _get_params("bottleneck_size")
  # NOTE(review): the parameters read directly via config.get below bypass
  # the helper and therefore are not logged like the ones above.
  self.max_seq_len = config.get("max_seq_len", 128)
  self.gbst_max_token_len = config.get("gbst_max_token_len", 128)
  # Including 3 additional special token ids (0=padding, 1=EOS, 2=UNK).
  self.vocabulary_size = config.get("vocabulary_size", 259)
  self.parameters = base_layers.Parameters(
      mode, quantize=self.quantize, regularizer_scale=self.regularizer_scale)
  self.embedding = embedding_layers.EmbeddingLayer(
      shape=[self.vocabulary_size, self.feature_size],
      parameters=self.parameters)
  self.gbst_downsample_rate = config.get("gbst_downsample_rate", 1)
  self.positional_embedding = embedding_layers.EmbeddingLayer(
      shape=[self.gbst_max_token_len, self.feature_size],
      parameters=self.parameters)
  self.ln = normalization_layers.LayerNormalization(
      parameters=self.parameters)
  self.qact = quantization_layers.ActivationQuantization(
      parameters=self.parameters)
  # A bottleneck projection is only needed when GBST runs at a different
  # width than the embedding table.
  self.bottleneck_layer = None
  gbst_size = self.feature_size
  if self.bottleneck_size != self.feature_size:
    self.bottleneck_layer = dense_layers.BaseQDenseVarLen(
        self.bottleneck_size,
        rank=3,
        normalize=False,
        activation=None,
        parameters=self.parameters)
    gbst_size = self.bottleneck_size
  self.gbst_max_subword_block_width = config.get(
      "gbst_max_subword_block_width", 5)
  self.gbst_conv_kernel_size = config.get("gbst_conv_kernel_size", 5)
  self.gbst_block_mixing_mode = config.get("gbst_block_mixing_mode", None)
  self.gbst_layer = misc_layers.GBSTLayerV2(
      feature_size=gbst_size,
      max_seq_len=self.gbst_max_token_len,
      downsample_rate=self.gbst_downsample_rate,
      max_subword_block_width=self.gbst_max_subword_block_width,
      conv_kernel_size=self.gbst_conv_kernel_size,
      block_mixing_mode=self.gbst_block_mixing_mode,
      parameters=self.parameters)
  self.pool_windows = config.get("pool_windows", None)
  if self.pool_windows:
    self.transformer_encoder_layer = transformer_encoder.FunnelTransformerModel(
        config, mode)
  else:
    self.transformer_encoder_layer = transformer_encoder.ModelWithEmbeddings(
        config, mode)
  self.attention_pool = misc_layers.AttentionPooling(
      parameters=self.parameters)
  self.num_classes = len(self.labels)
  # Classification head is only built when labels are configured.
  if self.num_classes:
    self.final_fc = dense_layers.BaseQDense(
        units=self.num_classes,
        rank=2,
        parameters=self.parameters,
        activation=None)