def __init__(self,
             model_dimension,
             num_heads,
             intermediate_size,
             initializer_stddev=0.02,
             activation_dropout_rate=0.0,
             attention_dropout_rate=0.0,
             **kwargs):
  """Funnel transformer encoder block: self-attention plus feed-forward.

  Args:
    model_dimension: Width of the hidden representation.
    num_heads: Number of attention heads.
    intermediate_size: Width of the feed-forward expansion layer.
    initializer_stddev: Stddev of the truncated-normal weight initializer.
    activation_dropout_rate: Dropout rate stored for use in the call path.
    attention_dropout_rate: Dropout rate forwarded to the attention layer.
    **kwargs: Forwarded to the base layer and the norm/quantization layers.
  """
  super(FunnelTransformerEncoder, self).__init__(**kwargs)
  self.model_dimension = model_dimension
  self.activation_dropout_rate = activation_dropout_rate
  # NOTE(review): `self.parameters` is presumably created by the base-class
  # constructor from kwargs — confirm before refactoring.
  self.parameters.initializer = tf.keras.initializers.TruncatedNormal(
      stddev=initializer_stddev)
  self.self_attn = FunnelAttention(
      model_dimension,
      num_heads,
      attention_dropout_rate=attention_dropout_rate,
      parameters=self.parameters)
  # Post-attention projection followed by the up/down feed-forward pair.
  self.prx = dense_layers.BaseQDenseVarLen(
      model_dimension, activation=None, parameters=self.parameters)
  self.upprx = dense_layers.BaseQDenseVarLen(
      intermediate_size, parameters=self.parameters)
  self.downprx = dense_layers.BaseQDenseVarLen(
      model_dimension, activation=None, parameters=self.parameters)
  self.ln1 = normalization_layers.LayerNormalization(**kwargs)
  self.ln2 = normalization_layers.LayerNormalization(**kwargs)
  self.q1 = quantization_layers.ActivationQuantization(**kwargs)
  self.q2 = quantization_layers.ActivationQuantization(**kwargs)
def __init__(self, **kwargs):
  """Embedding layer with an output-quantized fully-connected projection.

  `shape` and `initializer` are popped out of kwargs so the remaining
  kwargs can be shared with the quantization sublayer, then handed back
  explicitly to the parent constructor.
  """
  emb_shape = kwargs.pop("shape", None)
  emb_initializer = kwargs.pop("initializer", None)
  self.qoutput = quantization_layers.ActivationQuantization(**kwargs)
  super(EmbeddingFullyConnected, self).__init__(
      shape=emb_shape, initializer=emb_initializer, **kwargs)
def __init__(self, num_layers, max_time_step, vocabulary_size, embedding_size,
             model_dimension, num_heads, intermediate_size, **kwargs):
  """Stack of transformer encoder blocks with token/position embeddings.

  Args:
    num_layers: Number of TransformerEncoder blocks in the stack.
    max_time_step: Maximum sequence length (positional embedding rows).
    vocabulary_size: Number of rows in the token embedding table.
    embedding_size: Width of both embedding tables.
    model_dimension: Hidden width forwarded to each encoder block.
    num_heads: Attention heads per encoder block.
    intermediate_size: Feed-forward expansion width per block.
    **kwargs: Forwarded to every sublayer and the base layer.
  """
  self.max_time_step = max_time_step
  self.vocabulary_size = vocabulary_size
  self.embedding_size = embedding_size
  # Dropout rates are popped so they are not forwarded to sublayers that
  # do not accept them.
  activation_dropout_rate = kwargs.pop('activation_dropout_rate', 0.0)
  attention_dropout_rate = kwargs.pop('attention_dropout_rate', 0.0)
  self.layers = [
      TransformerEncoder(
          model_dimension=model_dimension,
          num_heads=num_heads,
          intermediate_size=intermediate_size,
          activation_dropout_rate=activation_dropout_rate,
          attention_dropout_rate=attention_dropout_rate,
          **kwargs) for _ in range(num_layers)
  ]
  self.embedding = embedding_layers.EmbeddingLayer(
      shape=[self.vocabulary_size, self.embedding_size], **kwargs)
  self.positional_embedding = embedding_layers.EmbeddingLayer(
      shape=[self.max_time_step, self.embedding_size], **kwargs)
  self.ln = normalization_layers.LayerNormalization(**kwargs)
  self.qact = quantization_layers.ActivationQuantization(**kwargs)
  super(TransformerEncoderStack, self).__init__(**kwargs)
def __init__(self,
             model_dimension,
             max_time_step,
             num_heads,
             intermediate_size,
             activation_dropout_rate=0.0,
             attention_dropout_rate=0.0,
             beam_size=1,
             cached_kv=False,
             **kwargs):
  """Decoder block: uniform self-attention plus multihead cross-attention.

  Args:
    model_dimension: Width of the hidden representation.
    max_time_step: Maximum decode length, forwarded to uniform attention.
    num_heads: Cross-attention head count.
    intermediate_size: Feed-forward expansion width.
    activation_dropout_rate: Dropout rate stored for the call path.
    attention_dropout_rate: Forwarded to both attention sublayers.
    beam_size: Forwarded to DecoderUniformAttention.
    cached_kv: Forwarded to DecoderMultiheadAttention.
    **kwargs: Forwarded to all sublayers and the base layer.
  """
  self.model_dimension = model_dimension
  self.activation_dropout_rate = activation_dropout_rate
  self.decoder_uniform_attn = transformer_layers.DecoderUniformAttention(
      model_dimension,
      max_time_step,
      attention_dropout_rate=attention_dropout_rate,
      beam_size=beam_size,
      **kwargs)
  self.multihead_cross_attn = transformer_layers.DecoderMultiheadAttention(
      model_dimension,
      num_heads,
      cached_kv=cached_kv,
      attention_dropout_rate=attention_dropout_rate,
      **kwargs)
  # Post-attention projection and the up/down feed-forward pair.
  self.prx = dense_layers.BaseQDense(
      model_dimension, activation=None, normalize=False, bias=False, **kwargs)
  self.upprx = dense_layers.BaseQDense(
      intermediate_size, normalize=False, **kwargs)
  self.downprx = dense_layers.BaseQDense(
      model_dimension, activation=None, normalize=False, **kwargs)
  self.ln1 = normalization_layers.LayerNormalization(**kwargs)
  self.ln2 = normalization_layers.LayerNormalization(**kwargs)
  self.q0 = quantization_layers.ActivationQuantization(**kwargs)
  self.q1 = quantization_layers.ActivationQuantization(**kwargs)
  self.q2 = quantization_layers.ActivationQuantization(**kwargs)
  super(TransformerUniformAttnDecoder, self).__init__(**kwargs)
def __init__(self, scalar=True, **kwargs):
  """Pools a variable-length sequence via a learned attention projection.

  Args:
    scalar: Stored flag; consumed by the call path (not used here).
    **kwargs: Forwarded to the sublayers and base layer.
  """
  self.scalar = scalar
  # The attention-logit projection deliberately has no activation: logits
  # must be free to be positive or negative so the resulting distribution
  # can be anything the network likes. A relu here biases the attention
  # towards a uniform distribution. Keeping it linear lets a few outputs
  # dominate the classification decision while every other output retains
  # a non-zero probability of influencing the class, which empirically
  # yields better backprop and better pooling results.
  self.attention = dense_layers.BaseQDenseVarLen(units=1, rank=3, **kwargs)
  self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
  super(AttentionPooling, self).__init__(**kwargs)
def __init__(self, shape, num_bits=8, initializer=None, trainable=True,
             **kwargs):
  """Quantization-aware embedding table.

  Args:
    shape: Shape of the embedding table, e.g. [vocab_size, embedding_dim].
    num_bits: Bit width used by the activation quantizer.
    initializer: Weight initializer; defaults to Glorot uniform when None.
    trainable: Whether the table participates in training.
    **kwargs: Forwarded to the quantizer and base layer.
  """
  self.shape = shape
  self.quantizer = quantization_layers.ActivationQuantization(
      num_bits=num_bits, **kwargs)
  super(EmbeddingLayer, self).__init__(**kwargs)
  # Fall back to Glorot/Xavier uniform when the caller supplies nothing.
  self.initializer = (
      tf.keras.initializers.GlorotUniform()
      if initializer is None else initializer)
  self.trainable = trainable
def __init__(self, units, activation=tf.keras.layers.ReLU(), bias=True,
             rank=2, **kwargs):
  """Quantized dense layer for rank-2 through rank-4 inputs.

  Args:
    units: Number of output units.
    activation: Activation applied to the output (ReLU by default).
    bias: Whether a bias term is used.
    rank: Expected input rank; must be in [2, 4].
    **kwargs: Forwarded to the quantizer, normalizer and base layer.
  """
  # Validate the supported rank range up front.
  assert 2 <= rank <= 4
  self.units = units
  self.rank = rank
  self.activation = activation
  self.bias = bias
  self.qoutput = quantization_layers.ActivationQuantization(**kwargs)
  self._create_normalizer(**kwargs)
  super(BaseQDense, self).__init__(**kwargs)
def __init__(self, model_dimension, num_heads, attention_dropout_rate=0.0,
             **kwargs):
  """Multi-headed self-attention with a single fused projection.

  Args:
    model_dimension: Width of the hidden representation.
    num_heads: Number of attention heads.
    attention_dropout_rate: Dropout rate stored for the call path.
    **kwargs: Forwarded to the sublayers and base layer.
  """
  self.model_dimension = model_dimension
  self.num_heads = num_heads
  # Per-head width; assumes model_dimension is divisible by num_heads.
  self.filters = model_dimension // num_heads
  self.attention_dropout_rate = attention_dropout_rate
  # One projection of width 3 * model_dimension — presumably the fused
  # query/key/value transform; confirm against the call path.
  self.dense_layers = dense_layers.BaseQDenseVarLen(
      units=model_dimension * 3, activation=None, **kwargs)
  self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
  self.qconcat = quantization_layers.ConcatQuantization(axis=1, **kwargs)
  super(SelfAttentionV2, self).__init__(**kwargs)
def __init__(self,
             zoneout_probability=0.0,
             forward=True,
             pooling=QUASI_RNN_POOLING_FO,
             output_quantized=True,
             **kwargs):
  """Unidirectional QRNN pooling layer.

  Args:
    zoneout_probability: Zoneout rate stored for the call path.
    forward: Pooling direction, forwarded to the pooling core.
    pooling: One of the QUASI_RNN_POOLING_* modes; determines num_gates.
    output_quantized: Whether an output quantizer is created (IFO mode).
    **kwargs: Forwarded to the pooling core and base layer.

  Raises:
    AssertionError: If `pooling` is not a supported mode.
  """
  # Bug fix: validate `pooling` BEFORE indexing the map. Previously the
  # dict lookup ran first, so an unsupported mode raised KeyError and the
  # assertion below it was unreachable dead code.
  assert pooling in _QUASI_RNN_POOLING_TO_NUMBER_OF_GATES_MAP
  self.zoneout_probability = zoneout_probability
  self.pooling = pooling
  self.forward = forward
  self.output_quantized = output_quantized
  # NOTE(review): self.qoutputs is only defined for IFO pooling when
  # quantized — confirm downstream code guards its access accordingly.
  if output_quantized and self.pooling == QUASI_RNN_POOLING_IFO:
    self.qoutputs = quantization_layers.ActivationQuantization()
  self.num_gates = _QUASI_RNN_POOLING_TO_NUMBER_OF_GATES_MAP[pooling]
  self.pooling_core = QRNNUnidirectionalPoolingCore(forward=forward, **kwargs)
  super(QRNNUnidirectionalPooling, self).__init__(**kwargs)
def __init__(self, model_dimension, max_time_step, attention_dropout_rate=0.0,
             beam_size=1, **kwargs):
  """Uniform decoder attention restricted by a causal mask.

  Args:
    model_dimension: Width of the hidden representation.
    max_time_step: Maximum decode length; sets the causal mask size.
    attention_dropout_rate: Accepted for interface parity (unused here).
    beam_size: Stored beam width for the call path.
    **kwargs: Forwarded to the sublayers and base layer.
  """
  self.model_dimension = model_dimension
  self.max_time_step = max_time_step
  self.beam_size = beam_size
  # Lower-triangular [1, T, T] mask: band_part(·, -1, 0) keeps the lower
  # triangle, so step t can only attend to steps <= t.
  self.causal_mask = tf.expand_dims(
      tf.linalg.band_part(tf.ones([max_time_step, max_time_step]), -1, 0), 0)
  self.dense_layers = dense_layers.BaseQDenseVarLen(
      units=model_dimension,
      activation=None,
      normalize=False,
      bias=False,
      rank=3,
      **kwargs)
  self.qoutput = quantization_layers.ActivationQuantization(**kwargs)
  super(DecoderUniformAttention, self).__init__(**kwargs)
def __init__(self, model_dimension, num_heads, attention_dropout_rate=0.0,
             cached_kv=False, **kwargs):
  """Multihead decoder cross-attention with optional cached keys/values.

  Args:
    model_dimension: Width of the hidden representation.
    num_heads: Number of attention heads.
    attention_dropout_rate: Dropout rate stored for the call path.
    cached_kv: Whether keys/values are precomputed by the caller.
    **kwargs: Forwarded to the sublayers and base layer.
  """
  self.model_dimension = model_dimension
  self.num_heads = num_heads
  # Per-head width; assumes model_dimension is divisible by num_heads.
  self.filters = model_dimension // num_heads
  self.cached_kv = cached_kv
  self.attention_dropout_rate = attention_dropout_rate
  # Queries get their own projection; keys and values share a fused one
  # (hence units = 2 * model_dimension).
  self.q_dense_layers = dense_layers.BaseQDense(
      units=model_dimension,
      activation=None,
      normalize=False,
      bias=False,
      **kwargs)
  self.kv_dense_layers = dense_layers.BaseQDenseVarLen(
      units=model_dimension * 2, activation=None, **kwargs)
  self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
  self.qconcat = quantization_layers.ConcatQuantization(axis=1, **kwargs)
  super(DecoderMultiheadAttention, self).__init__(**kwargs)
def __init__(self, filters, ksize, stride=1, padding="SAME", dilations=None,
             activation=tf.keras.layers.ReLU(), bias=True, rank=4, **kwargs):
  """Quantized convolution layer for rank-3 or rank-4 inputs.

  Args:
    filters: Number of output filters.
    ksize: Kernel size (scalar or list; normalized via _unpack).
    stride: Stride (scalar or list; normalized via _unpack).
    padding: Convolution padding mode, "SAME" by default.
    dilations: Optional dilation rates; None disables dilation.
    activation: Activation applied to the output (ReLU by default).
    bias: Whether a bias term is used.
    rank: Expected input rank; must be 3 or 4.
    **kwargs: Forwarded to the quantizer, normalizer and base layer.
  """
  assert 3 <= rank <= 4
  self.out_filters = filters
  self.rank = rank
  self.ksize = self._unpack(ksize)
  self.strides = self._unpack(stride)
  # Parenthesized for clarity: the conditional covers the whole
  # concatenation, yielding None when no dilations are given.
  self.dilations = (
      ([1] + self._unpack(dilations) + [1]) if dilations else None)
  self.activation = activation
  self.bias = bias
  self.padding = padding
  self.qoutput = quantization_layers.ActivationQuantization(**kwargs)
  self._create_normalizer(**kwargs)
  super(EncoderQConvolution, self).__init__(**kwargs)
def __init__(self, config, mode):
  """Builds a transformer decoder model from a configuration dict.

  Args:
    config: Dict of hyperparameters; keys missing from it fall back to
      the defaults passed to `_get_params` below.
    mode: Run mode forwarded to `base_layers.Parameters`.
  """
  super(Model, self).__init__()

  def _get_params(varname, default_value=None):
    # Pull `varname` from config (falling back to default_value), log the
    # resolved value, and expose it as `self.<varname>`.
    value = config[varname] if varname in config else default_value
    default = "" if varname in config else " (default)"
    logging.info("%s = %s%s", varname, value, default)
    setattr(self, varname, value)

  _get_params("intermediate_size")
  _get_params("max_dec_time_step")
  _get_params("max_enc_time_step")
  _get_params("embedding_size")
  _get_params("vocabulary_size")
  _get_params("num_layers")
  _get_params("labels")
  _get_params("regularizer_scale")
  _get_params("num_heads")
  _get_params("model_dimension")
  _get_params("beam_size", 1)
  _get_params("quantize", True)
  _get_params("cached_kv", False)
  _get_params("attention_dropout_rate", 0.0)
  _get_params("activation_dropout_rate", 0.0)
  # If set, a separate dense layer is used to generate the logits instead of
  # re-using the input embedding table.
  _get_params("use_output_layer", False)
  self.parameters = base_layers.Parameters(mode, self.quantize,
                                           self.regularizer_scale)
  # Activation/Normalization enabled on input bottleneck as there is no
  # temporal information.
  self.input_bottleneck = dense_layers.BaseQDenseVarLen(
      self.model_dimension, rank=3, parameters=self.parameters)
  self.output_bottleneck = dense_layers.BaseQDense(
      self.embedding_size,
      normalize=False,
      activation=None,
      bias=False,
      parameters=self.parameters)
  # Embedding table initialized uniformly in [-sqrt(3), sqrt(3)].
  self.embedding = embedding_layers.EmbeddingFullyConnected(
      shape=[self.vocabulary_size, self.embedding_size],
      initializer=tf.random_uniform_initializer(-math.sqrt(3), math.sqrt(3)),
      parameters=self.parameters)
  # Separate logits projection, built only when use_output_layer is set.
  if self.use_output_layer:
    self.output_layer = dense_layers.BaseQDense(
        self.vocabulary_size,
        activation=None,
        normalize=False,
        bias=False,
        parameters=self.parameters)
  self.positional_embedding = embedding_layers.EmbeddingLayer(
      shape=[self.max_dec_time_step, self.model_dimension],
      initializer=tf.random_uniform_initializer(-math.sqrt(3), math.sqrt(3)),
      parameters=self.parameters)
  self.ln = normalization_layers.LayerNormalization(
      parameters=self.parameters)
  self.qact = quantization_layers.ActivationQuantization(
      parameters=self.parameters)
  # Scales the weights for computing logits.
  self.logits_fc_weights_scale_factor = None
  self.logits_fc_bias = self.add_weight(
      "logits_fc_bias",
      shape=[self.vocabulary_size],
      initializer=tf.constant_initializer(0),
      dtype="float32")
  # Optional bias which can be used to mask logits output.
  self.output_bias = None
  self.transformer_uniform_attn_decoder = TransformerUniformAttnDecoderStack(
      parameters=self.parameters,
      num_layers=self.num_layers,
      intermediate_size=self.intermediate_size,
      embedding_size=self.embedding_size,
      max_time_step=self.max_dec_time_step,
      num_heads=self.num_heads,
      model_dimension=self.model_dimension,
      vocabulary_size=self.vocabulary_size,
      beam_size=self.beam_size,
      cached_kv=self.cached_kv,
      attention_dropout_rate=self.attention_dropout_rate,
      activation_dropout_rate=self.activation_dropout_rate)
  # Beam search output.
  self.finished_seq = None
  self.finished_scores = None
def __init__(self, config, mode, **kwargs):
  """Encoder: embeddings + GBST layer + transformer encoder + pooling head.

  Args:
    config: Dict of hyperparameters; `_get_params` and `config.get`
      supply defaults for missing keys.
    mode: Run mode forwarded to `base_layers.Parameters`.
    **kwargs: Forwarded to the base layer.
  """
  super(Encoder, self).__init__(**kwargs)

  def _get_params(varname, default_value=None):
    # Pull `varname` from config (falling back to default_value), log the
    # resolved value, and expose it as `self.<varname>`.
    value = config[varname] if varname in config else default_value
    default = "" if varname in config else " (default)"
    logging.info("%s = %s%s", varname, value, default)
    setattr(self, varname, value)

  _get_params("labels", [])
  _get_params("regularizer_scale")
  _get_params("quantize")
  _get_params("feature_size")
  _get_params("bottleneck_size")
  self.max_seq_len = config.get("max_seq_len", 128)
  self.gbst_max_token_len = config.get("gbst_max_token_len", 128)
  # Including 3 additional special token ids (0=padding, 1=EOS, 2=UNK).
  self.vocabulary_size = config.get("vocabulary_size", 259)
  self.parameters = base_layers.Parameters(
      mode, quantize=self.quantize, regularizer_scale=self.regularizer_scale)
  self.embedding = embedding_layers.EmbeddingLayer(
      shape=[self.vocabulary_size, self.feature_size],
      parameters=self.parameters)
  self.gbst_downsample_rate = config.get("gbst_downsample_rate", 1)
  self.positional_embedding = embedding_layers.EmbeddingLayer(
      shape=[self.gbst_max_token_len, self.feature_size],
      parameters=self.parameters)
  self.ln = normalization_layers.LayerNormalization(
      parameters=self.parameters)
  self.qact = quantization_layers.ActivationQuantization(
      parameters=self.parameters)
  # Optional bottleneck projecting feature_size down to bottleneck_size
  # before GBST; skipped when the two sizes already match.
  self.bottleneck_layer = None
  gbst_size = self.feature_size
  if self.bottleneck_size != self.feature_size:
    self.bottleneck_layer = dense_layers.BaseQDenseVarLen(
        self.bottleneck_size,
        rank=3,
        normalize=False,
        activation=None,
        parameters=self.parameters)
    gbst_size = self.bottleneck_size
  self.gbst_max_subword_block_width = config.get(
      "gbst_max_subword_block_width", 5)
  self.gbst_conv_kernel_size = config.get("gbst_conv_kernel_size", 5)
  self.gbst_block_mixing_mode = config.get("gbst_block_mixing_mode", None)
  self.gbst_layer = misc_layers.GBSTLayerV2(
      feature_size=gbst_size,
      max_seq_len=self.gbst_max_token_len,
      downsample_rate=self.gbst_downsample_rate,
      max_subword_block_width=self.gbst_max_subword_block_width,
      conv_kernel_size=self.gbst_conv_kernel_size,
      block_mixing_mode=self.gbst_block_mixing_mode,
      parameters=self.parameters)
  # pool_windows selects the funnel (pooling) encoder variant; otherwise
  # the plain model-with-embeddings encoder is used.
  self.pool_windows = config.get("pool_windows", None)
  if self.pool_windows:
    self.transformer_encoder_layer = transformer_encoder.FunnelTransformerModel(
        config, mode)
  else:
    self.transformer_encoder_layer = transformer_encoder.ModelWithEmbeddings(
        config, mode)
  self.attention_pool = misc_layers.AttentionPooling(
      parameters=self.parameters)
  # Classification head is only built when labels are configured.
  self.num_classes = len(self.labels)
  if self.num_classes:
    self.final_fc = dense_layers.BaseQDense(
        units=self.num_classes,
        rank=2,
        parameters=self.parameters,
        activation=None)
def __init__(self, axes=None, **kwargs):
  """Layer normalization with a quantized activation output.

  Args:
    axes: Axes to normalize over; defaults to the last axis.
    **kwargs: Forwarded to the quantizer and base layer.
  """
  # Falsy `axes` (None or an empty list) falls back to the last axis.
  self.axes = [-1] if not axes else axes
  self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
  super(LayerNormalization, self).__init__(**kwargs)
def __init__(self, **kwargs):
  """Tree induction layer; construction only sets up output quantization.

  Args:
    **kwargs: Forwarded to the quantizer and base layer.
  """
  self.qactivation = quantization_layers.ActivationQuantization(**kwargs)
  super(TreeInductionLayer, self).__init__(**kwargs)