def encode(self, inputs, sequence_length=None, mode=tf.estimator.ModeKeys.TRAIN):
  if self.position_encoder is not None:
    inputs = self.position_encoder(inputs, sequence_length=sequence_length)
  inputs = tf.layers.dropout(
      inputs,
      rate=self.dropout,
      training=mode == tf.estimator.ModeKeys.TRAIN)
  mask = transformer.build_sequence_mask(
      sequence_length,
      num_heads=self.num_heads,
      maximum_length=tf.shape(inputs)[1],
      dtype=inputs.dtype)

  state = ()

  for l in range(self.num_layers):
    with tf.variable_scope("layer_{}".format(l)):
      with tf.variable_scope("multi_head"):
        inputs_norm = transformer.norm(inputs)
        context = transformer.multi_head_attention(
            self.num_heads,
            inputs_norm,
            inputs_norm,
            mode,
            num_units=self.num_units,
            mask=mask,
            dropout=self.attention_dropout)
        context = transformer.drop_and_add(
            inputs,
            context,
            mode,
            dropout=self.dropout)

      with tf.variable_scope("ffn"):
        transformed = transformer.feed_forward(
            transformer.norm(context),
            self.ffn_inner_dim,
            mode,
            dropout=self.relu_dropout)
        transformed = transformer.drop_and_add(
            context,
            transformed,
            mode,
            dropout=self.dropout)

      inputs = transformed
      state += (tf.reduce_mean(inputs, axis=1),)

  outputs = transformer.norm(inputs)
  return (outputs, state, sequence_length)
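# Both residual connections above follow the pre-norm pattern: layer-normalize
# the block input, run the sub-layer, then apply dropout and add the residual.
# A minimal sketch of what transformer.norm and transformer.drop_and_add are
# assumed to do here (a sketch, not the library's exact implementation):
import tensorflow as tf

def norm_sketch(inputs):
  # Layer normalization over the last (depth) dimension.
  return tf.contrib.layers.layer_norm(inputs, begin_norm_axis=-1)

def drop_and_add_sketch(inputs, outputs, mode, dropout=0.1):
  # Dropout on the sub-layer output, then the residual connection.
  outputs = tf.layers.dropout(
      outputs,
      rate=dropout,
      training=mode == tf.estimator.ModeKeys.TRAIN)
  return outputs + inputs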
def encode(self, inputs, sequence_length=None, mode=tf.estimator.ModeKeys.TRAIN):
  if self.position_encoder is not None:
    inputs = self.position_encoder(inputs, sequence_length=sequence_length)
  inputs = tf.layers.dropout(
      inputs,
      rate=self.dropout,
      training=mode == tf.estimator.ModeKeys.TRAIN)

  outputs = []
  state = ()

  for l in range(self.num_layers):
    with tf.variable_scope("layer_{}".format(l)):
      with tf.variable_scope("multi_head"):
        context = transformer.multi_head_attention(
            self.num_heads,
            inputs,
            inputs,
            inputs,
            mode,
            values_length=sequence_length,
            dropout=self.attention_dropout)
        context = transformer.add_and_norm(
            inputs,
            context,
            mode,
            dropout=self.dropout)

      with tf.variable_scope("ffn"):
        transformed = transformer.feed_forward(context, self.ffn_inner_dim)
        transformed = transformer.add_and_norm(
            context,
            transformed,
            mode,
            dropout=self.dropout)

      inputs = transformed
      state += (tf.reduce_mean(inputs, axis=1),)

      if self.keep_layers_output:
        outputs.append(inputs)

  return (inputs if not outputs else outputs, state, sequence_length)
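# The older stack above uses the post-norm ("add & norm") pattern instead: the
# residual is added first and layer normalization is applied to the sum. A
# minimal sketch of what transformer.add_and_norm is assumed to do (a sketch,
# not the library's exact implementation):
import tensorflow as tf

def add_and_norm_sketch(inputs, outputs, mode, dropout=0.1):
  # Dropout on the sub-layer output, residual connection, then layer norm.
  outputs = tf.layers.dropout(
      outputs,
      rate=dropout,
      training=mode == tf.estimator.ModeKeys.TRAIN)
  outputs += inputs
  return tf.contrib.layers.layer_norm(outputs, begin_norm_axis=-1)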
def _self_attention_stack(self,
                          inputs,
                          sequence_length=None,
                          mode=tf.estimator.ModeKeys.TRAIN,
                          cache=None,
                          memory=None,
                          memory_sequence_length=None):
  inputs = tf.layers.dropout(
      inputs,
      rate=self.dropout,
      training=mode == tf.estimator.ModeKeys.TRAIN)

  decoder_mask = None
  memory_mask = None

  if sequence_length is not None:
    decoder_mask = transformer.build_future_mask(
        sequence_length,
        num_heads=self.num_heads,
        dtype=inputs.dtype)
  if memory_sequence_length is not None:
    memory_mask = transformer.build_sequence_mask(
        memory_sequence_length,
        num_heads=self.num_heads,
        dtype=memory.dtype)

  for l in range(self.num_layers):
    layer_name = "layer_{}".format(l)
    layer_cache = cache[layer_name] if cache is not None else None
    with tf.variable_scope(layer_name):
      with tf.variable_scope("masked_multi_head"):
        inputs_norm = transformer.norm(inputs)
        encoded = transformer.multi_head_attention(
            self.num_heads,
            inputs_norm,
            inputs_norm,
            mode,
            num_units=self.num_units,
            mask=decoder_mask,
            cache=layer_cache,
            dropout=self.attention_dropout)
        encoded = transformer.drop_and_add(
            inputs,
            encoded,
            mode,
            dropout=self.dropout)

      if memory is not None:
        with tf.variable_scope("multi_head"):
          context = transformer.multi_head_attention(
              self.num_heads,
              transformer.norm(encoded),
              memory,
              mode,
              mask=memory_mask,
              dropout=self.attention_dropout)
          context = transformer.drop_and_add(
              encoded,
              context,
              mode,
              dropout=self.dropout)
      else:
        # Keep context defined when no encoder memory is attended to.
        context = encoded

      with tf.variable_scope("ffn"):
        transformed = transformer.feed_forward(
            transformer.norm(context),
            self.ffn_inner_dim,
            mode,
            dropout=self.relu_dropout)
        transformed = transformer.drop_and_add(
            context,
            transformed,
            mode,
            dropout=self.dropout)

      inputs = transformed

  outputs = transformer.norm(inputs)
  return outputs
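# The masked self-attention above relies on a "future" mask so that position i
# only attends to positions <= i, combined with the padding mask derived from
# sequence_length. A minimal sketch of such a mask with assumed semantics for
# transformer.build_future_mask (the library's exact shapes and arguments may
# differ; 1 marks an allowed attention link, 0 a masked one):
import tensorflow as tf

def future_mask_sketch(sequence_length, maximum_length, num_heads, dtype=tf.float32):
  # [batch, max_len] padding mask: 1 for real positions, 0 for padding.
  padding_mask = tf.sequence_mask(
      sequence_length, maxlen=maximum_length, dtype=dtype)
  # [max_len, max_len] lower-triangular matrix hiding future positions.
  causal_mask = tf.matrix_band_part(
      tf.ones([maximum_length, maximum_length], dtype=dtype), -1, 0)
  # Combine: mask[b, q, k] = padding_mask[b, k] * causal_mask[q, k].
  mask = tf.expand_dims(padding_mask, 1) * causal_mask
  # Broadcast to one mask per attention head: [batch, num_heads, max_len, max_len].
  return tf.tile(tf.expand_dims(mask, 1), [1, num_heads, 1, 1])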
def _self_attention_stack(self,
                          inputs,
                          sequence_length,
                          mode=tf.estimator.ModeKeys.TRAIN,
                          memory=None,
                          memory_sequence_length=None):
  if self.position_encoder is not None:
    inputs = self.position_encoder(inputs, sequence_length=sequence_length)
  inputs = tf.layers.dropout(
      inputs,
      rate=self.dropout,
      training=mode == tf.estimator.ModeKeys.TRAIN)

  for l in range(self.num_layers):
    with tf.variable_scope("layer_{}".format(l)):
      with tf.variable_scope("masked_multi_head"):
        encoded = transformer.multi_head_attention(
            self.num_heads,
            inputs,
            inputs,
            inputs,
            mode,
            values_length=sequence_length,
            mask_future=True,
            dropout=self.attention_dropout)
        encoded = transformer.add_and_norm(
            inputs,
            encoded,
            mode,
            dropout=self.dropout)

      with tf.variable_scope("multi_head"):
        if memory is None:
          values = encoded
        elif tf.contrib.framework.nest.is_sequence(memory):
          if l >= len(memory):
            raise ValueError(
                "If the encoder memory is a sequence, it must contain one "
                "memory per decoder layer")
          values = memory[l]
        else:
          values = memory
        keys = values

        context = transformer.multi_head_attention(
            self.num_heads,
            encoded,
            keys,
            values,
            mode,
            values_length=memory_sequence_length,
            dropout=self.attention_dropout)
        context = transformer.add_and_norm(
            encoded,
            context,
            mode,
            dropout=self.dropout)

      with tf.variable_scope("ffn"):
        transformed = transformer.feed_forward(context, self.ffn_inner_dim)
        transformed = transformer.add_and_norm(
            context,
            transformed,
            mode,
            dropout=self.dropout)

      inputs = transformed

  outputs = inputs
  return outputs
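# Both decoder variants end each layer with the position-wise feed-forward
# network (transformer.feed_forward): an inner projection with a ReLU
# activation followed by a projection back to the model depth. A minimal
# sketch under that assumption, not the library's exact implementation (the
# newer variant additionally threads mode and a ReLU dropout through it):
import tensorflow as tf

def feed_forward_sketch(inputs, inner_dim):
  # Inner layer expands to inner_dim with a ReLU activation.
  inner = tf.layers.dense(inputs, inner_dim, activation=tf.nn.relu)
  # Outer layer projects back to the input depth.
  return tf.layers.dense(inner, inputs.get_shape().as_list()[-1])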