def evolved_transformer_encoder(encoder_input,
                                encoder_self_attention_bias,
                                hparams,
                                name="encoder",
                                nonpadding=None,
                                save_weights_to=None,
                                make_image_summary=True,
                                losses=None,
                                attn_bias_for_padding=None):
  """Builds the Evolved Transformer encoder stack.

  See arxiv.org/abs/1901.11117 for more details.

  Each layer runs four residual sub-blocks in order, every one wrapped in
  layer_preprocess / layer_postprocess: a gated linear unit, a dense branch in
  parallel with a 3x1 convolution followed by a 9x1 separable convolution,
  multi-head self-attention, and a two-layer dense block.

  Note: Pad remover is not supported.

  Args:
    encoder_input: a Tensor.
    encoder_self_attention_bias: bias Tensor for self-attention (see
      common_attention.attention_bias()).
    hparams: hyperparameters for model.
    name: a string, used as the variable scope enclosing the encoder.
    nonpadding: optional Tensor with shape [batch_size, encoder_length] marking
      the positions that are not padding. Either pass it in (as is done for
      "packed" datasets) or leave it None to have it inferred from the
      attention bias. It is used to zero out padding positions before the
      convolutional branches.
    save_weights_to: an optional dictionary to capture attention weights for
      visualization; it is forwarded to multihead_attention.
    make_image_summary: Whether to make an attention image summary.
    losses: Not used.
    attn_bias_for_padding: optional padded attention bias. When given, padding
      is inferred from it instead of from encoder_self_attention_bias, for the
      case of a unidirectional encoder where future attention is masked.

  Returns:
    Tensor encoder output.
  """
  del losses  # Part of the signature only; never read.

  dropout_broadcast_dims = (
      common_layers.comma_separated_string_to_integer_list(
          getattr(hparams, "attention_dropout_broadcast_dims", "")))
  x = encoder_input

  with tf.variable_scope(name):
    if nonpadding is None:
      # No explicit mask was supplied: recover it from an attention bias.
      if attn_bias_for_padding is None:
        padding_bias = encoder_self_attention_bias
      else:
        padding_bias = attn_bias_for_padding
      # Only bfloat16 and float32 supported.
      activation_dtype = hparams.get("activation_dtype", "float32")
      if activation_dtype != "bfloat16":
        assert activation_dtype == "float32"
        cast_fn = tf.to_float
      else:
        cast_fn = tf.to_bfloat16
      padding = common_attention.attention_bias_to_padding(
          padding_bias, cast_fn)
      nonpadding = 1.0 - padding
    else:
      # `padding` is not read again below.
      padding = 1.0 - nonpadding

    num_layers = hparams.num_encoder_layers or hparams.num_hidden_layers
    for layer_index in range(num_layers):
      with tf.variable_scope("layer_%d" % layer_index):

        with tf.variable_scope("gated_linear_unit"):
          shortcut = x
          x = common_layers.layer_preprocess(x, hparams)

          values = common_layers.layers().Dense(hparams.hidden_size)(x)
          gates = common_layers.layers().Dense(
              hparams.hidden_size, activation=tf.nn.sigmoid)(x)

          x = common_layers.layer_postprocess(
              shortcut, values * gates, hparams)

        with tf.variable_scope("conv_branches"):
          shortcut = x
          # Channel widths of the wide (dense) and narrow (conv) branches.
          wide_dim = int(hparams.hidden_size * 4)
          narrow_dim = int(hparams.hidden_size / 2)

          x = common_layers.layer_preprocess(x, hparams)

          # Mask padding from conv layers.
          mask = tf.tile(
              tf.expand_dims(nonpadding, 2), [1, 1, hparams.hidden_size])
          x = x * mask

          dense_branch = common_layers.layers().Dense(
              wide_dim, activation=tf.nn.relu)(x)
          dense_branch = tf.nn.dropout(
              dense_branch, 1 - hparams.layer_prepostprocess_dropout)

          conv_branch = common_layers.layers().Conv1D(
              narrow_dim,
              3,
              padding="SAME",
              name="standard_conv_3x1",
              activation=tf.nn.relu)(x)
          conv_branch = tf.nn.dropout(
              conv_branch, 1 - hparams.layer_prepostprocess_dropout)
          # Zero-pad the last axis so the conv branch can be added to the
          # wider dense branch.
          branch_pad = [[0, 0], [0, 0], [0, wide_dim - narrow_dim]]
          conv_branch = tf.pad(conv_branch, branch_pad, constant_values=0)

          x = common_layers.layer_preprocess(
              dense_branch + conv_branch, hparams)

          # Mask padding from conv layer.
          mask = tf.tile(tf.expand_dims(nonpadding, 2), [1, 1, wide_dim])
          x = x * mask

          x = common_layers.layers().SeparableConv1D(
              narrow_dim, 9, padding="SAME", name="separable_conv_9x1")(x)
          # Zero-pad the last axis back up to hidden_size for the residual.
          output_pad = [[0, 0], [0, 0], [0, hparams.hidden_size - narrow_dim]]
          x = tf.pad(x, output_pad, constant_values=0)

          x = common_layers.layer_postprocess(shortcut, x, hparams)

        with tf.variable_scope("self_attention"):
          shortcut = x
          x = common_layers.layer_preprocess(x, hparams)

          key_channels = hparams.attention_key_channels or hparams.hidden_size
          value_channels = (
              hparams.attention_value_channels or hparams.hidden_size)
          x = common_attention.multihead_attention(
              x,
              None,
              encoder_self_attention_bias,
              key_channels,
              value_channels,
              hparams.hidden_size,
              hparams.num_heads,
              hparams.attention_dropout,
              attention_type=hparams.self_attention_type,
              max_relative_position=hparams.max_relative_position,
              heads_share_relative_embedding=(
                  hparams.heads_share_relative_embedding),
              add_relative_to_values=hparams.add_relative_to_values,
              save_weights_to=save_weights_to,
              make_image_summary=make_image_summary,
              dropout_broadcast_dims=dropout_broadcast_dims,
              max_length=hparams.get("max_length"),
              vars_3d=hparams.get("attention_variables_3d"),
              activation_dtype=hparams.get("activation_dtype", "float32"),
              weight_dtype=hparams.get("weight_dtype", "float32"))

          x = common_layers.layer_postprocess(shortcut, x, hparams)

        with tf.variable_scope("dense_layers"):
          shortcut = x
          x = common_layers.layer_preprocess(x, hparams)

          x = common_layers.layers().Dense(
              int(hparams.hidden_size * 4), activation=tf.nn.relu)(x)
          x = tf.nn.dropout(x, 1 - hparams.layer_prepostprocess_dropout)
          x = common_layers.layers().Dense(hparams.hidden_size)(x)

          x = common_layers.layer_postprocess(shortcut, x, hparams)

    # If normalization is done in layer_preprocess, then it should also be done
    # on the output, since the output can grow very large, being the sum of
    # a whole stack of unnormalized layer outputs.
    return common_layers.layer_preprocess(x, hparams)
def layers():
  """Returns whatever `common_layers.layers()` returns."""
  layers_module = common_layers.layers()
  return layers_module
import numpy as np

from tensor2tensor.layers import common_layers
import tensorflow as tf

from tensorflow.python.ops import summary_op_util  # pylint: disable=g-direct-tensorflow-import

# After tf-nightly 1.14.1.dev20190314 summary_op_util.skip_summary was extracted
# out to the distribute module.
try:
  from tensorflow.python.distribute import summary_op_util as distribute_summary_op_util  # pylint: disable=g-direct-tensorflow-import,g-import-not-at-top
except ImportError:
  distribute_summary_op_util = summary_op_util

tfl = common_layers.layers()
tfcl = None
try:
  tfcl = tf.contrib.layers
except AttributeError:
  pass


def swap_time_and_batch_axes(inputs):
  """Exchanges the two leading axes of `inputs` (time and batch).

  Args:
    inputs: the tensor to transpose.

  Returns:
    `inputs` transposed with the permutation [1, 0, 2, ..., rank - 1], i.e.
    the first two axes swapped and every later axis left where it was.
  """
  leading_axes = [1, 0]
  # The rank is read at graph run time via tf.rank, so the remaining axes are
  # produced as a tensor rather than a Python list.
  trailing_axes = tf.range(2, tf.rank(inputs))
  permutation = tf.concat([leading_axes, trailing_axes], axis=0)
  return tf.transpose(inputs, permutation)


def encode_to_shape(inputs, shape, scope):
  """Encode the given tensor to given image shape."""