# Shared imports for the snippets in this section (assuming the
# Mesh TensorFlow package layout).
from mesh_tensorflow.transformer import transformer
from mesh_tensorflow.transformer import transformer_layers


def enc_dec_attention_layer(hparams, prefix):
  """Create an encoder-decoder attention layer from prefixed hparams."""
  # attention_kwargs_from_hparams is assumed to be defined in the
  # surrounding module.
  return transformer_layers.EncDecAttention(
      num_heads=hparams.get(prefix + "num_heads"),
      num_memory_heads=hparams.get(prefix + "num_memory_heads"),
      key_value_size=hparams.d_kv,
      shared_kv=hparams.get(prefix + "shared_kv", False),
      attention_kwargs=attention_kwargs_from_hparams(hparams))
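# A minimal usage sketch. The prefix string and the prefixed hparams keys
# it selects (e.g. "decoder_num_heads") are hypothetical; any hparams
# object exposing .get(), .d_kv, and those keys would work here.
attention = enc_dec_attention_layer(hparams, prefix="decoder_")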
def layer_stack(include_encdec_attention):
  """Create a layer stack.

  Args:
    include_encdec_attention: a boolean

  Returns:
    a LayerStack
  """
  # FLAGS is assumed to be defined by the surrounding program
  # (e.g. via absl.flags).
  ret = []
  for _ in range(FLAGS.num_layers):
    ret.append(
        transformer_layers.SelfAttention(
            num_heads=FLAGS.num_heads,
            key_value_size=FLAGS.d_kv,
            attention_kwargs={"dropout_rate": FLAGS.dropout}))
    if include_encdec_attention:
      ret.append(
          transformer_layers.EncDecAttention(
              num_heads=FLAGS.num_heads,
              key_value_size=FLAGS.d_kv,
              attention_kwargs={"dropout_rate": FLAGS.dropout}))
    ret.append(
        transformer_layers.DenseReluDense(
            hidden_size=FLAGS.d_ff,
            dropout_rate=FLAGS.dropout))
  return transformer.LayerStack(ret)
def __init__(self, base_num_heads):
  """Create a DecoderAttentionLayer.

  Args:
    base_num_heads: a positive integer, the base number of heads used by
      the attention layers.
  """
  # Self-attention uses twice the base number of heads; cross-attention
  # uses the base number.
  self._self_attention = transformer_layers.SelfAttention(
      num_heads=2 * base_num_heads)
  self._enc_dec_attention = transformer_layers.EncDecAttention(
      num_heads=base_num_heads)
def default_layer_stack_with_encoder_attention(hparams):
  """Create a default decoder layer stack with encoder-decoder attention."""
  return transformer.LayerStack(
      [
          transformer_layers.SelfAttention(
              num_heads=hparams.num_heads,
              key_value_size=hparams.d_kv,
              dropout_rate=hparams.attention_dropout),
          transformer_layers.EncDecAttention(
              num_heads=hparams.num_heads,
              key_value_size=hparams.d_kv,
              dropout_rate=hparams.attention_dropout),
          transformer_layers.DenseReluDense(
              hidden_size=hparams.d_ff,
              dropout_rate=hparams.relu_dropout),
      ] * hparams.num_hidden_layers,
      dropout_rate=hparams.layer_prepostprocess_dropout,
      norm_epsilon=hparams.norm_epsilon)
def simple_layer_stack(include_encdec_attention,
                       num_layers=6,
                       d_ff=2048,
                       num_heads=8,
                       d_kv=128,
                       dropout_rate=0.1):
  """Create a layer stack.

  Args:
    include_encdec_attention: a boolean
    num_layers: an integer
    d_ff: an integer
    num_heads: an integer
    d_kv: an integer
    dropout_rate: a float

  Returns:
    a LayerStack
  """
  ret = []
  for _ in range(num_layers):
    ret.append(
        transformer_layers.SelfAttention(
            num_heads=num_heads,
            key_value_size=d_kv,
            attention_kwargs={"dropout_rate": dropout_rate}))
    if include_encdec_attention:
      ret.append(
          transformer_layers.EncDecAttention(
              num_heads=num_heads,
              key_value_size=d_kv,
              attention_kwargs={"dropout_rate": dropout_rate}))
    ret.append(
        transformer_layers.DenseReluDense(
            hidden_size=d_ff,
            dropout_rate=dropout_rate))
  return transformer.LayerStack(ret)
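# Sketch: pair an encoder stack (self-attention plus feed-forward only)
# with a decoder stack that also cross-attends to the encoder. How the
# two stacks are consumed downstream depends on the surrounding model
# code, so this only shows construction.
encoder_stack = simple_layer_stack(include_encdec_attention=False)
decoder_stack = simple_layer_stack(include_encdec_attention=True)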