Example #1
def enc_dec_attention_layer(hparams, prefix):
    return transformer_layers.EncDecAttention(
        num_heads=hparams.get(prefix + "num_heads"),
        num_memory_heads=hparams.get(prefix + "num_memory_heads"),
        key_value_size=hparams.d_kv,
        shared_kv=hparams.get(prefix + "shared_kv", False),
        attention_kwargs=attention_kwargs_from_hparams(hparams))
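A minimal call sketch (not from the source): assuming the hparams object defines the prefixed keys the helper reads, with a hypothetical "decoder_" prefix selecting decoder_num_heads, decoder_num_memory_heads, and decoder_shared_kv:

# Hypothetical usage; the "decoder_" prefix and the corresponding hparams keys
# are assumptions for illustration only.
layer = enc_dec_attention_layer(hparams, "decoder_")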
Example #2
def layer_stack(include_encdec_attention):
  """Create a layer stack.

  Args:
    include_encdec_attention: a boolean

  Returns:
    a LayerStack
  """
  ret = []
  for _ in range(FLAGS.num_layers):
    ret.append(
        transformer_layers.SelfAttention(
            num_heads=FLAGS.num_heads,
            key_value_size=FLAGS.d_kv,
            attention_kwargs={"dropout_rate": FLAGS.dropout}))
    if include_encdec_attention:
      ret.append(
          transformer_layers.EncDecAttention(
              num_heads=FLAGS.num_heads,
              key_value_size=FLAGS.d_kv,
              attention_kwargs={"dropout_rate": FLAGS.dropout}))
    ret.append(
        transformer_layers.DenseReluDense(
            hidden_size=FLAGS.d_ff,
            dropout_rate=FLAGS.dropout))
  return transformer.LayerStack(ret)
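A hedged usage sketch, assuming the FLAGS referenced above (num_layers, num_heads, d_kv, d_ff, dropout) have been defined and parsed: the same helper builds either an encoder-style stack (self-attention plus feed-forward only) or a decoder-style stack that also attends to the encoder output.

# Assumes FLAGS have already been parsed.
encoder_stack = layer_stack(include_encdec_attention=False)
decoder_stack = layer_stack(include_encdec_attention=True)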
Example #3
  def __init__(self, base_num_heads):
    """Create an DecoderAttentionLayer.

    Args:
      base_num_heads: a positive integer, the base number of heads used by the
        attention layers.
    """
    self._self_attention = transformer_layers.SelfAttention(num_heads=2 *
                                                            base_num_heads)
    self._enc_dec_attention = transformer_layers.EncDecAttention(
        num_heads=base_num_heads)
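A sketch of how this constructor is used, assuming the enclosing class is named DecoderAttentionLayer as the docstring suggests: self-attention is built with twice base_num_heads, while encoder-decoder attention uses base_num_heads directly.

# Hypothetical instantiation: self-attention gets 2 * 8 = 16 heads,
# encoder-decoder attention gets 8.
layer = DecoderAttentionLayer(base_num_heads=8)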
Example #4
def default_layer_stack_with_encoder_attention(hparams):
    return transformer.LayerStack(
        [
            transformer_layers.SelfAttention(
                num_heads=hparams.num_heads,
                key_value_size=hparams.d_kv,
                dropout_rate=hparams.attention_dropout),
            transformer_layers.EncDecAttention(
                num_heads=hparams.num_heads,
                key_value_size=hparams.d_kv,
                dropout_rate=hparams.attention_dropout),
            transformer_layers.DenseReluDense(
                hidden_size=hparams.d_ff, dropout_rate=hparams.relu_dropout),
        ] * hparams.num_hidden_layers,
        dropout_rate=hparams.layer_prepostprocess_dropout,
        norm_epsilon=hparams.norm_epsilon)
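Unlike the attention_kwargs pattern in Example #2, this variant passes dropout_rate directly to the attention layers, and it repeats the three-layer pattern by list multiplication. A minimal call sketch, assuming an hparams object carrying the referenced fields:

# Assumes hparams defines num_heads, d_kv, d_ff, num_hidden_layers and the
# attention/relu/layer_prepostprocess dropout rates plus norm_epsilon.
stack = default_layer_stack_with_encoder_attention(hparams)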
Example #5
def simple_layer_stack(include_encdec_attention,
                       num_layers=6,
                       d_ff=2048,
                       num_heads=8,
                       d_kv=128,
                       dropout_rate=0.1):
    """Create a layer stack.

  Args:
    include_encdec_attention: a boolean
    num_layers: an integer
    d_ff: an integer
    num_heads: an integer
    d_kv: an integer
    dropout_rate: a float

  Returns:
    a LayerStack
  """
    ret = []
    for _ in range(num_layers):
        ret.append(
            transformer_layers.SelfAttention(
                num_heads=num_heads,
                key_value_size=d_kv,
                attention_kwargs={"dropout_rate": dropout_rate}))
        if include_encdec_attention:
            ret.append(
                transformer_layers.EncDecAttention(
                    num_heads=num_heads,
                    key_value_size=d_kv,
                    attention_kwargs={"dropout_rate": dropout_rate}))
        ret.append(
            transformer_layers.DenseReluDense(hidden_size=d_ff,
                                              dropout_rate=dropout_rate))
    return transformer.LayerStack(ret)
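A usage sketch with illustrative values: every argument except include_encdec_attention has a default, so only the hyperparameters being changed need to be passed.

# The overrides below are illustrative, not taken from the source.
decoder_stack = simple_layer_stack(include_encdec_attention=True,
                                   num_layers=12,
                                   dropout_rate=0.2)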