Example 1
def layer_stack(include_encdec_attention):
  """Create a layer stack.

  Args:
    include_encdec_attention: a boolean
  Returns:
    a LayerStack
  """
  ret = []
  for _ in range(FLAGS.num_layers):
    ret.append(
        transformer_layers.SelfAttention(
            num_heads=FLAGS.num_heads,
            key_value_size=FLAGS.d_kv,
            attention_kwargs={"dropout_rate": FLAGS.dropout}))
    if include_encdec_attention:
      ret.append(
          transformer_layers.EncDecAttention(
              num_heads=FLAGS.num_heads,
              key_value_size=FLAGS.d_kv,
              attention_kwargs={"dropout_rate": FLAGS.dropout}))
    ret.append(
        transformer_layers.DenseReluDense(
            hidden_size=FLAGS.d_ff,
            dropout_rate=FLAGS.dropout))
  return transformer.LayerStack(ret)
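The LayerStack returned here is typically passed as the layer_stack argument of a transformer.Unitransformer; the following is only a minimal sketch in that direction, with d_model, the vocabulary sizes and max_length as illustrative placeholders (the constructor arguments mirror those used in Example 8 below).

# Sketch only: the keyword arguments mirror the Unitransformer call in Example 8
# below; the numeric values are placeholders, not taken from the original example.
model = transformer.Unitransformer(
    layer_stack=layer_stack(include_encdec_attention=False),
    d_model=512,              # placeholder model dimension
    input_vocab_size=32000,   # placeholder vocabulary sizes
    output_vocab_size=32000,
    autoregressive=True,      # decoder-style (causal) model
    max_length=256)           # placeholder maximum sequence length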
Example 2
def layer_stack_from_hparams(hparams, prefix):
    """Create a layer stack based on the hyperparameter values."""
    layers = hparams.get(prefix + "layers")
    return transformer.LayerStack(
        [layers_registry[l](hparams, prefix) for l in layers],
        dropout_rate=hparams.layer_prepostprocess_dropout,
        norm_epsilon=hparams.norm_epsilon)
Example 3
def mtr_lm_v1(num_heads=8, num_memory_heads=0):
    """Model incorporating mixture-of-experts, local and global attention.

    ~6B parameters.

    32 experts in 3 hierarchical moe layers.

    Args:
      num_heads: an optional integer
      num_memory_heads: an optional integer

    Returns:
      a hparams
    """
    hparams = mtr_lm_dense(0)
    local_att = transformer_layers.LocalSelfAttention(
        num_heads=num_heads,
        num_memory_heads=num_memory_heads,
        key_value_size=128)
    att = transformer_layers.SelfAttention(num_heads=num_heads,
                                           num_memory_heads=num_memory_heads,
                                           key_value_size=128)
    drd = transformer_layers.DenseReluDense(hidden_size=2048)
    hmoe = moe.MoE2D(expert_x=8, expert_y=4, hidden_size=32768)
    hparams.layer_stack = transformer.LayerStack(
        ([local_att, local_att, drd, att, drd, local_att, local_att, hmoe] *
         4)[:-1])
    hparams.mesh_shape = "b0:4;b1:8"
    hparams.layout = "outer_batch:b0;inner_batch:b1,expert_x:b1,expert_y:b0"
    hparams.outer_batch_size = 4
    return hparams
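The layer pattern above repeats an 8-layer block four times and then drops the trailing hierarchical MoE layer with [:-1]; that is what yields the 3 hmoe layers (each an 8 x 4 = 32 expert grid) referred to in the docstring. A small standalone check of that slicing, for illustration only:

# Illustration of the (block * 4)[:-1] pattern used above, with strings standing
# in for the layer objects.
block = ["local_att", "local_att", "drd", "att",
         "drd", "local_att", "local_att", "hmoe"]
stack = (block * 4)[:-1]
assert len(stack) == 31          # 8 * 4 - 1 layers in total
assert stack.count("hmoe") == 3  # the final hmoe is dropped, leaving 3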
Example 4
def layer_stack_from_hparams(hparams, prefix):
  """Create a layer stack based on the hyperparameter values."""
  return transformer.LayerStack(
      [self_attention_from_hparams(hparams, prefix),
       dense_relu_dense_from_hparams(hparams)
      ] * hparams.get(prefix + "num_layers"),
      dropout_rate=hparams.layer_prepostprocess_dropout,
      norm_epsilon=hparams.norm_epsilon)
Example 5
def my_layer_stack(hparams):
    return transformer.LayerStack([
        transformer_layers.SelfAttention(
            num_heads=hparams.num_heads,
            key_value_size=hparams.d_kv,
            dropout_rate=hparams.attention_dropout),
        transformer_layers.DenseReluDense(
            hidden_size=hparams.d_ff,
            dropout_rate=hparams.layer_prepostprocess_dropout),
    ] * hparams.num_hidden_layers)
Example 6
def mtf_unitransformer_all_layers_tiny():
  """Test out all the layers on local CPU."""
  hparams = mtf_unitransformer_tiny()
  hparams.layer_stack = transformer.LayerStack(
      [transformer_layers.SelfAttention(num_heads=4),
       transformer_layers.LocalSelfAttention(num_heads=4),
       moe.MoE1D(num_experts=4, hidden_size=512),
       moe.MoE2D(expert_x=4, expert_y=4, hidden_size=512),
       transformer_layers.DenseReluDense(hidden_size=512)])
  return hparams
Example 7
def decoder_layer_stack_from_hparams(hparams, prefix):
  """Create a decoder layer stack based on the hyperparameter values."""
  if prefix != "decoder_":
    raise ValueError("prefix should be 'decoder_'")
  return transformer.LayerStack(
      [self_attention_from_hparams(hparams, prefix),
       enc_dec_attention_from_hparams(hparams, prefix),
       dense_relu_dense_from_hparams(hparams)
      ] * hparams.get(prefix + "num_layers"),
      dropout_rate=hparams.layer_prepostprocess_dropout,
      norm_epsilon=hparams.norm_epsilon)
Example 8
def get_dummy_decoder_context(converter,
                              batch=2,
                              d_model=6,
                              length=4,
                              mode="incremental",
                              initial_position=None,
                              state=None,
                              inputs=None):
    """Create a dummy model and a transformer.Context for decoder tests."""
    batch_dim = mtf.Dimension("batch", batch)
    length_dim = mtf.Dimension("length", length)

    # Set up a dummy model
    layer_stack = transformer.LayerStack(layers=[])
    model = transformer.Unitransformer(
        d_model=d_model,
        input_vocab_size=10,  # dummy values
        output_vocab_size=10,  # dummy values
        autoregressive=True,
        max_length=length,
        layer_stack=layer_stack)

    if state is not None:
        state_mtf = converter.convert_np_array_to_mtf_tensor(
            state, dtype=tf.float32, dim_names=["batch", "length", "d_model"])
        states = [state_mtf]
    else:
        states = None

    if initial_position:
        initial_position = mtf.constant(converter.mesh,
                                        initial_position,
                                        shape=mtf.Shape([batch_dim]),
                                        dtype=tf.int32)

    if inputs is not None:
        inputs = converter.convert_np_array_to_mtf_tensor(
            inputs, dim_names=["batch", "length"])

    context = transformer.Context(model=model,
                                  mode=mode,
                                  states=states,
                                  new_states=[],
                                  mesh=converter.mesh,
                                  batch_dims=[batch_dim],
                                  length_dim=length_dim,
                                  variable_dtype=mtf.VariableDType(tf.float32),
                                  sequence_id=1,
                                  inputs=inputs,
                                  initial_position=initial_position)
    return context
Example 9
def mtf_transformer2_all_layers_tiny():
    """Test out all the layers on local CPU."""
    hparams = mtf_transformer2_base()
    hparams.batch_size = 2
    hparams.mesh_shape = ""
    hparams.d_model = 128
    hparams.layer_stack = transformer.LayerStack([
        transformer_layers.SelfAttention(num_heads=4),
        transformer_layers.LocalSelfAttention(num_heads=4),
        moe.MoE1D(num_experts=4, hidden_size=512),
        moe.MoE2D(expert_x=4, expert_y=4, hidden_size=512),
        transformer_layers.DenseReluDense(hidden_size=512)
    ])
    return hparams
Example 10
def default_layer_stack_with_encoder_attention(hparams):
    return transformer.LayerStack(
        [
            transformer_layers.SelfAttention(
                num_heads=hparams.num_heads,
                key_value_size=hparams.d_kv,
                dropout_rate=hparams.attention_dropout),
            transformer_layers.EncDecAttention(
                num_heads=hparams.num_heads,
                key_value_size=hparams.d_kv,
                dropout_rate=hparams.attention_dropout),
            transformer_layers.DenseReluDense(
                hidden_size=hparams.d_ff, dropout_rate=hparams.relu_dropout),
        ] * hparams.num_hidden_layers,
        dropout_rate=hparams.layer_prepostprocess_dropout,
        norm_epsilon=hparams.norm_epsilon)
Example 11
def simple_layer_stack(include_encdec_attention,
                       num_layers=6,
                       d_ff=2048,
                       num_heads=8,
                       d_kv=128,
                       dropout_rate=0.1):
    """Create a layer stack.

    Args:
      include_encdec_attention: a boolean
      num_layers: an integer
      d_ff: an integer
      num_heads: an integer
      d_kv: an integer
      dropout_rate: a float

    Returns:
      a LayerStack
    """
    ret = []
    for _ in range(num_layers):
        ret.append(
            transformer_layers.SelfAttention(
                num_heads=num_heads,
                key_value_size=d_kv,
                attention_kwargs={"dropout_rate": dropout_rate}))
        if include_encdec_attention:
            ret.append(
                transformer_layers.EncDecAttention(
                    num_heads=num_heads,
                    key_value_size=d_kv,
                    attention_kwargs={"dropout_rate": dropout_rate}))
        ret.append(
            transformer_layers.DenseReluDense(hidden_size=d_ff,
                                              dropout_rate=dropout_rate))
    return transformer.LayerStack(ret)
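For completeness, a hypothetical call site for the helper above: the same function builds either an encoder-style stack (no encoder-decoder attention) or a decoder-style stack, depending on the flag.

# Hypothetical usage of simple_layer_stack; the keyword values shown are just
# the function's own defaults made explicit.
encoder_stack = simple_layer_stack(include_encdec_attention=False)
decoder_stack = simple_layer_stack(include_encdec_attention=True,
                                   num_layers=6,
                                   dropout_rate=0.1)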