Example 1
def layer_stack(include_encdec_attention):
  """Create a layer stack.

  Args:
    include_encdec_attention: a boolean
  Returns:
    a LayerStack
  """
  ret = []
  for _ in range(FLAGS.num_layers):
    ret.append(
        transformer_layers.SelfAttention(
            num_heads=FLAGS.num_heads,
            key_value_size=FLAGS.d_kv,
            attention_kwargs={"dropout_rate": FLAGS.dropout}))
    if include_encdec_attention:
      ret.append(
          transformer_layers.EncDecAttention(
              num_heads=FLAGS.num_heads,
              key_value_size=FLAGS.d_kv,
              attention_kwargs={"dropout_rate": FLAGS.dropout}))
    ret.append(
        transformer_layers.DenseReluDense(
            hidden_size=FLAGS.d_ff,
            dropout_rate=FLAGS.dropout))
  return transformer.LayerStack(ret)
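A minimal usage sketch for the helper above, assuming FLAGS.num_layers, FLAGS.num_heads, FLAGS.d_kv and FLAGS.dropout are defined elsewhere; the same function builds an encoder-style stack without encoder-decoder attention and a decoder-style stack with it.

# Hypothetical call sites; the FLAGS above must already be populated.
encoder_stack = layer_stack(include_encdec_attention=False)
decoder_stack = layer_stack(include_encdec_attention=True)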
Example 2
def mtr_lm_v1(num_heads=8, num_memory_heads=0):
    """Model incorporating mixture-of-experts, local and global attention.

    ~6B parameters

    32 experts in 3 hierarchical moe layers.

    Args:
      num_heads: an optional integer
      num_memory_heads: an optional integer

    Returns:
      a hparams
    """
    hparams = mtr_lm_dense(0)
    local_att = transformer_layers.LocalSelfAttention(
        num_heads=num_heads,
        num_memory_heads=num_memory_heads,
        key_value_size=128)
    att = transformer_layers.SelfAttention(num_heads=num_heads,
                                           num_memory_heads=num_memory_heads,
                                           key_value_size=128)
    drd = transformer_layers.DenseReluDense(hidden_size=2048)
    hmoe = moe.MoE2D(expert_x=8, expert_y=4, hidden_size=32768)
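    # The 8-layer block below is repeated 4 times and the trailing hmoe layer is
    # dropped by [:-1], leaving 31 layers with 3 hierarchical MoE layers among them.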
    hparams.layer_stack = transformer.LayerStack(
        ([local_att, local_att, drd, att, drd, local_att, local_att, hmoe] *
         4)[:-1])
    hparams.mesh_shape = "b0:4;b1:8"
    hparams.layout = "outer_batch:b0;inner_batch:b1,expert_x:b1,expert_y:b0"
    hparams.outer_batch_size = 4
    return hparams
Example 3
def __init__(self,
             radius=128,
             num_heads=8,
             num_memory_heads=0,
             key_value_size=128,
             shared_kv=False,
             dropout_rate=0.0,
             attention_kwargs=None,
             downsample_query=2,
             low_rank_features=32,
             project_kv=True,
             use_ffn=True,
             num_memory_slots=0,
             structured=False,
             pre_attention=False,
             local_gate=False,
             norm=False,
             pos_att=False,
             conv_type=None,
             query_func="linear",
             pool_func="max",
             local_attention=False,
             use_offsets=False,
             consider_chars_as_blocks=False,
             use_block_pos_embedding=False,
             canine_mode=False,
             filter_size=5,
             block_mixing_mode=None,
             rank_activation="softmax",
             gbst_pool="mean"):
    super(GradientSubwordLayerV2,
          self).__init__(num_heads, num_memory_heads, key_value_size,
                         shared_kv, dropout_rate, attention_kwargs)
    self.radius = radius
    self.downsample_query = downsample_query
    self.low_rank_features = low_rank_features
    self.project_kv = project_kv
    self.use_ffn = use_ffn
    if self.use_ffn:
        self.ffn = transformer_layers.DenseReluDense()
    self.num_memory_slots = num_memory_slots
    self.structured = structured
    self.pre_attention = pre_attention
    self.local_gate = local_gate
    self.norm = norm
    self.pos_att = pos_att
    self.conv_type = conv_type
    self.query_func = query_func
    self.pool_func = pool_func
    self.local_attention = local_attention
    self.use_offsets = use_offsets
    self.consider_chars_as_blocks = consider_chars_as_blocks
    self.use_block_pos_embedding = use_block_pos_embedding
    self.canine_mode = canine_mode
    self.filter_size = filter_size
    self.block_mixing_mode = block_mixing_mode
    self.rank_activation = rank_activation
    self.gbst_pool = gbst_pool
Example 4
def my_layer_stack(hparams):
    return transformer.LayerStack([
        transformer_layers.SelfAttention(
            num_heads=hparams.num_heads,
            key_value_size=hparams.d_kv,
            dropout_rate=hparams.attention_dropout),
        transformer_layers.DenseReluDense(
            hidden_size=hparams.d_ff,
            dropout_rate=hparams.layer_prepostprocess_dropout),
    ] * hparams.num_hidden_layers)
Example 5
def mtf_unitransformer_all_layers_tiny():
  """Test out all the layers on local CPU."""
  hparams = mtf_unitransformer_tiny()
  hparams.layer_stack = transformer.LayerStack(
      [transformer_layers.SelfAttention(num_heads=4),
       transformer_layers.LocalSelfAttention(num_heads=4),
       moe.MoE1D(num_experts=4, hidden_size=512),
       moe.MoE2D(expert_x=4, expert_y=4, hidden_size=512),
       transformer_layers.DenseReluDense(hidden_size=512)])
  return hparams
Example 6
def mtf_transformer2_all_layers_tiny():
    """Test out all the layers on local CPU."""
    hparams = mtf_transformer2_base()
    hparams.batch_size = 2
    hparams.mesh_shape = ""
    hparams.d_model = 128
    hparams.layer_stack = transformer.LayerStack([
        transformer_layers.SelfAttention(num_heads=4),
        transformer_layers.LocalSelfAttention(num_heads=4),
        moe.MoE1D(num_experts=4, hidden_size=512),
        moe.MoE2D(expert_x=4, expert_y=4, hidden_size=512),
        transformer_layers.DenseReluDense(hidden_size=512)
    ])
    return hparams
Example 7
def default_layer_stack_with_encoder_attention(hparams):
    return transformer.LayerStack(
        [
            transformer_layers.SelfAttention(
                num_heads=hparams.num_heads,
                key_value_size=hparams.d_kv,
                dropout_rate=hparams.attention_dropout),
            transformer_layers.EncDecAttention(
                num_heads=hparams.num_heads,
                key_value_size=hparams.d_kv,
                dropout_rate=hparams.attention_dropout),
            transformer_layers.DenseReluDense(
                hidden_size=hparams.d_ff, dropout_rate=hparams.relu_dropout),
        ] * hparams.num_hidden_layers,
        dropout_rate=hparams.layer_prepostprocess_dropout,
        norm_epsilon=hparams.norm_epsilon)
Example 8
def simple_layer_stack(include_encdec_attention,
                       num_layers=6,
                       d_ff=2048,
                       num_heads=8,
                       d_kv=128,
                       dropout_rate=0.1):
    """Create a layer stack.

    Args:
      include_encdec_attention: a boolean
      num_layers: an integer
      d_ff: an integer
      num_heads: an integer
      d_kv: an integer
      dropout_rate: a float

    Returns:
      a LayerStack
    """
    ret = []
    for _ in range(num_layers):
        ret.append(
            transformer_layers.SelfAttention(
                num_heads=num_heads,
                key_value_size=d_kv,
                attention_kwargs={"dropout_rate": dropout_rate}))
        if include_encdec_attention:
            ret.append(
                transformer_layers.EncDecAttention(
                    num_heads=num_heads,
                    key_value_size=d_kv,
                    attention_kwargs={"dropout_rate": dropout_rate}))
        ret.append(
            transformer_layers.DenseReluDense(hidden_size=d_ff,
                                              dropout_rate=dropout_rate))
    return transformer.LayerStack(ret)
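A brief usage sketch of simple_layer_stack; the argument values here are illustrative only.

# Encoder-style stack: no encoder-decoder attention layers.
encoder_stack = simple_layer_stack(include_encdec_attention=False)
# Decoder-style stack with a deeper configuration and no dropout.
decoder_stack = simple_layer_stack(include_encdec_attention=True,
                                   num_layers=12,
                                   dropout_rate=0.0)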
Example 9
def dense_relu_dense_layer(hparams, prefix):
    del prefix
    return transformer_layers.DenseReluDense(hidden_size=hparams.d_ff,
                                             dropout_rate=hparams.relu_dropout)
Example 10
def create_dummy_model(mesh,
                       shapes,
                       n_blocks=2,
                       block_param_size_str="2_2",
                       block_repeat_size_str="1_1"):
    """Creates a dummy model and layer stack with 4-dimensional input."""

    assert len(shapes) == 4
    outer_batch_size, batch_size, length, d_model = shapes
    batch_dim = mtf.Dimension("batch", batch_size)
    outer_batch_dim = mtf.Dimension("outer_batch", outer_batch_size)
    length_dim = mtf.Dimension("length", length)
    block_param_size = list(map(int, block_param_size_str.split("_")))
    block_repeat_size = list(map(int, block_repeat_size_str.split("_")))

    sublayers_initial = [
        transformer.sublayer_dropout,
    ]
    sublayers_per_layer = [
        transformer.sublayer_rms_norm,
        transformer.sublayer_call_layer,
        transformer.sublayer_dropout,
        transformer.sublayer_residual,
    ]
    sublayers_final = [
        transformer.sublayer_rms_norm,
        transformer.sublayer_dropout,
    ]
    submodules = [
        transformer_layers.SelfAttention(),
        transformer_layers.DenseReluDense()
    ]

    n_sublayers = np.array(block_param_size).prod()
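    # With the default block_param_size_str of "2_2" this is 2 * 2 = 4, so the
    # two submodules below are repeated into an 8-layer list.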
    layers = submodules * n_sublayers
    layer_stack = funnel_transformer.FunnelTransformerLayerStack(
        layers=layers,
        n_blocks=n_blocks,
        block_param_size=block_param_size,
        block_repeat_size=block_repeat_size,
        sublayers_initial=sublayers_initial,
        sublayers_per_layer=sublayers_per_layer,
        sublayers_final=sublayers_final)

    model = transformer.Unitransformer(input_vocab_size=10,
                                       output_vocab_size=10,
                                       autoregressive=False,
                                       max_length=8,
                                       d_model=d_model,
                                       layer_stack=layer_stack)

    context = transformer.Context(model=model,
                                  mesh=mesh,
                                  batch_dims=[batch_dim, outer_batch_dim],
                                  length_dim=length_dim,
                                  variable_dtype=mtf.VariableDType(tf.float32),
                                  sequence_id=mtf.ones(mesh,
                                                       mtf.Shape([length_dim])),
                                  position=mtf.range(mesh,
                                                     length_dim,
                                                     dtype=tf.int32))
    return layer_stack, context
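A minimal sketch of how this test helper might be driven; mtf.Graph and mtf.Mesh are the standard Mesh TensorFlow entry points, and the shape values below are arbitrary small sizes chosen for illustration.

graph = mtf.Graph()
mesh = mtf.Mesh(graph, "dummy_mesh")
# shapes = (outer_batch_size, batch_size, length, d_model)
layer_stack, context = create_dummy_model(mesh, shapes=(1, 2, 8, 16))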
Example 11
def dense_relu_dense_from_hparams(hparams):
    return transformer_layers.DenseReluDense(hidden_size=hparams.d_ff,
                                             dropout_rate=hparams.relu_dropout)
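A minimal sketch showing how a helper like this could feed into a layer stack alongside the attention layers used in the other examples; the hparams fields referenced (num_heads, d_kv, attention_dropout, num_hidden_layers) follow the naming already used above, and layer_stack_from_hparams is a hypothetical name.

def layer_stack_from_hparams(hparams):
    # Hypothetical composition: alternate self-attention with the
    # feed-forward layer produced by dense_relu_dense_from_hparams.
    return transformer.LayerStack([
        transformer_layers.SelfAttention(
            num_heads=hparams.num_heads,
            key_value_size=hparams.d_kv,
            dropout_rate=hparams.attention_dropout),
        dense_relu_dense_from_hparams(hparams),
    ] * hparams.num_hidden_layers)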