def layer_stack(include_encdec_attention):
  """Create a layer stack.

  Args:
    include_encdec_attention: a boolean

  Returns:
    a LayerStack
  """
  ret = []
  for _ in range(FLAGS.num_layers):
    ret.append(
        transformer_layers.SelfAttention(
            num_heads=FLAGS.num_heads,
            key_value_size=FLAGS.d_kv,
            attention_kwargs={"dropout_rate": FLAGS.dropout}))
    if include_encdec_attention:
      ret.append(
          transformer_layers.EncDecAttention(
              num_heads=FLAGS.num_heads,
              key_value_size=FLAGS.d_kv,
              attention_kwargs={"dropout_rate": FLAGS.dropout}))
    ret.append(
        transformer_layers.DenseReluDense(
            hidden_size=FLAGS.d_ff,
            dropout_rate=FLAGS.dropout))
  return transformer.LayerStack(ret)
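# Usage sketch (not from the source): assuming the surrounding module has
# defined the absl flags referenced above (num_layers, num_heads, d_kv,
# d_ff, dropout), encoder and decoder stacks differ only in whether they
# include encoder-decoder cross-attention.
encoder_stack = layer_stack(include_encdec_attention=False)
decoder_stack = layer_stack(include_encdec_attention=True)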
def layer_stack_from_hparams(hparams, prefix):
  """Create a layer stack based on the hyperparameter values."""
  layers = hparams.get(prefix + "layers")
  return transformer.LayerStack(
      [layers_registry[l](hparams, prefix) for l in layers],
      dropout_rate=hparams.layer_prepostprocess_dropout,
      norm_epsilon=hparams.norm_epsilon)
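# Illustrative sketch only (hypothetical, not the source's registry):
# layers_registry is assumed to map short layer names to constructors with
# the (hparams, prefix) signature used above, along these lines:
hypothetical_layers_registry = {
    "self_att": self_attention_from_hparams,
    "drd": dense_relu_dense_from_hparams,
}
# With hparams.decoder_layers = ["self_att", "drd"], the stack would then
# alternate self-attention and feed-forward layers.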
def mtr_lm_v1(num_heads=8, num_memory_heads=0):
  """Model incorporating mixture-of-experts, local and global attention.

  ~6B parameters

  32 experts in 3 hierarchical moe layers.

  Args:
    num_heads: an optional integer
    num_memory_heads: an optional integer

  Returns:
    an hparams object
  """
  hparams = mtr_lm_dense(0)
  local_att = transformer_layers.LocalSelfAttention(
      num_heads=num_heads,
      num_memory_heads=num_memory_heads,
      key_value_size=128)
  att = transformer_layers.SelfAttention(
      num_heads=num_heads,
      num_memory_heads=num_memory_heads,
      key_value_size=128)
  drd = transformer_layers.DenseReluDense(hidden_size=2048)
  hmoe = moe.MoE2D(expert_x=8, expert_y=4, hidden_size=32768)
  # Repeat the 8-layer block four times and drop the trailing MoE layer,
  # leaving 31 layers, 3 of which are hierarchical MoE layers.
  hparams.layer_stack = transformer.LayerStack(
      ([local_att, local_att, drd, att, drd, local_att, local_att, hmoe]
       * 4)[:-1])
  hparams.mesh_shape = "b0:4;b1:8"
  hparams.layout = "outer_batch:b0;inner_batch:b1,expert_x:b1,expert_y:b0"
  hparams.outer_batch_size = 4
  return hparams
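# Usage sketch (not from the source): instantiate and inspect the config.
hparams = mtr_lm_v1()
# The stack is ([8-layer block] * 4)[:-1] = 31 layers; dropping the last
# entry leaves 3 MoE2D layers, each with 8 * 4 = 32 experts.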
def layer_stack_from_hparams(hparams, prefix):
  """Create a layer stack based on the hyperparameter values."""
  return transformer.LayerStack(
      [self_attention_from_hparams(hparams, prefix),
       dense_relu_dense_from_hparams(hparams)
      ] * hparams.get(prefix + "num_layers"),
      dropout_rate=hparams.layer_prepostprocess_dropout,
      norm_epsilon=hparams.norm_epsilon)
def my_layer_stack(hparams):
  """Create a stack of alternating self-attention and feed-forward layers."""
  return transformer.LayerStack([
      transformer_layers.SelfAttention(
          num_heads=hparams.num_heads,
          key_value_size=hparams.d_kv,
          dropout_rate=hparams.attention_dropout),
      transformer_layers.DenseReluDense(
          hidden_size=hparams.d_ff,
          dropout_rate=hparams.layer_prepostprocess_dropout),
  ] * hparams.num_hidden_layers)
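# Hedged usage sketch: plugging my_layer_stack into a decoder-only
# Unitransformer. The constructor arguments mirror those used in
# get_dummy_decoder_context below; the vocab sizes and hparams fields
# (d_model, max_length) here are placeholders, not values from the source.
model = transformer.Unitransformer(
    layer_stack=my_layer_stack(hparams),
    d_model=hparams.d_model,
    input_vocab_size=32000,   # placeholder
    output_vocab_size=32000,  # placeholder
    autoregressive=True,
    max_length=hparams.max_length)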
def mtf_unitransformer_all_layers_tiny():
  """Test out all the layers on local CPU."""
  hparams = mtf_unitransformer_tiny()
  hparams.layer_stack = transformer.LayerStack(
      [transformer_layers.SelfAttention(num_heads=4),
       transformer_layers.LocalSelfAttention(num_heads=4),
       moe.MoE1D(num_experts=4, hidden_size=512),
       moe.MoE2D(expert_x=4, expert_y=4, hidden_size=512),
       transformer_layers.DenseReluDense(hidden_size=512)])
  return hparams
def decoder_layer_stack_from_hparams(hparams, prefix):
  """Create a decoder layer stack based on the hyperparameter values."""
  if prefix != "decoder_":
    raise ValueError("prefix should be 'decoder_'")
  return transformer.LayerStack(
      [self_attention_from_hparams(hparams, prefix),
       enc_dec_attention_from_hparams(hparams, prefix),
       dense_relu_dense_from_hparams(hparams)
      ] * hparams.get(prefix + "num_layers"),
      dropout_rate=hparams.layer_prepostprocess_dropout,
      norm_epsilon=hparams.norm_epsilon)
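# Hedged usage note: the guard above hard-codes the "decoder_" prefix, so
# the hparams object is expected to carry keys such as decoder_num_layers
# (read via hparams.get(prefix + "num_layers")).
stack = decoder_layer_stack_from_hparams(hparams, "decoder_")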
def get_dummy_decoder_context(converter,
                              batch=2,
                              d_model=6,
                              length=4,
                              mode="incremental",
                              initial_position=None,
                              state=None,
                              inputs=None):
  """Build a transformer.Context around a dummy autoregressive model."""
  batch_dim = mtf.Dimension("batch", batch)
  length_dim = mtf.Dimension("length", length)

  # Set up a dummy model with an empty layer stack.
  layer_stack = transformer.LayerStack(layers=[])
  model = transformer.Unitransformer(
      d_model=d_model,
      input_vocab_size=10,  # dummy value
      output_vocab_size=10,  # dummy value
      autoregressive=True,
      max_length=length,
      layer_stack=layer_stack)

  if state is not None:
    state_mtf = converter.convert_np_array_to_mtf_tensor(
        state, dtype=tf.float32, dim_names=["batch", "length", "d_model"])
    states = [state_mtf]
  else:
    states = None

  if initial_position:
    initial_position = mtf.constant(
        converter.mesh,
        initial_position,
        shape=mtf.Shape([batch_dim]),
        dtype=tf.int32)

  if inputs is not None:
    inputs = converter.convert_np_array_to_mtf_tensor(
        inputs, dim_names=["batch", "length"])

  context = transformer.Context(
      model=model,
      mode=mode,
      states=states,
      new_states=[],
      mesh=converter.mesh,
      batch_dims=[batch_dim],
      length_dim=length_dim,
      variable_dtype=mtf.VariableDType(tf.float32),
      sequence_id=1,
      inputs=inputs,
      initial_position=initial_position)
  return context
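# Hedged example of how a test might use get_dummy_decoder_context,
# assuming `converter` is a test utility that exposes a mesh plus
# numpy<->mtf conversion helpers (as the calls above imply).
context = get_dummy_decoder_context(
    converter,
    batch=2,
    length=4,
    initial_position=2)  # decode as if 2 tokens were already consumed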
def mtf_transformer2_all_layers_tiny():
  """Test out all the layers on local CPU."""
  hparams = mtf_transformer2_base()
  hparams.batch_size = 2
  hparams.mesh_shape = ""
  hparams.d_model = 128
  hparams.layer_stack = transformer.LayerStack([
      transformer_layers.SelfAttention(num_heads=4),
      transformer_layers.LocalSelfAttention(num_heads=4),
      moe.MoE1D(num_experts=4, hidden_size=512),
      moe.MoE2D(expert_x=4, expert_y=4, hidden_size=512),
      transformer_layers.DenseReluDense(hidden_size=512)
  ])
  return hparams
def default_layer_stack_with_encoder_attention(hparams):
  """Create a decoder layer stack that attends to an encoder."""
  return transformer.LayerStack(
      [
          transformer_layers.SelfAttention(
              num_heads=hparams.num_heads,
              key_value_size=hparams.d_kv,
              dropout_rate=hparams.attention_dropout),
          transformer_layers.EncDecAttention(
              num_heads=hparams.num_heads,
              key_value_size=hparams.d_kv,
              dropout_rate=hparams.attention_dropout),
          transformer_layers.DenseReluDense(
              hidden_size=hparams.d_ff,
              dropout_rate=hparams.relu_dropout),
      ] * hparams.num_hidden_layers,
      dropout_rate=hparams.layer_prepostprocess_dropout,
      norm_epsilon=hparams.norm_epsilon)
def simple_layer_stack(include_encdec_attention,
                       num_layers=6,
                       d_ff=2048,
                       num_heads=8,
                       d_kv=128,
                       dropout_rate=0.1):
  """Create a layer stack.

  Args:
    include_encdec_attention: a boolean
    num_layers: an integer
    d_ff: an integer
    num_heads: an integer
    d_kv: an integer
    dropout_rate: a float

  Returns:
    a LayerStack
  """
  ret = []
  for _ in range(num_layers):
    ret.append(
        transformer_layers.SelfAttention(
            num_heads=num_heads,
            key_value_size=d_kv,
            attention_kwargs={"dropout_rate": dropout_rate}))
    if include_encdec_attention:
      ret.append(
          transformer_layers.EncDecAttention(
              num_heads=num_heads,
              key_value_size=d_kv,
              attention_kwargs={"dropout_rate": dropout_rate}))
    ret.append(
        transformer_layers.DenseReluDense(
            hidden_size=d_ff,
            dropout_rate=dropout_rate))
  return transformer.LayerStack(ret)
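# Usage sketch (not from the source): simple_layer_stack is the flag-free
# counterpart of layer_stack above, so all sizes can be passed explicitly.
encoder_stack = simple_layer_stack(include_encdec_attention=False)
decoder_stack = simple_layer_stack(include_encdec_attention=True,
                                   num_layers=12,
                                   dropout_rate=0.2)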