def layer_stack(include_encdec_attention):
  """Create a layer stack.

  Args:
    include_encdec_attention: a boolean

  Returns:
    a LayerStack
  """
  ret = []
  # `range` instead of the Python-2-only `xrange`: behavior is identical when
  # iterating, and FLAGS.num_layers is small, so materializing it is free.
  for _ in range(FLAGS.num_layers):
    ret.append(
        transformer_layers.SelfAttention(
            num_heads=FLAGS.num_heads,
            key_value_size=FLAGS.d_kv,
            attention_kwargs={"dropout_rate": FLAGS.dropout}))
    if include_encdec_attention:
      ret.append(
          transformer_layers.EncDecAttention(
              num_heads=FLAGS.num_heads,
              key_value_size=FLAGS.d_kv,
              attention_kwargs={"dropout_rate": FLAGS.dropout}))
    ret.append(
        transformer_layers.DenseReluDense(
            hidden_size=FLAGS.d_ff,
            dropout_rate=FLAGS.dropout))
  return transformer.LayerStack(ret)
def mtr_lm_v1(num_heads=8, num_memory_heads=0):
  """Model incorporating mixture-of-experts, local and global attention.

  ~6B parameters

  32 experts in 3 hierarchichal moe layers.

  Args:
    num_heads: an optional integer
    num_memory_heads: an optional integer

  Returns:
    a hparams
  """
  hparams = mtr_lm_dense(0)
  local_att = transformer_layers.LocalSelfAttention(
      num_heads=num_heads,
      num_memory_heads=num_memory_heads,
      key_value_size=128)
  att = transformer_layers.SelfAttention(
      num_heads=num_heads,
      num_memory_heads=num_memory_heads,
      key_value_size=128)
  drd = transformer_layers.DenseReluDense(hidden_size=2048)
  hmoe = moe.MoE2D(expert_x=8, expert_y=4, hidden_size=32768)
  # Repeat one eight-layer pattern four times, then drop the final element
  # (the trailing hierarchical-MoE layer of the last repetition).
  pattern = [local_att, local_att, drd, att, drd, local_att, local_att, hmoe]
  hparams.layer_stack = transformer.LayerStack((pattern * 4)[:-1])
  hparams.mesh_shape = "b0:4;b1:8"
  hparams.layout = "outer_batch:b0;inner_batch:b1,expert_x:b1,expert_y:b0"
  hparams.outer_batch_size = 4
  return hparams
def __init__(self,
             radius=128,
             num_heads=8,
             num_memory_heads=0,
             key_value_size=128,
             shared_kv=False,
             dropout_rate=0.0,
             attention_kwargs=None,
             downsample_query=2,
             low_rank_features=32,
             project_kv=True,
             use_ffn=True,
             num_memory_slots=0,
             structured=False,
             pre_attention=False,
             local_gate=False,
             norm=False,
             pos_att=False,
             conv_type=None,
             query_func="linear",
             pool_func="max",
             local_attention=False,
             use_offsets=False,
             consider_chars_as_blocks=False,
             use_block_pos_embedding=False,
             canine_mode=False,
             filter_size=5,
             block_mixing_mode=None,
             rank_activation="softmax",
             gbst_pool="mean"):
  """Stores configuration; attention arguments go to the base-class init."""
  # Snapshot the arguments now, before any other locals exist, so the bulk
  # attribute copy below can look each one up by name.
  args = dict(locals())
  super(GradientSubwordLayerV2, self).__init__(num_heads, num_memory_heads,
                                               key_value_size, shared_kv,
                                               dropout_rate, attention_kwargs)
  # Every remaining constructor argument is stored verbatim on the instance.
  for name in ("radius", "downsample_query", "low_rank_features", "project_kv",
               "use_ffn", "num_memory_slots", "structured", "pre_attention",
               "local_gate", "norm", "pos_att", "conv_type", "query_func",
               "pool_func", "local_attention", "use_offsets",
               "consider_chars_as_blocks", "use_block_pos_embedding",
               "canine_mode", "filter_size", "block_mixing_mode",
               "rank_activation", "gbst_pool"):
    setattr(self, name, args[name])
  # A feed-forward sublayer is only materialized when requested.
  if self.use_ffn:
    self.ffn = transformer_layers.DenseReluDense()
def my_layer_stack(hparams):
  """Build a stack of alternating self-attention and feed-forward layers."""
  # Note: list multiplication repeats the SAME two layer objects per hidden
  # layer, exactly as the stack was originally constructed.
  layer_pair = [
      transformer_layers.SelfAttention(
          num_heads=hparams.num_heads,
          key_value_size=hparams.d_kv,
          dropout_rate=hparams.attention_dropout),
      transformer_layers.DenseReluDense(
          hidden_size=hparams.d_ff,
          dropout_rate=hparams.layer_prepostprocess_dropout),
  ]
  return transformer.LayerStack(layer_pair * hparams.num_hidden_layers)
def mtf_unitransformer_all_layers_tiny():
  """Test out all the layers on local CPU."""
  hparams = mtf_unitransformer_tiny()
  # One instance of every layer type, so a single forward pass covers them all.
  all_layers = [
      transformer_layers.SelfAttention(num_heads=4),
      transformer_layers.LocalSelfAttention(num_heads=4),
      moe.MoE1D(num_experts=4, hidden_size=512),
      moe.MoE2D(expert_x=4, expert_y=4, hidden_size=512),
      transformer_layers.DenseReluDense(hidden_size=512),
  ]
  hparams.layer_stack = transformer.LayerStack(all_layers)
  return hparams
def mtf_transformer2_all_layers_tiny():
  """Test out all the layers on local CPU."""
  hparams = mtf_transformer2_base()
  # Shrink the model and disable the mesh so it runs on a local CPU.
  hparams.batch_size = 2
  hparams.mesh_shape = ""
  hparams.d_model = 128
  # One instance of every layer type, so a single forward pass covers them all.
  all_layers = [
      transformer_layers.SelfAttention(num_heads=4),
      transformer_layers.LocalSelfAttention(num_heads=4),
      moe.MoE1D(num_experts=4, hidden_size=512),
      moe.MoE2D(expert_x=4, expert_y=4, hidden_size=512),
      transformer_layers.DenseReluDense(hidden_size=512),
  ]
  hparams.layer_stack = transformer.LayerStack(all_layers)
  return hparams
def default_layer_stack_with_encoder_attention(hparams):
  """Decoder-side stack: self-attention, enc-dec attention, feed-forward."""
  # Note: list multiplication repeats the SAME three layer objects per hidden
  # layer, exactly as the stack was originally constructed.
  per_layer = [
      transformer_layers.SelfAttention(
          num_heads=hparams.num_heads,
          key_value_size=hparams.d_kv,
          dropout_rate=hparams.attention_dropout),
      transformer_layers.EncDecAttention(
          num_heads=hparams.num_heads,
          key_value_size=hparams.d_kv,
          dropout_rate=hparams.attention_dropout),
      transformer_layers.DenseReluDense(
          hidden_size=hparams.d_ff,
          dropout_rate=hparams.relu_dropout),
  ]
  return transformer.LayerStack(
      per_layer * hparams.num_hidden_layers,
      dropout_rate=hparams.layer_prepostprocess_dropout,
      norm_epsilon=hparams.norm_epsilon)
def simple_layer_stack(include_encdec_attention,
                       num_layers=6,
                       d_ff=2048,
                       num_heads=8,
                       d_kv=128,
                       dropout_rate=0.1):
  """Create a layer stack.

  Args:
    include_encdec_attention: a boolean
    num_layers: an integer
    d_ff: an integer
    num_heads: an integer
    d_kv: an integer
    dropout_rate: a float

  Returns:
    a LayerStack
  """
  ret = []
  # `range` instead of the Python-2-only `xrange`: behavior is identical when
  # iterating, and num_layers is small, so materializing it is free.
  for _ in range(num_layers):
    ret.append(
        transformer_layers.SelfAttention(
            num_heads=num_heads,
            key_value_size=d_kv,
            attention_kwargs={"dropout_rate": dropout_rate}))
    if include_encdec_attention:
      ret.append(
          transformer_layers.EncDecAttention(
              num_heads=num_heads,
              key_value_size=d_kv,
              attention_kwargs={"dropout_rate": dropout_rate}))
    ret.append(
        transformer_layers.DenseReluDense(
            hidden_size=d_ff,
            dropout_rate=dropout_rate))
  return transformer.LayerStack(ret)
def dense_relu_dense_layer(hparams, prefix):
  """Return a DenseReluDense layer configured from hparams.

  `prefix` is accepted but ignored.
  """
  del prefix  # unused
  return transformer_layers.DenseReluDense(
      hidden_size=hparams.d_ff,
      dropout_rate=hparams.relu_dropout)
def create_dummy_model(mesh,
                       shapes,
                       n_blocks=2,
                       block_param_size_str="2_2",
                       block_repeat_size_str="1_1"):
  """Creates a dummy model and layer stack with 4-dimensional input."""
  assert len(shapes) == 4
  outer_batch_size, batch_size, length, d_model = shapes

  # Mesh dimensions for the 4-D input.
  batch_dim = mtf.Dimension("batch", batch_size)
  outer_batch_dim = mtf.Dimension("outer_batch", outer_batch_size)
  length_dim = mtf.Dimension("length", length)

  # Block sizes are encoded as underscore-separated integers, e.g. "2_2".
  block_param_size = [int(s) for s in block_param_size_str.split("_")]
  block_repeat_size = [int(s) for s in block_repeat_size_str.split("_")]

  # Sublayer schedules around/within/after each layer of the funnel stack.
  sublayers_initial = [
      transformer.sublayer_dropout,
  ]
  sublayers_per_layer = [
      transformer.sublayer_rms_norm,
      transformer.sublayer_call_layer,
      transformer.sublayer_dropout,
      transformer.sublayer_residual,
  ]
  sublayers_final = [
      transformer.sublayer_rms_norm,
      transformer.sublayer_dropout,
  ]

  # Repeat the (self-attention, feed-forward) pair once per sublayer.
  submodules = [
      transformer_layers.SelfAttention(),
      transformer_layers.DenseReluDense()
  ]
  n_sublayers = np.array(block_param_size).prod()
  layer_stack = funnel_transformer.FunnelTransformerLayerStack(
      layers=submodules * n_sublayers,
      n_blocks=n_blocks,
      block_param_size=block_param_size,
      block_repeat_size=block_repeat_size,
      sublayers_initial=sublayers_initial,
      sublayers_per_layer=sublayers_per_layer,
      sublayers_final=sublayers_final)

  model = transformer.Unitransformer(
      input_vocab_size=10,
      output_vocab_size=10,
      autoregressive=False,
      max_length=8,
      d_model=d_model,
      layer_stack=layer_stack)
  context = transformer.Context(
      model=model,
      mesh=mesh,
      batch_dims=[batch_dim, outer_batch_dim],
      length_dim=length_dim,
      variable_dtype=mtf.VariableDType(tf.float32),
      sequence_id=mtf.ones(mesh, mtf.Shape([length_dim])),
      position=mtf.range(mesh, length_dim, dtype=tf.int32))
  return layer_stack, context
def dense_relu_dense_from_hparams(hparams):
  """Construct a DenseReluDense layer using sizes/dropout from `hparams`."""
  return transformer_layers.DenseReluDense(
      hidden_size=hparams.d_ff,
      dropout_rate=hparams.relu_dropout)