Example #1
    @classmethod
    def Params(cls):
        """Configs for `MTEncoderBiRNN`."""
        p = super(MTEncoderBiRNN, cls).Params()
        p.Define('emb', layers.EmbeddingLayer.Params(),
                 'Embedding layer params.')
        p.Define('lstm_tpl', rnn_cell.LSTMCellSimple.Params(),
                 'Configs template for the RNN layer.')
        p.Define('proj_tpl', layers.ProjectionLayer.Params(),
                 'Configs template for the projection layer.')
        p.Define('lstm_cell_size', 512, 'LSTM cell size for the RNN layer.')
        p.Define('num_lstm_layers', 8, 'Number of rnn layers to create')
        p.Define('dropout_prob', 0.0, 'Prob at which we do dropout.')
        p.Define('residual_start', 2,
                 'Layer at which we start residual connections.')
        p.Define('encoder_out_dim', 1024, 'Depth of the encoder output.')
        p.Define('bidi_rnn_type', 'func', 'Options: func. '
                 'func: BidirectionalFRNN. ')
        p.Define('cc_schedule', None, 'Clipping cap schedule.')

        p.Define('is_transparent', False,
                 'If set, outputs a merger of layer outputs.')
        p.Define(
            'transparent_merger_tpl',
            layers.WeightedSumLayer.Params().Set(add_weight_summaries=True),
            'Merger op for layer outputs.')
        p.Define(
            'packed_input', False, 'If True, encoder and all layers support '
            'multiple examples in a single sequence.')

        disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
        default_params_init = py_utils.WeightInit.Uniform(0.04)

        # Default config for the embedding.
        p.emb.vn = disable_vn
        p.emb.vocab_size = 32000
        p.emb.embedding_dim = 1024
        p.emb.max_num_shards = 16
        p.emb.params_init = default_params_init

        p.lstm_tpl.vn = disable_vn
        p.lstm_tpl.params_init = default_params_init
        return p
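Below is a minimal usage sketch, not part of the original source: it assumes the class lives in lingvo's lingvo/tasks/mt/encoder.py, and it only overrides fields that `Params()` defines above, with purely illustrative values.

# Hypothetical override of the MTEncoderBiRNN.Params() defaults shown above.
from lingvo.tasks.mt import encoder as mt_encoder  # assumed module path

enc_p = mt_encoder.MTEncoderBiRNN.Params()
enc_p.name = 'mt_birnn_encoder'  # lingvo layers are named before instantiation
enc_p.emb.vocab_size = 16000     # shrink the 32000-entry default vocabulary
enc_p.lstm_cell_size = 256       # smaller cells for a toy configuration
enc_p.num_lstm_layers = 2
enc_p.dropout_prob = 0.1
enc_p.encoder_out_dim = 512
enc_p.is_transparent = True      # emit a weighted merger of layer outputs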
Example #2
    @classmethod
    def Params(cls):
        """Configs for `MTEncoderV1`."""
        p = super(MTEncoderV1, cls).Params()
        p.Define('emb', layers.EmbeddingLayer.Params(),
                 'Embedding layer params.')
        p.Define('lstm_tpl', rnn_cell.LSTMCellSimple.Params(),
                 'Configs template for the RNN layer.')
        p.Define(
            'lstm_tpl_uni', None,
            'Override configs template for the unidirectional RNN layers.')
        p.Define('lstm_tpl_bidi', None,
                 'Override configs template for the bidirectional RNN layer.')
        p.Define('lstm_cell_size', 1024, 'LSTM cell size for the RNN layer.')
        p.Define('num_lstm_layers', 8, 'Number of rnn layers to create')
        p.Define('dropout_prob', 0.0, 'Prob at which we do dropout.')
        p.Define('unidi_rnn_type', 'func', 'Options: func. ' 'func: FRNN.')
        p.Define('bidi_rnn_type', 'func', 'Options: func. '
                 'func: BidirectionalFRNN. ')
        p.Define('cc_schedule', None, 'Clipping cap schedule.')
        p.Define(
            'packed_input', False, 'If True, encoder and all layers support '
            'multiple examples in a single sequence.')

        disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
        default_params_init = py_utils.WeightInit.Uniform(0.04)

        # Default config for the embedding.
        p.emb.vn = disable_vn
        p.emb.vocab_size = 32000
        p.emb.embedding_dim = 1024
        p.emb.max_num_shards = 16
        p.emb.params_init = default_params_init

        for tpl in [p.lstm_tpl, p.lstm_tpl_uni, p.lstm_tpl_bidi]:
            if tpl is not None:
                tpl.vn = disable_vn
                tpl.params_init = default_params_init
        return p
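A hedged sketch of the override-template pattern above: `lstm_tpl_uni` and `lstm_tpl_bidi` default to `None`, so the loop at the end of `Params()` skips them; a template assigned later (here a layer-normalized cell, assuming lingvo provides `rnn_cell.LayerNormalizedLSTMCellSimple`) needs its own `params_init`/`vn`.

# Illustrative only: swap the bidirectional cell template after Params().
from lingvo.core import py_utils, rnn_cell
from lingvo.tasks.mt import encoder as mt_encoder  # assumed module path

enc_p = mt_encoder.MTEncoderV1.Params()
enc_p.name = 'mt_encoder_v1'
enc_p.lstm_tpl_bidi = rnn_cell.LayerNormalizedLSTMCellSimple.Params().Set(
    # The defaults applied inside Params() do not reach this late override,
    # so set the initializer explicitly.
    params_init=py_utils.WeightInit.Uniform(0.04))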
Example #3
def SetupTransformerBatchMajorEncoderV1(model_dim,
                                        vocab_size,
                                        num_layers,
                                        num_heads,
                                        hidden_dim,
                                        residual_dropout_prob=0.1,
                                        input_dropout_prob=0.0,
                                        atten_dropout_prob=0.0,
                                        relu_dropout_prob=0.0,
                                        activation='RELU',
                                        add_unnormalized_residuals=False,
                                        packed_input=False,
                                        use_fast_projection_layer=False,
                                        enable_per_dim_scale=True,
                                        use_fused_layernorm=False,
                                        use_bf16_activations=False,
                                        use_bias=True,
                                        xla_num_partitions=None):
    """Common setup for transformer batch major model encoder.

  Args:
   model_dim: specifies dimension of transformer layers, token embeddings, and
     positional embeddings as well context vectors (attention values).
   vocab_size: for token embeddings.
   num_layers: number of transformer layers.
   num_heads: number of attention heads.
   hidden_dim: in transformer feedforward layer.
   residual_dropout_prob: used in transformer feedforward and attention layer.
   input_dropout_prob: input dropout.
   atten_dropout_prob: used in attention layer.
   relu_dropout_prob: used in transformer feedforward layer.
   activation: Non-linearity for feed-forward layers. (Unused)
   add_unnormalized_residuals: If set, uses un-normalized residuals in
     TransformerAttentionLayer
   packed_input: Whether to enable packed input.
   use_fast_projection_layer: Whether to use fast projection layer to remove the
     data formatting overheads.
   enable_per_dim_scale: Whether to enable per_dim_scale.
   use_fused_layernorm: Whether to use fused layer normalization.
   use_bf16_activations: Whether to use bfloat16 for activations.
   use_bias: Whether to use bias for projection layer.

  Returns:
   Encoder params.
  """
    # TODO(shibow) Add 'GELU' activation option for batch major encoder.
    # GELU should be used to reduce quality loss when casting weights from fp32
    # to bf16 for inference (b/123993529).
    del activation
    # EncoderV1 already uses the fast projection layer by default.
    del use_fast_projection_layer

    disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
    default_params_init = py_utils.WeightInit.Xavier(1.0)
    emb_params_init = py_utils.WeightInit.Gaussian(1.0 / math.sqrt(model_dim))

    # Encoder
    encoder_params = encoder.TransformerBatchMajorEncoder.Params()
    encoder_params.params_init = default_params_init
    encoder_params.model_dim = model_dim
    encoder_params.input_dropout_prob = input_dropout_prob
    encoder_params.packed_input = packed_input
    encoder_params.use_fused_layernorm = use_fused_layernorm

    encoder_params.token_emb.Set(embedding_dim=model_dim,
                                 max_num_shards=16,
                                 params_init=emb_params_init,
                                 vocab_size=vocab_size,
                                 vn=disable_vn,
                                 scale_sqrt_depth=True)

    encoder_params.position_emb.Set(embedding_dim=model_dim,
                                    trainable_scaling=False,
                                    vn=disable_vn)

    builder_params = self_attention_layer.Builder.Params().Set(
        model_dim=model_dim,
        num_heads=num_heads,
        ff_hidden_dim=hidden_dim,
        residual_dropout_prob=residual_dropout_prob,
        relu_dropout_prob=relu_dropout_prob,
        atten_dropout_prob=atten_dropout_prob,
        selfatten_add_unnormalized_input=add_unnormalized_residuals,
        selfatten_enable_value_proj=True,
        packed_input=packed_input,
        enable_per_dim_scale=enable_per_dim_scale,
        use_fused_layernorm=use_fused_layernorm,
        fprop_dtype=tf.bfloat16 if use_bf16_activations else None,
        use_bias=use_bias,
        xla_num_partitions=xla_num_partitions)
    stack = (builder_params.Instantiate().TransformerStack(
        name='transformer_stack', num_layers=num_layers))
    encoder_params.transformer_stack = (
        self_attention_layer.StackedTransformerEncoderLayers.Cast(stack))
    if add_unnormalized_residuals:
        encoder_params.final_layer_norm = True

    return encoder_params
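A usage sketch under two assumptions: that this helper lives in lingvo's lingvo/tasks/mt/base_config.py (hence the import below) and that the argument values, which are illustrative, match your data pipeline.

# Illustrative call; import path and values are assumptions, not prescriptions.
from lingvo.tasks.mt import base_config

enc_p = base_config.SetupTransformerBatchMajorEncoderV1(
    model_dim=1024,
    vocab_size=32000,
    num_layers=6,
    num_heads=16,
    hidden_dim=4096,
    packed_input=True,                 # multiple examples per sequence
    add_unnormalized_residuals=True,   # also enables final_layer_norm above
    use_bf16_activations=True)         # sets fprop_dtype=tf.bfloat16 above
enc_p.name = 'encoder'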
Example #4
def SetupTransformerBatchMajorDecoderV1(model_dim,
                                        vocab_size,
                                        num_layers,
                                        num_heads,
                                        hidden_dim,
                                        residual_dropout_prob=0.1,
                                        input_dropout_prob=0.0,
                                        atten_dropout_prob=0.0,
                                        relu_dropout_prob=0.0,
                                        label_smoothing_uncertainty=0.1,
                                        activation='RELU',
                                        add_unnormalized_residuals=False,
                                        atten_hidden_dim=0,
                                        packed_input=False,
                                        use_fast_projection_layer=False,
                                        enable_per_dim_scale=True,
                                        use_fused_layernorm=False,
                                        use_bias=True):
    """Common setup for batch major transformer model decoder.

  Args:
   model_dim: specifies dimension of transformer layers, token embeddings, and
     positional embeddings as well context vectors (attention values).
   vocab_size: for token embeddings.
   num_layers: number of transformer layers.
   num_heads: number of attention heads.
   hidden_dim: in transformer feedforward layer.
   residual_dropout_prob: used in transformer feedforward and attention layer.
   input_dropout_prob: input dropout.
   atten_dropout_prob: used in attention layer.
   relu_dropout_prob: used in transformer feedforward layer.
   label_smoothing_uncertainty: A float, representing the uncertainty in label
     smoothing. If this value is 0, no label smoothing will be applied.
   activation: Non-linearity for feed-forward layers.
   add_unnormalized_residuals: If set, uses un-normalized residuals in
     TransformerAttentionLayer
   atten_hidden_dim: Explicitly set attention hidden dim. If 0, default is
     model_dim.
   packed_input: Whether to enable packed input.
   use_fast_projection_layer: Whether to use fast projection layer to remove the
     data formatting overheads.
   enable_per_dim_scale: Whether to enable per_dim_scale.
   use_fused_layernorm: Whether to use fused layer normalization.
   use_bias: Whether to use bias for projection layer.

  Returns:
   Decoder params.
  """
    disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
    default_params_init = py_utils.WeightInit.Xavier(1.0)
    emb_params_init = py_utils.WeightInit.Gaussian(1.0 / math.sqrt(model_dim))

    # Decoder
    decoder_params = decoder.TransformerBatchMajorDecoder.Params()

    decoder_params.source_dim = model_dim
    decoder_params.model_dim = model_dim
    decoder_params.num_trans_layers = num_layers
    decoder_params.input_dropout_prob = input_dropout_prob
    decoder_params.packed_input = packed_input
    decoder_params.use_fused_layernorm = use_fused_layernorm

    decoder_params.token_emb.Set(vocab_size=vocab_size,
                                 embedding_dim=model_dim,
                                 max_num_shards=16,
                                 params_init=emb_params_init,
                                 vn=disable_vn,
                                 scale_sqrt_depth=True)

    decoder_params.position_emb.Set(embedding_dim=model_dim,
                                    trainable_scaling=False,
                                    vn=disable_vn)

    decoder_params.trans_decoder_tpl.packed_input = packed_input
    decoder_params.trans_decoder_tpl.input_dim = model_dim
    decoder_params.trans_decoder_tpl.tr_atten_tpl.Set(
        input_dim=model_dim,
        num_heads=num_heads,
        residual_dropout_prob=residual_dropout_prob,
        atten_dropout_prob=atten_dropout_prob,
        params_init=default_params_init,
        add_unnormalized_input=add_unnormalized_residuals,
        hidden_dim=atten_hidden_dim,
        vn=disable_vn)

    decoder_params.trans_decoder_tpl.tr_atten_tpl.ln_tpl.Set(
        use_fused_layernorm=use_fused_layernorm)
    decoder_params.trans_decoder_tpl.tr_atten_tpl.atten_tpl.Set(
        vn=disable_vn,
        packed_input=packed_input,
        enable_per_dim_scale=enable_per_dim_scale,
        use_bias=use_bias)
    decoder_params.trans_decoder_tpl.tr_fflayer_tpl.Set(
        input_dim=model_dim,
        hidden_dim=hidden_dim,
        residual_dropout_prob=residual_dropout_prob,
        relu_dropout_prob=relu_dropout_prob,
        params_init=default_params_init,
        vn=disable_vn,
        activation=activation)
    decoder_params.trans_decoder_tpl.tr_fflayer_tpl.ln_tpl.Set(
        use_fused_layernorm=use_fused_layernorm)
    decoder_params.trans_decoder_tpl.tr_fflayer_tpl.fflayer_tpl.projection.Set(
        use_einsum=use_fast_projection_layer)

    if add_unnormalized_residuals:
        decoder_params.final_layer_norm = True

    decoder_params.softmax.Set(num_classes=vocab_size,
                               vn=disable_vn,
                               params_init=emb_params_init,
                               num_shards=16)

    decoder_params.per_word_avg_loss = True
    decoder_params.label_smoothing = layers.UniformLabelSmoother.Params()
    decoder_params.label_smoothing.num_classes = vocab_size
    decoder_params.label_smoothing.uncertainty = label_smoothing_uncertainty

    return decoder_params
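A matching decoder sketch under the same import-path assumption; note the docstring above: `label_smoothing_uncertainty=0` means no label smoothing is applied.

# Illustrative call pairing with the encoder sketch; model_dim and vocab_size
# must agree with the encoder side.
from lingvo.tasks.mt import base_config

dec_p = base_config.SetupTransformerBatchMajorDecoderV1(
    model_dim=1024,
    vocab_size=32000,
    num_layers=6,
    num_heads=16,
    hidden_dim=4096,
    packed_input=True,
    label_smoothing_uncertainty=0.0)   # per the docstring: no label smoothing
dec_p.name = 'decoder'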
Example #5
def SetupTransformerEncoder(model_dim,
                            vocab_size,
                            num_layers,
                            num_heads,
                            hidden_dim,
                            residual_dropout_prob=0.1,
                            input_dropout_prob=0.0,
                            atten_dropout_prob=0.0,
                            relu_dropout_prob=0.0,
                            is_transparent=False,
                            activation='RELU',
                            add_unnormalized_residuals=False,
                            atten_hidden_dim=0):
    """Common setup for transformer model encoder.

  Args:
   model_dim: specifies dimension of transformer layers, token embeddings,
    and positional embeddings as well context vectors (attention values).
   vocab_size: for token embeddings.
   num_layers: number of transformer layers.
   num_heads: number of attention heads.
   hidden_dim: in transformer feedforward layer.
   residual_dropout_prob: used in transformer feedforward and attention layer.
   input_dropout_prob: input dropout.
   atten_dropout_prob: used in attention layer.
   relu_dropout_prob: used in transformer feedforward layer.
   is_transparent: if set, outputs a merger of embeddings and layer outputs.
   activation: Non-linearity for feed-forward layers.
   add_unnormalized_residuals: If set, uses un-normalized residuals in
     TransformerAttentionLayer
   atten_hidden_dim: Explicitly set attention hidden dim.

  Returns:
   Encoder params.
  """
    disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
    default_params_init = py_utils.WeightInit.Xavier(1.0)
    emb_params_init = py_utils.WeightInit.Gaussian(1.0 / math.sqrt(model_dim))

    # Encoder
    encoder_params = encoder.TransformerEncoder.Params()

    encoder_params.token_emb.Set(embedding_dim=model_dim,
                                 max_num_shards=16,
                                 params_init=emb_params_init,
                                 vocab_size=vocab_size,
                                 vn=disable_vn,
                                 scale_sqrt_depth=True)

    encoder_params.position_emb.Set(embedding_dim=model_dim,
                                    trainable_scaling=False,
                                    vn=disable_vn)

    # Encoder TransformerStack params
    encoder_params.model_dim = model_dim
    encoder_params.transformer_stack.model_dim = model_dim
    encoder_params.transformer_stack.num_transformer_layers = num_layers
    encoder_params.input_dropout_prob = input_dropout_prob

    encoder_params.transformer_stack.transformer_tpl.tr_atten_tpl.Set(
        num_attention_heads=num_heads,
        residual_dropout_prob=residual_dropout_prob,
        atten_dropout_prob=atten_dropout_prob,
        params_init=default_params_init,
        add_unnormalized_input=add_unnormalized_residuals,
        atten_hidden_dim=atten_hidden_dim,
        vn=disable_vn)

    encoder_params.transformer_stack.transformer_tpl.tr_atten_tpl.atten_tpl.Set(
        num_attention_heads=num_heads,
        enable_ctx_pre_proj=True,
        enable_ctx_post_proj=True,
        context_dim=model_dim,
        vn=disable_vn)

    encoder_params.transformer_stack.transformer_tpl.tr_fflayer_tpl.Set(
        hidden_dim=hidden_dim,
        residual_dropout_prob=residual_dropout_prob,
        relu_dropout_prob=relu_dropout_prob,
        params_init=default_params_init,
        vn=disable_vn,
        activation=activation)

    if is_transparent:
        encoder_params.transformer_stack.is_transparent = True

    return encoder_params
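A short sketch of the non-batch-major encoder setup above, again assuming the base_config import path; `is_transparent=True` flips `transformer_stack.is_transparent` exactly as the last lines of the function show.

# Illustrative call; import path assumed.
from lingvo.tasks.mt import base_config

enc_p = base_config.SetupTransformerEncoder(
    model_dim=512,
    vocab_size=32000,
    num_layers=6,
    num_heads=8,
    hidden_dim=2048,
    is_transparent=True)   # merger of embeddings and layer outputs
enc_p.name = 'encoder'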
Example #6
def SetupTransformerDecoder(model_dim,
                            vocab_size,
                            num_layers,
                            num_heads,
                            hidden_dim,
                            residual_dropout_prob=0.1,
                            input_dropout_prob=0.0,
                            atten_dropout_prob=0.0,
                            relu_dropout_prob=0.0,
                            label_smoothing_uncertainty=0.1,
                            is_transparent=False,
                            activation='RELU',
                            add_unnormalized_residuals=False,
                            atten_hidden_dim=0):
    """Common setup for transformer model decoder."""
    disable_vn = py_utils.VariationalNoiseParams(1.0, False, False)
    default_params_init = py_utils.WeightInit.Xavier(1.0)
    emb_params_init = py_utils.WeightInit.Gaussian(1.0 / math.sqrt(model_dim))

    # Decoder
    decoder_params = decoder.TransformerDecoder.Params()
    decoder_params.source_dim = model_dim
    decoder_params.model_dim = model_dim
    decoder_params.num_trans_layers = num_layers
    decoder_params.input_dropout_prob = input_dropout_prob

    decoder_params.token_emb.Set(vocab_size=vocab_size,
                                 embedding_dim=model_dim,
                                 max_num_shards=16,
                                 params_init=emb_params_init,
                                 vn=disable_vn,
                                 scale_sqrt_depth=True)

    decoder_params.position_emb.Set(embedding_dim=model_dim,
                                    trainable_scaling=False,
                                    vn=disable_vn)

    decoder_params.trans_tpl.source_dim = model_dim
    decoder_params.trans_tpl.tr_atten_tpl.Set(
        source_dim=model_dim,
        num_attention_heads=num_heads,
        residual_dropout_prob=residual_dropout_prob,
        atten_dropout_prob=atten_dropout_prob,
        params_init=default_params_init,
        add_unnormalized_input=add_unnormalized_residuals,
        atten_hidden_dim=atten_hidden_dim,
        vn=disable_vn)

    decoder_params.trans_tpl.tr_atten_tpl.atten_tpl.Set(
        enable_ctx_pre_proj=True,
        enable_ctx_post_proj=True,
        context_dim=model_dim,
        vn=disable_vn)

    decoder_params.trans_tpl.tr_fflayer_tpl.Set(
        input_dim=model_dim,
        hidden_dim=hidden_dim,
        residual_dropout_prob=residual_dropout_prob,
        relu_dropout_prob=relu_dropout_prob,
        params_init=default_params_init,
        vn=disable_vn,
        activation=activation)

    decoder_params.softmax.Set(num_classes=vocab_size,
                               vn=disable_vn,
                               params_init=emb_params_init,
                               num_shards=16)

    decoder_params.per_word_avg_loss = True
    decoder_params.label_smoothing = layers.UniformLabelSmoother.Params()
    decoder_params.label_smoothing.num_classes = vocab_size
    decoder_params.label_smoothing.uncertainty = label_smoothing_uncertainty

    if is_transparent:
        decoder_params.is_transparent = True

    return decoder_params
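Finally, a sketch of adjusting the returned decoder params after setup; every field touched below is one the function above configures, the import path is assumed, and the values are illustrative only.

# Post-setup tweaks on the params returned above (import path assumed).
from lingvo.tasks.mt import base_config

dec_p = base_config.SetupTransformerDecoder(
    model_dim=512,
    vocab_size=32000,
    num_layers=6,
    num_heads=8,
    hidden_dim=2048,
    is_transparent=True)
dec_p.name = 'decoder'
dec_p.label_smoothing.uncertainty = 0.2   # stronger smoothing than the 0.1 default
dec_p.per_word_avg_loss = False           # switch off per-word loss averaging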