Code example #1
File: reg_models.py  Project: entn-at/translate_0
def attn_over_sent_and_lex_2d(x_slices, pad_remover_combined, hparams):
    with tf.variable_scope("self_attention"):
        query_antecedent = common_layers.layer_preprocess(x_slices, hparams)
        y_slices = common_attention.multihead_attention_2d(
            query_antecedent=query_antecedent,
            memory_antecedent=None,
            total_key_depth=hparams.attention_key_channels
            or hparams.hidden_size,
            total_value_depth=hparams.attention_value_channels
            or hparams.hidden_size,
            output_depth=hparams.hidden_size,
            num_heads=hparams.num_heads,
            query_shape=(4, 4),
            memory_flange=(4, 4))
        x_slices = common_layers.layer_postprocess(x_slices, y_slices, hparams)
    with tf.variable_scope("ffn"):
        x0_slices = common_layers.layer_preprocess(x_slices, hparams)
        x0_slices, batch_size, sent_len, lex_cap, hid_dim = reshape_2d(
            x0_slices)
        y_slices = transformer.transformer_ffn_layer(x0_slices, hparams,
                                                     pad_remover_combined)
        y_slices = tf.reshape(y_slices,
                              [batch_size, sent_len, lex_cap, hid_dim])
        x_slices = common_layers.layer_postprocess(x_slices, y_slices, hparams)
    return x_slices
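
reshape_2d is a helper local to reg_models.py and is not reproduced on this page. Below is a minimal sketch of what it presumably does, inferred only from how its five return values are used above; this is an assumption for illustration, not the project's actual implementation.

import tensorflow as tf

def reshape_2d(x):
    """Hypothetical sketch: fold the lexicon axis into the length axis so the
    4-D tensor [batch, sent_len, lex_cap, hid_dim] becomes the 3-D
    [batch, sent_len * lex_cap, hid_dim] input that transformer_ffn_layer
    expects, and return the pieces needed to undo the reshape afterwards."""
    shape = tf.shape(x)
    batch_size, sent_len, lex_cap = shape[0], shape[1], shape[2]
    hid_dim = x.get_shape().as_list()[-1]  # hidden size is statically known
    x = tf.reshape(x, [batch_size, sent_len * lex_cap, hid_dim])
    return x, batch_size, sent_len, lex_cap, hid_dim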
Code example #2
def invertible_transformer_encoder_ffn_unit(x,
                                            hparams,
                                            nonpadding_mask=None,
                                            pad_remover=None,
                                            split_index=0):
    """Applies a feed-forward function which is parametrised for encoding.
  Args:
    x: input
    hparams: model hyper-parameters
    nonpadding_mask: optional Tensor with shape [batch_size, encoder_length]
    indicating what positions are not padding.  This is used
    to mask out padding in convolutional layers.  We generally only
    need this mask for "packed" datasets, because for ordinary datasets,
    no padding is ever followed by nonpadding.
    pad_remover: to mask out padding in convolutional layers (efficiency).
    split_index: index (0 or 1) of the channel-split half that the FFN transforms.
  Returns:
    the output tensor
  """

    with tf.variable_scope("ffn"):

        ##################
        ## CHANGE START ##
        ##################

        x_splits = tf.split(x, num_or_size_splits=2, axis=2)

        if hparams.transformer_ffn_type == "fc":
            y = transformer.transformer_ffn_layer(
                common_layers.layer_preprocess(x_splits[split_index], hparams),
                hparams,
                pad_remover,
                conv_padding="SAME",
                nonpadding_mask=nonpadding_mask)

        if hparams.transformer_ffn_type == "sepconv":
            assert nonpadding_mask is not None, (
                "The nonpadding_mask should be provided, otherwise the model uses "
                "the leaked padding information to estimate the length!")
            y = common_layers.sepconv_relu_sepconv(
                common_layers.layer_preprocess(x_splits[split_index], hparams),
                filter_size=hparams.filter_size,
                output_size=hparams.hidden_size,
                first_kernel_size=(3, 1),
                second_kernel_size=(5, 1),
                padding="SAME",
                nonpadding_mask=nonpadding_mask,
                dropout=hparams.relu_dropout)

        x_splits[1 - split_index] = common_layers.layer_postprocess(
            x_splits[1 - split_index], y, hparams)
        x = tf.concat(x_splits, axis=2)

        ##################
        ##  CHANGE END  ##
        ##################

    return x
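
The split along axis 2 is what makes this FFN unit usable inside an invertible encoder: only one half of the channels is transformed, and the result is added onto the other half, so the block can be undone by recomputing the transform and subtracting. A stripped-down sketch of that coupling pattern, where ffn stands in for the preprocess/FFN/postprocess pipeline above (an illustration, not the project's code):

import tensorflow as tf

def additive_coupling(x, ffn, split_index=0):
    """Transform one channel half, add it to the other half, re-concatenate.
    Because the transformed half passes through unchanged, the inverse is
    simply `other_half - ffn(same_half)`."""
    halves = tf.split(x, num_or_size_splits=2, axis=2)
    y = ffn(halves[split_index])                             # F(x_a)
    halves[1 - split_index] = halves[1 - split_index] + y    # x_b + F(x_a)
    return tf.concat(halves, axis=2)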
Code example #3
 def g(x):
     """g(x) for reversible layer, feed-forward layer."""
     old_hid_size = hparams.hidden_size
     hparams.hidden_size = old_hid_size // 2
     with tf.variable_scope("ffn"):
         y = transformer.transformer_ffn_layer(
             common_layers.layer_preprocess(x, hparams), hparams)
         y = common_layers.layer_postprocess(x, y, hparams)
     hparams.hidden_size = old_hid_size
     return y
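
This g is meant to be paired with an attention function f inside a reversible residual block (for example via tf.contrib.layers.rev_block in TF 1.x); hparams.hidden_size is temporarily halved because each function only sees one half of the channels. A minimal sketch of the additive reversible rule such f/g pairs plug into, independent of TensorFlow:

def rev_block_forward(x1, x2, f, g):
    """Additive reversible residual: y1 = x1 + f(x2), y2 = x2 + g(y1)."""
    y1 = x1 + f(x2)
    y2 = x2 + g(y1)
    return y1, y2

def rev_block_inverse(y1, y2, f, g):
    """Recompute the inputs from the outputs, so intermediate activations never
    need to be stored -- the memory saving that motivates halving hidden_size
    for f and g."""
    x2 = y2 - g(y1)
    x1 = y1 - f(x2)
    return x1, x2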
Code example #5
File: reg_models.py  Project: entn-at/translate_0
def attn_over_sent_and_lex_1d_dec(x, encoder_output,
                                  decoder_self_attention_bias,
                                  encoder_decoder_attention_bias, hparams):
    '''
    decoder_input: [batch_size, decoder_length, hidden_dim]
    encoder_output: [batch_size, input_length, hidden_dim]
    encoder_decoder_attention_bias: [batch_size, input_length]
    decoder_self_attention_bias: [batch_size, decoder_length]
    '''
    with tf.variable_scope("self_attention"):
        query_antecedent = common_layers.layer_preprocess(x, hparams)
        y = common_attention.multihead_attention(
            query_antecedent=query_antecedent,
            memory_antecedent=None,
            bias=decoder_self_attention_bias,
            total_key_depth=hparams.attention_key_channels
            or hparams.hidden_size,
            total_value_depth=hparams.attention_value_channels
            or hparams.hidden_size,
            output_depth=hparams.hidden_size,
            num_heads=hparams.num_heads,
            dropout_rate=hparams.attention_dropout,
            attention_type=hparams.self_attention_type,
            max_relative_position=hparams.max_relative_position)
        x = common_layers.layer_postprocess(x, y, hparams)
    if encoder_output is not None:
        with tf.variable_scope("encdec_attention"):
            query_antecedent = common_layers.layer_preprocess(x, hparams)
            y = common_attention.multihead_attention(
                query_antecedent=query_antecedent,
                memory_antecedent=encoder_output,
                bias=encoder_decoder_attention_bias,
                total_key_depth=hparams.attention_key_channels
                or hparams.hidden_size,
                total_value_depth=hparams.attention_value_channels
                or hparams.hidden_size,
                output_depth=hparams.hidden_size,
                num_heads=hparams.num_heads,
                dropout_rate=hparams.attention_dropout)
            x = common_layers.layer_postprocess(x, y, hparams)
    with tf.variable_scope("ffn"):
        x0 = common_layers.layer_preprocess(x, hparams)
        y = transformer.transformer_ffn_layer(x0, hparams)
        x = common_layers.layer_postprocess(x, y, hparams)
    return x
Code example #6
File: reg_models.py  Project: entn-at/translate_0
def attn_over_sent(x, pad_remover, encoder_self_attention_bias, hparams):
    with tf.variable_scope("self_attention"):
        query_antecedent = common_layers.layer_preprocess(x, hparams)
        y = common_attention.multihead_attention(
            query_antecedent=query_antecedent,
            memory_antecedent=None,
            bias=encoder_self_attention_bias,
            total_key_depth=hparams.attention_key_channels
            or hparams.hidden_size,
            total_value_depth=hparams.attention_value_channels
            or hparams.hidden_size,
            output_depth=hparams.hidden_size,
            num_heads=hparams.num_heads,
            dropout_rate=hparams.attention_dropout,
            attention_type=hparams.self_attention_type,
            max_relative_position=hparams.max_relative_position)
        x = common_layers.layer_postprocess(x, y, hparams)
    with tf.variable_scope("ffn"):
        x0 = common_layers.layer_preprocess(x, hparams)
        y = transformer.transformer_ffn_layer(x0, hparams, pad_remover)
        x = common_layers.layer_postprocess(x, y, hparams)
    return x
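
All of these layer builders read the same small family of hyperparameters. The sketch below lists the fields touched by the examples on this page, using tf.contrib.training.HParams just as the code above does; the values are placeholders for illustration, not any project's actual defaults.

import tensorflow as tf

hparams = tf.contrib.training.HParams(
    hidden_size=512,
    filter_size=2048,
    num_heads=8,
    num_hidden_layers=6,
    num_encoder_layers=0,             # 0 means "fall back to num_hidden_layers"
    num_decoder_layers=0,
    attention_key_channels=0,         # 0 means "fall back to hidden_size"
    attention_value_channels=0,
    attention_dropout=0.1,
    relu_dropout=0.1,
    layer_prepostprocess_dropout=0.1,
    layer_preprocess_sequence="n",    # layer norm before each sublayer
    layer_postprocess_sequence="da",  # dropout + residual add after it
    norm_type="layer",
    norm_epsilon=1e-6,
    ffn_layer="dense_relu_dense",
    self_attention_type="dot_product",
    max_relative_position=0,
    use_pad_remover=True,
    pos="timing",
)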
Code example #7
def ffn(x, hparams, name):
    with tf.variable_scope(name):
        y = transformer.transformer_ffn_layer(
            common_layers.layer_preprocess(x, hparams), hparams)
        return common_layers.layer_postprocess(x, y, hparams)
def hierarchical_context_encoder(encoder_input,
                                 encoder_self_attention_bias,
                                 contexts,
                                 context_self_attention_biases,
                                 features,
                                 hparams,
                                 name="discourse_aware_encoder",
                                 save_weights_to=None,
                                 make_image_summary=True,
                                 losses=None):
    input_x = encoder_input
    context_xs = {}
    for context_name in contexts:
        context_xs[context_name] = contexts[context_name]
    context_paddings = {}
    context_nonpaddings = {}
    context_pad_removers = {}

    attention_dropout_broadcast_dims = (
        common_layers.comma_separated_string_to_integer_list(
            getattr(hparams, "attention_dropout_broadcast_dims", "")))

    with tf.variable_scope(name, reuse=tf.AUTO_REUSE):
        input_padding = common_attention.attention_bias_to_padding(
            encoder_self_attention_bias)
        input_nonpadding = 1.0 - input_padding
        for context_name in context_self_attention_biases:
            context_paddings[
                context_name] = common_attention.attention_bias_to_padding(
                    context_self_attention_biases[context_name])
            context_nonpaddings[
                context_name] = 1.0 - context_paddings[context_name]

        input_pad_remover = None
        for context_name in context_paddings:
            context_pad_removers[context_name] = None
        if hparams.use_pad_remover and not common_layers.is_xla_compiled():
            input_pad_remover = expert_utils.PadRemover(input_padding)
            for context_name in context_paddings:
                context_pad_removers[context_name] = expert_utils.PadRemover(
                    context_paddings[context_name])

        temp_hparam = tf.contrib.training.HParams(
        )  # copy hparams except num_hidden_layers -> num_hidden_layers - 1
        for key, val in hparams.values().items():
            temp_hparam.add_hparam(key, val)
        temp_hparam.set_hparam("num_hidden_layers",
                               hparams.num_hidden_layers - 1)
        encoder_output = transformer_with_contexts_layers.transformer_encoder(
            input_x,
            encoder_self_attention_bias,
            temp_hparam,
            nonpadding=features_to_nonpadding(features, "inputs"),
            save_weights_to=save_weights_to,
            make_image_summary=make_image_summary)

        context_encoded_outputs = {}
        for context_name in context_xs:
            context_encoded_outputs[
                context_name] = transformer_with_contexts_layers.transformer_encoder(
                    context_xs[context_name],
                    context_self_attention_biases[context_name],
                    temp_hparam,
                    nonpadding=features_to_nonpadding(features, context_name),
                    save_weights_to=save_weights_to,
                    make_image_summary=make_image_summary)

        with tf.variable_scope("hierarchical_context_encoder",
                               reuse=tf.AUTO_REUSE):
            for context_name in context_encoded_outputs:
                # self attention feed-forward
                _y = ffn_self_attention_layer(
                    context_encoded_outputs[context_name],
                    hparams.hidden_size,
                    hparams.hidden_size,
                    hparams.num_heads,
                    hparams.attention_dropout,
                    save_weights_to=save_weights_to,
                    name="attentive_sum")
                # mean over sequence length
                context_encoded_outputs[context_name] = tf.reduce_mean(
                    _y, axis=1, keep_dims=True)

            encoded_contexts = [
                context_encoded_outputs[context_name]
                for context_name in context_encoded_outputs
            ]
            encoded_contexts = tf.concat(encoded_contexts, axis=1)

            temp_hparam = tf.contrib.training.HParams(
            )  # copy hparams except num_hidden_layers -> 1
            for key, val in hparams.values().items():
                temp_hparam.add_hparam(key, val)
            temp_hparam.set_hparam("num_hidden_layers", 1)
            context_padding = common_attention.embedding_to_padding(
                encoded_contexts)
            ignore_padding = common_attention.attention_bias_ignore_padding(
                context_padding)

            encoded_contexts = transformer_encoder(encoded_contexts,
                                                   ignore_padding, temp_hparam)

        with tf.variable_scope("encoder/layer_%d" % hparams.num_hidden_layers,
                               reuse=tf.AUTO_REUSE):
            with tf.variable_scope("context_input_attention"):
                context_padding = common_attention.embedding_to_padding(
                    encoded_contexts)
                ignore_padding = common_attention.attention_bias_ignore_padding(
                    context_padding)
                _y = common_attention.multihead_attention(
                    common_layers.layer_preprocess(encoder_output, hparams),
                    encoded_contexts,
                    ignore_padding,
                    hparams.attention_key_channels or hparams.hidden_size,
                    hparams.attention_value_channels or hparams.hidden_size,
                    hparams.hidden_size,
                    hparams.num_heads,
                    hparams.attention_dropout,
                    attention_type=hparams.self_attention_type,
                    save_weights_to=save_weights_to,
                    make_image_summary=make_image_summary,
                    max_relative_position=hparams.max_relative_position,
                    dropout_broadcast_dims=attention_dropout_broadcast_dims,
                    max_length=hparams.get("max_length"),
                    vars_3d=hparams.get("attention_variables_3d"))
                encoded_contexts = common_layers.layer_postprocess(
                    encoder_output, _y, hparams)

            with tf.variable_scope("input_self_attention"):
                _y = common_attention.multihead_attention(
                    common_layers.layer_preprocess(encoder_output, hparams),
                    None,
                    encoder_self_attention_bias,
                    hparams.attention_key_channels or hparams.hidden_size,
                    hparams.attention_value_channels or hparams.hidden_size,
                    hparams.hidden_size,
                    hparams.num_heads,
                    hparams.attention_dropout,
                    attention_type=hparams.self_attention_type,
                    save_weights_to=save_weights_to,
                    max_relative_position=hparams.max_relative_position,
                    make_image_summary=make_image_summary,
                    dropout_broadcast_dims=attention_dropout_broadcast_dims,
                    max_length=hparams.get("max_length"),
                    vars_3d=hparams.get("attention_variables_3d"))
                encoder_output = common_layers.layer_postprocess(
                    encoder_output, _y, hparams)

            with tf.variable_scope("gated_sum"):
                _depth = common_layers.shape_list(encoder_output)[-1]
                gate = tf.layers.dense(tf.concat(
                    [encoded_contexts, encoder_output], axis=-1),
                                       _depth,
                                       activation=tf.nn.sigmoid)
                if save_weights_to:
                    save_weights_to["gated_sum"] = gate
                encoder_output = gate * encoder_output + (
                    1. - gate) * encoded_contexts

            with tf.variable_scope("ffn"):
                _y = transformer_ffn_layer(common_layers.layer_preprocess(
                    encoder_output, hparams),
                                           hparams,
                                           input_pad_remover,
                                           conv_padding="SAME",
                                           nonpadding_mask=input_nonpadding,
                                           losses=losses)
                encoder_output = common_layers.layer_postprocess(
                    encoder_output, _y, hparams)

    return common_layers.layer_preprocess(encoder_output, hparams)
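
The "gated_sum" scope above merges the context-conditioned encoding with the plain input encoding through a learned sigmoid gate. The same merge in isolation, as a short sketch (the dense-layer name here is made up):

import tensorflow as tf

def gated_sum(encoder_output, encoded_contexts):
    """Per-position, per-channel sigmoid gate choosing how much of the plain
    encoding versus the context-conditioned encoding to keep, mirroring the
    "gated_sum" block in hierarchical_context_encoder above."""
    depth = encoder_output.get_shape().as_list()[-1]
    gate = tf.layers.dense(
        tf.concat([encoded_contexts, encoder_output], axis=-1),
        depth,
        activation=tf.nn.sigmoid,
        name="gated_sum_sketch")  # hypothetical name
    return gate * encoder_output + (1.0 - gate) * encoded_contexts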
Code example #9
File: gibbs.py  Project: fstahlberg/tensor2tensor-usr
def transformer_bidirectional_joint_decoder(left_decoder_output,
                                            right_decoder_output,
                                            encoder_output,
                                            encoder_decoder_attention_bias,
                                            hparams,
                                            cache=None,
                                            decode_loop_step=None,
                                            name="decoder",
                                            nonpadding=None,
                                            save_weights_to=None,
                                            make_image_summary=True,
                                            losses=None):
    """A stack of transformer layers.

  Args:
    left_decoder_output: a Tensor
    right_decoder_output: a Tensor
    encoder_output: a Tensor
    encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    cache: dict, containing tensors which are the results of previous
        attentions, used for fast decoding.
    decode_loop_step: An integer, step number of the decoding loop.
        Only used for inference on TPU.
    name: a string
    nonpadding: optional Tensor with shape [batch_size, encoder_length]
      indicating what positions are not padding.  This is used
      to mask out padding in convolutional layers.  We generally only
      need this mask for "packed" datasets, because for ordinary datasets,
      no padding is ever followed by nonpadding.
    save_weights_to: an optional dictionary to capture attention weights
      for visualization; the weights tensor will be appended there under
      a string key created from the variable scope (including name).
    make_image_summary: Whether to make an attention image summary.
    losses: optional list onto which to append extra training losses

  Returns:
    y: a Tensor
  """
    x = left_decoder_output + right_decoder_output
    attention_dropout_broadcast_dims = (
        common_layers.comma_separated_string_to_integer_list(
            getattr(hparams, "attention_dropout_broadcast_dims", "")))
    with tf.variable_scope(name):
        for layer in range(hparams.num_bidirectional_decoder_joint_layers):
            layer_name = "joint_layer_%d" % layer
            layer_cache = cache[layer_name] if cache is not None else None
            with tf.variable_scope(layer_name):
                if encoder_output is not None:
                    with tf.variable_scope("encdec_attention"):
                        y = common_attention.multihead_attention(
                            common_layers.layer_preprocess(x, hparams),
                            encoder_output,
                            encoder_decoder_attention_bias,
                            hparams.attention_key_channels
                            or hparams.hidden_size,
                            hparams.attention_value_channels
                            or hparams.hidden_size,
                            hparams.hidden_size,
                            hparams.num_heads,
                            hparams.attention_dropout,
                            max_relative_position=hparams.
                            max_relative_position,
                            heads_share_relative_embedding=(
                                hparams.heads_share_relative_embedding),
                            add_relative_to_values=hparams.
                            add_relative_to_values,
                            save_weights_to=save_weights_to,
                            cache=layer_cache,
                            make_image_summary=make_image_summary,
                            dropout_broadcast_dims=
                            attention_dropout_broadcast_dims,
                            max_length=hparams.get("max_length"),
                            vars_3d=hparams.get("attention_variables_3d"))
                        x = common_layers.layer_postprocess(x, y, hparams)
                with tf.variable_scope("ffn"):
                    y = transformer_ffn_layer(
                        common_layers.layer_preprocess(x, hparams),
                        hparams,
                        conv_padding="LEFT",
                        nonpadding_mask=nonpadding,
                        losses=losses,
                        cache=layer_cache,
                        decode_loop_step=decode_loop_step)
                    x = common_layers.layer_postprocess(x, y, hparams)
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        return common_layers.layer_preprocess(x, hparams)
Code example #10
File: reg_models.py  Project: entn-at/translate_0
def attn_over_sent_and_lex_2d_dec(x, encoder_output,
                                  decoder_self_attention_bias, hparams):
    with tf.variable_scope("self_attention"):
        query_antecedent = common_layers.layer_preprocess(x, hparams)
        y = common_attention.multihead_attention(
            query_antecedent=query_antecedent,
            memory_antecedent=None,
            bias=decoder_self_attention_bias,
            total_key_depth=hparams.attention_key_channels
            or hparams.hidden_size,
            total_value_depth=hparams.attention_value_channels
            or hparams.hidden_size,
            output_depth=hparams.hidden_size,
            num_heads=hparams.num_heads,
            dropout_rate=hparams.attention_dropout,
            attention_type=hparams.self_attention_type,
            max_relative_position=hparams.max_relative_position)
        x = common_layers.layer_postprocess(x, y, hparams)
    if encoder_output is not None:
        with tf.variable_scope("encdec_attention"):
            query_antecedent = common_layers.layer_preprocess(x, hparams)

            batch_size = tf.shape(encoder_output)[0]
            src_len = tf.shape(encoder_output)[1]
            tgt_len = tf.shape(query_antecedent)[1]
            lex_cap = encoder_output.shape.as_list()[2]
            hid_size = encoder_output.shape.as_list()[3]

            query_antecedent = tf.expand_dims(query_antecedent, 2)
            query_antecedent = tf.pad(
                query_antecedent, [[0, 0], [0, 0], [0, lex_cap - 1], [0, 0]])
            query_pad = tf.zeros([batch_size, src_len, lex_cap, hid_size])
            query_antecedent = tf.concat([query_antecedent, query_pad], 1)

            memory_antecedent = encoder_output
            memory_pad = tf.zeros([batch_size, tgt_len, lex_cap, hid_size])
            memory_antecedent = tf.concat([memory_antecedent, memory_pad], 1)

            tf.logging.info(
                "dimension of decoder input at the enc-dec attention layer: {0}"
                .format(query_antecedent.get_shape()))
            tf.logging.info(
                "dimension of encoder output at the enc-dec attention layer: {0}"
                .format(memory_antecedent.get_shape()))

            y = common_attention.multihead_attention_2d(
                query_antecedent=query_antecedent,
                memory_antecedent=memory_antecedent,
                total_key_depth=hparams.attention_key_channels
                or hparams.hidden_size,
                total_value_depth=hparams.attention_value_channels
                or hparams.hidden_size,
                output_depth=hparams.hidden_size,
                num_heads=hparams.num_heads,
                attention_type="masked_local_attention_2d",
                query_shape=(4, 4),
                memory_flange=(4, 4))

            tf.logging.info("dimension of enc-dec output: {0}".format(
                y.get_shape()))
            y = y[:, :, 0, :]
            y = y[:, :tgt_len, :]

            x = common_layers.layer_postprocess(x, y, hparams)
    with tf.variable_scope("ffn"):
        x0 = common_layers.layer_preprocess(x, hparams)
        y = transformer.transformer_ffn_layer(x0, hparams)
        x = common_layers.layer_postprocess(x, y, hparams)
    return x
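
The expand/pad/concat steps before multihead_attention_2d exist only to give the query and memory tensors identical shapes. A small walk-through with assumed sizes (batch=2, src_len=7, tgt_len=5, lex_cap=4, hidden=8):

import tensorflow as tf

batch, src_len, tgt_len, lex_cap, hid = 2, 7, 5, 4, 8

query = tf.zeros([batch, tgt_len, hid])               # decoder states after preprocess
query = tf.expand_dims(query, 2)                      # -> [2, 5, 1, 8]
query = tf.pad(query, [[0, 0], [0, 0], [0, lex_cap - 1], [0, 0]])         # -> [2, 5, 4, 8]
query = tf.concat([query, tf.zeros([batch, src_len, lex_cap, hid])], 1)   # -> [2, 12, 4, 8]

memory = tf.zeros([batch, src_len, lex_cap, hid])     # lexicon-expanded encoder output
memory = tf.concat([memory, tf.zeros([batch, tgt_len, lex_cap, hid])], 1)  # -> [2, 12, 4, 8]

# Both tensors now have the same shape; after the 2D attention, the slices
# y[:, :, 0, :] and y[:, :tgt_len, :] strip the padding back off.
print(query.shape, memory.shape)  # (2, 12, 4, 8) (2, 12, 4, 8)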
Code example #11
File: reg_models.py  Project: entn-at/translate_0
    def encode_lex(self, encoder_input, target_space, hparams):
        '''
        encoder_input: [batch_size, input_len, lex_cap, hidden_dim]
        return:
            encoder_output: [batch_size, input_len, hidden_dim]
            encoder_decoder_attention_bias: [batch_size, input_len]
        '''
        encoder_output_slices = []
        for i in range(encoder_input.get_shape()[2].value):
            encoder_input_slice = encoder_input[:, :, i, :]

            # bias
            encoder_padding = common_attention.embedding_to_padding(
                encoder_input_slice)
            print(encoder_padding.shape.as_list()
                  )  # ==> [None, None] (None, None, 4)
            ignore_padding = common_attention.attention_bias_ignore_padding(
                encoder_padding)
            encoder_self_attention_bias = ignore_padding
            encoder_decoder_attention_bias = ignore_padding
            print(ignore_padding.shape.as_list()
                  )  # ==> [None, 1, 1, None] (None, 1, 1, None, 4)

            # add target space to encoder input?
            ishape_static = encoder_input_slice.shape.as_list()
            print(ishape_static)  # ==> [None, None, 300] (None, None, 4, 300)
            emb_target_space = common_layers.embedding(
                target_space,
                32,
                ishape_static[-1],
                name="target_space_embedding")
            print(emb_target_space.shape.as_list())  # ==> [300]
            emb_target_space = tf.reshape(emb_target_space, [1, 1, -1])
            print(emb_target_space.shape.as_list())  # ==> [1, 1, 300]
            encoder_input_slice += emb_target_space
            print(encoder_input_slice.shape.as_list()
                  )  # ==> [None, None, 300] (None, None, 4, 300)

            # add timing signals to encoder input
            if hparams.pos == "timing":
                encoder_input_slice = common_attention.add_timing_signal_1d(
                    encoder_input_slice)

            # dropout
            encoder_input_slice = tf.nn.dropout(
                encoder_input_slice,
                1.0 - hparams.layer_prepostprocess_dropout)

            # encoder
            '''
            multihead_attention(
            query_antecedent: [batch, length_q, channels], -- x, x
            memory_antecedent: [batch, length_m, channels], -- None, encoder_output
            bias: bias tensor, -- encoder_self_attention_bias
            total_key_depth: int, -- hparams.attention_key_channels or hparams.hidden_size
            total_value_depth: int, -- hparams.attention_value_channels or hparams.hidden_size
            output_depth: integer, -- hparams.hidden_size
            num_heads: integer dividing total_key_depth and total_value_depth, -- hparams.num_heads (8)
            dropout_rate: float, -- hparams.attention_dropout
            ...
            cache=None: dict, containing tensors which are the results of previous attentions used for fast decoding, {'k': [batch_size, 0, key_channels], 'v': [batch_size, 0, value_channels]}; used in decoder self-attention
            '''
            x = encoder_input_slice
            with tf.variable_scope("encoder" + str(i)):
                # remove pad
                pad_remover = None
                if hparams.use_pad_remover:
                    pad_remover = expert_utils.PadRemover(
                        common_attention.attention_bias_to_padding(
                            encoder_self_attention_bias))

                # self-attention along the sentence dimension
                for layer in xrange(hparams.num_encoder_layers
                                    or hparams.num_hidden_layers):
                    with tf.variable_scope("layer_%d" % layer):
                        with tf.variable_scope("self_attention"):
                            query_antecedent = common_layers.layer_preprocess(
                                x, hparams)
                            y = common_attention.multihead_attention(
                                query_antecedent=query_antecedent,
                                memory_antecedent=None,
                                bias=encoder_self_attention_bias,
                                total_key_depth=hparams.attention_key_channels
                                or hparams.hidden_size,
                                total_value_depth=hparams.
                                attention_value_channels
                                or hparams.hidden_size,
                                output_depth=hparams.hidden_size,
                                num_heads=hparams.num_heads,
                                dropout_rate=hparams.attention_dropout,
                                attention_type=hparams.self_attention_type,
                                max_relative_position=hparams.
                                max_relative_position)
                            x = common_layers.layer_postprocess(x, y, hparams)
                        with tf.variable_scope("ffn"):
                            y = transformer.transformer_ffn_layer(
                                common_layers.layer_preprocess(x, hparams),
                                hparams, pad_remover)
                            x = common_layers.layer_postprocess(x, y, hparams)
                encoder_output_slice = common_layers.layer_preprocess(
                    x, hparams)
                print(encoder_output_slice.shape.as_list()
                      )  # ==> [None, None, 300] (None, None, 4, 300)

            encoder_output_slices.append(encoder_output_slice)
        encoder_output = tf.stack(encoder_output_slices, 2)
        print(encoder_output.shape.as_list())  # ==> [None, None, 4, 300]

        # --------

        encoder_output_slices = []
        #hparams2 = copy.deepcopy(hparams)
        #hparams2.hidden_size = hparams.lex_cap
        num_heads = int(hparams.lex_cap / 2)
        hparams2 = tf.contrib.training.HParams(
            layer_preprocess_sequence=hparams.layer_preprocess_sequence,
            layer_postprocess_sequence=hparams.layer_postprocess_sequence,
            layer_prepostprocess_dropout=hparams.layer_prepostprocess_dropout,
            norm_type=hparams.norm_type,
            hidden_size=hparams.lex_cap,
            norm_epsilon=hparams.norm_epsilon,
            ffn_layer=hparams.ffn_layer,
            filter_size=hparams.filter_size,
            relu_dropout=hparams.relu_dropout,
            num_heads=num_heads,
            attention_dropout=hparams.attention_dropout,
            parameter_attention_key_channels=hparams.
            parameter_attention_key_channels,
            parameter_attention_value_channels=hparams.
            parameter_attention_value_channels)

        for i in range(encoder_output.get_shape()[3].value):
            encoder_input_slice = encoder_output[:, :, :, i]
            #print(encoder_input_slice.shape.as_list()) # ==> [None, None, 4]

            encoder_padding = common_attention.embedding_to_padding(
                encoder_input_slice)
            ignore_padding = common_attention.attention_bias_ignore_padding(
                encoder_padding)
            encoder_self_attention_bias = ignore_padding
            #print(encoder_self_attention_bias.shape.as_list()) # ==> [None, 1, 1, None]

            # encoder
            '''
            multihead_attention(
            query_antecedent: [batch, length_q, channels], -- x, x
            memory_antecedent: [batch, length_m, channels], -- None, encoder_output
            bias: bias tensor, -- encoder_self_attention_bias
            total_key_depth: int, -- hparams.attention_key_channels or hparams.hidden_size
            total_value_depth: int, -- hparams.attention_value_channels or hparams.hidden_size
            output_depth: integer, -- hparams.hidden_size
            num_heads: integer dividing total_key_depth and total_value_depth, -- hparams.num_heads (8)
            dropout_rate: float, -- hparams.attention_dropout
            ...
            cache=None: dict, containing tensors which are the results of previous attentions used for fast decoding, {'k': [batch_size, 0, key_channels], 'v': [batch_size, 0, value_channels]}; used in decoder self-attention
            '''
            x = encoder_input_slice
            with tf.variable_scope("encoder_extra" + str(i)):
                # remove pad
                pad_remover = None
                if hparams.use_pad_remover:
                    pad_remover = expert_utils.PadRemover(
                        common_attention.attention_bias_to_padding(
                            encoder_self_attention_bias))

                # self-attention along the lexicon dimension
                with tf.variable_scope("layer_extra"):
                    with tf.variable_scope("self_attention"):
                        #query_antecedent = layer_preprocess2(x, hparams, hparams.lex_cap)
                        query_antecedent = common_layers.layer_preprocess(
                            x, hparams2)

                        y = common_attention.multihead_attention(
                            query_antecedent=query_antecedent,
                            memory_antecedent=None,
                            bias=encoder_self_attention_bias,
                            total_key_depth=hparams.attention_key_channels
                            or hparams.lex_cap,
                            total_value_depth=hparams.attention_value_channels
                            or hparams.lex_cap,
                            output_depth=hparams.lex_cap,
                            num_heads=num_heads,
                            dropout_rate=hparams.attention_dropout,
                            attention_type=hparams.self_attention_type,
                            max_relative_position=hparams.max_relative_position
                        )
                        #x = layer_postprocess2(x, y, hparams, hparams.lex_cap)
                        x = common_layers.layer_postprocess(x, y, hparams2)
                    with tf.variable_scope("ffn"):
                        y = transformer.transformer_ffn_layer(
                            common_layers.layer_preprocess(x, hparams2),
                            hparams2, pad_remover)
                        #x = layer_postprocess2(x, y, hparams, hparams.lex_cap)
                        x = common_layers.layer_postprocess(x, y, hparams2)
                #encoder_output_slice = layer_preprocess2(x, hparams, hparams.lex_cap)
                encoder_output_slice = common_layers.layer_preprocess(
                    x, hparams2)
                #print(encoder_output_slice.shape.as_list()) # ==> [None, None, 4] (None, None, 4, 300)

            encoder_output_slices.append(encoder_output_slice)
        encoder_output = tf.stack(encoder_output_slices, 3)
        print(encoder_output.shape.as_list())  # ==> [None, None, 4, 300]

        # --------

        lex_cap = encoder_output.get_shape()[2].value
        embed_len = encoder_output.get_shape()[3].value
        assert (lex_cap == hparams.lex_cap)
        aggregate_layer = tf.get_variable(
            name="Aggregate",
            shape=[embed_len, embed_len, lex_cap],
            initializer=tf.random_normal_initializer(mean=0.0, stddev=0.1))
        encoder_output = tf.tensordot(encoder_output,
                                      aggregate_layer,
                                      axes=[[2, 3], [1, 2]])
        print(encoder_output.shape.as_list())  # ==> [None, None, 300]

        return encoder_output, encoder_decoder_attention_bias
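
The closing tensordot contracts the lexicon axis and the per-slot embedding axis of the stacked encoder output against the learned [embed, embed, lex_cap] Aggregate tensor, leaving a single embedding per position. The intended contraction written as an einsum over assumed sizes (the axis pairing shown is an interpretation of the tensordot call, not a verbatim copy of it):

import numpy as np

# Assumed sizes matching the prints above: batch=2, input_len=7, lex_cap=4, embed=300.
encoder_output = np.random.randn(2, 7, 4, 300)
aggregate_layer = np.random.randn(300, 300, 4)  # [embed_out, embed_in, lex_cap]

# Sum over the lexicon slot l and the per-slot embedding axis e, producing one
# aggregated embedding axis f per position.
out = np.einsum("btle,fel->btf", encoder_output, aggregate_layer)
print(out.shape)  # (2, 7, 300), matching the final print in encode_lex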
Code example #12
def transformer_decoder_fast_aan(decoder_input,
                                 encoder_output,
                                 decoder_position_forward_mask,
                                 encoder_decoder_attention_bias,
                                 hparams,
                                 cache=None,
                                 name="decoder"):
    """A stack of transformer layers.
  Args:
    decoder_input: a Tensor
    encoder_output: a Tensor
    decoder_position_forward_mask: mask Tensor for position-forward / shape: [1, t, 1]
    encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    cache: dict, containing tensors which are the results of previous
        attentions, used for fast decoding.
    name: a string
  Returns:
    y: a Tensor
  """
    x = decoder_input
    with tf.variable_scope(name):
        for layer in range(hparams.num_decoder_layers
                           or hparams.num_hidden_layers):
            layer_name = "layer_%d" % layer
            layer_cache = cache[layer_name] if cache is not None else None
            with tf.variable_scope(layer_name):
                with tf.variable_scope("position_forward"):
                    if layer_cache:
                        given_inputs_new = layer_cache['given_inputs'] + x
                        x_fwd = given_inputs_new * decoder_position_forward_mask
                        layer_cache['given_inputs'] = given_inputs_new + x
                    else:
                        x_fwd = tf.cumsum(
                            x, axis=1) * decoder_position_forward_mask
                    # FFN activation
                    y = transformer.transformer_ffn_layer(
                        common_layers.layer_preprocess(x_fwd, hparams),
                        hparams)

                    # Gating layer
                    z = tf.layers.dense(tf.concat([x, y], axis=-1),
                                        hparams.hidden_size * 2,
                                        name="z_project")
                    i, f = tf.split(z, 2, axis=-1)
                    y = tf.sigmoid(i) * x + tf.sigmoid(f) * y
                    x = common_layers.layer_postprocess(x, y, hparams)

                if encoder_output is not None:
                    with tf.variable_scope("encdec_attention"):
                        y = multihead_attention(
                            common_layers.layer_preprocess(x, hparams),
                            encoder_output,
                            encoder_decoder_attention_bias,
                            hparams.attention_key_channels
                            or hparams.hidden_size,
                            hparams.attention_value_channels
                            or hparams.hidden_size,
                            hparams.hidden_size,
                            hparams.num_heads,
                            hparams.attention_dropout,
                            cache=layer_cache)
                        x = common_layers.layer_postprocess(x, y, hparams)
                with tf.variable_scope("ffn"):
                    y = transformer.transformer_ffn_layer(
                        common_layers.layer_preprocess(x, hparams), hparams)
                    x = common_layers.layer_postprocess(x, y, hparams)
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        return common_layers.layer_preprocess(x, hparams)
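
The "position_forward" block is the average-attention trick: assuming decoder_position_forward_mask holds 1/position values (consistent with the cumulative sum above), every position receives the running mean of all decoder inputs up to and including itself, and incremental decoding only needs to carry the running sum in the cache. A NumPy check of that equivalence:

import numpy as np

t, hid = 5, 4
x = np.random.randn(1, t, hid)
# Assumed form of decoder_position_forward_mask: 1/position, shape [1, t, 1].
mask = (1.0 / np.arange(1, t + 1)).reshape(1, t, 1)

x_fwd = np.cumsum(x, axis=1) * mask  # what the non-cached branch computes
running_mean = np.stack([x[0, :j + 1].mean(axis=0) for j in range(t)])[None]

print(np.allclose(x_fwd, running_mean))  # True: each position sees the mean so far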
Code example #13
def transformer_decoder_fast(decoder_input,
                             encoder_output,
                             decoder_self_attention_bias,
                             encoder_decoder_attention_bias,
                             hparams,
                             cache=None,
                             name="decoder"):
    """A stack of transformer layers.
  Args:
    decoder_input: a Tensor
    encoder_output: a Tensor
    decoder_self_attention_bias: bias Tensor for self-attention
      (see common_attention.attention_bias())
    encoder_decoder_attention_bias: bias Tensor for encoder-decoder attention
      (see common_attention.attention_bias())
    hparams: hyperparameters for model
    cache: dict, containing tensors which are the results of previous
        attentions, used for fast decoding.
    name: a string
  Returns:
    y: a Tensor
  """
    x = decoder_input
    with tf.variable_scope(name):
        for layer in range(hparams.num_decoder_layers
                           or hparams.num_hidden_layers):
            layer_name = "layer_%d" % layer
            layer_cache = cache[layer_name] if cache is not None else None
            with tf.variable_scope(layer_name):
                with tf.variable_scope("self_attention"):
                    y = common_attention.multihead_attention(
                        common_layers.layer_preprocess(x, hparams),
                        None,
                        decoder_self_attention_bias,
                        hparams.attention_key_channels or hparams.hidden_size,
                        hparams.attention_value_channels
                        or hparams.hidden_size,
                        hparams.hidden_size,
                        hparams.num_heads,
                        hparams.attention_dropout,
                        attention_type=hparams.self_attention_type,
                        max_relative_position=hparams.max_relative_position,
                        cache=layer_cache)
                    x = common_layers.layer_postprocess(x, y, hparams)
                if encoder_output is not None:
                    with tf.variable_scope("encdec_attention"):
                        y = multihead_attention(
                            common_layers.layer_preprocess(x, hparams),
                            encoder_output,
                            encoder_decoder_attention_bias,
                            hparams.attention_key_channels
                            or hparams.hidden_size,
                            hparams.attention_value_channels
                            or hparams.hidden_size,
                            hparams.hidden_size,
                            hparams.num_heads,
                            hparams.attention_dropout,
                            cache=layer_cache)
                        x = common_layers.layer_postprocess(x, y, hparams)
                with tf.variable_scope("ffn"):
                    y = transformer.transformer_ffn_layer(
                        common_layers.layer_preprocess(x, hparams), hparams)
                    x = common_layers.layer_postprocess(x, y, hparams)
        # if normalization is done in layer_preprocess, then it should also be done
        # on the output, since the output can grow very large, being the sum of
        # a whole stack of unnormalized layer outputs.
        return common_layers.layer_preprocess(x, hparams)
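
For fast decoding, the cache passed in is a dict with one sub-dict per layer scope; the docstring in code example #11 describes the per-layer entries as 'k'/'v' tensors that start with length 0 and grow step by step. A hedged sketch of building such a cache for this decoder (init_decoder_cache is a hypothetical helper, and the exact keys consumed inside common_attention.multihead_attention may differ):

import tensorflow as tf

def init_decoder_cache(batch_size, hparams):
    """Illustrative cache initializer for transformer_decoder_fast: one entry
    per layer scope, with empty key/value tensors for the self-attention to
    append to at each decoding step."""
    key_channels = hparams.attention_key_channels or hparams.hidden_size
    value_channels = hparams.attention_value_channels or hparams.hidden_size
    num_layers = hparams.num_decoder_layers or hparams.num_hidden_layers
    return {
        "layer_%d" % layer: {
            "k": tf.zeros([batch_size, 0, key_channels]),
            "v": tf.zeros([batch_size, 0, value_channels]),
        }
        for layer in range(num_layers)
    }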