Example #1
    def encoder_impl(self, encoder_input, is_training):

        attention_dropout_rate = self._config.attention_dropout_rate if is_training else 0.0
        residual_dropout_rate = self._config.residual_dropout_rate if is_training else 0.0

        # Mask
        encoder_padding = tf.equal(
            tf.reduce_sum(tf.abs(encoder_input), axis=-1), 0.0)
        encoder_output = dense(encoder_input,
                               self._config.hidden_units,
                               activation=tf.identity,
                               use_bias=True,
                               name="src_change")
        encoder_output = layers.layer_norm(encoder_output)

        # Add positional signal
        encoder_output = layers_with_attention.add_timing_signal_1d(
            encoder_output)
        # Dropout
        encoder_output = tf.layers.dropout(encoder_output,
                                           rate=residual_dropout_rate,
                                           training=is_training)

        # Blocks
        for i in range(self._config.encoder_num_blocks):
            with tf.variable_scope("block_{}".format(i)):
                # Multihead Attention
                encoder_output = residual(
                    encoder_output,
                    multihead_attention(
                        query_antecedent=encoder_output,
                        memory_antecedent=None,
                        bias=layers_with_attention.attention_bias_ignore_padding(
                            encoder_padding),
                        total_key_depth=self._config.hidden_units,
                        total_value_depth=self._config.hidden_units,
                        output_depth=self._config.hidden_units,
                        num_heads=self._config.num_heads,
                        dropout_rate=attention_dropout_rate,
                        name='encoder_self_attention',
                        summaries=True),
                    dropout_rate=residual_dropout_rate)

                # Feed Forward
                encoder_output = residual(
                    encoder_output,
                    ff_hidden(inputs=encoder_output,
                              hidden_size=4 * self._config.hidden_units,
                              output_size=self._config.hidden_units,
                              activation=self._ff_activation),
                    dropout_rate=residual_dropout_rate)
        # Mask padding part to zeros.
        encoder_output *= tf.expand_dims(1.0 - tf.to_float(encoder_padding),
                                         axis=-1)
        return encoder_output
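The encoder above leans on a residual(x, sublayer_output, dropout_rate=...) helper that is not shown on this page. Purely for orientation, a minimal sketch of a post-norm residual wrapper with that signature might look as follows; the dropout-add-layer-norm ordering and the use of tf.contrib.layers.layer_norm are assumptions, not taken from the source.

import tensorflow as tf  # TensorFlow 1.x, matching the examples on this page

def residual(inputs, sublayer_out, dropout_rate=0.0):
    # Hypothetical reconstruction: dropout on the sub-layer output, add the
    # shortcut connection, then layer-normalize (post-norm Transformer style).
    # The callers above already pass dropout_rate=0.0 at inference time, so a
    # fixed training=True is harmless in this sketch.
    sublayer_out = tf.layers.dropout(sublayer_out, rate=dropout_rate, training=True)
    return tf.contrib.layers.layer_norm(inputs + sublayer_out)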
Example #2
  def decoder_with_caching_impl(self, decoder_input, decoder_cache, encoder_output, is_training, cache_qkv=None):
    # decoder_input: [batch_size * beam_size, step]; step grows by one each decoding iteration, i.e. 1, 2, 3, ...
    # decoder_cache: [batch_size * beam_size, 0, num_blocks, hidden_units]
    # encoder_output: [batch_size * beam_size, time_step, hidden_units]
    batch_size = tf.shape(encoder_output)[0] // self._config.test.beam_size
    attention_dropout_rate = self._config.attention_dropout_rate if is_training else 0.0
    residual_dropout_rate = self._config.residual_dropout_rate if is_training else 0.0
    encoder_padding = tf.equal(tf.reduce_sum(tf.abs(encoder_output), axis=-1), 0.0)
    encoder_attention_bias = layers_with_attention.attention_bias_ignore_padding(encoder_padding)
    decoder_self_attention_bias = layers_with_attention.attention_bias_lower_triangle(tf.shape(decoder_input)[1])
    decoder_output = embedding(decoder_input,
                   vocab_size=self._config.vocab_size,
                   dense_size=self._config.hidden_units,
                   multiplier=self._config.hidden_units ** 0.5 if self._config.scale_embedding else 1.0,
                   name="dst_embedding")
    # Positional Encoding
    # decoder_output += layers_with_attention.add_timing_signal_1d(decoder_output)
    decoder_output = layers_with_attention.add_timing_signal_1d(decoder_output)
    # Dropout
    decoder_output = tf.layers.dropout(decoder_output,
                       rate=residual_dropout_rate,
                       training=is_training)

    new_cache = []
    # Blocks
    for i in range(self._config.decoder_num_blocks):
      with tf.variable_scope("block_{}".format(i)):
        layer_name = "layer_%d" % i
        layer_cache = cache_qkv[layer_name] if cache_qkv is not None else None
        # Multihead Attention (self-attention): only the newest position is
        # computed here; earlier positions are served from the cache.
        decoder_output = residual(decoder_output[:, -1:, :],
                      sb_multihead_attention_for_decoding(
                        query_antecedent=decoder_output[:, -1:, :],
                        memory_antecedent=None,
                        cache=layer_cache,
                        bias=decoder_self_attention_bias,
                        total_key_depth=self._config.hidden_units,
                        total_value_depth=self._config.hidden_units,
                        num_heads=self._config.num_heads,
                        dropout_rate=attention_dropout_rate,
                        batch_size=batch_size,
                        beam_size=self._config.test.beam_size,
                        output_depth=self._config.hidden_units,
                        name="decoder_self_attention",
                        summaries=True),
                      dropout_rate=residual_dropout_rate)
        # Multihead Attention (vanilla attention)
        multihead_out = sb_multihead_attention_for_decoding(
                        query_antecedent=decoder_output,
                        memory_antecedent=encoder_output,
                        bias=encoder_attention_bias,
                        total_key_depth=self._config.hidden_units,
                        total_value_depth=self._config.hidden_units,
                        output_depth=self._config.hidden_units,
                        num_heads=self._config.num_heads,
                        dropout_rate=attention_dropout_rate,
                        name="decoder_vanilla_attention",
                        summaries=True)
        decoder_output = residual(decoder_output, multihead_out,
                      dropout_rate=residual_dropout_rate)

        # Feed Forward
        decoder_output = residual(decoder_output,
                      ff_hidden(
                        decoder_output,
                        hidden_size=self._config.ff_units * self._config.hidden_units,
                        output_size=self._config.hidden_units,
                        activation=self._ff_activation),
                      dropout_rate=residual_dropout_rate)
        decoder_output = tf.concat([decoder_cache[:, :, i, :], decoder_output], axis=1)
        new_cache.append(decoder_output[:, :, None, :])
    new_cache = tf.concat(new_cache, axis=2)  # [batch_size * beam_size, step, num_blocks, hidden_units]

    return decoder_output, new_cache, cache_qkv
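For orientation only, here is a rough greedy-decoding sketch of how a caching decoder like the one above is typically driven step by step; a real beam search would also score and reorder the beam_size hypotheses per sentence. Every name introduced below (model, num_blocks, hidden_units, sos_id, max_decode_steps, output_projection) is a placeholder, not taken from the source.

import tensorflow as tf

# Assumed to be given: `model` (an instance of the class above) and
# `encoder_output` already tiled to [batch_size * beam_size, time_step, hidden_units].
batch_times_beam = tf.shape(encoder_output)[0]
decoder_cache = tf.zeros(tf.stack([batch_times_beam, 0, num_blocks, hidden_units]))
tokens = tf.fill(tf.stack([batch_times_beam, 1]), sos_id)   # one start symbol per hypothesis
for _ in range(max_decode_steps):
    # Reuse the decoder variables across steps when unrolling in graph mode.
    with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        dec_out, decoder_cache, _ = model.decoder_with_caching_impl(
            tokens, decoder_cache, encoder_output, is_training=False)
    logits = output_projection(dec_out[:, -1, :])            # score only the newest position
    next_ids = tf.argmax(logits, axis=-1, output_type=tf.int32)
    tokens = tf.concat([tokens, next_ids[:, None]], axis=1)  # greedy extension of each hypothesis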
Example #3
  def decoder_impl(self, decoder_input, encoder_output, is_training, cache=None):
    # decoder_input: [2, batch_size, step]
    # encoder_output: [batch_size, time_step, hidden_units]
    attention_dropout_rate = self._config.attention_dropout_rate if is_training else 0.0
    residual_dropout_rate = self._config.residual_dropout_rate if is_training else 0.0

    encoder_padding = tf.equal(tf.reduce_sum(tf.abs(encoder_output), axis=-1), 0.0)
    encoder_attention_bias = layers_with_attention.attention_bias_ignore_padding(encoder_padding)

    decoder_output = embedding(decoder_input,
                   vocab_size=self._config.vocab_size,
                   dense_size=self._config.hidden_units,
                   multiplier=self._config.hidden_units ** 0.5 if self._config.scale_embedding else 1.0,
                   name="dst_embedding")
    # Positional Encoding
    decoder_output = tf.concat(
      [tf.expand_dims(layers_with_attention.add_timing_signal_1d(decoder_output[0]), 0),
      tf.expand_dims(layers_with_attention.add_timing_signal_1d(decoder_output[1]), 0)], 0)
    # Dropout
    decoder_output = tf.layers.dropout(decoder_output,
                       rate=residual_dropout_rate,
                       training=is_training)
    # Bias that prevents each position from attending to later positions (future masking for the bidirectional decoder)
    self_attention_bias = layers_with_attention.attention_bias_lower_triangle(tf.shape(decoder_input)[2])

    # Blocks
    for i in range(self._config.decoder_num_blocks):
      with tf.variable_scope("block_{}".format(i)):
        layer_name = "layer_%d" % i
        layer_cache = cache[layer_name] if cache is not None else None
        # Multihead Attention (self-attention)
        decoder_output = residual(decoder_output,
                      sb_multihead_attention(
                        query_antecedent=decoder_output,
                        memory_antecedent=None,
                        cache=layer_cache,
                        bias=self_attention_bias,
                        total_key_depth=self._config.hidden_units,
                        total_value_depth=self._config.hidden_units,
                        num_heads=self._config.num_heads,
                        dropout_rate=attention_dropout_rate,
                        output_depth=self._config.hidden_units,
                        name="decoder_self_attention",
                        summaries=True),
                      dropout_rate=residual_dropout_rate)

        # Multihead Attention (vanilla attention)
        decoder_output = residual(decoder_output,
                      sb_multihead_attention(
                        query_antecedent=decoder_output,
                        memory_antecedent=encoder_output,
                        bias=encoder_attention_bias,
                        total_key_depth=self._config.hidden_units,
                        total_value_depth=self._config.hidden_units,
                        output_depth=self._config.hidden_units,
                        num_heads=self._config.num_heads,
                        dropout_rate=attention_dropout_rate,
                        name="decoder_vanilla_attention",
                        summaries=True),
                      dropout_rate=residual_dropout_rate)

        # Feed Forward
        decoder_output = residual(decoder_output,
                      ff_hidden(
                        decoder_output,
                        hidden_size=self._config.ff_units * self._config.hidden_units,
                        output_size=self._config.hidden_units,
                        activation=self._ff_activation),
                      dropout_rate=residual_dropout_rate)
    return decoder_output
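Both decoder variants mask future positions with attention_bias_lower_triangle(length). Below is a minimal sketch of what such a bias commonly computes; the exact constant, dtype and shape used in the source may differ.

import tensorflow as tf

def attention_bias_lower_triangle_sketch(length):
    # Lower-triangular matrix of ones: position i may attend to positions <= i.
    band = tf.matrix_band_part(tf.ones(tf.stack([length, length])), -1, 0)
    # A large negative bias above the diagonal acts like -inf once added to the
    # attention logits before the softmax.
    bias = -1e9 * (1.0 - band)
    # Shape [1, 1, length, length] so it broadcasts over batch and heads.
    return tf.reshape(bias, tf.stack([1, 1, length, length]))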
Example #4
def darknet53(inputs, trainable, data_format):

    with tf.variable_scope('darknet'):
        inputs = convolutional(inputs=inputs,
                               filters=32,
                               kernel_size=3,
                               trainable=trainable,
                               name='conv0',
                               data_format=data_format)

        inputs = convolutional(inputs=inputs,
                               filters=64,
                               kernel_size=3,
                               trainable=trainable,
                               name='conv1',
                               strides=2,
                               data_format=data_format)

        for i in range(1):
            inputs = residual(inputs=inputs,
                              filters=32,
                              trainable=trainable,
                              data_format=data_format,
                              name='residual%d' % (i + 0))

        inputs = convolutional(inputs=inputs,
                               filters=128,
                               kernel_size=3,
                               trainable=trainable,
                               name='conv4',
                               strides=2,
                               data_format=data_format)

        for i in range(2):
            inputs = residual(inputs=inputs,
                              filters=64,
                              trainable=trainable,
                              data_format=data_format,
                              name='residual%d' % (i + 1))

        inputs = convolutional(inputs=inputs,
                               filters=256,
                               kernel_size=3,
                               trainable=trainable,
                               name='conv9',
                               strides=2,
                               data_format=data_format)

        for i in range(8):
            inputs = residual(inputs=inputs,
                              filters=128,
                              trainable=trainable,
                              data_format=data_format,
                              name='residual%d' % (i + 3))

        route1 = inputs

        inputs = convolutional(inputs=inputs,
                               filters=512,
                               kernel_size=3,
                               trainable=trainable,
                               name='conv26',
                               strides=2,
                               data_format=data_format)

        for i in range(8):
            inputs = residual(inputs=inputs,
                              filters=256,
                              trainable=trainable,
                              data_format=data_format,
                              name='residual%d' % (i + 11))

        route2 = inputs

        inputs = convolutional(inputs=inputs,
                               filters=1024,
                               kernel_size=3,
                               trainable=trainable,
                               name='conv43',
                               strides=2,
                               data_format=data_format)

        for i in range(4):
            inputs = residual(inputs=inputs,
                              filters=512,
                              trainable=trainable,
                              data_format=data_format,
                              name='residual%d' % (i + 19))

        return route1, route2, inputs
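A hedged usage sketch follows; the placeholder name, the 416x416 input resolution and the 'channels_last' data_format value are assumptions rather than facts from the source.

import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 416, 416, 3], name='images')
route1, route2, route3 = darknet53(images, trainable=True, data_format='channels_last')
# With a 416x416 input and five stride-2 convolutions, the feature maps come out
# at roughly 52x52 (route1), 26x26 (route2) and 13x13 (route3), i.e. strides
# 8, 16 and 32, the usual multi-scale inputs of a YOLOv3 detection head.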