def encoder_impl(self, encoder_input, is_training):
    """Build the Transformer encoder graph.

    Args:
        encoder_input: float tensor; assumed [batch, time, embed] with
            padding positions encoded as all-zero vectors — TODO confirm.
        is_training: Python bool; dropout is active only when True.

    Returns:
        Encoder states of width `hidden_units`, with padded positions
        zeroed out.
    """
    # Disable both dropouts entirely at inference time.
    attention_dropout_rate = self._config.attention_dropout_rate if is_training else 0.0
    residual_dropout_rate = self._config.residual_dropout_rate if is_training else 0.0
    # Mask: a position counts as padding iff its feature vector sums to
    # zero in absolute value (i.e. is all zeros).
    encoder_padding = tf.equal(
        tf.reduce_sum(tf.abs(encoder_input), axis=-1), 0.0)
    # Project the raw input to the model width before the blocks.
    encoder_output = dense(encoder_input,
                           self._config.hidden_units,
                           activation=tf.identity,
                           use_bias=True,
                           name="src_change")
    encoder_output = layers.layer_norm(encoder_output)
    # Add positional signal
    encoder_output = layers_with_attention.add_timing_signal_1d(
        encoder_output)
    # Dropout
    encoder_output = tf.layers.dropout(encoder_output,
                                       rate=residual_dropout_rate,
                                       training=is_training)
    # Blocks
    for i in range(self._config.encoder_num_blocks):
        with tf.variable_scope("block_{}".format(i)):
            # Multihead Attention (self-attention over the source, with
            # padded positions masked out via the attention bias).
            encoder_output = residual(
                encoder_output,
                multihead_attention(
                    query_antecedent=encoder_output,
                    memory_antecedent=None,
                    bias=layers_with_attention.
                    attention_bias_ignore_padding(encoder_padding),
                    total_key_depth=self._config.hidden_units,
                    total_value_depth=self._config.hidden_units,
                    output_depth=self._config.hidden_units,
                    num_heads=self._config.num_heads,
                    dropout_rate=attention_dropout_rate,
                    name='encoder_self_attention',
                    summaries=True),
                dropout_rate=residual_dropout_rate)
            # Feed Forward
            # NOTE(review): the encoder hard-codes a 4x FF expansion while
            # both decoder builders use config.ff_units — confirm this
            # asymmetry is intentional (changing it would break existing
            # checkpoints).
            encoder_output = residual(
                encoder_output,
                ff_hidden(inputs=encoder_output,
                          hidden_size=4 * self._config.hidden_units,
                          output_size=self._config.hidden_units,
                          activation=self._ff_activation),
                dropout_rate=residual_dropout_rate)
    # Mask padding part to zeros.
    encoder_output *= tf.expand_dims(1.0 - tf.to_float(encoder_padding),
                                     axis=-1)
    return encoder_output
def decoder_with_caching_impl(self, decoder_input, decoder_cache, encoder_output, is_training, cache_qkv=None):
    """Build one incremental (beam-search) decoding step with caching.

    Args:
        decoder_input: [batch_size * beam_size, step] token ids; `step`
            grows by one on every decode iteration (1, 2, 3, ...).
        decoder_cache: [batch_size * beam_size, step-1, num_blocks,
            hidden_units] — per-block decoder outputs of earlier steps.
        encoder_output: [batch_size * beam_size, time_step, hidden_units].
        is_training: Python bool; dropout is active only when True.
        cache_qkv: optional dict keyed by "layer_%d" holding per-layer
            attention caches, or None.

    Returns:
        Tuple (decoder_output, new_cache, cache_qkv); new_cache is
        [batch_size * beam_size, step, num_blocks, hidden_units].
    """
    # BUG FIX: `/` on an int32 shape tensor is true division under
    # Python 3 and yields a float tensor, but the attention op needs an
    # integer batch size.  Floor division `//` is correct and matches the
    # old Python 2 integer-division behavior.
    batch_size = tf.shape(encoder_output)[0] // self._config.test.beam_size
    # Disable both dropouts entirely at inference time.
    attention_dropout_rate = self._config.attention_dropout_rate if is_training else 0.0
    residual_dropout_rate = self._config.residual_dropout_rate if is_training else 0.0
    # Padding positions in the encoder output are all-zero vectors.
    encoder_padding = tf.equal(tf.reduce_sum(tf.abs(encoder_output), axis=-1), 0.0)
    encoder_attention_bias = layers_with_attention.attention_bias_ignore_padding(encoder_padding)
    decoder_self_attention_bias = layers_with_attention.attention_bias_lower_triangle(tf.shape(decoder_input)[1])
    decoder_output = embedding(decoder_input,
                               vocab_size=self._config.vocab_size,
                               dense_size=self._config.hidden_units,
                               multiplier=self._config.hidden_units ** 0.5 if self._config.scale_embedding else 1.0,
                               name="dst_embedding")
    # Positional Encoding
    decoder_output = layers_with_attention.add_timing_signal_1d(decoder_output)
    # Dropout
    decoder_output = tf.layers.dropout(decoder_output,
                                       rate=residual_dropout_rate,
                                       training=is_training)
    new_cache = []
    # Blocks
    for i in range(self._config.decoder_num_blocks):
        with tf.variable_scope("block_{}".format(i)):
            layer_name = "layer_%d" % i
            layer_cache = cache_qkv[layer_name] if cache_qkv is not None else None
            # Multihead Attention (self-attention): only the newest
            # position (index -1) is fed as query; earlier positions are
            # supplied through the layer cache.
            decoder_output = residual(decoder_output[:, -1:, :],
                                      sb_multihead_attention_for_decoding(
                                          query_antecedent=decoder_output[:, -1:, :],
                                          memory_antecedent=None,
                                          cache=layer_cache,
                                          bias=decoder_self_attention_bias,
                                          total_key_depth=self._config.hidden_units,
                                          total_value_depth=self._config.hidden_units,
                                          num_heads=self._config.num_heads,
                                          dropout_rate=attention_dropout_rate,
                                          batch_size=batch_size,
                                          beam_size=self._config.test.beam_size,
                                          output_depth=self._config.hidden_units,
                                          name="decoder_self_attention",
                                          summaries=True),
                                      dropout_rate=residual_dropout_rate)
            # Multihead Attention (vanilla attention over encoder output)
            multihead_out = sb_multihead_attention_for_decoding(
                query_antecedent=decoder_output,
                memory_antecedent=encoder_output,
                bias=encoder_attention_bias,
                total_key_depth=self._config.hidden_units,
                total_value_depth=self._config.hidden_units,
                output_depth=self._config.hidden_units,
                num_heads=self._config.num_heads,
                dropout_rate=attention_dropout_rate,
                name="decoder_vanilla_attention",
                summaries=True)
            decoder_output = residual(decoder_output,
                                      multihead_out,
                                      dropout_rate=residual_dropout_rate)
            # Feed Forward
            decoder_output = residual(decoder_output,
                                      ff_hidden(
                                          decoder_output,
                                          hidden_size=self._config.ff_units * self._config.hidden_units,
                                          output_size=self._config.hidden_units,
                                          activation=self._ff_activation),
                                      dropout_rate=residual_dropout_rate)
            # Prepend this block's cached outputs from earlier steps, then
            # record the extended sequence for the next step's cache.
            decoder_output = tf.concat([decoder_cache[:, :, i, :], decoder_output], axis=1)
            new_cache.append(decoder_output[:, :, None, :])
    new_cache = tf.concat(new_cache, axis=2)  # [batch_size, n_step, num_blocks, num_hidden]
    return decoder_output, new_cache, cache_qkv
def decoder_impl(self, decoder_input, encoder_output, is_training, cache=None):
    """Build the training-time decoder graph over two decoding streams.

    Args:
        decoder_input: [2, batch_size, step] token ids — presumably one
            left-to-right and one right-to-left stream for the
            bidirectional decoder; TODO confirm.
        encoder_output: [batch_size, time_step, hidden_units].
        is_training: Python bool; dropout is active only when True.
        cache: optional dict keyed by "layer_%d" with per-layer attention
            caches, or None.

    Returns:
        Decoder states with the same leading [2, batch_size, step] layout
        and a trailing hidden axis.
    """
    # Disable both dropouts entirely at inference time.
    attention_dropout_rate = self._config.attention_dropout_rate if is_training else 0.0
    residual_dropout_rate = self._config.residual_dropout_rate if is_training else 0.0
    # Padding positions in the encoder output are all-zero vectors.
    encoder_padding = tf.equal(tf.reduce_sum(tf.abs(encoder_output), axis=-1), 0.0)
    encoder_attention_bias = layers_with_attention.attention_bias_ignore_padding(encoder_padding)
    decoder_output = embedding(decoder_input,
                               vocab_size=self._config.vocab_size,
                               dense_size=self._config.hidden_units,
                               multiplier=self._config.hidden_units ** 0.5 if self._config.scale_embedding else 1.0,
                               name="dst_embedding")
    # Positional Encoding: applied independently to each of the two
    # streams (axis 0), then re-stacked.
    decoder_output = tf.concat(
        [tf.expand_dims(layers_with_attention.add_timing_signal_1d(decoder_output[0]), 0),
         tf.expand_dims(layers_with_attention.add_timing_signal_1d(decoder_output[1]), 0)],
        0)
    # Dropout
    decoder_output = tf.layers.dropout(decoder_output,
                                       rate=residual_dropout_rate,
                                       training=is_training)
    # Bias for preventing peeping later information for bidirectional decoder
    self_attention_bias = layers_with_attention.attention_bias_lower_triangle(tf.shape(decoder_input)[2])
    # Blocks
    for i in range(self._config.decoder_num_blocks):
        with tf.variable_scope("block_{}".format(i)):
            layer_name = "layer_%d" % i
            layer_cache = cache[layer_name] if cache is not None else None
            # Multihead Attention (self-attention)
            decoder_output = residual(decoder_output,
                                      sb_multihead_attention(
                                          query_antecedent=decoder_output,
                                          memory_antecedent=None,
                                          cache=layer_cache,
                                          bias=self_attention_bias,
                                          total_key_depth=self._config.hidden_units,
                                          total_value_depth=self._config.hidden_units,
                                          num_heads=self._config.num_heads,
                                          dropout_rate=attention_dropout_rate,
                                          output_depth=self._config.hidden_units,
                                          name="decoder_self_attention",
                                          summaries=True),
                                      dropout_rate=residual_dropout_rate)
            # Multihead Attention (vanilla attention over encoder output)
            decoder_output = residual(decoder_output,
                                      sb_multihead_attention(
                                          query_antecedent=decoder_output,
                                          memory_antecedent=encoder_output,
                                          bias=encoder_attention_bias,
                                          total_key_depth=self._config.hidden_units,
                                          total_value_depth=self._config.hidden_units,
                                          output_depth=self._config.hidden_units,
                                          num_heads=self._config.num_heads,
                                          dropout_rate=attention_dropout_rate,
                                          name="decoder_vanilla_attention",
                                          summaries=True),
                                      dropout_rate=residual_dropout_rate)
            # Feed Forward
            decoder_output = residual(decoder_output,
                                      ff_hidden(
                                          decoder_output,
                                          hidden_size=self._config.ff_units * self._config.hidden_units,
                                          output_size=self._config.hidden_units,
                                          activation=self._ff_activation),
                                      dropout_rate=residual_dropout_rate)
    return decoder_output
def darknet53(inputs, trainable, data_format):
    """Darknet-53 backbone.

    Builds the stem convolution followed by five downsampling stages, each
    a stride-2 convolution plus a run of residual blocks.  The feature
    maps produced after the 256- and 512-channel stages are kept as
    `route1` and `route2` for multi-scale heads.

    Returns:
        (route1, route2, inputs) — the two intermediate routes and the
        final 1024-channel feature map.
    """
    # Each spec: (stage channels, conv name, residual count,
    #             residual filters, first residual index, keep as route?).
    stage_specs = (
        (64, 'conv1', 1, 32, 0, False),
        (128, 'conv4', 2, 64, 1, False),
        (256, 'conv9', 8, 128, 3, True),
        (512, 'conv26', 8, 256, 11, True),
        (1024, 'conv43', 4, 512, 19, False),
    )
    routes = []
    with tf.variable_scope('darknet'):
        # Stem convolution (stride 1).
        inputs = convolutional(inputs=inputs, filters=32, kernel_size=3,
                               trainable=trainable, name='conv0',
                               data_format=data_format)
        for filters, conv_name, n_res, res_filters, res_base, is_route in stage_specs:
            # Stride-2 convolution opening the stage.
            inputs = convolutional(inputs=inputs, filters=filters,
                                   kernel_size=3, trainable=trainable,
                                   name=conv_name, strides=2,
                                   data_format=data_format)
            for offset in range(n_res):
                inputs = residual(inputs=inputs, filters=res_filters,
                                  trainable=trainable,
                                  data_format=data_format,
                                  name='residual%d' % (res_base + offset))
            if is_route:
                routes.append(inputs)
    route1, route2 = routes
    return route1, route2, inputs