def _prepare_source():
    """ Pre-processes inputs to the encoder and generates the corresponding attention masks."""
    # Embed
    source_embeddings = self._embed(source_ids)
    # Obtain length and depth of the input tensors
    _, time_steps, depth = tf_utils.get_shape_list(source_embeddings)
    # Transform input mask into attention mask
    inverse_mask = tf.cast(tf.equal(source_mask, 0.0), dtype=FLOAT_DTYPE)
    attn_mask = inverse_mask * -1e9
    # Expansion to shape [batch_size, 1, 1, time_steps] is needed for compatibility with attention logits
    attn_mask = tf.expand_dims(tf.expand_dims(attn_mask, 1), 1)
    # Differentiate between self-attention and cross-attention masks for further, optional modifications
    self_attn_mask = attn_mask
    cross_attn_mask = attn_mask
    # Add positional encodings
    positional_signal = get_positional_signal(time_steps, depth, FLOAT_DTYPE)
    source_embeddings += positional_signal
    # Apply dropout
    if self.config.transformer_dropout_embeddings > 0:
        source_embeddings = tf.layers.dropout(
            source_embeddings,
            rate=self.config.transformer_dropout_embeddings,
            training=self.training)
    return source_embeddings, self_attn_mask, cross_attn_mask
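# For reference, a minimal sketch of what `get_positional_signal` is assumed to
# compute: the standard sinusoidal encoding of Vaswani et al. (2017). Only the
# call signature is taken from the code above; the body is an illustrative
# assumption, not the helper's actual implementation. It assumes `time_steps`
# and `depth` are static Python ints and that `depth` is even.
import numpy as np
import tensorflow as tf

def sketch_get_positional_signal(time_steps, depth, float_dtype,
                                 min_timescale=1.0, max_timescale=1.0e4):
    """Sinusoidal positional signal of shape [1, time_steps, depth]."""
    positions = tf.cast(tf.range(time_steps), float_dtype)
    num_timescales = depth // 2
    # Geometric progression of inverse wavelengths across channels
    log_increment = np.log(max_timescale / min_timescale) / max(num_timescales - 1, 1)
    inv_timescales = min_timescale * tf.exp(
        tf.cast(tf.range(num_timescales), float_dtype) * -log_increment)
    # Outer product of positions and inverse timescales: [time_steps, depth // 2]
    scaled_time = tf.expand_dims(positions, 1) * tf.expand_dims(inv_timescales, 0)
    # Concatenate sine and cosine channels, then add a broadcastable batch axis
    signal = tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)
    return tf.expand_dims(signal, 0)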
def generate_decoding_function(self, encoder_output):
    with tf.compat.v1.name_scope(self._scope):
        # Generate a positional signal for the longest possible output.
        positional_signal = get_positional_signal(
            self._config.translation_maxlen,
            self._config.embedding_size,
            FLOAT_DTYPE)

    decoder = self._model.dec

    def _decoding_function(step_target_ids, current_time_step, memories):
        """Single-step decoding function.

        Args:
            step_target_ids: Tensor with shape (batch_size).
            current_time_step: scalar Tensor.
            memories: dictionary (see top-level class description).

        Returns:
            step_logits: Tensor with shape (batch_size, vocab_size).
            memories: the updated memories dictionary.
        """
        with tf.compat.v1.name_scope(self._scope):
            # TODO Is this necessary?
            vocab_ids = tf.reshape(step_target_ids, [-1, 1])

            # Look up embeddings for target IDs.
            target_embeddings = decoder._embed(vocab_ids)

            # Add positional signal.
            signal_slice = positional_signal[
                :, current_time_step - 1:current_time_step, :]
            target_embeddings += signal_slice

            # Optionally, apply dropout to embeddings.
            if self._config.transformer_dropout_embeddings > 0:
                target_embeddings = tf.compat.v1.layers.dropout(
                    target_embeddings,
                    rate=self._config.transformer_dropout_embeddings,
                    training=decoder.training)

            # Propagate values through the decoder stack.
            # NOTE: No self-attention mask is applied at decoding, as
            # future information is unavailable.
            layer_output = target_embeddings
            for layer_id in range(1, self._config.transformer_dec_depth + 1):
                layer = decoder.decoder_stack[layer_id]
                mem_key = 'layer_{:d}'.format(layer_id)
                layer_output, memories[mem_key] = \
                    layer['self_attn'].forward(
                        layer_output, None, None, memories[mem_key])
                layer_output, _ = layer['cross_attn'].forward(
                    layer_output, encoder_output.enc_output,
                    encoder_output.cross_attn_mask)
                layer_output = layer['ffn'].forward(layer_output)

            # Return prediction at the final time-step to be consistent
            # with the inference pipeline.
            dec_output = layer_output[:, -1, :]

            # Project decoder stack outputs and apply the soft-max
            # non-linearity.
            step_logits = \
                decoder.softmax_projection_layer.project(dec_output)

        return step_logits, memories

    return _decoding_function
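# Illustrative use of the function returned above, unrolled as a plain Python
# loop for clarity; the real pipeline presumably drives it from a beam-search
# or greedy tf.while_loop instead. `adapter`, `encoder_out`, `initial_memories`,
# `go_id`, and `max_len` are assumed names, not part of the original API.
import tensorflow as tf

def sketch_greedy_drive(adapter, encoder_out, initial_memories, go_id,
                        max_len, batch_size):
    decoding_fn = adapter.generate_decoding_function(encoder_out)
    step_ids = tf.fill([batch_size], go_id)   # start every sequence with <GO>
    memories = initial_memories
    outputs = []
    for t in range(1, max_len + 1):
        # Each call consumes one token per sentence and updates the caches
        step_logits, memories = decoding_fn(step_ids, t, memories)
        step_ids = tf.argmax(step_logits, axis=-1, output_type=tf.int32)
        outputs.append(step_ids)
    return tf.stack(outputs, axis=1)          # [batch_size, max_len]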
def _prepare_source():
    """ Pre-processes inputs to the encoder and generates the corresponding attention masks."""
    # Embed
    pre_source_embeddings = self._embed(source_ids)
    with tf.variable_scope(self.name):
        source_embeddings = self.emb_ffn.forward(pre_source_embeddings)
    glove_embeddings = self.embedding_layer.get_glove_embed(source_pids)
    source_embeddings += glove_embeddings
    # Obtain length and depth of the input tensors
    _, time_steps, depth = get_shape_list(source_embeddings)
    # Transform input mask into attention mask
    # Restore source_mask to [batch_size, time_steps]: keep only the first
    # element along the last axis, then drop that axis
    shape_mask = get_shape_list(source_mask)
    source_mask1 = tf.slice(source_mask, [0, 0, 0], [shape_mask[0], shape_mask[1], 1])
    source_mask2 = tf.reshape(source_mask1, [shape_mask[0], shape_mask[1]])
    inverse_mask = tf.cast(tf.equal(source_mask2, 0.0), dtype=self.float_dtype)
    attn_mask = inverse_mask * -1e9
    # Expansion to shape [batch_size, 1, 1, time_steps] is needed for compatibility with attention logits
    attn_mask = tf.expand_dims(tf.expand_dims(attn_mask, 1), 1)
    # Differentiate between self-attention and cross-attention masks for further, optional modifications
    self_attn_mask = attn_mask
    cross_attn_mask = attn_mask
    # Add positional encodings
    positional_signal = get_positional_signal(time_steps, depth, self.float_dtype)
    source_embeddings += positional_signal
    # Apply dropout
    if self.config.transformer_dropout_embeddings > 0:
        source_embeddings = tf.layers.dropout(source_embeddings,
                                              rate=self.config.transformer_dropout_embeddings,
                                              training=self.training)
    return source_embeddings, self_attn_mask, cross_attn_mask
def _embed(self, index_sequence):
    """ Embeds source-side indices to obtain the corresponding dense tensor representations. """
    # Important change:
    # index_sequence: (batch_size, seq_len, u_len)
    u_emb = self.embedding_layer.embed(index_sequence)  # (batch_size, seq_len, u_len, embedding_size)
    shape = get_shape_list(u_emb)
    # Add positional encodings; specific to md5 inputs: [1, u_len, embedding_size]
    if self.config.utf8_type == "md5":
        md5_positional_signal = get_positional_signal(shape[2], shape[3], self.float_dtype)
        u_emb += md5_positional_signal
    # Trim to the configured width (pre_source_embedding_size, 2048 by default)
    input_size = self.config.pre_source_embedding_size
    cc = input_size - shape[2] * shape[3]
    if self.config.pre_source_embed_cross:
        # Seems to perform worse, and the BLEU score is anomalous at test time
        embsize = tf.to_int32(input_size / shape[2])
        accsize = input_size % shape[2]
        fix_merge_emb = tf.pad(u_emb,
                               [[0, 0], [0, 0], [0, 0], [0, tf.reduce_max([embsize - shape[3], 0])]],
                               constant_values=1.0)
        fix_merge_emb = tf.slice(fix_merge_emb, [0, 0, 0, 0], [-1, -1, -1, embsize])
        fix_merge_emb = tf.reshape(fix_merge_emb, [shape[0], shape[1], shape[2] * embsize])
        fix_merge_emb = tf.pad(fix_merge_emb, [[0, 0], [0, 0], [0, accsize]], constant_values=1.0)
    else:
        # (batch_size, seq_len, u_len * embedding_size)
        merge_emb = tf.reshape(u_emb, [shape[0], shape[1], shape[2] * shape[3]])
        fix_merge_emb = tf.pad(merge_emb, [[0, 0], [0, 0], [0, tf.reduce_max([cc, 0])]], constant_values=0)
        fix_merge_emb = tf.slice(fix_merge_emb, [0, 0, 0], [-1, -1, input_size])
    return fix_merge_emb
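# A tiny worked example of the pad-or-truncate step in the `else` branch above,
# in plain NumPy. With u_len=4, embedding_size=3, and pre_source_embedding_size=16,
# the flattened 12-dim vector is right-padded with zeros to 16; had it been wider
# than 16, the slice would truncate it instead. All shapes are illustrative only.
import numpy as np

batch_size, seq_len, u_len, emb = 2, 5, 4, 3
input_size = 16
u_emb = np.random.rand(batch_size, seq_len, u_len, emb).astype(np.float32)

merge_emb = u_emb.reshape(batch_size, seq_len, u_len * emb)    # (2, 5, 12)
cc = input_size - u_len * emb                                  # 16 - 12 = 4
padded = np.pad(merge_emb, [(0, 0), (0, 0), (0, max(cc, 0))])  # (2, 5, 16)
fixed = padded[:, :, :input_size]                              # (2, 5, 16)
assert fixed.shape == (batch_size, seq_len, input_size)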
def _prepare_source():
    """ Pre-processes inputs to the encoder and generates the corresponding attention masks."""
    DICT_SIZE, ENG_DICT_FILE, OUTPUT_TRANSLATE_FILE, _, _, DEBIASED_EMBEDDING, _ = \
        get_debias_files_from_config(self.consts_config_str)
    if self.USE_DEBIASED:
        print("using debiased embeddings")
        self.embedding_layer.embedding_table = self.embedding_matrix
    else:
        print("using non debiased embeddings")
    source_embeddings = self._embed(source_ids)
    if self.COLLECT_EMBEDDING_TABLE:
        ## print the embedding table
        # ########################################### PRINT #########################################################
        printops = []
        printops.append(
            tf.compat.v1.Print(
                [], [tf.shape(self.embedding_layer.embedding_table)],
                "embedding_table shape ", summarize=10000))
        for i in list(range(DICT_SIZE)):
            printops.append(
                tf.compat.v1.Print(
                    [], [self.embedding_layer.embedding_table[i, :]],
                    "enc_inputs for word " + str(i), summarize=10000))
            printops.append(
                tf.compat.v1.Print(
                    [], [], "**************************************", summarize=10000))
            tf.io.write_file(
                "output_translate.txt",
                str(self.embedding_layer.embedding_table[i, :]))
        with tf.control_dependencies(printops):
            source_embeddings = source_embeddings * 1
        # ###########################################################################################################
    # Embed
    ### comment: first embedding without positional signal
    # Obtain length and depth of the input tensors
    _, time_steps, depth = tf_utils.get_shape_list(source_embeddings)
    # Transform input mask into attention mask
    inverse_mask = tf.cast(tf.equal(source_mask, 0.0), dtype=FLOAT_DTYPE)
    attn_mask = inverse_mask * -1e9
    # Expansion to shape [batch_size, 1, 1, time_steps] is needed for compatibility with attention logits
    attn_mask = tf.expand_dims(tf.expand_dims(attn_mask, 1), 1)
    # Differentiate between self-attention and cross-attention masks for further, optional modifications
    self_attn_mask = attn_mask
    cross_attn_mask = attn_mask
    # Add positional encodings
    positional_signal = get_positional_signal(time_steps, depth, FLOAT_DTYPE)
    source_embeddings += positional_signal
    ### comment: first embedding with positional signal
    # Apply dropout
    if self.dropout_embedding is not None:
        source_embeddings = self.dropout_embedding(
            source_embeddings, training=self.training)
    return source_embeddings, self_attn_mask, cross_attn_mask
def decode_at_train(self, target_ids, enc_output, cross_attn_mask):
    """ Returns the probability distribution over target-side tokens conditioned on the output
    of the encoder; performs decoding in parallel at training time. """

    def _decode_all(target_embeddings):
        """ Decodes the encoder-generated representations into target-side logits in parallel. """
        # Apply input dropout
        dec_input = \
            tf.layers.dropout(target_embeddings,
                              rate=self.config.transformer_dropout_embeddings,
                              training=self.training)
        # Propagate inputs through the decoder stack
        dec_output = dec_input
        for layer_id in range(1, self.config.transformer_dec_depth + 1):
            dec_output, _ = self.decoder_stack[layer_id][
                'self_attn'].forward(dec_output, None, self_attn_mask)
            dec_output, _ = \
                self.decoder_stack[layer_id]['cross_attn'].forward(dec_output,
                                                                   enc_output,
                                                                   cross_attn_mask)
            dec_output = self.decoder_stack[layer_id]['ffn'].forward(
                dec_output)
        return dec_output

    def _prepare_targets():
        """ Pre-processes target token ids before they're passed on as input to the decoder
        for parallel decoding. """
        # Embed target_ids
        target_embeddings = self._embed(target_ids)
        target_embeddings += positional_signal
        if self.config.transformer_dropout_embeddings > 0:
            target_embeddings = tf.layers.dropout(
                target_embeddings,
                rate=self.config.transformer_dropout_embeddings,
                training=self.training)
        return target_embeddings

    def _decoding_function():
        """ Generates logits for target-side tokens. """
        # Embed the model's predictions up to the current time-step; add positional information, mask
        target_embeddings = _prepare_targets()
        # Pass encoder context and decoder embeddings through the decoder
        dec_output = _decode_all(target_embeddings)
        # Project decoder stack outputs and apply the soft-max non-linearity
        full_logits = self.softmax_projection_layer.project(dec_output)
        return full_logits

    with tf.variable_scope(self.name):
        # Transpose encoder information in hybrid models
        if self.from_rnn:
            enc_output = tf.transpose(enc_output, [1, 0, 2])
            cross_attn_mask = tf.transpose(cross_attn_mask, [3, 1, 2, 0])

        self_attn_mask = get_right_context_mask(tf.shape(target_ids)[-1])
        positional_signal = get_positional_signal(
            tf.shape(target_ids)[-1], self.config.embedding_size, FLOAT_DTYPE)
        logits = _decoding_function()
    return logits
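# A minimal sketch of what `get_right_context_mask` is assumed to produce: a
# causal (lower-triangular) mask in the same additive -1e9 convention as the
# padding masks above, broadcastable against [batch, heads, t, t] attention
# logits. The body is illustrative; only the name and usage come from the code.
import tensorflow as tf

def sketch_get_right_context_mask(time_steps):
    # 1.0 where position j <= i (visible history), 0.0 for future positions
    lower_triangular = tf.linalg.band_part(tf.ones([time_steps, time_steps]), -1, 0)
    # Future positions become -1e9 so that softmax assigns them ~zero weight
    attn_mask = -1e9 * (1.0 - lower_triangular)
    # Expand to [1, 1, time_steps, time_steps] for broadcasting over batch and heads
    return tf.expand_dims(tf.expand_dims(attn_mask, 0), 0)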
def _pre_embed(self, index_sequence):
    u_emb = self.embedding_layer.embed(index_sequence)  # (batch_size, u_len, embedding_size)
    shape = get_shape_list(u_emb)
    if self.config.utf8_type == "md5":
        md5_positional_signal = get_positional_signal(shape[1], shape[2], self.float_dtype)
        u_emb += md5_positional_signal
    input_size = self.config.pre_source_embedding_size
    cc = input_size - shape[1] * shape[2]
    # merge_emb: (batch_size, u_len * embedding_size)
    merge_emb = tf.reshape(u_emb, [shape[0], shape[1] * shape[2]])
    fix_merge_emb = tf.pad(merge_emb, [[0, 0], [0, tf.reduce_max([cc, 0])]], constant_values=1.0)
    fix_merge_emb = tf.slice(fix_merge_emb, [0, 0], [-1, input_size])
    return fix_merge_emb
def decode_at_train(self, target_ids, enc_output, cross_attn_mask):
    """ Returns the probability distribution over target-side tokens conditioned on the output
    of the encoder; performs decoding in parallel at training time. """

    def _decode_all(target_embeddings):
        """ Decodes the encoder-generated representations into target-side logits in parallel. """
        # Apply input dropout
        dec_input = \
            tf.layers.dropout(target_embeddings,
                              rate=self.config.transformer_dropout_embeddings,
                              training=self.training)
        # Propagate inputs through the decoder stack
        dec_output = dec_input
        for layer_id in range(1, self.config.transformer_dec_depth + 1):
            dec_output, _ = self.decoder_stack[layer_id]['self_attn'].forward(dec_output,
                                                                              None,
                                                                              self_attn_mask)
            dec_output, _ = \
                self.decoder_stack[layer_id]['cross_attn'].forward(dec_output,
                                                                   enc_output,
                                                                   cross_attn_mask)
            dec_output = self.decoder_stack[layer_id]['ffn'].forward(dec_output)
        return dec_output

    def _prepare_targets():
        """ Pre-processes target token ids before they're passed on as input to the decoder
        for parallel decoding. """
        # Embed target_ids
        target_embeddings = self._embed(target_ids)
        target_embeddings += positional_signal
        if self.config.transformer_dropout_embeddings > 0:
            target_embeddings = tf.layers.dropout(target_embeddings,
                                                  rate=self.config.transformer_dropout_embeddings,
                                                  training=self.training)
        return target_embeddings

    def _decoding_function():
        """ Generates logits for target-side tokens. """
        # Embed the model's predictions up to the current time-step; add positional information, mask
        target_embeddings = _prepare_targets()
        # Pass encoder context and decoder embeddings through the decoder
        dec_output = _decode_all(target_embeddings)
        # Project decoder stack outputs and apply the soft-max non-linearity
        full_logits = self.softmax_projection_layer.project(dec_output)
        return full_logits

    with tf.variable_scope(self.name):
        # Transpose encoder information in hybrid models
        if self.from_rnn:
            enc_output = tf.transpose(enc_output, [1, 0, 2])
            cross_attn_mask = tf.transpose(cross_attn_mask, [3, 1, 2, 0])

        self_attn_mask = get_right_context_mask(tf.shape(target_ids)[-1])
        positional_signal = get_positional_signal(tf.shape(target_ids)[-1],
                                                  self.config.embedding_size,
                                                  self.float_dtype)
        logits = _decoding_function()
    return logits
def _prepare_source():
    """ Pre-processes inputs to the encoder and generates the corresponding attention masks."""
    # Embed
    source_embeddings = self._embed(source_ids)
    # Obtain length and depth of the input tensors
    _, time_steps, depth = get_shape_list(source_embeddings)
    # Transform input mask into attention mask
    inverse_mask = tf.cast(tf.equal(source_mask, 0.0), dtype=self.float_dtype)
    attn_mask = inverse_mask * -1e9
    # Expansion to shape [batch_size, 1, 1, time_steps] is needed for compatibility with attention logits
    attn_mask = tf.expand_dims(tf.expand_dims(attn_mask, 1), 1)
    # Differentiate between self-attention and cross-attention masks for further, optional modifications
    self_attn_mask = attn_mask
    cross_attn_mask = attn_mask
    # Add positional encodings
    positional_signal = get_positional_signal(time_steps, depth, self.float_dtype)
    source_embeddings += positional_signal
    # Apply dropout
    if self.config.transformer_dropout_embeddings > 0:
        source_embeddings = tf.layers.dropout(source_embeddings,
                                              rate=self.config.transformer_dropout_embeddings,
                                              training=self.training)
    return source_embeddings, self_attn_mask, cross_attn_mask
def decode_at_test(model, decoder, enc_output, cross_attn_mask, batch_size, beam_size,
                   do_sample, normalization_alpha):
    """ Returns the probability distribution over target-side tokens conditioned on the output
    of the encoder; performs decoding via auto-regression at test time. """

    def _decode_step(target_embeddings, memories):
        """ Decode the encoder-generated representations into target-side logits with auto-regression. """
        # Propagate inputs through the decoder stack
        dec_output = target_embeddings
        # NOTE: No self-attention mask is applied at decoding, as future information is unavailable
        for layer_id in range(1, decoder.config.transformer_dec_depth + 1):
            dec_output, memories['layer_{:d}'.format(layer_id)] = \
                decoder.decoder_stack[layer_id]['self_attn'].forward(
                    dec_output, None, None, memories['layer_{:d}'.format(layer_id)])
            dec_output, _ = \
                decoder.decoder_stack[layer_id]['cross_attn'].forward(dec_output,
                                                                      enc_output,
                                                                      cross_attn_mask)
            dec_output = decoder.decoder_stack[layer_id]['ffn'].forward(
                dec_output)
        # Return prediction at the final time-step to be consistent with the inference pipeline
        dec_output = dec_output[:, -1, :]
        return dec_output, memories

    def _pre_process_targets(step_target_ids, current_time_step):
        """ Pre-processes target token ids before they're passed on as input to the decoder
        for auto-regressive decoding. """
        # Embed target_ids
        target_embeddings = decoder._embed(step_target_ids)
        signal_slice = positional_signal[:, current_time_step - 1:current_time_step, :]
        target_embeddings += signal_slice
        if decoder.config.transformer_dropout_embeddings > 0:
            target_embeddings = tf.layers.dropout(
                target_embeddings,
                rate=decoder.config.transformer_dropout_embeddings,
                training=decoder.training)
        return target_embeddings

    def _decoding_function(step_target_ids, current_time_step, memories):
        """ Generates logits for the target-side token predicted for the next time-step
        with auto-regression. """
        # Embed the model's predictions up to the current time-step; add positional information, mask
        target_embeddings = _pre_process_targets(step_target_ids, current_time_step)
        # Pass encoder context and decoder embeddings through the decoder
        dec_output, memories = _decode_step(target_embeddings, memories)
        # Project decoder stack outputs and apply the soft-max non-linearity
        step_logits = decoder.softmax_projection_layer.project(dec_output)
        return step_logits, memories

    with tf.variable_scope(decoder.name):
        # Transpose encoder information in hybrid models
        if decoder.from_rnn:
            enc_output = tf.transpose(enc_output, [1, 0, 2])
            cross_attn_mask = tf.transpose(cross_attn_mask, [3, 1, 2, 0])

        positional_signal = get_positional_signal(
            decoder.config.translation_maxlen, decoder.config.embedding_size,
            decoder.float_dtype)

        if beam_size > 0:
            # Initialize target IDs with <GO>
            initial_ids = tf.cast(tf.fill([batch_size], 1), dtype=decoder.int_dtype)
            initial_memories = decoder._get_initial_memories(
                batch_size, beam_size=beam_size)
            output_sequences, scores = _beam_search(
                _decoding_function, initial_ids, initial_memories,
                decoder.int_dtype, decoder.float_dtype,
                decoder.config.translation_maxlen, batch_size, beam_size,
                decoder.embedding_layer.get_vocab_size(), 0, normalization_alpha)
        else:
            # Initialize target IDs with <GO>
            initial_ids = tf.cast(tf.fill([batch_size, 1], 1), dtype=decoder.int_dtype)
            initial_memories = decoder._get_initial_memories(batch_size, beam_size=1)
            output_sequences, scores = greedy_search(
                model, _decoding_function, initial_ids, initial_memories,
                decoder.int_dtype, decoder.float_dtype,
                decoder.config.translation_maxlen, batch_size, 0, do_sample,
                time_major=False)

    return output_sequences, scores
def decode_greedy(models, do_sample=False, beam_size=0, normalization_alpha=None):
    """Decodes a source sequence using beam search or sampling.

    Args:
        models: a list of Transformer objects.
        do_sample: randomly sample instead of argmax for greedy search.
        beam_size: integer specifying the beam width.
        normalization_alpha: length normalization hyperparameter.

    Returns:
        A tuple (ids, scores), where ids is a Tensor with shape
        (batch_size, k, max_seq_len) containing k translations for each input
        sentence in model.inputs.x and scores is a Tensor with shape
        (batch_size, k).
    """
    # Get some parameter values. For ensembling, some settings are required to
    # be consistent across all models but others are not. In the former case,
    # we assume that consistency has already been checked. For the parameters
    # that are allowed to vary across models, the first model's settings take
    # precedence.
    batch_size, _ = get_shape_list(models[0].source_ids)
    model_name = models[0].name
    decoder_name = models[0].dec.name
    from_rnn = models[0].dec.from_rnn
    config = models[0].dec.config
    float_dtype = models[0].dec.float_dtype
    int_dtype = models[0].dec.int_dtype
    vocab_size = models[0].dec.embedding_layer.get_vocab_size()

    # Generate a positional signal for the longest possible output.
    with tf.name_scope('{:s}_decode'.format(model_name)):
        with tf.variable_scope(decoder_name):
            positional_signal = get_positional_signal(
                config.translation_maxlen, config.embedding_size, float_dtype)

    # Generate a decoding function for each model.
    decoding_functions = []
    for model in models:
        assert model.name == model_name

        # Encode source sequences.
        with tf.name_scope('{:s}_encode'.format(model.name)):
            enc_output, cross_attn_mask = model.enc.encode(model.source_ids,
                                                           model.source_mask)

        # Generate a model-specific decoding function.
        with tf.name_scope('{:s}_decode'.format(model.name)):
            func = generate_decoding_function(enc_output, cross_attn_mask,
                                              model.dec, positional_signal)
            decoding_functions.append(func)

    # Decode into target sequences
    with tf.name_scope('{:s}_decode'.format(model_name)):
        with tf.variable_scope(decoder_name):
            if beam_size > 0:
                # Initialize target IDs with <GO>
                initial_ids = tf.cast(tf.fill([batch_size], 1), dtype=int_dtype)
                initial_memories = [
                    model.dec._get_initial_memories(batch_size, beam_size=beam_size)
                    for model in models]
                output_sequences, scores = _beam_search(
                    decoding_functions, initial_ids, initial_memories, int_dtype,
                    float_dtype, config.translation_maxlen, batch_size, beam_size,
                    vocab_size, 0, normalization_alpha)
            else:
                # Initialize target IDs with <GO>
                initial_ids = tf.cast(tf.fill([batch_size, 1], 1), dtype=int_dtype)
                initial_memories = [
                    model.dec._get_initial_memories(batch_size, beam_size=1)
                    for model in models]
                output_sequences, scores = greedy_search(
                    models[0], decoding_functions[0], initial_ids,
                    initial_memories[0], int_dtype, float_dtype,
                    config.translation_maxlen, batch_size, 0, do_sample,
                    time_major=False)

    return output_sequences, scores
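# For context, one common way a normalization_alpha hyperparameter is applied
# inside a beam search: the GNMT length penalty. Whether `_beam_search` uses
# exactly this formulation is an assumption; the sketch only illustrates why
# longer hypotheses need their log-probabilities rescaled before comparison,
# since raw log-probabilities always favor shorter outputs.
import tensorflow as tf

def sketch_length_penalty(sequence_lengths, normalization_alpha):
    # Penalty grows with length; dividing scores by it stops beam search from
    # systematically preferring short translations.
    lengths = tf.cast(sequence_lengths, tf.float32)
    return tf.pow((5.0 + lengths) / 6.0, normalization_alpha)

# e.g. normalized_score = raw_log_prob / sketch_length_penalty(lengths, 0.6)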