def _prepare_inputs(self, output_word_ids: tf.Tensor, edit_vector: tf.Tensor):
    # Add start token to decoder inputs
    decoder_input_words = prepare_decoder_input(output_word_ids)  # [batch, output_len+1]
    decoder_input_max_len = tf.shape(decoder_input_words)[1]
    decoder_input_len = sequence.length_pre_embedding(decoder_input_words)  # [batch]

    # Get word embeddings
    decoder_input_embeds = self.embedding_layer(decoder_input_words)  # [batch, output_len+1, hidden_size]

    # Add positional encoding to the embeddings
    with tf.name_scope('positional_encoding'):
        pos_encoding = model_utils.get_position_encoding(
            decoder_input_max_len, self.config.orig_hidden_size)
        decoder_input_embeds += pos_encoding

    decoder_input = decoder_input_embeds

    if self.config.enable_dropout and self.config.layer_postprocess_dropout > 0.:
        decoder_input = tf.nn.dropout(
            decoder_input, 1 - self.config.layer_postprocess_dropout)

    return decoder_input, decoder_input_len
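# `prepare_decoder_input` is defined elsewhere in the repo; a minimal sketch of what it is
# assumed to do here: prepend a start-of-sequence id so the decoder input grows from
# output_len to output_len + 1 (the id value 1 below is a hypothetical placeholder).
def prepare_decoder_input(output_word_ids, start_token_id=1):
    batch_size = tf.shape(output_word_ids)[0]
    start_tokens = tf.fill([batch_size, 1], tf.cast(start_token_id, output_word_ids.dtype))
    # [batch, output_len+1]
    return tf.concat([start_tokens, output_word_ids], axis=1)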
def encode(self, inputs, attention_bias):
    """Generate continuous representation for inputs.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

    Returns:
      float tensor with shape [batch_size, input_length, hidden_size]
    """
    with tf.name_scope("encode"):
        # Prepare inputs to the layer stack by adding positional encodings and
        # applying dropout.
        embedded_inputs = self.embedding_softmax_layer(inputs)
        inputs_padding = model_utils.get_padding(inputs)

        with tf.name_scope("add_pos_encoding"):
            length = tf.shape(embedded_inputs)[1]
            pos_encoding = model_utils.get_position_encoding(
                length, self.params["hidden_size"])
            encoder_inputs = embedded_inputs + pos_encoding

        if self.train:
            encoder_inputs = tf.nn.dropout(
                encoder_inputs, 1 - self.params["layer_postprocess_dropout"])

        return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
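# For reference, a sketch of the sinusoidal timing signal that `model_utils.get_position_encoding`
# is assumed to compute (following "Attention Is All You Need"); the timescale defaults here are
# assumptions, not values taken from this repo.
import math

def get_position_encoding(length, hidden_size, min_timescale=1.0, max_timescale=1.0e4):
    position = tf.cast(tf.range(length), tf.float32)
    num_timescales = hidden_size // 2
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        (tf.cast(num_timescales, tf.float32) - 1))
    inv_timescales = min_timescale * tf.exp(
        tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
    scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
    # [length, hidden_size]: sines in the first half of the channels, cosines in the second
    return tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)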
def _get_symbols_to_logits_fn(self, edit_vector):
    """Returns a decoding function that calculates logits of the next tokens."""
    max_decode_length = self.config.max_decode_length

    timing_signal = model_utils.get_position_encoding(
        max_decode_length + 1, self.config.orig_hidden_size)
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        max_decode_length)

    def symbols_to_logits_fn(ids, i, cache):
        """Generate logits for next potential IDs.

        Args:
          ids: Current decoded sequences. int tensor with shape
            [batch_size * beam_size, i + 1]
          i: Loop index
          cache: dictionary of values storing the encoder output, encoder-decoder
            attention bias, and previous decoder attention values.

        Returns:
          Tuple of (logits with shape [batch_size * beam_size, vocab_size],
            updated cache values)
        """
        # Set decoder input to the last generated IDs
        decoder_input = ids[:, -1:]

        # Preprocess decoder input by getting embeddings and adding timing signal.
        decoder_input = self.embedding_layer(decoder_input)
        decoder_input += timing_signal[i:i + 1]

        self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]

        outputs = self.decoder_stack(
            decoder_input,
            self_attention_bias,
            encoder_outputs=cache["encoder_outputs"],
            encoder_attn_bias=cache["encoder_attn_bias"],
            mev_st=cache["mev_st"],
            mev_st_keys=cache["mev_st_keys"],
            mev_st_attn_bias=cache["mev_st_attn_bias"],
            mev_ts=cache["mev_ts"],
            mev_ts_keys=cache["mev_ts_keys"],
            mev_ts_attn_bias=cache["mev_ts_attn_bias"],
            cache=cache)

        # Project transformer outputs to the embedding space
        outputs = self.project_back(outputs)
        logits = self.vocab_projection.linear(outputs)
        logits = tf.squeeze(logits, axis=[1])

        return logits, cache

    return symbols_to_logits_fn
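# Sketch of the causal bias assumed to come out of `model_utils.get_decoder_self_attention_bias`:
# a [1, 1, length, length] tensor that is 0 at positions a query may attend to (itself and
# earlier steps) and a large negative number elsewhere, so those logits vanish after softmax.
def get_decoder_self_attention_bias(length, neg_inf=-1e9):
    # Lower-triangular matrix of ones marks the allowed (causal) positions.
    valid_locs = tf.linalg.band_part(tf.ones([length, length]), -1, 0)
    valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
    return neg_inf * (1.0 - valid_locs)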
def _prepare_inputs(self, seq):
    embedded_inputs = self.embedding_layer(seq)
    length = tf.shape(embedded_inputs)[1]

    with tf.name_scope("pos_encoding"):
        pos_encoding = model_utils.get_position_encoding(length, self.config.hidden_size)
        embedded_inputs += pos_encoding

    if self.config.enable_dropout and self.config.layer_postprocess_dropout > 0.:
        embedded_inputs = tf.nn.dropout(
            embedded_inputs, 1. - self.config.layer_postprocess_dropout)

    return embedded_inputs
def _get_symbols_to_logits_fn(self, max_decode_length):
    """Returns a decoding function that calculates logits of the next tokens."""
    timing_signal = model_utils.get_position_encoding(
        max_decode_length + 1, self.params["hidden_size"])
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        max_decode_length)

    def symbols_to_logits_fn(ids, i, cache):
        """Generate logits for next potential IDs.

        Args:
          ids: Current decoded sequences. int tensor with shape
            [batch_size * beam_size, i + 1]
          i: Loop index
          cache: dictionary of values storing the encoder output, encoder-decoder
            attention bias, and previous decoder attention values.

        Returns:
          Tuple of (logits with shape [batch_size * beam_size, vocab_size],
            updated cache values)
        """
        # Set decoder input to the last generated IDs
        decoder_input = ids[:, -1:]

        # Preprocess decoder input by getting embeddings and adding timing signal.
        decoder_input = self.embedding_softmax_layer(decoder_input)
        decoder_input += timing_signal[i:i + 1]

        self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]

        decoder_outputs = self.decoder_stack(
            decoder_input, cache.get("encoder_outputs"), self_attention_bias,
            cache.get("encoder_decoder_attention_bias"), cache)
        logits = self.embedding_softmax_layer.linear(decoder_outputs)
        logits = tf.squeeze(logits, axis=[1])

        return logits, cache

    return symbols_to_logits_fn
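# Hypothetical driver showing how a symbols_to_logits_fn is consumed one step at a time.
# The real model hands it to beam search; here the cache passed in is assumed to already
# hold the encoder outputs, attention bias, and per-layer key/value state the function
# expects, and greedy argmax stands in for beam search purely for illustration.
def greedy_decode(symbols_to_logits_fn, initial_ids, max_decode_length, cache):
    ids = tf.expand_dims(initial_ids, 1)  # [batch, 1] start ids
    for i in range(max_decode_length):
        logits, cache = symbols_to_logits_fn(ids, i, cache)
        next_ids = tf.argmax(logits, axis=-1, output_type=tf.int32)  # [batch]
        ids = tf.concat([ids, tf.expand_dims(next_ids, 1)], axis=1)
    return ids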
def decode(self, targets, encoder_outputs, attention_bias):
    """Generate logits for each value in the target sequence.

    Args:
      targets: target values for the output sequence. int tensor with shape
        [batch_size, target_length]
      encoder_outputs: continuous representation of input sequence. float tensor
        with shape [batch_size, input_length, hidden_size]
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
    with tf.name_scope("decode"):
        # Prepare inputs to decoder layers by shifting targets, adding positional
        # encoding and applying dropout.
        decoder_inputs = self.embedding_softmax_layer(targets)
        with tf.name_scope("shift_targets"):
            # Shift targets to the right, and remove the last element
            decoder_inputs = tf.pad(
                decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
        with tf.name_scope("add_pos_encoding"):
            length = tf.shape(decoder_inputs)[1]
            decoder_inputs += model_utils.get_position_encoding(
                length, self.params["hidden_size"])
        if self.train:
            decoder_inputs = tf.nn.dropout(
                decoder_inputs, 1 - self.params["layer_postprocess_dropout"])

        # Run values
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            length)
        outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                     decoder_self_attention_bias, attention_bias)
        logits = self.embedding_softmax_layer.linear(outputs)
        return logits
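# A quick numeric check of the "shift targets right" trick above: padding one step of zeros
# at the front of the time axis and dropping the last step turns embedded targets
# [t0, t1, t2] into decoder inputs [0, t0, t1], so position i predicts target i.
x = tf.reshape(tf.range(1, 7, dtype=tf.float32), [1, 3, 2])  # [[[1, 2], [3, 4], [5, 6]]]
shifted = tf.pad(x, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]     # [[[0, 0], [1, 2], [3, 4]]]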
def call(self, src, tgt, src_len, tgt_len, **kwargs):
    # Add [REMOVE] token to the beginning of the target sequence
    # [batch, length, hidden_size]
    embedded_tgt = self.embedding_layer(tgt)
    # [batch, length+1, hidden_size]
    extended_embedded_tgt = self._add_token_to_beginning(embedded_tgt, self.rm_tok_embedding)
    extended_embedded_tgt += model_utils.get_position_encoding(
        tf.shape(extended_embedded_tgt)[1],
        self.params.hidden_size
    )
    extended_tgt_len = tgt_len + 1

    if self.params.get('noiser_ident_prob', 1) < 1:
        extended_tgt_attention_bias, extended_tgt_padding = self._get_attn_bias_with_dropout(
            extended_tgt_len, uniform_low=1)
    else:
        extended_tgt_padding = model_utils.get_padding_by_seq_len(extended_tgt_len)
        extended_tgt_attention_bias = model_utils.get_padding_bias(None, extended_tgt_padding)

    # Add [CLS] token to the beginning of the source sequence
    # [batch, length, hidden_size]
    embedded_src = self.embedding_layer(src)
    # [batch, length+1, hidden_size]
    extended_embedded_src = self._add_cls_token(embedded_src)
    extended_embedded_src += model_utils.get_position_encoding(
        tf.shape(extended_embedded_src)[1],
        self.params.hidden_size
    )
    extended_src_len = src_len + 1

    if self.params.get('noiser_ident_prob', 1) < 1:
        extended_src_attention_bias, extended_src_padding = self._get_attn_bias_with_dropout(
            extended_src_len, uniform_low=1)
    else:
        extended_src_padding = model_utils.get_padding_by_seq_len(extended_src_len)
        extended_src_attention_bias = model_utils.get_padding_bias(None, extended_src_padding)

    # Encode target
    # [batch, length+1, hidden_size]
    encoded_tgt = self._encode_tgt(extended_embedded_tgt, extended_tgt_padding,
                                   extended_tgt_attention_bias)

    # Decode source using the encoded target
    # [batch, length+1, hidden_size]
    decoder_output = self._decode_micro_edit_vectors(extended_embedded_src, extended_src_padding,
                                                     extended_src_attention_bias,
                                                     encoded_tgt, extended_tgt_attention_bias)

    if not graph_utils.is_training() and self.params.save_attentions:
        tf.add_to_collection('TransformerMicroEditExtractor_Attentions', [
            self.target_encoder.self_attn_alignment_history,
            self.mev_decoder.self_attn_alignment_history,
            self.mev_decoder.enc_dec_attn_alignment_history,
        ])

    with tf.name_scope("pooler"):
        # We "pool" the model by simply taking the hidden state corresponding
        # to the first token.
        first_token_tensor = tf.squeeze(decoder_output[:, 0:1, :], axis=1)
        pooled = self.pooling_layer(first_token_tensor)

    # [batch, length, hidden_size]
    micro_ev = self.mev_projection(decoder_output[:, 1:, :])

    return encoded_tgt[:, 1:, :], extended_tgt_attention_bias[:, :, :, 1:], pooled, micro_ev
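# Sketch of the assumed behaviour of `_add_token_to_beginning` (and, with the learned [CLS]
# embedding, `_add_cls_token`): broadcast one [hidden_size] vector across the batch and
# concatenate it in front of the sequence, growing the length by one.
def _add_token_to_beginning(self, embedded_seq, token_embedding):
    batch_size = tf.shape(embedded_seq)[0]
    # [batch, 1, hidden_size]
    tok = tf.tile(tf.reshape(token_embedding, [1, 1, -1]), [batch_size, 1, 1])
    # [batch, length+1, hidden_size]
    return tf.concat([tok, embedded_seq], axis=1)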