Example no. 1
    def _prepare_inputs(self, output_word_ids: tf.Tensor,
                        edit_vector: tf.Tensor):
        # Add start token to decoder inputs
        decoder_input_words = prepare_decoder_input(
            output_word_ids)  # [batch, output_len+1]
        decoder_input_max_len = tf.shape(decoder_input_words)[1]
        decoder_input_len = sequence.length_pre_embedding(
            decoder_input_words)  # [batch]

        # Get word embeddings
        decoder_input_embeds = self.embedding_layer(
            decoder_input_words)  # [batch, output_len+1, hidden_size]

        # Add positional encoding to the embeddings part
        with tf.name_scope('positional_encoding'):
            pos_encoding = model_utils.get_position_encoding(
                decoder_input_max_len, self.config.orig_hidden_size)
            decoder_input_embeds += pos_encoding

        decoder_input = decoder_input_embeds

        if self.config.enable_dropout and self.config.layer_postprocess_dropout > 0.:
            decoder_input = tf.nn.dropout(
                decoder_input, 1 - self.config.layer_postprocess_dropout)

        return decoder_input, decoder_input_len
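prepare_decoder_input and sequence.length_pre_embedding are project-specific helpers that are not shown above. A minimal sketch of what they plausibly do, assuming a start-token id of 1 and a padding id of 0 (both values are assumptions, not taken from the source):

    import tensorflow as tf

    START_TOKEN_ID = 1  # hypothetical start-of-sequence id

    def prepare_decoder_input(output_word_ids):
        # Prepend the start token: [batch, output_len] -> [batch, output_len+1].
        batch_size = tf.shape(output_word_ids)[0]
        start_tokens = tf.fill([batch_size, 1],
                               tf.constant(START_TOKEN_ID, dtype=output_word_ids.dtype))
        return tf.concat([start_tokens, output_word_ids], axis=1)

    def length_pre_embedding(word_ids, pad_id=0):
        # Count non-padding tokens per sequence: [batch, output_len] -> [batch].
        return tf.reduce_sum(tf.cast(tf.not_equal(word_ids, pad_id), tf.int32), axis=1)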
Example no. 2
    def encode(self, inputs, attention_bias):
        """Generate continuous representation for inputs.

        Args:
          inputs: int tensor with shape [batch_size, input_length].
          attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

        Returns:
          float tensor with shape [batch_size, input_length, hidden_size]
        """
        with tf.name_scope("encode"):
            # Prepare inputs to the layer stack by adding positional encodings and
            # applying dropout.
            embedded_inputs = self.embedding_softmax_layer(inputs)
            inputs_padding = model_utils.get_padding(inputs)

            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(embedded_inputs)[1]
                pos_encoding = model_utils.get_position_encoding(
                    length, self.params["hidden_size"])
                encoder_inputs = embedded_inputs + pos_encoding

            if self.train:
                encoder_inputs = tf.nn.dropout(
                    encoder_inputs,
                    1 - self.params["layer_postprocess_dropout"])

            return self.encoder_stack(encoder_inputs, attention_bias,
                                      inputs_padding)
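Every example calls model_utils.get_position_encoding(length, hidden_size). In the reference Transformer this is the sinusoidal timing signal; a minimal sketch consistent with how it is used here (the default timescale bounds are an assumption):

    import math
    import tensorflow as tf

    def get_position_encoding(length, hidden_size, min_timescale=1.0, max_timescale=1.0e4):
        # Sinusoidal timing signal of shape [length, hidden_size]; it broadcasts over
        # the batch dimension when added to [batch, length, hidden_size] embeddings.
        position = tf.cast(tf.range(length), tf.float32)
        num_timescales = hidden_size // 2
        log_timescale_increment = (
            math.log(float(max_timescale) / float(min_timescale)) /
            (float(num_timescales) - 1))
        inv_timescales = min_timescale * tf.exp(
            tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
        scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
        return tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)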
Example no. 3
    def _get_symbols_to_logits_fn(self, edit_vector):
        """Returns a decoding function that calculates logits of the next tokens."""
        max_decode_length = self.config.max_decode_length

        timing_signal = model_utils.get_position_encoding(
            max_decode_length + 1, self.config.orig_hidden_size)
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            max_decode_length)

        def symbols_to_logits_fn(ids, i, cache):
            """Generate logits for next potential IDs.

            Args:
              ids: Current decoded sequences.
                int tensor with shape [batch_size * beam_size, i + 1]
              i: Loop index
              cache: dictionary of values storing the encoder output, encoder-decoder
                attention bias, and previous decoder attention values.

            Returns:
              Tuple of
                (logits with shape [batch_size * beam_size, vocab_size],
                 updated cache values)
            """
            # Set decoder input to the last generated IDs
            decoder_input = ids[:, -1:]

            # Preprocess decoder input by getting embeddings and adding timing signal.
            decoder_input = self.embedding_layer(decoder_input)
            decoder_input += timing_signal[i:i + 1]

            self_attention_bias = decoder_self_attention_bias[:, :,
                                                              i:i + 1, :i + 1]

            outputs = self.decoder_stack(
                decoder_input,
                self_attention_bias,
                encoder_outputs=cache["encoder_outputs"],
                encoder_attn_bias=cache["encoder_attn_bias"],
                mev_st=cache["mev_st"],
                mev_st_keys=cache["mev_st_keys"],
                mev_st_attn_bias=cache["mev_st_attn_bias"],
                mev_ts=cache["mev_ts"],
                mev_ts_keys=cache["mev_ts_keys"],
                mev_ts_attn_bias=cache["mev_ts_attn_bias"],
                cache=cache)

            # Project transformer outputs to the embedding space
            outputs = self.project_back(outputs)
            logits = self.vocab_projection.linear(outputs)
            logits = tf.squeeze(logits, axis=[1])

            return logits, cache

        return symbols_to_logits_fn
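The causal mask comes from model_utils.get_decoder_self_attention_bias. A sketch of the usual construction, 0.0 where attention is allowed and a large negative bias above the diagonal, which also shows why the [:, :, i:i + 1, :i + 1] slice extracts exactly row i of the mask for single-step decoding (the -1e9 constant is an assumption):

    import tensorflow as tf

    _NEG_INF = -1e9

    def get_decoder_self_attention_bias(length):
        # Lower-triangular ones mark the positions each step may attend to.
        valid_locs = tf.linalg.band_part(tf.ones([length, length]), -1, 0)
        valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
        # 0.0 where attention is allowed, -1e9 where it must be masked out.
        return _NEG_INF * (1.0 - valid_locs)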
Example no. 4
    def _prepare_inputs(self, seq):
        embedded_inputs = self.embedding_layer(seq)
        length = tf.shape(embedded_inputs)[1]

        with tf.name_scope("pos_encoding"):
            pos_encoding = model_utils.get_position_encoding(length, self.config.hidden_size)
            embedded_inputs += pos_encoding

        if self.config.enable_dropout and self.config.layer_postprocess_dropout > 0.:
            embedded_inputs = tf.nn.dropout(embedded_inputs, 1. - self.config.layer_postprocess_dropout)

        return embedded_inputs
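The 1. - layer_postprocess_dropout argument appears because these snippets use the TensorFlow 1.x tf.nn.dropout signature, whose second positional argument is the keep probability. A short comparison with the TensorFlow 2.x form (the sample tensor and rate are placeholders):

    import tensorflow as tf

    x = tf.random.uniform([2, 5, 8])   # stand-in for embedded_inputs
    drop_rate = 0.1                    # e.g. config.layer_postprocess_dropout

    # TensorFlow 1.x signature: the second positional argument is the *keep* probability.
    y = tf.nn.dropout(x, 1. - drop_rate)

    # TensorFlow 2.x equivalent: pass the *drop* probability via `rate`.
    y = tf.nn.dropout(x, rate=drop_rate)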
Example no. 5
    def _get_symbols_to_logits_fn(self, max_decode_length):
        """Returns a decoding function that calculates logits of the next tokens."""

        timing_signal = model_utils.get_position_encoding(
            max_decode_length + 1, self.params["hidden_size"])
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            max_decode_length)

        def symbols_to_logits_fn(ids, i, cache):
            """Generate logits for next potential IDs.

            Args:
              ids: Current decoded sequences.
                int tensor with shape [batch_size * beam_size, i + 1]
              i: Loop index
              cache: dictionary of values storing the encoder output, encoder-decoder
                attention bias, and previous decoder attention values.

            Returns:
              Tuple of
                (logits with shape [batch_size * beam_size, vocab_size],
                 updated cache values)
            """
            # Set decoder input to the last generated IDs
            decoder_input = ids[:, -1:]

            # Preprocess decoder input by getting embeddings and adding timing signal.
            decoder_input = self.embedding_softmax_layer(decoder_input)
            decoder_input += timing_signal[i:i + 1]

            self_attention_bias = decoder_self_attention_bias[:, :,
                                                              i:i + 1, :i + 1]
            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get("encoder_outputs"), self_attention_bias,
                cache.get("encoder_decoder_attention_bias"), cache)
            logits = self.embedding_softmax_layer.linear(decoder_outputs)
            logits = tf.squeeze(logits, axis=[1])
            return logits, cache

        return symbols_to_logits_fn
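symbols_to_logits_fn is written for incremental decoding: each call consumes only the last generated id, and per-layer key/value tensors accumulate inside cache. A sketch of how such a cache is typically initialized before the function is handed to a beam-search loop (all sizes and the layer naming scheme are assumptions, not taken from the source):

    import tensorflow as tf

    # Hypothetical sizes, for illustration only.
    batch_size, input_len, hidden_size, num_decoder_layers = 8, 10, 512, 6

    # Stand-ins for real encoder results.
    encoder_outputs = tf.zeros([batch_size, input_len, hidden_size])
    encoder_decoder_attention_bias = tf.zeros([batch_size, 1, 1, input_len])

    cache = {
        "layer_%d" % layer: {
            # Empty per-layer key/value tensors; the decoder stack appends one
            # position to each of them at every decode step.
            "k": tf.zeros([batch_size, 0, hidden_size]),
            "v": tf.zeros([batch_size, 0, hidden_size]),
        }
        for layer in range(num_decoder_layers)
    }
    cache["encoder_outputs"] = encoder_outputs
    cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias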
Example no. 6
    def decode(self, targets, encoder_outputs, attention_bias):
        """Generate logits for each value in the target sequence.

        Args:
          targets: target values for the output sequence.
            int tensor with shape [batch_size, target_length]
          encoder_outputs: continuous representation of input sequence.
            float tensor with shape [batch_size, input_length, hidden_size]
          attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

        Returns:
          float32 tensor with shape [batch_size, target_length, vocab_size]
        """
        with tf.name_scope("decode"):
            # Prepare inputs to decoder layers by shifting targets, adding positional
            # encoding and applying dropout.
            decoder_inputs = self.embedding_softmax_layer(targets)
            with tf.name_scope("shift_targets"):
                # Shift targets to the right, and remove the last element
                decoder_inputs = tf.pad(decoder_inputs,
                                        [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, self.params["hidden_size"])
            if self.train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    1 - self.params["layer_postprocess_dropout"])

            # Build the causal self-attention bias and run the decoder stack.
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias)
            logits = self.embedding_softmax_layer.linear(outputs)
            return logits
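The shift_targets block above implements teacher forcing: a zero vector stands in for a start token and the embedding of the final target position is dropped. A toy check with made-up values (batch_size=1, target_length=3, hidden_size=2):

    import tensorflow as tf

    x = tf.constant([[[1., 1.], [2., 2.], [3., 3.]]])  # [1, 3, 2]
    shifted = tf.pad(x, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    # shifted == [[[0., 0.], [1., 1.], [2., 2.]]]: a zero "start" vector is
    # prepended and the embedding of the last target position is discarded.
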
    def call(self, src, tgt, src_len, tgt_len, **kwargs):
        # Add [REMOVE] token to the beginning of the target sequence
        # [batch, length, hidden_size]
        embedded_tgt = self.embedding_layer(tgt)

        # [batch, length+1, hidden_size]
        extended_embedded_tgt = self._add_token_to_beginning(embedded_tgt, self.rm_tok_embedding)
        extended_embedded_tgt += model_utils.get_position_encoding(
            tf.shape(extended_embedded_tgt)[1],
            self.params.hidden_size
        )
        extended_tgt_len = tgt_len + 1

        if self.params.get('noiser_ident_prob', 1) < 1:
            extended_tgt_attention_bias, extended_tgt_padding = self._get_attn_bias_with_dropout(
                extended_tgt_len, uniform_low=1)
        else:
            extended_tgt_padding = model_utils.get_padding_by_seq_len(extended_tgt_len)
            extended_tgt_attention_bias = model_utils.get_padding_bias(None, extended_tgt_padding)

        # Add [CLS] token to the beginning of source sequence
        # [batch, length, hidden_size]
        embedded_src = self.embedding_layer(src)

        # [batch, length+1, hidden_size]
        extended_embedded_src = self._add_cls_token(embedded_src)
        extended_embedded_src += model_utils.get_position_encoding(
            tf.shape(extended_embedded_src)[1],
            self.params.hidden_size
        )
        extended_src_len = src_len + 1

        if self.params.get('noiser_ident_prob', 1) < 1:
            extended_src_attention_bias, extended_src_padding = self._get_attn_bias_with_dropout(
                extended_src_len, uniform_low=1)
        else:
            extended_src_padding = model_utils.get_padding_by_seq_len(extended_src_len)
            extended_src_attention_bias = model_utils.get_padding_bias(None, extended_src_padding)

        # Encode Target
        # [batch, length+1, hidden_size]
        encoded_tgt = self._encode_tgt(extended_embedded_tgt, extended_tgt_padding, extended_tgt_attention_bias)

        # Decode source using the encoded target
        # [batch, length+1, hidden_size]
        decoder_output = self._decode_micro_edit_vectors(extended_embedded_src, extended_src_padding,
                                                         extended_src_attention_bias,
                                                         encoded_tgt, extended_tgt_attention_bias)

        if not graph_utils.is_training() and self.params.save_attentions:
            tf.add_to_collection('TransformerMicroEditExtractor_Attentions', [
                self.target_encoder.self_attn_alignment_history,
                self.mev_decoder.self_attn_alignment_history,
                self.mev_decoder.enc_dec_attn_alignment_history,
            ])

        with tf.name_scope("pooler"):
            # We "pool" the model by simply taking the hidden state corresponding
            # to the first token.
            first_token_tensor = tf.squeeze(decoder_output[:, 0:1, :], axis=1)
            pooled = self.pooling_layer(first_token_tensor)

        # [batch, length, hidden_size]
        micro_ev = self.mev_projection(decoder_output[:, 1:, :])

        return encoded_tgt[:, 1:, :], extended_tgt_attention_bias[:, :, :, 1:], pooled, micro_ev
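call relies on two padding helpers that are not shown: model_utils.get_padding_by_seq_len and a two-argument model_utils.get_padding_bias. A sketch of plausible implementations consistent with how they are called above (the sequence_mask construction and the -1e9 constant are assumptions):

    import tensorflow as tf

    def get_padding_by_seq_len(seq_len, max_len=None):
        # 1.0 at padded positions, 0.0 at real tokens; shape [batch, max_len].
        non_pad = tf.sequence_mask(seq_len, maxlen=max_len, dtype=tf.float32)
        return 1.0 - non_pad

    def get_padding_bias(x, padding=None):
        # Turn a padding mask into an additive attention bias of shape
        # [batch, 1, 1, length]: 0.0 for real tokens, a large negative value for
        # padded ones. When `padding` is given, `x` is ignored, matching the
        # calls above that pass None as the first argument.
        if padding is None:
            padding = tf.cast(tf.equal(x, 0), tf.float32)
        attention_bias = padding * -1e9
        return tf.expand_dims(tf.expand_dims(attention_bias, axis=1), axis=1)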