Example #1
  def encode(self, inputs, attention_bias):
    """Generate continuous representation for inputs.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

    Returns:
      float tensor with shape [batch_size, input_length, hidden_size]
    """
    with tf.name_scope("encode"):
      # Prepare inputs to the layer stack by adding positional encodings and
      # applying dropout.
      embedded_inputs = self.embedding_softmax_layer(inputs)
      inputs_padding = model_utils.get_padding(inputs)

      with tf.name_scope("add_pos_encoding"):
        length = tf.shape(embedded_inputs)[1]
        pos_encoding = model_utils.get_position_encoding(
            length, self.params.hidden_size)
        encoder_inputs = embedded_inputs + pos_encoding

      if self.train:
        encoder_inputs = tf.nn.dropout(
            encoder_inputs, 1 - self.params.layer_postprocess_dropout)

      return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
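This and the following examples all lean on model_utils.get_position_encoding. For reference, here is a minimal sketch of the standard sinusoidal encoding that helper typically computes; the min_timescale/max_timescale defaults are assumptions, not taken from the snippets themselves.

import math
import tensorflow as tf

def get_position_encoding(length, hidden_size,
                          min_timescale=1.0, max_timescale=1.0e4):
  """Sinusoidal position encoding with shape [length, hidden_size]."""
  position = tf.cast(tf.range(length), tf.float32)
  num_timescales = hidden_size // 2
  log_timescale_increment = (
      math.log(float(max_timescale) / float(min_timescale)) /
      (float(num_timescales) - 1))
  inv_timescales = min_timescale * tf.exp(
      tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
  scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
  # Sine in the first half of the channels, cosine in the second half.
  return tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)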
Example #2
    def decode(self, targets, encoder_outputs, attention_bias):
        """Generate logits for each value in the target sequence.

        Args:
          targets: target values for the output sequence.
            int tensor with shape [batch_size, target_length]
          encoder_outputs: continuous representation of input sequence.
            float tensor with shape [batch_size, input_length, hidden_size]
          attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

        Returns:
          float32 tensor with shape [batch_size, target_length, vocab_size]
        """
        with tf.compat.v1.name_scope("decode"):
            # Prepare inputs to decoder layers by shifting targets, adding positional
            # encoding and applying dropout.
            decoder_inputs = self.embedding_softmax_layer(targets)
            with tf.compat.v1.name_scope("shift_targets"):
                # Shift targets to the right, and remove the last element
                decoder_inputs = tf.pad(tensor=decoder_inputs,
                                        paddings=[[0, 0], [1, 0],
                                                  [0, 0]])[:, :-1, :]
            with tf.compat.v1.name_scope("add_pos_encoding"):
                length = tf.shape(input=decoder_inputs)[1]
                poscod = tf.cast(
                    model_utils.get_position_encoding(length,
                                                      self.params.hidden_size),
                    tf.bfloat16)
                decoder_inputs += poscod
            if self.train:
                mlperf_log.transformer_print(
                    key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
                    value=self.params.layer_postprocess_dropout)
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    rate=self.params.layer_postprocess_dropout)

            # Build the self-attention bias and run the decoder stack.
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias)
            logits = self.embedding_softmax_layer.linear(outputs)
            return logits
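The shift_targets block is the standard teacher-forcing shift: position t of the decoder input holds the embedding of target t-1, with a zero vector standing in for the start token. A tiny, self-contained illustration of the tf.pad/slice trick, using made-up values:

import tensorflow as tf

# Toy batch: batch_size=1, target_length=3, hidden_size=2 (values arbitrary).
decoder_inputs = tf.constant([[[1., 1.], [2., 2.], [3., 3.]]])

# Prepend a zero vector on the length axis, then drop the last position.
shifted = tf.pad(decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
print(shifted.numpy())
# [[[0. 0.]
#   [1. 1.]
#   [2. 2.]]]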
Example #3
    def _get_symbols_to_logits_fn(self, max_decode_length):
        """Returns a decoding function that calculates logits of the next tokens."""

        #   [length, hidden_size]
        timing_signal = model_utils.get_position_encoding(
            max_decode_length + 1, self.params["hidden_size"])

        #   [1,1,length,length]
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            max_decode_length)

        def symbols_to_logits_fn(ids, i, cache):
            """Generate logits for next potential IDs.

            Args:
              ids: Current decoded sequences.
                int tensor with shape [batch_size * beam_size, i + 1]
              i: Loop index
              cache: dictionary of values storing the encoder output, encoder-decoder
                attention bias, and previous decoder attention values.

            Returns:
              Tuple of
                (logits with shape [batch_size * beam_size, vocab_size],
                 updated cache values)
            """
            # Set decoder input to the last generated IDs
            decoder_input = ids[:, -1:]

            # Preprocess decoder input by getting embeddings and adding timing signal.
            decoder_input = self.embedding_softmax_layer(decoder_input)
            decoder_input += timing_signal[i:i + 1]

            self_attention_bias = decoder_self_attention_bias[:, :,
                                                              i:i + 1, :i + 1]
            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get("encoder_outputs"), self_attention_bias,
                cache.get("encoder_decoder_attention_bias"), cache)
            logits = self.embedding_softmax_layer.linear(decoder_outputs)
            logits = tf.squeeze(logits, axis=[1])
            return logits, cache

        return symbols_to_logits_fn
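The cache argument is a plain dictionary threaded through the decoding loop. The exact keys depend on the decoder stack, but in this family of code it is commonly seeded along the following lines before beam search starts; the "layer_<n>" key format and the empty k/v tensors are assumptions for illustration, not taken from the snippet above.

import tensorflow as tf

# Illustrative shapes only.
batch_size, input_length, hidden_size, num_hidden_layers = 2, 5, 8, 4
encoder_outputs = tf.zeros([batch_size, input_length, hidden_size])
encoder_decoder_attention_bias = tf.zeros([batch_size, 1, 1, input_length])

# Per-layer key/value tensors start empty and grow by one position per
# decoding step inside the decoder stack.
cache = {
    "layer_%d" % layer: {
        "k": tf.zeros([batch_size, 0, hidden_size]),
        "v": tf.zeros([batch_size, 0, hidden_size]),
    }
    for layer in range(num_hidden_layers)
}
cache["encoder_outputs"] = encoder_outputs
cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias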
Example #4
    def decode(self, targets, encoder_outputs, attention_bias):
        """Generate logits for each value in the target sequence.

        Args:
          targets: target values for the output sequence.
            int tensor with shape [batch_size, target_length]
          encoder_outputs: continuous representation of input sequence.
            float tensor with shape [batch_size, input_length, hidden_size]
          attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

        Returns:
          float32 tensor with shape [batch_size, target_length, vocab_size]
        """
        with tf.name_scope("decode"):
            # Prepare inputs to decoder layers by shifting targets, adding positional
            # encoding and applying dropout.
            # shape: (batch_size, target_length, hidden_size)
            decoder_inputs = self.embedding_softmax_layer(targets)
            with tf.name_scope("shift_targets"):
                # Shift targets one position to the right: prepend a zero
                # embedding (standing in for <BOS>) and drop the last
                # element (<EOS>).
                decoder_inputs = tf.pad(decoder_inputs,
                                        [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, self.params["hidden_size"])
            if self.train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    1 - self.params["layer_postprocess_dropout"])

            # Build the self-attention bias and run the decoder stack.
            # shape: [1, 1, target_length, target_length]
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            # shape: (batch_size, target_length, hidden_size)
            outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias)
            # shape: (batch_size, target_length, vocab_size)
            logits = self.embedding_softmax_layer.linear(outputs)
            return logits
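model_utils.get_decoder_self_attention_bias supplies the causal mask added to the attention logits above. A minimal sketch of the usual construction follows; the -1e9 constant is a common convention, assumed here rather than taken from the examples.

import tensorflow as tf

def get_decoder_self_attention_bias(length, neg_inf=-1e9):
  """Returns a [1, 1, length, length] bias blocking attention to future positions."""
  # Lower-triangular matrix of ones: position i may attend to positions <= i.
  valid_locs = tf.linalg.band_part(tf.ones([length, length]), -1, 0)
  valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
  # 0 where attention is allowed, a large negative number where it is not.
  return neg_inf * (1.0 - valid_locs)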
Example #5
    def _decode(self, encoder_outputs, targets, attention_bias):
        decoder_inputs = self.embedding_layer(targets)
        decoder_inputs = tf.pad(decoder_inputs,
                                [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
        # add positional encoding
        length = tf.shape(decoder_inputs)[1]
        decoder_inputs += model_utils.get_position_encoding(
            length, self.hparams['num_units'])

        if self.is_train:
            decoder_inputs = self.decoder_embedding_dropout(decoder_inputs)

        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            length)
        outputs, dec_ponders, dec_remainders = self.decoder_stack(
            decoder_inputs, encoder_outputs, decoder_self_attention_bias,
            attention_bias)
        logits = self.embedding_layer.linear(outputs)
        return logits, dec_ponders, dec_remainders

    def decode(self, targets, encoder_outputs, attention_bias):
        """Generate logits for each value in the target sequence.

        Args:
          targets: target values for the output sequence.
            int tensor with shape [batch_size, target_length]
          encoder_outputs: continuous representation of input sequence.
            float tensor with shape [batch_size, input_length, hidden_size]
          attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

        Returns:
          float32 tensor with shape [batch_size, target_length, vocab_size]
        """
        with tf.name_scope("decode"):
            # Prepare inputs to decoder layers by shifting targets, adding positional
            # encoding and applying dropout.
            # Use the decoder-specific embedding layer rather than a shared
            # embedding_softmax_layer.
            decoder_inputs = self.decoder_embedding_layer(targets, not ModeKeys.is_predict_one(self.mode))
            with tf.name_scope("shift_targets"):
                # Shift targets to the right, and remove the last element
                decoder_inputs = tf.pad(
                    decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # [batch, tgt_seqn_len, embed_size]
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, self.params.hidden_size)
            if self.is_train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs, 1 - self.params.layer_postprocess_dropout)

            # Build the self-attention bias and run the decoder stack.
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            outputs = self.decoder_stack(
                decoder_inputs, encoder_outputs, decoder_self_attention_bias,
                attention_bias)
            # Project with the decoder-specific softmax layer rather than the
            # shared embedding_softmax_layer.
            logits = self.decoder_softmax_layer.linear(outputs)
            return logits
Example #7
  def _get_symbols_to_logits_fn(self, max_decode_length):
    """Returns a decoding function that calculates logits of the next tokens."""

    timing_signal = model_utils.get_position_encoding(
        max_decode_length + 1, self.params.hidden_size)
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        max_decode_length)

    def symbols_to_logits_fn(ids, i, cache):
      """Generate logits for next potential IDs.

      Args:
        ids: Current decoded sequences.
          int tensor with shape [batch_size * beam_size, i + 1]
        i: Loop index
        cache: dictionary of values storing the encoder output, encoder-decoder
          attention bias, and previous decoder attention values.

      Returns:
        Tuple of
          (logits with shape [batch_size * beam_size, vocab_size],
           updated cache values)
      """
      # Set decoder input to the last generated IDs
      decoder_input = ids[:, -1:]

      # Preprocess decoder input by getting embeddings and adding timing signal.
      decoder_input = self.embedding_softmax_layer(decoder_input)
      decoder_input += timing_signal[i:i + 1]

      self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
      decoder_outputs = self.decoder_stack(
          decoder_input, cache.get("encoder_outputs"), self_attention_bias,
          cache.get("encoder_decoder_attention_bias"), cache)
      logits = self.embedding_softmax_layer.linear(decoder_outputs)
      logits = tf.squeeze(logits, axis=[1])
      return logits, cache
    return symbols_to_logits_fn
Example #8
    def call(self, decoder_inputs, encoder_outputs,
             decoder_self_attention_bias, attention_bias):
        batch_size, length, hidden_size = tf.unstack(tf.shape(decoder_inputs))
        act = ACT(batch_size, length, hidden_size)
        halt_threshold = 1.0 - self.hparams['act_epsilon']

        state = decoder_inputs
        previous_state = tf.zeros_like(state, name='previous_state')
        for step in range(self.hparams['act_max_step']):
            # judge to continue
            if not act.should_continue(halt_threshold):
                break

            # position and timestep encoding
            state += model_utils.get_position_encoding(
                self.hparams['max_length'], hidden_size)
            state += model_utils.get_timestep_encoding(
                step, self.hparams['act_max_step'], hidden_size)

            # to judge pondering
            pondering = self.pondering_layer(state)
            pondering = tf.squeeze(pondering, axis=-1)

            # proceed act step
            update_weights = act(pondering, halt_threshold)

            state = self.self_attention_wrapper(state,
                                                decoder_self_attention_bias)
            state = self.enc_dec_attention_wrapper(state, encoder_outputs,
                                                   attention_bias)
            state = self.ffn_wrapper(state)

            # update new state and previous state
            new_state = (state * update_weights) + (previous_state *
                                                    (1 - update_weights))
            previous_state = new_state

        return self.output_norm(new_state), act.n_updates, act.remainders

    def decode(self, targets, encoder_outputs, attention_bias):
        """Generate logits for each value in the target sequence."""
        with tf.name_scope("decode"):
            decoder_inputs = self.decoder_embedding_layer(targets, not ModeKeys.is_predict_one(self.mode))
            with tf.name_scope("shift_targets"):
                decoder_inputs = tf.pad(
                    decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # [batch, tgt_seqn_len, embed_size]
            with tf.name_scope("add_pos_encoding"):
                length = tf.shape(decoder_inputs)[1]
                decoder_inputs += model_utils.get_position_encoding(
                    length, self.params.hidden_size)
            if self.is_train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs, 1 - self.params.layer_postprocess_dropout)

            # Build the self-attention bias and run the decoder stack.
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)
            outputs = self.decoder_stack(
                decoder_inputs, encoder_outputs, decoder_self_attention_bias,
                attention_bias)
            logits = self.decoder_softmax_layer.linear(outputs)
            return logits
Example #10
    def encode(self, inputs, attention_bias):
        """Generate continuous representation for inputs.

        Args:
            inputs: int tensor with shape [batch_size, input_length].
            attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

        Returns:
            float tensor with shape [batch_size, input_length, hidden_size]
        """
        embedded_inputs = self.embedding_softmax_layer(inputs)
        inputs_padding = model_utils.get_padding(inputs)

        length = embedded_inputs.shape[1]
        pos_encoding = model_utils.get_position_encoding(
            length, self.param.hidden_size, inputs.context)
        encoder_inputs = embedded_inputs + pos_encoding

        if self.train:
            encoder_inputs = self.dropout_input(encoder_inputs)

        return self.encoder_stack(encoder_inputs, attention_bias,
                                  inputs_padding)
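The encoder paths also call model_utils.get_padding to flag the padded positions of the int input batch. A short sketch, under the common assumption that padding uses token id 0:

import tensorflow as tf

def get_padding(x, padding_value=0):
  """Returns a float tensor, same shape as x, that is 1.0 at padded positions."""
  return tf.cast(tf.equal(x, padding_value), tf.float32)

# Example: the trailing zeros of each row are flagged as padding.
ids = tf.constant([[7, 12, 0, 0], [5, 9, 3, 0]])
print(get_padding(ids).numpy())
# [[0. 0. 1. 1.]
#  [0. 0. 0. 1.]]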
Example #11
  def decode(self, targets, encoder_outputs, attention_bias):
    """Generate logits for each value in the target sequence.

    Args:
      targets: target values for the output sequence.
        int tensor with shape [batch_size, target_length]
      encoder_outputs: continuous representation of input sequence.
        float tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
    with tf.name_scope("decode"):
      # Prepare inputs to decoder layers by shifting targets, adding positional
      # encoding and applying dropout.
      decoder_inputs = self.embedding_softmax_layer(targets)
      with tf.name_scope("shift_targets"):
        # Shift targets to the right, and remove the last element
        decoder_inputs = tf.pad(
            decoder_inputs, [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
      with tf.name_scope("add_pos_encoding"):
        length = tf.shape(decoder_inputs)[1]
        decoder_inputs += model_utils.get_position_encoding(
            length, self.params.hidden_size)
      if self.train:
        decoder_inputs = tf.nn.dropout(
            decoder_inputs, 1 - self.params.layer_postprocess_dropout)

      # Build the self-attention bias and run the decoder stack.
      decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
          length)
      outputs = self.decoder_stack(
          decoder_inputs, encoder_outputs, decoder_self_attention_bias,
          attention_bias)
      logits = self.embedding_softmax_layer.linear(outputs)
      return logits
Example #12
    def decode(self, targets, encoder_outputs, attention_bias):
        """
        :param targets:  [batch_size, target_length]
        :param encoder_outputs: [batch_size, input_length, hidden_size]
        :param attention_bias:  [batch_size, 1, 1, input_length]
        :return: [batch_size, target_length, vocab_size]
        """
        with tf.name_scope('decode'):
            #   [batch_size, target_length, hidden_size]
            decoder_inputs = self.embedding_layer(targets)
            with tf.name_scope('shift_targets'):
                #   pad embedding value 0 at the head of sequence and remove eos_id
                decoder_inputs = tf.pad(decoder_inputs,
                                        [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
            with tf.name_scope('add_pos_embedding'):
                length = tf.shape(decoder_inputs)[1]
                position_decode = model_utils.get_position_encoding(
                    length, self.params.get('hidden_size'))
                decoder_inputs = tf.add(decoder_inputs, position_decode)

            if self.train:
                decoder_inputs = tf.nn.dropout(
                    decoder_inputs,
                    1. - self.params.get('encoder_decoder_dropout'))

            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                length)

            outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias)

            #   [batch_size, target_length, vocab_size]
            logits = self.embedding_layer.linear(outputs)

            return logits

    def _get_symbols_to_logits_fn(self, max_decode_length):
        """Returns a decoding function that calculates logits of the next tokens."""
        if ModeKeys.is_predict_one(self.mode):
            timing_signal = model_utils.get_position_encoding(
                self.params.max_length, self.params.hidden_size)
            timing_signal = tf.slice(
                timing_signal, [0, 0],
                [max_decode_length + 1, self.params.hidden_size],
                name='slice_timing_signal')
        else:
            timing_signal = model_utils.get_position_encoding(
                max_decode_length + 1, self.params.hidden_size
            )  # [max_decode_length + 1, hidden_size]

        if ModeKeys.is_predict_one(self.mode):
            decoder_self_attention_bias = None
        else:
            decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
                max_decode_length
            )  # [1, 1, max_decode_length, max_decode_length]

        def symbols_to_logits_fn(ids, i, cache):
            """Generate logits for next potential IDs.

            Args:
              ids: Current decoded sequences.
                int tensor with shape [batch_size * beam_size, i + 1]
              i: Loop index
              cache: dictionary of values storing the encoder output, encoder-decoder
                attention bias, and previous decoder attention values.

            Returns:
              Tuple of
                (logits with shape [batch_size * beam_size, vocab_size],
                 updated cache values)
            """
            # Set decoder input to the last generated ID of each sequence.
            decoder_input = ids[:, -1:]  # [batch, 1]

            # Preprocess decoder input by getting embeddings (from the
            # decoder-specific embedding layer) and adding the timing signal.
            decoder_input = self.decoder_embedding_layer(
                decoder_input, not ModeKeys.is_predict_one(
                    self.mode))  # [batch, 1, hidden_size]
            if ModeKeys.is_predict_one(self.mode):
                decoder_input = decoder_input * (1 -
                                                 tf.to_float(tf.equal(i, 0)))

            # Add the position encoding for decoding step i.
            slice_pos_encoding = tf.slice(
                timing_signal, [i, 0], [1, self.params.hidden_size],
                name='slice_pos_encoding')  # [1, hidden_size]
            decoder_input += slice_pos_encoding

            if decoder_self_attention_bias is None:
                self_attention_bias = None
            else:
                # Bias for the current step: [1, 1, 1, i + 1]
                self_attention_bias = decoder_self_attention_bias[
                    :, :, i:i + 1, :i + 1]
            decoder_outputs = self.decoder_stack(
                decoder_input,
                cache.get("encoder_outputs"), self_attention_bias,
                cache.get("encoder_decoder_attention_bias"), cache)
            logits = self.decoder_softmax_layer.linear(decoder_outputs)
            # Flatten to [batch, vocab_size] for the current position.
            logits = tf.reshape(logits, [-1, self.params.target_vocab_size])
            return logits, cache

        return symbols_to_logits_fn
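In these repositories symbols_to_logits_fn is driven by a beam-search loop, but the calling convention is easiest to see with a bare greedy driver. The sketch below is illustrative only; EOS_ID, the starting id of 0, and eager-mode execution are assumptions, and the cache is assumed to be initialized as shown earlier.

import tensorflow as tf

EOS_ID = 1  # assumed end-of-sequence id

def greedy_decode(symbols_to_logits_fn, cache, batch_size, max_decode_length):
  # Start every sequence from a dummy id of 0, mirroring the shifted targets.
  ids = tf.zeros([batch_size, 1], dtype=tf.int32)
  for i in range(max_decode_length):
    logits, cache = symbols_to_logits_fn(ids, i, cache)
    next_id = tf.argmax(logits, axis=-1, output_type=tf.int32)
    ids = tf.concat([ids, next_id[:, tf.newaxis]], axis=1)
    if bool(tf.reduce_all(tf.equal(next_id, EOS_ID))):
      break
  return ids[:, 1:]  # drop the initial dummy id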
Example #14
    def call(self, encoder_inputs, attention_bias, inputs_padding):
        batch_size, length, hidden_size = tf.unstack(tf.shape(encoder_inputs))
        act = ACT(batch_size, length, hidden_size)
        halt_threshold = 1.0 - self.hparams.act_epsilon

        state = encoder_inputs
        previous_state = tf.zeros_like(state, name='previous_state')
        for step in range(self.hparams.act_max_step):
            # judge to continue
            if not act.should_continue(halt_threshold):
                break

            # position & timestep encoding
            state += model_utils.get_position_encoding(self.hparams.max_length,
                                                       hidden_size)
            state += model_utils.get_timestep_encoding(
                step, self.hparams.act_max_step, hidden_size)

            # to judge pondering
            pondering = self.pondering_layer(state)
            pondering = tf.squeeze(pondering, axis=-1)

            # Predict how many attention heads to use at this step. Note that,
            # as written, the three flags are computed identically from the
            # same softmax output.
            num_head_layer = self.num_head_layer(state)
            num_head_3logit = tf.greater(tf.nn.softmax(num_head_layer), 0.6)
            num_head_5logit = tf.greater(tf.nn.softmax(num_head_layer), 0.6)
            num_head_7logit = tf.greater(tf.nn.softmax(num_head_layer), 0.6)

            # proceed act step
            update_weights = act(pondering, halt_threshold)

            if num_head_3logit:
                self_attention_layer = SelfAttention(self.hparams.num_units, 3,
                                                     self.hparams.dropout_rate,
                                                     self.is_train)
            elif num_head_5logit:
                self_attention_layer = SelfAttention(self.hparams.num_units, 5,
                                                     self.hparams.dropout_rate,
                                                     self.is_train)
            # Note: no branch constructs a 7-head layer even though
            # num_head_7logit is computed above.

            ffn_layer = FeedForwardNetwork(self.hparams.num_units,
                                           self.hparams.num_filter_units,
                                           self.hparams.dropout_rate,
                                           self.is_train)
            self.self_attention_wrapper = LayerWrapper(self_attention_layer,
                                                       self.hparams.num_units,
                                                       self.hparams.dropout_rate,
                                                       self.is_train)
            self.ffn_wrapper = LayerWrapper(ffn_layer, self.hparams.num_units,
                                            self.hparams.dropout_rate,
                                            self.is_train)
            self.output_norm = LayerNormalization(self.hparams.num_units)

            state = self.self_attention_wrapper(state, attention_bias)
            state = self.ffn_wrapper(state, inputs_padding)

            # update new state and previous state
            new_state = (state * update_weights) + (previous_state *
                                                    (1 - update_weights))
            previous_state = new_state

        return self.output_norm(new_state), act.n_updates, act.remainders
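The ACT object used in Examples #8 and #14 carries per-position halting statistics in the style of Adaptive Computation Time / the Universal Transformer. A condensed sketch of that bookkeeping follows; the attribute and method names mirror the usage above, but the exact update rules are an assumption based on the ACT formulation, not taken from these snippets.

import tensorflow as tf

class ACT:
  """Per-position halting bookkeeping for adaptive computation time."""

  def __init__(self, batch_size, length, hidden_size):
    self.halting_probability = tf.zeros([batch_size, length])
    self.remainders = tf.zeros([batch_size, length])
    self.n_updates = tf.zeros([batch_size, length])
    self.hidden_size = hidden_size

  def should_continue(self, halt_threshold):
    # Keep iterating while any position is still below the halting threshold.
    return tf.reduce_any(tf.less(self.halting_probability, halt_threshold))

  def __call__(self, pondering, halt_threshold):
    # pondering: [batch, length] halting probability emitted at this step.
    still_running = tf.cast(
        tf.less(self.halting_probability, halt_threshold), tf.float32)
    new_halted = tf.cast(
        tf.greater(self.halting_probability + pondering * still_running,
                   halt_threshold), tf.float32) * still_running
    still_running *= 1.0 - new_halted
    self.halting_probability += pondering * still_running
    self.remainders += new_halted * (1.0 - self.halting_probability)
    self.halting_probability += new_halted * self.remainders
    self.n_updates += still_running + new_halted
    # Weight used to mix the new state with the previous state, broadcast
    # over the hidden dimension.
    update_weights = tf.expand_dims(
        pondering * still_running + new_halted * self.remainders, -1)
    return update_weights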