def encode(self, inputs, attention_bias):
  """Generate continuous representation for inputs.

  Args:
    inputs: int tensor with shape [batch_size, input_length].
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float tensor with shape [batch_size, input_length, hidden_size]
  """
  with tf.name_scope("encode"):
    # Prepare inputs to the layer stack by adding positional encodings and
    # applying dropout.
    embedded_inputs = self.embedding_softmax_layer(inputs)
    inputs_padding = model_utils.get_padding(inputs)

    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(embedded_inputs)[1]
      pos_encoding = model_utils.get_position_encoding(
          length, self.params.hidden_size)
      encoder_inputs = embedded_inputs + pos_encoding

    if self.train:
      encoder_inputs = tf.nn.dropout(
          encoder_inputs, 1 - self.params.layer_postprocess_dropout)

    return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
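# The encode()/decode() methods above and below all rely on
# model_utils.get_position_encoding. Below is a minimal sketch of such a
# helper, assuming the standard sinusoidal encoding from "Attention Is All You
# Need"; the actual model_utils implementation may differ in details such as
# timescale handling.
import math

import tensorflow as tf


def get_position_encoding(length, hidden_size,
                          min_timescale=1.0, max_timescale=1.0e4):
  """Returns a [length, hidden_size] tensor of sinusoidal position signals."""
  position = tf.cast(tf.range(length), tf.float32)
  num_timescales = hidden_size // 2
  log_timescale_increment = (
      math.log(float(max_timescale) / float(min_timescale)) /
      (tf.cast(num_timescales, tf.float32) - 1))
  inv_timescales = min_timescale * tf.exp(
      tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
  # [length, num_timescales]: each position scaled by each inverse timescale.
  scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
  # Concatenate the sine and cosine halves along the hidden dimension.
  return tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)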
def decode(self, targets, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence.

  Args:
    targets: target values for the output sequence. int tensor with shape
      [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence. float tensor
      with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  with tf.compat.v1.name_scope("decode"):
    # Prepare inputs to decoder layers by shifting targets, adding positional
    # encoding and applying dropout.
    decoder_inputs = self.embedding_softmax_layer(targets)
    with tf.compat.v1.name_scope("shift_targets"):
      # Shift targets to the right, and remove the last element
      decoder_inputs = tf.pad(
          tensor=decoder_inputs, paddings=[[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    with tf.compat.v1.name_scope("add_pos_encoding"):
      length = tf.shape(input=decoder_inputs)[1]
      pos_encoding = tf.cast(
          model_utils.get_position_encoding(length, self.params.hidden_size),
          tf.bfloat16)
      decoder_inputs += pos_encoding
    if self.train:
      mlperf_log.transformer_print(
          key=mlperf_log.MODEL_HP_LAYER_POSTPROCESS_DROPOUT,
          value=self.params.layer_postprocess_dropout)
      # tf.nn.dropout takes the drop rate here (TF2 semantics).
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, rate=self.params.layer_postprocess_dropout)

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                 decoder_self_attention_bias, attention_bias)
    logits = self.embedding_softmax_layer.linear(outputs)
    return logits
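# Every decode() variant in these snippets right-shifts the target embeddings
# with the same pad-and-trim trick. A tiny worked example on token ids
# (hypothetical values), assuming eager execution:
import tensorflow as tf

targets = tf.constant([[11, 12, 13, 1]])              # [batch=1, length=4]; 1 plays the role of EOS
shifted = tf.pad(targets, [[0, 0], [1, 0]])[:, :-1]   # -> [[0, 11, 12, 13]]
# Position i of the decoder input now holds target token i - 1, with a zero
# "start" entry at position 0 and the final token dropped, so the decoder
# predicts token i from tokens < i.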
def _get_symbols_to_logits_fn(self, max_decode_length):
  """Returns a decoding function that calculates logits of the next tokens."""
  # [length, hidden_size]
  timing_signal = model_utils.get_position_encoding(
      max_decode_length + 1, self.params["hidden_size"])
  # [1, 1, length, length]
  decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      max_decode_length)

  def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Args:
      ids: Current decoded sequences. int tensor with shape
        [batch_size * beam_size, i + 1]
      i: Loop index
      cache: dictionary of values storing the encoder output, encoder-decoder
        attention bias, and previous decoder attention values.

    Returns:
      Tuple of (logits with shape [batch_size * beam_size, vocab_size],
        updated cache values)
    """
    # Set decoder input to the last generated IDs
    decoder_input = ids[:, -1:]

    # Preprocess decoder input by getting embeddings and adding timing signal.
    decoder_input = self.embedding_softmax_layer(decoder_input)
    decoder_input += timing_signal[i:i + 1]

    self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
    decoder_outputs = self.decoder_stack(
        decoder_input, cache.get("encoder_outputs"), self_attention_bias,
        cache.get("encoder_decoder_attention_bias"), cache)
    logits = self.embedding_softmax_layer.linear(decoder_outputs)
    logits = tf.squeeze(logits, axis=[1])
    return logits, cache

  return symbols_to_logits_fn
def decode(self, targets, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence.

  Args:
    targets: target values for the output sequence. int tensor with shape
      [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence. float tensor
      with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  with tf.name_scope("decode"):
    # Prepare inputs to decoder layers by shifting targets, adding positional
    # encoding and applying dropout.
    # shape: [batch_size, target_length, hidden_size]
    decoder_inputs = self.embedding_softmax_layer(targets)
    with tf.name_scope("shift_targets"):
      # Shift decoder inputs one token to the right: the first position is
      # filled with 0 (acting as <BOS>) and the last element (<EOS>) is removed.
      decoder_inputs = tf.pad(decoder_inputs,
                              [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params["hidden_size"])
    if self.train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params["layer_postprocess_dropout"])

    # Run values
    # shape: [1, 1, target_length, target_length]
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    # shape: [batch_size, target_length, hidden_size]
    outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                 decoder_self_attention_bias, attention_bias)
    # shape: [batch_size, target_length, vocab_size]
    logits = self.embedding_softmax_layer.linear(outputs)
    return logits
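# The decode() methods above obtain their causal mask from
# model_utils.get_decoder_self_attention_bias. A minimal sketch, assuming the
# usual lower-triangular mask expressed as a large negative additive bias; the
# real helper may use a different negative constant or dtype handling.
import tensorflow as tf

_NEG_INF = -1e9


def get_decoder_self_attention_bias(length):
  """Returns a [1, 1, length, length] bias blocking attention to future positions."""
  # Lower-triangular matrix of ones: position i may attend to positions <= i.
  valid_locs = tf.linalg.band_part(tf.ones([length, length]), -1, 0)
  valid_locs = tf.reshape(valid_locs, [1, 1, length, length])
  # 0 where attention is allowed, -1e9 where it must be masked out; the bias
  # is added to the attention logits before the softmax.
  return _NEG_INF * (1.0 - valid_locs)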
def _decode(self, encoder_outputs, targets, attention_bias):
  """Generate logits for the target sequence plus ACT pondering statistics."""
  # Embed targets, then shift them one position to the right and drop the
  # last element so position i only sees previous outputs.
  decoder_inputs = self.embedding_layer(targets)
  decoder_inputs = tf.pad(decoder_inputs,
                          [[0, 0], [1, 0], [0, 0]])[:, :-1, :]

  # add positional encoding
  length = tf.shape(decoder_inputs)[1]
  decoder_inputs += model_utils.get_position_encoding(
      length, self.hparams['num_units'])

  if self.is_train:
    decoder_inputs = self.decoder_embedding_dropout(decoder_inputs)

  decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      length)
  outputs, dec_ponders, dec_remainders = self.decoder_stack(
      decoder_inputs, encoder_outputs, decoder_self_attention_bias,
      attention_bias)
  logits = self.embedding_layer.linear(outputs)
  return logits, dec_ponders, dec_remainders
def decode(self, targets, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence.

  Args:
    targets: target values for the output sequence. int tensor with shape
      [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence. float tensor
      with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  with tf.name_scope("decode"):
    # Prepare inputs to decoder layers by shifting targets, adding positional
    # encoding and applying dropout. Uses a decoder-specific embedding layer
    # rather than the shared embedding_softmax_layer.
    decoder_inputs = self.decoder_embedding_layer(
        targets, not ModeKeys.is_predict_one(self.mode))
    with tf.name_scope("shift_targets"):
      # Shift targets to the right, and remove the last element
      decoder_inputs = tf.pad(
          decoder_inputs,
          [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # [batch, tgt_seq_len, embed_size]
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params.hidden_size)
    if self.is_train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params.layer_postprocess_dropout)

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    outputs = self.decoder_stack(
        decoder_inputs, encoder_outputs, decoder_self_attention_bias,
        attention_bias)
    # Project to vocabulary logits with the decoder-specific softmax layer.
    logits = self.decoder_softmax_layer.linear(outputs)
    return logits
def _get_symbols_to_logits_fn(self, max_decode_length):
  """Returns a decoding function that calculates logits of the next tokens."""
  timing_signal = model_utils.get_position_encoding(
      max_decode_length + 1, self.params.hidden_size)
  decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      max_decode_length)

  def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Args:
      ids: Current decoded sequences. int tensor with shape
        [batch_size * beam_size, i + 1]
      i: Loop index
      cache: dictionary of values storing the encoder output, encoder-decoder
        attention bias, and previous decoder attention values.

    Returns:
      Tuple of (logits with shape [batch_size * beam_size, vocab_size],
        updated cache values)
    """
    # Set decoder input to the last generated IDs
    decoder_input = ids[:, -1:]

    # Preprocess decoder input by getting embeddings and adding timing signal.
    decoder_input = self.embedding_softmax_layer(decoder_input)
    decoder_input += timing_signal[i:i + 1]

    self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
    decoder_outputs = self.decoder_stack(
        decoder_input, cache.get("encoder_outputs"), self_attention_bias,
        cache.get("encoder_decoder_attention_bias"), cache)
    logits = self.embedding_softmax_layer.linear(decoder_outputs)
    logits = tf.squeeze(logits, axis=[1])
    return logits, cache

  return symbols_to_logits_fn
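# Sketch of how a symbols_to_logits_fn like the ones above can drive a simple
# greedy decoding loop. The cache layout (per-layer attention entries plus
# "encoder_outputs" and "encoder_decoder_attention_bias"), the EOS_ID value,
# and eager execution are assumptions here; in the surrounding codebases the
# function is normally handed to a beam search helper instead.
import tensorflow as tf

EOS_ID = 1  # assumed end-of-sequence id


def greedy_decode(symbols_to_logits_fn, cache, batch_size, max_decode_length):
  """Greedily decodes token ids one step at a time."""
  # Start from an all-zeros "start symbol" column, matching the convention
  # that initial ids are 0.
  decoded_ids = tf.zeros([batch_size, 1], dtype=tf.int32)
  finished = tf.zeros([batch_size], dtype=tf.bool)
  for i in range(max_decode_length):
    logits, cache = symbols_to_logits_fn(decoded_ids, i, cache)
    next_id = tf.argmax(logits, axis=-1, output_type=tf.int32)  # [batch_size]
    finished = tf.logical_or(finished, tf.equal(next_id, EOS_ID))
    decoded_ids = tf.concat([decoded_ids, tf.expand_dims(next_id, 1)], axis=1)
    if bool(tf.reduce_all(finished)):  # early exit once every sequence hit EOS
      break
  return decoded_ids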
def call(self, decoder_inputs, encoder_outputs, decoder_self_attention_bias,
         attention_bias):
  batch_size, length, hidden_size = tf.unstack(tf.shape(decoder_inputs))
  act = ACT(batch_size, length, hidden_size)
  halt_threshold = 1.0 - self.hparams['act_epsilon']

  state = decoder_inputs
  previous_state = tf.zeros_like(state, name='previous_state')
  for step in range(self.hparams['act_max_step']):
    # judge to continue
    if not act.should_continue(halt_threshold):
      break

    # position and timestep encoding
    state += model_utils.get_position_encoding(
        self.hparams['max_length'], hidden_size)
    state += model_utils.get_timestep_encoding(
        step, self.hparams['act_max_step'], hidden_size)

    # to judge pondering
    pondering = self.pondering_layer(state)
    pondering = tf.squeeze(pondering, axis=-1)

    # proceed act step
    update_weights = act(pondering, halt_threshold)

    state = self.self_attention_wrapper(state, decoder_self_attention_bias)
    state = self.enc_dec_attention_wrapper(state, encoder_outputs,
                                           attention_bias)
    state = self.ffn_wrapper(state)

    # update new state and previous state
    new_state = (state * update_weights) + (previous_state *
                                            (1 - update_weights))
    previous_state = new_state

  return self.output_norm(new_state), act.n_updates, act.remainders
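# Both ACT-style call() methods in these snippets depend on an ACT helper
# object exposing should_continue(), __call__(), n_updates, and remainders.
# Below is a minimal sketch of such a class, assuming Graves-style adaptive
# computation time (accumulated halting probability, remainders for positions
# that cross the threshold, per-position update counts) and eager execution;
# the real ACT class used by these stacks may differ.
import tensorflow as tf


class ACT(object):
  """Tracks per-position halting state for adaptive computation time."""

  def __init__(self, batch_size, length, hidden_size):
    self.halting_probability = tf.zeros([batch_size, length])
    self.remainders = tf.zeros([batch_size, length])
    self.n_updates = tf.zeros([batch_size, length])
    self.hidden_size = hidden_size

  def should_continue(self, halt_threshold):
    # Keep iterating while any position is still below the halting threshold.
    return bool(tf.reduce_any(self.halting_probability < halt_threshold))

  def __call__(self, pondering, halt_threshold):
    # pondering: [batch_size, length] halting probabilities for this step.
    still_running = tf.cast(self.halting_probability < halt_threshold,
                            tf.float32)
    # Positions that would cross the threshold this step halt now and
    # contribute their remainder instead of the raw pondering value.
    new_halted = tf.cast(
        self.halting_probability + pondering * still_running > halt_threshold,
        tf.float32) * still_running
    still_running *= (1.0 - new_halted)
    self.halting_probability += pondering * still_running
    self.remainders += new_halted * (1.0 - self.halting_probability)
    self.halting_probability += new_halted * self.remainders
    self.n_updates += still_running + new_halted
    # [batch_size, length, 1] weight for blending this step's state into the
    # running output, as done in the call() methods above.
    return tf.expand_dims(
        pondering * still_running + new_halted * self.remainders, -1)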
def decode(self, targets, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence."""
  with tf.name_scope("decode"):
    decoder_inputs = self.decoder_embedding_layer(
        targets, not ModeKeys.is_predict_one(self.mode))
    with tf.name_scope("shift_targets"):
      decoder_inputs = tf.pad(
          decoder_inputs,
          [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # [batch, tgt_seq_len, embed_size]
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params.hidden_size)
    if self.is_train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params.layer_postprocess_dropout)

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    outputs = self.decoder_stack(
        decoder_inputs, encoder_outputs, decoder_self_attention_bias,
        attention_bias)
    logits = self.decoder_softmax_layer.linear(outputs)
    return logits
def encode(self, inputs, attention_bias):
  """Generate continuous representation for inputs.

  Args:
    inputs: int tensor with shape [batch_size, input_length].
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float tensor with shape [batch_size, input_length, hidden_size]
  """
  embedded_inputs = self.embedding_softmax_layer(inputs)
  inputs_padding = model_utils.get_padding(inputs)

  length = embedded_inputs.shape[1]
  pos_encoding = model_utils.get_position_encoding(
      length, self.param.hidden_size, inputs.context)
  encoder_inputs = embedded_inputs + pos_encoding

  if self.train:
    encoder_inputs = self.dropout_input(encoder_inputs)

  return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
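# Both encode() variants above compute inputs_padding via
# model_utils.get_padding. A minimal sketch, assuming the common convention
# that padded positions carry token id 0; the helper marks them with 1.0 and
# real tokens with 0.0 so downstream layers can zero out padded positions.
import tensorflow as tf


def get_padding(x, padding_value=0):
  """Returns a float tensor with 1.0 at padded positions and 0.0 elsewhere."""
  return tf.cast(tf.equal(x, padding_value), tf.float32)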
def decode(self, targets, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence.

  Args:
    targets: target values for the output sequence. int tensor with shape
      [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence. float tensor
      with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  with tf.name_scope("decode"):
    # Prepare inputs to decoder layers by shifting targets, adding positional
    # encoding and applying dropout.
    decoder_inputs = self.embedding_softmax_layer(targets)
    with tf.name_scope("shift_targets"):
      # Shift targets to the right, and remove the last element
      decoder_inputs = tf.pad(decoder_inputs,
                              [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params.hidden_size)
    if self.train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params.layer_postprocess_dropout)

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    outputs = self.decoder_stack(
        decoder_inputs, encoder_outputs, decoder_self_attention_bias,
        attention_bias)
    logits = self.embedding_softmax_layer.linear(outputs)
    return logits
def decode(self, targets, encoder_outputs, attention_bias):
  """
  :param targets: [batch_size, target_length]
  :param encoder_outputs: [batch_size, input_length, hidden_size]
  :param attention_bias: [batch_size, 1, 1, input_length]
  :return: [batch_size, target_length, vocab_size]
  """
  with tf.name_scope('decode'):
    # [batch_size, target_length, hidden_size]
    decoder_inputs = self.embedding_layer(targets)
    with tf.name_scope('shift_targets'):
      # Pad embedding value 0 at the head of the sequence and remove eos_id.
      decoder_inputs = tf.pad(decoder_inputs,
                              [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    with tf.name_scope('add_pos_embedding'):
      length = tf.shape(decoder_inputs)[1]
      position_decode = model_utils.get_position_encoding(
          length, self.params.get('hidden_size'))
      decoder_inputs = tf.add(decoder_inputs, position_decode)

    if self.train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1. - self.params.get('encoder_decoder_dropout'))

    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                 decoder_self_attention_bias, attention_bias)
    # [batch_size, target_length, vocab_size]
    logits = self.embedding_layer.linear(outputs)
    return logits
def _get_symbols_to_logits_fn(self, max_decode_length):
  """Returns a decoding function that calculates logits of the next tokens."""
  if ModeKeys.is_predict_one(self.mode):
    timing_signal = model_utils.get_position_encoding(
        self.params.max_length, self.params.hidden_size)
    timing_signal = tf.slice(
        timing_signal, [0, 0],
        [max_decode_length + 1, self.params.hidden_size],
        name='slice_timing_signal')
  else:
    timing_signal = model_utils.get_position_encoding(
        max_decode_length + 1,
        self.params.hidden_size)  # [max_decode_length + 1, hidden_size]

  if ModeKeys.is_predict_one(self.mode):
    decoder_self_attention_bias = None
  else:
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        max_decode_length)  # [1, 1, max_decode_length, max_decode_length]

  def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Args:
      ids: Current decoded sequences. int tensor with shape
        [batch_size * beam_size, i + 1]
      i: Loop index
      cache: dictionary of values storing the encoder output, encoder-decoder
        attention bias, and previous decoder attention values.

    Returns:
      Tuple of (logits with shape [batch_size * beam_size, vocab_size],
        updated cache values)
    """
    # Set decoder input to the last generated IDs.
    decoder_input = ids[:, -1:]  # [batch, 1]

    # Preprocess decoder input by getting embeddings and adding timing signal.
    decoder_input = self.decoder_embedding_layer(
        decoder_input,
        not ModeKeys.is_predict_one(self.mode))  # [batch, 1, hidden_size]
    if ModeKeys.is_predict_one(self.mode):
      # Zero out the embedding at the first step (the start symbol).
      decoder_input = decoder_input * (1 - tf.to_float(tf.equal(i, 0)))

    # Add the position encoding for the current step.
    slice_pos_encoding = tf.slice(
        timing_signal, [i, 0], [1, self.params.hidden_size],
        name='slice_pos_encoding')  # [1, hidden_size]
    decoder_input += slice_pos_encoding

    if decoder_self_attention_bias is None:
      self_attention_bias = None
    else:
      self_attention_bias = decoder_self_attention_bias[
          :, :, i:i + 1, :i + 1]  # [1, 1, 1, time_step]

    decoder_outputs = self.decoder_stack(
        decoder_input, cache.get("encoder_outputs"), self_attention_bias,
        cache.get("encoder_decoder_attention_bias"), cache)
    logits = self.decoder_softmax_layer.linear(decoder_outputs)
    logits = tf.reshape(logits, [-1, self.params.target_vocab_size])
    return logits, cache

  return symbols_to_logits_fn
def call(self, encoder_inputs, attention_bias, inputs_padding):
  batch_size, length, hidden_size = tf.unstack(tf.shape(encoder_inputs))
  act = ACT(batch_size, length, hidden_size)
  halt_threshold = 1.0 - self.hparams.act_epsilon

  state = encoder_inputs
  previous_state = tf.zeros_like(state, name='previous_state')
  for step in range(self.hparams.act_max_step):
    # judge to continue
    if not act.should_continue(halt_threshold):
      break

    # position & timestep encoding
    state += model_utils.get_position_encoding(self.hparams.max_length,
                                               hidden_size)
    state += model_utils.get_timestep_encoding(
        step, self.hparams.act_max_step, hidden_size)

    # to judge pondering
    pondering = self.pondering_layer(state)
    pondering = tf.squeeze(pondering, axis=-1)

    # Predict the number of attention heads for this step. Note: branching on
    # these tensors with a Python `if` only works when they reduce to a single
    # boolean (e.g. scalar outputs under eager execution).
    num_head_logits = tf.nn.softmax(self.num_head_layer(state))
    num_head_3logit = tf.greater(num_head_logits, 0.6)
    num_head_5logit = tf.greater(num_head_logits, 0.6)
    num_head_7logit = tf.greater(num_head_logits, 0.6)

    # proceed act step
    update_weights = act(pondering, halt_threshold)

    if num_head_3logit:
      self_attention_layer = SelfAttention(self.hparams.num_units, 3,
                                           self.hparams.dropout_rate,
                                           self.is_train)
    elif num_head_5logit:
      self_attention_layer = SelfAttention(self.hparams.num_units, 5,
                                           self.hparams.dropout_rate,
                                           self.is_train)
    else:
      # Assumed fallback to the remaining head-count option so that
      # self_attention_layer is always defined.
      self_attention_layer = SelfAttention(self.hparams.num_units, 7,
                                           self.hparams.dropout_rate,
                                           self.is_train)
    ffn_layer = FeedForwardNetwork(self.hparams.num_units,
                                   self.hparams.num_filter_units,
                                   self.hparams.dropout_rate, self.is_train)
    self.self_attention_wrapper = LayerWrapper(self_attention_layer,
                                               self.hparams.num_units,
                                               self.hparams.dropout_rate,
                                               self.is_train)
    self.ffn_wrapper = LayerWrapper(ffn_layer, self.hparams.num_units,
                                    self.hparams.dropout_rate, self.is_train)
    self.output_norm = LayerNormalization(self.hparams.num_units)

    state = self.self_attention_wrapper(state, attention_bias)
    state = self.ffn_wrapper(state, inputs_padding)

    # update new state and previous state
    new_state = (state * update_weights) + (previous_state *
                                            (1 - update_weights))
    previous_state = new_state

  return self.output_norm(new_state), act.n_updates, act.remainders