def symbols_to_logits_fn(ids, i, cache):
    # Use only the last generated id as the decoder input for this step.
    decoder_input = ids[:, -1:]
    decoder_input = self.decoder_embedding_layer(
        decoder_input, not ModeKeys.is_predict_one(self.mode))
    if ModeKeys.is_predict_one(self.mode):
        # Zero out the embedding at step 0 so the first input acts as the start token.
        decoder_input = decoder_input * (1 - tf.to_float(tf.equal(i, 0)))

    # Add the position encoding for the current time step.
    # decoder_input += timing_signal[i:i + 1]
    slice_pos_encoding = tf.slice(
        timing_signal, [i, 0], [1, self.params.hidden_size],
        name='slice_pos_encoding')
    decoder_input += slice_pos_encoding

    if decoder_self_attention_bias is None:
        self_attention_bias = None
    else:
        self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]

    decoder_outputs = self.decoder_stack(
        decoder_input, cache.get("encoder_outputs"), self_attention_bias,
        cache.get("encoder_decoder_attention_bias"), cache)
    logits = self.decoder_softmax_layer.linear(decoder_outputs)
    # logits = tf.squeeze(logits, axis=[1])
    logits = tf.reshape(logits, [-1, self.params.target_vocab_size])
    return logits, cache

def encode(self, inputs, attention_bias):
    with tf.name_scope("encode"):
        embedded_inputs = self.encoder_embedding_layer(
            inputs, not ModeKeys.is_predict_one(self.mode))
        if ModeKeys.is_predict_one(self.mode):
            inputs_padding = None
        else:
            inputs_padding = model_utils.get_padding(inputs)

        with tf.name_scope("add_pos_encoding"):
            length = tf.shape(embedded_inputs)[1]
            if ModeKeys.is_predict_one(self.mode):
                pos_encoding = model_utils.get_position_encoding(
                    self.params.max_length, self.params.hidden_size)
                pos_encoding = tf.slice(
                    pos_encoding, [0, 0], [length, self.params.hidden_size],
                    name='slice_pos_encoding')
            else:
                pos_encoding = model_utils.get_position_encoding(
                    length, self.params.hidden_size)
            encoder_inputs = embedded_inputs + pos_encoding

        if self.is_train:
            encoder_inputs = tf.nn.dropout(
                encoder_inputs, 1 - self.params.layer_postprocess_dropout)

        return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)

def argmax_predict(self, encoder_outputs, encoder_decoder_attention_bias):
    if ModeKeys.is_predict_one(self.mode):
        batch_size = 1
    else:
        batch_size = tf.shape(encoder_outputs)[0]
    input_length = tf.shape(encoder_outputs)[1]
    max_decode_length = input_length + self.params.extra_decode_length

    symbols_to_logits_fn = self._get_symbols_to_logits_fn(max_decode_length)

    cache = {
        "layer_%d" % layer: {
            "k": tf.zeros([batch_size, 0, self.params.hidden_size]),
            "v": tf.zeros([batch_size, 0, self.params.hidden_size]),
        }
        for layer in range(self.params.num_hidden_layers)
    }
    cache["encoder_outputs"] = encoder_outputs
    if not ModeKeys.is_predict_one(self.mode):
        cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

    if self.params.beam_size > 1:
        pass
    else:
        def inner_loop(i, finished, next_id, decoded_ids, cache):
            logits, cache = symbols_to_logits_fn(next_id, i, cache)  # [batch, vocab_size]
            next_id = tf.argmax(logits, -1, output_type=tf.int32)
            finished |= tf.equal(next_id, EOS_ID)
            next_id = tf.reshape(next_id, shape=[-1, 1])
            decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
            return i + 1, finished, next_id, decoded_ids, cache

        def is_not_finished(i, finished, _1, _2, _3):
            return (i < max_decode_length) & tf.logical_not(
                tf.reduce_all(finished))

        decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int32)
        finished = tf.fill([batch_size], False)
        next_id = tf.zeros([batch_size, 1], dtype=tf.int32)
        _, _, _, decoded_ids, _ = tf.while_loop(
            cond=is_not_finished,
            body=inner_loop,
            loop_vars=[tf.constant(0), finished, next_id, decoded_ids, cache],
            shape_invariants=[
                tf.TensorShape([]),
                tf.TensorShape([None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, None]),
                nest.map_structure(get_state_shape_invariants, cache),
            ])
        return decoded_ids

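# The while_loop above relies on a `get_state_shape_invariants` helper that is
# not shown in this excerpt. The following is only a minimal sketch of what it
# might look like, assuming its job is to mark the growing time axes of each
# cached tensor as unknown so tf.while_loop accepts shapes that change between
# iterations; the real helper in the repo may differ.
def get_state_shape_invariants(tensor):
    """Returns the shape of `tensor` with all middle (non-batch, non-depth) axes set to None."""
    shape = tensor.shape.as_list()
    for i in range(1, len(shape) - 1):
        shape[i] = None
    return tf.TensorShape(shape)
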
def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Args:
      ids: Current decoded sequences. int tensor with shape
        [batch_size * beam_size, i + 1]
      i: Loop index
      cache: dictionary of values storing the encoder output, encoder-decoder
        attention bias, and previous decoder attention values.

    Returns:
      Tuple of (logits with shape [batch_size * beam_size, vocab_size],
        updated cache values)
    """
    # Set decoder input to the last generated IDs
    decoder_input = ids[:, -1:]  # [batch, 1]
    # decoder_input = ids[:, :]  # [batch, 1]
    # print("decoder_input:", decoder_input.shape)

    # Preprocess decoder input by getting embeddings and adding timing signal.
    decoder_input = self.decoder_embedding_layer(
        decoder_input,
        not ModeKeys.is_predict_one(self.mode))  # [batch, 1, hidden_size]
    if ModeKeys.is_predict_one(self.mode):
        decoder_input = decoder_input * (1 - tf.to_float(tf.equal(i, 0)))

    # add position embedding
    # decoder_input += timing_signal[i:i + 1]
    slice_pos_encoding = tf.slice(
        timing_signal, [i, 0], [1, self.params.hidden_size],
        name='slice_pos_encoding')  # [1, hidden_size]
    decoder_input += slice_pos_encoding

    if decoder_self_attention_bias is None:
        self_attention_bias = None
    else:
        self_attention_bias = decoder_self_attention_bias[
            :, :, i:i + 1, :i + 1]  # [1, 1, 1, time_step]
        # self_attention_bias = decoder_self_attention_bias[:, :, :i+1, :i+1]
        # print("attention bias:", self_attention_bias.shape)

    decoder_outputs = self.decoder_stack(
        decoder_input, cache.get("encoder_outputs"), self_attention_bias,
        cache.get("encoder_decoder_attention_bias"), cache)
    logits = self.decoder_softmax_layer.linear(decoder_outputs)
    # logits = tf.squeeze(logits, axis=[1])
    logits = tf.reshape(logits, [-1, self.params.target_vocab_size])
    return logits, cache

def _get_symbols_to_logits_fn(self, max_decode_length):
    if ModeKeys.is_predict_one(self.mode):
        timing_signal = model_utils.get_position_encoding(
            self.params.max_length, self.params.hidden_size)
        timing_signal = tf.slice(
            timing_signal, [0, 0],
            [max_decode_length + 1, self.params.hidden_size],
            name='slice_timing_signal')
    else:
        timing_signal = model_utils.get_position_encoding(
            max_decode_length + 1,
            self.params.hidden_size)  # [max_decode_length + 1, hidden_size]

    if ModeKeys.is_predict_one(self.mode):
        decoder_self_attention_bias = None
    else:
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            max_decode_length)  # [1, 1, max_decode_length, max_decode_length]

    def symbols_to_logits_fn(ids, i, cache):
        decoder_input = ids[:, -1:]  # [batch, 1]
        decoder_input = self.decoder_embedding_layer(
            decoder_input,
            not ModeKeys.is_predict_one(self.mode))  # [batch, 1, hidden_size]
        if ModeKeys.is_predict_one(self.mode):
            decoder_input = decoder_input * (1 - tf.to_float(tf.equal(i, 0)))
        slice_pos_encoding = tf.slice(
            timing_signal, [i, 0], [1, self.params.hidden_size],
            name='slice_pos_encoding')  # [1, hidden_size]
        decoder_input += slice_pos_encoding

        if decoder_self_attention_bias is None:
            self_attention_bias = None
        else:
            self_attention_bias = decoder_self_attention_bias[
                :, :, i:i + 1, :i + 1]  # [1, 1, 1, time_step]

        decoder_outputs = self.decoder_stack(
            decoder_input, cache.get("encoder_outputs"), self_attention_bias,
            cache.get("encoder_decoder_attention_bias"), cache)
        logits = self.decoder_softmax_layer.linear(decoder_outputs)
        logits = tf.reshape(logits, [-1, self.params.target_vocab_size])
        return logits, cache

    return symbols_to_logits_fn

def build_generator(self, inputs):
    # Calculate attention bias for encoder self-attention and decoder
    # multi-headed attention layers.
    if ModeKeys.is_predict_one(self.mode):
        self.attention_bias = None
    else:
        self.attention_bias = model_utils.get_padding_bias(
            inputs)  # [batch, 1, 1, src_len]

    # Run the inputs through the encoder layer to map the symbol
    # representations to continuous representations.
    self.encoder_outputs = self.encode(
        inputs, self.attention_bias)  # [batch, src_len, hidden_size]

    # Get enc-dec attention k/v just for PREDICT_ONE_ENCODER mode.
    if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
        fake_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
        fake_decoder_outputs = self.decoder_stack(
            fake_decoder_inputs, self.encoder_outputs, None, None, None)

    # Sample with the RL policy during training; decode greedily otherwise.
    if self.is_train:
        tf.logging.info("!!!!!! using rl predict in training !!!!!!")
        return self.rl_predict(self.encoder_outputs, self.attention_bias)
    else:
        tf.logging.info("!!!!!!! using argmax_predict in inference !!!!!!!!")
        return self.argmax_predict(self.encoder_outputs, self.attention_bias)

def __init__(self, params, is_train, mode):
    super(DecoderStack, self).__init__()
    self.mode = mode
    self.predict_one = ModeKeys.is_predict_one(self.mode)
    self.layers = []
    for _ in range(params.num_hidden_layers):
        self_attention_layer = attention_layer.SelfAttention(
            params.hidden_size, params.num_heads, params.attention_dropout,
            is_train, self.predict_one)
        if self.mode == ModeKeys.PREDICT_ONE_DECODER:
            enc_dec_attention_layer = attention_layer.EncDecPredictOneAttention(
                params.hidden_size, params.num_heads, params.attention_dropout,
                is_train, self.predict_one)
        else:
            enc_dec_attention_layer = attention_layer.Attention(
                params.hidden_size, params.num_heads, params.attention_dropout,
                is_train, self.predict_one)
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            params.hidden_size, params.filter_size, params.relu_dropout,
            is_train, self.predict_one)
        # Each decoder layer has three sub-modules: self-attention, enc-dec
        # attention, and feed-forward, each wrapped with layer normalization
        # and dropout.
        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, params, is_train),
            PrePostProcessingWrapper(enc_dec_attention_layer, params, is_train),
            PrePostProcessingWrapper(feed_forward_network, params, is_train)
        ])

    self.output_normalization = LayerNormalization(params.hidden_size)

def decode(self, targets, encoder_outputs, attention_bias):
    """Generate logits for each value in the target sequence."""
    with tf.name_scope("decode"):
        decoder_inputs = self.decoder_embedding_layer(
            targets, not ModeKeys.is_predict_one(self.mode))
        with tf.name_scope("shift_targets"):
            # Shift targets to the right, and remove the last element.
            decoder_inputs = tf.pad(
                decoder_inputs,
                [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # [batch, tgt_seq_len, embed_size]
        with tf.name_scope("add_pos_encoding"):
            length = tf.shape(decoder_inputs)[1]
            decoder_inputs += model_utils.get_position_encoding(
                length, self.params.hidden_size)
        if self.is_train:
            decoder_inputs = tf.nn.dropout(
                decoder_inputs, 1 - self.params.layer_postprocess_dropout)

        # Run values through the decoder stack.
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            length)
        outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                     decoder_self_attention_bias, attention_bias)
        logits = self.decoder_softmax_layer.linear(outputs)
        return logits

def build_generator(self, inputs):
    if ModeKeys.is_predict_one(self.mode):
        self.attention_bias = None
    else:
        self.attention_bias = model_utils.get_padding_bias(
            inputs)  # [batch, 1, 1, src_len]
    self.encoder_outputs = self.encode(
        inputs, self.attention_bias)  # [batch, src_len, hidden_size]

    if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
        fake_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
        fake_decoder_outputs = self.decoder_stack(
            fake_decoder_inputs, self.encoder_outputs, None, None, None)

    if self.is_train:
        # if self.mode == tf.estimator.ModeKeys.TRAIN:
        tf.logging.info("!!!!!! using rl predict in training !!!!!!")
        decoded_ids, decoded_logits, log_probs = self.rl_predict(
            self.encoder_outputs, self.attention_bias)
        return decoded_ids, decoded_logits, log_probs
    else:
        tf.logging.info(
            "!!!!!!! using argmax_predict in prediction/evaluation !!!!!!!!")
        decoded_ids, decoded_logits = self.argmax_predict(
            self.encoder_outputs, self.attention_bias)
        # Greedy decoding produces no sampling log-probabilities.
        return decoded_ids, decoded_logits, None

def build_pretrain(self, inputs, targets):
    # initializer = tf.variance_scaling_initializer(
    #     self.params.initializer_gain, mode="fan_avg", distribution="uniform")
    # with tf.variable_scope("Transformer", initializer=initializer, reuse=tf.AUTO_REUSE):
    if ModeKeys.is_predict_one(self.mode):
        attention_bias = None
    else:
        attention_bias = model_utils.get_padding_bias(
            inputs)  # [batch, 1, 1, src_len]

    encoder_outputs = self.encode(
        inputs, attention_bias)  # [batch, src_len, hidden_size]

    if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
        fake_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
        fake_decoder_outputs = self.decoder_stack(
            fake_decoder_inputs, encoder_outputs, None, None, None)

    if targets is None:
        prediction, _ = self.argmax_predict(encoder_outputs, attention_bias)
        return prediction
    else:
        logits = self.decode(
            targets, encoder_outputs, attention_bias)  # [batch, tgt_len, vocab_size]
        return logits

def build_padding_rollout_generator(self, real_inputs, gen_samples, max_len,
                                    given_num):
    with tf.variable_scope(self.name_scope,
                           initializer=self.initializer,
                           reuse=tf.AUTO_REUSE):
        if ModeKeys.is_predict_one(self.mode):
            self.attention_bias = None
        else:
            self.attention_bias = model_utils.get_padding_bias(real_inputs)
        self.encoder_outputs = self.encode(real_inputs, self.attention_bias)

        def condition(given_num, _):
            return given_num < max_len

        def inner_loop(given_num, given_y):
            logits = self.decode(given_y, self.encoder_outputs,
                                 self.attention_bias)
            next_logits = logits[:, given_num, :]  # [batch, decoder_vocab_size]
            next_probs = tf.nn.softmax(next_logits)
            log_probs = tf.log(next_probs)
            next_sample = tf.multinomial(log_probs, num_samples=1)
            next_sample = tf.cast(next_sample, dtype=tf.int32)
            given_y = tf.concat([given_y[:, :given_num], next_sample], axis=1)
            given_y = tf.pad(given_y, [[0, 0], [0, max_len - given_num - 1]])
            return given_num + 1, given_y

        given_y = gen_samples[:, :given_num]
        init_given_y = tf.pad(given_y, [[0, 0], [0, max_len - given_num]])
        init_given_num = given_num

        given_num, roll_sample = tf.while_loop(
            cond=condition,
            body=inner_loop,
            loop_vars=[init_given_num, init_given_y],
            shape_invariants=[
                init_given_num.get_shape(),
                tf.TensorShape([None, None])
            ])
        return roll_sample

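# Hypothetical usage sketch (assumed, not taken from this repo): a SeqGAN-style
# reward estimate could complete every prefix of `gen_samples` with the rollout
# above and average discriminator scores over several Monte Carlo samples.
# `discriminator_score` and `num_rollouts` are illustrative names only.
def monte_carlo_rewards(self, real_inputs, gen_samples, max_len,
                        discriminator_score, num_rollouts=8):
    rewards = []
    for prefix_len in range(1, max_len):
        given_num = tf.constant(prefix_len, dtype=tf.int32)
        scores = []
        for _ in range(num_rollouts):
            rollout = self.build_padding_rollout_generator(
                real_inputs, gen_samples, max_len, given_num)
            scores.append(discriminator_score(rollout))  # [batch]
        rewards.append(tf.add_n(scores) / float(num_rollouts))
    return tf.stack(rewards, axis=1)  # [batch, max_len - 1]
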
def decode(self, targets, encoder_outputs, attention_bias):
    with tf.name_scope("decode"):
        decoder_inputs = self.decoder_embedding_layer(
            targets, not ModeKeys.is_predict_one(self.mode))  # done
        with tf.name_scope("shift_targets"):
            # Shift targets to the right, and remove the last element
            decoder_inputs = tf.pad(decoder_inputs,
                                    [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
        with tf.name_scope("add_pos_encoding"):
            length = tf.shape(decoder_inputs)[1]
            decoder_inputs += model_utils.get_position_encoding(
                length, self.params.hidden_size)
        if self.is_train:
            decoder_inputs = tf.nn.dropout(
                decoder_inputs, 1 - self.params.layer_postprocess_dropout)

        # Run values through the decoder stack.
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            length)
        outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                     decoder_self_attention_bias, attention_bias)
        # logits = self.embedding_softmax_layer.linear(outputs)
        logits = self.decoder_softmax_layer.linear(outputs)  # done
        return logits

def encode(self, inputs, attention_bias):
    """Generate continuous representation for inputs.

    Args:
      inputs: int tensor with shape [batch_size, input_length].
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

    Returns:
      float tensor with shape [batch_size, input_length, hidden_size]
    """
    with tf.name_scope("encode"):
        # Prepare inputs to the layer stack by adding positional encodings and
        # applying dropout.
        embedded_inputs = self.encoder_embedding_layer(
            inputs, not ModeKeys.is_predict_one(self.mode))
        if ModeKeys.is_predict_one(self.mode):
            inputs_padding = None
        else:
            inputs_padding = model_utils.get_padding(inputs)

        # add_pos_encoding
        with tf.name_scope("add_pos_encoding"):
            length = tf.shape(embedded_inputs)[1]
            if ModeKeys.is_predict_one(self.mode):
                pos_encoding = model_utils.get_position_encoding(
                    self.params.max_length, self.params.hidden_size)
                pos_encoding = tf.slice(
                    pos_encoding, [0, 0], [length, self.params.hidden_size],
                    name='slice_pos_encoding')
            else:
                pos_encoding = model_utils.get_position_encoding(
                    length, self.params.hidden_size)
            encoder_inputs = embedded_inputs + pos_encoding

        if self.is_train:
            encoder_inputs = tf.nn.dropout(
                encoder_inputs, 1 - self.params.layer_postprocess_dropout)

        return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)

def decode(self, targets, encoder_outputs, attention_bias):
    """Generate logits for each value in the target sequence.

    Args:
      targets: target values for the output sequence. int tensor with shape
        [batch_size, target_length]
      encoder_outputs: continuous representation of input sequence. float
        tensor with shape [batch_size, input_length, hidden_size]
      attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

    Returns:
      float32 tensor with shape [batch_size, target_length, vocab_size]
    """
    with tf.name_scope("decode"):
        # Prepare inputs to decoder layers by shifting targets, adding
        # positional encoding and applying dropout.
        # decoder_inputs = self.embedding_softmax_layer(targets)
        decoder_inputs = self.decoder_embedding_layer(
            targets, not ModeKeys.is_predict_one(self.mode))  # done
        with tf.name_scope("shift_targets"):
            # Shift targets to the right, and remove the last element
            decoder_inputs = tf.pad(
                decoder_inputs,
                [[0, 0], [1, 0], [0, 0]])[:, :-1, :]  # [batch, tgt_seq_len, embed_size]
        with tf.name_scope("add_pos_encoding"):
            length = tf.shape(decoder_inputs)[1]
            decoder_inputs += model_utils.get_position_encoding(
                length, self.params.hidden_size)
        if self.is_train:
            decoder_inputs = tf.nn.dropout(
                decoder_inputs, 1 - self.params.layer_postprocess_dropout)

        # Run values through the decoder stack.
        decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
            length)
        outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                     decoder_self_attention_bias, attention_bias)
        # logits = self.embedding_softmax_layer.linear(outputs)
        logits = self.decoder_softmax_layer.linear(outputs)  # done
        return logits

def inference(self, inputs, targets=None, reuse=None):
    with tf.variable_scope(self.name_scope,
                           initializer=self.initializer,
                           reuse=reuse):
        if ModeKeys.is_predict_one(self.mode):
            attention_bias = None
        else:
            attention_bias = model_utils.get_padding_bias(inputs)
        encoder_outputs = self.encode(inputs, attention_bias)

        if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
            fake_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
            fake_decoder_outputs = self.decoder_stack(
                fake_decoder_inputs, encoder_outputs, None, None, None)

        if targets is None:
            return self.predict(encoder_outputs, attention_bias)
        else:
            logits = self.decode(targets, encoder_outputs, attention_bias)
            return logits

def __init__(self, params, is_train, mode):
    super(EncoderStack, self).__init__()
    self.mode = mode
    self.predict_one = ModeKeys.is_predict_one(self.mode)
    self.layers = []
    for _ in range(params.num_hidden_layers):
        # Create sublayers for each layer.
        self_attention_layer = attention_layer.SelfAttention(
            params.hidden_size, params.num_heads, params.attention_dropout,
            is_train, self.predict_one)
        feed_forward_network = ffn_layer.FeedFowardNetwork(
            params.hidden_size, params.filter_size, params.relu_dropout,
            is_train, self.predict_one)
        self.layers.append([
            PrePostProcessingWrapper(self_attention_layer, params, is_train),
            PrePostProcessingWrapper(feed_forward_network, params, is_train)
        ])

    # Create final layer normalization layer.
    self.output_normalization = LayerNormalization(params.hidden_size)

def build_pretrain(self, inputs, targets):
    # initializer = tf.variance_scaling_initializer(
    #     self.params.initializer_gain, mode="fan_avg", distribution="uniform")
    # with tf.variable_scope("Transformer", initializer=initializer, reuse=tf.AUTO_REUSE):

    # Calculate attention bias for encoder self-attention and decoder
    # multi-headed attention layers.
    if ModeKeys.is_predict_one(self.mode):
        attention_bias = None
    else:
        attention_bias = model_utils.get_padding_bias(
            inputs)  # [batch, 1, 1, src_len]

    # Run the inputs through the encoder layer to map the symbol
    # representations to continuous representations.
    encoder_outputs = self.encode(
        inputs, attention_bias)  # [batch, src_len, hidden_size]

    # Get enc-dec attention k/v just for PREDICT_ONE_ENCODER mode.
    if self.mode == ModeKeys.PREDICT_ONE_ENCODER:
        fake_decoder_inputs = tf.zeros([1, 0, self.params.hidden_size])
        fake_decoder_outputs = self.decoder_stack(
            fake_decoder_inputs, encoder_outputs, None, None, None)

    # Generate output sequence if targets is None, or return logits if target
    # sequence is known.
    if targets is None:
        tf.logging.info(
            "!!!!!!!!!!!prediction using argmax prediction!!!!!!!!!!!!!")
        prediction, _ = self.argmax_predict(encoder_outputs, attention_bias)
        return prediction
    else:
        logits = self.decode(
            targets, encoder_outputs, attention_bias)  # [batch, tgt_len, vocab_size]
        return logits

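# Hypothetical usage sketch (an assumption, not code from this file): the
# logits returned by build_pretrain/decode can be trained with teacher forcing
# against the unshifted targets, masking out padding positions. PAD_ID is the
# same padding id used elsewhere in this excerpt.
def pretrain_loss(logits, targets):
    """Mean token-level cross-entropy over non-padding positions."""
    xent = tf.nn.sparse_softmax_cross_entropy_with_logits(
        labels=targets, logits=logits)                 # [batch, tgt_len]
    mask = tf.to_float(tf.not_equal(targets, PAD_ID))  # ignore padding
    return tf.reduce_sum(xent * mask) / tf.reduce_sum(mask)
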
def rl_predict(self, encoder_outputs, encoder_decoder_attention_bias):
    if ModeKeys.is_predict_one(self.mode):
        batch_size = 1
    else:
        batch_size = tf.shape(encoder_outputs)[0]
    input_length = tf.shape(encoder_outputs)[1]
    max_decode_length = input_length + self.params.extra_decode_length

    symbols_to_logits_fn = self._get_symbols_to_logits_fn(max_decode_length)

    # Create initial set of IDs that will be passed into symbols_to_logits_fn.
    initial_ids = tf.zeros([batch_size], dtype=tf.int32)

    # Create cache storing decoder attention values for each layer.
    cache = {
        "layer_%d" % layer: {
            "k": tf.zeros([batch_size, 0, self.params.hidden_size]),
            "v": tf.zeros([batch_size, 0, self.params.hidden_size]),
        }
        for layer in range(self.params.num_hidden_layers)
    }
    # Add encoder output and attention bias to the cache.
    cache["encoder_outputs"] = encoder_outputs
    if not ModeKeys.is_predict_one(self.mode):
        cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

    if self.params.beam_size > 1:
        pass
    else:
        def inner_loop(i, finished, next_id, decoded_ids, log_probs,
                       decoded_logits, cache):
            """One step of sampling from the current policy."""
            logits, cache = symbols_to_logits_fn(next_id, i, cache)
            categorical = tf.contrib.distributions.Categorical(logits=logits)
            next_id = categorical.sample()
            log_prob = categorical.log_prob(next_id)  # [batch,]
            finished |= tf.equal(next_id, EOS_ID)
            finished = tf.reshape(finished, (-1,))
            next_id = tf.reshape(next_id, shape=[-1, 1])
            log_prob = tf.reshape(log_prob, shape=[-1, 1])
            decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
            log_probs = tf.concat([log_probs, log_prob], axis=1)  # [batch, len]
            logits = tf.expand_dims(logits, axis=1)
            decoded_logits = tf.concat([decoded_logits, logits], axis=1)
            return (i + 1, finished, next_id, decoded_ids, log_probs,
                    decoded_logits, cache)

        def is_not_finished(i, finished, _1, _2, _3, _4, _5):
            return (i < max_decode_length) & tf.logical_not(
                tf.reduce_all(finished))

        decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int32)
        log_probs = tf.zeros([batch_size, 0], dtype=tf.float32)
        decoded_logits = tf.zeros(
            [batch_size, 0, self.params.target_vocab_size], dtype=tf.float32)
        finished = tf.fill([batch_size], False)
        next_id = tf.zeros([batch_size, 1], dtype=tf.int32)
        _, _, _, decoded_ids, log_probs, decoded_logits, cache = tf.while_loop(
            cond=is_not_finished,
            body=inner_loop,
            loop_vars=[
                tf.constant(0), finished, next_id, decoded_ids, log_probs,
                decoded_logits, cache
            ],
            shape_invariants=[
                tf.TensorShape([]),
                tf.TensorShape([None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, None, None]),
                nest.map_structure(get_state_shape_invariants, cache),
            ])
        # return {"outputs": decoded_ids, "scores": tf.ones([batch_size, 1])}
        return decoded_ids, decoded_logits, log_probs

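# Hypothetical usage sketch (an assumption, not code from this file): the
# per-step log-probabilities returned by rl_predict can drive a REINFORCE-style
# generator loss, given an external `rewards` tensor aligned with decoded_ids,
# e.g. discriminator scores from the Monte Carlo rollouts sketched earlier.
def policy_gradient_loss(decoded_ids, log_probs, rewards):
    """Negative expected reward; `rewards` is assumed to be [batch, length]."""
    mask = tf.to_float(tf.not_equal(decoded_ids, PAD_ID))  # ignore padding steps
    return -tf.reduce_sum(log_probs * rewards * mask) / tf.reduce_sum(mask)
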
def rl_predict_new(self, encoder_outputs, encoder_decoder_attention_bias):
    if ModeKeys.is_predict_one(self.mode):
        batch_size = 1
    else:
        batch_size = tf.shape(encoder_outputs)[0]
    input_length = tf.shape(encoder_outputs)[1]
    max_decode_length = input_length + self.params.extra_decode_length

    symbols_to_logits_fn = self._get_symbols_to_logits_fn(max_decode_length)
    initial_ids = tf.zeros([batch_size], dtype=tf.int32)

    cache = {
        "layer_%d" % layer: {
            "k": tf.zeros([batch_size, 0, self.params.hidden_size]),
            "v": tf.zeros([batch_size, 0, self.params.hidden_size]),
        }
        for layer in range(self.params.num_hidden_layers)
    }
    # Add encoder output and attention bias to the cache.
    cache["encoder_outputs"] = encoder_outputs
    if not ModeKeys.is_predict_one(self.mode):
        cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

    if self.params.beam_size > 1:
        pass
    else:
        def inner_loop(i, finished, next_id, decoded_ids, log_probs, cache):
            prev_id = next_id
            logits, cache = symbols_to_logits_fn(next_id, i, cache)
            categorical = tf.contrib.distributions.Categorical(logits=logits)
            next_id = categorical.sample()
            log_prob = categorical.log_prob(next_id)  # [batch,]
            finished |= tf.equal(next_id, EOS_ID)
            finished = tf.reshape(finished, (-1,))
            next_id = tf.reshape(next_id, shape=[-1, 1])

            # Force ids sampled after EOS (and after the first PAD) to PAD_ID.
            mask = tf.cast(tf.math.not_equal(prev_id, EOS_ID), dtype=tf.int32)
            next_id = next_id * mask

            def pad_fn():
                mask_pad = tf.cast(tf.math.not_equal(prev_id, PAD_ID),
                                   dtype=tf.int32)
                return next_id * mask_pad

            next_id = tf.cond(tf.less(i, 1), lambda: next_id, pad_fn)
            log_prob = tf.reshape(log_prob, shape=[-1, 1])
            decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
            log_probs = tf.concat([log_probs, log_prob], axis=1)  # [batch, len]
            return i + 1, finished, next_id, decoded_ids, log_probs, cache

        def is_not_finished(i, finished, _1, _2, _3, _4):
            return (i < max_decode_length) & tf.logical_not(
                tf.reduce_all(finished))

        decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int32)
        log_probs = tf.zeros([batch_size, 0], dtype=tf.float32)
        finished = tf.fill([batch_size], False)
        next_id = tf.zeros([batch_size, 1], dtype=tf.int32)
        _, _, _, decoded_ids, log_probs, cache = tf.while_loop(
            cond=is_not_finished,
            body=inner_loop,
            loop_vars=[
                tf.constant(0), finished, next_id, decoded_ids, log_probs, cache
            ],
            shape_invariants=[
                tf.TensorShape([]),
                tf.TensorShape([None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, None]),
                nest.map_structure(get_state_shape_invariants, cache),
            ])
        return decoded_ids, log_probs

def predict(self, encoder_outputs, encoder_decoder_attention_bias):
    """Return predicted sequence."""
    if ModeKeys.is_predict_one(self.mode):
        batch_size = 1
    else:
        batch_size = tf.shape(encoder_outputs)[0]
    input_length = tf.shape(encoder_outputs)[1]
    max_decode_length = input_length + self.params.extra_decode_length

    symbols_to_logits_fn = self._get_symbols_to_logits_fn(max_decode_length)

    # Create initial set of IDs that will be passed into symbols_to_logits_fn.
    initial_ids = tf.zeros([batch_size], dtype=tf.int32)

    # Create cache storing decoder attention values for each layer.
    cache = {
        "layer_%d" % layer: {
            "k": tf.zeros([batch_size, 0, self.params.hidden_size]),
            "v": tf.zeros([batch_size, 0, self.params.hidden_size]),
        }
        for layer in range(self.params.num_hidden_layers)
    }
    # Add encoder output and attention bias to the cache.
    cache["encoder_outputs"] = encoder_outputs
    if not ModeKeys.is_predict_one(self.mode):
        cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

    if self.params.beam_size > 1:
        print("!!!!!!!!!!! right here, beam_size = %i!!!!!!!!!!!!" %
              self.params.beam_size)
        # Use beam search to find the top beam_size sequences and scores.
        decoded_ids, scores = beam_search.sequence_beam_search(
            symbols_to_logits_fn=symbols_to_logits_fn,
            initial_ids=initial_ids,
            initial_cache=cache,
            vocab_size=self.params.target_vocab_size,
            beam_size=self.params.beam_size,
            alpha=self.params.alpha,
            max_decode_length=max_decode_length,
            eos_id=EOS_ID)
        # Get the top sequence for each batch element
        top_decoded_ids = decoded_ids[:, 0, 1:]
        top_scores = scores[:, 0]
        return {"outputs": top_decoded_ids, "scores": top_scores}
    else:
        def inner_loop(i, finished, next_id, decoded_ids, cache):
            """One step of greedy decoding."""
            logits, cache = symbols_to_logits_fn(next_id, i, cache)
            next_id = tf.argmax(logits, -1, output_type=tf.int32)
            finished |= tf.equal(next_id, EOS_ID)
            # next_id = tf.expand_dims(next_id, axis=1)
            next_id = tf.reshape(next_id, shape=[-1, 1])
            decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
            return i + 1, finished, next_id, decoded_ids, cache

        def is_not_finished(i, finished, *_):
            return (i < max_decode_length) & tf.logical_not(
                tf.reduce_all(finished))

        decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int32)
        finished = tf.fill([batch_size], False)
        next_id = tf.zeros([batch_size, 1], dtype=tf.int32)
        _, _, _, decoded_ids, _ = tf.while_loop(
            is_not_finished, inner_loop,
            [tf.constant(0), finished, next_id, decoded_ids, cache],
            shape_invariants=[
                tf.TensorShape([]),
                tf.TensorShape([None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, None]),
                nest.map_structure(get_state_shape_invariants, cache),
            ])
        return {"outputs": decoded_ids, "scores": tf.ones([batch_size, 1])}

        norm_x = (x - mean) * tf.rsqrt(variance + epsilon)
        return norm_x * self.scale + self.bias


if __name__ == "__main__":
    import os
    tf.enable_eager_execution()
    os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    params = model_params.TransformerBaseParams()

    x_inputs = tf.constant([[1, 2, 3, 0, 0], [3, 4, 5, 6, 8]], dtype=tf.int32)
    Enc_Embedding = embedding_layer.EmbeddingWeights(params.source_vocab_size,
                                                     params.hidden_size,
                                                     "source_embedding")
    embedded_inputs = Enc_Embedding(
        x_inputs, not ModeKeys.is_predict_one(ModeKeys.TRAIN))
    print(embedded_inputs.shape)

    attention_bias = model_utils.get_padding_bias(x_inputs)
    print(attention_bias.shape)

    encoder_stack = EncoderStack(params, is_train=True, mode=ModeKeys.TRAIN)
    enc_out = encoder_stack(embedded_inputs, attention_bias, None)
    print(enc_out.shape)

    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(10)
    self_attention_bias = decoder_self_attention_bias[:, :, 0:1, :1]
    print(self_attention_bias)

    attention_bias = model_utils.get_padding_bias(x_inputs)
    cache = {
        "layer_%d" % layer: {
            "k": tf.zeros([2, 0, params.hidden_size]),
            "v": tf.zeros([2, 0, params.hidden_size]),

def argmax_predict(self, encoder_outputs, encoder_decoder_attention_bias):
    """Return predicted sequence."""
    if ModeKeys.is_predict_one(self.mode):
        batch_size = 1
    else:
        batch_size = tf.shape(encoder_outputs)[0]
    input_length = tf.shape(encoder_outputs)[1]
    max_decode_length = input_length + self.params.extra_decode_length

    symbols_to_logits_fn = self._get_symbols_to_logits_fn(max_decode_length)

    # Create initial set of IDs that will be passed into symbols_to_logits_fn.
    initial_ids = tf.zeros([batch_size], dtype=tf.int32)

    # Create cache storing decoder attention values for each layer.
    cache = {
        "layer_%d" % layer: {
            "k": tf.zeros([batch_size, 0, self.params.hidden_size]),
            "v": tf.zeros([batch_size, 0, self.params.hidden_size]),
        }
        for layer in range(self.params.num_hidden_layers)
    }
    # Add encoder output and attention bias to the cache.
    cache["encoder_outputs"] = encoder_outputs
    if not ModeKeys.is_predict_one(self.mode):
        cache["encoder_decoder_attention_bias"] = encoder_decoder_attention_bias

    if self.params.beam_size > 1:
        pass
    else:
        def inner_loop(i, finished, next_id, decoded_ids, cache):
            """One step of greedy decoding."""
            print("time step:", i)
            logits, cache = symbols_to_logits_fn(next_id, i, cache)
            # logits, cache = symbols_to_logits_fn(decoded_ids, i, cache)
            next_id = tf.argmax(logits, -1, output_type=tf.int32)
            finished |= tf.equal(next_id, EOS_ID)
            # next_id = tf.expand_dims(next_id, axis=1)
            next_id = tf.reshape(next_id, shape=[-1, 1])
            decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
            return i + 1, finished, next_id, decoded_ids, cache

        def is_not_finished(i, finished, _1, _2, _3):
            return (i < max_decode_length) & tf.logical_not(
                tf.reduce_all(finished))

        decoded_ids = tf.zeros([batch_size, 0], dtype=tf.int32)
        finished = tf.fill([batch_size], False)
        next_id = tf.zeros([batch_size, 1], dtype=tf.int32)
        _, _, _, decoded_ids, _ = tf.while_loop(
            cond=is_not_finished,
            body=inner_loop,
            loop_vars=[
                tf.constant(0), finished, next_id, decoded_ids, cache
            ],
            shape_invariants=[
                tf.TensorShape([]),
                tf.TensorShape([None]),
                tf.TensorShape([None, None]),
                tf.TensorShape([None, None]),
                nest.map_structure(get_state_shape_invariants, cache),
            ])
        # Note: `_` here is the final loop value bound last in the unpacking,
        # i.e. the decoder cache.
        return decoded_ids, _