def decode(self, start_tokens, targets, encoder_outputs, attention_bias):
  """Generate decoder outputs for the target sequence, using explicit start tokens."""
  with tf.name_scope("decode"):
    with tf.name_scope("shift_targets"):
      # Prepend the start tokens and drop the last target element, then embed.
      decoder_inputs = tf.concat(
          [tf.expand_dims(start_tokens, axis=1), targets[:, :-1]], axis=1)
      decoder_inputs = self.decoder_embedding_layer(decoder_inputs)
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params["hidden_size"])
    if self.train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params["layer_postprocess_dropout"])

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    decoder_outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                         decoder_self_attention_bias,
                                         attention_bias)
    outputs = self.output_embedding_layer(decoder_outputs)
    return outputs
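# The decode snippets in this file rely on model_utils.get_decoder_self_attention_bias
# to keep each decoder position from attending to later positions. A minimal NumPy
# sketch of that idea (a lower-triangular "allowed" matrix turned into a large
# negative additive bias); the function name and the -1e9 constant are illustrative,
# not the exact library implementation.
import numpy as np

def causal_self_attention_bias(length, neg_inf=-1e9):
  # 1 where attention is allowed (query position >= key position), else 0.
  allowed = np.tril(np.ones((length, length), dtype=np.float32))
  # Broadcastable bias of shape [1, 1, length, length]: 0 where allowed,
  # a large negative number where disallowed, added to the attention logits.
  return (neg_inf * (1.0 - allowed))[None, None, :, :]

bias = causal_self_attention_bias(4)
assert bias.shape == (1, 1, 4, 4)
assert bias[0, 0, 0, 1] < 0 and bias[0, 0, 3, 0] == 0.0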
def encode(self, inputs, attention_bias):
  """Generate continuous representation for inputs.

  Args:
    inputs: int tensor with shape [batch_size, input_length].
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float tensor with shape [batch_size, input_length, hidden_size]
  """
  with tf.name_scope("encode"):
    # Prepare inputs to the layer stack by adding positional encodings and
    # applying dropout.
    embedded_inputs = self.embedding_softmax_layer(inputs)
    inputs_padding = model_utils.get_padding(inputs)

    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(embedded_inputs)[1]
      pos_encoding = model_utils.get_position_encoding(
          length, self.params.hidden_size)
      encoder_inputs = embedded_inputs + pos_encoding

    if self.train:
      encoder_inputs = tf.nn.dropout(
          encoder_inputs, 1 - self.params.layer_postprocess_dropout)

    return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
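# model_utils.get_position_encoding, used throughout these snippets, adds a
# deterministic timing signal to the embeddings. A self-contained NumPy sketch of
# the standard sinusoidal scheme (sin/cos over geometrically spaced timescales, as
# in "Attention Is All You Need"); the actual library code may differ in details
# such as dtype handling and the default timescales.
import numpy as np

def sinusoidal_position_encoding(length, hidden_size,
                                 min_timescale=1.0, max_timescale=1e4):
  position = np.arange(length, dtype=np.float32)
  num_timescales = hidden_size // 2
  log_increment = (np.log(max_timescale / min_timescale) /
                   max(num_timescales - 1, 1))
  inv_timescales = min_timescale * np.exp(
      np.arange(num_timescales, dtype=np.float32) * -log_increment)
  scaled_time = position[:, None] * inv_timescales[None, :]
  # Shape [length, hidden_size]: first half sin, second half cos.
  return np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)

pos = sinusoidal_position_encoding(length=10, hidden_size=8)
assert pos.shape == (10, 8)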
def decode(self, targets, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence.

  Args:
    targets: target values for the output sequence. int tensor with shape
      [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence. float tensor
      with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  with tf.name_scope("decode"):
    # Prepare inputs to decoder layers by shifting targets, adding positional
    # encoding and applying dropout.
    decoder_inputs = self.embedding_softmax_layer(targets)
    with tf.name_scope("shift_targets"):
      # Shift targets to the right, and remove the last element
      decoder_inputs = tf.pad(decoder_inputs,
                              [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params["hidden_size"])
    if self.train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params["layer_postprocess_dropout"])

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                 decoder_self_attention_bias, attention_bias)
    logits = self.embedding_softmax_layer.linear(outputs)
    return logits
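# The "shift_targets" block above right-shifts the embedded targets by one position
# (so step t only sees targets earlier than t) using tf.pad followed by dropping the
# last step. A tiny NumPy illustration of the same pad-then-slice pattern on a
# made-up [batch=1, length=3, hidden=2] tensor.
import numpy as np

targets_emb = np.array([[[1., 1.], [2., 2.], [3., 3.]]])  # [1, 3, 2]
shifted = np.pad(targets_emb, ((0, 0), (1, 0), (0, 0)))[:, :-1, :]
# Position 0 becomes zeros; positions 1..2 hold the embeddings of steps 0..1.
assert shifted.tolist() == [[[0., 0.], [1., 1.], [2., 2.]]]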
def encode(self, embedded_inputs, attention_bias):
  """Generate continuous representation for inputs.

  Args:
    embedded_inputs: float tensor with shape
      [batch_size, input_length, hidden_size] holding pre-computed embeddings.
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float tensor with shape [batch_size, input_length, hidden_size]
  """
  with tf.name_scope("encode"):
    # Prepare inputs to the layer stack by adding positional encodings and
    # applying dropout.
    # The embedding lookup is not needed here as we use our own embeddings:
    # embedded_inputs = self.embedding_softmax_layer(inputs)
    inputs_padding = model_utils.get_embedding_padding(embedded_inputs)

    encoder_inputs = embedded_inputs
    if self.params["use_positional_encoding"]:
      with tf.name_scope("add_pos_encoding"):
        length = tf.shape(embedded_inputs)[1]
        pos_encoding = model_utils.get_position_encoding(
            length, self.params["hidden_size"])
        encoder_inputs = embedded_inputs + pos_encoding

    if self.train:
      encoder_inputs = tf.nn.dropout(
          encoder_inputs, 1 - self.params["layer_postprocess_dropout"])

    return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
def encode(self, inputs, attention_bias):
  """Generate continuous representation for inputs.

  Args:
    inputs: int tensor with shape [batch_size, input_length].
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float tensor with shape [batch_size, input_length, hidden_size]
  """
  with tf.name_scope("encode"):
    # Prepare inputs to the layer stack by adding positional encodings and
    # applying dropout.
    embedded_inputs = self.embedding_softmax_layer(
        inputs)  # Convert the input ids to embeddings.
    inputs_padding = model_utils.get_padding(
        inputs)  # Locate the padded positions in the input.

    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(embedded_inputs)[1]
      pos_encoding = model_utils.get_position_encoding(  # Position encoding.
          length, self.params["hidden_size"])
      encoder_inputs = embedded_inputs + pos_encoding  # Sum to form the encoder input.

    if self.train:
      encoder_inputs = tf.nn.dropout(  # Apply dropout before entering the stack.
          encoder_inputs, 1 - self.params["layer_postprocess_dropout"])

    return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
def _get_symbols_to_logits_fn(self, max_decode_length, training):
  """Returns a decoding function that calculates logits of the next tokens."""
  timing_signal = model_utils.get_position_encoding(
      max_decode_length + 1, self.params["hidden_size"])
  timing_signal = tf.cast(timing_signal, self.params["dtype"])
  decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      max_decode_length, dtype=self.params["dtype"])

  # TODO(b/139770046): Refactor code with better naming of i.
  def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Args:
      ids: Current decoded sequences. int tensor with shape
        [batch_size * beam_size, i + 1].
      i: Loop index.
      cache: dictionary of values storing the encoder output, encoder-decoder
        attention bias, and previous decoder attention values.

    Returns:
      Tuple of
        (logits with shape [batch_size * beam_size, vocab_size],
         updated cache values)
    """
    # Set decoder input to the last generated IDs
    decoder_input = ids[:, -1:]

    # Preprocess decoder input by getting embeddings and adding timing signal.
    decoder_input = self.embedding_softmax_layer(decoder_input)

    if self.params["padded_decode"]:
      timing_signal_shape = timing_signal.shape.as_list()
      decoder_input += tf.slice(timing_signal, [i, 0],
                                [1, timing_signal_shape[1]])

      bias_shape = decoder_self_attention_bias.shape.as_list()
      self_attention_bias = tf.slice(
          decoder_self_attention_bias, [0, 0, i, 0],
          [bias_shape[0], bias_shape[1], 1, bias_shape[3]])
    else:
      decoder_input += timing_signal[i:i + 1]
      self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]

    decoder_outputs = self.decoder_stack(
        decoder_input,
        cache.get("encoder_outputs"),
        self_attention_bias,
        cache.get("encoder_decoder_attention_bias"),
        training=training,
        cache=cache,
        decode_loop_step=i if self.params["padded_decode"] else None)
    logits = self.embedding_softmax_layer(decoder_outputs, mode="linear")
    logits = tf.squeeze(logits, axis=[1])
    return logits, cache

  return symbols_to_logits_fn
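# The symbols_to_logits_fn returned above is driven by an external search loop
# (beam search in the real model). A minimal greedy-decoding sketch with a dummy,
# hypothetical logits function standing in for the Transformer, just to show the
# calling convention (ids so far, loop index, mutable cache); none of these names
# come from the library.
import numpy as np

VOCAB_SIZE = 5

def dummy_symbols_to_logits_fn(ids, i, cache):
  # Hypothetical stand-in: favors token (i + 1) % VOCAB_SIZE at every step.
  batch = ids.shape[0]
  logits = np.zeros((batch, VOCAB_SIZE), dtype=np.float32)
  logits[:, (i + 1) % VOCAB_SIZE] = 1.0
  return logits, cache

def greedy_decode(symbols_to_logits_fn, batch_size, max_decode_length):
  ids = np.zeros((batch_size, 1), dtype=np.int64)  # start token id 0
  cache = {}
  for i in range(max_decode_length):
    logits, cache = symbols_to_logits_fn(ids, i, cache)
    next_ids = np.argmax(logits, axis=-1)[:, None]
    ids = np.concatenate([ids, next_ids], axis=1)
  return ids[:, 1:]  # drop the start token

out = greedy_decode(dummy_symbols_to_logits_fn, batch_size=2, max_decode_length=4)
assert out.shape == (2, 4) and out[0].tolist() == [1, 2, 3, 4]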
def encode_no_lookup(self, embedded_inputs, inputs_mask):
  """Encoder step for transformer given already-embedded inputs.

  Args:
    embedded_inputs: float tensor with shape
      [batch_size, input_length, emb_size].
    inputs_mask: int tensor with shape [batch_size, input_length]

  Returns:
    float tensor with shape [batch_size, input_length, hidden_size]
  """
  with tf.name_scope("encode"):
    # Prepare inputs to the layer stack by adding positional encodings and
    # applying dropout.
    inputs_padding = model_utils.get_padding(inputs_mask)
    attention_bias = model_utils.get_padding_bias(inputs_mask)

    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(embedded_inputs)[1]
      pos_encoding = model_utils.get_position_encoding(
          length, self.params.hidden_size)
      encoder_inputs = embedded_inputs + pos_encoding

    if self.train:
      encoder_inputs = tf.nn.dropout(
          encoder_inputs, 1 - self.params.layer_postprocess_dropout)

    return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
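# encode_no_lookup above derives both a per-position padding indicator and an
# additive attention bias. A NumPy sketch of the usual convention (1.0 marks a
# padded position, and padded keys get a large negative bias broadcast to
# [batch, 1, 1, length]); this assumes id 0 is the padding token, which may differ
# from the actual vocabulary, and the helper names are illustrative.
import numpy as np

def get_padding(ids, pad_id=0):
  return (ids == pad_id).astype(np.float32)  # 1.0 at padded positions

def get_padding_bias(ids, pad_id=0, neg_inf=-1e9):
  padding = get_padding(ids, pad_id)
  return (padding * neg_inf)[:, None, None, :]  # [batch, 1, 1, length]

ids = np.array([[7, 3, 0, 0]])
assert get_padding(ids).tolist() == [[0.0, 0.0, 1.0, 1.0]]
assert get_padding_bias(ids).shape == (1, 1, 1, 4)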
def decode(self, _, inputs, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence.

  Args:
    inputs: int tensor (old dst sentence) with shape
      [batch_size, input_length].
    encoder_outputs: continuous representation of diff sequence. float tensor
      with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  with tf.name_scope("decode"):
    # Prepare inputs to decoder layers by adding positional
    # encoding and applying dropout.
    decoder_inputs = self.embedding_softmax_layer(inputs)
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params["hidden_size"])
    if self.train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params["layer_postprocess_dropout"])

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                 decoder_self_attention_bias, attention_bias)
    logits = self.embedding_softmax_layer.linear(outputs)
    return logits
def encode(self, inputs, attention_bias, training):
  """Generate continuous representation for inputs.

  Args:
    inputs: int tensor with shape [batch_size, input_length].
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length].
    training: boolean, whether in training mode or not.

  Returns:
    float tensor with shape [batch_size, input_length, hidden_size]
  """
  with tf.name_scope("encode"):
    # Prepare inputs to the layer stack by adding positional encodings and
    # applying dropout.
    embedded_inputs = self.embedding_softmax_layer(inputs)
    inputs_padding = model_utils.get_padding(inputs)

    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(embedded_inputs)[1]
      pos_encoding = model_utils.get_position_encoding(
          length, self.params["hidden_size"])
      encoder_inputs = embedded_inputs + pos_encoding

    if training:
      encoder_inputs = tf.nn.dropout(
          encoder_inputs, rate=self.params["layer_postprocess_dropout"])

    return self.encoder_stack(
        encoder_inputs, attention_bias, inputs_padding, training=training)
def decode(self, targets, encoder_outputs, attention_bias, training):
  """Generate logits for each value in the target sequence.

  Args:
    targets: target values for the output sequence. int tensor with shape
      [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence. float tensor
      with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]
    training: boolean, whether in training mode or not.

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  with tf.name_scope("decode"):
    # Prepare inputs to decoder layers by shifting targets, adding positional
    # encoding and applying dropout.
    decoder_inputs = self.embedding_layer(targets)
    decoder_inputs = tf.cast(decoder_inputs, self.params["dtype"])
    attention_bias = tf.cast(attention_bias, self.params["dtype"])
    with tf.name_scope("shift_targets"):
      # Shift targets to the right, and remove the last element
      decoder_inputs = tf.pad(decoder_inputs,
                              [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      pos_encoding = model_utils.get_position_encoding(
          length, self.params["hidden_size"])
      pos_encoding = tf.cast(pos_encoding, self.params["dtype"])
      decoder_inputs += pos_encoding
    if training:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, rate=self.params["layer_postprocess_dropout"])

    # Run values
    # decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
    #     length, dtype=self.params["dtype"])
    decoder_self_attention_bias = tf.ones([1, 1, length, length],
                                          dtype=self.params["dtype"])
    logits = self.decoder_stack(
        decoder_inputs,
        encoder_outputs,
        decoder_self_attention_bias,
        attention_bias,
        training=training)

    batch_size = tf.shape(logits)[0]
    length = tf.shape(logits)[1]
    hidden_size = tf.shape(logits)[2]
    # logits = tf.reduce_mean(logits, axis=1)
    logits = tf.reshape(logits, [batch_size, length * hidden_size])
    logits = tf.reshape(logits,
                        [batch_size, self.params['max_length'] * hidden_size])
    logits = self.embedding_layer(logits, mode="linear")
    # logits = tf.reshape(logits, [batch_size, 2])
    logits = tf.cast(logits, tf.float32)
    return logits
def _get_symbols_to_logits_fn(self, max_decode_length):
  """Returns a decoding function that calculates logits of the next tokens."""
  # Timing signal with shape [length, hidden_size].
  timing_signal = model_utils.get_position_encoding(
      max_decode_length + 1, self.params["hidden_size"])
  # Self-attention bias with shape [1, 1, length, length].
  decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      max_decode_length)

  def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Given the ids of the tokens predicted so far, this function uses the
    decoder together with the cached encoder information to predict the next
    token. `ids` holds the tokens decoded so far and `i` is the position about
    to be predicted. The cache exists because decoding during training is a
    single pass, while decoding at inference time runs repeatedly, one token
    at a time; the encoder information is identical across those passes, so it
    is stored up front.

    Args:
      ids: Current decoded sequences. int tensor with shape
        [batch_size * beam_size, i + 1]. Ignoring batch_size, note that these
        are not the ids of the whole sentence but the candidate token ids from
        the start up to the current position.
      i: Loop index
      cache: dictionary of values storing the encoder output, encoder-decoder
        attention bias, and previous decoder attention values.

    Returns:
      Tuple of
        (logits with shape [batch_size * beam_size, vocab_size],
         updated cache values)
    """
    # Set decoder input to the last generated IDs, i.e. the candidate token
    # ids at the current position, with shape [batch_size * beam_size, 1].
    decoder_input = ids[:, -1:]

    # Preprocess decoder input by getting embeddings and adding timing signal.
    # Embedding gives [batch_size * beam_size, 1, hidden_size]: at inference
    # time the decoder input is just the last predicted token, treated as a
    # length-1 sequence, embedded and fed to the decoder.
    decoder_input = self.embedding_softmax_layer(decoder_input)
    decoder_input += timing_signal[i:i + 1]  # Add the timing signal for position i.

    # Self-attention bias slice with shape [1, 1, 1, i + 1].
    self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
    # Run the decoder; the output has the same shape as decoder_input,
    # [batch_size * beam_size, 1, hidden_size].
    decoder_outputs = self.decoder_stack(
        decoder_input, cache.get("encoder_outputs"), self_attention_bias,
        cache.get("encoder_decoder_attention_bias"), cache)
    # Project from [batch_size * beam_size, 1, hidden_size] to
    # [batch_size * beam_size, 1, vocab_size].
    logits = self.embedding_softmax_layer.linear(decoder_outputs)
    # Remove the middle length-1 dimension:
    # [batch_size * beam_size, 1, vocab_size] -> [batch_size * beam_size, vocab_size].
    logits = tf.squeeze(logits, axis=[1])
    return logits, cache

  return symbols_to_logits_fn
def _get_symbols_to_logits_fn(self, max_decode_length):
  """Returns a decoding function that calculates logits of the next tokens."""
  timing_signal = model_utils.get_position_encoding(
      max_decode_length + 1, self.params["hidden_size"])
  decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      max_decode_length)

  def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Args:
      ids: Current decoded sequences. int tensor with shape
        [batch_size * beam_size, i + 1]
      i: Loop index
      cache: dictionary of values storing the encoder output, encoder-decoder
        attention bias, and previous decoder attention values.

    Returns:
      Tuple of
        (logits with shape [batch_size * beam_size, vocab_size],
         updated cache values)
    """
    # Set decoder input to the last generated IDs
    decoder_input = ids[:, -1:]

    ### domyounglee 2020.2.12
    cls_dec_bias = model_utils.get_cls_dec_attention_bias(
        tf.cast(tf.equal(decoder_input, 2), tf.int64))
    # self.cls_attention_bias = None

    # Preprocess decoder input by getting embeddings and adding timing signal.
    decoder_input = self.embedding_softmax_layer(decoder_input)
    decoder_input += timing_signal[i:i + 1]

    self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
    decoder_outputs = self.decoder_stack(
        decoder_input,
        cache.get("encoder_outputs"),
        self_attention_bias,
        cache.get("encoder_decoder_attention_bias"),
        cls_attention_bias=None,
        cls_dec_attention_bias=None,
        identity_mask=None,
        cache=cache)

    logits = self.embedding_softmax_layer.linear(decoder_outputs)
    logits = tf.squeeze(logits, axis=[1])
    return logits, cache

  return symbols_to_logits_fn
def encode(self, seq, seq_len=None, output_method='all'):
  with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
    if self.use_position_encoding:
      hidden_size = melt.get_shape(seq, -1)
      # Scale embedding by the sqrt of the hidden size
      seq *= hidden_size ** 0.5

      # Create binary array of size [batch_size, length]
      # where 1 = valid position, 0 = padding
      padding = tf.to_float(tf.sequence_mask(seq_len))

      # Set all padding embedding values to 0
      seq *= tf.expand_dims(padding, -1)

      pos_encoding = model_utils.get_position_encoding(
          tf.shape(seq)[1], tf.shape(seq)[-1])
      seq = seq + pos_encoding

    num_filters = self.num_filters
    seqs = [seq]
    #batch_size = melt.get_batch_size(seq)

    #kernel_sizes = [3, 5, 7, 9, 11, 13]
    kernel_sizes = [3] * 7
    assert self.num_layers <= len(kernel_sizes)

    for layer in range(self.num_layers):
      #input_size_ = melt.get_shape(seq, -1) if layer == 0 else num_filters
      # Apply dropout to the previous layer's output before the convolution.
      seq_ = melt.dropout(seqs[-1], self.keep_prob, self.is_train)
      seq = tf.layers.conv1d(seq_, num_filters,
                             kernel_size=kernel_sizes[layer],
                             padding='same', activation=tf.nn.relu)
      # mask = melt.dropout(tf.ones([batch_size, 1, input_size_], dtype=tf.float32),
      #                     keep_prob=self.keep_prob, is_train=self.is_train, mode=None)
      #seq = tf.layers.conv1d(seqs[-1] * mask, num_filters, kernel_size=3, padding='same', activation=tf.nn.relu)
      #seq = tf.layers.conv1d(seqs[-1] * mask, num_filters, kernel_size=kernel_sizes[layer], padding='same', activation=tf.nn.relu)
      # if self.is_train and self.keep_prob < 1:
      #   seq = tf.nn.dropout(seq, self.keep_prob)
      #seq = melt.layers.batch_norm(seq, self.is_train, name='layer_%d' % layer)
      seqs.append(seq)

    outputs = tf.concat(seqs[1:], 2)
    # Do not apply any dropout inside the convnet; dropout is applied outside.
    # if self.is_train and self.keep_prob < 1:
    #   outputs = tf.nn.dropout(outputs, self.keep_prob)

    # Compatible with RNN encoders that also return a state.
    return melt.rnn.encode_outputs(outputs, seq_len, output_method)
def __call__(self, inputs, embedded_inputs):
  """1. Get padding; 2. add position encoding.

  Args:
    inputs: size with [batch_size, length]
    embedded_inputs: size with [batch_size, length, hidden_size]

  Returns:
    encoder_inputs: size with [batch_size, length, hidden_size]
    inputs_padding: size with [batch_size, length]
  """
  with tf.name_scope("stack_input"):
    inputs_padding = model_utils.get_padding(inputs)
    length = tf.shape(inputs)[1]
    pos_encoding = model_utils.get_position_encoding(
        length, self.params["hidden_size"])
    encoder_inputs = embedded_inputs + pos_encoding
    if self.train:
      encoder_inputs = tf.nn.dropout(
          encoder_inputs, 1 - self.params["layer_postprocess_dropout"])
  return encoder_inputs, inputs_padding
def get_pointer_encodings(self, images, words, tags, word_paddings=None,
                          training=False):
  batch_size, image_locations, length = (tf.shape(images)[0],
                                         tf.shape(images)[1],
                                         tf.shape(words)[1])
  if word_paddings is None:
    word_paddings = tf.cast(tf.ones_like(words), self.params["dtype"])

  # Pass the image features [BATCH, 64, 2048] into an encoder
  images = self.image_layer(images)
  image_attention_bias = tf.zeros([batch_size, 1, 1, image_locations])
  image_attention_bias = tf.cast(image_attention_bias, self.params["dtype"])
  image_padding = tf.zeros_like(images)
  encoder_outputs = self.encoder(
      images, image_attention_bias, image_padding, training=training)

  # Add a positional encoding to the word embeddings
  pos_encoding = tf.cast(
      model_utils.get_position_encoding(length, self.params["hidden_size"]),
      self.params["dtype"])
  decoder_inputs = pos_encoding + self.merge_embeddings(
      tf.concat([
          self.word_embeddings(words, mode="embedding", training=training),
          self.tag_embeddings(tags, mode="embedding", training=training)
      ], -1),
      training=training)

  # Use the decoder to merge image and word features
  word_attention_bias = -1e9 * (
      1.0 - word_paddings[:, tf.newaxis, tf.newaxis, :])
  return self.decoder(decoder_inputs, encoder_outputs, word_attention_bias,
                      image_attention_bias, training=training)
def encode(self, inputs, attention_bias):
  with tf.name_scope("encode"):
    # Prepare inputs to the layer stack by adding positional encodings and
    # applying dropout.
    embedded_inputs = self.encoder_embedding_layer(inputs)
    inputs_padding = seoul_get_padding(inputs)

    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(embedded_inputs)[1]
      pos_encoding = model_utils.get_position_encoding(
          length, self.params["hidden_size"])
      encoder_inputs = embedded_inputs + pos_encoding

    if self.train:
      encoder_inputs = tf.nn.dropout(
          encoder_inputs, 1 - self.params["layer_postprocess_dropout"])

    return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
def predict(self, start_tokens, encoder_outputs,
            encoder_decoder_attention_bias):
  """Return predicted sequence."""
  with tf.name_scope('decode'):
    batch_size = tf.shape(encoder_outputs)[0]
    max_decode_length = self.params['sequence_length']

    timing_signal = model_utils.get_position_encoding(
        max_decode_length, self.params['hidden_size'])
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        max_decode_length)

    # Create cache storing decoder attention values for each layer.
    cache = {
        'layer_%d' % layer: {
            'k': tf.zeros([batch_size, 0, self.params['hidden_size']]),
            'v': tf.zeros([batch_size, 0, self.params['hidden_size']])
        } for layer in range(self.params['num_hidden_layers'])
    }

    # Add encoder output and attention bias to the cache.
    cache['encoder_outputs'] = encoder_outputs
    cache['encoder_decoder_attention_bias'] = encoder_decoder_attention_bias

    # Forward decoder_inputs to decoder_stack max_decode_length times
    # instead of applying beam search.
    decoder_outputs = tf.zeros([batch_size, 0, self.params['output_size']])
    decoder_inputs = tf.expand_dims(start_tokens, axis=1)
    for i in range(max_decode_length):
      decoder_inputs = self.decoder_embedding_layer(decoder_inputs)
      decoder_inputs += timing_signal[i:i + 1]
      self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
      decoder_inputs = self.decoder_stack(
          decoder_inputs, cache.get('encoder_outputs'), self_attention_bias,
          cache.get('encoder_decoder_attention_bias'), cache)
      decoder_inputs = self.output_embedding_layer(decoder_inputs)
      decoder_outputs = tf.concat([decoder_outputs, decoder_inputs], axis=1)
  return decoder_outputs
def _get_symbols_to_logits_fn(self, max_decode_length):
  """Returns a decoding function that calculates logits of the next tokens."""
  timing_signal = model_utils.get_position_encoding(
      max_decode_length + 1, self.params.hidden_size)
  decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
      max_decode_length)

  def symbols_to_logits_fn(ids, i, cache):
    """Generate logits for next potential IDs.

    Args:
      ids: Current decoded sequences. int tensor with shape
        [batch_size * beam_size, i + 1]
      i: Loop index
      cache: dictionary of values storing the encoder output, encoder-decoder
        attention bias, and previous decoder attention values.

    Returns:
      Tuple of
        (logits with shape [batch_size * beam_size, vocab_size],
         updated cache values)
    """
    # Set decoder input to the last generated IDs
    decoder_input = ids[:, -1:]

    # Preprocess decoder input by getting embeddings and adding timing signal.
    decoder_input = self.embedding_softmax_layer(decoder_input)
    decoder_input += timing_signal[i:i + 1]

    self_attention_bias = decoder_self_attention_bias[:, :, i:i + 1, :i + 1]
    decoder_outputs = self.decoder_stack(
        decoder_input, cache.get("encoder_outputs"), self_attention_bias,
        cache.get("encoder_decoder_attention_bias"), cache)
    logits = self.embedding_softmax_layer.linear(decoder_outputs)
    logits = tf.squeeze(logits, axis=[1])
    return logits, cache

  return symbols_to_logits_fn
def call(self, seq, seq_len=None, masks=None,
         output_method=OutputMethod.all, training=False):
  if self.use_position_encoding:
    hidden_size = melt.get_shape(seq, -1)
    # Scale embedding by the sqrt of the hidden size
    seq *= hidden_size ** 0.5

    # Create binary array of size [batch_size, length]
    # where 1 = valid position, 0 = padding
    padding = tf.to_float(tf.sequence_mask(seq_len))

    # Set all padding embedding values to 0
    seq *= tf.expand_dims(padding, -1)

    pos_encoding = model_utils.get_position_encoding(
        tf.shape(seq)[1], tf.shape(seq)[-1])
    seq = seq + pos_encoding

  num_filters = self.num_filters
  seqs = [seq]
  #batch_size = melt.get_batch_size(seq)

  for layer in range(self.num_layers):
    if masks is None:
      seq_ = melt.dropout(seq, self.keep_prob, training)
    else:
      seq_ = seq * masks[layer]
    seq = self.conv1ds[layer](seq_)
    seqs.append(seq)

  outputs = tf.concat(seqs[1:], 2)
  # Do not apply any dropout inside the convnet; dropout is applied outside.
  # if self.is_train and self.keep_prob < 1:
  #   outputs = tf.nn.dropout(outputs, self.keep_prob)

  # Compatible with RNN encoders that also return a state.
  return melt.rnn.encode_outputs(outputs, seq_len, output_method)
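# The two convolutional encoders above first scale the embeddings by
# sqrt(hidden_size) and zero out padded positions with a sequence-length mask
# before adding the positional encoding. A self-contained NumPy sketch of that
# preprocessing with made-up dimensions; the positional-encoding addition would
# follow, as in the earlier sinusoidal sketch.
import numpy as np

batch, length, hidden = 2, 5, 8
seq = np.random.rand(batch, length, hidden).astype(np.float32)
seq_len = np.array([5, 3])

seq = seq * np.sqrt(hidden)  # scale embedding by the sqrt of the hidden size
# 1 for valid positions, 0 for padding (what tf.sequence_mask produces).
valid = (np.arange(length)[None, :] < seq_len[:, None]).astype(np.float32)
seq = seq * valid[:, :, None]  # set padded embedding values to 0

assert seq.shape == (batch, length, hidden)
assert np.all(seq[1, 3:] == 0.0)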
def encode(self, inputs, attention_bias):
  """Generate continuous representation for inputs.

  Args:
    inputs: int tensor with shape [batch_size, input_length].
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length].
      Entries are 0 at non-padded positions and a large negative value
      (effectively -inf) at padded positions.

  Returns:
    float tensor with shape [batch_size, input_length, hidden_size]
  """
  with tf.name_scope("encode"):
    # Prepare inputs to the layer stack by adding positional encodings and
    # applying dropout.
    embedded_inputs = self.embedding_softmax_layer(
        inputs)  # Embed the input ids.
    # Padding information tensor: 1 at padded positions, 0 elsewhere,
    # same shape as inputs.
    inputs_padding = model_utils.get_padding(inputs)

    # Add the positional encoding (timing information) to embedded_inputs.
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(embedded_inputs)[1]
      pos_encoding = model_utils.get_position_encoding(
          length, self.params["hidden_size"])
      encoder_inputs = embedded_inputs + pos_encoding

    if self.train:  # Apply dropout in training mode.
      encoder_inputs = tf.nn.dropout(
          encoder_inputs, 1 - self.params["layer_postprocess_dropout"])

    # encoder_inputs: [batch_size, input_length, hidden_size]
    # attention_bias: [batch_size, 1, 1, input_length]
    # inputs_padding: [batch_size, input_length]
    # Return the output of the encoder stack.
    return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
def encode(self, inputs, attention_bias):
  """Generate continuous representation for inputs.

  Args:
    inputs: int tensor with shape [batch_size, input_length].
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float tensor with shape [batch_size, input_length, hidden_size]
  """
  # Debug output of the static input shapes.
  print('LOOK AT ME')
  print(inputs.get_shape().as_list())
  print(attention_bias.get_shape().as_list())
  with tf.name_scope("encode"):
    # Prepare inputs to the layer stack by adding positional encodings and
    # applying dropout.
    embedded_inputs = self.embedding_softmax_layer(inputs)
    inputs_padding = model_utils.get_padding(inputs)

    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(embedded_inputs)[1]
      pos_encoding = model_utils.get_position_encoding(
          length, self.params["hidden_size"])
      encoder_inputs = embedded_inputs + pos_encoding

    if self.train:
      encoder_inputs = tf.nn.dropout(
          encoder_inputs, 1 - self.params["layer_postprocess_dropout"])

    # Debug output of the tensors fed to the encoder stack.
    print('YOOO')
    print(encoder_inputs)
    print(attention_bias)
    print(inputs_padding)
    return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
def decode(self, targets, encoder_outputs, attention_bias):
  """Generate logits for each value in the target sequence.

  Args:
    targets: target values for the output sequence. int tensor with shape
      [batch_size, target_length]
    encoder_outputs: continuous representation of input sequence. float tensor
      with shape [batch_size, input_length, hidden_size]
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float32 tensor with shape [batch_size, target_length, vocab_size]
  """
  with tf.name_scope("decode"):
    # Prepare inputs to decoder layers by shifting targets, adding positional
    # encoding and applying dropout.
    decoder_inputs = self.embedding_softmax_layer(targets)
    with tf.name_scope("shift_targets"):
      # Shift targets to the right, and remove the last element
      decoder_inputs = tf.pad(decoder_inputs,
                              [[0, 0], [1, 0], [0, 0]])[:, :-1, :]
    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(decoder_inputs)[1]
      decoder_inputs += model_utils.get_position_encoding(
          length, self.params.hidden_size)
    if self.train:
      decoder_inputs = tf.nn.dropout(
          decoder_inputs, 1 - self.params.layer_postprocess_dropout)

    # Run values
    decoder_self_attention_bias = model_utils.get_decoder_self_attention_bias(
        length)
    outputs = self.decoder_stack(decoder_inputs, encoder_outputs,
                                 decoder_self_attention_bias, attention_bias)
    logits = self.embedding_softmax_layer.linear(outputs)
    return logits
def encode(self, inputs, attention_bias):
  """Generate continuous representation for inputs.

  Args:
    inputs: int tensor with shape [batch_size, input_length].
    attention_bias: float tensor with shape [batch_size, 1, 1, input_length]

  Returns:
    float tensor with shape [batch_size, input_length, hidden_size]
  """
  with tf.name_scope("encode"):
    # Prepare inputs to the layer stack by adding positional encodings and
    # applying dropout.
    embedded_inputs = self.embedding_softmax_layer(inputs)
    inputs_padding = model_utils.get_padding(inputs)

    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(embedded_inputs)[1]
      pos_encoding = model_utils.get_position_encoding(
          length, self.params["hidden_size"])
      encoder_inputs = embedded_inputs + pos_encoding  # shape (batch_size, input_len, h_size)

    with tf.name_scope("add_vir_entities"):
      encoder_inputs = self.add_vir_entities(
          encoder_inputs)  # shape (batch_size, input_len + num_ve, h_size)

    if self.train:
      encoder_inputs = tf.nn.dropout(
          encoder_inputs, 1 - self.params["layer_postprocess_dropout"])

    x = self.encoder_stack(encoder_inputs, attention_bias,
                           inputs_padding)  # shape (-1, length, h_size)
    # Remove virtual entities from the encoder output
    x = x[:, :-self.params["num_vir_entities"], :]
    return x  # shape (batch_size, input_length, hidden_size)
tf_sess.run(tf.global_variables_initializer())
tf_assign_list = get_assign_list(tf_transformer)
assert len(tf_assign_list) == len(list(set(tf_assign_list)))
tf_sess.run(tf_assign_list)

tf_res = tf_sess.run(tf_output, feed_dict={
    tf_input_x_raw: my_input_x_raw,
    tf_input_y_raw: my_input_y_raw
})
print("tf output:")
with printoptions(precision=3, suppress=True):
  print(tf_res)

tf_embedded_inputs = tf_transformer.embedding_softmax_layer(tf_input_x_raw)
tf_pos_encoding = tf_model_utils.get_position_encoding(
    seq_len_x, tf_transformer.params.hidden_size)
tf_embedding_inputs = tf_embedded_inputs + tf_pos_encoding
tf_attention_bias = tf_model_utils.get_padding_bias(tf_input_x_raw)
tf_encoder_outputs = tf_transformer.encode(tf_input_x_raw, tf_attention_bias)

tf_pred = tf_transformer(tf_input_x_raw)["outputs"]
tf_pred_res = tf_sess.run(tf_pred, feed_dict={tf_input_x_raw: my_input_x_raw})
print("tf prediction:")
with printoptions(threshold=2000):
  print(tf_pred_res)

k_transformer = KTransformer(params)
k_input_x_raw = Input(shape=(_seq_len_x,))