def __call__(self, inputs, targets=None):
  """Calculate target logits or inferred target sequences.

  Args:
    inputs: int tensor with shape [batch_size, input_length].
    targets: None or int tensor with shape [batch_size, target_length].

  Returns:
    If targets is defined, then return logits for each word in the target
    sequence. float tensor with shape [batch_size, target_length, vocab_size]
    If targets is None, then generate output sequence one token at a time.
      returns a dictionary {
        output: [batch_size, decoded length]
        score: [batch_size, float]}
  """
  # Variance scaling is used here because it seems to work in many problems.
  # Other reasonable initializers may also work just as well.
  initializer = tf.variance_scaling_initializer(
      self.params["initializer_gain"], mode="fan_avg", distribution="uniform")
  with tf.variable_scope("Transformer", initializer=initializer):
    # Calculate attention bias for encoder self-attention and decoder
    # multi-headed attention layers.
    attention_bias = model_utils.get_padding_bias(inputs)

    # domyounglee 2020.2.12
    self.cls_attention_bias = model_utils.get_padding_bias(
        tf.cast(tf.equal(inputs, 2), tf.int64))
    if targets is not None:
      self.cls_dec_attention_bias = model_utils.get_cls_dec_attention_bias(
          tf.cast(tf.equal(targets, 2), tf.int64))
    else:
      self.cls_dec_attention_bias = None

    # Run the inputs through the encoder layer to map the symbol
    # representations to continuous representations.
    encoder_outputs = self.encode(inputs, attention_bias)

    # Generate output sequence if targets is None, or return logits if target
    # sequence is known.
    if targets is None:
      return self.predict(encoder_outputs, attention_bias)
    else:
      logits = self.decode(targets, encoder_outputs, attention_bias,
                           cls_attention_bias=None,
                           cls_dec_attention_bias=None,
                           identity_mask=None)
      return logits
def __call__(self, inputs, targets=None):
  """Calculate target logits or inferred target sequences.

  Args:
    inputs: int tensor with shape [batch_size, 3, input_length], where the
      2nd dimension holds [old source seq, old target seq, new source seq].
    targets: None or int tensor with shape [batch_size, target_length].

  Returns:
    If targets is defined, then return logits for each word in the target
    sequence. float tensor with shape [batch_size, target_length, vocab_size]
    If targets is None, then generate output sequence one token at a time.
      returns a dictionary {
        output: [batch_size, decoded length]
        score: [batch_size, float]}
  """
  # Variance scaling is used here because it seems to work in many problems.
  # Other reasonable initializers may also work just as well.
  initializer = tf.variance_scaling_initializer(
      self.params["initializer_gain"], mode="fan_avg", distribution="uniform")
  with tf.variable_scope("TransformerDifre", initializer=initializer):
    # Extract each element from inputs.
    inputs_oldsrc = inputs[:, 0, :]
    inputs_oldtrg = inputs[:, 1, :]
    inputs_newsrc = inputs[:, 2, :]

    # Calculate attention bias for encoder self-attention and decoder
    # multi-headed attention layers.
    attention_bias_oldsrc = model_utils.get_padding_bias(inputs_oldsrc)
    attention_bias_oldtrg = model_utils.get_padding_bias(inputs_oldtrg)
    attention_bias_newsrc = model_utils.get_padding_bias(inputs_newsrc)

    # Run the inputs through the encoder layer to map the symbol
    # representations to continuous representations.
    src_encoder_outputs = self.src_encode(
        inputs_oldsrc, attention_bias_oldsrc)
    diff_encoder_outputs = self.diff_encode(
        inputs_newsrc, src_encoder_outputs, attention_bias_newsrc)

    # Generate output sequence if targets is None, or return logits if target
    # sequence is known.
    if targets is None:
      return self.predict(
          inputs_oldtrg, diff_encoder_outputs, attention_bias_oldtrg)
    else:
      logits = self.decode(
          targets, inputs_oldtrg, diff_encoder_outputs, attention_bias_oldtrg)
      return logits
def __call__(self, inputs, targets=None):
  """Calculate target logits or inferred target sequences.

  Args:
    inputs: int tensor with shape [batch_size, input_length].
    targets: None or int tensor with shape [batch_size, target_length].

  Returns:
    If targets is defined, then return logits for each word in the target
    sequence. float tensor with shape [batch_size, target_length, vocab_size]
    If targets is None, then generate output sequence one token at a time.
      returns a dictionary {
        output: [batch_size, decoded length]
        score: [batch_size, float]}
  """
  # Variance scaling is used here because it seems to work in many problems.
  # Other reasonable initializers may also work just as well.
  initializer = tf.variance_scaling_initializer(
      self.params["initializer_gain"], mode="fan_avg", distribution="uniform")
  with tf.variable_scope("Transformer", initializer=initializer):
    # Calculate attention bias for encoder self-attention and decoder
    # multi-headed attention layers.
    src_attention_bias = model_utils.get_padding_bias(
        inputs)  # used for 1. src_encode self-attention; 2. decode encoder-decoder attention.
    if targets is not None:  # tc modified
      tgt_attention_bias = model_utils.get_padding_bias(
          targets)  # only used for tgt_encode self-attention.
    else:
      tgt_attention_bias = None

    # Run the inputs through the encoder layer to map the symbol
    # representations to continuous representations.
    (encoder_outputs, latent_sample, prior_mu, prior_logvar,
     recog_mu, recog_logvar) = self.encode(
         inputs, src_attention_bias, targets, tgt_attention_bias)  # tc modified

    # Generate output sequence if targets is None, or return logits if target
    # sequence is known.
    if targets is None:
      logits = self.predict(encoder_outputs, src_attention_bias, latent_sample)
    else:
      logits = self.decode(targets, encoder_outputs, src_attention_bias,
                           latent_sample)
    return logits, latent_sample, prior_mu, prior_logvar, recog_mu, recog_logvar
def __call__(self, inputs, targets=None):
  """Calculate target logits or inferred target sequences.

  Args:
    inputs: int tensor with shape [batch_size, input_length].
    targets: None or int tensor with shape [batch_size, target_length].

  Returns:
    If targets is defined, then return logits for each word in the target
    sequence. float tensor with shape [batch_size, target_length, vocab_size]
    If targets is None, then generate output sequence one token at a time.
      returns a dictionary {
        output: [batch_size, decoded length]
        score: [batch_size, float]}
  """
  # Variance scaling is used here because it seems to work in many problems.
  # Other reasonable initializers may also work just as well.
  initializer = tf.variance_scaling_initializer(
      self.params.initializer_gain, mode="fan_avg", distribution="uniform")
  with tf.variable_scope("Transformer", initializer=initializer):
    # Calculate attention bias for encoder self-attention and decoder
    # multi-headed attention layers.
    attention_bias = model_utils.get_padding_bias(inputs)

    # Run the inputs through the encoder layer to map the symbol
    # representations to continuous representations.
    encoder_outputs = self.encode(inputs, attention_bias)

    # Generate output sequence if targets is None, or return logits if target
    # sequence is known.
    if targets is None:
      return self.predict(encoder_outputs, attention_bias)
    else:
      logits = self.decode(targets, encoder_outputs, attention_bias)
      return logits
def predict(self, inputs, **kwargs):
  source, targets = inputs[0], inputs[1]
  with tf.name_scope("Transformer_Predict"):
    attention_bias = model_utils.get_padding_bias(source)
    encoder_outputs = self.encode(source, attention_bias, self.params['train'])
    logits = self.decode(targets, encoder_outputs, attention_bias,
                         self.params['train'])
    return logits
def encode_no_lookup(self, embedded_inputs, inputs_mask):
  """Encoder step for transformer given already-embedded inputs.

  Args:
    embedded_inputs: float tensor with shape
      [batch_size, input_length, emb_size].
    inputs_mask: int tensor with shape [batch_size, input_length].

  Returns:
    float tensor with shape [batch_size, input_length, hidden_size]
  """
  with tf.name_scope("encode"):
    # Prepare inputs to the layer stack by adding positional encodings and
    # applying dropout.
    inputs_padding = model_utils.get_padding(inputs_mask)
    attention_bias = model_utils.get_padding_bias(inputs_mask)

    with tf.name_scope("add_pos_encoding"):
      length = tf.shape(embedded_inputs)[1]
      pos_encoding = model_utils.get_position_encoding(
          length, self.params.hidden_size)
      encoder_inputs = embedded_inputs + pos_encoding

    if self.train:
      encoder_inputs = tf.nn.dropout(
          encoder_inputs, 1 - self.params.layer_postprocess_dropout)

    return self.encoder_stack(encoder_inputs, attention_bias, inputs_padding)
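For reference, model_utils.get_position_encoding above produces the standard sinusoidal position encoding from "Attention Is All You Need". Below is a minimal sketch of that computation, assuming the usual sin/cos formulation with an even hidden_size; it is illustrative, not necessarily this project's exact model_utils code.

import math
import tensorflow as tf

def sinusoidal_position_encoding(length, hidden_size,
                                 min_timescale=1.0, max_timescale=1.0e4):
  """Returns a [length, hidden_size] float tensor of position signals (sketch)."""
  position = tf.cast(tf.range(length), tf.float32)
  num_timescales = hidden_size // 2
  # Geometric progression of wavelengths from min_timescale to max_timescale.
  log_timescale_increment = (
      math.log(max_timescale / min_timescale) / max(num_timescales - 1, 1))
  inv_timescales = min_timescale * tf.exp(
      tf.cast(tf.range(num_timescales), tf.float32) * -log_timescale_increment)
  scaled_time = tf.expand_dims(position, 1) * tf.expand_dims(inv_timescales, 0)
  # First half of the channels use sine, second half cosine.
  return tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)

Because the result has shape [length, hidden_size], it broadcasts against the [batch_size, length, hidden_size] embeddings, which is why encoder_inputs = embedded_inputs + pos_encoding works for any batch size.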
def __call__(self, inputs):
  """Encode the inputs and return the encoder representations.

  Args:
    inputs: int tensor with shape [batch_size, input_length].

  Returns:
    float tensor with shape [batch_size, input_length, hidden_size]
  """
  # Variance scaling is used here because it seems to work in many problems.
  # Other reasonable initializers may also work just as well.
  initializer_gain = 1.
  initializer = tf.variance_scaling_initializer(
      initializer_gain, mode="fan_avg", distribution="uniform")
  with tf.variable_scope("Transformer", initializer=initializer):
    # Calculate attention bias for encoder self-attention and decoder
    # multi-headed attention layers.
    attention_bias = model_utils.get_padding_bias(inputs)

    # Run the inputs through the encoder layer to map the symbol
    # representations to continuous representations.
    encoder_outputs = self.encode(inputs, attention_bias)
    return encoder_outputs
def call(self, inputs):
  """Calculate target logits or inferred target sequences.

  Args:
    inputs: input tensor list of [input_ids, final_hidden, targets].
      input_ids: int tensor with shape [batch_size, input_length].
      final_hidden: float tensor with the precomputed encoder outputs.
      targets: None or int tensor with shape [batch_size, target_length].

  Returns:
    If targets is defined, then return logits for each word in the target
    sequence. float tensor with shape [batch_size, target_length, vocab_size]
    If targets is None, then generate output sequence one token at a time.
      returns a dictionary {
        outputs: [batch_size, decoded length]
        scores: [batch_size, float]}
    Even when float16 is used, the output tensor(s) are always float32.

  Raises:
    NotImplementedError: If the padded decode method is used on CPUs/GPUs.
  """
  input_ids, final_hidden, targets = inputs
  training = self.train
  # if len(inputs) == 2:
  #   inputs, targets = inputs[0], inputs[1]
  # else:
  #   inputs, targets = inputs[0], None
  # if self.params["padded_decode"]:
  #   if not self.params["num_replicas"]:
  #     raise NotImplementedError(
  #         "Padded decoding on CPU/GPUs is not supported.")
  #   decode_batch_size = int(self.params["decode_batch_size"] /
  #                           self.params["num_replicas"])
  #   inputs = tf.reshape(
  #       inputs, [decode_batch_size, self.params["decode_max_length"]])

  # Variance scaling is used here because it seems to work in many problems.
  # Other reasonable initializers may also work just as well.
  with tf.name_scope("Transformer"):
    # Calculate attention bias for encoder self-attention and decoder
    # multi-headed attention layers.
    attention_bias = model_utils.get_padding_bias(input_ids)

    # Run the inputs through the encoder layer to map the symbol
    # representations to continuous representations.
    # encoder_outputs = self.encode(inputs, attention_bias, training)
    encoder_outputs = final_hidden

    # Generate output sequence if targets is None, or return logits if target
    # sequence is known.
    if targets is None:
      return self.predict(encoder_outputs, attention_bias, training)
    else:
      logits = self.decode(targets, encoder_outputs, attention_bias, training)
      return logits
def test_get_padding_bias(self):
  x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
  bias = model_utils.get_padding_bias(x)
  bias_shape = tf.shape(bias)
  flattened_bias = tf.reshape(bias, [3, 5])
  with self.test_session() as sess:
    flattened_bias, bias_shape = sess.run((flattened_bias, bias_shape))
  self.assertAllEqual(
      [[0, NEG_INF, NEG_INF, NEG_INF, 0],
       [0, 0, NEG_INF, NEG_INF, NEG_INF],
       [NEG_INF, 0, 0, NEG_INF, 0]],
      flattened_bias)
  self.assertAllEqual([3, 1, 1, 5], bias_shape)
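The test above pins down the expected behavior: token id 0 is treated as padding, padded positions get a large negative bias, and the result has shape [batch_size, 1, 1, length] so it broadcasts over attention heads and query positions. A minimal sketch consistent with that test (the NEG_INF value and the helper name are assumptions, not necessarily the exact model_utils implementation):

import tensorflow as tf

NEG_INF = -1e9  # assumed value of the constant referenced in the test

def get_padding_bias_sketch(x, padding_value=0):
  """Builds an attention bias of shape [batch_size, 1, 1, length] (sketch)."""
  # 1.0 where x is padding, 0.0 elsewhere.
  padding = tf.cast(tf.equal(x, padding_value), tf.float32)
  bias = padding * NEG_INF
  # Insert head and query-length axes so the bias broadcasts inside attention.
  return tf.expand_dims(tf.expand_dims(bias, axis=1), axis=1)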
def call(self, inputs, training):
  """Calculate target logits or inferred target sequences.

  Args:
    inputs: input tensor list of size 1 or 2.
      First item, inputs: int tensor with shape [batch_size, input_length].
      Second item (optional), targets: None or int tensor with shape
        [batch_size, target_length].
    training: boolean, whether in training mode or not.

  Returns:
    If targets is defined, then return logits for each word in the target
    sequence. float tensor with shape [batch_size, target_length, vocab_size]
    If targets is None, then generate output sequence one token at a time.
      returns a dictionary {
        outputs: [batch_size, decoded length]
        scores: [batch_size, float]}
    Even when float16 is used, the output tensor(s) are always float32.
  """
  if len(inputs) == 2:
    inputs, targets = inputs[0], inputs[1]
  else:
    inputs, targets = inputs[0], None

  # Variance scaling is used here because it seems to work in many problems.
  # Other reasonable initializers may also work just as well.
  with tf.name_scope("Transformer"):
    # Calculate attention bias for encoder self-attention and decoder
    # multi-headed attention layers.
    attention_bias = model_utils.get_padding_bias(inputs)

    # Run the inputs through the encoder layer to map the symbol
    # representations to continuous representations.
    encoder_outputs = self.encode(inputs, attention_bias, training)

    # Generate output sequence if targets is None, or return logits if target
    # sequence is known.
    if targets is None:
      return self.predict(encoder_outputs, attention_bias, training)
    else:
      logits = self.decode(targets, encoder_outputs, attention_bias, training)
      return logits
def __call__(self, inputs, targets=None):
  """Calculate target logits or inferred target sequences.

  Args:
    inputs: int tensor with shape [batch_size, input_length].
      (tensor shape: batch size x input length)
    targets: None or int tensor with shape [batch_size, target_length].

  Returns:
    If targets is defined, then return logits for each word in the target
    sequence (training stage: return the target probabilities). float tensor
    with shape [batch_size, target_length, vocab_size]
    If targets is None, then generate output sequence one token at a time
    (prediction stage: return the decoded result).
      returns a dictionary {
        output: [batch_size, decoded length]
        score: [batch_size, float]}
  """
  # Variance scaling is used here because it seems to work in many problems.
  # Other reasonable initializers may also work just as well.
  # Define how variables are initialized: uniform variance-scaling distribution.
  initializer = tf.variance_scaling_initializer(
      self.params["initializer_gain"], mode="fan_avg", distribution="uniform")
  with tf.variable_scope("Transformer", initializer=initializer):
    # Calculate attention bias for encoder self-attention and decoder
    # multi-headed attention layers.
    # Produces an attention-bias tensor derived from the input: padding
    # positions get a large negative value (-1e9), other positions get 0
    # (it effectively acts as a mask).
    attention_bias = model_utils.get_padding_bias(inputs)

    # Run the inputs through the encoder layer to map the symbol
    # representations to continuous representations.
    # Encode the inputs into hidden representations.
    encoder_outputs = self.encode(inputs, attention_bias)

    # Generate output sequence if targets is None, or return logits if target
    # sequence is known.
    # With targets (training stage), return the target probabilities; without
    # targets (prediction stage), return the decoded outputs.
    if targets is None:
      return self.predict(encoder_outputs, attention_bias)
    else:
      logits = self.decode(targets, encoder_outputs, attention_bias)
      return logits
def __call__(self, inputs, targets=None):
  """Calculate target logits or inferred target sequences.

  (__init__ is responsible for constructing these layers; __call__ for
  running them.)

  Args:
    inputs: int tensor with shape [batch_size, input_length].
    targets: None or int tensor with shape [batch_size, target_length].

  Returns:
    If targets is defined, then return logits for each word in the target
    sequence. float tensor with shape [batch_size, target_length, vocab_size]
    If targets is None, then generate output sequence one token at a time.
      returns a dictionary {
        output: [batch_size, decoded length]
        score: [batch_size, float]}
  """
  # Variance scaling is used here because it seems to work in many problems.
  # Other reasonable initializers may also work just as well.
  # Initializer: once attached to the variable scope, it handles initialization.
  initializer = tf.variance_scaling_initializer(
      self.params["initializer_gain"], mode="fan_avg", distribution="uniform")
  with tf.variable_scope("Transformer", initializer=initializer):
    # Calculate attention bias for encoder self-attention and decoder
    # multi-headed attention layers.
    attention_bias = model_utils.get_padding_bias(
        inputs)  # get the attention bias matrix
    # In this matrix, non-padding positions are 0 and padding positions are
    # negative infinity; two extra dimensions are inserted, apparently for
    # num_heads and length.

    # Run the inputs through the encoder layer to map the symbol
    # representations to continuous representations.
    encoder_outputs = self.encode(inputs, attention_bias)  # encode the inputs

    # Generate output sequence if targets is None, or return logits if target
    # sequence is known.
    if targets is None:  # no target sentence given, so run prediction
      return self.predict(encoder_outputs, attention_bias)
    else:  # target sentence given, so this is training or evaluation
      logits = self.decode(targets, encoder_outputs, attention_bias)
      return logits
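To make the comment on the bias matrix concrete, here is a small self-contained NumPy illustration (toy values only, not project code) of how such a bias masks padded positions once it is added to the attention logits before the softmax:

import numpy as np

def softmax(x, axis=-1):
  x = x - x.max(axis=axis, keepdims=True)
  e = np.exp(x)
  return e / e.sum(axis=axis, keepdims=True)

NEG_INF = -1e9
tokens = np.array([[7, 3, 0, 0]])            # last two positions are padding (id 0)
bias = np.where(tokens == 0, NEG_INF, 0.0)   # [batch, length]
bias = bias[:, None, None, :]                # [batch, 1, 1, length]
logits = np.random.randn(1, 2, 4, 4)         # fake logits: [batch, heads, query_len, key_len]
weights = softmax(logits + bias)             # padded keys receive ~0 attention weight
print(weights.sum(axis=-1))                  # each attention row still sums to 1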
def transformer_encoder(inputs, lengths):
  # Set up estimator and params.
  params = model_params.BASE_PARAMS
  params["default_batch_size"] = K
  params["max_length"] = 500
  params["vocab_size"] = VOCABULARY_SIZE + 1
  params["filter_size"] = 256
  params["num_hidden_layers"] = 2
  params["num_heads"] = 2
  params["hidden_size"] = EMBEDDING_SIZE

  model = transformer.Transformer(params, tf.estimator.ModeKeys.TRAIN)
  initializer = tf.variance_scaling_initializer(
      model.params["initializer_gain"], mode="fan_avg", distribution="uniform")
  with tf.variable_scope("Transformer", initializer=initializer):
    # Calculate attention bias for encoder self-attention and decoder
    # multi-headed attention layers.
    attention_bias = model_utils.get_padding_bias(inputs)

    # Run the inputs through the encoder layer to map the symbol
    # representations to continuous representations.
    encoder = model.encode(inputs, attention_bias)

  # Mean-pool the encoder outputs over the time dimension.
  return tf.reduce_mean(encoder, 1)
tf_res = tf_sess.run(tf_output, feed_dict={
    tf_input_x_raw: my_input_x_raw,
    tf_input_y_raw: my_input_y_raw
})
print("tf output:")
with printoptions(precision=3, suppress=True):
  print(tf_res)

tf_embedded_inputs = tf_transformer.embedding_softmax_layer(tf_input_x_raw)
tf_pos_encoding = tf_model_utils.get_position_encoding(
    seq_len_x, tf_transformer.params.hidden_size)
tf_embedding_inputs = tf_embedded_inputs + tf_pos_encoding
tf_attention_bias = tf_model_utils.get_padding_bias(tf_input_x_raw)
tf_encoder_outputs = tf_transformer.encode(tf_input_x_raw, tf_attention_bias)

tf_pred = tf_transformer(tf_input_x_raw)["outputs"]
tf_pred_res = tf_sess.run(tf_pred, feed_dict={tf_input_x_raw: my_input_x_raw})
print("tf prediction:")
with printoptions(threshold=2000):
  print(tf_pred_res)

k_transformer = KTransformer(params)
k_input_x_raw = Input(shape=(_seq_len_x, ))
k_input_y_raw = Input(shape=(_seq_len_y, ))
k_embedded_inputs = k_transformer.embedding_softmax_layer(k_input_x_raw)