def __call__(self, inputs, targets=None):
    """Compute sentence embeddings or a distance score between two sentences.

    Args:
        inputs: int tensor with shape [batch_size, input_length].
        targets: None or int tensor with shape [batch_size, target_length].

    Returns:
        If targets is None, the attention-pooled encoder outputs (sentence
        embeddings) for `inputs`.
        If targets is given, a float tensor with shape [batch_size] holding
        the squared Euclidean distance between the source and target
        sentence embeddings.
    """
    # Variance scaling is used here because it seems to work well in many
    # problems. Other reasonable initializers may also work just as well.
    initializer = tf.variance_scaling_initializer(
        scale=1.0, mode="fan_avg", distribution="uniform")
    with tf.variable_scope("Transformer", initializer=initializer,
                           reuse=tf.AUTO_REUSE):
        if targets is None:
            # Calculate attention bias for encoder self-attention and decoder
            # multi-headed attention layers.
            attention_bias = model_utils.get_padding_bias(inputs)
            # Run the inputs through the encoder layer to map the symbol
            # representations to continuous representations.
            encoder_outputs = self.encode(inputs, attention_bias)
            # Apply the attention model to produce a sentence embedding.
            attention_outputs, alpha = self.full_attention_layer(
                encoder_outputs)
            return attention_outputs
        else:
            # Embed the source and target sentences with the shared encoder.
            src_attention_bias = model_utils.get_padding_bias(inputs)
            src_encoder_outputs = self.encode(inputs, src_attention_bias)
            src_attention_outputs, _ = self.full_attention_layer(
                src_encoder_outputs)

            tgt_attention_bias = model_utils.get_padding_bias(targets)
            tgt_encoder_outputs = self.encode(targets, tgt_attention_bias)
            tgt_attention_outputs, _ = self.full_attention_layer(
                tgt_encoder_outputs)

            print(src_attention_outputs.get_shape().as_list())
            print(tgt_attention_outputs.get_shape().as_list())

            # Squared Euclidean distance between the two sentence embeddings.
            logits = tf.reduce_sum(
                tf.square(tf.subtract(src_attention_outputs,
                                      tgt_attention_outputs)),
                1, keep_dims=True)
            logits = tf.reshape(logits, [-1], name="logits")
            return logits
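The distance head above reduces a pair of sentence embeddings to one score per example. A minimal, self-contained sketch of that computation on made-up 2-dimensional embeddings (the tensors and values here are hypothetical, not taken from the model above):

import tensorflow as tf

src = tf.constant([[1.0, 2.0], [0.0, 0.0]])   # [batch_size=2, hidden=2]
tgt = tf.constant([[1.0, 0.0], [3.0, 4.0]])

# Sum of squared differences over the hidden dimension -> one scalar per pair.
dist = tf.reduce_sum(tf.square(src - tgt), axis=1)

with tf.Session() as sess:
    print(sess.run(dist))   # [ 4. 25.]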
def decoder_train(self, x, y):
    # x: (batch_size, enc_len), y: (batch_size, dec_len)
    dec_bias = model_utils.get_decoder_self_attention_bias(self.max_dec_len)
    attention_bias = model_utils.get_padding_bias(x)

    # Encoder
    encoder_emb_inp = self.build_embed(x, encoder=True, reuse=False)
    encoder_outputs = self.build_encoder(x, encoder_emb_inp, attention_bias,
                                         reuse=False)

    # Decoder
    batch_size = tf.shape(x)[0]
    start_tokens = tf.fill([batch_size, 1], self.bos_idx)  # 2: <s> ID
    # Shift the targets one position to the right and prepend <s>.
    target_slice_last_1 = tf.slice(y, [0, 0],
                                   [batch_size, self.max_dec_len - 1])
    decoder_inputs = tf.concat([start_tokens, target_slice_last_1], axis=1)
    decoder_emb_inp = self.build_embed(decoder_inputs, encoder=False,
                                       reuse=True)
    decoder_outputs = self.build_decoder(decoder_emb_inp, encoder_outputs,
                                         dec_bias, attention_bias,
                                         reuse=False)
    train_prob = self.build_output(decoder_outputs, reuse=False)
    return encoder_outputs, decoder_inputs, train_prob
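The slice-and-concat in decoder_train implements the usual shift-right for teacher forcing: the decoder sees token t-1 when predicting token t. A toy sketch with hypothetical ids (bos_idx and the values of y are made up):

import tensorflow as tf

bos_idx = 2
y = tf.constant([[5, 6, 7, 8]])                   # (batch=1, dec_len=4)
start = tf.fill([1, 1], bos_idx)
# Drop the last target position and prepend <s>: [[5, 6, 7, 8]] -> [[2, 5, 6, 7]]
shifted = tf.concat([start, y[:, :-1]], axis=1)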
def call(self, inputs, targets):
    """Calculate target logits.

    Args:
        inputs: int tensor with shape [batch_size, input_length].
        targets: int tensor with shape [batch_size, target_length].

    Returns:
        Logits for each word in the target sequence: a float tensor with
        shape [batch_size, target_length, vocab_size].
    """
    # There is no deeper justification for this initializer; it simply works
    # well in practice.
    # initializer = tf.variance_scaling_initializer(
    #     self.params["initializer_gain"], mode="fan_avg",
    #     distribution="uniform")
    # with tf.variable_scope("Transformer", initializer=initializer):

    # All padding positions are marked with -1e9; all other positions are 0.
    attention_bias = model_utils.get_padding_bias(inputs)
    # Run the encoder to obtain a representation of the input sentence.
    encoder_outputs = self.encode(
        inputs, attention_bias)  # [batch, length, hidden_size]
    # Decode to produce the prediction logits.
    logits = self.decode(targets, encoder_outputs, attention_bias)
    return logits
def translate():
    source = ["Es un gran honor conocerte aqui.",
              "Me gustaría hablar contigo sobre lo que ocurrió ayer en la escuela.",
              "Tom tiene una hermana que puede hablar francés.",
              "Soy un estudiante de la Universidad."]
    source = [load_data.preprocess_sentence(s) for s in source]
    input_tensor = [[inp_lang.word2idx[s] for s in sp.split(' ')]
                    for sp in source]
    input_tensor = tf.keras.preprocessing.sequence.pad_sequences(
        input_tensor, maxlen=params["max_length_input"], padding='post')

    # Run the encoder.
    attention_bias = model_utils.get_padding_bias(
        input_tensor)  # [batch, 1, 1, length]
    encoder_outputs = model.encode(
        input_tensor, attention_bias)  # [batch, length, hidden_size]

    print("---------Decoder-----------")
    # Run the decoder to predict the output token ids.
    IDS = predict(encoder_outputs, attention_bias)
    for i in range(len(source)):
        word = " ".join([targ_lang.idx2word[w] for w in IDS[i]])
        print("----------")
        print(source[i])
        print(word)
        print("----------\n")
def Embedding(self, x):
    """Embed token ids and add positional encodings.

    Args:
        x: int tensor with shape [batch_size, length].

    Returns:
        float tensor with shape [batch_size, length, hidden_size].
    """
    hparams = self.hparams
    if hparams['embedding_model'] == 'transformer':
        self.embedding_layer = embedding_layer.EmbeddingSharedWeights(
            hparams["vocab_size"], hparams["hidden_size"])
        embedded_inputs = self.embedding_layer(x)
        with tf.name_scope("add_pos_encoding"):
            length = tf.shape(embedded_inputs)[1]
            pos_encoding = model_utils.get_position_encoding(
                length, hparams["hidden_size"])
            encoder_inputs = embedded_inputs + pos_encoding
        if self.hparams['train']:
            encoder_inputs = tf.nn.dropout(
                encoder_inputs,
                rate=self.hparams["layer_postprocess_dropout"])
        self.inputs_padding = model_utils.get_padding(x)
        self.attention_bias = model_utils.get_padding_bias(x)
        return encoder_inputs
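get_position_encoding is assumed here to be the standard sinusoidal encoding from the reference Transformer implementation. A self-contained sketch of that computation, assuming hidden_size is an even Python int:

import math
import tensorflow as tf

def sinusoidal_position_encoding(length, hidden_size,
                                 min_timescale=1.0, max_timescale=1.0e4):
    # One timescale per channel pair; sin on the first half of the channels,
    # cos on the second half.
    position = tf.cast(tf.range(length), tf.float32)
    num_timescales = hidden_size // 2
    log_timescale_increment = (
        math.log(float(max_timescale) / float(min_timescale)) /
        (num_timescales - 1))
    inv_timescales = min_timescale * tf.exp(
        tf.cast(tf.range(num_timescales), tf.float32) *
        -log_timescale_increment)
    scaled_time = (tf.expand_dims(position, 1) *
                   tf.expand_dims(inv_timescales, 0))
    return tf.concat([tf.sin(scaled_time), tf.cos(scaled_time)], axis=1)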
def __call__(self, inputs, padnum, pos):
    initializer = tf.variance_scaling_initializer(1, mode="fan_avg",
                                                  distribution="uniform")
    with tf.variable_scope("Transformer", initializer=initializer):
        attention_bias = model_utils.get_padding_bias(padnum)
        encoderout = self.encode(inputs, attention_bias, padnum, pos)
        return encoderout
def call(self, inputs, training):
    """Calculate target logits or inferred target sequences.

    Args:
        inputs: input tensor list of size 1 or 2.
            First item, inputs: int tensor with shape
                [batch_size, input_length].
            Second item (optional), targets: None or int tensor with shape
                [batch_size, target_length].
        training: boolean, whether in training mode or not.

    Returns:
        If targets is defined, logits for each word in the target sequence:
        a float tensor with shape [batch_size, target_length, vocab_size].
        If targets is None, the output sequence is generated one token at a
        time and a dictionary is returned: {
            outputs: [batch_size, decoded length]
            scores: [batch_size, float]}
        Even when float16 is used, the output tensor(s) are always float32.

    Raises:
        NotImplementedError: If the padded decode method is used on CPU/GPUs.
    """
    if len(inputs) == 2:
        inputs, targets = inputs[0], inputs[1]
    else:
        # Decoding path.
        inputs, targets = inputs[0], None
        if self.params["padded_decode"]:
            if not self.params["num_replicas"]:
                raise NotImplementedError(
                    "Padded decoding on CPU/GPUs is not supported.")
            decode_batch_size = int(self.params["decode_batch_size"] /
                                    self.params["num_replicas"])
            inputs.set_shape(
                [decode_batch_size, self.params["decode_max_length"]])

    # Variance scaling is used here because it seems to work in many problems.
    # Other reasonable initializers may also work just as well.
    with tf.name_scope("Transformer"):
        # Calculate attention bias for encoder self-attention and decoder
        # multi-headed attention layers.
        attention_bias = model_utils.get_padding_bias(inputs)

        # Run the inputs through the encoder layer to map the symbol
        # representations to continuous representations.
        encoder_outputs = self.encode(inputs, attention_bias, training)

        # Generate output sequence if targets is None, or return logits if
        # target sequence is known.
        if targets is None:
            return self.predict(encoder_outputs, attention_bias, training)
        else:
            logits = self.decode(targets, encoder_outputs, attention_bias,
                                 training)
            return logits
def test_get_padding_bias(self):
    x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
    bias = model_utils.get_padding_bias(x)
    bias_shape = tf.shape(bias)
    flattened_bias = tf.reshape(bias, [3, 5])
    self.assertAllEqual([[0, NEG_INF, NEG_INF, NEG_INF, 0],
                         [0, 0, NEG_INF, NEG_INF, NEG_INF],
                         [NEG_INF, 0, 0, NEG_INF, 0]],
                        flattened_bias)
    self.assertAllEqual([3, 1, 1, 5], bias_shape)
def test_get_padding_bias(self):
    x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
    bias = model_utils.get_padding_bias(x)
    bias_shape = tf.shape(input=bias)
    flattened_bias = tf.reshape(bias, [3, 5])
    with self.test_session() as sess:
        flattened_bias, bias_shape = sess.run((flattened_bias, bias_shape))
    self.assertAllEqual([[0, NEG_INF, NEG_INF, NEG_INF, 0],
                         [0, 0, NEG_INF, NEG_INF, NEG_INF],
                         [NEG_INF, 0, 0, NEG_INF, 0]],
                        flattened_bias)
    self.assertAllEqual([3, 1, 1, 5], bias_shape)
def test_get_padding_bias(self):
    x = tf.constant([[1, 0, 0, 0, 2], [3, 4, 0, 0, 0], [0, 5, 6, 0, 7]])
    bias = model_utils.get_padding_bias(x)
    bias_shape = tf.shape(bias)
    flattened_bias = tf.reshape(bias, [3, 5])
    with self.test_session() as sess:
        flattened_bias, bias_shape = sess.run((flattened_bias, bias_shape))
    self.assertAllEqual([[0, NEG_INF, NEG_INF, NEG_INF, 0],
                         [0, 0, NEG_INF, NEG_INF, NEG_INF],
                         [NEG_INF, 0, 0, NEG_INF, 0]],
                        flattened_bias)
    self.assertAllEqual([3, 1, 1, 5], bias_shape)
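The tests above pin down the contract of model_utils.get_padding_bias: token id 0 is treated as padding, padded positions receive a large negative bias, and the result has shape [batch_size, 1, 1, length]. A minimal sketch consistent with those assertions (illustrative names, not the actual implementation):

import tensorflow as tf

_NEG_INF = -1e9

def get_padding_bias_sketch(x, padding_value=0):
    # 1.0 where the token id equals the padding value, 0.0 elsewhere.
    padding = tf.cast(tf.equal(x, padding_value), tf.float32)
    bias = padding * _NEG_INF                                    # [batch, length]
    # Add two singleton axes so the bias broadcasts over heads and queries.
    return tf.expand_dims(tf.expand_dims(bias, axis=1), axis=1)  # [batch, 1, 1, length]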
def call(self, inputs, training):
    if len(inputs) == 2:
        inputs, targets = inputs[0], inputs[1]
    else:
        inputs, targets = inputs[0], None

    with tf.name_scope('Transformer'):
        attention_bias = model_utils.get_padding_bias(inputs)
        encoder_outputs = self.encode(inputs, attention_bias, training)
        if targets is None:
            return self.predict(encoder_outputs, attention_bias, training)
        else:
            logits = self.decode(targets, encoder_outputs, attention_bias,
                                 training)
            return logits
def decoder_infer(self, x):
    dec_bias = model_utils.get_decoder_self_attention_bias(self.max_dec_len)
    attention_bias = model_utils.get_padding_bias(x)

    # Encoder
    encoder_emb_inp = self.build_embed(x, encoder=True, reuse=True)
    encoder_outputs = self.build_encoder(x, encoder_emb_inp, attention_bias,
                                         reuse=True)

    # Decoder
    batch_size = tf.shape(x)[0]
    start_tokens = tf.fill([batch_size, 1], self.bos_idx)  # 2: <s> ID
    next_decoder_inputs = tf.concat(
        [start_tokens,
         tf.zeros([batch_size, self.max_dec_len - 1], dtype=tf.int32)],
        axis=1)  # [batch_size, dec_len]

    # Predict the output in a loop, filling the next token into the decoder
    # input after each step.
    for i in range(1, self.max_dec_len):
        decoder_emb_inp = self.build_embed(next_decoder_inputs, encoder=False,
                                           reuse=True)
        decoder_outputs = self.build_decoder(decoder_emb_inp, encoder_outputs,
                                             dec_bias, attention_bias,
                                             reuse=True)
        logits = self.build_output(decoder_outputs, reuse=True)
        next_decoder_inputs = self._filled_next_token(next_decoder_inputs,
                                                      logits, i)

    # Drop the start token and pad the sequence back to max_dec_len.
    decoder_input_start_1 = tf.slice(next_decoder_inputs, [0, 1],
                                     [batch_size, self.max_dec_len - 1])
    output_token = tf.concat(
        [decoder_input_start_1, tf.zeros([batch_size, 1], dtype=tf.int32)],
        axis=1)
    return output_token
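_filled_next_token itself is not shown above. A hypothetical sketch of what such a helper could do, writing the argmax prediction from position i-1 into slot i of the running decoder input (all names and shapes here are assumptions):

import tensorflow as tf

def filled_next_token_sketch(decoder_inputs, logits, i):
    # logits: [batch_size, dec_len, vocab_size]; pick the token predicted at
    # the previous position.
    next_token = tf.cast(tf.argmax(logits[:, i - 1, :], axis=-1), tf.int32)
    dec_len = decoder_inputs.get_shape().as_list()[1]  # assumes static length
    # Mask that is 1 only at column i, used to splice the new token in.
    mask = tf.one_hot(i, dec_len, dtype=tf.int32)          # [dec_len]
    update = tf.expand_dims(next_token, 1) * mask          # [batch, dec_len]
    return decoder_inputs * (1 - mask) + update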
def __call__(self, inputs, input_types, targets=None):
    """Calculate target logits or inferred target sequences.

    Args:
        inputs: int tensor with shape [batch_size, input_length].
        input_types: tensor passed to the encoder together with inputs.
        targets: None or int tensor with shape [batch_size, target_length].

    Returns:
        A tuple whose second element is the encoder outputs. The first
        element is:
        - logits for each word in the target sequence, a float tensor with
          shape [batch_size, target_length, vocab_size], if targets is given;
        - otherwise, the output sequence generated one token at a time, as a
          dictionary {
              output: [batch_size, decoded length]
              score: [batch_size, float]}
    """
    # Variance scaling is used here because it seems to work in many problems.
    # Other reasonable initializers may also work just as well.
    initializer = tf.variance_scaling_initializer(
        self.params["initializer_gain"], mode="fan_avg",
        distribution="uniform")
    # initializer = tf.truncated_normal_initializer(
    #     stddev=self.params["initializer_range"])
    with tf.variable_scope("Transformer", reuse=tf.AUTO_REUSE,
                           initializer=initializer):
        # Calculate attention bias for encoder self-attention and decoder
        # multi-headed attention layers.
        attention_bias = model_utils.get_padding_bias(inputs)

        # Run the inputs through the encoder layer to map the symbol
        # representations to continuous representations.
        encoder_outputs = self.encode(inputs, attention_bias, input_types)

        # Generate output sequence if targets is None, or return logits if
        # target sequence is known.
        if targets is None:
            return (self.predict(encoder_outputs, attention_bias),
                    encoder_outputs)
        else:
            logits = self.decode(targets, encoder_outputs, attention_bias)
            return logits, encoder_outputs
tf_pred_res = tf_sess.run(tf_pred, feed_dict={tf_input_x_raw: my_input_x_raw})
print("tf prediction:")
with printoptions(threshold=2000):
    print(tf_pred_res)

k_transformer = KTransformer(params)
k_input_x_raw = Input(shape=(_seq_len_x, ))
k_input_y_raw = Input(shape=(_seq_len_y, ))
k_embedded_inputs = k_transformer.embedding_softmax_layer(k_input_x_raw)
k_pos_encoding = k_model_utils.get_position_encoding(
    seq_len_x, k_transformer.params.hidden_size)
k_embedding_inputs = k_embedded_inputs + k_pos_encoding
k_attention_bias = k_model_utils.get_padding_bias(k_input_x_raw)
k_encoder_outputs = k_transformer.encode(k_input_x_raw, k_attention_bias,
                                         train=False)
k_output = k_transformer([k_input_x_raw, k_input_y_raw], train=False)

tf_sess.run(tf.global_variables_initializer())
tf_sess.run(get_assign_list(k_transformer))

k_run = K.function([k_input_x_raw, k_input_y_raw], [k_output])
k_res = k_run([my_input_x_raw, my_input_y_raw])[0]
print("k output:")
with printoptions(precision=3, suppress=True):
    print(k_res)