def decode(self, ys, memory, src_masks, training=True):
    '''
    memory: encoder outputs. (N, T1, d_model)
    src_masks: (N, T1)
    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    '''
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, seqlens, sents2 = ys

        # tgt_masks
        tgt_masks = tf.math.equal(decoder_inputs, 0)  # (N, T2)

        # embedding
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model ** 0.5  # scale

        if training:
            dec += positional_encoding(dec, self.hp.maxlen2)
        else:
            dec += positional_encoding(dec, 1000)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention (note that causality is True here)
                dec = multihead_attention(
                    queries=dec, keys=dec, values=dec, key_masks=tgt_masks,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=True, scope="self_attention")

                # Vanilla attention
                dec = multihead_attention(
                    queries=dec, keys=memory, values=memory, key_masks=src_masks,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=False, scope="vanilla_attention")

                # Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

    # Final linear projection (embedding weights are shared)
    weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
    logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
    y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

    return logits, y_hat, y, sents2
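# For context, a rough sketch of how a decode like the one above is typically driven at
# inference time: encode once, then repeatedly re-run decode on a growing target prefix,
# appending the greedy argmax each step. The <s> lookup via self.token2idx and the ys tuple
# layout are assumptions carried over from the surrounding snippets, not guaranteed API.
def greedy_infer(self, xs, ys):
    decoder_inputs, y, y_seqlen, sents2 = ys
    memory, sents1, src_masks = self.encode(xs, training=False)

    # start every target with a single <s> token (assumed id lookup)
    decoder_inputs = tf.ones((tf.shape(xs[0])[0], 1), tf.int32) * self.token2idx["<s>"]
    ys = (decoder_inputs, y, y_seqlen, sents2)

    # note: this unrolls one copy of the decode graph per step, as in simple greedy evaluation
    for _ in range(self.hp.maxlen2):
        logits, y_hat, y, sents2 = self.decode(ys, memory, src_masks, training=False)
        _decoder_inputs = tf.concat((decoder_inputs, y_hat), 1)
        ys = (_decoder_inputs, y, y_seqlen, sents2)
    return y_hat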
def representation(self, xs, ys, training=True):
    with tf.variable_scope("representation", reuse=tf.AUTO_REUSE):
        x = xs
        y = ys
        # print(x)
        # print(y)

        # embedding
        encx = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        encx *= self.hp.d_model ** 0.5  # scale
        encx += positional_encoding(encx, self.hp.maxlen)
        encx = tf.layers.dropout(encx, self.hp.dropout_rate, training=training)

        ency = tf.nn.embedding_lookup(self.embeddings, y)  # (N, T1, d_model)
        ency *= self.hp.d_model ** 0.5  # scale
        ency += positional_encoding(ency, self.hp.maxlen)
        ency = tf.layers.dropout(ency, self.hp.dropout_rate, training=training)

        # add layer normalization
        encx = ln(encx)
        ency = ln(ency)

        # Blocks
        x_layer = []
        y_layer = []
        for i in range(self.hp.num_extract_blocks + self.hp.num_inter_blocks):
            if i < self.hp.num_extract_blocks:
                encx = self.base_blocks(encx, encx, training=training, scope="num_blocks_{}".format(i))
                ency = self.base_blocks(ency, ency, training=training, scope="num_blocks_{}".format(i))
                # encx, ency = localInference(encx, ency)
                x_layer.append(encx)
                y_layer.append(ency)
            else:
                encx, ency = self.inter_blocks(encx, ency, training=training, scope="num_blocks_{}".format(i))
                # encx, ency = localInference(encx, ency)
                x_layer.append(encx)
                y_layer.append(ency)

    return x_layer, y_layer
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # src_masks
        src_masks = tf.math.equal(x, 0)  # (N, T1)

        # embedding1
        enc1 = tf.nn.embedding_lookup(self.embeddings1, x)  # (N, T1, d_model)
        enc1 *= self.hp.d_model ** 0.5  # scale
        enc1 += positional_encoding(enc1, self.hp.maxlen1)
        enc1 = tf.layers.dropout(enc1, self.hp.dropout_rate, training=training)

        # embedding2
        enc2 = tf.nn.embedding_lookup(self.embeddings2, x)  # (N, T1, d_model)
        enc2 *= self.hp.d_model ** 0.5  # scale
        enc2 += positional_encoding(enc2, self.hp.maxlen1)
        enc2 = tf.layers.dropout(enc2, self.hp.dropout_rate, training=training)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc1, enc2 = multihead_attention(
                    queries=(enc1, enc2), keys=enc1, values=enc2, key_masks=src_masks,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=False)
                # feed forward
                enc1 = ff(enc1, num_units=[self.hp.d_ff, self.hp.d_model])
                enc2 = ff(enc2, num_units=[self.hp.d_ff, self.hp.d_model])

    memory = (enc1, enc2)
    return memory, sents1, src_masks
def _encode(self, enc, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        # embedding
        enc *= self.arg.d_model ** 0.5  # scale
        enc += positional_encoding(enc, self.arg.maxlen1)
        enc = tf.layers.dropout(enc, self.arg.dropout_rate, training=training)

        # Blocks
        for i in range(self.arg.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc, keys=enc, values=enc,
                    num_heads=self.arg.num_heads, dropout_rate=self.arg.dropout_rate,
                    training=training, causality=False)
    memory = enc
    return memory
def time_encode(self, encoder_inputs):
    '''
    Returns
    memory: encoder outputs. (BATCH, SEQ_LEN, HIDDEN_SIZE)
    '''
    with tf.variable_scope("time_encoder", reuse=tf.AUTO_REUSE):
        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, encoder_inputs)
        enc *= hp.HIDDEN_SIZE ** 0.5
        enc += positional_encoding(enc, hp.MAX_LEN)
        enc = tf.nn.dropout(enc, self.dropout)

        # Blocks
        for i in range(hp.NUM_BLOCKS):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc, keys=enc, values=enc,
                    num_heads=hp.NUM_HEADS, dropout=self.dropout, causality=True)
                # feed forward
                enc = ff(enc, num_units=[hp.FF_SIZE, hp.HIDDEN_SIZE])

        output = tf.reshape(enc, (-1, hp.MAX_LEN, hp.HIDDEN_SIZE))
        logits = tf.layers.dense(output, len(self.token2idx))
    return logits
def build_embedding_layer(self, inputs, reuse=None):
    self.emb_char = embedding(inputs, vocab_size=self.vocab_size, num_units=self.hidden_units,
                              scale=True, scope="emb_char", reuse=reuse)
    self.emb_char_pos = self.emb_char
    if self.emb_pos_type == 'sin':
        self.emb_char_pos += positional_encoding(inputs, num_units=self.hidden_units,
                                                 zero_pad=False, scale=False,
                                                 scope="emb_pos", reuse=reuse)
    else:
        self.emb_char_pos += embedding(
            tf.tile(tf.expand_dims(tf.range(tf.shape(inputs)[1]), 0), [tf.shape(inputs)[0], 1]),
            vocab_size=self.maxlen, num_units=self.hidden_units,
            zero_pad=False, scale=False, scope="emb_pos", reuse=reuse)
    self.emb = tf.layers.dropout(self.emb_char_pos, rate=self.dropout)
    return self.emb
def decode(self, decoder_inputs, memory, src_masks, training=True):
    '''
    memory: encoder outputs. (N, T1, d_model)
    src_masks: (N, T1)
    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    '''
    scopes = []
    outputs = []
    with tf.variable_scope("decoder_embedding_lookup", reuse=tf.AUTO_REUSE):
        # tgt_masks
        tgt_masks = tf.math.equal(decoder_inputs, 0)  # (N, T2)

        # embedding
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model ** 0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen2)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)
        scopes.append(tf.get_variable_scope().name)
        outputs.append(dec)

    # Blocks
    for i in range(self.hp.num_blocks):
        with tf.variable_scope("decoder_num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
            # Masked self-attention (note that causality is True here)
            dec = multihead_attention(
                queries=dec, keys=dec, values=dec, key_masks=tgt_masks,
                num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                training=training, causality=True, scope="self_attention")

            # Vanilla attention
            dec = multihead_attention(
                queries=dec, keys=memory, values=memory, key_masks=src_masks,
                num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                training=training, causality=False, scope="vanilla_attention")

            # Feed Forward
            dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])
            scopes.append(tf.get_variable_scope().name)
            outputs.append(dec)

    return dec, outputs, scopes
def transformer_encode(enc, config, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("Transformer", reuse=tf.AUTO_REUSE):
        # embedding
        enc *= config.d_model ** 0.5  # scale
        enc += positional_encoding(enc, config.max_sent_num)
        enc = tf.layers.dropout(enc, config.drop_rate, training=training)

        # Blocks
        for i in range(config.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc, keys=enc, values=enc,
                    num_heads=config.num_heads, dropout_rate=config.drop_rate,
                    training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[config.d_ff, config.d_model])
    memory = enc
    return memory
def encode(self, xs, training=True):
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model ** 0.5  # scale
        enc += positional_encoding(enc, self.hp.maxlen1)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc, keys=enc, values=enc,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
    memory = enc
    return memory, sents1
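# The `positional_encoding` helper used throughout these snippets is not shown here, and its
# signature varies between them. The sketch below is one common sinusoidal implementation
# matching the (inputs, maxlen) call style of the encode/decode functions above; it is an
# assumption, not necessarily the exact helper these models were trained with.
import numpy as np
import tensorflow as tf

def positional_encoding(inputs, maxlen, masking=True, scope="positional_encoding"):
    '''Sinusoidal positional encoding with the same (N, T, E) shape as `inputs`.'''
    E = inputs.get_shape().as_list()[-1]  # static d_model
    N, T = tf.shape(inputs)[0], tf.shape(inputs)[1]
    with tf.variable_scope(scope, reuse=tf.AUTO_REUSE):
        # (N, T) position indices
        position_ind = tf.tile(tf.expand_dims(tf.range(T), 0), [N, 1])

        # (maxlen, E) lookup table: sine on even dimensions, cosine on odd dimensions
        position_enc = np.array(
            [[pos / np.power(10000, (i - i % 2) / E) for i in range(E)]
             for pos in range(maxlen)])
        position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])
        position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])
        position_enc = tf.convert_to_tensor(position_enc, tf.float32)  # (maxlen, E)

        outputs = tf.nn.embedding_lookup(position_enc, position_ind)  # (N, T, E)
        if masking:
            # zero the encoding wherever the input embedding is zero (padding positions)
            outputs = tf.where(tf.equal(inputs, 0), inputs, outputs)
        return tf.to_float(outputs)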
def __init__(self, sess, n_mid, embedding_dim, hidden_size, batch_size, num_interest,
             dropout_rate=0.2, seq_len=256, num_blocks=2):
    super(Model_SAKmeans, self).__init__(n_mid, embedding_dim, hidden_size,
                                         batch_size, seq_len, flag="Model_SAKmeans")

    with tf.variable_scope("Model_SAKmeans", reuse=tf.AUTO_REUSE) as scope:
        # Positional Encoding
        t = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
        self.mid_his_batch_embedded += t

        # Dropout
        self.seq = tf.layers.dropout(self.mid_his_batch_embedded, rate=dropout_rate,
                                     training=tf.convert_to_tensor(True))
        self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

        # Build blocks
        for i in range(num_blocks):
            with tf.variable_scope("num_blocks_%d" % i):
                # Self-attention
                self.seq = multihead_attention(queries=normalize(self.seq), keys=self.seq,
                                               num_units=hidden_size, num_heads=num_interest,
                                               dropout_rate=dropout_rate, is_training=True,
                                               causality=True, scope="self_attention")
                # Feed forward
                self.seq = feedforward(normalize(self.seq), num_units=[hidden_size, hidden_size],
                                       dropout_rate=dropout_rate, is_training=True)
                self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

        # (b, seq_len, dim)
        self.seq = normalize(self.seq)

        num_heads = num_interest
        self.user_eb = getKVector(sess, self.seq, num_heads)
        self.dim = embedding_dim
        item_list_emb = tf.reshape(self.seq, [-1, seq_len, embedding_dim])

        # atten: (batch, num_heads, dim) * (batch, dim, 1) = (batch, num_heads, 1)
        atten = tf.matmul(self.user_eb,
                          tf.reshape(self.item_eb, [get_shape(item_list_emb)[0], self.dim, 1]))
        atten = tf.nn.softmax(tf.pow(tf.reshape(atten, [get_shape(item_list_emb)[0], num_heads]), 1))

        # pick the user-interest vector most similar to the target item
        readout = tf.gather(tf.reshape(self.user_eb, [-1, self.dim]),
                            tf.argmax(atten, axis=1, output_type=tf.int32) +
                            tf.range(tf.shape(item_list_emb)[0]) * num_heads)

        self.build_sampled_softmax_loss(self.item_eb, readout)
def encode(self, x, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    scopes = []
    outputs = []
    with tf.variable_scope("embeddings", reuse=tf.AUTO_REUSE):
        self.token2idx, self.idx2token = load_vocab(self.hp.vocab)
        self.embeddings = get_token_embeddings(self.hp.vocab_size, self.hp.d_model, zero_pad=True)
        scopes.append(tf.get_variable_scope().name)
        outputs.append(self.embeddings)

    with tf.variable_scope("encoder_embedding_lookup", reuse=tf.AUTO_REUSE):
        # src_masks
        src_masks = tf.math.equal(x, 0)  # (N, T1)

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model ** 0.5  # scale
        enc += positional_encoding(enc, self.hp.maxlen1)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)
        scopes.append(tf.get_variable_scope().name)
        outputs.append(enc)

    # Blocks
    for i in range(self.hp.num_blocks):
        with tf.variable_scope("encoder_num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
            # self-attention
            enc = multihead_attention(
                queries=enc, keys=enc, values=enc, key_masks=src_masks,
                num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                training=training, causality=False)
            # feed forward
            enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
            scopes.append(tf.get_variable_scope().name)
            outputs.append(enc)

    memory = enc
    return memory, src_masks, outputs, scopes
def encode(self, xs, training=True):
    '''
    xs: training data
    Returns
    memory: encoder outputs. (N, T1, d_model)
        N: batch size; T1: sentence length
        d_model: 512, the word-embedding size
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        # xs: tuple of
        #   x: int32 tensor. (N, T1)
        #   x_seqlens: int32 tensor. (N,) sentence lengths
        #   sents1: str tensor. (N,)
        x, seqlens, sents1 = xs

        # src_masks
        src_masks = tf.math.equal(x, 0)  # (N, T1)

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model ** 0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1)  # add the positional-encoding vectors
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        # Encoder blocks:
        # num_blocks=6 sub-modules, each consisting of multihead_attention + feed forward
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc, keys=enc, values=enc, key_masks=src_masks,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
    memory = enc
    return memory, sents1, src_masks
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # src_masks
        src_masks = tf.math.equal(x, 0)  # (N, T1)

        # embedding
        if self.hp.fac_embed:
            enc = tf.nn.embedding_lookup(self.embeddings1, x)  # (N, T1, d_embed)
            enc = tf.matmul(enc, self.embeddings2)  # (N, T1, d_model)
        else:
            enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model ** 0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        # Blocks
        for i in range(self.hp.num_blocks):
            if self.hp.share_weights:
                vs_name = "blocks_shared"
            else:
                vs_name = "num_blocks_{}".format(i)
            with tf.variable_scope(vs_name, reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc, keys=enc, values=enc, key_masks=src_masks,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
    memory = enc
    return memory, sents1, src_masks
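# The fac_embed branch above assumes two factors: a (vocab_size, d_embed) lookup table
# `embeddings1` and a (d_embed, d_model) projection `embeddings2`, cutting embedding
# parameters from V*d_model to V*d_embed + d_embed*d_model (ALBERT-style factorization).
# A hedged sketch of how those variables might be created (hypothetical helper name):
def get_factorized_embeddings(vocab_size, d_embed, d_model, zero_pad=True):
    with tf.variable_scope("shared_weight_matrix", reuse=tf.AUTO_REUSE):
        embeddings1 = tf.get_variable("weight_mat1", shape=(vocab_size, d_embed),
                                      initializer=tf.contrib.layers.xavier_initializer())
        embeddings2 = tf.get_variable("weight_mat2", shape=(d_embed, d_model),
                                      initializer=tf.contrib.layers.xavier_initializer())
        if zero_pad:
            # keep row 0 (the <pad> id) at exactly zero
            embeddings1 = tf.concat((tf.zeros(shape=[1, d_embed]), embeddings1[1:, :]), 0)
    return embeddings1, embeddings2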
def encode(self, xs, training=True):
    # Build the encoder.
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
        N: batch size
        T1: sentence length
        d_model: word-embedding dimension
    '''
    # What this does:
    # (1) input embeddings + positional_encoding
    # (2) the encoder stacks 6 blocks, each made of multihead attention and a feed-forward layer (ff)
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs

        # src_masks
        src_masks = tf.math.equal(x, 0)  # (N, T1)

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model ** 0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen1)  # add the positional vectors to the word embeddings
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc, keys=enc, values=enc, key_masks=src_masks,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
    memory = enc
    return memory, sents1, src_masks
def pre_encoder(self, x):
    with tf.variable_scope("pre_encoder", reuse=tf.AUTO_REUSE):
        # x, seqlens, sents1 = xs

        # src_masks
        src_masks = tf.math.equal(x, 0)  # (N, T1)

        # embedding
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model ** 0.5  # scale

        enc += positional_encoding(enc, self.hp.maxlen)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=self.is_training)

    return enc, src_masks
def __init__(self, n_mid, embedding_dim, hidden_size, batch_size, num_interest,
             dropout_rate=0.2, seq_len=256, num_blocks=2):
    super(Model_SASRec, self).__init__(n_mid, embedding_dim, hidden_size,
                                       batch_size, seq_len, flag="Model_SASRec")

    with tf.variable_scope("Model_SASRec", reuse=tf.AUTO_REUSE) as scope:
        # Positional Encoding
        t = tf.expand_dims(positional_encoding(embedding_dim, seq_len), axis=0)
        self.mid_his_batch_embedded += t

        # Dropout
        self.seq = tf.layers.dropout(self.mid_his_batch_embedded, rate=dropout_rate,
                                     training=tf.convert_to_tensor(True))
        self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

        # Build blocks
        for i in range(num_blocks):
            with tf.variable_scope("num_blocks_%d" % i):
                # Self-attention
                self.seq = multihead_attention(queries=normalize(self.seq), keys=self.seq,
                                               num_units=hidden_size, num_heads=num_interest,
                                               dropout_rate=dropout_rate, is_training=True,
                                               causality=True, scope="self_attention")
                # Feed forward
                self.seq = feedforward(normalize(self.seq), num_units=[hidden_size, hidden_size],
                                       dropout_rate=dropout_rate, is_training=True)
                self.seq *= tf.reshape(self.mask, (-1, seq_len, 1))

        # (b, seq_len, dim)
        self.seq = normalize(self.seq)

        self.sum_pooling = tf.reduce_sum(self.seq, 1)
        fc1 = tf.layers.dense(self.sum_pooling, 1024, activation=tf.nn.relu)
        fc2 = tf.layers.dense(fc1, 512, activation=tf.nn.relu)
        fc3 = tf.layers.dense(fc2, 256, activation=tf.nn.relu)
        self.user_eb = tf.layers.dense(fc3, hidden_size, activation=tf.nn.relu)

        self.build_sampled_softmax_loss(self.item_eb, self.user_eb)
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, video_path = xs

        # embedding
        enc = tf.layers.dense(x, self.d_model)

        # src_masks
        # src_masks = tf.math.equal(mask, 0)  # (N, T1)
        src_masks = tf.sequence_mask(seqlens)

        # enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        # enc *= self.hp.d_model**0.5  # scale
        enc /= self.hp.d_model ** 0.5
        enc += positional_encoding(enc, self.hp.n_video)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc, keys=enc, values=enc, key_masks=src_masks,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
    memory = enc
    return memory, src_masks
def encode(self, xs, training=True):
    '''
    Returns
    memory: encoder outputs. (N, T1, d_model)
    '''
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        x, seqlens, sents1 = xs
        # x = tf.Print(x, [x], message='x =', summarize=10)
        # print_sent = tf.Print(sents1, [sents1], message='sents1 =', summarize=3)
        # with tf.control_dependencies([print_sent]):

        # embedding
        # xs_pri = tf.print('xs =', tf.shape(x), summarize=3)
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc *= self.hp.d_model ** 0.5  # scale
        enc += positional_encoding(enc, self.hp.maxlen1)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)
        # enc_pri = tf.print('enc =', tf.shape(enc), enc, summarize=3)

        # Blocks
        # with tf.control_dependencies([xs_pri, enc_pri]):
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc, keys=enc, values=enc,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])
    memory = enc
    return memory, sents1
def __init__(self, num_layers, d_model, num_heads, d_ff, input_vocab_size,
             maximum_position_encoding, rate=0.1):
    super(Encoder, self).__init__()
    self.num_layers = num_layers
    self.d_model = d_model
    self.pos_enc = positional_encoding(maximum_position_encoding, d_model)
    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.dropout = tf.keras.layers.Dropout(rate)
    self.encoder_layers = [EncoderLayer(d_model, d_ff, num_heads, rate)
                           for _ in range(num_layers)]
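# For completeness, a call method that would pair naturally with this constructor. It is a
# sketch only: it assumes pos_enc has shape (1, maximum_position_encoding, d_model) and that
# EncoderLayer.call takes (x, training, mask), which may differ from the original code.
def call(self, x, training, mask):
    seq_len = tf.shape(x)[1]

    x = self.embedding(x)                                  # (batch, seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))   # scale
    x += self.pos_enc[:, :seq_len, :]                      # add positional encoding
    x = self.dropout(x, training=training)

    for layer in self.encoder_layers:
        x = layer(x, training, mask)
    return x                                               # (batch, seq_len, d_model)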
def _encode(self, x, seq_num, training=True, name=None):
    """
    Returns
    memory: encoder outputs. (N, T1, d_model)
    """
    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        # embedding
        x = tf.identity(x, "input_x")
        enc = tf.nn.embedding_lookup(self._embeddings[seq_num], x)  # (N, T1, d_model)
        enc *= self._context.d_model ** 0.5  # scale

        enc += positional_encoding(enc, self._context.maxlens[seq_num])
        enc = tf.layers.dropout(enc, self._context.dropout_rate, training=training)

        # Blocks
        for i in range(self._context.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=enc, keys=enc, values=enc,
                    num_heads=self._context.num_heads,
                    dropout_rate=self._context.dropout_rate,
                    training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[self._context.d_ff, self._context.d_model])
    memory = tf.identity(enc, name=name)
    return memory
def get_output(self, input, training, return_spectrogram=False, reuse=True):
    '''
    Creates the symbolic computation graph of the U-Net for a given input batch
    :param input: Input batch of mixtures, 3D tensor [batch_size, num_samples, 1], mono raw audio
    :param reuse: Whether to create new parameter variables or reuse existing ones
    :param return_spectrogram: Whether to output the spectrogram estimate or convert it to raw audio and return that
    :return: U-Net output. If return_spectrogram: accompaniment and voice magnitudes as a length-two list of 4D tensors.
             Otherwise: two 3D tensors containing the raw audio estimates.
    '''
    # Setup STFT computation
    window = functools.partial(window_ops.hann_window, periodic=True)
    inv_window = tf.contrib.signal.inverse_stft_window_fn(self.hop, forward_window_fn=window)

    with tf.variable_scope("separator", reuse=reuse):
        enc_outputs = list()

        # Compute spectrogram
        assert input.get_shape().as_list()[2] == 1  # Model works ONLY on mono
        stfts = tf.contrib.signal.stft(tf.squeeze(input, 2), frame_length=self.frame_len,
                                       frame_step=self.hop, fft_length=self.frame_len,
                                       window_fn=window)
        mix_mag = tf.abs(stfts)
        mix_angle = tf.angle(stfts)

        # Input for network
        mix_mag_norm = tf.log1p(tf.expand_dims(mix_mag, 3))
        # Cut off last frequency bin to make the number of frequency bins divisible by 2
        mix_mag_norm = mix_mag_norm[:, :, :-1, :]

        mags = dict()
        for name in self.source_names:
            current_layer = mix_mag_norm
            current_layer = tf.layers.conv2d(current_layer, 128, [3, 3], strides=[2, 2],
                                             activation=None, padding='same')
            current_layer = tf.contrib.layers.batch_norm(current_layer, activation_fn=LeakyReLU,
                                                         is_training=training)

            # Position Embedding
            current_shape = current_layer.get_shape().as_list()
            maxlen = current_shape[1] * current_shape[2]
            pos_inputs = tf.reshape(current_layer, [current_shape[0], maxlen, -1])
            pos_layer = positional_encoding(pos_inputs, maxlen, masking=False)
            pos_layer = tf.reshape(pos_layer, current_shape)
            current_layer += pos_layer

            # Down-convolution: repeat pool-conv
            for i in range(self.num_layers):
                assert (current_layer.get_shape().as_list()[1] % 2 == 0 and
                        current_layer.get_shape().as_list()[2] % 2 == 0)
                # block
                current_layer = tf_multihead_attention(
                    queries=current_layer, keys=current_layer, values=current_layer,
                    num_heads=8, dropout_rate=0.1, training=training, causality=False)
                current_layer = cnn(current_layer, training=training)

            # Compute mask
            mask = tf.layers.conv2d_transpose(current_layer, 1, [3, 3], strides=[2, 2],
                                              activation=tf.nn.sigmoid, padding="same")
            # Pad last frequency bin of the mask, which is missing since we removed it from the input
            mask = tf.pad(mask, [(0, 0), (0, 0), (0, 1), (0, 0)],
                          mode="CONSTANT", constant_values=0.5)
            mask = tf.squeeze(mask, 3)

            # Compute source magnitudes
            source_mag = tf.multiply(mix_mag, mask)
            mags[name] = source_mag

        if return_spectrogram:
            return mags
        else:
            audio_out = dict()
            # Reconstruct audio
            for source_name in mags.keys():
                stft = tf.multiply(tf.complex(mags[source_name], 0.0),
                                   tf.exp(tf.complex(0.0, mix_angle)))
                audio = tf.contrib.signal.inverse_stft(stft, self.frame_len, self.hop,
                                                       self.frame_len, window_fn=inv_window)
                # Reshape to [batch_size, samples, 1]
                audio = tf.expand_dims(audio, 2)
                audio_out[source_name] = audio
            return audio_out
def decode(self, ys, x_paraphrased_dict, memory, training=True):
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, seqlens, sents2 = ys
        x_paraphrased_dict, paraphrased_lens, paraphrased_sents = x_paraphrased_dict

        # embedding
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model ** 0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen2)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

        batch_size = tf.shape(decoder_inputs)[0]
        seqlens = tf.shape(decoder_inputs)[1]
        paraphrased_lens = tf.shape(x_paraphrased_dict)[1]

        # x_paraphrased_dict holds (original, paraphrased) word-id pairs in its last dimension
        x_paraphrased_o, x_paraphrased_p = x_paraphrased_dict[:, :, 0], x_paraphrased_dict[:, :, 1]
        x_paraphrased_o_embedding = tf.nn.embedding_lookup(self.embeddings, x_paraphrased_o)  # N, W2, d_model
        if self.hp.paraphrase_type == 0:
            x_paraphrased_p_embedding = tf.nn.embedding_lookup(self.embeddings, x_paraphrased_p)
        else:
            x_paraphrased_p_embedding = paraphrased_positional_encoding(
                x_paraphrased_p, self.hp.maxlen2, self.hp.d_model)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention (note that causality is True here)
                dec = multihead_attention(
                    queries=dec, keys=dec, values=dec,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=True, scope="self_attention")
                # Vanilla attention
                dec = multihead_attention(
                    queries=dec, keys=memory, values=memory,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=False, scope="vanilla_attention")
                # Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

        # add paraphrased dictionary attention
        h = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * \
            tf.expand_dims(dec, axis=2)
        o_embeding = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * \
            tf.expand_dims(x_paraphrased_o_embedding, axis=1)

        W_a_o = tf.get_variable("original_word_parameter_w", [2 * self.hp.d_model],
                                initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
        V_a_o = tf.get_variable("original_word_parameter_v", [2 * self.hp.d_model],
                                initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
        h_o_concat = tf.concat([h, o_embeding], -1)            # N, T2, W2, 2*d_model
        score_tem_o = tf.tanh(W_a_o * h_o_concat)              # N, T2, W2, 2*d_model
        score_o = tf.reduce_sum(V_a_o * score_tem_o, axis=-1)  # N, T2, W2
        a = tf.nn.softmax(score_o)                             # N, T2, W2
        c_o = tf.matmul(a, x_paraphrased_o_embedding)          # (N, T2, W2) * (N, W2, d_model) --> N, T2, d_model

        p_embeding = tf.fill([batch_size, seqlens, paraphrased_lens, self.hp.d_model], 1.0) * \
            tf.expand_dims(x_paraphrased_p_embedding, axis=1)
        W_a_p = tf.get_variable("paraphrased_word_parameter_w", [2 * self.hp.d_model],
                                initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
        V_a_p = tf.get_variable("paraphrased_word_parameter_v", [2 * self.hp.d_model],
                                initializer=tf.initializers.random_normal(stddev=0.01, seed=None))
        h_p_concat = tf.concat([h, p_embeding], -1)            # N, T2, W2, 2*d_model
        score_tem_p = tf.tanh(W_a_p * h_p_concat)              # N, T2, W2, 2*d_model
        score_p = tf.reduce_sum(V_a_p * score_tem_p, axis=-1)  # N, T2, W2
        a = tf.nn.softmax(score_p)                             # N, T2, W2
        c_p = tf.matmul(a, x_paraphrased_p_embedding)          # (N, T2, W2) * (N, W2, d_model) --> N, T2, d_model

        c_t = tf.concat([c_o, c_p], axis=-1)  # N, T2, d_model --> N, T2, 2*d_model
        out_dec = tf.layers.dense(tf.concat([dec, c_t], axis=-1), self.hp.d_model,
                                  activation=tf.tanh, use_bias=False,
                                  kernel_initializer=tf.initializers.random_normal(stddev=0.01, seed=None))

        # Final linear projection (embedding weights are shared)
        weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
        logits = tf.einsum('ntd,dk->ntk', out_dec, weights)  # (N, T2, vocab_size)
        y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

    return logits, y_hat, y, sents2
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()
        else:
            # x: (32, 10), y: (32, 10) -- a batch of 32 sentences, each of length 10
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

        """
        Define the decoder inputs.
        If the true target output is: i am a student </S>
        then the decoder input should be: <S> i am a student
        """
        # id 2 stands for <S>, the initial decoder input
        self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

        # vocabularies
        de2idx, idx2de = load_de_vocab()
        en2idx, idx2en = load_en_vocab()

        with tf.variable_scope("encoder"):
            # Embedding
            self.enc = embedding(self.x, vocab_size=len(de2idx), num_units=hp.hidden_units,
                                 zero_pad=True,  # keep the padding row at zero
                                 scale=True, scope="enc_embed")

            # Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x, num_units=hp.hidden_units,
                                                zero_pad=False, scale=False, scope='enc_pe')
            else:
                self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                                              [tf.shape(self.x)[0], 1]),
                                      vocab_size=hp.maxlen, num_units=hp.hidden_units,
                                      zero_pad=False, scale=False, scope="enc_pe")

            # Dropout
            self.enc = tf.layers.dropout(self.enc, rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(is_training))

            # Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    # MultiHead Attention
                    self.enc = multihead_attention(queries=self.enc, keys=self.enc,
                                                   num_units=hp.hidden_units, num_heads=hp.num_heads,
                                                   dropout_rate=hp.dropout_rate,
                                                   is_training=is_training, causality=False)
                    self.enc = feedforward(self.enc, num_units=[4 * hp.hidden_units, hp.hidden_units])

        with tf.variable_scope("decoder"):
            # Embedding
            self.dec = embedding(self.decoder_inputs, vocab_size=len(en2idx),
                                 num_units=hp.hidden_units, scale=True, scope="dec_embed")

            # Positional Encoding
            if hp.sinusoid:
                self.dec += positional_encoding(self.decoder_inputs, vocab_size=hp.maxlen,
                                                num_units=hp.hidden_units,
                                                zero_pad=False, scale=False, scope="dec_pe")
            else:
                self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                                              [tf.shape(self.decoder_inputs)[0], 1]),
                                      vocab_size=hp.maxlen, num_units=hp.hidden_units,
                                      zero_pad=False, scale=False, scope="dec_pe")

            # Dropout
            self.dec = tf.layers.dropout(self.dec, rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(is_training))

            # Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    # Multihead Attention (self-attention)
                    self.dec = multihead_attention(queries=self.dec, keys=self.dec,
                                                   num_units=hp.hidden_units, num_heads=hp.num_heads,
                                                   dropout_rate=hp.dropout_rate,
                                                   is_training=is_training, causality=True,
                                                   scope="self_attention")
                    # Multihead Attention (vanilla attention)
                    self.dec = multihead_attention(queries=self.dec, keys=self.enc,
                                                   num_units=hp.hidden_units, num_heads=hp.num_heads,
                                                   dropout_rate=hp.dropout_rate,
                                                   is_training=is_training, causality=False,
                                                   scope="vanilla_attention")
                    # Feed Forward
                    self.dec = feedforward(self.dec, num_units=[4 * hp.hidden_units, hp.hidden_units])

        # Final linear projection
        self.logits = tf.layers.dense(self.dec, len(en2idx))
        self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1))
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
                                 (tf.reduce_sum(self.istarget)))

        if is_training:
            # Loss
            # Label smoothing: turn the 0s of the one-hot targets into a small value
            # and the 1s into a value slightly below 1.
            self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(en2idx)))
            self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits,
                                                                labels=self.y_smoothed)
            self.mean_loss = tf.reduce_sum(self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                    beta1=0.9, beta2=0.98, epsilon=1e-8)
            self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)

            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()
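# A minimal usage sketch for the training graph above. The Graph class name, hp.num_epochs,
# and the logdir path are assumptions used only for illustration.
if __name__ == "__main__":
    g = Graph(is_training=True)
    with g.graph.as_default():
        sv = tf.train.Supervisor(logdir="logdir", save_model_secs=0)
        with sv.managed_session() as sess:
            for epoch in range(1, hp.num_epochs + 1):
                for _ in range(g.num_batch):
                    sess.run(g.train_op)
                gs = sess.run(g.global_step)
                sv.saver.save(sess, "logdir/model_epoch_%02d_gs_%d" % (epoch, gs))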
def build_model(self):
    # define decoder inputs
    self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)  # 2:<S>

    # Encoder
    with tf.variable_scope("encoder"):
        # Embedding
        self.enc = embedding(self.x, vocab_size=len(self.de2idx), num_units=hp.emb_dim,
                             scale=True, scope="enc_embed")
        sign = tf.sign(tf.reduce_sum(tf.abs(self.enc), axis=-1))
        key_masks = tf.expand_dims(sign, -1)

        # Positional Encoding
        if hp.sinusoid:
            self.enc += positional_encoding(self.x, num_units=hp.emb_dim,
                                            zero_pad=False, scale=False, scope="enc_pe")
        else:
            self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                                          [tf.shape(self.x)[0], 1]),
                                  vocab_size=hp.maxlen, num_units=hp.emb_dim,
                                  zero_pad=False, scale=False, scope="enc_pe")
        self.enc *= key_masks

        # Dropout
        self.enc = tf.layers.dropout(self.enc, rate=hp.dropout_rate,
                                     training=tf.convert_to_tensor(self.is_training))

        # Blocks
        for i in range(hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i)):
                # Multihead Attention
                self.enc = multihead_attention(queries=self.enc, keys=self.enc,
                                               num_units=hp.emb_dim, num_heads=hp.num_heads,
                                               dropout_rate=hp.dropout_rate,
                                               is_training=self.is_training, causality=False)
                # Feed Forward
                self.enc = feedforward(self.enc, num_units=[4 * hp.emb_dim, hp.emb_dim])

    # Decoder
    with tf.variable_scope("decoder"):
        # Embedding
        self.dec = embedding(self.decoder_inputs, vocab_size=len(self.en2idx),
                             num_units=hp.emb_dim, scale=True, scope="dec_embed")
        key_masks = tf.expand_dims(tf.sign(tf.reduce_sum(tf.abs(self.dec), axis=-1)), -1)

        # Positional Encoding
        if hp.sinusoid:
            self.dec += positional_encoding(self.decoder_inputs, num_units=hp.emb_dim,
                                            zero_pad=False, scale=False, scope="dec_pe")
        else:
            self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                                          [tf.shape(self.decoder_inputs)[0], 1]),
                                  vocab_size=hp.maxlen, num_units=hp.emb_dim,
                                  zero_pad=False, scale=False, scope="dec_pe")
        self.dec *= key_masks

        # Dropout
        self.dec = tf.layers.dropout(self.dec, rate=hp.dropout_rate,
                                     training=tf.convert_to_tensor(self.is_training))

        # Blocks
        for i in range(hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i)):
                # Multihead Attention (self-attention)
                self.dec = multihead_attention(queries=self.dec, keys=self.dec,
                                               num_units=hp.emb_dim, num_heads=hp.num_heads,
                                               dropout_rate=hp.dropout_rate,
                                               is_training=self.is_training, causality=True,
                                               scope="self_attention")
                # Multihead Attention (vanilla attention)
                self.dec = multihead_attention(queries=self.dec, keys=self.enc,
                                               num_units=hp.emb_dim, num_heads=hp.num_heads,
                                               dropout_rate=hp.dropout_rate,
                                               is_training=self.is_training, causality=False,
                                               scope="vanilla_attention")
                # Feed Forward
                self.dec = feedforward(self.dec, num_units=[4 * hp.emb_dim, hp.emb_dim])

    # Final linear projection
    self.logits = tf.layers.dense(self.dec, len(self.en2idx))
    self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1))
def decode(self, xs, ys, memory, training=True):
    '''
    memory: encoder outputs. (N, T1, d_model)
    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    '''
    self.memory = memory
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, sents2 = ys
        x, _ = xs

        # embedding
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model ** 0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen2)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

        attn_dists = []
        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention (note that causality is True here)
                dec, _ = multihead_attention(
                    queries=dec, keys=dec, values=dec,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=True, scope="self_attention")

                # Vanilla attention
                dec, attn_dist = multihead_attention(
                    queries=dec, keys=self.memory, values=self.memory,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=False, scope="vanilla_attention")
                attn_dists.append(attn_dist)

                # Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

    # Final linear projection (embedding weights are shared)
    weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
    logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)

    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        gens = tf.layers.dense(logits, 1, activation=tf.sigmoid, trainable=training, use_bias=False)

    logits = tf.nn.softmax(logits)

    # final distribution
    logits = self._calc_final_dist(x, gens, logits, attn_dists[-1])

    return logits, y, sents2
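# _calc_final_dist is not shown. In pointer-generator style decoders it usually mixes the
# generation distribution with the copy (attention) distribution through the gate `gens`.
# The sketch below is a guess at that behaviour under simplified shape assumptions
# (attn_dists of shape (N, T2, T1)); it is illustrative, not the original implementation.
def _calc_final_dist(self, x, gens, vocab_dists, attn_dists):
    '''
    x:           (N, T1) int32 source ids
    gens:        (N, T2, 1) generation gate in [0, 1]
    vocab_dists: (N, T2, V) softmax over the vocabulary
    attn_dists:  (N, T2, T1) attention over source positions
    '''
    vocab_dists = gens * vocab_dists          # generator share
    attn_dists = (1.0 - gens) * attn_dists    # copier share

    # scatter copy probability mass onto the vocabulary ids occurring in x:
    # (N, T2, T1) x (N, T1, V) -> (N, T2, V)
    vocab_size = tf.shape(vocab_dists)[-1]
    copy_dists = tf.einsum('ntl,nlv->ntv', attn_dists, tf.one_hot(x, vocab_size))
    return vocab_dists + copy_dists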
def run_GaussionTransformer(self):
    embeddingScope = "embeddingBlock"
    encodingBlock = "encodingBlcok"
    interactionBlock = "interactionBlock"
    comparisonBlock = "comparisonBlock"

    self.positionEncoding1 = modules.positional_encoding(
        inputs=self.inputX_word,
        num_units=param.Hyperparams.postion_dimension)  # (N, L, postion_dimension)
    self.positionEncoding2 = modules.positional_encoding(
        inputs=self.inputY_word,
        num_units=param.Hyperparams.postion_dimension)

    self.shift = tf.Variable(
        tf.abs(tf.random_normal([1], stddev=0, seed=0, dtype=tf.float64)) + 0.001,
        trainable=True, name='shift', dtype=tf.float64)
    self.bias = tf.Variable(
        -tf.abs(tf.random_normal([1], stddev=0, seed=0, dtype=tf.float64)),
        trainable=True, name='bias', dtype=tf.float64)

    with tf.variable_scope(embeddingScope, reuse=False):
        self.embedding_1 = modules.embedding_block(self.inputX_word, self.inputX_char,
                                                   self.positionEncoding1, scope="embedding_1")
        self.embedding_2 = modules.embedding_block(self.inputY_word, self.inputY_char,
                                                   self.positionEncoding2, scope="embedding_2")
        self.embedding_1 = tf.check_numerics(self.embedding_1, "nan happened!!!!")
        self.embedding_2 = tf.check_numerics(self.embedding_2, "nan happened!!!!")

    with tf.variable_scope(encodingBlock, reuse=False):
        self.encoding_1 = self.embedding_1
        self.encoding_2 = self.embedding_2
        for i in range(param.Hyperparams.encoder_num_blocks):
            # open one scope per block via {}.format
            with tf.variable_scope("multihead-atttention_{0}".format(i), reuse=False):
                self.encoding_1 = modules.multihead_attention(
                    self.encoding_1, self.shift, self.bias,
                    num_heads=param.Hyperparams.num_heads,
                    dropout_rate=self.dropout_rate, is_training=self.is_training)
                self.encoding_1 = tf.check_numerics(
                    self.encoding_1,
                    "encoding nan happened!!!! multihead-atttention_{0}".format(i))
            # reuse the same scope for the second input
            with tf.variable_scope("multihead-atttention_{0}".format(i), reuse=True):
                self.encoding_2 = modules.multihead_attention(
                    self.encoding_2, self.shift, self.bias,
                    num_heads=param.Hyperparams.num_heads,
                    dropout_rate=self.dropout_rate, is_training=self.is_training)
        self.encoding_1 += self.positionEncoding1
        self.encoding_2 += self.positionEncoding2

    with tf.variable_scope(interactionBlock, reuse=None):
        self.interaction_1 = self.encoding_1
        self.interaction_2 = self.encoding_2
        for i in range(param.Hyperparams.inter_num_blocks):
            with tf.variable_scope("interaction_{0}".format(i), reuse=False):
                self.interaction_1 = modules.InteractionBlock(
                    queries=self.interaction_1, keys=self.interaction_2,
                    shift=self.shift, bias=self.bias,
                    num_heads=param.Hyperparams.num_heads,
                    dropout_rate=self.dropout_rate, is_training=self.is_training)
                self.interaction_1 = tf.check_numerics(self.interaction_1, "nan happened!!!!")
            with tf.variable_scope("interaction_{0}".format(i), reuse=True):
                self.interaction_2 = modules.InteractionBlock(
                    queries=self.interaction_2, keys=self.interaction_1,
                    shift=self.shift, bias=self.bias,
                    num_heads=param.Hyperparams.num_heads,
                    dropout_rate=self.dropout_rate, is_training=self.is_training)

    self.encoding_1 = tf.check_numerics(self.encoding_1, "encoding_1 is nan")
    self.encoding_2 = tf.check_numerics(self.encoding_2, "encoding_2 is nan")
    self.interaction_1 = tf.check_numerics(self.interaction_1, "interaction_1 is nan")
    self.interaction_2 = tf.check_numerics(self.interaction_2, "interaction_2 is nan")

    with tf.variable_scope(comparisonBlock, reuse=None):
        self.logit = modules.ComparisonBlock(
            input1_Encoding=self.encoding_1, input1_Interaction=self.interaction_1,
            input2_Encoding=self.encoding_2, input2_Interaction=self.interaction_2)

    self.pred_y = tf.argmax(tf.nn.softmax(self.logit), 1)

    if self.is_training:
        with tf.name_scope("optimize"):
            # Loss: softmax the logits, then take the cross-entropy; the output is one value per example
            cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=self.logit, labels=self.y)
            # Average the per-example cross-entropy to get the loss
            self.loss = tf.reduce_mean(cross_entropy)
            # Optimizer
            self.optim = tf.train.AdamOptimizer(
                learning_rate=param.Hyperparams.lr).minimize(self.loss)

        with tf.name_scope("accuracy"):
            # Accuracy: input_y is one-hot, so tf.argmax(self.y, 1) recovers the label index
            correct_pred = tf.equal(tf.argmax(self.y, 1), self.pred_y)
            self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
def decode(self, ys, memory, training=True):
    '''
    memory: encoder outputs. (N, T1, d_model)
    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    '''
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, seqlens, sents2 = ys
        # decoder_inputs = tf.Print(decoder_inputs, [decoder_inputs],
        #                           message='decoder_inputs =', summarize=10)

        # embedding
        # ys_pri = tf.print('y =', tf.shape(y), summarize=3)
        dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model ** 0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen2)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)
        # dec = tf.Print(dec, [dec], message='dec =', summarize=10)
        # dec_pri = tf.print('dec =', tf.shape(dec), dec, summarize=3)

        # Blocks
        for i in range(self.hp.num_blocks):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # Masked self-attention (note that causality is True here)
                dec = multihead_attention(
                    queries=dec, keys=dec, values=dec,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=True, scope="self_attention")

                # Vanilla attention
                dec = multihead_attention(
                    queries=dec, keys=memory, values=memory,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=False, scope="vanilla_attention")

                # Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])
        # dec = tf.Print(dec, [dec], message='dec_finally =', summarize=10)

    # Final linear projection (embedding weights are shared)
    # with tf.control_dependencies([ys_pri, dec_pri]):
    weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
    logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
    y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

    return logits, y_hat, y, sents2
def encode_decode(self, xs, ys, training=True):
    x, seqlens = xs
    decoder_inputs, y, seqlens = ys

    with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
        enc = tf.nn.embedding_lookup(self.embeddings, x)  # (N, T1, d_model)
        enc += positional_encoding(enc, self.hp.maxlen1, self.hp)
        enc = tf.layers.dropout(enc, self.hp.dropout_rate, training=training)

    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        dec = tf.reduce_sum(tf.nn.embedding_lookup(self.embeddings, decoder_inputs),
                            reduction_indices=2)  # (N, T1, d_model)
        # test_dec = dec
        dec = dec * self.hp.d_model ** 0.5  # scale
        # The subgraph structure also needs its own positional encoding,
        # so that it lines up with the predicted output structure.
        dec += positional_encoding(dec, self.hp.maxlen2, self.hp)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

    # Blocks
    for i in range(self.hp.num_blocks):
        with tf.variable_scope("encoder", reuse=tf.AUTO_REUSE):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                # self-attention
                enc = multihead_attention(
                    queries=dec, keys=enc, values=enc,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=False)
                # feed forward
                enc = ff(enc, num_units=[self.hp.d_ff, self.hp.d_model])

        with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
            with tf.variable_scope("num_blocks_{}".format(i), reuse=tf.AUTO_REUSE):
                dec = multihead_attention(
                    queries=dec, keys=dec, values=dec,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training,
                    causality=True,  # whether to apply the causal mask
                    scope="self_attention")
                # Vanilla attention
                dec = multihead_attention(
                    queries=dec, keys=enc, values=enc,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=False, scope="vanilla_attention")
                # Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

    if self.hp.type == 'attribute':
        enc = tf.reduce_sum(enc, reduction_indices=1)
        dec = tf.reduce_sum(dec, reduction_indices=1)
        logits = tf.layers.dense(inputs=tf.concat([enc, dec], axis=-1), units=1, activation=tf.nn.relu)
    else:
        logits = tf.einsum('ntd,nkd->ntk', dec, enc)  # (N, T2, T2)
        # Force the final result to be a symmetric matrix, as the task requires.
        logits = (logits + tf.transpose(logits, [0, 2, 1])) / 2

    return logits, y, decoder_inputs
def decode(self, ys, memory, src_masks, training=True):
    '''
    memory: encoder outputs. (N, T1, d_model)
    src_masks: (N, T1)
    Returns
    logits: (N, T2, V). float32.
    y_hat: (N, T2). int32
    y: (N, T2). int32
    sents2: (N,). string.
    '''
    with tf.variable_scope("decoder", reuse=tf.AUTO_REUSE):
        decoder_inputs, y, seqlens, sents2 = ys

        # tgt_masks
        tgt_masks = tf.math.equal(decoder_inputs, 0)  # (N, T2)

        # embedding
        if self.hp.fac_embed:
            dec = tf.nn.embedding_lookup(self.embeddings1, decoder_inputs)  # (N, T2, d_embed)
            dec = tf.matmul(dec, self.embeddings2)  # (N, T2, d_model)
        else:
            dec = tf.nn.embedding_lookup(self.embeddings, decoder_inputs)  # (N, T2, d_model)
        dec *= self.hp.d_model ** 0.5  # scale

        dec += positional_encoding(dec, self.hp.maxlen2)
        dec = tf.layers.dropout(dec, self.hp.dropout_rate, training=training)

        # Blocks
        for i in range(self.hp.num_blocks):
            if self.hp.share_weights:
                vs_name = "blocks_shared"
            else:
                vs_name = "num_blocks_{}".format(i)
            with tf.variable_scope(vs_name, reuse=tf.AUTO_REUSE):
                # Masked self-attention (note that causality is True here)
                dec = multihead_attention(
                    queries=dec, keys=dec, values=dec, key_masks=tgt_masks,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=True, scope="self_attention")
                # Vanilla attention
                dec = multihead_attention(
                    queries=dec, keys=memory, values=memory, key_masks=src_masks,
                    num_heads=self.hp.num_heads, dropout_rate=self.hp.dropout_rate,
                    training=training, causality=False, scope="vanilla_attention")
                # Feed Forward
                dec = ff(dec, num_units=[self.hp.d_ff, self.hp.d_model])

    # Final linear projection (embedding weights are shared in some situations)
    if self.hp.fac_embed:
        if self.hp.io_tie:
            # embedding_normalization -- 0: no normalization. 1: cal cosine-sim on 1 and 2.
            # 2: cal l2-norm-square on 2 and cosine-sim on 1. 3: cal l2-norm-square on 2 and dist on 1.
            if self.hp.embedding_normalization == 1:  # need to add the en=2 situation
                output_embeddings1 = tf.transpose(
                    tf.concat((tf.zeros(shape=[1, self.hp.d_embed], dtype=tf.float32),
                               tf.nn.l2_normalize(self.embeddings1[1::], axis=-1)), 0))
                logits = tf.einsum('ntd,dk->ntk', dec,
                                   tf.nn.l2_normalize(tf.transpose(self.embeddings2), axis=0))  # maybe use lstsq?
                logits = tf.einsum('ntd,dk->ntk', logits, output_embeddings1)
            elif self.hp.embedding_normalization >= 2:
                weights2 = self.embeddings2[1:, :]
                weights2 = divide_norm_square_and_transpose(weights2)
                weights2 = tf.concat((tf.zeros(shape=[self.hp.d_embed, 1], dtype=tf.float32), weights2), -1)
                if self.hp.embedding_normalization == 2:
                    weights1 = tf.transpose(
                        tf.concat((tf.zeros(shape=[1, self.hp.d_embed], dtype=tf.float32),
                                   tf.nn.l2_normalize(self.embeddings1[1::], axis=-1)), 0))
                else:
                    weights1 = tf.transpose(self.embeddings1)
                logits = tf.einsum('ntd,dk->ntk', dec, weights2)
                logits = tf.einsum('ntd,dk->ntk', logits, weights1)
                if self.hp.embedding_normalization == 3:
                    ebias = get_half_squarenorm(self.embeddings1)
                    logits = tf.subtract(logits, ebias)
            else:
                logits = tf.einsum('ntd,dk->ntk', dec, tf.transpose(self.embeddings2))
                logits = tf.einsum('ntd,dk->ntk', logits, tf.transpose(self.embeddings1))
        else:
            with tf.variable_scope("output_embedding", reuse=tf.AUTO_REUSE):
                logits = tf.layers.dense(dec, self.vocab_size)
    else:
        if self.hp.io_tie:
            if self.hp.embedding_normalization == 0 or self.hp.embedding_normalization == 3:
                weights = tf.transpose(self.embeddings)  # (d_model, vocab_size)
            elif self.hp.embedding_normalization == 1:
                weights = tf.transpose(
                    tf.concat((tf.zeros(shape=[1, self.hp.d_model], dtype=tf.float32),
                               tf.nn.l2_normalize(self.embeddings[1:, :], axis=-1)), 0))
            elif self.hp.embedding_normalization == 2:
                weights = self.embeddings[1:, :]
                weights = divide_norm_square_and_transpose(weights)
                weights = tf.concat((tf.zeros(shape=[self.hp.d_model, 1], dtype=tf.float32), weights), -1)
            logits = tf.einsum('ntd,dk->ntk', dec, weights)  # (N, T2, vocab_size)
            if self.hp.embedding_normalization == 2:
                # bias = tf.ones(shape=[logits.shape[-1]], dtype=tf.float32)
                pass
                # with tf.variable_scope("gauss", reuse=tf.AUTO_REUSE):
                #     bias = tf.constant(1.0)
                #     logits = tf.subtract(logits, bias)
                #     logits = tf.square(logits)
                #     logits = tf.negative(logits)
                #     logits = gaussian_activation(logits)
                #     logits = tf.exp(logits)
            if self.hp.embedding_normalization == 3:
                ebias = get_half_squarenorm(self.embeddings)
                logits = tf.subtract(logits, ebias)
        else:
            with tf.variable_scope("output_embedding", reuse=tf.AUTO_REUSE):
                logits = tf.layers.dense(dec, self.vocab_size)

    y_hat = tf.to_int32(tf.argmax(logits, axis=-1))

    return logits, y_hat, y, sents2
def __init__(self, is_training=True):
    self.graph = tf.Graph()
    with self.graph.as_default():
        if is_training:
            self.x, self.y, self.num_batch = get_batch_data()
        else:
            self.x = tf.placeholder(tf.int32, shape=(None, hp.maxlen))
            self.y = tf.placeholder(tf.int32, shape=(None, hp.maxlen))

        # Define decoder inputs.
        # id 2 stands for <S>, the initial decoder input. This shifts the normal y, e.g.
        # y = [["i", "love", "china", "deeply"], ["can", "you", "speak", "chinese"]] becomes
        # [["<s>", "i", "love", "china"], ["<s>", "can", "you", "speak"]], which is what the
        # decoder self-attention sees first.
        # During training decoder_inputs is built as above; at inference the true y is unknown,
        # so y is fed as an all-zero tensor of shape [batch_size, max_length]. It then looks like
        # [["<s>", 0, 0, 0]]; at each step we take the next prediction and feed the prefix back in.
        self.decoder_inputs = tf.concat((tf.ones_like(self.y[:, :1]) * 2, self.y[:, :-1]), -1)

        de2idx, idx2de = load_de_vocab()
        en2idx, idx2en = load_en_vocab()

        with tf.variable_scope("encoder"):
            # Embedding
            self.enc = embedding(self.x, vocab_size=len(de2idx), num_units=hp.hidden_units,
                                 # row 0 is the padding embedding; zero_pad=True forces that row to 0
                                 # (a randomly initialized row would generally not be 0)
                                 zero_pad=True,
                                 scale=True, scope="enc_embed")

            # Positional Encoding
            if hp.sinusoid:
                self.enc += positional_encoding(self.x, num_units=hp.hidden_units,
                                                zero_pad=False, scale=False, scope='enc_pe')
            else:
                self.enc += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.x)[1]), 0),
                                              [tf.shape(self.x)[0], 1]),
                                      vocab_size=hp.maxlen, num_units=hp.hidden_units,
                                      zero_pad=False, scale=False, scope="enc_pe")

            # Dropout
            self.enc = tf.layers.dropout(self.enc, rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(is_training))

            # Blocks: stack 6 of them
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    # MultiHead Attention
                    self.enc = multihead_attention(queries=self.enc, keys=self.enc,
                                                   num_units=hp.hidden_units, num_heads=hp.num_heads,
                                                   dropout_rate=hp.dropout_rate,
                                                   is_training=is_training, causality=False)
                    self.enc = feedforward(self.enc, num_units=[4 * hp.hidden_units, hp.hidden_units])

        with tf.variable_scope("decoder"):
            # Embedding
            self.dec = embedding(self.decoder_inputs, vocab_size=len(en2idx),
                                 num_units=hp.hidden_units, scale=True, scope="dec_embed")

            # Positional Encoding
            if hp.sinusoid:
                self.dec += positional_encoding(self.decoder_inputs, num_units=hp.hidden_units,
                                                zero_pad=False, scale=False, scope="dec_pe")
            else:
                self.dec += embedding(tf.tile(tf.expand_dims(tf.range(tf.shape(self.decoder_inputs)[1]), 0),
                                              [tf.shape(self.decoder_inputs)[0], 1]),
                                      vocab_size=hp.maxlen, num_units=hp.hidden_units,
                                      zero_pad=False, scale=False, scope="dec_pe")

            # Dropout
            self.dec = tf.layers.dropout(self.dec, rate=hp.dropout_rate,
                                         training=tf.convert_to_tensor(is_training))

            # Blocks
            for i in range(hp.num_blocks):
                with tf.variable_scope("num_blocks_{}".format(i)):
                    # Multihead Attention (self-attention)
                    self.dec = multihead_attention(queries=self.dec, keys=self.dec,
                                                   num_units=hp.hidden_units, num_heads=hp.num_heads,
                                                   dropout_rate=hp.dropout_rate,
                                                   is_training=is_training, causality=True,
                                                   scope="self_attention")
                    # Multihead Attention (vanilla attention)
                    self.dec = multihead_attention(queries=self.dec, keys=self.enc,
                                                   num_units=hp.hidden_units, num_heads=hp.num_heads,
                                                   dropout_rate=hp.dropout_rate,
                                                   is_training=is_training, causality=False,
                                                   scope="vanilla_attention")
                    # Feed Forward
                    self.dec = feedforward(self.dec, num_units=[4 * hp.hidden_units, hp.hidden_units])

        # Final linear projection: a classification over the target vocabulary
        self.logits = tf.layers.dense(self.dec, len(en2idx))
        self.preds = tf.to_int32(tf.argmax(self.logits, dimension=-1))
        self.istarget = tf.to_float(tf.not_equal(self.y, 0))
        self.acc = tf.reduce_sum(tf.to_float(tf.equal(self.preds, self.y)) * self.istarget /
                                 (tf.reduce_sum(self.istarget)))

        if is_training:
            # Loss
            # Label smoothing: turn the 0s of the one-hot targets into a small value
            # and the 1s into a value slightly below 1.
            self.y_smoothed = label_smoothing(tf.one_hot(self.y, depth=len(en2idx)))
            self.loss = tf.nn.softmax_cross_entropy_with_logits(logits=self.logits,
                                                                labels=self.y_smoothed)
            self.mean_loss = tf.reduce_sum(self.loss * self.istarget) / (tf.reduce_sum(self.istarget))

            self.global_step = tf.Variable(0, name='global_step', trainable=False)
            self.optimizer = tf.train.AdamOptimizer(learning_rate=hp.lr,
                                                    beta1=0.9, beta2=0.98, epsilon=1e-8)
            self.train_op = self.optimizer.minimize(self.mean_loss, global_step=self.global_step)

            tf.summary.scalar('mean_loss', self.mean_loss)
            self.merged = tf.summary.merge_all()