def inner_loop(i, hit_eos, decoded_ids):
    """One greedy decoding step for tf.while_loop.

    Embeds the tokens decoded so far, runs the decoder stack over the
    full prefix, projects the last position to the vocabulary, and
    appends the argmax token to `decoded_ids`.
    """
    # Embed the current prefix; the time dimension grows by one per step.
    embedded = self.tgt_embedding.encode(decoded_ids)
    steps = get_shape_as_list(embedded)[1]
    causal_mask = subsequent_mask(steps)
    h = transformer_decoder_stack(embedded, src_enc, src_mask, causal_mask,
                                  num_heads, pdrop, scale, layers,
                                  activation_type, 'TransformerDecoder', d_ff)
    vsz = self.tgt_embedding.vsz
    tie_weights = bool(kwargs.get('tie_weights', True))
    hsz = get_shape_as_list(h)[-1]
    # Flatten [B, T, H] -> [B*T, H] for the output projection.
    flat = tf.reshape(h, [-1, hsz])
    if tie_weights and hsz == self.tgt_embedding.get_dsz():
        # Weight tying: reuse the target embedding matrix as the softmax
        # projection (only valid when the widths match).
        with tf.variable_scope(self.tgt_embedding.scope, reuse=True):
            W = tf.get_variable("W")
            outputs = tf.matmul(flat, W, transpose_b=True, name="logits")
    else:
        vocab_w = tf.get_variable("vocab_w", [hsz, vsz], dtype=tf.float32)
        vocab_b = tf.get_variable("vocab_b", [vsz], dtype=tf.float32)
        outputs = tf.nn.xw_plus_b(flat, vocab_w, vocab_b, name="logits")
    preds = tf.reshape(outputs, [B, steps, vsz])
    # Only the last time step yields the next token.
    next_id = tf.argmax(preds, axis=-1)[:, -1]
    hit_eos |= tf.equal(next_id, Offsets.EOS)
    next_id = tf.reshape(next_id, [B, 1])
    decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
    return i + 1, hit_eos, decoded_ids
def inner_loop(i, hit_eos, decoded_ids):
    """One greedy decoding step for tf.while_loop.

    Embeds the decoded prefix, runs the decoder stack, projects to the
    vocabulary, and appends the argmax token to ``decoded_ids``.

    :param i: loop counter
    :param hit_eos: ``[B]`` bool tensor, True once a row has produced EOS
    :param decoded_ids: ``[B, T]`` int tensor of tokens decoded so far
    :return: ``(i + 1, hit_eos, decoded_ids)`` with one more column decoded
    """
    tgt_embed = self.tgt_embedding.encode(decoded_ids)
    T = get_shape_as_list(tgt_embed)[1]
    tgt_mask = subsequent_mask(T)
    scope = 'TransformerDecoder'
    # BUG FIX: the first two arguments were swapped (src_enc, tgt_embed);
    # transformer_decoder_stack takes the target embedding first, then the
    # encoder output, as every other call site in this file does.
    h = transformer_decoder_stack(tgt_embed, src_enc, src_mask, tgt_mask,
                                  num_heads, pdrop, scale, layers,
                                  activation_type, scope, d_ff)
    vsz = self.tgt_embedding.vsz
    do_weight_tying = bool(kwargs.get('tie_weights', True))
    hsz = get_shape_as_list(h)[-1]
    # Flatten [B, T, H] -> [B*T, H] for the output projection.
    h = tf.reshape(h, [-1, hsz])
    if do_weight_tying and hsz == self.tgt_embedding.get_dsz():
        # Weight tying: reuse the embedding matrix as the softmax weights.
        with tf.variable_scope(self.tgt_embedding.scope, reuse=True):
            W = tf.get_variable("W")
            outputs = tf.matmul(h, W, transpose_b=True, name="logits")
    else:
        vocab_w = tf.get_variable("vocab_w", [hsz, vsz], dtype=tf.float32)
        vocab_b = tf.get_variable("vocab_b", [vsz], dtype=tf.float32)
        outputs = tf.nn.xw_plus_b(h, vocab_w, vocab_b, name="logits")
    preds = tf.reshape(outputs, [B, T, vsz])
    # Only the final time step produces the next token.
    next_id = tf.argmax(preds, axis=-1)[:, -1]
    hit_eos |= tf.equal(next_id, Offsets.EOS)
    next_id = tf.reshape(next_id, [B, 1])
    decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
    return i + 1, hit_eos, decoded_ids
def decode(self, inputs):
    """Decode a full target prefix in one pass and record predictions.

    Removed the commented-out dead code (old placeholder/encode paths)
    that cluttered the body.

    :param inputs: tuple ``(encoder_outputs, tgt, src_len, tgt_len)``
    :return: ``self.best`` (presumably set by ``self.output`` — the
        attribute is not assigned directly here)
    """
    encoder_outputs, tgt, src_len, tgt_len = inputs
    tgt_embed = self.tgt_embedding(tgt)
    src_enc = encoder_outputs.output
    shape = get_shape_as_list(tgt_embed)
    B = shape[0]
    T = shape[1]
    if hasattr(encoder_outputs, 'src_mask'):
        src_mask = encoder_outputs.src_mask
    else:
        # NOTE(review): T here is the *target* length taken from tgt_embed;
        # a source mask would normally be sized by the source sequence
        # length — confirm this is intended when the fallback is hit.
        src_mask = tf.sequence_mask(src_len, T, dtype=tf.float32)
    tgt_mask = subsequent_mask(T)
    h = self.decoder((tgt_embed, src_enc, src_mask, tgt_mask))
    outputs = self.proj(h)
    # [B, T, V] -> [T, B, V]: predictions are stored time-major.
    self.preds = tf.transpose(tf.reshape(outputs, [B, T, -1]), [1, 0, 2])
    best = tf.argmax(self.preds, -1)
    self.output(best)
    return self.best
def decode(self, x, num_heads=4, layers=1, scale=True, activation_type='relu', scope='TransformerEncoder', d_ff=None, **kwargs):
    """Run a causally-masked transformer encoder stack over x.

    Projects the input to the model width first when it differs, then
    returns the stack output flattened to ``[-1, self.hsz]``.
    """
    dims = get_shape_as_list(x)
    steps, width = dims[1], dims[-1]
    causal_mask = subsequent_mask(steps)
    # Bring the input up/down to the model size if the widths differ.
    if width != self.hsz:
        x = tf.layers.dense(x, self.hsz)
    h = transformer_encoder_stack(x, causal_mask, num_heads, self.pdrop_value,
                                  scale, layers, activation_type, d_ff=d_ff)
    return tf.reshape(h, [-1, self.hsz])
def test_attn_value_sub_mask(qkv):
    """With zeroed queries and a causal mask, attention weights are uniform
    over the visible prefix, so each output is the mean of values[:t+1]."""
    q, k, v = qkv
    batch, heads, steps, _ = q.get_shape().as_list()
    zero_q = tf.zeros_like(q)
    causal = subsequent_mask(steps)
    attn = dot_product_attention(zero_q, k, v, mask=causal)
    with tf.Session() as sess:
        actual, values = sess.run([attn, v])
    for b in range(batch):
        for h in range(heads):
            for t in range(steps):
                expected = np.mean(values[:, :, :t + 1, :], axis=2)[b, h, :]
                np.testing.assert_allclose(actual[b, h, t, :], expected, atol=1e-5)
def test_attn_value_sub_mask(qkv):
    """CPU-pinned variant: with zero queries and a causal mask, each attention
    output equals the mean of the values visible at that step."""
    q, k, v = qkv
    with tf.device('/cpu:0'):
        batch, heads, steps, _ = q.get_shape().as_list()
        zero_q = tf.zeros_like(q)
        attn = dot_product_attention(zero_q, k, v, mask=subsequent_mask(steps))
        with tf.Session() as sess:
            actual, values = sess.run([attn, v])
        for b in range(batch):
            for h in range(heads):
                for t in range(steps):
                    expected = np.mean(values[:, :, :t + 1, :], axis=2)[b, h, :]
                    np.testing.assert_allclose(actual[b, h, t, :], expected, atol=1e-5)
def inner_loop(i, hit_eos, decoded_ids):
    """One greedy decoding step for tf.while_loop (layers API).

    Removed the commented-out reshape dead code; ``self.proj`` handles the
    projection directly.

    :param i: loop counter
    :param hit_eos: ``[B]`` bool tensor, True once a row has produced EOS
    :param decoded_ids: ``[B, T]`` int tensor of tokens decoded so far
    :return: ``(i + 1, hit_eos, decoded_ids)`` with one more column decoded
    """
    tgt_embed = self.tgt_embedding(decoded_ids)
    T = get_shape_as_list(tgt_embed)[1]
    tgt_mask = subsequent_mask(T)
    h = self.decoder((tgt_embed, src_enc, src_mask, tgt_mask))
    outputs = self.proj(h)
    preds = tf.reshape(outputs, [B, T, -1])
    # Only the final time step yields the next token.
    next_id = tf.argmax(preds, axis=-1)[:, -1]
    hit_eos |= tf.equal(next_id, Offsets.EOS)
    next_id = tf.reshape(next_id, [B, 1])
    decoded_ids = tf.concat([decoded_ids, next_id], axis=1)
    return i + 1, hit_eos, decoded_ids
def decode(self, encoder_outputs, src_len, tgt_len, pdrop, layers=1, scope='TransformerDecoder', num_heads=4, scale=True, activation_type='relu', d_ff=None, **kwargs):
    """Run the transformer decoder over the full target and record predictions.

    self.best is [T, B].

    BUG FIX: the ``scope`` parameter was unconditionally overwritten with
    ``'TransformerDecoder'``, so a caller-supplied scope was silently
    ignored; the clobbering assignment is removed. The ``tf.reshape`` that
    was duplicated in both projection branches is hoisted above the branch.

    :param encoder_outputs: object with ``.output`` and optionally ``.src_mask``
    :param src_len: source sequence lengths, used for the fallback mask
    :param tgt_len: target lengths (unused here; kept for interface parity)
    :param pdrop: dropout probability for the decoder stack
    :param kwargs: expects ``tgt`` (target token ids) and optional
        ``tie_weights`` (default True)
    """
    src_enc = encoder_outputs.output
    if hasattr(encoder_outputs, 'src_mask'):
        src_mask = encoder_outputs.src_mask
    else:
        T = get_shape_as_list(src_enc)[1]
        src_mask = tf.sequence_mask(src_len, T, dtype=tf.float32)
    tgt_embed = self.tgt_embedding.encode(kwargs.get('tgt'))
    T = get_shape_as_list(tgt_embed)[1]
    tgt_mask = subsequent_mask(T)
    h = transformer_decoder_stack(tgt_embed, src_enc, src_mask, tgt_mask,
                                  num_heads, pdrop, scale, layers,
                                  activation_type, scope, d_ff)
    vsz = self.tgt_embedding.vsz
    do_weight_tying = bool(kwargs.get('tie_weights', True))
    hsz = get_shape_as_list(h)[-1]
    # Flatten [B, T, H] -> [B*T, H] for the output projection (hoisted out
    # of both branches below, where it appeared twice).
    h = tf.reshape(h, [-1, hsz])
    if do_weight_tying and hsz == self.tgt_embedding.get_dsz():
        # Weight tying: reuse the embedding matrix as the softmax weights.
        with tf.variable_scope(self.tgt_embedding.scope, reuse=True):
            W = tf.get_variable("W")
            outputs = tf.matmul(h, W, transpose_b=True, name="logits")
    else:
        vocab_w = tf.get_variable("vocab_w", [hsz, vsz], dtype=tf.float32)
        vocab_b = tf.get_variable("vocab_b", [vsz], dtype=tf.float32)
        outputs = tf.nn.xw_plus_b(h, vocab_w, vocab_b, name="logits")
    # [B*T, V] -> [B, T, V] -> [T, B, V]: predictions are stored time-major.
    self.preds = tf.transpose(tf.reshape(outputs, [-1, T, vsz]), [1, 0, 2])
    best = tf.argmax(self.preds, -1)
    self.output(best)
def decode(self, encoder_outputs, src_len, tgt_len, pdrop, layers=1, scope='TransformerDecoder', num_heads=4, scale=True, activation_type='relu', d_ff=None, **kwargs):
    """Run the transformer decoder over the full target and record predictions.

    self.best is [T, B].

    BUG FIXES: (1) the first two arguments to transformer_decoder_stack were
    swapped (``src_enc, tgt_embed``) — the target embedding goes first, as in
    every other call site in this file; (2) the ``scope`` parameter was
    unconditionally overwritten with ``'TransformerDecoder'``, ignoring the
    caller's value; (3) the ``tf.reshape`` duplicated in both projection
    branches is hoisted above the branch.

    :param encoder_outputs: object with ``.output`` and optionally ``.src_mask``
    :param src_len: source sequence lengths, used for the fallback mask
    :param tgt_len: target lengths (unused here; kept for interface parity)
    :param pdrop: dropout probability for the decoder stack
    :param kwargs: expects ``tgt`` (target token ids) and optional
        ``tie_weights`` (default True)
    """
    src_enc = encoder_outputs.output
    if hasattr(encoder_outputs, 'src_mask'):
        src_mask = encoder_outputs.src_mask
    else:
        T = get_shape_as_list(src_enc)[1]
        src_mask = tf.sequence_mask(src_len, T, dtype=tf.float32)
    tgt_embed = self.tgt_embedding.encode(kwargs.get('tgt'))
    T = get_shape_as_list(tgt_embed)[1]
    tgt_mask = subsequent_mask(T)
    h = transformer_decoder_stack(tgt_embed, src_enc, src_mask, tgt_mask,
                                  num_heads, pdrop, scale, layers,
                                  activation_type, scope, d_ff)
    vsz = self.tgt_embedding.vsz
    do_weight_tying = bool(kwargs.get('tie_weights', True))
    hsz = get_shape_as_list(h)[-1]
    # Flatten [B, T, H] -> [B*T, H] for the output projection.
    h = tf.reshape(h, [-1, hsz])
    if do_weight_tying and hsz == self.tgt_embedding.get_dsz():
        # Weight tying: reuse the embedding matrix as the softmax weights.
        with tf.variable_scope(self.tgt_embedding.scope, reuse=True):
            W = tf.get_variable("W")
            outputs = tf.matmul(h, W, transpose_b=True, name="logits")
    else:
        vocab_w = tf.get_variable("vocab_w", [hsz, vsz], dtype=tf.float32)
        vocab_b = tf.get_variable("vocab_b", [vsz], dtype=tf.float32)
        outputs = tf.nn.xw_plus_b(h, vocab_w, vocab_b, name="logits")
    # [B*T, V] -> [B, T, V] -> [T, B, V]: predictions are stored time-major.
    self.preds = tf.transpose(tf.reshape(outputs, [-1, T, vsz]), [1, 0, 2])
    best = tf.argmax(self.preds, -1)
    self.output(best)
def decode(self, x, num_heads=4, layers=1, scale=True, activation_type='relu', scope='TransformerEncoder', d_ff=None, **kwargs):
    """Run a causally-masked transformer encoder stack over x and flatten time.

    BUG FIX: ``d_ff`` was accepted but never forwarded to
    transformer_encoder_stack, so callers could not change the feed-forward
    width; the sibling ``decode`` that projects to ``self.hsz`` does forward
    it. (``scope`` is still accepted but unused — NOTE(review): confirm
    whether the stack should receive it.)
    """
    T = get_shape_as_list(x)[1]
    mask = subsequent_mask(T)
    x = transformer_encoder_stack(x, mask, num_heads, self.pdrop_value, scale,
                                  layers, activation_type, d_ff=d_ff)
    return tf.reshape(x, [-1, self.hsz])