def combine_heads(x):
    # (B, num_heads, T, d_k) -> (B, T, num_heads, d_k), then merge the last two dims back to d_model
    x = tf.transpose(x, [0, 2, 1, 3])
    shp = get_shape_as_list(x)
    num_heads, head_sz = shp[-2:]
    new_x_shape = shp[:-2] + [num_heads * head_sz]
    new_x = tf.reshape(x, new_x_shape)
    return new_x
def ffn(x, scope, pdrop, d_ff, activation_type='relu'):
    with tf.variable_scope(scope):
        d_model = get_shape_as_list(x)[-1]
        act = tf_activation(activation_type)
        expansion = act(time_distributed_projection(x, name='ffn_ff', filters=d_ff))
        dropped = tf.layers.dropout(expansion, pdrop, training=TRAIN_FLAG())
        squeeze = time_distributed_projection(dropped, name='ffn_model', filters=d_model)
        return squeeze
def encode(self, x=None):
    # Scale the lookup-table embeddings by sqrt(d_model), then add the sinusoidal positional signal
    x = super(PositionalLookupTableEmbeddings, self).encode(x) * math.sqrt(self.dsz)
    B, T, C = get_shape_as_list(x)
    signal = get_timing_signal_1d(T, C, min_timescale=1.0, max_timescale=self.max_timescale, start_index=0)
    return x + signal
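For reference, here is a minimal NumPy sketch of the sinusoidal signal that `get_timing_signal_1d` is assumed to produce (sinusoids at geometrically spaced timescales between `min_timescale` and `max_timescale`, in the style of tensor2tensor). This is illustrative only, not the library implementation.

import numpy as np

def timing_signal_1d_sketch(length, channels, min_timescale=1.0, max_timescale=1.0e4):
    # Returns a (1, length, channels) positional signal to add to the embeddings
    position = np.arange(length, dtype=np.float32)
    num_timescales = channels // 2
    log_timescale_increment = np.log(max_timescale / min_timescale) / max(num_timescales - 1, 1)
    inv_timescales = min_timescale * np.exp(np.arange(num_timescales, dtype=np.float32) * -log_timescale_increment)
    scaled_time = position[:, None] * inv_timescales[None, :]
    signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
    signal = np.pad(signal, [[0, 0], [0, channels % 2]], mode='constant')
    return signal.reshape(1, length, channels)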
def ffn(x, scope, pdrop, d_ff=None, activation_type='relu'):
    with tf.variable_scope(scope):
        d_model = get_shape_as_list(x)[-1]
        if d_ff is None:
            d_ff = 4 * d_model
        act = tf_activation(activation_type)
        expansion = act(time_distributed_projection(x, name='ffn_ff', filters=d_ff))
        dropped = tf.layers.dropout(expansion, pdrop, training=TRAIN_FLAG())
        squeeze = time_distributed_projection(dropped, name='ffn_model', filters=d_model)
        return squeeze
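As a sanity check on the shapes, here is a NumPy-only sketch of the position-wise FFN math (dropout omitted): every timestep is projected from `d_model` up to `d_ff` (4x `d_model` by default), passed through the activation, and projected back down. The names below are hypothetical and stand in for the library's projection layers.

import numpy as np

def ffn_reference(x, w_ff, b_ff, w_model, b_model):
    # x: (B, T, d_model), w_ff: (d_model, d_ff), w_model: (d_ff, d_model)
    expansion = np.maximum(x @ w_ff + b_ff, 0.0)   # relu expansion to d_ff
    return expansion @ w_model + b_model           # squeeze back down to d_model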
def transformer_encoder(x, src_mask, scope, num_heads, pdrop, scale=True, activation_type='relu', d_ff=None):
    with tf.variable_scope(scope):
        d_model = get_shape_as_list(x)[-1]
        if d_ff is None:
            d_ff = 4 * d_model
        x = layer_norm(x, 'ln_1')
        q, k, v = self_attention_qkv(x, d_model)
        a = multi_headed_attention(q, k, v, 'attn', d_model, num_heads, pdrop, scale=scale, mask=src_mask)
        x = x + tf.layers.dropout(a, pdrop, training=TRAIN_FLAG())
        x = layer_norm(x, 'ln_2')
        m = ffn(x, 'ffn', pdrop, d_ff=d_ff, activation_type=activation_type)
        h = x + tf.layers.dropout(m, pdrop, training=TRAIN_FLAG())
        return h
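A sketch of how this layer might be stacked (a hypothetical wrapper, assuming the `transformer_encoder` and `layer_norm` helpers above; not library code):

def transformer_encoder_stack(x, src_mask, num_heads, pdrop, num_layers=6, d_ff=None):
    # One variable scope per layer so the weights are not shared across layers
    for i in range(num_layers):
        x = transformer_encoder(x, src_mask, 'encoder-{}'.format(i), num_heads, pdrop, d_ff=d_ff)
    # The residual output of each block is un-normalized, so a final layer norm is a common choice here
    return layer_norm(x, 'encoder-ln-out')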
def transformer_decoder(tgt, src, src_mask, tgt_mask, scope, num_heads, pdrop, scale=True, activation_type='relu', d_ff=None):
    with tf.variable_scope(scope):
        d_model = get_shape_as_list(tgt)[-1]
        if d_ff is None:
            d_ff = 4 * d_model
        tgt = layer_norm(tgt, 'ln_1')
        q, k, v = self_attention_qkv(tgt, d_model)
        self_attn = multi_headed_attention(q, k, v, 'self_attn', d_model, num_heads, pdrop, scale=scale, mask=tgt_mask)
        tgt = tgt + tf.layers.dropout(self_attn, pdrop, training=TRAIN_FLAG())
        tgt = layer_norm(tgt, 'ln_2')
        q, k, v = low_order_projection_qkv(tgt, src, src, d_model)
        # Mask at zeros???
        src_attn = multi_headed_attention(q, k, v, "dual_attn", d_model, num_heads, pdrop, scale=scale, mask=src_mask)
        tgt = tgt + tf.layers.dropout(src_attn, pdrop, training=TRAIN_FLAG())
        tgt = layer_norm(tgt, 'ln_3')
        m = ffn(tgt, 'ffn', pdrop, d_ff=d_ff, activation_type=activation_type)
        h = tgt + tf.layers.dropout(m, pdrop, training=TRAIN_FLAG())
        return h
def transformer_decoder(src, tgt, src_mask, tgt_mask, scope, num_heads, pdrop, scale=True, activation_type='relu', d_ff=None):
    with tf.variable_scope(scope):
        d_model = get_shape_as_list(tgt)[-1]
        if d_ff is None:
            d_ff = 4 * d_model
        tgt = layer_norm(tgt, 'ln_1')
        q, k, v = self_attention_qkv(tgt, d_model)
        self_attn = multi_headed_attention(q, k, v, 'self_attn', d_model, num_heads, pdrop, scale=scale, mask=tgt_mask)
        tgt = tgt + tf.layers.dropout(self_attn, pdrop, training=TRAIN_FLAG())
        tgt = layer_norm(tgt, 'ln_2')
        q, k, v = low_order_projection_qkv(tgt, src, src, d_model)
        # Mask at zeros???
        src_attn = multi_headed_attention(q, k, v, "dual_attn", d_model, num_heads, pdrop, scale=scale, mask=src_mask)
        tgt = tgt + tf.layers.dropout(src_attn, pdrop, training=TRAIN_FLAG())
        tgt = layer_norm(tgt, 'ln_3')
        m = ffn(tgt, 'ffn', pdrop, d_ff=d_ff, activation_type=activation_type)
        h = tgt + tf.layers.dropout(m, pdrop, training=TRAIN_FLAG())
        return h
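Likewise, a hypothetical sketch of a causal ("subsequent") mask plus a decoder stack, matching the `(src, tgt, ...)` argument order of the definition immediately above and assuming the mask is 1 where attention is allowed and 0 elsewhere:

import numpy as np

def subsequent_mask(T):
    # (1, 1, T, T) lower-triangular mask: position t may only attend to positions <= t
    return tf.constant(np.tril(np.ones((1, 1, T, T), dtype=np.float32)))

def transformer_decoder_stack(src, tgt, src_mask, tgt_mask, num_heads, pdrop, num_layers=6, d_ff=None):
    for i in range(num_layers):
        tgt = transformer_decoder(src, tgt, src_mask, tgt_mask, 'decoder-{}'.format(i), num_heads, pdrop, d_ff=d_ff)
    return layer_norm(tgt, 'decoder-ln-out')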
def split_heads(x, num_heads):
    shp = get_shape_as_list(x)
    dsz = shp[-1]
    r = tf.reshape(x, shp[:-1] + [num_heads, dsz // num_heads])
    # (B, T, num_heads, d_k) -> (B, num_heads, T, d_k)
    return tf.transpose(r, [0, 2, 1, 3])
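A quick shape check tying `split_heads` and `combine_heads` together (illustrative only): splitting into heads and then combining should round-trip back to the original `(B, T, d_model)` shape.

x = tf.zeros([2, 7, 512])            # (B=2, T=7, d_model=512)
h = split_heads(x, num_heads=8)      # -> (2, 8, 7, 64)
y = combine_heads(h)                 # -> (2, 7, 512)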