Example #1
def combine_heads(x):
    # (B, num_heads, T, d_k) -> (B, T, num_heads, d_k)
    x = tf.transpose(x, [0, 2, 1, 3])
    shp = get_shape_as_list(x)
    num_heads, head_sz = shp[-2:]
    new_x_shape = shp[:-2]+[num_heads * head_sz]
    new_x = tf.reshape(x, new_x_shape)
    return new_x
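
combine_heads is the inverse of split_heads (Example #10 below): it moves the heads axis back next to the feature axis and flattens (num_heads, head_sz) into a single model dimension. A minimal shape walk-through, using a simplified static-shape stand-in for get_shape_as_list (an assumption; the library's real helper is not shown in these snippets):

import tensorflow as tf

def get_shape_as_list_static(t):
    # Simplified stand-in: static dimensions only.
    return t.shape.as_list()

B, H, T, d_k = 2, 8, 5, 64
x = tf.zeros([B, H, T, d_k])                              # (B, num_heads, T, d_k)
y = tf.transpose(x, [0, 2, 1, 3])                         # (B, T, num_heads, d_k)
shp = get_shape_as_list_static(y)
combined = tf.reshape(y, shp[:-2] + [shp[-2] * shp[-1]])  # (2, 5, 512) == (B, T, num_heads * d_k)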
Example #2
def combine_heads(x):
    # (B, num_heads, T, d_k) -> (B, T, num_heads, d_k)
    x = tf.transpose(x, [0, 2, 1, 3])
    shp = get_shape_as_list(x)
    num_heads, head_sz = shp[-2:]
    new_x_shape = shp[:-2]+[num_heads * head_sz]
    new_x = tf.reshape(x, new_x_shape)
    return new_x
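
Every snippet here leans on get_shape_as_list, whose definition is not included. A common pattern for a helper with this name, and purely an assumption on my part, is to return static dimensions where they are known and dynamic tensors otherwise, so the result can be sliced and passed straight to tf.reshape:

import tensorflow as tf

def get_shape_as_list_sketch(x):
    # Hypothetical helper: static dims where known, tf.shape(x)[i] tensors elsewhere.
    static = x.shape.as_list()
    dynamic = tf.shape(x)
    return [dynamic[i] if dim is None else dim for i, dim in enumerate(static)]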
Example #3
def ffn(x, scope, pdrop, d_ff, activation_type='relu'):
    with tf.variable_scope(scope):
        d_model = get_shape_as_list(x)[-1]
        act = tf_activation(activation_type)
        expansion = act(time_distributed_projection(x, name='ffn_ff', filters=d_ff))
        dropped = tf.layers.dropout(expansion, pdrop, training=TRAIN_FLAG())
        squeeze = time_distributed_projection(dropped, name='ffn_model', filters=d_model)
        return squeeze
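
This is the position-wise feed-forward sublayer, FFN(x) = W2 * act(W1 * x + b1) + b2: expand from d_model to d_ff, apply the nonlinearity and dropout, then project back to d_model, identically at every time step. time_distributed_projection is not defined in these snippets; a hedged stand-in, assuming it is simply a dense projection over the last axis (the real helper may be a 1x1 convolution instead):

import tensorflow as tf

def time_distributed_projection_sketch(x, name, filters):
    # For a (B, T, C) input, tf.layers.dense already applies the same weights
    # at every time step, which is exactly "time distributed".
    return tf.layers.dense(x, filters, use_bias=True, name=name)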
Example #4
 def encode(self, x=None):
     x = super(PositionalLookupTableEmbeddings, self).encode(x) * math.sqrt(
         self.dsz)
     B, T, C = get_shape_as_list(x)
     signal = get_timing_signal_1d(T,
                                   C,
                                   min_timescale=1.0,
                                   max_timescale=self.max_timescale,
                                   start_index=0)
     return x + signal
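
get_timing_signal_1d supplies the sinusoidal positional encoding from "Attention Is All You Need"; the embeddings are first scaled by sqrt(dsz) so they are not swamped by the unit-amplitude sinusoids. A NumPy sketch of the signal implied by the min_timescale/max_timescale arguments, assuming an even channel count; the returned (1, T, C) signal broadcasts over the batch:

import numpy as np

def timing_signal_1d_sketch(length, channels, min_timescale=1.0, max_timescale=1.0e4, start_index=0):
    # Half the channels get sines, half cosines, with wavelengths spaced
    # geometrically between min_timescale and max_timescale.
    position = np.arange(start_index, start_index + length, dtype=np.float32)
    num_timescales = channels // 2
    log_increment = np.log(max_timescale / min_timescale) / max(num_timescales - 1, 1)
    inv_timescales = min_timescale * np.exp(np.arange(num_timescales, dtype=np.float32) * -log_increment)
    scaled = position[:, None] * inv_timescales[None, :]               # (T, channels // 2)
    signal = np.concatenate([np.sin(scaled), np.cos(scaled)], axis=1)  # (T, channels)
    return signal[None, :, :]                                          # (1, T, channels)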
Example #5
def ffn(x, scope, pdrop, d_ff=None, activation_type='relu'):
    with tf.variable_scope(scope):
        d_model = get_shape_as_list(x)[-1]
        if d_ff is None:
            d_ff = 4 * d_model
        act = tf_activation(activation_type)
        expansion = act(time_distributed_projection(x, name='ffn_ff', filters=d_ff))
        dropped = tf.layers.dropout(expansion, pdrop, training=TRAIN_FLAG())
        squeeze = time_distributed_projection(dropped, name='ffn_model', filters=d_model)
        return squeeze
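
This version differs from Example #3 only in making d_ff optional: when it is omitted, the hidden width defaults to the 4 * d_model expansion used in the original Transformer (512 -> 2048). A hypothetical call, assuming x has shape (B, T, 512):

h_default = ffn(x, 'ffn_default', pdrop=0.1)            # d_ff falls back to 4 * 512 = 2048
h_narrow = ffn(x, 'ffn_narrow', pdrop=0.1, d_ff=1024)   # explicit override of the expansion width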
Example #6
def transformer_encoder(x, src_mask, scope, num_heads, pdrop, scale=True, activation_type='relu', d_ff=None):

    with tf.variable_scope(scope):
        d_model = get_shape_as_list(x)[-1]
        if d_ff is None:
            d_ff = 4 * d_model
        x = layer_norm(x, 'ln_1')
        q, k, v = self_attention_qkv(x, d_model)
        a = multi_headed_attention(q, k, v, 'attn', d_model, num_heads, pdrop, scale=scale, mask=src_mask)
        x = x + tf.layers.dropout(a, pdrop, training=TRAIN_FLAG())
        x = layer_norm(x, 'ln_2')
        m = ffn(x, 'ffn', pdrop, d_ff=d_ff, activation_type=activation_type)
        h = x + tf.layers.dropout(m, pdrop, training=TRAIN_FLAG())
        return h
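
transformer_encoder is a single block. A full encoder stacks several of them under distinct variable scopes, and because each block normalizes on the way in, a final layer_norm on the way out is common. A hypothetical wiring sketch reusing the functions above (the layer count and the trailing norm are assumptions, not something shown in these snippets):

def transformer_encoder_stack_sketch(x, src_mask, num_heads, pdrop, num_layers=6, scope='encoder'):
    with tf.variable_scope(scope):
        for i in range(num_layers):
            x = transformer_encoder(x, src_mask, 'layer_%d' % i, num_heads, pdrop)
        return layer_norm(x, 'ln_out')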
Example #7
def transformer_encoder(x, src_mask, scope, num_heads, pdrop, scale=True, activation_type='relu', d_ff=None):

    with tf.variable_scope(scope):
        d_model = get_shape_as_list(x)[-1]
        if d_ff is None:
            d_ff = 4 * d_model
        x = layer_norm(x, 'ln_1')
        q, k, v = self_attention_qkv(x, d_model)
        a = multi_headed_attention(q, k, v, 'attn', d_model, num_heads, pdrop, scale=scale, mask=src_mask)
        x = x + tf.layers.dropout(a, pdrop, training=TRAIN_FLAG())
        x = layer_norm(x, 'ln_2')
        m = ffn(x, 'ffn', pdrop, d_ff=d_ff, activation_type=activation_type)
        h = x + tf.layers.dropout(m, pdrop, training=TRAIN_FLAG())
        return h
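
src_mask has to broadcast against the per-head attention scores, which have shape (B, num_heads, T_q, T_k). The exact masking convention inside multi_headed_attention is not shown here, but a common construction (an assumption) is a padding mask built from sequence lengths, 1 for real tokens and 0 for padding, shaped (B, 1, 1, T):

import tensorflow as tf

def make_src_mask_sketch(lengths, max_len):
    # Hypothetical padding mask; broadcasts over heads and query positions.
    mask = tf.sequence_mask(lengths, max_len, dtype=tf.float32)   # (B, T)
    return tf.expand_dims(tf.expand_dims(mask, 1), 1)             # (B, 1, 1, T)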
Example #8
def transformer_decoder(tgt, src, src_mask, tgt_mask, scope, num_heads, pdrop, scale=True, activation_type='relu', d_ff=None):
    with tf.variable_scope(scope):
        d_model = get_shape_as_list(tgt)[-1]
        if d_ff is None:
            d_ff = 4 * d_model

        tgt = layer_norm(tgt, 'ln_1')

        q, k, v = self_attention_qkv(tgt, d_model)
        self_attn = multi_headed_attention(q, k, v, 'self_attn', d_model, num_heads, pdrop, scale=scale, mask=tgt_mask)
        tgt = tgt + tf.layers.dropout(self_attn, pdrop, training=TRAIN_FLAG())
        tgt = layer_norm(tgt, 'ln_2')

        q, k, v = low_order_projection_qkv(tgt, src, src, d_model)
        # Mask at zeros???
        src_attn = multi_headed_attention(q, k, v, "dual_attn", d_model, num_heads, pdrop, scale=scale, mask=src_mask)
        tgt = tgt + tf.layers.dropout(src_attn, pdrop, training=TRAIN_FLAG())

        tgt = layer_norm(tgt, 'ln_3')
        m = ffn(tgt, 'ffn', pdrop, d_ff=d_ff, activation_type=activation_type)
        h = tgt + tf.layers.dropout(m, pdrop, training=TRAIN_FLAG())
        return h
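
In the decoder, tgt_mask must also stop each position from attending to later positions, on top of any padding. A hedged sketch of that causal ("subsequent") mask in the same broadcastable layout, using tf.linalg.band_part (tf.matrix_band_part in older TF 1.x releases) to keep the lower triangle:

import tensorflow as tf

def make_tgt_mask_sketch(T):
    # Hypothetical causal mask: position t may only attend to positions <= t.
    causal = tf.linalg.band_part(tf.ones([T, T], dtype=tf.float32), -1, 0)  # lower triangle of ones
    return tf.reshape(causal, [1, 1, T, T])                                 # broadcasts over batch and heads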
Example #9
def transformer_decoder(src, tgt, src_mask, tgt_mask, scope, num_heads, pdrop, scale=True, activation_type='relu', d_ff=None):
    with tf.variable_scope(scope):
        d_model = get_shape_as_list(tgt)[-1]
        if d_ff is None:
            d_ff = 4 * d_model

        tgt = layer_norm(tgt, 'ln_1')

        q, k, v = self_attention_qkv(tgt, d_model)
        self_attn = multi_headed_attention(q, k, v, 'self_attn', d_model, num_heads, pdrop, scale=scale, mask=tgt_mask)
        tgt = tgt + tf.layers.dropout(self_attn, pdrop, training=TRAIN_FLAG())
        tgt = layer_norm(tgt, 'ln_2')

        q, k, v = low_order_projection_qkv(tgt, src, src, d_model)
        # Mask at zeros???
        src_attn = multi_headed_attention(q, k, v, "dual_attn", d_model, num_heads, pdrop, scale=scale, mask=src_mask)
        tgt = tgt + tf.layers.dropout(src_attn, pdrop, training=TRAIN_FLAG())

        tgt = layer_norm(tgt, 'ln_3')
        m = ffn(tgt, 'ffn', pdrop, d_ff=d_ff, activation_type=activation_type)
        h = tgt + tf.layers.dropout(m, pdrop, training=TRAIN_FLAG())
        return h
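
This variant takes (src, tgt, ...) rather than Example #8's (tgt, src, ...); the body is identical, so only the call sites change. A hypothetical sketch of how the encoder and decoder blocks compose into one seq2seq forward pass, following this (src, tgt, ...) order; the layer count and final norms are assumptions:

def transformer_seq2seq_sketch(src_embed, tgt_embed, src_mask, tgt_mask, num_heads, pdrop, num_layers=2):
    h = src_embed
    for i in range(num_layers):
        h = transformer_encoder(h, src_mask, 'encode_%d' % i, num_heads, pdrop)
    h = layer_norm(h, 'encode_ln')
    out = tgt_embed
    for i in range(num_layers):
        # src (the encoder output) first, then tgt, per this signature
        out = transformer_decoder(h, out, src_mask, tgt_mask, 'decode_%d' % i, num_heads, pdrop)
    return layer_norm(out, 'decode_ln')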
Example #10
def split_heads(x, num_heads):
    shp = get_shape_as_list(x)
    dsz = shp[-1]
    r = tf.reshape(x, shp[:-1] + [num_heads, dsz // num_heads])
    # (B, T, num_heads, d_k) -> (B, num_heads, T, d_k)
    return tf.transpose(r, [0, 2, 1, 3])
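
split_heads produces the (B, num_heads, T, d_k) layout that lets one batched matmul compute attention for every head at once. A minimal sketch of scaled dot-product attention, softmax(Q K^T / sqrt(d_k)) V, on that layout, with masking and dropout omitted:

import tensorflow as tf

def scaled_dot_product_attention_sketch(q, k, v):
    # q, k, v: (B, num_heads, T, d_k), i.e. the output of split_heads.
    d_k = tf.cast(tf.shape(q)[-1], tf.float32)
    scores = tf.matmul(q, k, transpose_b=True) / tf.sqrt(d_k)   # (B, H, T_q, T_k)
    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, v)                                # (B, H, T_q, d_k)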
Example #11
 def encode(self, x=None):
     x = super(PositionalLookupTableEmbeddings, self).encode(x) * math.sqrt(self.dsz)
     B, T, C = get_shape_as_list(x)
     signal = get_timing_signal_1d(T, C, min_timescale=1.0, max_timescale=self.max_timescale, start_index=0)
     return x + signal
Example #12
def split_heads(x, num_heads):
    shp = get_shape_as_list(x)
    dsz = shp[-1]
    r = tf.reshape(x, shp[:-1] + [num_heads, dsz // num_heads])
    # (B, T, num_heads, d_k) -> (B, num_heads, T, d_k)
    return tf.transpose(r, [0, 2, 1, 3])
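
Putting the pieces together: split_heads, per-head scaled dot-product attention, combine_heads, and a final output projection is the rough shape of the multi_headed_attention used in the encoder/decoder blocks above. A hypothetical composition, reusing the definitions above; the library's real function also handles masking, dropout, and the scale flag, and the 'attn_out' projection name is made up here:

def multi_headed_attention_sketch(q, k, v, d_model, num_heads):
    q = split_heads(q, num_heads)                       # (B, H, T, d_k)
    k = split_heads(k, num_heads)
    v = split_heads(v, num_heads)
    scale = (d_model // num_heads) ** -0.5
    weights = tf.nn.softmax(tf.matmul(q, k, transpose_b=True) * scale, axis=-1)
    ctx = combine_heads(tf.matmul(weights, v))          # back to (B, T, d_model)
    return time_distributed_projection(ctx, name='attn_out', filters=d_model)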