import cntk as C
from cntk.layers import Convolution2D, LayerNormalization


def wgan_critic(h):
    # WGAN critic: strided 3x3 convolutions with layer normalization and leaky ReLU.
    with C.layers.default_options(init=C.normal(0.02), pad=True, bias=False):
        h = C.leaky_relu(Convolution2D((3, 3), 32, strides=2, bias=True)(h),
                         alpha=0.2)

        # Five more strided conv blocks with layer normalization, doubling the
        # number of filters each time.
        for num_filters in (64, 128, 256, 512, 1024):
            h = C.leaky_relu(LayerNormalization()(Convolution2D((3, 3),
                                                                num_filters,
                                                                strides=2)(h)),
                             alpha=0.2)

        h = Convolution2D((4, 4), 1, pad=False, strides=1, bias=True)(h)
        return h
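
The critic can be applied to an image input variable. Here is a minimal usage sketch, not from the original source; the 256x256 RGB input shape is an assumption, chosen so that the six stride-2 convolutions leave a 4x4 map for the final unpadded 4x4 convolution.

# Usage sketch; the input shape is an illustrative assumption.
images = C.input_variable((3, 256, 256))
critic_score = wgan_critic(images)  # -> shape (1, 1, 1): one scalar score per image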
Example #2
import numpy as np
import cntk as C
from cntk.layers import LayerNormalization


def test_layers_layer_normalization():
    y = C.input_variable(4)
    p = LayerNormalization(name='foo')(y)

    dat = np.array([[1.0, 2.0, 3.0, 4.0]], dtype=np.float32)
    res = p(y).eval({y: dat})

    checkedBias = False
    checkedScale = False
    for param in p.parameters:
        if param.name == "bias":
            assert param.value.shape == y.shape
            checkedBias = True
        elif param.name == "scale":
            assert param.value.shape == y.shape
            checkedScale = True

    assert checkedBias and checkedScale

    mean_dat = np.mean(dat)
    x = dat - mean_dat
    std = np.sqrt(np.mean(x * x))
    epsilon = 0.00001

    np.testing.assert_array_almost_equal(res, x / (std + epsilon), decimal=6,
                                         err_msg="Error in layer normalization computation")
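
For reference, the NumPy check above is the layer normalization formula with the learned scale (gamma) still at its initial value 1 and the bias (beta) at 0:

    y_i = \gamma \frac{x_i - \mu}{\sigma + \epsilon} + \beta, \qquad
    \mu = \frac{1}{d} \sum_{j=1}^{d} x_j, \qquad
    \sigma = \sqrt{\frac{1}{d} \sum_{j=1}^{d} (x_j - \mu)^2}

with d = 4 and epsilon = 1e-5 here; note that epsilon is added to the standard deviation, matching x / (std + epsilon) in the test.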
Example #3
def MultiHeadAttentionBlock(num_heads, model_dim, obey_sequence_order: bool = None, max_seq_len: int = None,
                            key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                            query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                            value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                            init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                            initial_scale=1, initial_bias=0, name=''):
    """ Multi head attention block as described in "Attention is all you need", https://arxiv.org/abs/1706.03762

    Multi-head attention block comes with a residual connection and a layer norm.

    Example:
        a = C.sequence.input_variable(10)
        b = MultiHeadAttentionBlock(2, 10)(a, a, a)

        assert b.shape == (10, )

    Arguments:
        num_heads (int): number of attention heads
        model_dim (int): number of hidden dim in final output of multi-head attention
        obey_sequence_order: do not let attention peek into future values
        max_seq_len: max sequence length possible, used to ensure that sequence order is obeyed
        key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
        name (str, defaults to ''): the name of the function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:

    """
    attention_layer = MultiHeadAttention(num_heads, model_dim, obey_sequence_order, max_seq_len,
                                         key_init=key_init, key_init_bias=key_init_bias,
                                         query_init=query_init, query_init_bias=query_init_bias,
                                         value_init=value_init, value_init_bias=value_init_bias,
                                         init=init, init_bias=init_bias, name='MultiheadAttention')

    layernorm = LayerNormalization(initial_scale=initial_scale, initial_bias=initial_bias, name='LayerNorm')

    @C.Function
    def inner(query, key, value):
        attended = attention_layer(query, key, value)
        skip_connect_attended = attended + query
        normed_skip_connect_attended = layernorm(skip_connect_attended)
        return normed_skip_connect_attended

    return _inject_name(inner, name)
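
A self-attention usage sketch with illustrative dimensions (the numbers are assumptions, not from the source). Because the block adds the query back onto the attention output, the query's feature dimension must equal model_dim.

# Self-attention usage sketch; dimensions are illustrative assumptions.
seq = C.sequence.input_variable(512)
attended = MultiHeadAttentionBlock(num_heads=8, model_dim=512)(seq, seq, seq)
assert attended.shape == (512,)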
Example #4
import numpy as np
import cntk as C
from cntk.layers import LayerNormalization


def test_layers_layer_normalization():
    y = C.input_variable(4)
    p = LayerNormalization(name='foo')(y)

    dat = np.array([[1.0, 2.0, 3.0, 4.0]], dtype=np.float32)
    res = p(y).eval({y: dat})

    mean_dat = np.mean(dat)
    x = dat - mean_dat
    std = np.sqrt(np.mean(x * x))
    epsilon = 0.00001

    np.testing.assert_array_almost_equal(res, x / (std + epsilon), decimal=6,
                                         err_msg="Error in layer normalization computation")
Example #5
def TransformerDecoderBlock(num_heads: int, model_dim: int, intermediate_dim: int, dropout_rate: float = None,
                            obey_sequence_order: bool = True, max_seq_len: int = None,
                            mha1_key_init=default_override_or(C.glorot_uniform()), mha1_key_init_bias=default_override_or(0),
                            mha1_query_init=default_override_or(C.glorot_uniform()), mha1_query_init_bias=default_override_or(0),
                            mha1_value_init=default_override_or(C.glorot_uniform()), mha1_value_init_bias=default_override_or(0),
                            mha1_init=default_override_or(C.glorot_uniform()), mha1_init_bias=default_override_or(0),
                            mha1_initial_scale=1, mha1_initial_bias=0,
                            mha2_key_init=default_override_or(C.glorot_uniform()), mha2_key_init_bias=default_override_or(0),
                            mha2_query_init=default_override_or(C.glorot_uniform()), mha2_query_init_bias=default_override_or(0),
                            mha2_value_init=default_override_or(C.glorot_uniform()), mha2_value_init_bias=default_override_or(0),
                            mha2_init=default_override_or(C.glorot_uniform()), mha2_init_bias=default_override_or(0),
                            mha2_initial_scale=1, mha2_initial_bias=0,
                            intermediate_init=default_override_or(C.glorot_uniform()),
                            intermediate_init_bias=default_override_or(0),
                            init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                            initial_scale=1, initial_bias=0):
    """ Decoder block of transformer as described in "Attention is all you need", https://arxiv.org/abs/1706.03762

    Consist of 2 multi head attention followed by a dense layer, residual connect and layer norm

    Arguments:
        num_heads (int): number of attention heads
        model_dim (int): number of hidden dim in final output of multi-head attention
        intermediate_dim (int): hidden/ intermediate dimension within position-wise feed-forward layer
        dropout_rate (float): probability of dropping out an element in the position-wise feed-forward
        obey_sequence_order (bool, defaults True): do not let attention peek into future values
        max_seq_len (int): max sequence length possible, used to ensure that sequence order is obeyed
        mha1_key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha1_key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha1_query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha1_query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha1_value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha1_value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha1_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha1_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha1_initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        mha1_initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
        mha2_key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha2_key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha2_query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha2_query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha2_value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha2_value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha2_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha2_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha2_initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        mha2_initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
        intermediate_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        intermediate_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta

    Returns:
        :class:`~cntk.ops.functions.Function`:

    """
    mha_block1 = MultiHeadAttentionBlock(num_heads=num_heads, model_dim=model_dim,
                                         obey_sequence_order=obey_sequence_order, max_seq_len=max_seq_len,
                                         key_init=mha1_key_init, key_init_bias=mha1_key_init_bias,
                                         query_init=mha1_query_init, query_init_bias=mha1_query_init_bias,
                                         value_init=mha1_value_init, value_init_bias=mha1_value_init_bias,
                                         init=mha1_init, init_bias=mha1_init_bias,
                                         initial_scale=mha1_initial_scale, initial_bias=mha1_initial_bias)
    
    mha_block2 = MultiHeadAttentionBlock(num_heads=num_heads, model_dim=model_dim,
                                         obey_sequence_order=False, max_seq_len=None,
                                         key_init=mha2_key_init, key_init_bias=mha2_key_init_bias,
                                         query_init=mha2_query_init, query_init_bias=mha2_query_init_bias,
                                         value_init=mha2_value_init, value_init_bias=mha2_value_init_bias,
                                         init=mha2_init, init_bias=mha2_init_bias,
                                         initial_scale=mha2_initial_scale, initial_bias=mha2_initial_bias)

    feed_forward = PositionwiseFeedForward(model_dim, intermediate_dim, dropout_rate=dropout_rate,
                                           intermediate_init=intermediate_init, intermediate_init_bias=intermediate_init_bias,
                                           init=init, init_bias=init_bias)

    layernorm = LayerNormalization(initial_scale, initial_bias)

    @C.Function
    def block(encoded, x):
        inner = mha_block1(x, x, x)
        inner = mha_block2(inner, encoded, encoded)
        output = layernorm(ResNetBlock(feed_forward)(inner))
        return output

    return block
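
A wiring sketch follows; the dimensions, dropout rate and max_seq_len are illustrative assumptions, not values from the source. Note the argument order of the returned function: the encoder output comes first, then the decoder input.

# Decoder block usage sketch (illustrative assumptions).
encoded = C.sequence.input_variable(512)   # output of the encoder stack
target = C.sequence.input_variable(512)    # (shifted) decoder input sequence
decoder_block = TransformerDecoderBlock(num_heads=8, model_dim=512, intermediate_dim=2048,
                                        dropout_rate=0.1, obey_sequence_order=True,
                                        max_seq_len=100)
decoded = decoder_block(encoded, target)   # argument order: (encoded, x)
assert decoded.shape == (512,)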
Example #6
def TransformerEncoderBlock(num_heads: int, model_dim: int, intermediate_dim: int, dropout_rate: float = None,
                            obey_sequence_order: bool = None, max_seq_len: int = None,
                            key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                            query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                            value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                            mha_init=default_override_or(C.glorot_uniform()), mha_init_bias=default_override_or(0),
                            mha_initial_scale=1, mha_initial_bias=0,
                            intermediate_init=default_override_or(C.glorot_uniform()), intermediate_init_bias=default_override_or(0),
                            init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                            initial_scale=1, initial_bias=0, name=''):
    """ Encoder block of transformer as described in "Attention is all you need", https://arxiv.org/abs/1706.03762

    Consist of 1 multi head attention followed by a dense layer, residual connect and layer norm

    Arguments:
        num_heads (int): number of attention heads
        model_dim (int): number of hidden dim in final output of multi-head attention
        intermediate_dim (int): hidden/ intermediate dimension within position-wise feed-forward layer
        dropout_rate (float): probability of dropping out an element in the position-wise feed-forward
        obey_sequence_order: do not let attention peek into future values
        max_seq_len: max sequence length possible, used to ensure that sequence order is obeyed
        key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha_initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        mha_initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
        intermediate_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        intermediate_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
        name (str, defaults to ''): the name of the function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:

    """
    mha_block = MultiHeadAttentionBlock(num_heads, model_dim, obey_sequence_order, max_seq_len,
                                        key_init=key_init, key_init_bias=key_init_bias,
                                        query_init=query_init, query_init_bias=query_init_bias,
                                        value_init=value_init, value_init_bias=value_init_bias,
                                        init=mha_init, init_bias=mha_init_bias,
                                        initial_scale=mha_initial_scale, initial_bias=mha_initial_bias,
                                        name='SelfAttention')

    feed_forward = PositionwiseFeedForward(model_dim, intermediate_dim, dropout_rate=dropout_rate,
                                           intermediate_init=intermediate_init, intermediate_init_bias=intermediate_init_bias,
                                           init=init, init_bias=init_bias, name='PWFF')

    layernorm = LayerNormalization(initial_scale, initial_bias, name='LayerNorm')

    @C.Function
    def block(x):
        self_attended = mha_block(x, C.alias(x), C.alias(x))
        hidden = feed_forward(self_attended)
        output = layernorm(hidden + self_attended)  # residual connection
        return output

    return _inject_name(block, name)  # consider change to BlockFunction
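
A usage sketch of the encoder block; the dimensions and dropout rate are illustrative assumptions, not values from the source.

# Encoder block usage sketch (illustrative assumptions).
tokens = C.sequence.input_variable(512)
encoder_block = TransformerEncoderBlock(num_heads=8, model_dim=512,
                                        intermediate_dim=2048, dropout_rate=0.1)
encoded = encoder_block(tokens)
assert encoded.shape == (512,)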