def wgan_critic(h):
    with C.layers.default_options(init=C.normal(0.02), pad=True, bias=False):
        h = C.leaky_relu(Convolution2D((3, 3), 32, strides=2, bias=True)(h), alpha=0.2)
        h = C.leaky_relu(LayerNormalization()(Convolution2D((3, 3), 64, strides=2)(h)), alpha=0.2)
        h = C.leaky_relu(LayerNormalization()(Convolution2D((3, 3), 128, strides=2)(h)), alpha=0.2)
        h = C.leaky_relu(LayerNormalization()(Convolution2D((3, 3), 256, strides=2)(h)), alpha=0.2)
        h = C.leaky_relu(LayerNormalization()(Convolution2D((3, 3), 512, strides=2)(h)), alpha=0.2)
        h = C.leaky_relu(LayerNormalization()(Convolution2D((3, 3), 1024, strides=2)(h)), alpha=0.2)
        h = Convolution2D((4, 4), 1, pad=False, strides=1, bias=True)(h)
    return h
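# Hedged usage sketch (not from the original source; assumes `import cntk as C` and the
# definitions above are in scope). With 256x256 RGB inputs, the six stride-2 convolutions
# reduce the feature map to 4x4, and the final 4x4 valid convolution produces a single
# unbounded critic score per image (no sigmoid, as is usual for a WGAN critic).
def example_wgan_critic_usage():
    x = C.input_variable((3, 256, 256))  # assumed input layout: channels x height x width
    score = wgan_critic(x)
    assert score.shape == (1, 1, 1)      # one scalar score per image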
def test_layers_layer_normalization():
    y = C.input_variable(4)
    p = LayerNormalization(name='foo')(y)
    dat = np.array([[1.0, 2.0, 3.0, 4.0]], dtype=np.float32)
    res = p(y).eval({y: dat})

    checkedBias = False
    checkedScale = False
    for param in p.parameters:
        if param.name == "bias":
            assert param.value.shape == y.shape
            checkedBias = True
        elif param.name == "scale":
            assert param.value.shape == y.shape
            checkedScale = True
    assert checkedBias and checkedScale

    mean_dat = np.mean(dat)
    x = dat - mean_dat
    std = np.sqrt(np.mean(x * x))
    epsilon = 0.00001
    np.testing.assert_array_almost_equal(res, x / (std + epsilon), decimal=6,
                                         err_msg="Error in layer normalization computation")
def MultiHeadAttentionBlock(num_heads, model_dim, obey_sequence_order: bool = None, max_seq_len: int = None,
                            key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                            query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                            value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                            init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                            initial_scale=1, initial_bias=0, name=''):
    """ Multi-head attention block as described in "Attention is all you need", https://arxiv.org/abs/1706.03762

    The multi-head attention block comes with a residual connection and a layer norm.

    Example:
        a = C.sequence.input_variable(10)
        b = MultiHeadAttentionBlock(2, 10)(a, a, a)

        assert b.shape == (10, )

    Arguments:
        num_heads (int): number of attention heads
        model_dim (int): number of hidden dim in final output of multi-head attention
        obey_sequence_order: do not let attention peek into future values
        max_seq_len: max sequence length possible, used to ensure that sequence order is obeyed
        key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta

    Returns:
        :class:`~cntk.ops.functions.Function`:

    """
    attention_layer = MultiHeadAttention(num_heads, model_dim, obey_sequence_order, max_seq_len,
                                         key_init=key_init, key_init_bias=key_init_bias,
                                         query_init=query_init, query_init_bias=query_init_bias,
                                         value_init=value_init, value_init_bias=value_init_bias,
                                         init=init, init_bias=init_bias, name='MultiheadAttention')

    layernorm = LayerNormalization(initial_scale=initial_scale, initial_bias=initial_bias, name='LayerNorm')

    @C.Function
    def inner(query, key, value):
        attended = attention_layer(query, key, value)
        skip_connect_attended = attended + query  # residual connection onto the query
        normed_skip_connect_attended = layernorm(skip_connect_attended)
        return normed_skip_connect_attended

    return _inject_name(inner, name)
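# Hedged sketch (not from the original source; assumes `import cntk as C`). Besides the
# self-attention example in the docstring above, the same block can serve as encoder-decoder
# (cross) attention: the query comes from one sequence and the key/value from another. The
# residual connection is added to the query, so the query dimension must equal model_dim.
def example_cross_attention_block():
    query = C.sequence.input_variable(10)
    encoded = C.sequence.input_variable(10)
    attended = MultiHeadAttentionBlock(num_heads=2, model_dim=10)(query, encoded, encoded)
    assert attended.shape == (10,)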
def test_layers_layer_normalization():
    y = C.input_variable(4)
    p = LayerNormalization(name='foo')(y)
    dat = np.array([[1.0, 2.0, 3.0, 4.0]], dtype=np.float32)
    res = p(y).eval({y: dat})

    mean_dat = np.mean(dat)
    x = dat - mean_dat
    std = np.sqrt(np.mean(x * x))
    epsilon = 0.00001
    np.testing.assert_array_almost_equal(res, x / (std + epsilon), decimal=6,
                                         err_msg="Error in layer normalization computation")
def TransformerDecoderBlock(num_heads: int, model_dim: int, intermediate_dim: int, dropout_rate: float = None,
                            obey_sequence_order: bool = True, max_seq_len: int = None,
                            mha1_key_init=default_override_or(C.glorot_uniform()), mha1_key_init_bias=default_override_or(0),
                            mha1_query_init=default_override_or(C.glorot_uniform()), mha1_query_init_bias=default_override_or(0),
                            mha1_value_init=default_override_or(C.glorot_uniform()), mha1_value_init_bias=default_override_or(0),
                            mha1_init=default_override_or(C.glorot_uniform()), mha1_init_bias=default_override_or(0),
                            mha1_initial_scale=1, mha1_initial_bias=0,
                            mha2_key_init=default_override_or(C.glorot_uniform()), mha2_key_init_bias=default_override_or(0),
                            mha2_query_init=default_override_or(C.glorot_uniform()), mha2_query_init_bias=default_override_or(0),
                            mha2_value_init=default_override_or(C.glorot_uniform()), mha2_value_init_bias=default_override_or(0),
                            mha2_init=default_override_or(C.glorot_uniform()), mha2_init_bias=default_override_or(0),
                            mha2_initial_scale=1, mha2_initial_bias=0,
                            intermediate_init=default_override_or(C.glorot_uniform()), intermediate_init_bias=default_override_or(0),
                            init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                            initial_scale=1, initial_bias=0):
    """ Decoder block of transformer as described in "Attention is all you need", https://arxiv.org/abs/1706.03762

    Consists of two multi-head attention blocks followed by a position-wise feed-forward dense layer,
    each with a residual connection and layer norm.

    Arguments:
        num_heads (int): number of attention heads
        model_dim (int): number of hidden dim in final output of multi-head attention
        intermediate_dim (int): hidden/intermediate dimension within position-wise feed-forward layer
        dropout_rate (float): probability of dropping out an element in the position-wise feed-forward
        obey_sequence_order (bool, defaults True): do not let attention peek into future values
        max_seq_len (int): max sequence length possible, used to ensure that sequence order is obeyed
        mha1_key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha1_key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha1_query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha1_query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha1_value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha1_value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha1_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha1_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha1_initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        mha1_initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
        mha2_key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha2_key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha2_query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha2_query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha2_value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha2_value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha2_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha2_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha2_initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        mha2_initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
        intermediate_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        intermediate_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta

    Returns:
        :class:`~cntk.ops.functions.Function`:

    """
    mha_block1 = MultiHeadAttentionBlock(num_heads=num_heads, model_dim=model_dim,
                                         obey_sequence_order=obey_sequence_order, max_seq_len=max_seq_len,
                                         key_init=mha1_key_init, key_init_bias=mha1_key_init_bias,
                                         query_init=mha1_query_init, query_init_bias=mha1_query_init_bias,
                                         value_init=mha1_value_init, value_init_bias=mha1_value_init_bias,
                                         init=mha1_init, init_bias=mha1_init_bias,
                                         initial_scale=mha1_initial_scale, initial_bias=mha1_initial_bias)

    mha_block2 = MultiHeadAttentionBlock(num_heads=num_heads, model_dim=model_dim,
                                         obey_sequence_order=False, max_seq_len=None,
                                         key_init=mha2_key_init, key_init_bias=mha2_key_init_bias,
                                         query_init=mha2_query_init, query_init_bias=mha2_query_init_bias,
                                         value_init=mha2_value_init, value_init_bias=mha2_value_init_bias,
                                         init=mha2_init, init_bias=mha2_init_bias,
                                         initial_scale=mha2_initial_scale, initial_bias=mha2_initial_bias)

    feed_forward = PositionwiseFeedForward(model_dim, intermediate_dim, dropout_rate=dropout_rate,
                                           intermediate_init=intermediate_init, intermediate_init_bias=intermediate_init_bias,
                                           init=init, init_bias=init_bias)

    layernorm = LayerNormalization(initial_scale, initial_bias)

    @C.Function
    def block(encoded, x):
        inner = mha_block1(x, x, x)                           # masked self-attention over the decoder input
        inner = mha_block2(inner, encoded, encoded)           # attention over the encoder output
        output = layernorm(ResNetBlock(feed_forward)(inner))  # position-wise feed-forward with residual + layer norm
        return output

    return block
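# Hedged usage sketch (not from the original source; assumes `import cntk as C`). The decoder
# block takes the encoder output and the (shifted) target sequence, both with the same model
# dimension. Because obey_sequence_order defaults to True, max_seq_len must be supplied so the
# causal mask can be constructed.
def example_transformer_decoder_block():
    encoded = C.sequence.input_variable(10)
    target = C.sequence.input_variable(10)
    decoder = TransformerDecoderBlock(num_heads=2, model_dim=10, intermediate_dim=100,
                                      dropout_rate=0.1, max_seq_len=100)
    decoded = decoder(encoded, target)
    assert decoded.shape == (10,)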
def TransformerEncoderBlock(num_heads: int, model_dim: int, intermediate_dim: int, dropout_rate: float = None,
                            obey_sequence_order: bool = None, max_seq_len: int = None,
                            key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                            query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                            value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                            mha_init=default_override_or(C.glorot_uniform()), mha_init_bias=default_override_or(0),
                            mha_initial_scale=1, mha_initial_bias=0,
                            intermediate_init=default_override_or(C.glorot_uniform()), intermediate_init_bias=default_override_or(0),
                            init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                            initial_scale=1, initial_bias=0, name=''):
    """ Encoder block of transformer as described in "Attention is all you need", https://arxiv.org/abs/1706.03762

    Consists of one multi-head attention block followed by a position-wise feed-forward dense layer,
    each with a residual connection and layer norm.

    Arguments:
        num_heads (int): number of attention heads
        model_dim (int): number of hidden dim in final output of multi-head attention
        intermediate_dim (int): hidden/intermediate dimension within position-wise feed-forward layer
        dropout_rate (float): probability of dropping out an element in the position-wise feed-forward
        obey_sequence_order: do not let attention peek into future values
        max_seq_len: max sequence length possible, used to ensure that sequence order is obeyed
        key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha_initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        mha_initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
        intermediate_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        intermediate_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta

    Returns:
        :class:`~cntk.ops.functions.Function`:

    """
    mha_block = MultiHeadAttentionBlock(num_heads, model_dim, obey_sequence_order, max_seq_len,
                                        key_init=key_init, key_init_bias=key_init_bias,
                                        query_init=query_init, query_init_bias=query_init_bias,
                                        value_init=value_init, value_init_bias=value_init_bias,
                                        init=mha_init, init_bias=mha_init_bias,
                                        initial_scale=mha_initial_scale, initial_bias=mha_initial_bias,
                                        name='SelfAttention')

    feed_forward = PositionwiseFeedForward(model_dim, intermediate_dim, dropout_rate=dropout_rate,
                                           intermediate_init=intermediate_init, intermediate_init_bias=intermediate_init_bias,
                                           init=init, init_bias=init_bias, name='PWFF')

    layernorm = LayerNormalization(initial_scale, initial_bias, name='LayerNorm')

    @C.Function
    def block(x):
        self_attended = mha_block(x, C.alias(x), C.alias(x))
        hidden = feed_forward(self_attended)
        output = layernorm(hidden + self_attended)  # residual connection
        return output

    return _inject_name(block, name)  # consider change to BlockFunction
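# Hedged usage sketch (not from the original source; assumes `import cntk as C`). The encoder
# block needs no masking, so obey_sequence_order and max_seq_len are left at their defaults.
def example_transformer_encoder_block():
    a = C.sequence.input_variable(10)
    encoded = TransformerEncoderBlock(num_heads=2, model_dim=10, intermediate_dim=100,
                                      dropout_rate=0.1)(a)
    assert encoded.shape == (10,)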