def LSTM(shape, cell_shape=None,
         activation=default_override_or(tanh),
         use_peepholes=default_override_or(False),
         init=default_override_or(glorot_uniform()),
         init_bias=default_override_or(0),
         enable_self_stabilization=default_override_or(False),
         name=''):
    """
    Layer factory function that creates an LSTM block.

    Every argument declared with ``default_override_or`` is first resolved
    through ``get_default_override`` (scoped to ``LSTM``); the resolved values
    are then forwarded, together with ``shape``, ``cell_shape`` and ``name``,
    to ``_RecurrentBlock`` with the block type ``'LSTM'``.

    Args:
        shape: forwarded unchanged to ``_RecurrentBlock``
        cell_shape (defaults to `None`): forwarded unchanged to ``_RecurrentBlock``
        activation (defaults to ``tanh``): activation function, subject to default overrides
        use_peepholes (defaults to `False`): subject to default overrides
        init (defaults to ``glorot_uniform()``): weight initializer, subject to default overrides
        init_bias (defaults to 0): bias initializer, subject to default overrides
        enable_self_stabilization (defaults to `False`): subject to default overrides
        name (str, defaults to ''): forwarded unchanged to ``_RecurrentBlock``

    Returns:
        The object returned by ``_RecurrentBlock``.
    """
    # Resolve the overridable options in the same order as they are declared.
    resolved = dict(
        activation=get_default_override(LSTM, activation=activation),
        use_peepholes=get_default_override(LSTM, use_peepholes=use_peepholes),
        init=get_default_override(LSTM, init=init),
        init_bias=get_default_override(LSTM, init_bias=init_bias),
        enable_self_stabilization=get_default_override(
            LSTM, enable_self_stabilization=enable_self_stabilization),
    )
    return _RecurrentBlock('LSTM', shape, cell_shape, name=name, **resolved)
def RNNUnit(shape, cell_shape=None,
            activation=default_override_or(sigmoid),
            init=default_override_or(glorot_uniform()),
            init_bias=default_override_or(0),
            enable_self_stabilization=default_override_or(False),
            name=''):
    '''
    RNNUnit(shape, cell_shape=None, activation=sigmoid, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, name='')

    This is a deprecated name for :func:`~cntk.layers.blocks.RNNStep`. Use that name instead.

    Calling it emits a ``DeprecationWarning`` and then builds the same
    ``'RNNStep'`` recurrent block (peepholes disabled) from the resolved
    options.

    Args:
        shape: forwarded unchanged to ``_RecurrentBlock``
        cell_shape (defaults to `None`): forwarded unchanged to ``_RecurrentBlock``
        activation (defaults to ``sigmoid``): activation function, subject to default overrides
        init (defaults to ``glorot_uniform()``): weight initializer, subject to default overrides
        init_bias (defaults to 0): bias initializer, subject to default overrides
        enable_self_stabilization (defaults to `False`): subject to default overrides
        name (str, defaults to ''): forwarded unchanged to ``_RecurrentBlock``

    Returns:
        The object returned by ``_RecurrentBlock``.
    '''
    activation = get_default_override(RNNUnit, activation=activation)
    init = get_default_override(RNNUnit, init=init)
    init_bias = get_default_override(RNNUnit, init_bias=init_bias)
    enable_self_stabilization = get_default_override(
        RNNUnit, enable_self_stabilization=enable_self_stabilization)

    # stacklevel=2 attributes the warning to the code that called RNNUnit(),
    # which is where the deprecated name actually has to be replaced.
    warnings.warn(
        'This name will be removed in future versions. Please use '
        'RNNStep(...) instead, which is identical except for its name',
        DeprecationWarning,
        stacklevel=2)

    return _RecurrentBlock('RNNStep', shape, cell_shape,
                           activation=activation,
                           use_peepholes=False,
                           init=init,
                           init_bias=init_bias,
                           enable_self_stabilization=enable_self_stabilization,
                           name=name)
def WeightDroppedLSTM(shape, dropout_rate, cell_shape=None,
                      activation=default_override_or(tanh),
                      use_peepholes=default_override_or(False),
                      init=default_override_or(glorot_uniform()),
                      init_bias=default_override_or(0),
                      enable_self_stabilization=default_override_or(False),
                      seed=SentinelValueForAutoSelectRandomSeed,
                      name=''):
    '''
    WeightDroppedLSTM(shape, dropout_rate, cell_shape=None, activation=tanh, use_peepholes=False, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, seed=SentinelValueForAutoSelectRandomSeed, name='')

    Layer factory function to create a weight-dropped LSTM block for use
    inside a recurrence. The block implements one step of the recurrence and
    is stateless. It accepts the previous state as its first two arguments,
    and outputs its new state as a two-valued tuple ``(h,c)``.

    Example:
     >>> # a typical recurrent weight-dropped LSTM layer
     >>> from cntkx.layers import *
     >>> lstm_layer = Recurrence(WeightDroppedLSTM(500, 0.1))

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        dropout_rate: forwarded to ``_RecurrentBlock`` as ``dropout_rate``
         (presumably the probability of dropping a weight - confirm against
         ``_RecurrentBlock``)
        cell_shape (tuple, defaults to `None`): if given, then the output state is first computed at `cell_shape`
         and linearly projected to `shape`
        activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
        use_peepholes (bool, defaults to `False`): forwarded to ``_RecurrentBlock``
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        seed (defaults to ``SentinelValueForAutoSelectRandomSeed``): forwarded to ``_RecurrentBlock`` as ``seed``
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, prev_c, input) -> (h, c)`` that implements one step of a recurrent LSTM layer.
    '''
    # Resolve the overridable options in declaration order.
    resolved = dict(
        activation=get_default_override(WeightDroppedLSTM, activation=activation),
        use_peepholes=get_default_override(WeightDroppedLSTM, use_peepholes=use_peepholes),
        init=get_default_override(WeightDroppedLSTM, init=init),
        init_bias=get_default_override(WeightDroppedLSTM, init_bias=init_bias),
        enable_self_stabilization=get_default_override(
            WeightDroppedLSTM, enable_self_stabilization=enable_self_stabilization),
    )
    return _RecurrentBlock('WeightDroppedLSTM', shape, cell_shape,
                           dropout_rate=dropout_rate, seed=seed,
                           name=name, **resolved)
def IndyLSTM(shape,
             activation=default_override_or(tanh),
             init=default_override_or(glorot_uniform()),
             init_bias=default_override_or(0),
             enable_self_stabilization=default_override_or(False),
             name=''):
    """
    Implementation of Independently Recurrent Long Short-term Memory cells: IndyLSTMs by Gonnet and Deselaers.
    Paper can be found at https://arxiv.org/abs/1903.08023

    IndyLSTM differ from regular LSTM cells in that the recurrent weights are not modeled as a full matrix,
    but as a diagonal matrix, i.e. the output and state of each LSTM cell depends on the inputs and its
    own output/state, as opposed to the input and the outputs/states of all the cells in the layer.

    The number of parameters per IndyLSTM layer, and thus the number of FLOPS per evaluation, is linear in the
    number of nodes in the layer, as opposed to quadratic for regular LSTM layers, resulting in potentially both
    smaller and faster models.

    The block is always created without a cell shape, without peepholes and
    with a dropout rate of 0.

    Example:
     >>> # an IndyLSTM recurrent layer
     >>> from cntkx.layers import *
     >>> indy_lstm_layer = Recurrence(IndyLSTM(500))

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, prev_c, input) -> (h, c)`` that implements one step of a recurrent IndyLSTM layer.
    """
    # Resolve the overridable options in declaration order.
    resolved = dict(
        activation=get_default_override(IndyLSTM, activation=activation),
        init=get_default_override(IndyLSTM, init=init),
        init_bias=get_default_override(IndyLSTM, init_bias=init_bias),
        enable_self_stabilization=get_default_override(
            IndyLSTM, enable_self_stabilization=enable_self_stabilization),
    )
    # Fixed settings: no cell shape, no peepholes, no dropout.
    return _RecurrentBlock('IndyLSTM', shape, None,
                           use_peepholes=False,
                           dropout_rate=0,
                           seed=SentinelValueForAutoSelectRandomSeed,
                           name=name, **resolved)
def MultiHeadAttentionBlock(num_heads, model_dim, obey_sequence_order: bool = None, max_seq_len: int = None,
                            key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                            query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                            value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                            init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                            initial_scale=1, initial_bias=0, name=''):
    """ Multi head attention block as described in "Attention is all you need", https://arxiv.org/abs/1706.03762

    The block applies multi-head attention, adds the query back onto the
    attended result (residual connection) and layer-normalizes the sum.

    Example:
        a = C.sequence.input_variable(10)
        b = MultiHeadAttentionBlock(2, 10)(a, a, a)

        assert b.shape == (10, )

    Arguments:
        num_heads (int): number of attention heads
        model_dim (int): number of hidden dim in final output of multi-head attention
        obey_sequence_order: do not let attention peek into future values
        max_seq_len: max sequence length possible, used to ensure that sequence order is obeyed
        key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
        name (str, defaults to ''): name injected into the returned function

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(query, key, value) -> layernorm(attention(query, key, value) + query)``.

    """
    # Initializers forwarded verbatim to the attention layer.
    attention_inits = dict(
        key_init=key_init, key_init_bias=key_init_bias,
        query_init=query_init, query_init_bias=query_init_bias,
        value_init=value_init, value_init_bias=value_init_bias,
        init=init, init_bias=init_bias,
    )
    mha = MultiHeadAttention(num_heads, model_dim, obey_sequence_order, max_seq_len,
                             name='MultiheadAttention', **attention_inits)
    norm = LayerNormalization(initial_scale=initial_scale, initial_bias=initial_bias, name='LayerNorm')

    @C.Function
    def inner(query, key, value):
        # attention -> residual connection with the query -> layer norm
        return norm(mha(query, key, value) + query)

    return _inject_name(inner, name)
def IndRNN(shape,
           activation=default_override_or(relu),
           init=default_override_or(glorot_uniform()),
           init_bias=default_override_or(0),
           enable_self_stabilization=default_override_or(False),
           name=''):
    """
    IndRNN implementation found in "Independently Recurrent Neural Network (IndRNN): Building A Longer and
    Deeper RNN" by Li, et al (https://arxiv.org/abs/1803.04831).

    IndRNN are RNNs where neurons in each layer are independent from each other, and the cross-channel
    information is obtained through stacking multiple layers.

    It has been shown that an IndRNN can be easily regulated to prevent the gradient exploding and vanishing
    problems while allowing the network to learn long-term dependencies. Moreover, an IndRNN can work with
    non-saturated activation functions such as relu (rectified linear unit) and be still trained robustly.
    Multiple IndRNNs can be stacked to construct a network that is deeper than the existing RNNs.
    Experimental results have shown that the proposed IndRNN is able to process very long
    sequences (over 5000 time steps), can be used to construct very deep networks (21 layers used in the
    experiment) and still be trained robustly. Better performances have been achieved on various tasks by
    using IndRNNs compared with the traditional RNN and LSTM.

    IndRNN also enables the use of the relu activation, which is more efficient to compute than sigmoid and
    leads to faster convergence during training. You may consider initialising the recurrent weights using
    a uniform distribution from 0 to 1.

    The original code is available at: https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne.

    Example:
     >>> # a plain relu RNN layer
     >>> from cntkx.layers import *
     >>> relu_rnn_layer = Recurrence(IndRNN(500))

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        activation (:class:`~cntk.ops.functions.Function`, defaults to relu): function to apply at the end, e.g. `relu`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, input) -> h`` where ``h = activation(input @ W + prev_h * R + b)``
    """
    # Resolve the overridable options in declaration order.
    resolved = dict(
        activation=get_default_override(IndRNN, activation=activation),
        init=get_default_override(IndRNN, init=init),
        init_bias=get_default_override(IndRNN, init_bias=init_bias),
        enable_self_stabilization=get_default_override(
            IndRNN, enable_self_stabilization=enable_self_stabilization),
    )
    # Fixed settings: no cell shape, no peepholes, no dropout.
    return _RecurrentBlock('IndRNN', shape, None,
                           use_peepholes=False,
                           dropout_rate=0,
                           seed=SentinelValueForAutoSelectRandomSeed,
                           name=name, **resolved)
def GRU(shape, cell_shape=None,
        activation=default_override_or(tanh),
        init=default_override_or(glorot_uniform()),
        init_bias=default_override_or(0),
        enable_self_stabilization=default_override_or(False),
        name=''):
    '''
    GRU(shape, cell_shape=None, activation=tanh, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, name='')

    Layer factory function to create a GRU block for use inside a recurrence.
    The block performs a single recurrence step and holds no state of its
    own: it takes the previous state as its first argument and returns the
    new state.

    Example:
     >>> # a gated recurrent layer
     >>> from cntk.layers import *
     >>> gru_layer = Recurrence(GRU(500))

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        cell_shape (tuple, defaults to `None`): if given, then the output state is first computed at `cell_shape`
         and linearly projected to `shape`
        activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, input) -> h`` that implements one step of a recurrent GRU layer.
    '''
    # Resolve the overridable options in declaration order.
    resolved = dict(
        activation=get_default_override(GRU, activation=activation),
        init=get_default_override(GRU, init=init),
        init_bias=get_default_override(GRU, init_bias=init_bias),
        enable_self_stabilization=get_default_override(
            GRU, enable_self_stabilization=enable_self_stabilization),
    )
    # A GRU block never uses peepholes.
    return _RecurrentBlock('GRU', shape, cell_shape,
                           use_peepholes=False, name=name, **resolved)
def LinearAttentionModel(hidden_dim: int, model_dim: int,
                         key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                         query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                         value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                         name=''):
    """ Convenience wrapper in the style of cntk.layers.AttentionModel

    Builds a ``LinearAttention`` layer from the given arguments and returns a
    plain Python function with an encoder/decoder calling convention.

    Arguments:
        hidden_dim (int): forwarded to ``LinearAttention`` as ``hidden_dim``
        model_dim (int): forwarded to ``LinearAttention`` as ``model_dim``
        key_init, key_init_bias: forwarded to ``LinearAttention``
        query_init, query_init_bias: forwarded to ``LinearAttention``
        value_init, value_init_bias: forwarded to ``LinearAttention``
        name (str, defaults to ''): forwarded to ``LinearAttention`` as ``name``

    Returns:
        A function ``(encoder_hidden_state, decoder_hidden_state)`` that calls
        the attention layer with the decoder state as first argument and the
        encoder state as both second and third argument.
    """
    projection_inits = dict(
        key_init=key_init, key_init_bias=key_init_bias,
        query_init=query_init, query_init_bias=query_init_bias,
        value_init=value_init, value_init_bias=value_init_bias,
    )
    attention = LinearAttention(hidden_dim=hidden_dim, model_dim=model_dim,
                                name=name, **projection_inits)

    def model(encoder_hidden_state, decoder_hidden_state):
        # presumably (query, key, value): decoder state attends over the
        # encoder state - confirm against LinearAttention
        attended = attention(decoder_hidden_state, encoder_hidden_state, encoder_hidden_state)
        return attended

    return model
def __getitem__(self, arg):
    '''
    Slicing of a Variable. E.g. var[2:3] will translate into slice(var, axis=0, begin_index=2, end_index=3)

    Each entry of the index is applied to one axis, from left to right:

    * ``int``: treated as the one-element slice ``i:i+1``.
    * ``slice``: becomes an ``ops.slice()`` call; only a step of 1 (or no
      step) is accepted. A missing start or stop is passed as 0, and a slice
      whose begin and end are both 0 (e.g. ``:`` or ``0:0``) leaves the axis
      untouched.
    * ``tuple`` / ``list`` of ints: each listed index is sliced out
      separately and the pieces are spliced back together along the same
      axis. This is not NumPy advanced indexing; no broadcasting is done.
    * ``Ellipsis``: the entries after it are addressed relative to the end.

    Args:
        arg: a single index or a tuple of indices of the kinds listed above

    Returns:
        the result of the chained slice/splice operations, or ``self`` if no
        entry required an operation

    Raises:
        ValueError: if a slice has a step other than 1
        IndexError: if an entry has an unsupported type, if a tuple/list
         entry contains a non-int, or if a tuple/list entry is empty
    '''
    from . import ops

    # int or slice: normalize into a tuple of int or tuple of slice
    if not isinstance(arg, tuple):
        arg = (arg,)
    r = self
    axis0 = 0  # offset added to the positional axis of every index entry

    from cntk.default_options import get_global_option, get_default_override, default_override_or
    keras_mode_flag = get_global_option('align_axis', 0)
    if keras_mode_flag == 1:
        if (getattr(self, 'dynamic_axes') is not None and len(self.dynamic_axes) > 0):
            axis0 = -get_default_override(None, axis_offset=default_override_or(len(self.dynamic_axes)))

    for axis, s in enumerate(arg):
        if s is Ellipsis:  # ellipsis means index relative to end after this point
            axis0 = -len(arg)
            continue
        if isinstance(s, int):  # int: normalize into a slice
            s = slice(s, s + 1)

        if isinstance(s, slice):
            if s.step is not None and s.step != 1:
                # TODO: This is not hard to implement in SliceNode.
                raise ValueError("slicing with a step other than 1 is "
                                 "currently not supported")
            # implement as a CNTK slice() operation
            begin = s.start or 0
            end = s.stop or 0
            if begin != 0 or end != 0:
                r = ops.slice(r, axis=axis + axis0, begin_index=begin, end_index=end)
        elif isinstance(s, (tuple, list)):
            # Select multiple elements from the same dimension. This is
            # different from NumPy's advanced indexing, since we just go
            # axis by axis from left to right and don't do any
            # broadcasting.
            if not s:
                # Previously this surfaced as an opaque "list index out of
                # range" from the accumulator below; same exception type,
                # clearer message.
                raise IndexError('an empty list or tuple is not supported as index')
            # Apply the same axis offset as the slice branch, so that list
            # indices after an Ellipsis (or in align-axis mode) address the
            # same axis a slice in that position would.
            target_axis = axis + axis0
            slice_accum = []
            for idx in s:
                if not isinstance(idx, int):
                    raise IndexError(
                        'indices have to be of type int and not "%s"' % type(idx))
                slice_accum.append(ops.slice(r, axis=target_axis, begin_index=idx, end_index=idx + 1))
            if len(slice_accum) > 1:
                r = ops.splice(*slice_accum, axis=target_axis)
            else:
                r = slice_accum[0]
        else:
            raise IndexError(
                'type "%s" is not supported as index' % type(s))

    return r
def __getitem__(self, arg):
    '''
    Slicing of a Variable. E.g. var[2:3] will translate into slice(var, axis=0, begin_index=2, end_index=3)

    Each entry of the index is applied to one axis, from left to right:

    * ``int``: treated as the one-element slice ``i:i+1``.
    * ``slice``: becomes an ``ops.slice()`` call; only a step of 1 (or no
      step) is accepted. A missing start or stop is passed as 0, and a slice
      whose begin and end are both 0 (e.g. ``:`` or ``0:0``) leaves the axis
      untouched.
    * ``tuple`` / ``list`` of ints: each listed index is sliced out
      separately and the pieces are spliced back together along the same
      axis. This is not NumPy advanced indexing; no broadcasting is done.
    * ``Ellipsis``: the entries after it are addressed relative to the end.

    Args:
        arg: a single index or a tuple of indices of the kinds listed above

    Returns:
        the result of the chained slice/splice operations, or ``self`` if no
        entry required an operation

    Raises:
        ValueError: if a slice has a step other than 1
        IndexError: if an entry has an unsupported type, if a tuple/list
         entry contains a non-int, or if a tuple/list entry is empty
    '''
    from . import ops

    # int or slice: normalize into a tuple of int or tuple of slice
    if not isinstance(arg, tuple):
        arg = (arg,)
    r = self
    axis0 = 0  # offset added to the positional axis of every index entry

    from cntk.default_options import get_global_option, get_default_override, default_override_or
    keras_mode_flag = get_global_option('align_axis', 0)
    if keras_mode_flag == 1:
        if (getattr(self, 'dynamic_axes') is not None and len(self.dynamic_axes) > 0):
            axis0 = -get_default_override(None, axis_offset=default_override_or(len(self.dynamic_axes)))

    for axis, s in enumerate(arg):
        if s is Ellipsis:  # ellipsis means index relative to end after this point
            axis0 = -len(arg)
            continue
        if isinstance(s, int):  # int: normalize into a slice
            s = slice(s, s + 1)

        if isinstance(s, slice):
            if s.step is not None and s.step != 1:
                # TODO: This is not hard to implement in SliceNode.
                raise ValueError("slicing with a step other than 1 is "
                                 "currently not supported")
            # implement as a CNTK slice() operation
            begin = s.start or 0
            end = s.stop or 0
            if begin != 0 or end != 0:
                r = ops.slice(r, axis=axis + axis0, begin_index=begin, end_index=end)
        elif isinstance(s, (tuple, list)):
            # Select multiple elements from the same dimension. This is
            # different from NumPy's advanced indexing, since we just go
            # axis by axis from left to right and don't do any
            # broadcasting.
            if not s:
                # Previously this surfaced as an opaque "list index out of
                # range" from the accumulator below; same exception type,
                # clearer message.
                raise IndexError('an empty list or tuple is not supported as index')
            # Apply the same axis offset as the slice branch, so that list
            # indices after an Ellipsis (or in align-axis mode) address the
            # same axis a slice in that position would.
            target_axis = axis + axis0
            slice_accum = []
            for idx in s:
                if not isinstance(idx, int):
                    raise IndexError(
                        'indices have to be of type int and not "%s"' % type(idx))
                slice_accum.append(ops.slice(r, axis=target_axis, begin_index=idx, end_index=idx + 1))
            if len(slice_accum) > 1:
                r = ops.splice(*slice_accum, axis=target_axis)
            else:
                r = slice_accum[0]
        else:
            raise IndexError(
                'type "%s" is not supported as index' % type(s))

    return r
def uniform(shape, dtype=default_override_or(np.float32), low=0.0, high=1.0, seed=auto_select, name=''):
    """uniform(shape, dtype=default_override_or(np.float32), low=0.0, high=1.0, seed=auto_select, name='')
    Draws samples from the uniform distribution over the half-open interval [`low`,`high`).

    ``shape`` and ``dtype`` are passed through ``sanitize_random_args``
    before the native ``uniform_random`` operation is created.

    Args:
        shape (tuple): shape of the output (entries are independent random draws)
        dtype (np.float32 or np.float64): data type. Default is np.float32.
        low (float): lower end of the range of the random numbers
        high (float): upper end of the range of the random numbers
        seed (int): pseudo random number generator seed (default: automatically select a unique seed)
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Examples:
        >>> u = C.random.uniform((2,3), seed=98052)
        >>> u.eval(device=C.cpu()) # explicitly setting cpu because this is tested on multiple platforms; leave it unspecified in your code
        array([[ 0.931785,  0.814722,  0.479606],
               [ 0.937468,  0.004351,  0.185131]], dtype=float32)
    """
    from cntk.cntk_py import uniform_random as draw_uniform

    clean_shape, clean_dtype = sanitize_random_args(shape, dtype)
    return draw_uniform(clean_shape, clean_dtype, low, high, seed, name)
def bernoulli(shape, dtype=default_override_or(np.float32), mean=0.5, seed=auto_select, name=''):
    """bernoulli(shape, dtype=default_override_or(np.float32), mean=0.5, seed=auto_select, name='')
    Draws samples from the Bernoulli distribution whose success probability is `mean`.

    ``shape`` and ``dtype`` are passed through ``sanitize_random_args``
    before the native ``bernoulli_random`` operation is created.

    Args:
        shape (tuple): shape of the output (entries are independent random draws)
        dtype (np.float32 or np.float64): data type. Default is np.float32.
        mean (float): success probability
        seed (int): pseudo random number generator seed (default: automatically select a unique seed)
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Examples:
        >>> b = C.random.bernoulli((2,3), seed=98052)
        >>> b.eval(device=C.cpu()) # explicitly setting cpu because this is tested on multiple platforms; leave it unspecified in your code
        array([[ 1.,  1.,  0.],
               [ 1.,  0.,  0.]], dtype=float32)
    """
    from cntk.cntk_py import bernoulli_random as draw_bernoulli

    clean_shape, clean_dtype = sanitize_random_args(shape, dtype)
    return draw_bernoulli(clean_shape, clean_dtype, mean, seed, name)
def gumbel(shape, dtype=default_override_or(np.float32), loc=0.0, scale=1.0, seed=auto_select, name=''):
    """gumbel(shape, dtype=default_override_or(np.float32), loc=0.0, scale=1.0, seed=auto_select, name='')
    Draws samples from the Gumbel distribution with location `loc` and scale `scale`.

    ``shape`` and ``dtype`` are passed through ``sanitize_random_args``
    before the native ``gumbel_random`` operation is created.

    Args:
        shape (tuple): shape of the output (entries are independent random draws)
        dtype (np.float32 or np.float64): data type. Default is np.float32.
        loc (float): location of the distribution
        scale (float): scale of the distribution
        seed (int): pseudo random number generator seed (default: automatically select a unique seed)
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Examples:
        >>> g = C.random.gumbel((2,3), seed=98052)
        >>> g.eval(device=C.cpu()) # explicitly setting cpu because this is tested on multiple platforms; leave it unspecified in your code
        array([[-0.987713, -0.522298,  0.425918],
               [-1.019599,  5.435177,  1.586071]], dtype=float32)

    See also:
        `The Gumbel-Max Trick
        <https://hips.seas.harvard.edu/blog/2013/04/06/the-gumbel-max-trick-for-discrete-distributions/>`_.
    """
    from cntk.cntk_py import gumbel_random as draw_gumbel

    clean_shape, clean_dtype = sanitize_random_args(shape, dtype)
    return draw_gumbel(clean_shape, clean_dtype, loc, scale, seed, name)
def normal(shape, dtype=default_override_or(np.float32), mean=0.0, scale=1.0, seed=auto_select, name=''):
    """normal(shape, dtype=default_override_or(np.float32), mean=0.0, scale=1.0, seed=auto_select, name='')
    Draws samples from the normal distribution with mean `mean` and standard deviation `scale`.

    ``shape`` and ``dtype`` are passed through ``sanitize_random_args``
    before the native ``normal_random`` operation is created.

    Args:
        shape (tuple): shape of the output (entries are independent random draws)
        dtype (np.float32 or np.float64): data type. Default is np.float32.
        mean (float): mean of the distribution
        scale (float): scale (standard deviation) of the distribution
        seed (int): pseudo random number generator seed (default: automatically select a unique seed)
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Examples:
        >>> z = C.random.normal((2,3), seed=98052)
        >>> z.eval(device=C.cpu()) # explicitly setting cpu because this is tested on multiple platforms; leave it unspecified in your code
        array([[ 1.803254,  0.995395, -0.631974],
               [-1.73672 ,  0.005615, -0.340025]], dtype=float32)
    """
    from cntk.cntk_py import normal_random as draw_normal

    clean_shape, clean_dtype = sanitize_random_args(shape, dtype)
    return draw_normal(clean_shape, clean_dtype, mean, scale, seed, name)
def uniform(shape, dtype=default_override_or(np.float32), low=0.0, high=1.0, seed=auto_select, name=''):
    """uniform(shape, dtype=default_override_or(np.float32), low=0.0, high=1.0, seed=auto_select, name='')
    Draws samples from the uniform distribution over the half-open interval [`low`,`high`).

    ``shape`` and ``dtype`` are passed through ``sanitize_random_args``
    before the native ``uniform_random`` operation is created.

    Args:
        shape (tuple): shape of the output (entries are independent random draws)
        dtype (np.float32 or np.float64): data type. Default is np.float32.
        low (float): lower end of the range of the random numbers
        high (float): upper end of the range of the random numbers
        seed (int): pseudo random number generator seed (default: automatically select a unique seed)
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Examples:
        >>> u = C.random.uniform((2,3), seed=98052)
        >>> u.eval(device=C.cpu()) # explicitly setting cpu because this is tested on multiple platforms; leave it unspecified in your code
        array([[ 0.931785,  0.814722,  0.479606],
               [ 0.937468,  0.004351,  0.185131]], dtype=float32)
    """
    from cntk.cntk_py import uniform_random as draw_uniform

    clean_shape, clean_dtype = sanitize_random_args(shape, dtype)
    return draw_uniform(clean_shape, clean_dtype, low, high, seed, name)
def gumbel(shape, dtype=default_override_or(np.float32), loc=0.0, scale=1.0, seed=auto_select, name=''):
    """gumbel(shape, dtype=default_override_or(np.float32), loc=0.0, scale=1.0, seed=auto_select, name='')
    Draws samples from the Gumbel distribution with location `loc` and scale `scale`.

    ``shape`` and ``dtype`` are passed through ``sanitize_random_args``
    before the native ``gumbel_random`` operation is created.

    Args:
        shape (tuple): shape of the output (entries are independent random draws)
        dtype (np.float32 or np.float64): data type. Default is np.float32.
        loc (float): location of the distribution
        scale (float): scale of the distribution
        seed (int): pseudo random number generator seed (default: automatically select a unique seed)
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Examples:
        >>> g = C.random.gumbel((2,3), seed=98052)
        >>> g.eval(device=C.cpu()) # explicitly setting cpu because this is tested on multiple platforms; leave it unspecified in your code
        array([[-0.987713, -0.522298,  0.425918],
               [-1.019599,  5.435177,  1.586071]], dtype=float32)

    See also:
        `The Gumbel-Max Trick
        <https://hips.seas.harvard.edu/blog/2013/04/06/the-gumbel-max-trick-for-discrete-distributions/>`_.
    """
    from cntk.cntk_py import gumbel_random as draw_gumbel

    clean_shape, clean_dtype = sanitize_random_args(shape, dtype)
    return draw_gumbel(clean_shape, clean_dtype, loc, scale, seed, name)
def IndRNNStep(shape, cell_shape=None,
               activation=default_override_or(relu),
               init=default_override_or(glorot_uniform()),
               init_bias=default_override_or(0),
               enable_self_stabilization=default_override_or(False),
               name=''):
    """
    Layer factory function that creates an IndRNN step block via ``IndRNNBlock``.

    Every argument declared with ``default_override_or`` is resolved through
    ``get_default_override`` before being forwarded; peepholes are always
    disabled.

    Args:
        shape: forwarded unchanged to ``IndRNNBlock``
        cell_shape (defaults to `None`): forwarded unchanged to ``IndRNNBlock``
        activation (defaults to ``relu``): activation function, subject to default overrides
        init (defaults to ``glorot_uniform()``): weight initializer, subject to default overrides
        init_bias (defaults to 0): bias initializer, subject to default overrides
        enable_self_stabilization (defaults to `False`): subject to default overrides
        name (str, defaults to ''): forwarded unchanged to ``IndRNNBlock``

    Returns:
        The object returned by ``IndRNNBlock``.
    """
    # Overrides must be looked up under this function, not RNNStep: the
    # lookup is scoped by the function passed in, so using RNNStep ignored
    # any option scope targeting IndRNNStep and picked up RNNStep's instead.
    activation = get_default_override(IndRNNStep, activation=activation)
    init = get_default_override(IndRNNStep, init=init)
    init_bias = get_default_override(IndRNNStep, init_bias=init_bias)
    enable_self_stabilization = get_default_override(
        IndRNNStep, enable_self_stabilization=enable_self_stabilization)

    # NOTE(review): the block type string is left as 'RNNStep' because
    # IndRNNBlock is not visible here and may dispatch on it - confirm.
    return IndRNNBlock('RNNStep', shape, cell_shape,
                       activation=activation,
                       use_peepholes=False,
                       init=init,
                       init_bias=init_bias,
                       enable_self_stabilization=enable_self_stabilization,
                       name=name)
def bernoulli(shape, dtype=default_override_or(np.float32), mean=0.5, seed=auto_select, name=''):
    """bernoulli(shape, dtype=default_override_or(np.float32), mean=0.5, seed=auto_select, name='')
    Draws samples from the Bernoulli distribution whose success probability is `mean`.

    ``shape`` and ``dtype`` are passed through ``sanitize_random_args``
    before the native ``bernoulli_random`` operation is created.

    Args:
        shape (tuple): shape of the output (entries are independent random draws)
        dtype (np.float32 or np.float64): data type. Default is np.float32.
        mean (float): success probability
        seed (int): pseudo random number generator seed (default: automatically select a unique seed)
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Examples:
        >>> b = C.random.bernoulli((2,3), seed=98052)
        >>> b.eval(device=C.cpu()) # explicitly setting cpu because this is tested on multiple platforms; leave it unspecified in your code
        array([[ 1.,  1.,  0.],
               [ 1.,  0.,  0.]], dtype=float32)
    """
    from cntk.cntk_py import bernoulli_random as draw_bernoulli

    clean_shape, clean_dtype = sanitize_random_args(shape, dtype)
    return draw_bernoulli(clean_shape, clean_dtype, mean, seed, name)
def normal(shape, dtype=default_override_or(np.float32), mean=0.0, scale=1.0, seed=auto_select, name=''):
    """normal(shape, dtype=default_override_or(np.float32), mean=0.0, scale=1.0, seed=auto_select, name='')
    Draws samples from the normal distribution with mean `mean` and standard deviation `scale`.

    ``shape`` and ``dtype`` are passed through ``sanitize_random_args``
    before the native ``normal_random`` operation is created.

    Args:
        shape (tuple): shape of the output (entries are independent random draws)
        dtype (np.float32 or np.float64): data type. Default is np.float32.
        mean (float): mean of the distribution
        scale (float): scale (standard deviation) of the distribution
        seed (int): pseudo random number generator seed (default: automatically select a unique seed)
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Examples:
        >>> z = C.random.normal((2,3), seed=98052)
        >>> z.eval(device=C.cpu()) # explicitly setting cpu because this is tested on multiple platforms; leave it unspecified in your code
        array([[ 1.803254,  0.995395, -0.631974],
               [-1.73672 ,  0.005615, -0.340025]], dtype=float32)
    """
    from cntk.cntk_py import normal_random as draw_normal

    clean_shape, clean_dtype = sanitize_random_args(shape, dtype)
    return draw_normal(clean_shape, clean_dtype, mean, scale, seed, name)
def GroupLSTM(shape: int, groups=2,
              activation=default_override_or(tanh),
              init=default_override_or(glorot_uniform()),
              init_bias=default_override_or(0),
              enable_self_stabilization=default_override_or(False),
              name=''):
    """ Implementation of group LSTM, the equivalent concept of group convolution but for recurrent neural networks.

    More details can be found in Efficient Sequence Learning with Group Recurrent Networks
    Gao et al https://www.aclweb.org/anthology/N18-1073/

    While it is parametrically efficient, it uses more gpu memory during training due to the permutation
    of hidden states from the lstm groups.

    Arguments:
        shape (int): shape of desired output; must be divisible by `groups`
        groups (int): number of groups of lstm (defaults 2). The larger the group size, the more parameter efficient.
        activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, prev_c, input) -> (h, c)`` that implements one step of a recurrent GroupLSTM layer.

    Raises:
        TypeError: if `shape` is not an int
        ValueError: if `groups` is smaller than 1 or does not divide `shape`
    """
    # Explicit checks instead of `assert`, which is stripped under `python -O`.
    if not isinstance(shape, int):
        raise TypeError(f"shape must be an int, got {type(shape).__name__}")
    if groups < 1:
        # Otherwise groups == 0 would surface as a bare ZeroDivisionError below.
        raise ValueError(f"groups must be at least 1, got {groups}")
    if shape % groups:
        raise ValueError(
            f"shape ({shape}) must be divisible by groups ({groups})")

    # One independent LSTM per group, each producing shape // groups outputs.
    lstms = [
        C.layers.LSTM(shape // groups,
                      activation=activation,
                      init=init,
                      init_bias=init_bias,
                      enable_self_stabilization=enable_self_stabilization)
        for _ in range(groups)
    ]

    def _permute_groups(parts):
        # Stack the per-group outputs on a new leading axis, swap the axes and
        # flatten back to `shape`.
        stacked = C.splice(*parts, axis=C.Axis.new_leading_axis())
        return C.reshape(C.swapaxes(stacked), (shape, ))

    @C.BlockFunction('GroupLSTM', name)
    def group_lstm(dh, dc, x):
        x_grps = split(x, groups).outputs
        dh_grps = split(dh, groups).outputs
        dc_grps = split(dc, groups).outputs

        h_grps = []
        c_grps = []
        for lstm, h_grp, c_grp, x_grp in zip(lstms, dh_grps, dc_grps, x_grps):
            h, c = lstm(h_grp, c_grp, x_grp).outputs
            h_grps.append(h)
            c_grps.append(c)

        # inter-group correlation through permutation of dimensions
        h_output = _permute_groups(h_grps)
        c_output = _permute_groups(c_grps)
        return h_output, c_output

    return group_lstm
def TransformerEncoderBlock(num_heads: int, model_dim: int, intermediate_dim: int, dropout_rate: float = None,
                            obey_sequence_order: bool = None, max_seq_len: int = None,
                            key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                            query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                            value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                            mha_init=default_override_or(C.glorot_uniform()), mha_init_bias=default_override_or(0),
                            mha_initial_scale=1, mha_initial_bias=0,
                            intermediate_init=default_override_or(C.glorot_uniform()),
                            intermediate_init_bias=default_override_or(0),
                            init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                            initial_scale=1, initial_bias=0, name=''):
    """ Encoder block of transformer as described in "Attention is all you need",
    https://arxiv.org/abs/1706.03762

    The block chains a self-attention block, a position-wise feed-forward
    layer, a residual connection around the feed-forward layer and a final
    layer normalization.

    Arguments:
        num_heads (int): number of attention heads
        model_dim (int): number of hidden dim in final output of multi-head attention
        intermediate_dim (int): hidden/ intermediate dimension within position-wise feed-forward layer
        dropout_rate (float): probability of dropping out an element in the position-wise feed-forward
        obey_sequence_order: do not let attention peek into future values
        max_seq_len: max sequence length possible, used to ensure that sequence order is obeyed
        key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to
          :func:`~cntk.initializer.glorot_uniform`): initial value of weights `W` of the key projection
        key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0):
          initial value of weights `b` of the key projection
        query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to
          :func:`~cntk.initializer.glorot_uniform`): initial value of weights `W` of the query projection
        query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0):
          initial value of weights `b` of the query projection
        value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to
          :func:`~cntk.initializer.glorot_uniform`): initial value of weights `W` of the value projection
        value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0):
          initial value of weights `b` of the value projection
        mha_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to
          :func:`~cntk.initializer.glorot_uniform`): initial value of weights `W` (passed as ``init``
          to the attention block)
        mha_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0):
          initial value of weights `b` (passed as ``init_bias`` to the attention block)
        mha_initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
          of the attention block
        mha_initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
          of the attention block
        intermediate_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to
          :func:`~cntk.initializer.glorot_uniform`): initial value of weights `W` of the
          intermediate feed-forward layer
        intermediate_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0):
          initial value of weights `b` of the intermediate feed-forward layer
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to
          :func:`~cntk.initializer.glorot_uniform`): initial value of weights `W` of the feed-forward output
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0):
          initial value of weights `b` of the feed-forward output
        initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
          of the final layer normalization
        initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
          of the final layer normalization
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:

    """
    # settings forwarded to the self-attention sub-layer
    attention_options = dict(key_init=key_init, key_init_bias=key_init_bias,
                             query_init=query_init, query_init_bias=query_init_bias,
                             value_init=value_init, value_init_bias=value_init_bias,
                             init=mha_init, init_bias=mha_init_bias,
                             initial_scale=mha_initial_scale, initial_bias=mha_initial_bias,
                             name='SelfAttention')

    # settings forwarded to the position-wise feed-forward sub-layer
    feed_forward_options = dict(dropout_rate=dropout_rate,
                                intermediate_init=intermediate_init,
                                intermediate_init_bias=intermediate_init_bias,
                                init=init, init_bias=init_bias,
                                name='PWFF')

    self_attention = MultiHeadAttentionBlock(num_heads, model_dim, obey_sequence_order, max_seq_len,
                                             **attention_options)
    feed_forward = PositionwiseFeedForward(model_dim, intermediate_dim, **feed_forward_options)
    layer_norm = LayerNormalization(initial_scale, initial_bias, name='LayerNorm')

    @C.Function
    def block(x):
        # query, key and value all come from the same input (self-attention)
        attended = self_attention(x, C.alias(x), C.alias(x))
        projected = feed_forward(attended)
        return layer_norm(projected + attended)  # residual connection

    return _inject_name(block, name)  # consider change to BlockFunction
def MultiHeadAttention(num_heads, model_dim, obey_sequence_order: bool = None, max_seq_len: int = None,
                       key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                       query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                       value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                       init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                       name=''):
    """ Multi-head attention as described in "Attention is all you need",
    https://arxiv.org/abs/1706.03762

    Query, key and value are each projected to ``model_dim``, cut into
    ``num_heads`` equally sized heads, attended head by head with scaled
    dot-product attention, spliced back together and projected once more.

    Example:
        a = C.sequence.input_variable(10)
        b = MultiHeadAttention(2, 10)(a, a, a)

        assert b.shape == (10, )

    Arguments:
        num_heads (int): number of attention heads
        model_dim (int): number of hidden dim in final output of multi-head attention;
          must be divisible by ``num_heads``
        obey_sequence_order: do not let attention peek into future values
        max_seq_len: max sequence length possible, used to ensure that sequence order is obeyed
        key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to
          :func:`~cntk.initializer.glorot_uniform`): initial value of weights `W` of the key projection
        key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0):
          initial value of weights `b` of the key projection
        query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to
          :func:`~cntk.initializer.glorot_uniform`): initial value of weights `W` of the query projection
        query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0):
          initial value of weights `b` of the query projection
        value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to
          :func:`~cntk.initializer.glorot_uniform`): initial value of weights `W` of the value projection
        value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0):
          initial value of weights `b` of the value projection
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to
          :func:`~cntk.initializer.glorot_uniform`): initial value of weights `W` of the output projection
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0):
          initial value of weights `b` of the output projection
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:

    """
    assert model_dim % num_heads == 0, "Model dimension must be divisible by number of heads"

    head_dim = int(model_dim / num_heads)

    # [begin, end) range covered by every head along the feature axis
    head_ranges = [(i * head_dim, (i + 1) * head_dim) for i in range(num_heads)]

    query_linear = Dense(model_dim, init=query_init, init_bias=query_init_bias)
    key_linear = Dense(model_dim, init=key_init, init_bias=key_init_bias)
    value_linear = Dense(model_dim, init=value_init, init_bias=value_init_bias)
    output_linear = Dense(model_dim, init=init, init_bias=init_bias)

    attend = ScaledDotProductAttention(obey_sequence_order, max_seq_len)

    def split_heads(mixed):
        # TODO: re-implement `ScaledDotProductAttention` when cntk has BatchMatMul
        # so there's no need to slice here
        return [C.slice(mixed, 0, begin, end) for begin, end in head_ranges]

    @C.BlockFunction('MultiHeadAttention', name)
    def inner(query, key, value):
        mixed_queries = query_linear(query)  # [#, *] {model_dim,]
        mixed_keys = key_linear(key)  # [#, *] {model_dim,]
        mixed_values = value_linear(value)  # [#, *] {model_dim,]

        # one attention output per head, computed over matching slices
        per_head = zip(split_heads(mixed_queries),
                       split_heads(mixed_keys),
                       split_heads(mixed_values))
        attention_outputs = [attend(q, k, v) for q, k, v in per_head]

        # splice the heads back together and apply the final projection
        return output_linear(C.splice(*attention_outputs))

    return _inject_name(inner, name)
def AttentionModel(attention_dim, attention_span=None, attention_axis=None,
                   init=default_override_or(glorot_uniform()),
                   go_backwards=default_override_or(False),
                   enable_self_stabilization=default_override_or(True), name=''):
    '''
    AttentionModel(attention_dim, attention_span=None, attention_axis=None, init=glorot_uniform(), go_backwards=False, enable_self_stabilization=True, name='')

    Layer factory function to create a function object that implements an attention model
    as described in Bahdanau, et al., "Neural machine translation by jointly learning to align and translate."

    Two implementations are available. When both `attention_span` and
    `attention_axis` are left at `None`, the attention is computed over the
    unpacked encoder sequence. When both are given, the deprecated
    fixed-window implementation is used and a `DeprecationWarning` is issued.

    Raises:
        ValueError: if only one of `attention_span` / `attention_axis` is given,
          or if `attention_span` is not positive
    '''
    init = get_default_override(AttentionModel, init=init)
    go_backwards = get_default_override(AttentionModel, go_backwards=go_backwards)
    enable_self_stabilization = get_default_override(AttentionModel, enable_self_stabilization=enable_self_stabilization)

    # attention_span and attention_axis must be given together or not at all
    if attention_span is None:
        if attention_axis is not None:
            raise ValueError('attention_span cannot be None when attention_axis is not None')
    elif attention_span <= 0:
        raise ValueError('attention_span must be a positive value')
    elif attention_axis is None:
        raise ValueError('attention_axis cannot be None when attention_span is not None')

    # the deprecated windowed implementation is selected by passing a span
    compatible_attention_mode = attention_span is not None

    def stabilized_projection(output_dim):
        # self-stabilizer followed by a dense projection over the last axis only
        stabilizer = Stabilizer(enable_self_stabilization=enable_self_stabilization)
        return stabilizer >> Dense(output_dim, init=init, input_rank=1)

    # model parameters
    with default_options(bias=False):  # all the projections have no bias
        attn_proj_enc = stabilized_projection(attention_dim)  # projects input hidden state, keeping span axes intact
        attn_proj_dec = stabilized_projection(attention_dim)  # projects decoder hidden state, but keeping span and beam-search axes intact
        attn_proj_tanh = stabilized_projection(1)  # projects tanh output, keeping span and beam-search axes intact
    attn_final_stab = Stabilizer(enable_self_stabilization=enable_self_stabilization)

    if compatible_attention_mode:
        warn('Specifying non-default values for attention_span and attention_axis has been deprecated since version 2.2. '
             'These arguments will be removed in the future.', DeprecationWarning, stacklevel=2)

        # old attention function
        @Function
        def old_attention(h_enc, h_dec):
            # h_dec is passed wherever only its dynamic axis is needed
            history_axis = h_dec
            # TODO: pull this apart so that we can compute the encoder window only once and apply it to multiple decoders
            # --- encoder state window
            enc_window, enc_valid = PastValueWindow(attention_span, axis=attention_axis,
                                                    go_backwards=go_backwards)(h_enc).outputs
            enc_proj = attn_proj_enc(enc_window)
            # window must be broadcast to every decoder time step
            enc_proj = C.sequence.broadcast_as(enc_proj, history_axis)
            enc_valid = C.sequence.broadcast_as(enc_valid, history_axis)
            # --- decoder state
            # project decoder hidden state
            dec_proj = attn_proj_dec(h_dec)
            tanh_out = C.tanh(dec_proj + enc_proj)  # (attention_span, attention_dim)
            u = attn_proj_tanh(tanh_out)  # (attention_span, 1)
            # logzero-out the unused elements for the softmax denominator
            # TODO: use a less arbitrary number than 50
            u_masked = u + (enc_valid - 1) * 50
            attention_weights = C.softmax(u_masked, axis=attention_axis)  # , name='attention_weights')
            attention_weights = Label('attention_weights')(attention_weights)
            # now take weighted sum over the encoder state vectors
            weighted = C.element_times(C.sequence.broadcast_as(enc_window, history_axis), attention_weights)
            h_att = C.reduce_sum(weighted, axis=attention_axis)
            return attn_final_stab(h_att)

        return _inject_name(old_attention, name)

    # new attention function
    @Function
    def new_attention(encoder_hidden_state, decoder_hidden_state):
        # encode_hidden_state: [#, e] [h]
        # decoder_hidden_state: [#, d] [H]
        enc_unpacked, enc_mask = C.sequence.unpack(encoder_hidden_state, padding_value=0).outputs
        # enc_unpacked: [#] [*=e, h]
        # enc_mask: [#] [*=e]
        enc_proj = C.sequence.broadcast_as(attn_proj_enc(enc_unpacked), decoder_hidden_state)
        # enc_proj: [#, d] [*=e, attention_dim]
        mask = C.sequence.broadcast_as(C.reshape(enc_mask, (1,), 1), decoder_hidden_state)
        # mask: [#, d] [*=e]
        dec_proj = attn_proj_dec(decoder_hidden_state)
        # dec_proj: [#, d] [attention_dim]
        tanh_output = C.tanh(dec_proj + enc_proj)
        # tanh_output: [#, d] [*=e, attention_dim]
        logits = attn_proj_tanh(tanh_output)
        # logits = [#, d] [*=e, 1]
        # padded encoder positions get a very negative logit before the softmax
        masked_logits = C.element_select(mask, logits, C.constant(-1e+30))
        # masked_logits = [#, d] [*=e]
        attention_weights = C.softmax(masked_logits, axis=0)
        attention_weights = Label('attention_weights')(attention_weights)
        # attention_weights = [#, d] [*=e]
        attended = C.reduce_sum(
            attention_weights * C.sequence.broadcast_as(enc_unpacked, attention_weights), axis=0)
        # attended = [#, d] [1, h]
        return attn_final_stab(C.reshape(attended, (), 0, 1))
        # output = [#, d], [h]

    return _inject_name(new_attention, name)
def AttentionModel(attention_dim, attention_span=None, attention_axis=None,
                   init=default_override_or(glorot_uniform()),
                   go_backwards=default_override_or(False),
                   enable_self_stabilization=default_override_or(True), name=''):
    '''
    AttentionModel(attention_dim, attention_span=None, attention_axis=None, init=glorot_uniform(), go_backwards=False, enable_self_stabilization=True, name='')

    Layer factory function to create a function object that implements an attention model
    as described in Bahdanau, et al., "Neural machine translation by jointly learning to align and translate."

    This variant only supports a fixed attention window: both `attention_span`
    and `attention_axis` have to be specified.

    Raises:
        NotImplementedError: if `attention_span` or `attention_axis` is `None`
        ValueError: if `attention_span` is not positive
    '''
    init = get_default_override(AttentionModel, init=init)
    go_backwards = get_default_override(AttentionModel, go_backwards=go_backwards)
    enable_self_stabilization = get_default_override(
        AttentionModel, enable_self_stabilization=enable_self_stabilization)

    # until CNTK can handle multiple nested dynamic loops, we require fixed windows and fake it
    window_is_fixed = attention_span is not None and attention_axis is not None
    if not window_is_fixed:
        raise NotImplementedError(
            'AttentionModel currently requires a fixed attention_span and a static attention_axis to be specified'
        )
    if attention_span <= 0:
        raise ValueError('attention_span must be a positive value')

    def stabilized_projection(output_dim):
        # self-stabilizer followed by a dense projection over the last axis only
        stabilizer = Stabilizer(enable_self_stabilization=enable_self_stabilization)
        return stabilizer >> Dense(output_dim, init=init, input_rank=1)

    # model parameters
    with default_options(bias=False):  # all the projections have no bias
        attn_proj_enc = stabilized_projection(attention_dim)  # projects input hidden state, keeping span axes intact
        attn_proj_dec = stabilized_projection(attention_dim)  # projects decoder hidden state, but keeping span and beam-search axes intact
        attn_proj_tanh = stabilized_projection(1)  # projects tanh output, keeping span and beam-search axes intact
    attn_final_stab = Stabilizer(enable_self_stabilization=enable_self_stabilization)

    # attention function
    @Function
    def attention(h_enc, h_dec):
        # h_dec is passed wherever only its dynamic axis is needed
        history_axis = h_dec
        # TODO: pull this apart so that we can compute the encoder window only once and apply it to multiple decoders
        # --- encoder state window
        enc_window, enc_valid = PastValueWindow(attention_span,
                                                axis=attention_axis,
                                                go_backwards=go_backwards)(h_enc).outputs
        enc_proj = attn_proj_enc(enc_window)
        # window must be broadcast to every decoder time step
        enc_proj = C.sequence.broadcast_as(enc_proj, history_axis)
        enc_valid = C.sequence.broadcast_as(enc_valid, history_axis)
        # --- decoder state
        # project decoder hidden state
        dec_proj = attn_proj_dec(h_dec)
        tanh_out = C.tanh(dec_proj + enc_proj)  # (attention_span, attention_dim)
        u = attn_proj_tanh(tanh_out)  # (attention_span, 1)
        # logzero-out the unused elements for the softmax denominator
        # TODO: use a less arbitrary number than 50
        u_masked = u + (enc_valid - 1) * 50
        attention_weights = C.softmax(u_masked, axis=attention_axis)  # , name='attention_weights')
        attention_weights = Label('attention_weights')(attention_weights)
        # now take weighted sum over the (projected) encoder state vectors
        weighted = C.element_times(enc_proj, attention_weights)
        h_att = C.reduce_sum(weighted, axis=attention_axis)
        return attn_final_stab(h_att)

    return _inject_name(attention, name)
def __getitem__(self, arg):
    '''
    Slicing of a Variable. E.g. var[2:3] will translate into slice(var, axis=0, begin_index=2, end_index=3)

    Supported index elements (one per axis, from left to right):

    * ``int``: treated as the slice ``i:i+1``
    * ``slice``: forwarded to ``ops.slice`` including its step; a slice that
      selects everything with unit step (``:``) is skipped
    * ``tuple`` / ``list`` of ``int``: the listed elements of that axis are
      sliced out one by one and spliced together (no NumPy-style broadcasting)
    * ``Ellipsis``: the indices after it are counted relative to the end

    For an object with more than one output, ``arg`` selects among
    ``self.outputs`` instead.

    Raises:
        KeyError: if ``arg`` cannot select among the outputs of a
          multi-output function
        IndexError: if an index element has an unsupported type
    '''
    from . import ops

    if hasattr(self, 'outputs') and len(self.outputs) > 1:
        try:
            return self.outputs[arg]
        except Exception as e:
            msg = 'Slice for multioutput functions is not supported, ' \
                  'the fallback to select to output requires ' \
                  'that only one index is provided. arg: {}, self: {}'.format(
                      arg, self)
            # keep the original failure attached for easier debugging
            raise KeyError(msg) from e

    # int or slice: normalize into a tuple of int or tuple of slice
    if not isinstance(arg, tuple):
        arg = (arg,)
    r = self
    axis0 = 0

    from cntk.default_options import get_global_option, get_default_override, default_override_or
    keras_mode_flag = get_global_option('align_axis', 0)
    if keras_mode_flag == 1:
        if (getattr(self, 'dynamic_axes') is not None and len(self.dynamic_axes) > 0):
            axis0 = -get_default_override(None, axis_offset=default_override_or(len(self.dynamic_axes)))

    for axis, s in enumerate(arg):
        if s is Ellipsis:  # ellipsis means index relative to end after this point
            axis0 = -len(arg)
            continue
        if isinstance(s, int):  # int: normalize into a slice
            s = slice(s, s + 1)

        # the same axis offset applies to every kind of index element
        target_axis = axis + axis0

        if isinstance(s, slice):
            begin = s.start or 0
            end = s.stop or 0
            # a step-only slice such as [::2] still selects a subset, so it
            # must not be skipped together with the no-op [:]
            has_stride = s.step is not None and s.step != 1
            if begin != 0 or end != 0 or has_stride:
                r = ops.slice(r, axis=target_axis, begin_index=begin, end_index=end, strides=s.step)
        elif isinstance(s, (tuple, list)):
            # Select multiple elements from the same dimension. This is
            # different from NumPy's advanced indexing, since we just go
            # axis by axis from left to right and don't do any
            # broadcasting.
            slice_accum = []
            for idx in s:
                if not isinstance(idx, int):
                    raise IndexError(
                        'indices have to be of type int and not "%s"' %
                        type(idx))
                slice_accum.append(ops.slice(r, axis=target_axis,
                                             begin_index=idx,
                                             end_index=idx + 1))
            if len(slice_accum) > 1:
                r = ops.splice(*slice_accum, axis=target_axis)
            else:
                r = slice_accum[0]
        else:
            raise IndexError(
                'type "%s" is not supported as index' % type(s))

    return r
def LinearAttention(hidden_dim: int, model_dim: int,
                    key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                    query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                    value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                    name=''):
    """ Attention model that is linear in time and memory complexity.
    This is a huge improvement from standard softmax attention models or self-attention
    where the time and memory complexity is quadratic in sequence length.

    This is especially significant since cntk doesn't have any built-in checkpointing functionality
    that saves gpu memory and hence allow the training of Transformer models. With this attention,
    it becomes possible to do transformer training on cntk.

    This implementation addresses the limitation of attentions by expressing the attention
    as a linear dot-product of kernel feature maps and makes use of the associativity property of
    matrix products.

    When query, key and value are all the same, it becomes self-attention.

    For more details refer to "Transformers are RNNs: Fast Autoregressive Transformers with Linear Attention" by
    Katharopoulos et al. (https://arxiv.org/abs/2006.16236)

    Note:
        Key and value must have the same sequence length

    Example:
        a = C.sequence.input_variable(24)
        b = LinearAttention(hidden_dim=32, model_dim=24)(a, a, a)

        assert b.shape == (32, )

    Arguments:
        hidden_dim (int): number of dim in final output; this is the output dim of the projection of value
        model_dim (int): number of dim in the attention (output dim of the query and key projections)
        key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to
          :func:`~cntk.initializer.glorot_uniform`): initial value of weights `W` of the key projection
        key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0):
          initial value of weights `b` of the key projection
        query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to
          :func:`~cntk.initializer.glorot_uniform`): initial value of weights `W` of the query projection
        query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0):
          initial value of weights `b` of the query projection
        value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to
          :func:`~cntk.initializer.glorot_uniform`): initial value of weights `W` of the value projection
        value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0):
          initial value of weights `b` of the value projection
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:

    """
    query_linear = Dense(model_dim, init=query_init, init_bias=query_init_bias)
    key_linear = Dense(model_dim, init=key_init, init_bias=key_init_bias)
    value_linear = Dense(hidden_dim, init=value_init, init_bias=value_init_bias)

    def phi(x):  # kernel feature map: elu(x) + 1
        return C.elu(x) + 1

    @C.Function
    def model(query, key, value):
        q = phi(query_linear(query))
        k = phi(key_linear(key))
        v = value_linear(value)

        # key and value should have the same sequence length
        k_unpacked = C.sequence.unpack(k, padding_value=0, no_mask_output=True)
        # k_unpacked: [#] [*kv=, model_dim]
        v_unpacked = C.sequence.unpack(v, padding_value=0, no_mask_output=True)
        # v_unpacked: [#] [*kv=, hidden_dim]
        kv = C.times(C.swapaxes(k_unpacked), v_unpacked)
        # kv [#] [model_dim, hidden_dim]
        kv_broadcasted = C.sequence.broadcast_as(kv, q)  # this can be reused across queries
        # kv [#, *] [model_dim, hidden_dim]

        numerator = C.squeeze(C.times(C.expand_dims(q, axis=C.Axis.new_leading_axis()), kv_broadcasted))
        # numerator [#, *] [hidden_dim, ]
        denom = C.reduce_sum(q * C.sequence.broadcast_as(C.sequence.reduce_sum(k), q))
        # denom [#, *] [1]

        return numerator / denom

    # apply the requested name, as the sibling layer factories in this file do;
    # previously the `name` argument was accepted but silently ignored
    return _inject_name(model, name)
def Stabilizer(steepness=4, enable_self_stabilization=default_override_or(True), name=''):
    '''
    Stabilizer(steepness=4, enable_self_stabilization=True, name='')

    Layer factory function to create a `Droppo self-stabilizer <https://www.microsoft.com/en-us/research/wp-content/uploads/2016/11/SelfLR.pdf>`_.
    It multiplies its input with a scalar that is learned.

    This takes `enable_self_stabilization` as a flag that allows to disable itself. Useful if this is a global default.
    When disabled, the identity function is returned and no parameter is created.

    Note:
        Some other layers (specifically, recurrent units like :func:`~cntk.layers.blocks.LSTM`) also have the
        option to use the ``Stabilizer()`` layer internally. That is enabled by passing
        `enable_self_stabilization=True` to those layers. In conjunction with those, the rule is that an
        explicit ``Stabilizer()`` must be inserted by the user for the main data input, whereas the recurrent
        layer will own the stabilizer(s) for the internal recurrent connection(s).

    Note:
        Unlike the original paper, which proposed a linear or exponential scalar,
        CNTK uses a sharpened Softplus: 1/steepness ln(1+e^{steepness*beta}).
        The softplus behaves linear for weights around and above 1 (like the linear scalar)
        while guaranteeing positiveness (like the exponential variant) but is also
        more robust by avoiding exploding gradients.

    Example:
     >>> # recurrent model with self-stabilization
     >>> from cntk.layers import *
     >>> with default_options(enable_self_stabilization=True): # enable stabilizers by default for LSTM()
     ...     model = Sequential([
     ...         Embedding(300),
     ...         Stabilizer(),           # stabilizer for main data input of recurrence
     ...         Recurrence(LSTM(512)),  # LSTM owns its own stabilizers for the recurrent connections
     ...         Stabilizer(),
     ...         Dense(10)
     ...     ])

    Args:
        steepness (`int`, defaults to 4): steepness of the sharpened softplus
        enable_self_stabilization (bool, defaults to `True`): a flag that allows to disable itself.
          Useful if this is a global default
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function
    '''
    enable_self_stabilization = get_default_override(
        Stabilizer, enable_self_stabilization=enable_self_stabilization)

    # disabled (typically through global option; otherwise one would not
    # call this in the first place)
    if not enable_self_stabilization:
        return identity

    # parameters bound to this Function
    # initialize so that factor is initially 1 (has no effect)
    initial_alpha = np.log(np.exp(steepness) - 1) / steepness
    alpha = Parameter((), init=initial_alpha, name='alpha')
    scale = softplus(alpha, steepness=steepness)

    # expression
    @BlockFunction('Stabilizer', name)
    def stabilize(x):
        return scale * x

    return stabilize
def LSTM(shape, activation=default_override_or(tanh), weight_drop_rate=None,
         ih_init=default_override_or(glorot_uniform()), ih_bias=default_override_or(0),
         hh_init=default_override_or(glorot_uniform()), hh_bias=default_override_or(0),
         name=''):
    """ PyTorch style implementation of LSTM. Used for loading pytorch pretrained models.

    The difference between this implementation and cntk's one is the slicing
    of the stacked gate projection: pytorch is ifgo but cntk is igfo.
    Pytorch also has 2 biases, but cntk only has one. In this implementation
    the two biases are summed into a single one to speed it up a little more.

    Arguments:
        shape (`int` or `tuple` of `ints`): dimension of the output of this layer
        activation: function applied to the candidate and to the cell state
        weight_drop_rate (float, defaults to `None`): if given, dropout rate
          applied to the hidden-to-hidden weights
        ih_init: initial value of the input-to-hidden weights `W`
        ih_bias: initial value of the input-to-hidden bias
        hh_init: initial value of the hidden-to-hidden weights `H`
        hh_bias: initial value of the hidden-to-hidden bias
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        A function ``(prev_h, prev_c, input) -> (h, c)`` that implements one
        step of a recurrent LSTM layer.
    """
    activation = get_default_override(LSTM, activation=activation)
    ih_init = get_default_override(LSTM, ih_init=ih_init)
    ih_bias = get_default_override(LSTM, ih_bias=ih_bias)
    hh_init = get_default_override(LSTM, hh_init=hh_init)
    hh_bias = get_default_override(LSTM, hh_bias=hh_bias)

    stack_axis = -1
    shape = _as_tuple(shape)

    # the four gate projections are stacked along the last axis
    stacked_dim = shape[stack_axis]
    stacked_shape = shape[:stack_axis] + (stacked_dim * 4,)

    # combine both biases in pytorch into one
    init_bias = ih_bias + hh_bias

    b = Parameter(stacked_shape, init=init_bias, name='b')  # bias
    W = Parameter(_INFERRED + stacked_shape, init=ih_init, name='W')  # input
    H = Parameter(shape + stacked_shape, init=hh_init, name='H')  # hidden-to-hidden

    use_weight_drop = weight_drop_rate is not None
    dropout = C.layers.Dropout(dropout_rate=weight_drop_rate, name='h_dropout') if use_weight_drop else None

    @C.BlockFunction('PT::LSTM', name)
    def lstm(dh, dc, x):
        # projected contribution from input(s), hidden, and bias
        recurrent_weights = dropout(H) if use_weight_drop else H
        proj4 = b + times(x, W) + times(dh, recurrent_weights)

        # slicing layout different from cntk's implementation:
        # split along stack_axis in the order input, forget, g (candidate), output
        it_proj, ft_proj, bit_proj, ot_proj = (
            slice(proj4, stack_axis, k * stacked_dim, (k + 1) * stacked_dim)
            for k in range(4)
        )

        input_gate = sigmoid(it_proj)  # input gate(t)
        gated_candidate = input_gate * activation(bit_proj)  # applied to tanh of input network

        forget_gate = sigmoid(ft_proj)  # forget-me-not gate(t)
        kept_cell = forget_gate * dc  # applied to cell(t-1)

        ct = kept_cell + gated_candidate  # c(t) is sum of both

        output_gate = sigmoid(ot_proj)  # output gate(t)
        ht = output_gate * activation(ct)  # applied to tanh(cell(t))

        return ht, ct

    return lstm
def dense_factored(shapes,  # (shape1, shape2)
                   activation=default_override_or(identity),
                   init={'W1':None, 'W2':None},
                   input_rank=None,
                   map_rank=None,
                   bias=default_override_or(True),
                   init_bias=default_override_or(0),
                   name=''):
    '''
    Create the factored replacement of a dense layer from the two factor
    matrices W1 and W2. The returned function represents the new model:
    the input is multiplied by W1, then by W2, then the optional bias and
    the activation are applied.

    Args:
        shapes : pair of ints ``(shape1, shape2)``; the output dimensions of W1 and W2.
        activation : activation function used for the model.
        init : dict with keys ``'W1'`` and ``'W2'`` holding the two matrices of the factorization.
        input_rank : not supported; must be None.
        map_rank : not supported; must be None.
        bias : whether the model has a bias.
        init_bias : initial bias value.
        name : name of the block function that creates the new model.

    Returns:
        a model that is factored and projected (reduced).
    '''
    # matthaip: Not sure how to handle input tensor of rank > 1
    # or selective flattening of ranks
    assert input_rank is None and map_rank is None and all(isinstance(s, int) for s in shapes)

    # defaults are resolved against the regular Dense layer
    activation = get_default_override(cntk.layers.Dense, activation=activation)
    bias = get_default_override(cntk.layers.Dense, bias=bias)
    init_bias = get_default_override(cntk.layers.Dense, init_bias=init_bias)
    # how to use get_default_override for init parameter?

    inner_shape = _as_tuple(shapes[0])
    outer_shape = _as_tuple(shapes[1])

    if input_rank is not None and map_rank is not None:
        raise ValueError("Dense: input_rank and map_rank cannot be specified at the same time.")

    # If input_rank not given then pass a single _INFERRED;
    # map_rank if given will determine the input_rank.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED

    # parameters bound to this Function
    W1 = Parameter(input_shape + inner_shape, init=init['W1'], name='W1')
    W2 = Parameter(inner_shape + outer_shape, init=init['W2'], name='W2')
    b = Parameter(outer_shape, init=init_bias, name='b') if bias else None

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        r = times(times(x, W1), W2)
        if b:
            r = r + b
        if activation is not None:
            r = activation(r)
        return r

    return dense
def TransformerDecoderBlock(num_heads: int, model_dim: int, intermediate_dim: int, dropout_rate: float = None,
                            obey_sequence_order: bool = True, max_seq_len: int = None,
                            mha1_key_init=default_override_or(C.glorot_uniform()), mha1_key_init_bias=default_override_or(0),
                            mha1_query_init=default_override_or(C.glorot_uniform()), mha1_query_init_bias=default_override_or(0),
                            mha1_value_init=default_override_or(C.glorot_uniform()), mha1_value_init_bias=default_override_or(0),
                            mha1_init=default_override_or(C.glorot_uniform()), mha1_init_bias=default_override_or(0),
                            mha1_initial_scale=1, mha1_initial_bias=0,
                            mha2_key_init=default_override_or(C.glorot_uniform()), mha2_key_init_bias=default_override_or(0),
                            mha2_query_init=default_override_or(C.glorot_uniform()), mha2_query_init_bias=default_override_or(0),
                            mha2_value_init=default_override_or(C.glorot_uniform()), mha2_value_init_bias=default_override_or(0),
                            mha2_init=default_override_or(C.glorot_uniform()), mha2_init_bias=default_override_or(0),
                            mha2_initial_scale=1, mha2_initial_bias=0,
                            intermediate_init=default_override_or(C.glorot_uniform()),
                            intermediate_init_bias=default_override_or(0),
                            init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                            initial_scale=1, initial_bias=0):
    """ Decoder block of the transformer from "Attention is all you need", https://arxiv.org/abs/1706.03762

    The block chains two multi-head attention blocks and then a position-wise
    feed-forward layer wrapped in a residual connection, followed by layer
    normalization. The first attention block attends over the decoder input
    itself; the second uses the first block's output as query and the encoder
    output as key and value.

    Arguments:
        num_heads (int): number of attention heads (used by both attention blocks)
        model_dim (int): number of hidden dim in final output of multi-head attention
        intermediate_dim (int): hidden/ intermediate dimension within position-wise feed-forward layer
        dropout_rate (float): probability of dropping out an element in the position-wise feed-forward
        obey_sequence_order (bool, defaults True): do not let attention peek into future values.
          Only forwarded to the first attention block; the second is always built with ``False``.
        max_seq_len (int): max sequence length possible, used to ensure that sequence order is obeyed.
          Only forwarded to the first attention block; the second is always built with ``None``.
        mha1_key_init, mha1_query_init, mha1_value_init, mha1_init (scalar or NumPy array or
          :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ):
          initial value of weights `W`, forwarded to the first attention block as
          ``key_init``, ``query_init``, ``value_init`` and ``init``
        mha1_key_init_bias, mha1_query_init_bias, mha1_value_init_bias, mha1_init_bias (scalar or
          NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`,
          forwarded to the first attention block under the matching ``*_init_bias`` names
        mha1_initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
          of the first attention block
        mha1_initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
          of the first attention block
        mha2_key_init, mha2_query_init, mha2_value_init, mha2_init: same as the ``mha1_*``
          weight initialisers, but forwarded to the second attention block
        mha2_key_init_bias, mha2_query_init_bias, mha2_value_init_bias, mha2_init_bias: same as the
          ``mha1_*`` bias initialisers, but forwarded to the second attention block
        mha2_initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
          of the second attention block
        mha2_initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
          of the second attention block
        intermediate_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to
          :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`, forwarded to the
          feed-forward layer
        intermediate_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0):
          initial value of weights `b`, forwarded to the feed-forward layer
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to
          :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`, forwarded to the
          feed-forward layer
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of
          weights `b`, forwarded to the feed-forward layer
        initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma of the
          final layer normalization
        initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta of the
          final layer normalization

    Returns:
        :class:`~cntk.ops.functions.Function`: a function ``(encoded, x) -> output``

    """
    # First attention block: attends over the decoder input itself.
    self_attention = MultiHeadAttentionBlock(
        num_heads=num_heads,
        model_dim=model_dim,
        obey_sequence_order=obey_sequence_order,
        max_seq_len=max_seq_len,
        key_init=mha1_key_init,
        key_init_bias=mha1_key_init_bias,
        query_init=mha1_query_init,
        query_init_bias=mha1_query_init_bias,
        value_init=mha1_value_init,
        value_init_bias=mha1_value_init_bias,
        init=mha1_init,
        init_bias=mha1_init_bias,
        initial_scale=mha1_initial_scale,
        initial_bias=mha1_initial_bias,
    )

    # Second attention block: always built without sequence-order masking.
    cross_attention = MultiHeadAttentionBlock(
        num_heads=num_heads,
        model_dim=model_dim,
        obey_sequence_order=False,
        max_seq_len=None,
        key_init=mha2_key_init,
        key_init_bias=mha2_key_init_bias,
        query_init=mha2_query_init,
        query_init_bias=mha2_query_init_bias,
        value_init=mha2_value_init,
        value_init_bias=mha2_value_init_bias,
        init=mha2_init,
        init_bias=mha2_init_bias,
        initial_scale=mha2_initial_scale,
        initial_bias=mha2_initial_bias,
    )

    feed_forward = PositionwiseFeedForward(
        model_dim,
        intermediate_dim,
        dropout_rate=dropout_rate,
        intermediate_init=intermediate_init,
        intermediate_init_bias=intermediate_init_bias,
        init=init,
        init_bias=init_bias,
    )

    output_norm = LayerNormalization(initial_scale, initial_bias)

    @C.Function
    def block(encoded, x):
        attended = self_attention(x, x, x)
        # Query comes from the decoder side; key and value from the encoder.
        attended = cross_attention(attended, encoded, encoded)
        residual_feed_forward = ResNetBlock(feed_forward)
        return output_norm(residual_feed_forward(attended))

    return block
def dense_factored(
        shapes,  # (shape1, shape2)
        activation=default_override_or(identity),
        init={
            'W1': None,
            'W2': None
        },
        input_rank=None,
        map_rank=None,
        bias=default_override_or(True),
        init_bias=default_override_or(0),
        name=''):
    '''
    Build a new model from the two factor matrices W1 and W2.

    The returned function represents the new model: the input is multiplied
    by ``W1``, then by ``W2``; a bias ``b`` is added when enabled and the
    activation is applied when it is not ``None``.

    Args:
        shapes : pair of ints; output dimensions of ``W1`` and ``W2``.
        activation : activation function used for the model (default taken
            from the ``cntk.layers.Dense`` overrides).
        init : dict with entries ``'W1'`` and ``'W2'``, the two matrices
            corresponding to the factorization.
        input_rank : only ``None`` is accepted.
        map_rank : only ``None`` is accepted.
        bias : whether the model has a bias (default taken from the
            ``cntk.layers.Dense`` overrides).
        init_bias : initial bias value.
        name : name of the block function that creates the new model.

    Returns:
        a block function (op name ``'DenseFactored'``) that is factored and
        projected (reduced).
    '''
    # matthaip: Not sure how to handle input tensor of rank > 1
    # or selective flattening of ranks
    assert input_rank is None and map_rank is None
    assert all(isinstance(entry, int) for entry in list(shapes))

    dense_layer = cntk.layers.Dense
    activation = get_default_override(dense_layer, activation=activation)
    bias = get_default_override(dense_layer, bias=bias)
    init_bias = get_default_override(dense_layer, init_bias=init_bias)
    # how to use get_default_override for the init parameter?

    shape_w1 = _as_tuple(shapes[0])
    shape_w2 = _as_tuple(shapes[1])

    both_ranks_given = input_rank is not None and map_rank is not None
    if both_ranks_given:
        raise ValueError(
            "Dense: input_rank and map_rank cannot be specified at the same time."
        )

    # If input_rank not given then pass a single _INFERRED;
    # map_rank if given will determine the input_rank.
    # The dimension inference may still create multiple axes.
    inferred_input = _INFERRED

    # parameters bound to this Function
    # init_weights = _initializer_for(init, Record(output_rank=output_rank))
    W1 = Parameter(inferred_input + shape_w1, init=init['W1'], name='W1')
    W2 = Parameter(shape_w1 + shape_w2, init=init['W2'], name='W2')
    b = None
    if bias:
        b = Parameter(shape_w2, init=init_bias, name='b')

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        projected = times(x, W1)
        projected = times(projected, W2)
        if b:
            projected = projected + b
        if activation is None:
            return projected
        return activation(projected)

    return dense
def __getitem__(self, arg):
    '''
    Slicing of a Variable. E.g. var[2:3] will translate into
    slice(var, axis=0, begin_index=2, end_index=3)

    Args:
        arg: an int, a slice, ``Ellipsis``, a list/tuple of ints (select
            several elements of one axis), or a tuple combining these, one
            entry per axis from left to right.

    Returns:
        The result of applying the corresponding ``ops.slice`` /
        ``ops.splice`` operations to ``self``. For an object with more than
        one output, ``self.outputs[arg]`` is returned instead.

    Raises:
        KeyError: if ``self`` has several outputs and ``arg`` cannot be used
            to index ``self.outputs``.
        IndexError: if an index entry has an unsupported type, or a
            list/tuple entry contains a non-int.
    '''
    from . import ops

    # Multi-output functions cannot be sliced; fall back to picking one of
    # the outputs with the given index.
    if hasattr(self, 'outputs') and len(self.outputs) > 1:
        try:
            return self.outputs[arg]
        except Exception:
            msg = 'Slice for multioutput functions is not supported, ' \
                  'the fallback to select to output requires ' \
                  'that only one index is provided. arg: {}, self: {}'.format(
                      arg, self)
            raise KeyError(msg)

    # int or slice: normalize into a tuple of int or tuple of slice
    if not isinstance(arg, tuple):
        arg = (arg, )
    r = self
    # Offset added to the position of each index entry to obtain the axis
    # that is actually sliced.
    axis0 = 0

    from cntk.default_options import get_global_option, get_default_override, default_override_or
    keras_mode_flag = get_global_option('align_axis', 0)
    if keras_mode_flag == 1:
        # NOTE(review): presumably in this mode index positions also count
        # the dynamic axes, hence the negative shift -- confirm.
        if (getattr(self, 'dynamic_axes') is not None
                and len(self.dynamic_axes) > 0):
            axis0 = -get_default_override(
                None,
                axis_offset=default_override_or(len(self.dynamic_axes)))

    for axis, s in enumerate(arg):
        if s is Ellipsis:
            # ellipsis means index relative to end after this point
            axis0 = -len(arg)
            continue
        if isinstance(s, int):
            # int: normalize into a slice
            s = slice(s, s + 1)

        if isinstance(s, slice):
            begin = s.start or 0
            end = s.stop or 0
            # A slice whose start and stop are both 0/None adds no operation.
            # NOTE(review): this also skips step-only slices such as
            # var[::2]; confirm whether that is intended.
            if begin != 0 or end != 0:
                r = ops.slice(r,
                              axis=axis + axis0,
                              begin_index=begin,
                              end_index=end,
                              strides=s.step)
        elif isinstance(s, (tuple, list)):
            # Select multiple elements from the same dimension. This is
            # different from NumPy's advanced indexing, since we just go
            # axis by axis from left to right and don't do any
            # broadcasting.
            # Use the same effective axis as the slice branch above so that
            # the offset from an Ellipsis or from the align_axis option is
            # honoured here as well.
            target_axis = axis + axis0
            slice_accum = []
            for idx in s:
                if not isinstance(idx, int):
                    raise IndexError(
                        'indices have to be of type int and not "%s"' %
                        type(idx))
                slice_accum.append(
                    ops.slice(r,
                              axis=target_axis,
                              begin_index=idx,
                              end_index=idx + 1))
            if len(slice_accum) > 1:
                r = ops.splice(*slice_accum, axis=target_axis)
            else:
                r = slice_accum[0]
        else:
            raise IndexError('type "%s" is not supported as index' % type(s))

    return r
def Recurrence(step_function,
               go_backwards=default_override_or(False),
               initial_state=default_override_or(0),
               return_full_state=False,
               dropout_rate_input=None,
               dropout_rate_output=None,
               seed=SentinelValueForAutoSelectRandomSeed,
               name=''):
    '''
    Recurrence(step_function, go_backwards=False, initial_state=0, return_full_state=False, name='')

    Layer factory function that implements a recurrent model, including the
    common RNN, LSTM, and GRU recurrences, with optional variational dropout
    on the input and on the output of the recurrence.

    The created function runs a step function recurrently over an input
    sequence. In each step the step function receives a data input as well as
    the output of the previous step. The following pseudo-code represents what
    happens when you call a `Recurrence()` layer::

      # pseudo-code for y = Recurrence(step_function)(x)
      #  x: input sequence of tensors along the dynamic axis
      #  y: resulting sequence of outputs along the same dynamic axis
      y = []              # result sequence goes here
      s = initial_state   # s = output of previous step ("state")
      for x_n in x:       # pseudo-code for looping over all steps of input sequence along its dynamic axis
          s = step_function(s, x_n)  # pass previous state and new data to step_function -> new state
          y.append(s)

    The common step functions are :func:`~cntk.layers.blocks.LSTM`,
    :func:`~cntk.layers.blocks.GRU`, and :func:`~cntk.layers.blocks.RNNStep`,
    but the step function can be any :class:`~cntk.ops.functions.Function` or
    Python function. With a single state variable its signature must be
    ``(h_prev, x) -> h``, where ``h_prev`` is the previous state, ``x`` is the
    new data input, and the output is the new state. The step function is
    called item by item, resulting in a sequence of the same length as the
    input.

    Step functions can have more than one state output, e.g.
    :func:`~cntk.layers.blocks.LSTM`. In this case, the first N arguments are
    the previous state, followed by one more argument that is the data input;
    and its output must be a tuple of N values. By default the recurrence then
    returns the first of the state variables (in the LSTM case, the ``h``),
    while additional state variables are internal (like the LSTM's ``c``).
    If all state variables should be returned, pass ``return_full_state=True``.

    To provide your own step function, just use any
    :class:`~cntk.ops.functions.Function` (or equivalent Python function) with
    a signature as described above. For example, a cumulative sum over a
    sequence can be computed as ``Recurrence(plus)``, where each step consists
    of `plus(s,x_n)`, with `s` being the output of the previous call and hence
    the cumulative sum of all elements up to `x_n`. Another example is a GRU
    layer with projection, which could be realized as
    ``Recurrence(GRU(500) >> Dense(200))``, where the projection is applied to
    the hidden state as fed back to the next step. ``F>>G`` is a short-hand
    for ``Sequential([F, G])``.

    Optionally, the recurrence can run backwards. This is useful for
    constructing bidirectional models.

    ``initial_state`` must be a constant. To pass initial_state as a data
    input, e.g. for a sequence-to-sequence model, use
    :func:`~cntk.layers.sequence.RecurrenceFrom()` instead.

    Note: ``Recurrence()`` is the equivalent to what in functional programming
    is often called ``scanl()``.

    Example:
     >>> from cntk.layers import Sequential
     >>> from cntk.layers.typing import Tensor, Sequence

     >>> # a recurrent LSTM layer
     >>> lstm_layer = Recurrence(LSTM(500))

     >>> # a bidirectional LSTM layer
     >>> # using function tuples to implement a bidirectional LSTM
     >>> bi_lstm_layer = Sequential([(Recurrence(LSTM(250)),                      # first tuple entry: forward pass
     ...                              Recurrence(LSTM(250), go_backwards=True)),  # second: backward pass
     ...                             splice])                                     # splice both on top of each other
     >>> bi_lstm_layer.update_signature(Sequence[Tensor[13]])
     >>> bi_lstm_layer.shape   # shape reflects concatenation of both output states
     (500,)
     >>> tuple(str(axis.name) for axis in bi_lstm_layer.dynamic_axes)  # (note: str() needed only for Python 2.7)
     ('defaultBatchAxis', 'defaultDynamicAxis')

     >>> # custom step function example: using Recurrence() to
     >>> # compute the cumulative sum over an input sequence
     >>> x = C.input_variable(**Sequence[Tensor[2]])
     >>> x0 = np.array([[   3,    2],
     ...                [  13,   42],
     ...                [-100, +100]])
     >>> cum_sum = Recurrence(C.plus, initial_state=Constant([0, 0.5]))
     >>> y = cum_sum(x)
     >>> y(x0)
     [array([[   3. ,    2.5],
             [  16. ,   44.5],
             [ -84. ,  144.5]], dtype=float32)]

    Args:
     step_function (:class:`~cntk.ops.functions.Function` or equivalent Python function):
      This function must have N+1 inputs and N outputs, where N is the number of state variables
      (typically 1 for GRU and plain RNNs, and 2 for LSTMs).
     go_backwards (bool, defaults to ``False``): if ``True`` then run the recurrence from the end
      of the sequence to the start.
     initial_state (scalar or tensor without batch dimension; or a tuple thereof):
      the initial value for the state. This can be a constant or a learnable parameter.
      In the latter case, if the step function has more than 1 state variable,
      this parameter must be a tuple providing one initial state for every state variable.
      A single (non-tuple) value is replicated for every state variable.
     return_full_state (bool, defaults to ``False``): if ``True`` and the step function has more
      than one state variable, then the layer returns all state variables (a tuple of sequences);
      whereas if not given or ``False``, only the first state variable is returned to the caller.
     dropout_rate_input (float): dropout rate for the input; no input dropout if ``None`` or 0
     dropout_rate_output (float): dropout rate for the output; no output dropout if ``None`` or 0
     seed (int): seed for randomisation, passed to both dropout layers
     name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function that accepts one argument (which must be a sequence) and performs the recurrent
        operation on it

    Raises:
        TypeError: if the number of outputs of the step function differs from its number of
         state arguments.
    '''
    # BUGBUG: the cum_sum expression in the docstring should be this:
    # cum_sum = Recurrence(C.plus, initial_state=np.array([0, 0.5]))
    # BUGBUG: whereas passing a NumPy array fails with "TypeError: cannot convert value of dictionary"
    # cum_sum = Recurrence(C.plus, initial_state=Constant([0, 0.5]))

    # Resolve defaults that may have been overridden via default options.
    go_backwards = get_default_override(Recurrence, go_backwards=go_backwards)
    initial_state = _get_initial_state_or_default(
        get_default_override(Recurrence, initial_state=initial_state))

    step_function = _santize_step_function(step_function)

    # Optional dropout layers; a rate of None or 0 disables the layer.
    dropout_input = (VariationalDropout(dropout_rate=dropout_rate_input,
                                        seed=seed,
                                        name='variational_dropout_input')
                     if dropout_rate_input else None)
    dropout_output = (VariationalDropout(dropout_rate=dropout_rate_output,
                                         seed=seed,
                                         name='variational_dropout_output')
                      if dropout_rate_output else None)

    # Signature of the step function: every argument but the last is a state.
    # *state_args, _ = step_function.signature  # Python 3
    state_args = step_function.signature[0:-1]
    num_states = len(state_args)

    if len(step_function.outputs) != num_states:
        raise TypeError('Recurrence: number of state variables inconsistent between create_placeholder() and recurrent block')

    # The initial state can be a single value or one per state variable
    # (if more than one, like for LSTM).
    if isinstance(initial_state, tuple) and len(initial_state) == 1:
        initial_state = initial_state[0]
    if not isinstance(initial_state, tuple):
        # TODO: if initial_state is a CNTK Function rather than an initializer, then require to pass it multiple times; otherwise broadcast to all
        initial_state = (initial_state,) * num_states

    # Express the layer w.r.t. RecurrenceFrom :: (x, state seq) -> (new state seq)
    recurrence_from = RecurrenceFrom(step_function, go_backwards, return_full_state)

    # function that this layer represents
    @C.Function
    def recurrence(x):
        step_input = dropout_input(x) if dropout_input else x
        call_args = initial_state + (step_input,)
        state_seq = recurrence_from(*call_args)
        return dropout_output(state_seq) if dropout_output else state_seq

    return _inject_name(recurrence, name)