def RNNUnit(shape, cell_shape=None, activation=default_override_or(sigmoid),
            init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
            enable_self_stabilization=default_override_or(False),
            name=''):
    '''
    RNNUnit(shape, cell_shape=None, activation=sigmoid, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, name='')

    This is a deprecated name for :func:`~cntk.layers.blocks.RNNStep`.
    Use that name instead.

    Every call emits a ``DeprecationWarning`` and then builds the same block
    that is requested under the type name ``'RNNStep'``.

    Args:
     shape: forwarded unchanged to the recurrent block
     cell_shape (defaults to `None`): forwarded unchanged to the recurrent block
     activation (defaults to `sigmoid`): resolved through the default-override
      mechanism, then forwarded
     init (defaults to `glorot_uniform()`): resolved through the default-override
      mechanism, then forwarded
     init_bias (defaults to 0): resolved through the default-override mechanism,
      then forwarded
     enable_self_stabilization (defaults to `False`): resolved through the
      default-override mechanism, then forwarded
     name (str, defaults to ''): forwarded unchanged to the recurrent block

    Returns:
        whatever ``_RecurrentBlock`` returns for the type ``'RNNStep'``
    '''

    activation = get_default_override(RNNUnit, activation=activation)
    init = get_default_override(RNNUnit, init=init)
    init_bias = get_default_override(RNNUnit, init_bias=init_bias)
    enable_self_stabilization = get_default_override(
        RNNUnit, enable_self_stabilization=enable_self_stabilization)

    # stacklevel=2 attributes the warning to the code that called RNNUnit()
    # rather than to this wrapper, so users can locate the call to replace.
    warnings.warn(
        'This name will be removed in future versions. Please use '
        'RNNStep(...) instead, which is identical except for its name',
        DeprecationWarning, stacklevel=2)

    return _RecurrentBlock('RNNStep', shape, cell_shape, activation=activation, use_peepholes=False,
                           init=init, init_bias=init_bias,
                           enable_self_stabilization=enable_self_stabilization, name=name)
def LSTM(shape, cell_shape=None, activation=default_override_or(tanh), use_peepholes=default_override_or(False),
         init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
         enable_self_stabilization=default_override_or(False),
         name=''):
    '''
    LSTM(shape, cell_shape=None, activation=tanh, use_peepholes=False, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, name='')

    Layer factory function that requests a recurrent block of type ``'LSTM'``.

    Every option declared with ``default_override_or`` is first resolved
    through ``get_default_override`` (keyed on this function); the resolved
    values, together with ``shape``, ``cell_shape`` and ``name``, are handed to
    ``_RecurrentBlock``.

    Returns:
        whatever ``_RecurrentBlock`` returns for the type ``'LSTM'``
    '''
    # Resolve the overridable options in the same order as they are declared.
    resolved_options = dict(
        activation=get_default_override(LSTM, activation=activation),
        use_peepholes=get_default_override(LSTM, use_peepholes=use_peepholes),
        init=get_default_override(LSTM, init=init),
        init_bias=get_default_override(LSTM, init_bias=init_bias),
        enable_self_stabilization=get_default_override(
            LSTM, enable_self_stabilization=enable_self_stabilization),
    )

    return _RecurrentBlock('LSTM', shape, cell_shape, name=name, **resolved_options)
def WeightDroppedLSTM(shape, dropout_rate, cell_shape=None, activation=default_override_or(tanh),
                      use_peepholes=default_override_or(False),
                      init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
                      enable_self_stabilization=default_override_or(False),
                      seed=SentinelValueForAutoSelectRandomSeed, name=''):
    '''
    WeightDroppedLSTM(shape, dropout_rate, cell_shape=None, activation=tanh, use_peepholes=False, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, seed=SentinelValueForAutoSelectRandomSeed, name='')

    Layer factory function to create an LSTM block for use inside a recurrence.
    The LSTM block implements one step of the recurrence and is stateless. It accepts the previous state as its first two arguments,
    and outputs its new state as a two-valued tuple ``(h,c)``.

    Example:
     >>> # a typical recurrent LSTM layer
     >>> from cntkx.layers import *
     >>> lstm_layer = Recurrence(WeightDroppedLSTM(500, 0.1))

    Args:
     shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
     dropout_rate: required; forwarded to the recurrent block as ``dropout_rate``.
      Presumably the drop probability applied to the recurrent weights
      (the "weight drop" of the name) -- confirm against ``_RecurrentBlock``.
     cell_shape (tuple, defaults to `None`): if given, then the output state is first computed at `cell_shape`
      and linearly projected to `shape`
     activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
     use_peepholes (bool, defaults to `False`): forwarded to the recurrent block as ``use_peepholes``
     init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
     init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
     enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
      to all state-related projections (but not the data input)
     seed (defaults to ``SentinelValueForAutoSelectRandomSeed``): forwarded to the recurrent block as ``seed``
     name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, prev_c, input) -> (h, c)`` that implements one step of a recurrent LSTM layer.
    '''

    activation = get_default_override(WeightDroppedLSTM, activation=activation)
    use_peepholes = get_default_override(WeightDroppedLSTM, use_peepholes=use_peepholes)
    init = get_default_override(WeightDroppedLSTM, init=init)
    init_bias = get_default_override(WeightDroppedLSTM, init_bias=init_bias)
    enable_self_stabilization = get_default_override(
        WeightDroppedLSTM, enable_self_stabilization=enable_self_stabilization)

    return _RecurrentBlock('WeightDroppedLSTM', shape, cell_shape, activation=activation,
                           use_peepholes=use_peepholes,
                           init=init, init_bias=init_bias, dropout_rate=dropout_rate, seed=seed,
                           enable_self_stabilization=enable_self_stabilization, name=name)
def IndyLSTM(shape, activation=default_override_or(tanh),
             init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
             enable_self_stabilization=default_override_or(False), name=''):
    """
    Implementation of Independently Recurrent Long Short-term Memory cells: IndyLSTMs by Gonnet and Deselaers.
    Paper can be found at https://arxiv.org/abs/1903.08023

    IndyLSTM differ from regular LSTM cells in that the recurrent weights are not modeled as a full matrix,
    but as a diagonal matrix, i.e. the output and state of each LSTM cell depends on the inputs and its
    own output/state, as opposed to the input and the outputs/states of all the cells in the layer.

    The number of parameters per IndyLSTM layer, and thus the number of FLOPS per evaluation, is linear in the
    number of nodes in the layer, as opposed to quadratic for regular LSTM layers, resulting in potentially both
    smaller and faster model.

    This factory exposes neither ``cell_shape``, peepholes nor dropout: the block is
    always requested with ``cell_shape=None``, ``use_peepholes=False`` and ``dropout_rate=0``.

    Example:
     >>> # an independently recurrent LSTM layer
     >>> from cntkx.layers import *
     >>> indy_lstm_layer = Recurrence(IndyLSTM(500))

    Args:
     shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
     activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
     init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
     init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
     enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
      to all state-related projections (but not the data input)
     name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, prev_c, input) -> (h, c)`` that implements one step of a recurrent IndyLSTM layer.
    """

    activation = get_default_override(IndyLSTM, activation=activation)
    init = get_default_override(IndyLSTM, init=init)
    init_bias = get_default_override(IndyLSTM, init_bias=init_bias)
    enable_self_stabilization = get_default_override(
        IndyLSTM, enable_self_stabilization=enable_self_stabilization)

    # cell_shape is fixed to None and dropout is switched off (rate 0) for this block type.
    return _RecurrentBlock('IndyLSTM', shape, None, activation=activation, use_peepholes=False,
                           init=init, init_bias=init_bias, dropout_rate=0,
                           seed=SentinelValueForAutoSelectRandomSeed,
                           enable_self_stabilization=enable_self_stabilization, name=name)
def IndRNN(shape, activation=default_override_or(relu),
           init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
           enable_self_stabilization=default_override_or(False), name=''):
    """
    IndRNN implementation found in "Independently Recurrent Neural Network (IndRNN): Building A Longer and
    Deeper RNN" by Li, et al (https://arxiv.org/abs/1803.04831).

    IndRNN are RNNs where neurons in each layer are independent from each other, and the cross-channel information is
    obtained through stacking multiple layers.

    It has been shown that an IndRNN can be easily regulated to prevent the gradient exploding and vanishing problems
    while allowing the network to learn long-term dependencies. Moreover, an IndRNN can work with non-saturated
    activation functions such as relu (rectified linear unit) and be still trained robustly.
    Multiple IndRNNs can be stacked to construct a network that is deeper than the existing RNNs.
    Experimental results have shown that the proposed IndRNN is able to process very long
    sequences (over 5000 time steps), can be used to construct very deep networks (21 layers used in the experiment)
    and still be trained robustly. Better performances have been achieved on various tasks by using IndRNNs compared
    with the traditional RNN and LSTM.

    IndRNN also enables the use of the relu activation, which is more efficient to compute than sigmoid and leads
    to faster convergence during training. You may consider initialising the recurrent weights using
    a uniform distribution from 0 to 1.

    The original code is available at: https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne.

    This factory always requests the block with ``cell_shape=None``, ``use_peepholes=False``
    and ``dropout_rate=0``.

    Example:
     >>> # a plain relu RNN layer
     >>> from cntkx.layers import *
     >>> relu_rnn_layer = Recurrence(IndRNN(500))

    Args:
     shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
     activation (:class:`~cntk.ops.functions.Function`, defaults to `relu`): function to apply at the end, e.g. `relu`
     init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
     init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
     enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
      to all state-related projections (but not the data input)
     name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, input) -> h`` where ``h = activation(input @ W + prev_h * R + b)``
    """

    activation = get_default_override(IndRNN, activation=activation)
    init = get_default_override(IndRNN, init=init)
    init_bias = get_default_override(IndRNN, init_bias=init_bias)
    enable_self_stabilization = get_default_override(IndRNN, enable_self_stabilization=enable_self_stabilization)

    return _RecurrentBlock('IndRNN', shape, None, activation=activation, use_peepholes=False,
                           init=init, init_bias=init_bias, dropout_rate=0,
                           seed=SentinelValueForAutoSelectRandomSeed,
                           enable_self_stabilization=enable_self_stabilization, name=name)
def GRU(shape, cell_shape=None, activation=default_override_or(tanh),
        init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
        enable_self_stabilization=default_override_or(False),
        name=''):
    '''
    GRU(shape, cell_shape=None, activation=tanh, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, name='')

    Layer factory function to create a GRU block for use inside a recurrence.
    The GRU block implements one step of the recurrence and is stateless.
    It takes the previous state as its first argument and produces the new state.

    Example:
     >>> # a gated recurrent layer
     >>> from cntk.layers import *
     >>> gru_layer = Recurrence(GRU(500))

    Args:
     shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
     cell_shape (tuple, defaults to `None`): if given, then the output state is first computed at `cell_shape`
      and linearly projected to `shape`
     activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
     init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
     init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
     enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
      to all state-related projections (but not the data input)
     name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, input) -> h`` that implements one step of a recurrent GRU layer.
    '''
    # Resolve the overridable options in declaration order.
    resolved_options = dict(
        activation=get_default_override(GRU, activation=activation),
        init=get_default_override(GRU, init=init),
        init_bias=get_default_override(GRU, init_bias=init_bias),
        enable_self_stabilization=get_default_override(
            GRU, enable_self_stabilization=enable_self_stabilization),
    )

    # This block type is always requested without peepholes.
    return _RecurrentBlock('GRU', shape, cell_shape, use_peepholes=False, name=name,
                           **resolved_options)
def __getitem__(self, arg):
    '''
    Slicing of a Variable. E.g. var[2:3] will translate into slice(var, axis=0, begin_index=2, end_index=3)

    ``arg`` is a single index or a tuple of indices, processed left to right,
    one entry per axis. Supported entries:

     * ``int``: treated as the slice ``i:i+1``
     * ``slice``: mapped to ``ops.slice``; a slice whose start and stop both
       resolve to 0 (e.g. ``:``) leaves the axis untouched
     * ``Ellipsis``: the entries after it are addressed relative to the end
     * ``tuple``/``list`` of ``int``: the listed positions are sliced out one by
       one and spliced together along that axis (no NumPy-style broadcasting)

    When the global option ``align_axis`` is 1 and ``self`` has dynamic axes,
    axis numbers are shifted by the negated ``axis_offset`` default (which
    defaults to the number of dynamic axes).

    Raises:
        ValueError: for a slice with a step other than 1
        IndexError: for an unsupported index type, or a non-int inside a tuple/list index
    '''
    from . import ops

    # int or slice: normalize into a tuple of int or tuple of slice
    if not isinstance(arg, tuple):
        arg = (arg,)
    r = self
    axis0 = 0

    from cntk.default_options import get_global_option, get_default_override, default_override_or
    keras_mode_flag = get_global_option('align_axis', 0)
    if keras_mode_flag == 1:
        if (getattr(self, 'dynamic_axes') is not None and len(self.dynamic_axes) > 0):
            axis0 = -get_default_override(None, axis_offset=default_override_or(len(self.dynamic_axes)))

    for axis, s in enumerate(arg):
        if s is Ellipsis:  # ellipsis means index relative to end after this point
            axis0 = -len(arg)
            continue

        # The same offset must be used by every kind of index; previously the
        # tuple/list branch used the raw position and so ignored both the
        # Ellipsis and the align_axis shift.
        target_axis = axis + axis0

        if isinstance(s, int):  # int: normalize into a slice
            s = slice(s, s + 1)

        if isinstance(s, slice):
            if s.step is not None and s.step != 1:
                # TODO: This is not hard to implement in SliceNode.
                raise ValueError("slicing with a step other than 1 is "
                                 "currently not supported")
            # implement as a CNTK slice() operation
            begin = s.start or 0
            end = s.stop or 0
            if begin != 0 or end != 0:
                r = ops.slice(r, axis=target_axis, begin_index=begin, end_index=end)
        elif isinstance(s, (tuple, list)):
            # Select multiple elements from the same dimension. This is
            # different from NumPy's advanced indexing, since we just go
            # axis by axis from left to right and don't do any
            # broadcasting.
            slice_accum = []
            for idx in s:
                if not isinstance(idx, int):
                    raise IndexError(
                        'indices have to be of type int and not "%s"' %
                        type(idx))
                slice_accum.append(ops.slice(r, axis=target_axis,
                                             begin_index=idx,
                                             end_index=idx + 1))
            if len(slice_accum) > 1:
                r = ops.splice(*slice_accum, axis=target_axis)
            else:
                r = slice_accum[0]
        else:
            raise IndexError(
                'type "%s" is not supported as index' % type(s))

    return r
def sanitize_random_args(shape, dtype):
    '''
    Normalize the ``shape`` and ``dtype`` arguments of a random operation.

    ``shape`` goes through ``sanitize_shape``. ``dtype`` is first resolved
    through the default-override mechanism, falls back to ``np.float32`` when
    that yields ``None``, and is finally passed through ``sanitize_dtype_cntk``.

    Returns:
        tuple: ``(sanitized_shape, sanitized_dtype)``
    '''
    from cntk.default_options import get_default_override

    clean_shape = sanitize_shape(shape)

    resolved_dtype = get_default_override(None, dtype=dtype)
    clean_dtype = sanitize_dtype_cntk(
        np.float32 if resolved_dtype is None else resolved_dtype)

    return clean_shape, clean_dtype
def IndRNNStep(shape, cell_shape=None, activation=default_override_or(relu),
               init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
               enable_self_stabilization=default_override_or(False),
               name=''):
    '''
    IndRNNStep(shape, cell_shape=None, activation=relu, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, name='')

    Layer factory function that builds a recurrent step through ``IndRNNBlock``.

    Every option declared with ``default_override_or`` is resolved through
    ``get_default_override`` keyed on ``IndRNNStep`` itself, so that defaults
    scoped to this factory are honoured (the lookups used to be keyed on
    ``RNNStep``, which made such scoped defaults ineffective).

    Args:
     shape: forwarded unchanged to ``IndRNNBlock``
     cell_shape (defaults to `None`): forwarded unchanged to ``IndRNNBlock``
     activation (defaults to `relu`): resolved, then forwarded
     init (defaults to `glorot_uniform()`): resolved, then forwarded
     init_bias (defaults to 0): resolved, then forwarded
     enable_self_stabilization (defaults to `False`): resolved, then forwarded
     name (str, defaults to ''): forwarded unchanged to ``IndRNNBlock``

    Returns:
        whatever ``IndRNNBlock`` returns for the type ``'RNNStep'``
    '''
    activation = get_default_override(IndRNNStep, activation=activation)
    init = get_default_override(IndRNNStep, init=init)
    init_bias = get_default_override(IndRNNStep, init_bias=init_bias)
    enable_self_stabilization = get_default_override(
        IndRNNStep, enable_self_stabilization=enable_self_stabilization)

    # The block type string is left as 'RNNStep': IndRNNBlock may dispatch on
    # it, and its accepted values are not visible from here.
    return IndRNNBlock('RNNStep', shape, cell_shape, activation=activation, use_peepholes=False,
                       init=init, init_bias=init_bias,
                       enable_self_stabilization=enable_self_stabilization, name=name)
def Stabilizer(steepness=4, enable_self_stabilization=default_override_or(True), name=''):
    '''
    Stabilizer(steepness=4, enable_self_stabilization=True, name='')

    Layer factory function to create a `Droppo self-stabilizer <https://www.microsoft.com/en-us/research/wp-content/uploads/2016/11/SelfLR.pdf>`_.
    It multiplies its input with a scalar that is learned.

    This takes `enable_self_stabilization` as a flag that allows to disable itself. Useful if this is a global default.

    Note:
        Some other layers (specifically, recurrent units like :func:`~cntk.layers.blocks.LSTM`) also have the option to
        use the ``Stabilizer()`` layer internally. That is enabled by passing `enable_self_stabilization=True`
        to those layers. In conjunction with those, the rule is that an explicit ``Stabilizer()`` must be
        inserted by the user for the main data input, whereas the recurrent layer will own the stabilizer(s)
        for the internal recurrent connection(s).

    Note:
        Unlike the original paper, which proposed a linear or exponential scalar,
        CNTK uses a sharpened Softplus: 1/steepness ln(1+e^{steepness*beta}).
        The softplus behaves linear for weights around and above 1 (like the linear scalar)
        while guaranteeing positiveness (like the exponential variant) but is also
        more robust by avoiding exploding gradients.

    Example:
     >>> # recurrent model with self-stabilization
     >>> from cntk.layers import *
     >>> with default_options(enable_self_stabilization=True): # enable stabilizers by default for LSTM()
     ...     model = Sequential([
     ...         Embedding(300),
     ...         Stabilizer(),           # stabilizer for main data input of recurrence
     ...         Recurrence(LSTM(512)),  # LSTM owns its own stabilizers for the recurrent connections
     ...         Stabilizer(),
     ...         Dense(10)
     ...     ])

    Args:
     steepness (`int`, defaults to 4): steepness passed to the softplus; also used to pick the
      initial parameter value so that the learned factor starts at 1
     enable_self_stabilization (bool, defaults to `True`): a flag that allows to disable itself. Useful if this is a global default
     name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        ``identity`` when the stabilizer is disabled; otherwise a block function
        that multiplies its input by the learned scalar
    '''

    enable_self_stabilization = get_default_override(
        Stabilizer, enable_self_stabilization=enable_self_stabilization)

    if not enable_self_stabilization:  # disabled (typically through global option; otherwise one would not call this in the first place)
        return identity

    # parameters bound to this Function
    init_param = np.log(np.exp(steepness) - 1) / steepness  # initialize so that factor is initially 1 (has no effect)
    param = Parameter((), init=init_param, name='alpha')
    beta = softplus(param, steepness=steepness)

    # expression
    @BlockFunction('Stabilizer', name)
    def stabilize(x):
        return beta * x

    return stabilize
def __getitem__(self, arg):
    '''
    Slicing of a Variable. E.g. var[2:3] will translate into slice(var, axis=0, begin_index=2, end_index=3)

    If ``self`` has more than one output, ``arg`` is instead used to index
    ``self.outputs`` and any failure of that lookup is reported as ``KeyError``.

    Otherwise ``arg`` is a single index or a tuple of indices, processed left
    to right, one entry per axis. Supported entries:

     * ``int``: treated as the slice ``i:i+1``
     * ``slice``: mapped to ``ops.slice``, with the step passed as ``strides``;
       a slice whose start and stop both resolve to 0 and that has no step
       other than 1 (e.g. ``:``) leaves the axis untouched
     * ``Ellipsis``: the entries after it are addressed relative to the end
     * ``tuple``/``list`` of ``int``: the listed positions are sliced out one by
       one and spliced together along that axis (no NumPy-style broadcasting)

    When the global option ``align_axis`` is 1 and ``self`` has dynamic axes,
    axis numbers are shifted by the negated ``axis_offset`` default (which
    defaults to the number of dynamic axes).

    Raises:
        KeyError: if ``self`` has multiple outputs and ``arg`` cannot select one
        IndexError: for an unsupported index type, or a non-int inside a tuple/list index
    '''
    from . import ops

    if hasattr(self, 'outputs') and len(self.outputs) > 1:
        try:
            return self.outputs[arg]
        except Exception:
            msg = 'Slice for multioutput functions is not supported, ' \
                  'the fallback to select to output requires ' \
                  'that only one index is provided. arg: {}, self: {}'.format(
                      arg, self)
            raise KeyError(msg)

    # int or slice: normalize into a tuple of int or tuple of slice
    if not isinstance(arg, tuple):
        arg = (arg,)
    r = self
    axis0 = 0

    from cntk.default_options import get_global_option, get_default_override, default_override_or
    keras_mode_flag = get_global_option('align_axis', 0)
    if keras_mode_flag == 1:
        if (getattr(self, 'dynamic_axes') is not None and len(self.dynamic_axes) > 0):
            axis0 = -get_default_override(None, axis_offset=default_override_or(len(self.dynamic_axes)))

    for axis, s in enumerate(arg):
        if s is Ellipsis:  # ellipsis means index relative to end after this point
            axis0 = -len(arg)
            continue

        # The same offset must be used by every kind of index; previously the
        # tuple/list branch used the raw position and so ignored both the
        # Ellipsis and the align_axis shift.
        target_axis = axis + axis0

        if isinstance(s, int):  # int: normalize into a slice
            s = slice(s, s + 1)

        if isinstance(s, slice):
            begin = s.start or 0
            end = s.stop or 0
            # A non-trivial step must reach ops.slice even when the range is
            # the whole axis; otherwise e.g. x[::2] would silently return x.
            has_stride = s.step is not None and s.step != 1
            if begin != 0 or end != 0 or has_stride:
                r = ops.slice(r, axis=target_axis, begin_index=begin, end_index=end, strides=s.step)
        elif isinstance(s, (tuple, list)):
            # Select multiple elements from the same dimension. This is
            # different from NumPy's advanced indexing, since we just go
            # axis by axis from left to right and don't do any
            # broadcasting.
            slice_accum = []
            for idx in s:
                if not isinstance(idx, int):
                    raise IndexError(
                        'indices have to be of type int and not "%s"' %
                        type(idx))
                slice_accum.append(ops.slice(r, axis=target_axis,
                                             begin_index=idx,
                                             end_index=idx + 1))
            if len(slice_accum) > 1:
                r = ops.splice(*slice_accum, axis=target_axis)
            else:
                r = slice_accum[0]
        else:
            raise IndexError(
                'type "%s" is not supported as index' % type(s))

    return r
def dense_factored(
        shapes,  # (shape1, shape2)
        activation=default_override_or(identity),
        init={'W1': None, 'W2': None},
        input_rank=None,
        map_rank=None,
        bias=default_override_or(True),
        init_bias=default_override_or(0),
        name=''):
    '''
    Perform the new model creation using the factored inputs W1 and W2.
    The returned function represents the new model: it applies ``times`` with
    ``W1``, then ``times`` with ``W2``, then optionally adds the bias and
    applies the activation.

    Args:
        shapes         : pair ``(shape1, shape2)`` of ints; ``shape1`` is the output
                         dimension of ``W1`` (and input dimension of ``W2``),
                         ``shape2`` the output dimension of ``W2`` and of the bias.
        activation     : activation function used for the model; resolved through
                         the default overrides of ``cntk.layers.Dense``.
        init           : mapping with keys ``'W1'`` and ``'W2'`` holding the two
                         matrices corresponding to the factorization. The default
                         dict is shared between calls and is only read here.
        input_rank     : must be ``None`` (not supported).
        map_rank       : must be ``None`` (not supported).
        bias           : whether to add a bias; resolved through the default
                         overrides of ``cntk.layers.Dense``.
        init_bias      : initial bias value; resolved through the default
                         overrides of ``cntk.layers.Dense``.
        name           : name of the block function that creates the new model.

    Returns:
        a model that is factored and projected (reduced).

    Raises:
        AssertionError: if ``input_rank`` or ``map_rank`` is given, or if an
            entry of ``shapes`` is not an int (only while assertions are enabled).
        ValueError: if both ``input_rank`` and ``map_rank`` are given and
            assertions are disabled.
    '''

    # matthaip: Not sure how to handle input tensor of rank > 1
    # or selective flattening of ranks
    assert (input_rank is None and
            map_rank is None and
            all(isinstance(s, int) for s in list(shapes)))

    activation = get_default_override(cntk.layers.Dense, activation=activation)
    bias = get_default_override(cntk.layers.Dense, bias=bias)
    init_bias = get_default_override(cntk.layers.Dense, init_bias=init_bias)
    # how to use get_default_override for init parameter?

    output_shape1 = _as_tuple(shapes[0])
    output_shape2 = _as_tuple(shapes[1])

    # Only reachable when assertions are disabled (python -O); with assertions
    # on, the assert above already rejects any non-None rank argument.
    if input_rank is not None and map_rank is not None:
        raise ValueError(
            "Dense: input_rank and map_rank cannot be specified at the same time."
        )

    # If input_rank not given then pass a single _INFERRED;
    # map_rank if given will determine the input_rank.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED

    # parameters bound to this Function
    # init_weights = _initializer_for(init, Record(output_rank=output_rank))
    init_weights = init
    W1 = Parameter(input_shape + output_shape1, init=init_weights['W1'], name='W1')
    W2 = Parameter(output_shape1 + output_shape2, init=init_weights['W2'], name='W2')
    b = Parameter(output_shape2, init=init_bias, name='b') if bias else None

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        r = times(x, W1)
        r = times(r, W2)
        # b is either a Parameter or None; test for None explicitly instead of
        # relying on the truth value of a Parameter object.
        if b is not None:
            r = r + b
        if activation is not None:
            r = activation(r)
        return r

    return dense
def Recurrence(step_function, go_backwards=default_override_or(False), initial_state=default_override_or(0),
               return_full_state=False, dropout_rate_input=None,
               dropout_rate_output=None, seed=SentinelValueForAutoSelectRandomSeed, name=''):
    '''
    Recurrence(step_function, go_backwards=False, initial_state=0, return_full_state=False, dropout_rate_input=None, dropout_rate_output=None, seed=SentinelValueForAutoSelectRandomSeed, name='')

    Recurrence has option to variationally dropout input and output.

    Layer factory function that implements a recurrent model, including the common RNN, LSTM, and GRU recurrences.
    This factory function creates a function that runs a step function recurrently over an input sequence,
    where in each step, Recurrence() will pass to the step function a data input as well as the output of the
    previous step.
    The following pseudo-code represents what happens when you call a `Recurrence()` layer::

      # pseudo-code for y = Recurrence(step_function)(x)
      #  x: input sequence of tensors along the dynamic axis
      #  y: resulting sequence of outputs along the same dynamic axis
      y = []              # result sequence goes here
      s = initial_state   # s = output of previous step ("state")
      for x_n in x:       # pseudo-code for looping over all steps of input sequence along its dynamic axis
          s = step_function(s, x_n)  # pass previous state and new data to step_function -> new state
          y.append(s)

    The common step functions are :func:`~cntk.layers.blocks.LSTM`, :func:`~cntk.layers.blocks.GRU`, and :func:`~cntk.layers.blocks.RNNStep`,
    but the step function can be any :class:`~cntk.ops.functions.Function` or Python function.
    The signature of a step function with a single state variable must be
    ``(h_prev, x) -> h``, where ``h_prev`` is the previous state, ``x`` is the new
    data input, and the output is the new state.
    The step function will be called item by item, resulting in a sequence of the same length as the input.

    Step functions can have more than one state output, e.g. :func:`~cntk.layers.blocks.LSTM`.
    In this case, the first N arguments are the previous state, followed by one more argument that
    is the data input; and its output must be a tuple of N values.
    In this case, the recurrence operation will, by default, return the first of the state variables
    (in the LSTM case, the ``h``), while additional state variables are internal (like the LSTM's ``c``).
    If all state variables should be returned, pass ``return_full_state=True``.

    To provide your own step function, just use any :class:`~cntk.ops.functions.Function` (or equivalent Python function) that
    has a signature as described above.
    For example, a cumulative sum over a sequence can be computed as ``Recurrence(plus)``,
    where each step consists of `plus(s,x_n)`, where `s` is the output of the previous call
    and hence the cumulative sum of all elements up to `x_n`.
    Another example is a GRU layer with projection, which could be realized as ``Recurrence(GRU(500) >> Dense(200))``,
    where the projection is applied to the hidden state as fed back to the next step.
    ``F>>G`` is a short-hand for ``Sequential([F, G])``.

    Optionally, the recurrence can run backwards. This is useful for constructing bidirectional models.

    ``initial_state`` must be a constant. To pass initial_state as a data input, e.g. for a sequence-to-sequence
    model, use :func:`~cntk.layers.sequence.RecurrenceFrom()` instead.

    Note: ``Recurrence()`` is the equivalent to what in functional programming is often called ``scanl()``.

    Example:
     >>> from cntk.layers import Sequential
     >>> from cntk.layers.typing import Tensor, Sequence

     >>> # a recurrent LSTM layer
     >>> lstm_layer = Recurrence(LSTM(500))

     >>> # a bidirectional LSTM layer
     >>> # using function tuples to implement a bidirectional LSTM
     >>> bi_lstm_layer = Sequential([(Recurrence(LSTM(250)),                      # first tuple entry: forward pass
     ...                              Recurrence(LSTM(250), go_backwards=True)),  # second: backward pass
     ...                             splice])                                     # splice both on top of each other
     >>> bi_lstm_layer.update_signature(Sequence[Tensor[13]])
     >>> bi_lstm_layer.shape   # shape reflects concatenation of both output states
     (500,)
     >>> tuple(str(axis.name) for axis in bi_lstm_layer.dynamic_axes)  # (note: str() needed only for Python 2.7)
     ('defaultBatchAxis', 'defaultDynamicAxis')

     >>> # custom step function example: using Recurrence() to
     >>> # compute the cumulative sum over an input sequence
     >>> x = C.input_variable(**Sequence[Tensor[2]])
     >>> x0 = np.array([[   3,    2],
     ...                [  13,   42],
     ...                [-100, +100]])
     >>> cum_sum = Recurrence(C.plus, initial_state=Constant([0, 0.5]))
     >>> y = cum_sum(x)
     >>> y(x0)
     [array([[   3. ,    2.5],
             [  16. ,   44.5],
             [ -84. ,  144.5]], dtype=float32)]

    Args:
     step_function (:class:`~cntk.ops.functions.Function` or equivalent Python function):
      This function must have N+1 inputs and N outputs, where N is the number of state variables
      (typically 1 for GRU and plain RNNs, and 2 for LSTMs).
     go_backwards (bool, defaults to ``False``): if ``True`` then run the recurrence from the end of the sequence to the start.
     initial_state (scalar or tensor without batch dimension; or a tuple thereof):
      the initial value for the state. This can be a constant or a learnable parameter.
      In the latter case, if the step function has more than 1 state variable,
      this parameter must be a tuple providing one initial state for every state variable.
     return_full_state (bool, defaults to ``False``): if ``True`` and the step function has more than one
      state variable, then the layer returns all state variables (a tuple of sequences);
      whereas if not given or ``False``, only the first state variable is returned to the caller.
     dropout_rate_input (float): dropout for input
     dropout_rate_output (float): dropout for output
     seed (int): seed for randomisation
     name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function that accepts one argument (which must be a sequence) and performs the recurrent operation on it
    '''

    # BUGBUG: the cum_sum expression in the docstring should be this:
    # cum_sum = Recurrence(C.plus, initial_state=np.array([0, 0.5]))
    # BUGBUG: whereas passing a NumPy array fails with "TypeError: cannot convert value of dictionary"
    # cum_sum = Recurrence(C.plus, initial_state=Constant([0, 0.5]))

    go_backwards = get_default_override(Recurrence, go_backwards=go_backwards)
    initial_state = _get_initial_state_or_default(
        get_default_override(Recurrence, initial_state=initial_state))

    step_function = _santize_step_function(step_function)

    # Each dropout layer exists only when a truthy rate was supplied for it.
    dropout_input = (
        VariationalDropout(dropout_rate=dropout_rate_input, seed=seed, name='variational_dropout_input')
        if dropout_rate_input else None)
    dropout_output = (
        VariationalDropout(dropout_rate=dropout_rate_output, seed=seed, name='variational_dropout_output')
        if dropout_rate_output else None)

    # All arguments of the step function except the last one are previous-state arguments.
    prev_state_args = step_function.signature[:-1]
    num_states = len(prev_state_args)

    if len(step_function.outputs) != num_states:
        raise TypeError('Recurrence: number of state variables inconsistent between create_placeholder() and recurrent block')

    # initial state can be a single value or one per state variable (if more than one, like for LSTM)
    if isinstance(initial_state, tuple) and len(initial_state) == 1:
        initial_state = initial_state[0]
    if not isinstance(initial_state, tuple):
        # TODO: if initial_state is a CNTK Function rather than an initializer, then require to pass it multiple times; otherwise broadcast to all
        initial_state = (initial_state,) * num_states

    # express it w.r.t. RecurrenceFrom
    recurrence_from = RecurrenceFrom(step_function, go_backwards, return_full_state)  # :: (x, state seq) -> (new state seq)

    # function that this layer represents
    @C.Function
    def recurrence(x):
        if dropout_input:
            step_input = dropout_input(x)
        else:
            step_input = x

        # the initial states come first, the data input last
        call_args = initial_state + (step_input,)
        result = recurrence_from(*call_args)

        if dropout_output:
            return dropout_output(result)
        return result

    return _inject_name(recurrence, name)
def AttentionModel(attention_dim, attention_span=None, attention_axis=None,
                   init=default_override_or(glorot_uniform()),
                   go_backwards=default_override_or(False),
                   enable_self_stabilization=default_override_or(True), name=''):
    '''
    AttentionModel(attention_dim, attention_span=None, attention_axis=None, init=glorot_uniform(), go_backwards=False, enable_self_stabilization=True, name='')

    Layer factory function to create a function object that implements an attention model
    as described in Bahdanau, et al., "Neural machine translation by jointly learning to align and translate."

    Args:
     attention_dim: output dimension of the encoder-state and decoder-state projections
     attention_span: size of the fixed window taken over the encoder states; required, must be positive
     attention_axis: static axis that holds the window; required
     init (defaults to `glorot_uniform()`): initializer of all projections
     go_backwards (defaults to `False`): passed on to the window over the encoder states
     enable_self_stabilization (defaults to `True`): passed on to every ``Stabilizer`` created here
     name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        A function ``(h_enc, h_dec) -> h_att``

    Raises:
        NotImplementedError: if ``attention_span`` or ``attention_axis`` is ``None``
        ValueError: if ``attention_span`` is not positive
    '''

    init = get_default_override(AttentionModel, init=init)
    go_backwards = get_default_override(AttentionModel, go_backwards=go_backwards)
    enable_self_stabilization = get_default_override(
        AttentionModel, enable_self_stabilization=enable_self_stabilization)

    # until CNTK can handle multiple nested dynamic loops, we require fixed windows and fake it
    span_missing = attention_span is None
    axis_missing = attention_axis is None
    if span_missing or axis_missing:
        raise NotImplementedError(
            'AttentionModel currently requires a fixed attention_span and a static attention_axis to be specified'
        )
    if attention_span <= 0:
        raise ValueError('attention_span must be a positive value')

    def _new_stabilizer():
        # every stabilizer in this model shares the same enable flag
        return Stabilizer(enable_self_stabilization=enable_self_stabilization)

    # model parameters
    with default_options(bias=False):  # all the projections have no bias
        # projects input hidden state, keeping span axes intact
        attn_proj_enc = _new_stabilizer() >> Dense(attention_dim, init=init, input_rank=1)
        # projects decoder hidden state, but keeping span and beam-search axes intact
        attn_proj_dec = _new_stabilizer() >> Dense(attention_dim, init=init, input_rank=1)
        # projects tanh output, keeping span and beam-search axes intact
        attn_proj_tanh = _new_stabilizer() >> Dense(1, init=init, input_rank=1)
    attn_final_stab = _new_stabilizer()

    # attention function
    @Function
    def attention(h_enc, h_dec):
        # the decoder state is passed wherever only its axis is needed
        history_axis = h_dec
        # TODO: pull this apart so that we can compute the encoder window only once and apply it to multiple decoders

        # --- encoder state window
        past_window = PastValueWindow(attention_span, axis=attention_axis, go_backwards=go_backwards)
        enc_window, enc_valid = past_window(h_enc).outputs
        enc_proj = attn_proj_enc(enc_window)
        # window must be broadcast to every decoder time step
        enc_proj = C.sequence.broadcast_as(enc_proj, history_axis)
        enc_valid = C.sequence.broadcast_as(enc_valid, history_axis)

        # --- decoder state
        # project decoder hidden state
        dec_proj = attn_proj_dec(h_dec)

        tanh_out = C.tanh(dec_proj + enc_proj)  # (attention_span, attention_dim)
        scores = attn_proj_tanh(tanh_out)       # (attention_span, 1)
        # logzero-out the unused elements for the softmax denominator
        # TODO: use a less arbitrary number than 50
        masked_scores = scores + (enc_valid - 1) * 50
        weights = C.softmax(masked_scores, axis=attention_axis)  # , name='attention_weights')
        weights = Label('attention_weights')(weights)

        # now take weighted sum over the encoder state vectors
        weighted = C.element_times(enc_proj, weights)
        context = C.reduce_sum(weighted, axis=attention_axis)
        return attn_final_stab(context)

    return _inject_name(attention, name)
def AttentionModel(attention_dim, attention_span=None, attention_axis=None,
                   init=default_override_or(glorot_uniform()),
                   go_backwards=default_override_or(False),
                   enable_self_stabilization=default_override_or(True),
                   name=''):
    '''
    AttentionModel(attention_dim, attention_span=None, attention_axis=None, init=glorot_uniform(), go_backwards=False, enable_self_stabilization=True, name='')

    Layer factory function to create a function object that implements an attention model
    as described in Bahdanau, et al., "Neural machine translation by jointly learning to align and translate."

    Two implementations are selected by the arguments:

    * ``attention_span`` and ``attention_axis`` both ``None`` (the default): attention
      over the whole unpacked encoder sequence.
    * both given: the deprecated fixed-window implementation; a
      ``DeprecationWarning`` is issued.

    Args:
        attention_dim: output dimension of the encoder and decoder projections whose
         sum is passed through ``tanh``
        attention_span: window size for the deprecated mode; must be > 0 when given
        attention_axis: window/softmax axis for the deprecated mode; must be given
         exactly when ``attention_span`` is given
        init: initializer for the three ``Dense`` projections
        go_backwards: forwarded to ``PastValueWindow`` (deprecated mode only)
        enable_self_stabilization: forwarded to every ``Stabilizer`` created here
        name: forwarded to ``_inject_name`` together with the attention function

    Returns:
        The result of ``_inject_name(f, name)``, where ``f`` is a Function taking the
        encoder and decoder hidden states and producing the attended encoder state.

    Raises:
        ValueError: if only one of ``attention_span`` / ``attention_axis`` is given,
         or if ``attention_span`` is not positive
    '''
    init = get_default_override(AttentionModel, init=init)
    go_backwards = get_default_override(AttentionModel, go_backwards=go_backwards)
    enable_self_stabilization = get_default_override(
        AttentionModel, enable_self_stabilization=enable_self_stabilization)

    # Validate the span/axis pair; the checks run in this order on purpose so the
    # same error is reported for the same inputs.
    if attention_span is None:
        if attention_axis is not None:
            raise ValueError('attention_span cannot be None when attention_axis is not None')
    else:
        if attention_span <= 0:
            raise ValueError('attention_span must be a positive value')
        if attention_axis is None:
            raise ValueError('attention_axis cannot be None when attention_span is not None')
    # A span (with its axis) selects the deprecated fixed-window implementation.
    compatible_attention_mode = attention_span is not None

    def stabilized_projection(output_dim):
        # Stabilizer followed by a Dense projection to `output_dim`;
        # the Stabilizer is constructed first, then the Dense layer.
        stabilizer = Stabilizer(enable_self_stabilization=enable_self_stabilization)
        return stabilizer >> Dense(output_dim, init=init, input_rank=1)

    # model parameters
    with default_options(bias=False):  # all the projections have no bias
        # projects input hidden state, keeping span axes intact
        project_encoder = stabilized_projection(attention_dim)
        # projects decoder hidden state, but keeping span and beam-search axes intact
        project_decoder = stabilized_projection(attention_dim)
        # projects tanh output, keeping span and beam-search axes intact
        project_score = stabilized_projection(1)
    final_stabilizer = Stabilizer(enable_self_stabilization=enable_self_stabilization)

    if not compatible_attention_mode:
        # new attention function
        @Function
        def new_attention(encoder_hidden_state, decoder_hidden_state):
            # encode_hidden_state: [#, e] [h]
            # decoder_hidden_state: [#, d] [H]
            enc_unpacked, enc_mask = C.sequence.unpack(encoder_hidden_state, padding_value=0).outputs
            # enc_unpacked: [#] [*=e, h]
            # enc_mask: [#] [*=e]
            enc_projected = C.sequence.broadcast_as(project_encoder(enc_unpacked), decoder_hidden_state)
            # enc_projected: [#, d] [*=e, attention_dim]
            mask_per_step = C.sequence.broadcast_as(C.reshape(enc_mask, (1,), 1), decoder_hidden_state)
            # mask_per_step: [#, d] [*=e]
            dec_projected = project_decoder(decoder_hidden_state)
            # dec_projected: [#, d] [attention_dim]
            tanh_output = C.tanh(dec_projected + enc_projected)
            # tanh_output: [#, d] [*=e, attention_dim]
            logits = project_score(tanh_output)
            # logits = [#, d] [*=e, 1]
            # padded positions get a very negative logit so the softmax gives them ~0 weight
            minus_inf = C.constant(-1e+30)
            masked_logits = C.element_select(mask_per_step, logits, minus_inf)
            # masked_logits = [#, d] [*=e]
            attention_weights = Label('attention_weights')(C.softmax(masked_logits, axis=0))
            # attention_weights = [#, d] [*=e]
            attended = C.reduce_sum(
                attention_weights * C.sequence.broadcast_as(enc_unpacked, attention_weights),
                axis=0)
            # attended = [#, d] [1, h]
            # output = [#, d], [h]
            return final_stabilizer(C.reshape(attended, (), 0, 1))

        return _inject_name(new_attention, name)

    # Issued from this frame so that stacklevel=2 points at the caller of AttentionModel.
    warn('Specifying non-default values for attention_span and attention_axis has been deprecated since version 2.2. '
         'These arguments will be removed in the future.', DeprecationWarning, stacklevel=2)

    # old attention function
    @Function
    def old_attention(h_enc, h_dec):
        # h_dec is passed to broadcast_as purely to supply the decoder's dynamic axis.
        # TODO: pull this apart so that we can compute the encoder window only once and apply it to multiple decoders

        # --- encoder state window
        encoder_window = PastValueWindow(attention_span, axis=attention_axis, go_backwards=go_backwards)
        h_enc, h_enc_valid = encoder_window(h_enc).outputs
        h_enc_proj = project_encoder(h_enc)
        # window must be broadcast to every decoder time step
        h_enc_proj = C.sequence.broadcast_as(h_enc_proj, h_dec)
        h_enc_valid = C.sequence.broadcast_as(h_enc_valid, h_dec)

        # --- decoder state
        # project decoder hidden state
        h_dec_proj = project_decoder(h_dec)
        tanh_out = C.tanh(h_dec_proj + h_enc_proj)  # (attention_span, attention_dim)
        scores = project_score(tanh_out)            # (attention_span, 1)
        # logzero-out the unused elements for the softmax denominator
        # TODO: use a less arbitrary number than 50
        masked_scores = scores + (h_enc_valid - 1) * 50
        attention_weights = Label('attention_weights')(
            C.softmax(masked_scores, axis=attention_axis))

        # now take weighted sum over the encoder state vectors
        weighted = C.element_times(C.sequence.broadcast_as(h_enc, h_dec), attention_weights)
        h_att = C.reduce_sum(weighted, axis=attention_axis)
        return final_stabilizer(h_att)

    return _inject_name(old_attention, name)
def dense_factored(shapes, #(shape1, shape2)
                   activation=default_override_or(identity),
                   init={'W1':None, 'W2':None},
                   input_rank=None,
                   map_rank=None,
                   bias=default_override_or(True),
                   init_bias=default_override_or(0),
                   name=''):
    '''
    Create a factored dense layer computing ``activation(times(times(x, W1), W2) + b)``.

    Args:
        shapes : pair of ints ``(shape1, shape2)``; ``shape1`` is the output
            dimension of ``W1`` (the inner dimension), ``shape2`` the output
            dimension of ``W2`` and of the bias.
        activation : function applied to the result; skipped when ``None``.
        init : mapping with keys ``'W1'`` and ``'W2'`` holding the initial values
            of the two factor matrices.
        input_rank : must be ``None`` (other values are not supported).
        map_rank : must be ``None`` (other values are not supported).
        bias : when falsy, no bias parameter is created.
        init_bias : initial value of the bias parameter.
        name : name given to the resulting block function.

    Returns:
        The ``DenseFactored`` block function.
    '''
    # matthaip: Not sure how to handle input tensor of rank > 1
    # or selective flattening of ranks
    ranks_unset = input_rank is None and map_rank is None
    assert ranks_unset and all(isinstance(dim, int) for dim in shapes)

    # Defaults are resolved against the overrides registered for Dense.
    activation = get_default_override(cntk.layers.Dense, activation=activation)
    bias = get_default_override(cntk.layers.Dense, bias=bias)
    init_bias = get_default_override(cntk.layers.Dense, init_bias=init_bias)
    # how to use get_default_override for init parameter?

    inner_shape = _as_tuple(shapes[0])
    outer_shape = _as_tuple(shapes[1])

    both_ranks_given = input_rank is not None and map_rank is not None
    if both_ranks_given:
        raise ValueError("Dense: input_rank and map_rank cannot be specified at the same time.")

    # If input_rank not given then pass a single _INFERRED;
    # map_rank if given will determine the input_rank.
    # The dimension inference may still create multiple axes.

    # parameters bound to this Function
    W1 = Parameter(_INFERRED + inner_shape, init=init['W1'], name='W1')
    W2 = Parameter(inner_shape + outer_shape, init=init['W2'], name='W2')
    if bias:
        b = Parameter(outer_shape, init=init_bias, name='b')
    else:
        b = None

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        r = times(times(x, W1), W2)
        if b:
            r = r + b
        if activation is None:
            return r
        return activation(r)

    return dense
def __getitem__(self, arg):
    '''
    Slicing of a Variable. E.g. var[2:3] will translate into slice(var, axis=0, begin_index=2, end_index=3)

    For an object with more than one output, ``arg`` is instead used to index
    ``self.outputs``.

    Otherwise every entry of ``arg`` (a single index or a tuple of indices) is
    applied to one axis, from left to right:

    * ``int`` -- treated as the one-element slice ``i:i+1``
    * ``slice`` -- forwarded to ``ops.slice``, including its step; a missing
      start or stop is passed as 0, and a slice with no start, stop or
      non-unit step is a no-op
    * ``tuple`` / ``list`` of ints -- selects those elements of the axis and
      splices them together (no NumPy-style broadcasting)
    * ``Ellipsis`` -- the entries after it are counted from the end

    Args:
        arg: an index, or a tuple of indices, as described above

    Returns:
        the sliced expression, or the selected output for multi-output objects

    Raises:
        KeyError: if indexing the outputs of a multi-output object fails
        IndexError: if an index has an unsupported type, or an index list is
         empty or contains a non-int
    '''
    from . import ops

    if hasattr(self, 'outputs') and len(self.outputs) > 1:
        try:
            return self.outputs[arg]
        except Exception:
            msg = 'Slice for multioutput functions is not supported, ' \
                  'the fallback to select to output requires ' \
                  'that only one index is provided. arg: {}, self: {}'.format(
                      arg, self)
            raise KeyError(msg)

    # int or slice: normalize into a tuple of int or tuple of slice
    if not isinstance(arg, tuple):
        arg = (arg, )

    r = self
    axis0 = 0  # offset added to the position of each index to get the axis it addresses

    from cntk.default_options import get_global_option, get_default_override, default_override_or
    keras_mode_flag = get_global_option('align_axis', 0)
    if keras_mode_flag == 1:
        # A default of None makes the `is not None` test meaningful for objects
        # that do not expose dynamic_axes at all.
        dynamic_axes = getattr(self, 'dynamic_axes', None)
        if dynamic_axes is not None and len(dynamic_axes) > 0:
            axis0 = -get_default_override(
                None, axis_offset=default_override_or(len(dynamic_axes)))

    for axis, s in enumerate(arg):
        if s is Ellipsis:
            # ellipsis means index relative to end after this point
            axis0 = -len(arg)
            continue

        if isinstance(s, int):
            # int: normalize into a slice
            s = slice(s, s + 1)

        # Every branch below must address the same (offset) axis.
        target_axis = axis + axis0

        if isinstance(s, slice):
            begin = s.start or 0
            end = s.stop or 0
            # A step-only slice such as var[::2] has begin == end == 0 but is
            # not a no-op, so the step has to be part of this test.
            has_stride = s.step is not None and s.step != 1
            if begin != 0 or end != 0 or has_stride:
                r = ops.slice(r, axis=target_axis, begin_index=begin,
                              end_index=end, strides=s.step)
        elif isinstance(s, (tuple, list)):
            # Select multiple elements from the same dimension. This is
            # different from NumPy's advanced indexing, since we just go
            # axis by axis from left to right and don't do any
            # broadcasting.
            pieces = []
            for idx in s:
                if not isinstance(idx, int):
                    raise IndexError(
                        'indices have to be of type int and not "%s"' %
                        type(idx))
                pieces.append(
                    ops.slice(r, axis=target_axis, begin_index=idx,
                              end_index=idx + 1))
            if not pieces:
                raise IndexError(
                    'index list for axis %d must not be empty' % axis)
            if len(pieces) > 1:
                r = ops.splice(*pieces, axis=target_axis)
            else:
                r = pieces[0]
        else:
            raise IndexError('type "%s" is not supported as index' % type(s))

    return r
def LSTM(shape, activation=default_override_or(tanh), weight_drop_rate=None,
         ih_init=default_override_or(glorot_uniform()), ih_bias=default_override_or(0),
         hh_init=default_override_or(glorot_uniform()), hh_bias=default_override_or(0),
         name=''):
    """
    PyTorch style implementation of LSTM. Used for loading pytorch pretrained models.

    The difference from cntk's own LSTM is the layout of the stacked gate
    weights: pytorch orders them i, f, g, o whereas cntk uses i, g, f, o.
    PyTorch also keeps two biases; here they are summed into a single bias
    parameter to save a little computation.

    Args:
        shape: dimension of the hidden state and of the cell state
        activation: function applied to the candidate and to the new cell state
        weight_drop_rate: when not ``None``, dropout with this rate is applied
            to the hidden-to-hidden weights ``H`` on every step
        ih_init: initializer of the input-to-hidden weights ``W``
        ih_bias: input-to-hidden bias; added to ``hh_bias`` to initialise ``b``
        hh_init: initializer of the hidden-to-hidden weights ``H``
        hh_bias: hidden-to-hidden bias; added to ``ih_bias`` to initialise ``b``
        name: name of the resulting block function

    Returns:
        A block function ``(dh, dc, x) -> (ht, ct)``.
    """
    activation = get_default_override(LSTM, activation=activation)
    ih_init = get_default_override(LSTM, ih_init=ih_init)
    ih_bias = get_default_override(LSTM, ih_bias=ih_bias)
    hh_init = get_default_override(LSTM, hh_init=hh_init)
    hh_bias = get_default_override(LSTM, hh_bias=hh_bias)

    stack_axis = -1
    shape = _as_tuple(shape)
    stacked_dim = shape[stack_axis]
    # dims with the last axis enlarged four-fold: one chunk per gate
    gates_shape = shape[:stack_axis] + (stacked_dim * 4,)

    # combine both biases in pytorch into one
    b = Parameter(gates_shape, init=ih_bias + hh_bias, name='b')      # bias
    W = Parameter(_INFERRED + gates_shape, init=ih_init, name='W')    # input
    H = Parameter(shape + gates_shape, init=hh_init, name='H')        # hidden-to-hidden

    dropout = None
    if weight_drop_rate is not None:
        dropout = C.layers.Dropout(dropout_rate=weight_drop_rate, name='h_dropout')

    @C.BlockFunction('PT::LSTM', name)
    def lstm(dh, dc, x):
        recurrent_weights = H if weight_drop_rate is None else dropout(H)

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + times(dh, recurrent_weights)

        def gate_chunk(index):
            # index-th chunk of proj4 along stack_axis
            return slice(proj4, stack_axis, index * stacked_dim, (index + 1) * stacked_dim)

        # slicing layout different from cntk's implementation: i, f, g, o
        it_proj, ft_proj, gt_proj, ot_proj = (gate_chunk(k) for k in range(4))

        input_gate = sigmoid(it_proj)                     # input gate(t)
        gated_candidate = input_gate * activation(gt_proj)  # applied to tanh of input network

        forget_gate = sigmoid(ft_proj)                    # forget-me-not gate(t)
        retained_cell = forget_gate * dc                  # applied to cell(t-1)

        ct = retained_cell + gated_candidate              # c(t) is sum of both

        output_gate = sigmoid(ot_proj)                    # output gate(t)
        ht = output_gate * activation(ct)                 # applied to tanh(cell(t))
        return ht, ct

    return lstm