Example No. 1
def LSTM(shape,
         cell_shape=None,
         activation=default_override_or(tanh),
         use_peepholes=default_override_or(False),
         init=default_override_or(glorot_uniform()),
         init_bias=default_override_or(0),
         enable_self_stabilization=default_override_or(False),
         name=''):

    activation = get_default_override(LSTM, activation=activation)
    use_peepholes = get_default_override(LSTM, use_peepholes=use_peepholes)
    init = get_default_override(LSTM, init=init)
    init_bias = get_default_override(LSTM, init_bias=init_bias)
    enable_self_stabilization = get_default_override(
        LSTM, enable_self_stabilization=enable_self_stabilization)

    return _RecurrentBlock('LSTM',
                           shape,
                           cell_shape,
                           activation=activation,
                           use_peepholes=use_peepholes,
                           init=init,
                           init_bias=init_bias,
                           enable_self_stabilization=enable_self_stabilization,
                           name=name)
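A minimal usage sketch for the block above (assuming a working `cntk` installation): the LSTM block is wrapped in `Recurrence`, and any argument declared with `default_override_or` can be overridden either per call or for a whole scope via `default_options`.

import cntk as C
from cntk.layers import Recurrence, LSTM
from cntk.default_options import default_options

x = C.sequence.input_variable(10)

# per-call override of a default_override_or(...) argument
lstm_layer = Recurrence(LSTM(128, activation=C.relu))

# scoped override: every LSTM built inside this block picks up the new defaults
with default_options(enable_self_stabilization=True):
    stabilized_lstm_layer = Recurrence(LSTM(128))

h = lstm_layer(x)  # per-step output shape: (128,)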
Example No. 2
def RNNUnit(shape,
            cell_shape=None,
            activation=default_override_or(sigmoid),
            init=default_override_or(glorot_uniform()),
            init_bias=default_override_or(0),
            enable_self_stabilization=default_override_or(False),
            name=''):
    '''
    RNNUnit(shape, cell_shape=None, activation=sigmoid, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, name='')

    This is a deprecated name for :func:`~cntk.layers.blocks.RNNStep`. Use that name instead.
    '''

    activation = get_default_override(RNNUnit, activation=activation)
    init = get_default_override(RNNUnit, init=init)
    init_bias = get_default_override(RNNUnit, init_bias=init_bias)
    enable_self_stabilization = get_default_override(
        RNNUnit, enable_self_stabilization=enable_self_stabilization)

    warnings.warn(
        'This name will be removed in future versions. Please use '
        'RNNStep(...) instead, which is identical except for its name',
        DeprecationWarning)

    return _RecurrentBlock('RNNStep',
                           shape,
                           cell_shape,
                           activation=activation,
                           use_peepholes=False,
                           init=init,
                           init_bias=init_bias,
                           enable_self_stabilization=enable_self_stabilization,
                           name=name)
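Since `RNNUnit` only forwards to the `RNNStep` block and raises a `DeprecationWarning`, new code would use the replacement name directly; a minimal sketch:

from cntk.layers import Recurrence, RNNStep

rnn_layer = Recurrence(RNNStep(64))  # same behaviour as RNNUnit(64), without the deprecation warning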
Example No. 3
def WeightDroppedLSTM(shape,
                      dropout_rate,
                      cell_shape=None,
                      activation=default_override_or(tanh),
                      use_peepholes=default_override_or(False),
                      init=default_override_or(glorot_uniform()),
                      init_bias=default_override_or(0),
                      enable_self_stabilization=default_override_or(False),
                      seed=SentinelValueForAutoSelectRandomSeed,
                      name=''):
    '''
    WeightDroppedLSTM(shape, dropout_rate, cell_shape=None, activation=tanh, use_peepholes=False, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, seed=auto_select, name='')

    Layer factory function to create a weight-dropped LSTM block for use inside a recurrence.
    The LSTM block implements one step of the recurrence and is stateless. It accepts the previous state as its first two arguments,
    and outputs its new state as a two-valued tuple ``(h,c)``.

    Example:
     >>> # a typical recurrent LSTM layer
     >>> from cntkx.layers import *
     >>> lstm_layer = Recurrence(WeightDroppedLSTM(500, dropout_rate=0.2))

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        dropout_rate (float): dropout probability applied to the block's weights
        cell_shape (tuple, defaults to `None`): if given, then the output state is first computed at `cell_shape`
         and linearly projected to `shape`
        activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
        use_peepholes (bool, defaults to `False`):
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        seed (int, defaults to auto-select): random seed for the weight dropout
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, prev_c, input) -> (h, c)`` that implements one step of a recurrent LSTM layer.
    '''

    activation = get_default_override(WeightDroppedLSTM, activation=activation)
    use_peepholes = get_default_override(WeightDroppedLSTM,
                                         use_peepholes=use_peepholes)
    init = get_default_override(WeightDroppedLSTM, init=init)
    init_bias = get_default_override(WeightDroppedLSTM, init_bias=init_bias)
    enable_self_stabilization = get_default_override(
        WeightDroppedLSTM, enable_self_stabilization=enable_self_stabilization)

    return _RecurrentBlock('WeightDroppedLSTM',
                           shape,
                           cell_shape,
                           activation=activation,
                           use_peepholes=use_peepholes,
                           init=init,
                           init_bias=init_bias,
                           dropout_rate=dropout_rate,
                           seed=seed,
                           enable_self_stabilization=enable_self_stabilization,
                           name=name)
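The "weight dropped" part refers to applying dropout to weight matrices rather than to activations (DropConnect, as popularised for LSTMs by the AWD-LSTM work). A toy NumPy sketch of that idea only, not of the cntkx implementation:

import numpy as np

def drop_weights(W, dropout_rate, rng):
    # DropConnect: zero out individual weights and rescale the survivors,
    # so the expected value of the weight matrix is unchanged
    mask = rng.random(W.shape) >= dropout_rate
    return W * mask / (1.0 - dropout_rate)

rng = np.random.default_rng(0)
W_hh = rng.standard_normal((4, 4))  # a tiny recurrent weight matrix
print(drop_weights(W_hh, dropout_rate=0.5, rng=rng))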
Example No. 4
def IndyLSTM(shape,
             activation=default_override_or(tanh),
             init=default_override_or(glorot_uniform()),
             init_bias=default_override_or(0),
             enable_self_stabilization=default_override_or(False),
             name=''):
    """
    Implementation of Independently Recurrent Long Short-term Memory cells: IndyLSTMs by Gonnet and Deselaers.
    Paper can be found at https://arxiv.org/abs/1903.08023

    IndyLSTM cells differ from regular LSTM cells in that the recurrent weights are not modeled as a full matrix,
    but as a diagonal matrix, i.e. the output and state of each LSTM cell depends on the inputs and its
    own output/state, as opposed to the input and the outputs/states of all the cells in the layer.
    The number of parameters per IndyLSTM layer, and thus the number of FLOPS per evaluation, is linear in the
    number of nodes in the layer, as opposed to quadratic for regular LSTM layers, resulting in potentially both
    smaller and faster models.

    Example:
     >>> # a gated recurrent layer
     >>> from cntkx.layers import *
     >>> indy_lstm_layer = Recurrence(IndyLSTM(500))

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, prev_c, input) -> (h, c)`` that implements one step of a recurrent IndyLSTM layer.
    """

    activation = get_default_override(IndyLSTM, activation=activation)
    init = get_default_override(IndyLSTM, init=init)
    init_bias = get_default_override(IndyLSTM, init_bias=init_bias)
    enable_self_stabilization = get_default_override(
        IndyLSTM, enable_self_stabilization=enable_self_stabilization)

    return _RecurrentBlock('IndyLSTM',
                           shape,
                           None,
                           activation=activation,
                           use_peepholes=False,
                           init=init,
                           init_bias=init_bias,
                           dropout_rate=0,
                           seed=SentinelValueForAutoSelectRandomSeed,
                           enable_self_stabilization=enable_self_stabilization,
                           name=name)
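The linear-versus-quadratic parameter claim above can be checked with a back-of-the-envelope count, assuming the usual four-gate LSTM parameterization without peepholes:

def lstm_param_count(input_dim, hidden_dim):
    # 4 gates, each with an input matrix, a full hidden-to-hidden matrix and a bias
    return 4 * (hidden_dim * input_dim + hidden_dim * hidden_dim + hidden_dim)

def indy_lstm_param_count(input_dim, hidden_dim):
    # 4 gates, each with an input matrix, a diagonal (per-unit) recurrent weight and a bias
    return 4 * (hidden_dim * input_dim + hidden_dim + hidden_dim)

print(lstm_param_count(256, 512))       # 1,574,912
print(indy_lstm_param_count(256, 512))  # 528,384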
Example No. 5
def MultiHeadAttentionBlock(num_heads, model_dim, obey_sequence_order: bool = None, max_seq_len: int = None,
                            key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                            query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                            value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                            init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                            initial_scale=1, initial_bias=0, name=''):
    """ Multi head attention block as described in "Attention is all you need", https://arxiv.org/abs/1706.03762

    Multi-head attention block comes with a residual connection and a layer norm.

    Example:
        a = C.sequence.input_variable(10)
        b = MultiHeadAttentionBlock(2, 10)(a, a, a)

        assert b.shape == (10, )

    Arguments:
        num_heads (int): number of attention heads
        model_dim (int): number of hidden dim in final output of multi-head attention
        obey_sequence_order: do not let attention peek into future values
        max_seq_len: max sequence length possible, used to ensure that sequence order is obeyed
        key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta

    Returns:
        :class:`~cntk.ops.functions.Function`:

    """
    attention_layer = MultiHeadAttention(num_heads, model_dim, obey_sequence_order, max_seq_len,
                                         key_init=key_init, key_init_bias=key_init_bias,
                                         query_init=query_init, query_init_bias=query_init_bias,
                                         value_init=value_init, value_init_bias=value_init_bias,
                                         init=init, init_bias=init_bias, name='MultiheadAttention')

    layernorm = LayerNormalization(initial_scale=initial_scale, initial_bias=initial_bias, name='LayerNorm')

    @C.Function
    def inner(query, key, value):
        attended = attention_layer(query, key, value)
        skip_connect_attended = attended + query
        normed_skip_connect_attended = layernorm(skip_connect_attended)
        return normed_skip_connect_attended

    return _inject_name(inner, name)
Example No. 6
def IndRNN(shape, activation=default_override_or(relu),
            init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
            enable_self_stabilization=default_override_or(False), name=''):
    """
    IndRNN implementation found in "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN"
    by Li, et al (https://arxiv.org/abs/1803.04831).

    IndRNN are RNNS where neurons in each layer are independent from each other, and the cross-channel information is
    obtained through stacking multiple layers.

    It has been shown that an IndRNN can be easily regulated to prevent the gradient exploding and vanishing problems
    while allowing the network to learn long-term dependencies. Moreover, an IndRNN can work with non-saturated
    activation functions such as relu (rectified linear unit) and still be trained robustly.
    Multiple IndRNNs can be stacked to construct a network that is deeper than the existing RNNs.
    Experimental results have shown that the proposed IndRNN is able to process very long
    sequences (over 5000 time steps), can be used to construct very deep networks (21 layers used in the experiment)
    and still be trained robustly. Better performances have been achieved on various tasks by using IndRNNs compared
    with the traditional RNN and LSTM.

    IndRNN also enables the use of the ReLU activation, which is more efficient to compute than sigmoid and leads to
    faster convergence during training. You may consider initialising the recurrent weights using a uniform
    distribution from 0 to 1.

    The original code is available at: https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne.

    Example:
     >>> # a plain relu RNN layer
     >>> from cntkx.layers import *
     >>> relu_rnn_layer = Recurrence(IndRNN(500))

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.relu`): function to apply at the end
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, input) -> h`` where ``h = activation(input @ W + prev_h * R + b)``
    """

    activation                = get_default_override(IndRNN, activation=activation)
    init                      = get_default_override(IndRNN, init=init)
    init_bias                 = get_default_override(IndRNN, init_bias=init_bias)
    enable_self_stabilization = get_default_override(IndRNN, enable_self_stabilization=enable_self_stabilization)

    return _RecurrentBlock('IndRNN', shape, None, activation=activation, use_peepholes=False,
                           init=init, init_bias=init_bias, dropout_rate=0, seed=SentinelValueForAutoSelectRandomSeed,
                           enable_self_stabilization=enable_self_stabilization, name=name)
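The returned step function computes `h = activation(input @ W + prev_h * R + b)` with an elementwise recurrent weight per unit; a NumPy sketch of a single step, for intuition only:

import numpy as np

def indrnn_step(x, prev_h, W, r, b):
    # r is a vector: each unit only sees its own previous activation
    return np.maximum(x @ W + prev_h * r + b, 0)  # relu activation

rng = np.random.default_rng(0)
W = rng.standard_normal((10, 4))  # input-to-hidden weights
r = rng.uniform(0, 1, size=4)     # per-unit recurrent weights, initialised in [0, 1) as suggested above
b = np.zeros(4)
h = indrnn_step(rng.standard_normal(10), np.zeros(4), W, r, b)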
Example No. 7
def GRU(shape,
        cell_shape=None,
        activation=default_override_or(tanh),
        init=default_override_or(glorot_uniform()),
        init_bias=default_override_or(0),
        enable_self_stabilization=default_override_or(False),
        name=''):
    '''
    GRU(shape, cell_shape=None, activation=tanh, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, name='')

    Layer factory function to create a GRU block for use inside a recurrence.
    The GRU block implements one step of the recurrence and is stateless. It accepts the previous state as its first argument,
    and outputs its new state.

    Example:
     >>> # a gated recurrent layer
     >>> from cntk.layers import *
     >>> gru_layer = Recurrence(GRU(500))

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        cell_shape (tuple, defaults to `None`): if given, then the output state is first computed at `cell_shape`
         and linearly projected to `shape`
        activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, input) -> h`` that implements one step of a recurrent GRU layer.
    '''

    activation = get_default_override(GRU, activation=activation)
    init = get_default_override(GRU, init=init)
    init_bias = get_default_override(GRU, init_bias=init_bias)
    enable_self_stabilization = get_default_override(
        GRU, enable_self_stabilization=enable_self_stabilization)

    return _RecurrentBlock('GRU',
                           shape,
                           cell_shape,
                           activation=activation,
                           use_peepholes=False,
                           init=init,
                           init_bias=init_bias,
                           enable_self_stabilization=enable_self_stabilization,
                           name=name)
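A usage sketch with the standard CNTK layers API: GRU blocks compose with `Recurrence` and can be stacked with `Sequential`.

import cntk as C
from cntk.layers import Recurrence, GRU, Sequential

x = C.sequence.input_variable(30)
# two stacked GRU layers; the second consumes the per-step outputs of the first
model = Sequential([Recurrence(GRU(128)), Recurrence(GRU(64))])
h = model(x)  # per-step output shape: (64,)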
Example No. 8
def LinearAttentionModel(hidden_dim: int, model_dim: int,
                         key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                         query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                         value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                         name=''):
    """ Convenience wrapper in the style of cntk.layers.AttentionModel """
    attention = LinearAttention(hidden_dim=hidden_dim, model_dim=model_dim,
                                key_init=key_init, key_init_bias=key_init_bias,
                                query_init=query_init, query_init_bias=query_init_bias,
                                value_init=value_init, value_init_bias=value_init_bias, name=name)

    def model(encoder_hidden_state, decoder_hidden_state):
        return attention(decoder_hidden_state, encoder_hidden_state, encoder_hidden_state)

    return model
Example No. 9
    def __getitem__(self, arg):
        '''
        Slicing of a Variable. E.g. var[2:3] will translate into slice(var, axis=0, begin_index=2, end_index=3)
        '''
        from . import ops

        # int or slice: normalize into a tuple of int or tuple of slice
        if not isinstance(arg, tuple): 
            arg = (arg,)
        r = self
        axis0 = 0

        from cntk.default_options import get_global_option, get_default_override, default_override_or

        keras_mode_flag = get_global_option('align_axis', 0)
        if keras_mode_flag == 1:
            if (getattr(self, 'dynamic_axes') is not None and len(self.dynamic_axes) > 0):
                axis0 = -get_default_override(None, axis_offset=default_override_or(len(self.dynamic_axes)))

        for axis, s in enumerate(arg):
            if s is Ellipsis: # ellipsis means index relative to end after this point
                axis0 = -len(arg)
                continue
            if isinstance(s, int): # int: normalize into a slice
                s = slice(s, s+1)

            if isinstance(s, slice):
                if s.step is not None and s.step != 1:
                    # TODO: This is not hard to implement in SliceNode.
                    raise ValueError("slicing with a step other than 1 is "
                                     "currently not supported")
                # implement as a CNTK slice() operation
                begin = s.start or 0
                end   = s.stop  or 0
                if begin != 0 or end != 0:
                    r = ops.slice(r, axis=axis + axis0, begin_index=begin, end_index=end)
            elif isinstance(s, (tuple, list)):
                # Select multiple elements from the same dimension. This is
                # different from NumPy's advanced indexing, since we just go
                # axis by axis from left to right and don't do any
                # broadcasting.

                slice_accum = []
                for idx in s:
                    if not isinstance(idx, int):
                        raise IndexError(
                              'indices have to be of type int and not "%s"' %
                               type(idx))
                    slice_accum.append(ops.slice(r, axis=axis,
                                                 begin_index=idx,
                                                 end_index=idx + 1))
                if len(slice_accum) > 1:
                    r = ops.splice(*slice_accum, axis=axis)
                else:
                    r = slice_accum[0]
            else:
                raise IndexError(
                    'type "%s" is not supported as index' % type(s))

        return r
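A short sketch of what this `__getitem__` maps to (shapes assume a plain static input variable):

import cntk as C

x = C.input_variable((4, 5))

y = x[2:3]        # -> C.slice(x, axis=0, begin_index=2, end_index=3); shape (1, 5)
z = x[:, [0, 2]]  # slices columns 0 and 2 on axis 1, then splices them back together; shape (4, 2)

assert y.shape == (1, 5)
assert z.shape == (4, 2)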
Example No. 10
def uniform(shape, dtype=default_override_or(np.float32), low=0.0, high=1.0, seed=auto_select, name=''):
    """uniform(shape, dtype=default_override_or(np.float32), low=0.0, high=1.0, seed=auto_select, name='')
    Generates samples from the uniform distribution in the interval [`low`,`high`).

    Args:
        shape (tuple): shape of the output (entries are independent random draws)
        dtype (np.float32 or np.float64): data type. Default is np.float32.
        low (float): lower end of the range of the random numbers
        high (float): upper end of the range of the random numbers
        seed (int): pseudo random number generator seed (default: automatically select a unique seed)
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Examples:
        >>> u = C.random.uniform((2,3), seed=98052)
        >>> u.eval(device=C.cpu()) # explicitly setting cpu because this is tested on multiple platforms; leave it unspecified in your code
        array([[ 0.931785,  0.814722,  0.479606],
               [ 0.937468,  0.004351,  0.185131]], dtype=float32)

    """
    from cntk.cntk_py import uniform_random
    shape, dtype = sanitize_random_args(shape, dtype)
    return uniform_random(shape, dtype, low, high, seed, name)
Example No. 11
def bernoulli(shape,
              dtype=default_override_or(np.float32),
              mean=0.5,
              seed=auto_select,
              name=''):
    """bernoulli(shape, dtype=default_override_or(np.float32), mean=0.5, seed=auto_select, name='')
    Generates samples from the Bernoulli distribution with success probability `mean`.

    Args:
        shape (tuple): shape of the output (entries are independent random draws)
        dtype (np.float32 or np.float64): data type. Default is np.float32.
        mean (float): success probability
        seed (int): pseudo random number generator seed (default: automatically select a unique seed)
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Examples:
        >>> b = C.random.bernoulli((2,3), seed=98052)
        >>> b.eval(device=C.cpu()) # explicitly setting cpu because this is tested on multiple platforms; leave it unspecified in your code
        array([[ 1.,  1.,  0.],
               [ 1.,  0.,  0.]], dtype=float32)
    """
    from cntk.cntk_py import bernoulli_random
    shape, dtype = sanitize_random_args(shape, dtype)
    return bernoulli_random(shape, dtype, mean, seed, name)
Example No. 12
def gumbel(shape,
           dtype=default_override_or(np.float32),
           loc=0.0,
           scale=1.0,
           seed=auto_select,
           name=''):
    """gumbel(shape, dtype=default_override_or(np.float32), loc=0.0, scale=1.0, seed=auto_select, name='')
    Generates samples from the Gumbel distribution with location `loc` and scale `scale`.

    Args:
        shape (tuple): shape of the output (entries are independent random draws)
        dtype (np.float32 or np.float64): data type. Default is np.float32.
        loc (float): location of the distribution
        scale (float): scale of the distribution
        seed (int): pseudo random number generator seed (default: automatically select a unique seed)
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Examples:
        >>> g = C.random.gumbel((2,3), seed=98052)
        >>> g.eval(device=C.cpu()) # explicitly setting cpu because this is tested on multiple platforms; leave it unspecified in your code
        array([[-0.987713, -0.522298,  0.425918],
               [-1.019599,  5.435177,  1.586071]], dtype=float32)

    See also:
        `The Gumbel-Max Trick
        <https://hips.seas.harvard.edu/blog/2013/04/06/the-gumbel-max-trick-for-discrete-distributions/>`_.
    """
    from cntk.cntk_py import gumbel_random
    shape, dtype = sanitize_random_args(shape, dtype)
    return gumbel_random(shape, dtype, loc, scale, seed, name)
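The Gumbel-Max trick linked above: adding Gumbel(0, 1) noise to log-probabilities and taking the argmax yields a sample from the corresponding categorical distribution. A minimal NumPy sketch, independent of CNTK:

import numpy as np

def gumbel_max_sample(log_probs, rng):
    # argmax_i (log p_i + g_i) with g_i ~ Gumbel(0, 1) is a draw from the categorical distribution p
    g = -np.log(-np.log(rng.random(len(log_probs))))
    return int(np.argmax(log_probs + g))

rng = np.random.default_rng(98052)
p = np.array([0.1, 0.2, 0.7])
draws = [gumbel_max_sample(np.log(p), rng) for _ in range(10000)]
print(np.bincount(draws) / len(draws))  # roughly [0.1, 0.2, 0.7]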
Example No. 13
def normal(shape,
           dtype=default_override_or(np.float32),
           mean=0.0,
           scale=1.0,
           seed=auto_select,
           name=''):
    """normal(shape, dtype=default_override_or(np.float32), mean=0.0, scale=1.0, seed=auto_select, name='')
    Generates samples from the normal distribution with mean `mean` and standard deviation `scale`.

    Args:
        shape (tuple): shape of the output (entries are independent random draws)
        dtype (np.float32 or np.float64): data type. Default is np.float32.
        mean (float): mean of the distribution
        scale (float): scale (standard deviation) of the distribution
        seed (int): pseudo random number generator seed (default: automatically select a unique seed)
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Examples:
        >>> z = C.random.normal((2,3), seed=98052)
        >>> z.eval(device=C.cpu()) # explicitly setting cpu because this is tested on multiple platforms; leave it unspecified in your code
        array([[ 1.803254,  0.995395, -0.631974],
               [-1.73672 ,  0.005615, -0.340025]], dtype=float32)
    """
    from cntk.cntk_py import normal_random
    shape, dtype = sanitize_random_args(shape, dtype)
    return normal_random(shape, dtype, mean, scale, seed, name)
Example No. 14
def uniform(shape,
            dtype=default_override_or(np.float32),
            low=0.0,
            high=1.0,
            seed=auto_select,
            name=''):
    """uniform(shape, dtype=default_override_or(np.float32), low=0.0, high=1.0, seed=auto_select, name='')
    Generates samples from the uniform distribution in the interval [`low`,`high`).

    Args:
        shape (tuple): shape of the output (entries are independent random draws)
        dtype (np.float32 or np.float64): data type. Default is np.float32.
        low (float): lower end of the range of the random numbers
        high (float): upper end of the range of the random numbers
        seed (int): pseudo random number generator seed (default: automatically select a unique seed)
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Examples:
        >>> u = C.random.uniform((2,3), seed=98052)
        >>> u.eval(device=C.cpu()) # explicitly setting cpu because this is tested on multiple platforms; leave it unspecified in your code
        array([[ 0.931785,  0.814722,  0.479606],
               [ 0.937468,  0.004351,  0.185131]], dtype=float32)

    """
    from cntk.cntk_py import uniform_random
    shape, dtype = sanitize_random_args(shape, dtype)
    return uniform_random(shape, dtype, low, high, seed, name)
Example No. 15
def gumbel(shape, dtype=default_override_or(np.float32), loc=0.0, scale=1.0, seed=auto_select, name=''):
    """gumbel(shape, dtype=default_override_or(np.float32), loc=0.0, scale=1.0, seed=auto_select, name='')
    Generates samples from the Gumbel distribution with location `loc` and scale `scale`.

    Args:
        shape (tuple): shape of the output (entries are independent random draws)
        dtype (np.float32 or np.float64): data type. Default is np.float32.
        loc (float): location of the distribution
        scale (float): scale of the distribution
        seed (int): pseudo random number generator seed (default: automatically select a unique seed)
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Examples:
        >>> g = C.random.gumbel((2,3), seed=98052)
        >>> g.eval(device=C.cpu()) # explicitly setting cpu because this is tested on multiple platforms; leave it unspecified in your code
        array([[-0.987713, -0.522298,  0.425918],
               [-1.019599,  5.435177,  1.586071]], dtype=float32)

    See also:
        `The Gumbel-Max Trick
        <https://hips.seas.harvard.edu/blog/2013/04/06/the-gumbel-max-trick-for-discrete-distributions/>`_.
    """
    from cntk.cntk_py import gumbel_random
    shape, dtype = sanitize_random_args(shape, dtype)
    return gumbel_random(shape, dtype, loc, scale, seed, name)
Example No. 16
def IndRNNStep(shape,
               cell_shape=None,
               activation=default_override_or(relu),
               init=default_override_or(glorot_uniform()),
               init_bias=default_override_or(0),
               enable_self_stabilization=default_override_or(False),
               name=''):

    activation = get_default_override(IndRNNStep, activation=activation)
    init = get_default_override(IndRNNStep, init=init)
    init_bias = get_default_override(IndRNNStep, init_bias=init_bias)
    enable_self_stabilization = get_default_override(
        IndRNNStep, enable_self_stabilization=enable_self_stabilization)

    return IndRNNBlock('RNNStep',
                       shape,
                       cell_shape,
                       activation=activation,
                       use_peepholes=False,
                       init=init,
                       init_bias=init_bias,
                       enable_self_stabilization=enable_self_stabilization,
                       name=name)
Example No. 17
def bernoulli(shape, dtype=default_override_or(np.float32), mean=0.5, seed=auto_select, name=''):
    """bernoulli(shape, dtype=default_override_or(np.float32), mean=0.5, seed=auto_select, name='')
    Generates samples from the Bernoulli distribution with success probability `mean`.

    Args:
        shape (tuple): shape of the output (entries are independent random draws)
        dtype (np.float32 or np.float64): data type. Default is np.float32.
        mean (float): success probability
        seed (int): pseudo random number generator seed (default: automatically select a unique seed)
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Examples:
        >>> b = C.random.bernoulli((2,3), seed=98052)
        >>> b.eval(device=C.cpu()) # explicitly setting cpu because this is tested on multiple platforms; leave it unspecified in your code
        array([[ 1.,  1.,  0.],
               [ 1.,  0.,  0.]], dtype=float32)
    """
    from cntk.cntk_py import bernoulli_random
    shape, dtype = sanitize_random_args(shape, dtype)
    return bernoulli_random(shape, dtype, mean, seed, name)
Example No. 18
def normal(shape, dtype=default_override_or(np.float32), mean=0.0, scale=1.0, seed=auto_select, name=''):
    """normal(shape, dtype=default_override_or(np.float32), mean=0.0, scale=1.0, seed=auto_select, name='')
    Generates samples from the normal distribution with mean `mean` and standard deviation `scale`.

    Args:
        shape (tuple): shape of the output (entries are independent random draws)
        dtype (np.float32 or np.float64): data type. Default is np.float32.
        mean (float): mean of the distribution
        scale (float): scale (standard deviation) of the distribution
        seed (int): pseudo random number generator seed (default: automatically select a unique seed)
        name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`

    Examples:
        >>> z = C.random.normal((2,3), seed=98052)
        >>> z.eval(device=C.cpu()) # explicitly setting cpu because this is tested on multiple platforms; leave it unspecified in your code
        array([[ 1.803254,  0.995395, -0.631974],
               [-1.73672 ,  0.005615, -0.340025]], dtype=float32)
    """
    from cntk.cntk_py import normal_random
    shape, dtype = sanitize_random_args(shape, dtype)
    return normal_random(shape, dtype, mean, scale, seed, name)
Example No. 19
def GroupLSTM(shape: int,
              groups=2,
              activation=default_override_or(tanh),
              init=default_override_or(glorot_uniform()),
              init_bias=default_override_or(0),
              enable_self_stabilization=default_override_or(False),
              name=''):
    """ Implementation of group LSTM, the equivalent concept of group convolution but for recurrent neural networks.

    More details can be found in "Efficient Sequence Learning with Group Recurrent Networks" by Gao et al.,
    https://www.aclweb.org/anthology/N18-1073/

    While it is parameter-efficient, it uses more GPU memory during training due to the permutation of
    hidden states across the LSTM groups.

    Arguments:
        shape (int): shape of desired output
        groups (int, defaults to 2): number of LSTM groups; the larger the number of groups, the more parameter-efficient the layer.
        activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, prev_c, input) -> (h, c)`` that implements one step of a recurrent GroupLSTM layer.

    """
    assert isinstance(shape, int)

    if shape % groups:
        raise ValueError(
            f"shape ({shape}) must be divisible by groups ({groups})")

    lstms = [
        C.layers.LSTM(shape // groups,
                      activation=activation,
                      init=init,
                      init_bias=init_bias,
                      enable_self_stabilization=enable_self_stabilization)
        for __ in range(groups)
    ]

    @C.BlockFunction('GroupLSTM', name)
    def group_lstm(dh, dc, x):
        x_grps = split(x, groups).outputs
        dh_grps = split(dh, groups).outputs
        dc_grps = split(dc, groups).outputs

        h_grps = []
        c_grps = []

        for lstm, h_grp, c_grp, x_grp in zip(lstms, dh_grps, dc_grps, x_grps):
            h, c = lstm(h_grp, c_grp, x_grp).outputs
            h_grps.append(h)
            c_grps.append(c)

        # inter-group correlation through permutation of dimensions
        h_output = C.reshape(
            C.swapaxes(C.splice(*h_grps, axis=C.Axis.new_leading_axis())),
            (shape, ))
        c_output = C.reshape(
            C.swapaxes(C.splice(*c_grps, axis=C.Axis.new_leading_axis())),
            (shape, ))

        return h_output, c_output

    return group_lstm
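A usage sketch, assuming `GroupLSTM` is exposed by `cntkx.layers`: the output dimension must be divisible by `groups`, and the block plugs into `Recurrence` like a regular LSTM.

import cntk as C
from cntk.layers import Recurrence
from cntkx.layers import GroupLSTM  # assumption: exported at this path

x = C.sequence.input_variable(64)
group_lstm_layer = Recurrence(GroupLSTM(256, groups=4))  # 4 independent LSTM groups of 64 units each
h = group_lstm_layer(x)  # per-step output shape: (256,)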
Example No. 20
def TransformerEncoderBlock(num_heads: int, model_dim: int, intermediate_dim: int, dropout_rate: float = None,
                            obey_sequence_order: bool = None, max_seq_len: int = None,
                            key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                            query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                            value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                            mha_init=default_override_or(C.glorot_uniform()), mha_init_bias=default_override_or(0),
                            mha_initial_scale=1, mha_initial_bias=0,
                            intermediate_init=default_override_or(C.glorot_uniform()), intermediate_init_bias=default_override_or(0),
                            init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                            initial_scale=1, initial_bias=0, name=''):
    """ Encoder block of transformer as described in "Attention is all you need", https://arxiv.org/abs/1706.03762

    Consists of one multi-head attention block followed by a position-wise feed-forward layer, a residual connection and layer norm.

    Arguments:
        num_heads (int): number of attention heads
        model_dim (int): number of hidden dim in final output of multi-head attention
        intermediate_dim (int): hidden/ intermediate dimension within position-wise feed-forward layer
        dropout_rate (float): probability of dropping out an element in the position-wise feed-forward
        obey_sequence_order: do not let attention peek into future values
        max_seq_len: max sequence length possible, used to ensure that sequence order is obeyed
        key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha_initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        mha_initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
        intermediate_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        intermediate_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta

    Returns:
        :class:`~cntk.ops.functions.Function`:

    """
    mha_block = MultiHeadAttentionBlock(num_heads, model_dim, obey_sequence_order, max_seq_len,
                                        key_init=key_init, key_init_bias=key_init_bias,
                                        query_init=query_init, query_init_bias=query_init_bias,
                                        value_init=value_init, value_init_bias=value_init_bias,
                                        init=mha_init, init_bias=mha_init_bias,
                                        initial_scale=mha_initial_scale, initial_bias=mha_initial_bias,
                                        name='SelfAttention')

    feed_forward = PositionwiseFeedForward(model_dim, intermediate_dim, dropout_rate=dropout_rate,
                                           intermediate_init=intermediate_init, intermediate_init_bias=intermediate_init_bias,
                                           init=init, init_bias=init_bias, name='PWFF')

    layernorm = LayerNormalization(initial_scale, initial_bias, name='LayerNorm')

    @C.Function
    def block(x):
        self_attended = mha_block(x, C.alias(x), C.alias(x))
        hidden = feed_forward(self_attended)
        output = layernorm(hidden + self_attended)  # residual connection
        return output

    return _inject_name(block, name)  # consider change to BlockFunction
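A usage sketch, assuming `TransformerEncoderBlock` is exposed by `cntkx.layers`, with the dimensions used in the original paper:

import cntk as C
from cntkx.layers import TransformerEncoderBlock  # assumption: exported at this path

x = C.sequence.input_variable(512)
encoder = TransformerEncoderBlock(num_heads=8, model_dim=512,
                                  intermediate_dim=2048, dropout_rate=0.1)
encoded = encoder(x)  # per-step output shape: (512,)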
Example No. 21
def MultiHeadAttention(num_heads, model_dim, obey_sequence_order: bool = None, max_seq_len: int = None,
                       key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                       query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                       value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                       init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                       name=''):
    """ Multi-head attention as described in "Attention is all you need", https://arxiv.org/abs/1706.03762

    Example:
        a = C.sequence.input_variable(10)
        b = MultiHeadAttention(2, 10)(a, a, a)

        assert b.shape == (10, )

    Arguments:
        num_heads (int): number of attention heads
        model_dim (int): number of hidden dim in final output of multi-head attention
        obey_sequence_order: do not let attention peek into future values
        max_seq_len: max sequence length possible, used to ensure that sequence order is obeyed
        key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`

    Returns:
        :class:`~cntk.ops.functions.Function`:

    """
    assert model_dim % num_heads == 0, "Model dimension must be divisible by number of heads"

    head_dim = int(model_dim / num_heads)

    query_linear = Dense(model_dim, init=query_init, init_bias=query_init_bias)
    key_linear = Dense(model_dim, init=key_init, init_bias=key_init_bias)
    value_linear = Dense(model_dim, init=value_init, init_bias=value_init_bias)
    multihead_linear = Dense(model_dim, init=init, init_bias=init_bias)

    scaled_dot_product_attention = ScaledDotProductAttention(obey_sequence_order, max_seq_len)

    @C.BlockFunction('MultiHeadAttention', name)
    def inner(query, key, value):
        mixed_queries = query_linear(query)  # [#, *] [model_dim,]
        mixed_keys = key_linear(key)  # [#, *] [model_dim,]
        mixed_values = value_linear(value)  # [#, *] [model_dim,]

        # TODO: re-implement `ScaledDotProductAttention` when cntk has BatchMatMul so there's no need to slice here
        queries = [C.slice(mixed_queries, 0, i * head_dim, (i + 1) * head_dim) for i in range(num_heads)]
        keys = [C.slice(mixed_keys, 0, i * head_dim, (i + 1) * head_dim) for i in range(num_heads)]
        values = [C.slice(mixed_values, 0, i * head_dim, (i + 1) * head_dim) for i in range(num_heads)]

        # list of num_heads heads with shape (-3, head_dim) each
        attention_outputs = [scaled_dot_product_attention(q, k, v) for q, k, v in zip(queries, keys, values)]

        result = multihead_linear(C.splice(*attention_outputs))
        return result

    return _inject_name(inner, name)
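For reference, the per-head `ScaledDotProductAttention` used above follows the standard softmax(Q K^T / sqrt(d_k)) V from the paper; a plain NumPy sketch of that operation, ignoring the sequence axes and masking that the cntkx layer handles:

import numpy as np

def scaled_dot_product_attention(q, k, v):
    # q: (t_q, d_k), k: (t_k, d_k), v: (t_k, d_v)
    scores = q @ k.T / np.sqrt(q.shape[-1])
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)  # row-wise softmax
    return weights @ v                              # (t_q, d_v)

rng = np.random.default_rng(0)
out = scaled_dot_product_attention(rng.standard_normal((3, 8)),
                                   rng.standard_normal((5, 8)),
                                   rng.standard_normal((5, 8)))
print(out.shape)  # (3, 8)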
Example No. 22
def AttentionModel(attention_dim, attention_span=None, attention_axis=None,
                   init=default_override_or(glorot_uniform()),
                   go_backwards=default_override_or(False),
                   enable_self_stabilization=default_override_or(True), name=''):
    '''
    AttentionModel(attention_dim, attention_span=None, attention_axis=None, init=glorot_uniform(), go_backwards=False, enable_self_stabilization=True, name='')

    Layer factory function to create a function object that implements an attention model
    as described in Bahdanau, et al., "Neural machine translation by jointly learning to align and translate."
    '''

    init                      = get_default_override(AttentionModel, init=init)
    go_backwards              = get_default_override(AttentionModel, go_backwards=go_backwards)
    enable_self_stabilization = get_default_override(AttentionModel, enable_self_stabilization=enable_self_stabilization)

    compatible_attention_mode = True
    if attention_span is None:
        if attention_axis is not None:
            raise ValueError('attention_span cannot be None when attention_axis is not None')
        compatible_attention_mode = False
    elif attention_span <= 0:
        raise ValueError('attention_span must be a positive value')
    elif attention_axis is None:
        raise ValueError('attention_axis cannot be None when attention_span is not None')

    # model parameters
    with default_options(bias=False): # all the projections have no bias
        attn_proj_enc   = Stabilizer(enable_self_stabilization=enable_self_stabilization) >> Dense(attention_dim, init=init, input_rank=1) # projects input hidden state, keeping span axes intact
        attn_proj_dec   = Stabilizer(enable_self_stabilization=enable_self_stabilization) >> Dense(attention_dim, init=init, input_rank=1) # projects decoder hidden state, but keeping span and beam-search axes intact
        attn_proj_tanh  = Stabilizer(enable_self_stabilization=enable_self_stabilization) >> Dense(1            , init=init, input_rank=1) # projects tanh output, keeping span and beam-search axes intact
    attn_final_stab = Stabilizer(enable_self_stabilization=enable_self_stabilization)

    if compatible_attention_mode:
        warn('Specifying non-default values for attention_span and attention_axis has been deprecated since version 2.2. '
             'These arguments will be removed in the future.', DeprecationWarning, stacklevel=2)
        # old attention function
        @Function
        def old_attention(h_enc, h_dec):
            history_axis = h_dec # we use history_axis wherever we pass this only for the sake of passing its axis
            # TODO: pull this apart so that we can compute the encoder window only once and apply it to multiple decoders
            # --- encoder state window
            (h_enc, h_enc_valid) = PastValueWindow(attention_span, axis=attention_axis, go_backwards=go_backwards)(h_enc).outputs
            h_enc_proj = attn_proj_enc(h_enc)
            # window must be broadcast to every decoder time step
            h_enc_proj  = C.sequence.broadcast_as(h_enc_proj,  history_axis)
            h_enc_valid = C.sequence.broadcast_as(h_enc_valid, history_axis)
            # --- decoder state
            # project decoder hidden state
            h_dec_proj = attn_proj_dec(h_dec)
            tanh_out = C.tanh(h_dec_proj + h_enc_proj)  # (attention_span, attention_dim)
            u = attn_proj_tanh(tanh_out)              # (attention_span, 1)
            u_masked = u + (h_enc_valid - 1) * 50     # logzero-out the unused elements for the softmax denominator  TODO: use a less arbitrary number than 50
            attention_weights = C.softmax(u_masked, axis=attention_axis) #, name='attention_weights')
            attention_weights = Label('attention_weights')(attention_weights)
            # now take weighted sum over the encoder state vectors
            h_att = C.reduce_sum(C.element_times(C.sequence.broadcast_as(h_enc, history_axis), attention_weights), axis=attention_axis)
            h_att = attn_final_stab(h_att)
            return h_att

        return _inject_name(old_attention, name)
    else:
        # new attention function
        @Function
        def new_attention(encoder_hidden_state, decoder_hidden_state):
            # encode_hidden_state: [#, e] [h]
            # decoder_hidden_state: [#, d] [H]
            unpacked_encoder_hidden_state, valid_mask = C.sequence.unpack(encoder_hidden_state, padding_value=0).outputs
            # unpacked_encoder_hidden_state: [#] [*=e, h]
            # valid_mask: [#] [*=e]
            projected_encoder_hidden_state = C.sequence.broadcast_as(attn_proj_enc(unpacked_encoder_hidden_state), decoder_hidden_state)
            # projected_encoder_hidden_state: [#, d] [*=e, attention_dim]
            broadcast_valid_mask = C.sequence.broadcast_as(C.reshape(valid_mask, (1,), 1), decoder_hidden_state)
            # broadcast_valid_mask: [#, d] [*=e]
            projected_decoder_hidden_state = attn_proj_dec(decoder_hidden_state)
            # projected_decoder_hidden_state: [#, d] [attention_dim]
            tanh_output = C.tanh(projected_decoder_hidden_state + projected_encoder_hidden_state)
            # tanh_output: [#, d] [*=e, attention_dim]
            attention_logits = attn_proj_tanh(tanh_output)
            # attention_logits = [#, d] [*=e, 1]
            minus_inf = C.constant(-1e+30)
            masked_attention_logits = C.element_select(broadcast_valid_mask, attention_logits, minus_inf)
            # masked_attention_logits = [#, d] [*=e]
            attention_weights = C.softmax(masked_attention_logits, axis=0)
            attention_weights = Label('attention_weights')(attention_weights)
            # attention_weights = [#, d] [*=e]
            attended_encoder_hidden_state = C.reduce_sum(attention_weights * C.sequence.broadcast_as(unpacked_encoder_hidden_state, attention_weights), axis=0)
            # attended_encoder_hidden_state = [#, d] [1, h]
            output = attn_final_stab(C.reshape(attended_encoder_hidden_state, (), 0, 1))
            # output = [#, d], [h]
            return output

        return _inject_name(new_attention, name)
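A usage sketch of the default (new-style) attention path above, with encoder and decoder sequences on different dynamic axes:

import cntk as C
from cntk.layers import AttentionModel

enc_axis = C.Axis.new_unique_dynamic_axis('encoder')
encoder_h = C.sequence.input_variable(128, sequence_axis=enc_axis)  # [#, e] [128]
decoder_h = C.sequence.input_variable(128)                          # [#, d] [128]

attention = AttentionModel(attention_dim=128)
context = attention(encoder_h, decoder_h)  # [#, d] [128]: attended encoder summary per decoder step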
Example No. 23
def AttentionModel(attention_dim,
                   attention_span=None,
                   attention_axis=None,
                   init=default_override_or(glorot_uniform()),
                   go_backwards=default_override_or(False),
                   enable_self_stabilization=default_override_or(True),
                   name=''):
    '''
    AttentionModel(attention_dim, attention_span=None, attention_axis=None, init=glorot_uniform(), go_backwards=False, enable_self_stabilization=True, name='')

    Layer factory function to create a function object that implements an attention model
    as described in Bahdanau, et al., "Neural machine translation by jointly learning to align and translate."
    '''

    init = get_default_override(AttentionModel, init=init)
    go_backwards = get_default_override(AttentionModel,
                                        go_backwards=go_backwards)
    enable_self_stabilization = get_default_override(
        AttentionModel, enable_self_stabilization=enable_self_stabilization)

    # until CNTK can handle multiple nested dynamic loops, we require fixed windows and fake it
    if attention_span is None or attention_axis is None:
        raise NotImplementedError(
            'AttentionModel currently requires a fixed attention_span and a static attention_axis to be specified'
        )
    if attention_span <= 0:
        raise ValueError('attention_span must be a positive value')

    # model parameters
    with default_options(bias=False):  # all the projections have no bias
        attn_proj_enc = Stabilizer(
            enable_self_stabilization=enable_self_stabilization) >> Dense(
                attention_dim, init=init, input_rank=1
            )  # projects input hidden state, keeping span axes intact
        attn_proj_dec = Stabilizer(
            enable_self_stabilization=enable_self_stabilization
        ) >> Dense(
            attention_dim, init=init, input_rank=1
        )  # projects decoder hidden state, but keeping span and beam-search axes intact
        attn_proj_tanh = Stabilizer(
            enable_self_stabilization=enable_self_stabilization) >> Dense(
                1, init=init, input_rank=1
            )  # projects tanh output, keeping span and beam-search axes intact
    attn_final_stab = Stabilizer(
        enable_self_stabilization=enable_self_stabilization)

    # attention function
    @Function
    def attention(h_enc, h_dec):
        history_axis = h_dec  # h_dec is used here only to convey its dynamic axis to the broadcast_as calls below
        # TODO: pull this apart so that we can compute the encoder window only once and apply it to multiple decoders
        # --- encoder state window
        (h_enc, h_enc_valid) = PastValueWindow(
            attention_span, axis=attention_axis,
            go_backwards=go_backwards)(h_enc).outputs
        h_enc_proj = attn_proj_enc(h_enc)
        # window must be broadcast to every decoder time step
        h_enc_proj = C.sequence.broadcast_as(h_enc_proj, history_axis)
        h_enc_valid = C.sequence.broadcast_as(h_enc_valid, history_axis)
        # --- decoder state
        # project decoder hidden state
        h_dec_proj = attn_proj_dec(h_dec)
        tanh_out = C.tanh(h_dec_proj +
                          h_enc_proj)  # (attention_span, attention_dim)
        u = attn_proj_tanh(tanh_out)  # (attention_span, 1)
        u_masked = u + (
            h_enc_valid - 1
        ) * 50  # logzero-out the unused elements for the softmax denominator  TODO: use a less arbitrary number than 50
        attention_weights = C.softmax(
            u_masked, axis=attention_axis)  #, name='attention_weights')
        attention_weights = Label('attention_weights')(attention_weights)
        # now take weighted sum over the encoder state vectors
        h_att = C.reduce_sum(C.element_times(h_enc_proj, attention_weights),
                             axis=attention_axis)
        h_att = attn_final_stab(h_att)
        return h_att

    return _inject_name(attention, name)
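
A minimal usage sketch for the windowed AttentionModel above; a fixed attention_span and a static attention_axis are required, and the concrete values below are only illustrative:

import cntk as C

attn = AttentionModel(attention_dim=128, attention_span=20, attention_axis=-3)
h_enc = C.sequence.input_variable(64)    # encoder hidden states
h_dec = C.sequence.input_variable(128)   # decoder hidden states
context = attn(h_enc, h_dec)             # one attention-weighted encoder summary per decoder step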
Ejemplo n.º 25
0
    def __getitem__(self, arg):
        '''
        Slicing of a Variable. E.g. var[2:3] will translate into slice(var, axis=0, begin_index=2, end_index=3)
        '''
        from . import ops
        
        if hasattr(self, 'outputs') and len(self.outputs) > 1:
            try:
                return self.outputs[arg]
            except Exception as e:
                msg = 'Slicing of multi-output functions is not supported; ' \
                      'the fallback of selecting a single output requires ' \
                      'that only one index is provided. arg: {}, self: {}'.format(
                    arg, self)
                raise KeyError(msg)

        # int or slice: normalize into a tuple of int or tuple of slice
        if not isinstance(arg, tuple): 
            arg = (arg,)
        r = self
        axis0 = 0

        from cntk.default_options import get_global_option, get_default_override, default_override_or

        keras_mode_flag = get_global_option('align_axis', 0)
        if keras_mode_flag == 1:
            if (getattr(self, 'dynamic_axes') is not None and len(self.dynamic_axes) > 0):
                axis0 = -get_default_override(None, axis_offset=default_override_or(len(self.dynamic_axes)))

        for axis, s in enumerate(arg):
            if s is Ellipsis: # ellipsis means index relative to end after this point
                axis0 = -len(arg)
                continue
            if isinstance(s, int): # int: normalize into a slice
                s = slice(s, s+1)

            if isinstance(s, slice):
                begin = s.start or 0
                end   = s.stop  or 0
                if begin != 0 or end != 0:
                    r = ops.slice(r, axis=axis + axis0, begin_index=begin, end_index=end, strides=s.step)
            elif isinstance(s, (tuple, list)):
                # Select multiple elements from the same dimension. This is
                # different from NumPy's advanced indexing, since we just go
                # axis by axis from left to right and don't do any
                # broadcasting.

                slice_accum = []
                for idx in s:
                    if not isinstance(idx, int):
                        raise IndexError(
                              'indices have to be of type int and not "%s"' %
                               type(idx))
                    slice_accum.append(ops.slice(r, axis=axis,
                                                 begin_index=idx,
                                                 end_index=idx + 1))
                if len(slice_accum) > 1:
                    r = ops.splice(*slice_accum, axis=axis)
                else:
                    r = slice_accum[0]
            else:
                raise IndexError(
                    'type "%s" is not supported as index' % type(s))

        return r
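
An illustrative sketch of what this __getitem__ translates to for a plain CNTK variable (the resulting ops are noted in the comments):

import cntk as C

x = C.input_variable((4, 5))
y = x[1:3]           # -> C.slice(x, axis=0, begin_index=1, end_index=3)
z = x[:, (0, 2, 4)]  # -> C.splice of three C.slice(...) calls along axis 1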
Ejemplo n.º 26
0
def LinearAttention(hidden_dim: int, model_dim: int,
                    key_init=default_override_or(C.glorot_uniform()), key_init_bias=default_override_or(0),
                    query_init=default_override_or(C.glorot_uniform()), query_init_bias=default_override_or(0),
                    value_init=default_override_or(C.glorot_uniform()), value_init_bias=default_override_or(0),
                    name=''):
    """ Attention model that is linear in time and memory complexity.
    This is a huge improvement from standard softmax attention models or self-attention
    where the time and memory complexity is quadratic in sequence length.

    This is especially significant since cntk doesn't have any build-in checkpointing functionality
    that saves gpu memory and hence allow the training of Transformer models. With this attention,
    it becomes possible to do transformer training on cntk.

    This implementation addresses the limitation of attentions by express the attention
    as a linear dot-product of kernel feature maps and made use of the associativity property of matrix products.

    When query, key and value are all the same, it becomes self-attention.

    For more details refer to "Transformers are RNNs:Fast Autoregressive Transformers with Linear Attention" by
    Katharopoulos et al. (https://arxiv.org/abs/2006.16236)

    Note:
        Key and value must have the same sequence length

    Example:
        a = C.sequence.input_variable(24)
        b = LinearAttention(hidden_dim=32, model_dim=24)(a, a, a)

        assert b.shape == (32, )

    Arguments:
        hidden_dim (int): dimensionality of the final output, i.e. of the projection of the value
        model_dim (int): dimensionality of the attention space (the query and key projections)
        key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`

    Returns:
        :class:`~cntk.ops.functions.Function`:

    """
    query_linear = Dense(model_dim, init=query_init, init_bias=query_init_bias)
    key_linear = Dense(model_dim, init=key_init, init_bias=key_init_bias)
    value_linear = Dense(hidden_dim, init=value_init, init_bias=value_init_bias)

    def phi(x):  # kernel
        return C.elu(x) + 1

    @C.Function
    def model(query, key, value):
        q = phi(query_linear(query))
        k = phi(key_linear(key))
        v = value_linear(value)

        # key and value should have the same sequence length
        k_unpacked = C.sequence.unpack(k, padding_value=0, no_mask_output=True)
        # k_unpacked: [#] [*kv=, model_dim]
        v_unpacked = C.sequence.unpack(v, padding_value=0, no_mask_output=True)
        # v_unpacked: [#] [*kv=, hidden_dim]
        kv = C.times(C.swapaxes(k_unpacked), v_unpacked)
        # kv [#] [model_dim, hidden_dim]
        kv_broadcasted = C.sequence.broadcast_as(kv, q)  # this can be reused across queries
        # kv [#, *] [model_dim, hidden_dim]

        numerator = C.squeeze(C.times(C.expand_dims(q, axis=C.Axis.new_leading_axis()), kv_broadcasted))
        # numerator [#, *] [hidden_dim, ]
        denom = C.reduce_sum(q * C.sequence.broadcast_as(C.sequence.reduce_sum(k), q))
        # denom [#, *] [1]

        return numerator / denom

    return model
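
A NumPy sketch (not part of the layer) of the associativity identity the block above exploits: the attention output is computed as phi(Q) (phi(K)^T V), normalized by phi(Q) . sum_j phi(k_j), which is linear in the sequence length:

import numpy as np

def phi(x):                      # same feature map as in the layer: elu(x) + 1
    return np.where(x > 0, x + 1.0, np.exp(x))

T, model_dim, hidden_dim = 7, 24, 32
Q = phi(np.random.randn(T, model_dim))
K = phi(np.random.randn(T, model_dim))
V = np.random.randn(T, hidden_dim)

kv = K.T @ V                                                   # [model_dim, hidden_dim], built once
numerator = Q @ kv                                             # [T, hidden_dim]
denominator = (Q * K.sum(axis=0, keepdims=True)).sum(axis=1, keepdims=True)
out = numerator / denominator                                  # [T, hidden_dim]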
Ejemplo n.º 27
0
def Stabilizer(steepness=4,
               enable_self_stabilization=default_override_or(True),
               name=''):
    '''
    Stabilizer(steepness=4, enable_self_stabilization=True, name='')

    Layer factory function to create a `Droppo self-stabilizer <https://www.microsoft.com/en-us/research/wp-content/uploads/2016/11/SelfLR.pdf>`_.
    It multiplies its input with a scalar that is learned.

    It takes `enable_self_stabilization` as a flag that allows the stabilizer to be disabled. This is useful if the flag is set through a global default.

    Note:
        Some other layers (specifically, recurrent units like :func:`~cntk.layers.blocks.LSTM`) also have the option to
        use the ``Stabilizer()`` layer internally. That is enabled by passing `enable_self_stabilization=True`
        to those layers. In conjunction with those, the rule is that an explicit ``Stabilizer()`` must be
        inserted by the user for the main data input, whereas the recurrent layer will own the stabilizer(s)
        for the internal recurrent connection(s).

    Note:
        Unlike the original paper, which proposed a linear or exponential scalar,
        CNTK uses a sharpened Softplus: (1/steepness) * ln(1 + e^(steepness * beta)).
        The softplus behaves linearly for weights around and above 1 (like the linear scalar) while guaranteeing
        positiveness (like the exponential variant), but is also more robust because it avoids exploding gradients.

    Example:
     >>> # recurrent model with self-stabilization
     >>> from cntk.layers import *
     >>> with default_options(enable_self_stabilization=True): # enable stabilizers by default for LSTM()
     ...     model = Sequential([
     ...         Embedding(300),
     ...         Stabilizer(),           # stabilizer for main data input of recurrence
     ...         Recurrence(LSTM(512)),  # LSTM owns its own stabilizers for the recurrent connections
     ...         Stabilizer(),
     ...         Dense(10)
     ...     ])

    Args:
        steepness (`int`, defaults to 4): sharpness of the softplus that produces the learned scale factor
        enable_self_stabilization (bool, defaults to `True`): a flag that allows the stabilizer to be disabled. Useful if this is set through a global default
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function
    '''

    enable_self_stabilization = get_default_override(
        Stabilizer, enable_self_stabilization=enable_self_stabilization)

    if not enable_self_stabilization:  # disabled (typically through global option; otherwise one would not call this in the first place)
        return identity

    # parameters bound to this Function
    init_param = np.log(
        np.exp(steepness) - 1
    ) / steepness  # initialize so that factor is initially 1 (has no effect)
    param = Parameter((), init=init_param, name='alpha')
    beta = softplus(param, steepness=steepness)

    # expression
    @BlockFunction('Stabilizer', name)
    def stabilize(x):
        return beta * x

    return stabilize
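
A quick numeric check (my own, using NumPy) that the initialization above makes the learned scale start at exactly 1, so the stabilizer is initially a no-op:

import numpy as np

steepness = 4
init_param = np.log(np.exp(steepness) - 1) / steepness          # same formula as in the code above
beta0 = np.log1p(np.exp(steepness * init_param)) / steepness    # sharpened softplus at init
print(beta0)                                                    # 1.0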
Ejemplo n.º 28
0
def LSTM(shape, activation=default_override_or(tanh), weight_drop_rate=None,
         ih_init=default_override_or(glorot_uniform()), ih_bias=default_override_or(0),
         hh_init=default_override_or(glorot_uniform()), hh_bias=default_override_or(0),
         name=''):
    """ PyTorch style implementation of LSTM. Used for loading pytorch pretrained models.

    The difference between this implementation and CNTK's is that the slicing (gate order) of
    the recurrent weights is different.

    PyTorch uses the gate order ifgo whereas CNTK uses igfo. PyTorch also has two bias vectors,
    while CNTK has only one; this implementation keeps a single bias to speed things up a little.

    """
    activation = get_default_override(LSTM, activation=activation)
    ih_init = get_default_override(LSTM, ih_init=ih_init)
    ih_bias = get_default_override(LSTM, ih_bias=ih_bias)
    hh_init = get_default_override(LSTM, hh_init=hh_init)
    hh_bias = get_default_override(LSTM, hh_bias=hh_bias)

    stack_axis = - 1
    shape = _as_tuple(shape)
    cell_shape = shape
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[stack_axis]
    cell_shape_list[stack_axis] = stacked_dim * 4
    cell_shape_stacked = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times
    cell_shape_list[stack_axis] = stacked_dim * 4
    cell_shape_stacked_H = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times

    init_bias = ih_bias + hh_bias  # combine both biases in pytorch into one
    b  = Parameter(            cell_shape_stacked,   init=init_bias,    name='b')                    # bias
    W  = Parameter(_INFERRED + cell_shape_stacked,   init=ih_init,      name='W')                    # input
    H  = Parameter(shape     + cell_shape_stacked_H, init=hh_init,      name='H')                    # hidden-to-hidden

    dropout = C.layers.Dropout(dropout_rate=weight_drop_rate, name='h_dropout') if weight_drop_rate is not None else None

    @C.BlockFunction('PT::LSTM', name)
    def lstm(dh, dc, x):
        # projected contribution from input(s), hidden, and bias

        dropped_H = dropout(H) if weight_drop_rate is not None else H
        proj4 = b + times(x, W) + times(dh, dropped_H)

        # slicing layout different from cntk's implementation
        it_proj  = slice(proj4, stack_axis, 0 * stacked_dim, 1 * stacked_dim)  # split along stack_axis
        ft_proj  = slice(proj4, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
        bit_proj = slice(proj4, stack_axis, 2 * stacked_dim, 3 * stacked_dim)  # g gate
        ot_proj  = slice(proj4, stack_axis, 3 * stacked_dim, 4 * stacked_dim)

        it = sigmoid(it_proj)                        # input gate(t)
        bit = it * activation(bit_proj)              # applied to tanh of input network

        ft = sigmoid(ft_proj)                        # forget-me-not gate(t)
        bft = ft * dc                                # applied to cell(t-1)

        ct = bft + bit                               # c(t) is sum of both

        ot = sigmoid(ot_proj)                        # output gate(t)
        ht = ot * activation(ct)                     # applied to tanh(cell(t))
        return ht, ct

    return lstm
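
A hypothetical usage sketch: the cell above has the usual (dh, dc, x) -> (h, c) signature, so it can be dropped straight into a recurrence (dimensions are illustrative):

import cntk as C

x = C.sequence.input_variable(100)
h = C.layers.Recurrence(LSTM(256, weight_drop_rate=0.1))(x)   # h: [#, *] [256]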
Ejemplo n.º 29
0
def dense_factored(shapes, #(shape1, shape2)
                  activation=default_override_or(identity),
                  init={'W1':None, 'W2':None},
                  input_rank=None,
                  map_rank=None,
                  bias=default_override_or(True),
                  init_bias=default_override_or(0),
                  name=''):
    '''
    Create the new model using the factored weights W1 and W2.
    The returned function represents the new model.

    Args:
        shapes                  : dimensions of the input matrices.
        activation              : activation function used for the model.
        init                    : the two matrices corresponding to the factorization.
        input_rank              : rank of the input tensor.
        map_rank                : ???
        bias                    : bias for the model.
        init_bias               : initial bias value.
        name                    : name of the block function that creates the new model.
        
    Returns:
        a model that is factored and projected (reduced).
    '''

    # matthaip: Not sure how to handle input tensor of rank > 1
    # or selective flattening of ranks
    assert(input_rank is None and
           map_rank is None and
           all(isinstance(s,int) for s in list(shapes)))

    activation = get_default_override(cntk.layers.Dense, activation=activation)
    bias       = get_default_override(cntk.layers.Dense, bias=bias)
    init_bias  = get_default_override(cntk.layers.Dense, init_bias=init_bias)
    # how to use get_default_override for the init parameter?

    output_shape1 = _as_tuple(shapes[0])
    output_shape2 = _as_tuple(shapes[1])
    if input_rank is not None and map_rank is not None:
        raise ValueError("Dense: input_rank and map_rank cannot be specified at the same time.")


    # If input_rank not given then pass a single _INFERRED; 
    # map_rank if given will determine the input_rank.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED

    # parameters bound to this Function
    #    init_weights = _initializer_for(init, Record(output_rank=output_rank))
    init_weights = init
    W1 = Parameter(input_shape + output_shape1, init=init_weights['W1'], name='W1')
    W2 = Parameter(output_shape1 + output_shape2, init=init_weights['W2'], name='W2')
    b = Parameter(output_shape2, init=init_bias,    name='b') if bias else None

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        r = times(x, W1)
        r = times(r, W2)
        if b:
            r = r + b
        if activation is not None:
            r = activation(r)
        return r
    return dense
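
A hypothetical usage sketch: the two factors act as a rank-64 bottleneck replacing a single 1024 -> 1024 Dense layer; in practice W1 and W2 would typically come from factorizing a trained weight matrix:

import cntk as C

layer = dense_factored((64, 1024), init={'W1': C.glorot_uniform(), 'W2': C.glorot_uniform()})
x = C.input_variable(1024)
y = layer(x)   # times(times(x, W1), W2) + b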
Ejemplo n.º 30
0
def TransformerDecoderBlock(num_heads: int, model_dim: int, intermediate_dim: int, dropout_rate: float = None,
                            obey_sequence_order: bool = True, max_seq_len: int = None,
                            mha1_key_init=default_override_or(C.glorot_uniform()), mha1_key_init_bias=default_override_or(0),
                            mha1_query_init=default_override_or(C.glorot_uniform()), mha1_query_init_bias=default_override_or(0),
                            mha1_value_init=default_override_or(C.glorot_uniform()), mha1_value_init_bias=default_override_or(0),
                            mha1_init=default_override_or(C.glorot_uniform()), mha1_init_bias=default_override_or(0),
                            mha1_initial_scale=1, mha1_initial_bias=0,
                            mha2_key_init=default_override_or(C.glorot_uniform()), mha2_key_init_bias=default_override_or(0),
                            mha2_query_init=default_override_or(C.glorot_uniform()), mha2_query_init_bias=default_override_or(0),
                            mha2_value_init=default_override_or(C.glorot_uniform()), mha2_value_init_bias=default_override_or(0),
                            mha2_init=default_override_or(C.glorot_uniform()), mha2_init_bias=default_override_or(0),
                            mha2_initial_scale=1, mha2_initial_bias=0,
                            intermediate_init=default_override_or(C.glorot_uniform()),
                            intermediate_init_bias=default_override_or(0),
                            init=default_override_or(C.glorot_uniform()), init_bias=default_override_or(0),
                            initial_scale=1, initial_bias=0):
    """ Decoder block of transformer as described in "Attention is all you need", https://arxiv.org/abs/1706.03762

    Consists of two multi-head attention blocks followed by a position-wise feed-forward layer, with residual connections and layer normalization.

    Arguments:
        num_heads (int): number of attention heads
        model_dim (int): number of hidden dim in final output of multi-head attention
        intermediate_dim (int): hidden/intermediate dimension of the position-wise feed-forward layer
        dropout_rate (float): probability of dropping out an element in the position-wise feed-forward
        obey_sequence_order (bool, defaults True): do not let attention peek into future values
        max_seq_len (int): max sequence length possible, used to ensure that sequence order is obeyed
        mha1_key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha1_key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha1_query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha1_query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha1_value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha1_value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha1_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha1_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha1_initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        mha1_initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
        mha2_key_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha2_key_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha2_query_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha2_query_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha2_value_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha2_value_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha2_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        mha2_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        mha2_initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        mha2_initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta
        intermediate_init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        intermediate_init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to :func:`~cntk.initializer.glorot_uniform` ): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        initial_scale (float, default 1): initial value for the ``scale`` parameter aka gamma
        initial_bias (float, default 0): initial value for the ``bias`` parameter aka beta

    Returns:
        :class:`~cntk.ops.functions.Function`:

    """
    mha_block1 = MultiHeadAttentionBlock(num_heads=num_heads, model_dim=model_dim,
                                         obey_sequence_order=obey_sequence_order, max_seq_len=max_seq_len,
                                         key_init=mha1_key_init, key_init_bias=mha1_key_init_bias,
                                         query_init=mha1_query_init, query_init_bias=mha1_query_init_bias,
                                         value_init=mha1_value_init, value_init_bias=mha1_value_init_bias,
                                         init=mha1_init, init_bias=mha1_init_bias,
                                         initial_scale=mha1_initial_scale, initial_bias=mha1_initial_bias)
    
    mha_block2 = MultiHeadAttentionBlock(num_heads=num_heads, model_dim=model_dim,
                                         obey_sequence_order=False, max_seq_len=None,
                                         key_init=mha2_key_init, key_init_bias=mha2_key_init_bias,
                                         query_init=mha2_query_init, query_init_bias=mha2_query_init_bias,
                                         value_init=mha2_value_init, value_init_bias=mha2_value_init_bias,
                                         init=mha2_init, init_bias=mha2_init_bias,
                                         initial_scale=mha2_initial_scale, initial_bias=mha2_initial_bias)

    feed_forward = PositionwiseFeedForward(model_dim, intermediate_dim, dropout_rate=dropout_rate,
                                          intermediate_init=intermediate_init, intermediate_init_bias=intermediate_init_bias,
                                          init=init, init_bias=init_bias)

    layernorm = LayerNormalization(initial_scale, initial_bias)

    @C.Function
    def block(encoded, x):
        inner = mha_block1(x, x, x)
        inner = mha_block2(inner, encoded, encoded)
        output = layernorm(ResNetBlock(feed_forward)(inner))
        return output

    return block
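
A hypothetical usage sketch (assuming the cntkx-style MultiHeadAttentionBlock and PositionwiseFeedForward used above; all dimensions are illustrative):

import cntk as C

encoded = C.sequence.input_variable(512)   # encoder output
target = C.sequence.input_variable(512)    # embedded (shifted) decoder input
decoder_block = TransformerDecoderBlock(num_heads=8, model_dim=512, intermediate_dim=2048,
                                         dropout_rate=0.1, obey_sequence_order=True,
                                         max_seq_len=100)
decoded = decoder_block(encoded, target)   # [#, *] [512]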
Ejemplo n.º 31
0
def dense_factored(
        shapes,  #(shape1, shape2)
        activation=default_override_or(identity),
        init={
            'W1': None,
            'W2': None
        },
        input_rank=None,
        map_rank=None,
        bias=default_override_or(True),
        init_bias=default_override_or(0),
        name=''):
    '''
    Create the new model using the factored weights W1 and W2.
    The returned function represents the new model.

    Args:
        shapes                  : dimensions of the input matrices.
        activation              : activation function used for the model.
        init                    : the two matrices corresponding to the factorization.
        input_rank              : rank of the input tensor.
        map_rank                : ???
        bias                    : bias for the model.
        init_bias               : initial bias value.
        name                    : name of the block function that creates the new model.
        
    Returns:
        a model that is factored and projected (reduced).
    '''

    # matthaip: Not sure how to handle input tensor of rank > 1
    # or selective flattening of ranks
    assert (input_rank is None and map_rank is None
            and all(isinstance(s, int) for s in list(shapes)))

    activation = get_default_override(cntk.layers.Dense, activation=activation)
    bias = get_default_override(cntk.layers.Dense, bias=bias)
    init_bias = get_default_override(cntk.layers.Dense, init_bias=init_bias)
    # how to use get_default_override for the init parameter?

    output_shape1 = _as_tuple(shapes[0])
    output_shape2 = _as_tuple(shapes[1])
    if input_rank is not None and map_rank is not None:
        raise ValueError(
            "Dense: input_rank and map_rank cannot be specified at the same time."
        )

    # If input_rank not given then pass a single _INFERRED;
    # map_rank if given will determine the input_rank.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED

    # parameters bound to this Function
    #    init_weights = _initializer_for(init, Record(output_rank=output_rank))
    init_weights = init
    W1 = Parameter(input_shape + output_shape1,
                   init=init_weights['W1'],
                   name='W1')
    W2 = Parameter(output_shape1 + output_shape2,
                   init=init_weights['W2'],
                   name='W2')
    b = Parameter(output_shape2, init=init_bias, name='b') if bias else None

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        r = times(x, W1)
        r = times(r, W2)
        if b:
            r = r + b
        if activation is not None:
            r = activation(r)
        return r

    return dense
Ejemplo n.º 32
0
    def __getitem__(self, arg):
        '''
        Slicing of a Variable. E.g. var[2:3] will translate into slice(var, axis=0, begin_index=2, end_index=3)
        '''
        from . import ops

        if hasattr(self, 'outputs') and len(self.outputs) > 1:
            try:
                return self.outputs[arg]
            except Exception as e:
                msg = 'Slicing of multi-output functions is not supported; ' \
                      'the fallback of selecting a single output requires ' \
                      'that only one index is provided. arg: {}, self: {}'.format(
                    arg, self)
                raise KeyError(msg)

        # int or slice: normalize into a tuple of int or tuple of slice
        if not isinstance(arg, tuple):
            arg = (arg, )
        r = self
        axis0 = 0

        from cntk.default_options import get_global_option, get_default_override, default_override_or

        keras_mode_flag = get_global_option('align_axis', 0)
        if keras_mode_flag == 1:
            if (getattr(self, 'dynamic_axes') is not None
                    and len(self.dynamic_axes) > 0):
                axis0 = -get_default_override(None,
                                              axis_offset=default_override_or(
                                                  len(self.dynamic_axes)))

        for axis, s in enumerate(arg):
            if s is Ellipsis:  # ellipsis means index relative to end after this point
                axis0 = -len(arg)
                continue
            if isinstance(s, int):  # int: normalize into a slice
                s = slice(s, s + 1)

            if isinstance(s, slice):
                begin = s.start or 0
                end = s.stop or 0
                if begin != 0 or end != 0:
                    r = ops.slice(r,
                                  axis=axis + axis0,
                                  begin_index=begin,
                                  end_index=end,
                                  strides=s.step)
            elif isinstance(s, (tuple, list)):
                # Select multiple elements from the same dimension. This is
                # different from NumPy's advanced indexing, since we just go
                # axis by axis from left to right and don't do any
                # broadcasting.

                slice_accum = []
                for idx in s:
                    if not isinstance(idx, int):
                        raise IndexError(
                            'indices have to be of type int and not "%s"' %
                            type(idx))
                    slice_accum.append(
                        ops.slice(r,
                                  axis=axis,
                                  begin_index=idx,
                                  end_index=idx + 1))
                if len(slice_accum) > 1:
                    r = ops.splice(*slice_accum, axis=axis)
                else:
                    r = slice_accum[0]
            else:
                raise IndexError('type "%s" is not supported as index' %
                                 type(s))

        return r
Ejemplo n.º 33
0
def Recurrence(step_function, go_backwards=default_override_or(False), initial_state=default_override_or(0),
               return_full_state=False, dropout_rate_input=None,
               dropout_rate_output=None, seed=SentinelValueForAutoSelectRandomSeed, name=''):
    '''
    Recurrence(step_function, go_backwards=False, initial_state=0, return_full_state=False, name='')

    This version of Recurrence has the option to apply variational dropout to its input and output.

    Layer factory function that implements a recurrent model, including the common RNN, LSTM, and GRU recurrences.
    This factory function creates a function that runs a step function recurrently over an input sequence,
    where in each step, Recurrence() will pass to the step function a data input as well as the output of the
    previous step.
    The following pseudo-code represents what happens when you call a `Recurrence()` layer::

      # pseudo-code for y = Recurrence(step_function)(x)
      #  x: input sequence of tensors along the dynamic axis
      #  y: resulting sequence of outputs along the same dynamic axis
      y = []              # result sequence goes here
      s = initial_state   # s = output of previous step ("state")
      for x_n in x:       # pseudo-code for looping over all steps of input sequence along its dynamic axis
          s = step_function(s, x_n)  # pass previous state and new data to step_function -> new state
          y.append(s)

    The common step functions are :func:`~cntk.layers.blocks.LSTM`, :func:`~cntk.layers.blocks.GRU`, and :func:`~cntk.layers.blocks.RNNStep`,
    but the step function can be any :class:`~cntk.ops.functions.Function` or Python function.
    The signature of a step function with a single state variable must be
    ``(h_prev, x) -> h``, where ``h_prev`` is the previous state, ``x`` is the new
    data input, and the output is the new state.
    The step function will be called item by item, resulting in a sequence of the same length as the input.

    Step functions can have more than one state output, e.g. :func:`~cntk.layers.blocks.LSTM`.
    In this case, the first N arguments are the previous state, followed by one more argument that
    is the data input; and its output must be a tuple of N values.
    In this case, the recurrence operation will, by default, return the first of the state variables
    (in the LSTM case, the ``h``), while additional state variables are internal (like the LSTM's ``c``).
    If all state variables should be returned, pass ``return_full_state=True``.

    To provide your own step function, just use any :class:`~cntk.ops.functions.Function` (or equivalent Python function) that
    has a signature as described above.
    For example, a cumulative sum over a sequence can be computed as ``Recurrence(plus)``,
    where each step consists of `plus(s,x_n)`, where `s` is the output of the previous call
    and hence the cumulative sum of all elements up to `x_n`.
    Another example is a GRU layer with projection, which could be realized as ``Recurrence(GRU(500) >> Dense(200))``,
    where the projection is applied to the hidden state as fed back to the next step.
    ``F>>G`` is a short-hand for ``Sequential([F, G])``.

    Optionally, the recurrence can run backwards. This is useful for constructing bidirectional models.

    ``initial_state`` must be a constant. To pass initial_state as a data input, e.g. for a sequence-to-sequence
    model, use :func:`~cntk.layers.sequence.RecurrenceFrom()` instead.

    Note: ``Recurrence()`` is the equivalent to what in functional programming is often called ``scanl()``.

    Example:
     >>> from cntk.layers import Sequential
     >>> from cntk.layers.typing import Tensor, Sequence

     >>> # a recurrent LSTM layer
     >>> lstm_layer = Recurrence(LSTM(500))

     >>> # a bidirectional LSTM layer
     >>> # using function tuples to implement a bidirectional LSTM
     >>> bi_lstm_layer = Sequential([(Recurrence(LSTM(250)),                      # first tuple entry: forward pass
     ...                              Recurrence(LSTM(250), go_backwards=True)),  # second: backward pass
     ...                             splice])                                     # splice both on top of each other
     >>> bi_lstm_layer.update_signature(Sequence[Tensor[13]])
     >>> bi_lstm_layer.shape   # shape reflects concatenation of both output states
     (500,)
     >>> tuple(str(axis.name) for axis in bi_lstm_layer.dynamic_axes)  # (note: str() needed only for Python 2.7)
     ('defaultBatchAxis', 'defaultDynamicAxis')

     >>> # custom step function example: using Recurrence() to
     >>> # compute the cumulative sum over an input sequence
     >>> x = C.input_variable(**Sequence[Tensor[2]])
     >>> x0 = np.array([[   3,    2],
     ...                [  13,   42],
     ...                [-100, +100]])
     >>> cum_sum = Recurrence(C.plus, initial_state=Constant([0, 0.5]))
     >>> y = cum_sum(x)
     >>> y(x0)
     [array([[   3. ,    2.5],
             [  16. ,   44.5],
             [ -84. ,  144.5]], dtype=float32)]

    Args:
     step_function (:class:`~cntk.ops.functions.Function` or equivalent Python function):
      This function must have N+1 inputs and N outputs, where N is the number of state variables
      (typically 1 for GRU and plain RNNs, and 2 for LSTMs).
     go_backwards (bool, defaults to ``False``): if ``True`` then run the recurrence from the end of the sequence to the start.
     initial_state (scalar or tensor without batch dimension; or a tuple thereof):
      the initial value for the state. This can be a constant or a learnable parameter.
      In the latter case, if the step function has more than 1 state variable,
      this parameter must be a tuple providing one initial state for every state variable.
     return_full_state (bool, defaults to ``False``): if ``True`` and the step function has more than one
      state variable, then the layer returns all state variables (a tuple of sequences);
      whereas if not given or ``False``, only the first state variable is returned to the caller.
     dropout_rate_input (float): dropout probability applied to the input sequence
     dropout_rate_output (float): dropout probability applied to the output sequence
     seed (int): random seed for the dropout masks
     name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function that accepts one argument (which must be a sequence) and performs the recurrent operation on it
    '''

    # BUGBUG: the cum_sum expression in the docstring should be this:
    # cum_sum = Recurrence(C.plus, initial_state=np.array([0, 0.5]))
    # BUGBUG: whereas passing a NumPy array fails with "TypeError: cannot convert value of dictionary"
    # cum_sum = Recurrence(C.plus, initial_state=Constant([0, 0.5]))

    go_backwards  = get_default_override(Recurrence, go_backwards=go_backwards)
    initial_state = get_default_override(Recurrence, initial_state=initial_state)
    initial_state = _get_initial_state_or_default(initial_state)

    step_function = _santize_step_function(step_function)

    dropout_input = None
    if dropout_rate_input:
        dropout_input = VariationalDropout(dropout_rate=dropout_rate_input, seed=seed, name='variational_dropout_input')

    dropout_output = None
    if dropout_rate_output:
        dropout_output = VariationalDropout(dropout_rate=dropout_rate_output, seed=seed, name='variational_dropout_output')

    # get signature of step function
    #*prev_state_args, _ = step_function.signature  # Python 3
    prev_state_args = step_function.signature[0:-1]

    if len(step_function.outputs) != len(prev_state_args):
        raise TypeError('Recurrence: number of state variables inconsistent between create_placeholder() and recurrent block')

    # initial state can be a single value or one per state variable (if more than one, like for LSTM)
    if isinstance(initial_state, tuple) and len(initial_state) == 1:
        initial_state = initial_state[0]
    if not isinstance(initial_state, tuple):
        # TODO: if initial_state is a CNTK Function rather than an initializer, then require to pass it multiple times; otherwise broadcast to all
        initial_state = tuple(initial_state for out_var in prev_state_args)

    # express it w.r.t. RecurrenceFrom
    recurrence_from = RecurrenceFrom(step_function, go_backwards, return_full_state) # :: (x, state seq) -> (new state seq)

    # function that this layer represents
    @C.Function
    def recurrence(x):
        dropped_x = dropout_input(x) if dropout_input else x
        y = recurrence_from(*(initial_state + (dropped_x,)))
        dropped_y = dropout_output(y) if dropout_output else y
        return dropped_y

    return _inject_name(recurrence, name)
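
A hypothetical usage sketch for this variational-dropout Recurrence (it assumes the same module-level imports used above, e.g. VariationalDropout and RecurrenceFrom):

import cntk as C

x = C.sequence.input_variable(300)
rnn = Recurrence(C.layers.LSTM(512), dropout_rate_input=0.2, dropout_rate_output=0.2)
h = rnn(x)   # dropout is applied to the input before and to the output after the recurrence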