Python get_default_override 예제들, cntk.default_options.get_default_override Python 예제들

예제 #1

0

파일 보기

def RNNUnit(shape,
            cell_shape=None,
            activation=default_override_or(sigmoid),
            init=default_override_or(glorot_uniform()),
            init_bias=default_override_or(0),
            enable_self_stabilization=default_override_or(False),
            name=''):
    '''
    RNNUnit(shape, cell_shape=None, activation=sigmoid, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, name='')

    This is a deprecated name for :func:`~cntk.layers.blocks.RNNStep`. Use that name instead.

    Every call emits a ``DeprecationWarning`` and then builds the block through
    ``_RecurrentBlock('RNNStep', ...)`` with ``use_peepholes=False``.

    Args:
        shape: forwarded unchanged to ``_RecurrentBlock``
        cell_shape (defaults to `None`): forwarded unchanged to ``_RecurrentBlock``
        activation (defaults to ``sigmoid``): resolved through ``get_default_override`` scoped to ``RNNUnit``
        init (defaults to ``glorot_uniform()``): resolved through ``get_default_override`` scoped to ``RNNUnit``
        init_bias (defaults to 0): resolved through ``get_default_override`` scoped to ``RNNUnit``
        enable_self_stabilization (defaults to `False`): resolved through ``get_default_override`` scoped to ``RNNUnit``
        name (str, defaults to ''): forwarded unchanged to ``_RecurrentBlock``

    Returns:
        Whatever ``_RecurrentBlock('RNNStep', ...)`` returns for these arguments.
    '''

    activation = get_default_override(RNNUnit, activation=activation)
    init = get_default_override(RNNUnit, init=init)
    init_bias = get_default_override(RNNUnit, init_bias=init_bias)
    enable_self_stabilization = get_default_override(
        RNNUnit, enable_self_stabilization=enable_self_stabilization)

    # stacklevel=2 attributes the warning to the code that called RNNUnit()
    # rather than to this wrapper, so users can locate the deprecated usage.
    warnings.warn(
        'This name will be removed in future versions. Please use '
        'RNNStep(...) instead, which is identical except for its name',
        DeprecationWarning,
        stacklevel=2)

    return _RecurrentBlock('RNNStep',
                           shape,
                           cell_shape,
                           activation=activation,
                           use_peepholes=False,
                           init=init,
                           init_bias=init_bias,
                           enable_self_stabilization=enable_self_stabilization,
                           name=name)

예제 #2

0

파일 보기

def LSTM(shape,
         cell_shape=None,
         activation=default_override_or(tanh),
         use_peepholes=default_override_or(False),
         init=default_override_or(glorot_uniform()),
         init_bias=default_override_or(0),
         enable_self_stabilization=default_override_or(False),
         name=''):
    '''
    LSTM(shape, cell_shape=None, activation=tanh, use_peepholes=False, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, name='')

    Layer factory function that builds its result by delegating to
    ``_RecurrentBlock('LSTM', ...)``.

    Arguments declared with ``default_override_or`` are first resolved through
    ``get_default_override`` scoped to ``LSTM``; ``shape``, ``cell_shape`` and
    ``name`` are forwarded unchanged.

    Returns:
        Whatever ``_RecurrentBlock('LSTM', ...)`` returns for these arguments.
    '''

    # Resolve the overridable options in declaration order.
    resolved = dict(
        activation=get_default_override(LSTM, activation=activation),
        use_peepholes=get_default_override(LSTM, use_peepholes=use_peepholes),
        init=get_default_override(LSTM, init=init),
        init_bias=get_default_override(LSTM, init_bias=init_bias),
        enable_self_stabilization=get_default_override(
            LSTM, enable_self_stabilization=enable_self_stabilization),
    )

    return _RecurrentBlock('LSTM', shape, cell_shape, name=name, **resolved)

예제 #3

0

파일 보기

파일: blocks.py 프로젝트: newoneincntk/cntkx

def WeightDroppedLSTM(shape,
                      dropout_rate,
                      cell_shape=None,
                      activation=default_override_or(tanh),
                      use_peepholes=default_override_or(False),
                      init=default_override_or(glorot_uniform()),
                      init_bias=default_override_or(0),
                      enable_self_stabilization=default_override_or(False),
                      seed=SentinelValueForAutoSelectRandomSeed,
                      name=''):
    '''
    WeightDroppedLSTM(shape, dropout_rate, cell_shape=None, activation=tanh, use_peepholes=False, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, seed=SentinelValueForAutoSelectRandomSeed, name='')

    Layer factory function to create an LSTM block for use inside a recurrence.
    The LSTM block implements one step of the recurrence and is stateless. It accepts the previous state as its first two arguments,
    and outputs its new state as a two-valued tuple ``(h,c)``.

    Example:
     >>> # a typical recurrent LSTM layer
     >>> from cntkx.layers import *
     >>> lstm_layer = Recurrence(WeightDroppedLSTM(500, dropout_rate=0.5))

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        dropout_rate: required; forwarded as ``dropout_rate`` to ``_RecurrentBlock``
         (presumably the rate at which weights are dropped -- confirm against ``_RecurrentBlock``)
        cell_shape (tuple, defaults to `None`): if given, then the output state is first computed at `cell_shape`
         and linearly projected to `shape`
        activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
        use_peepholes (bool, defaults to `False`): forwarded as ``use_peepholes`` to ``_RecurrentBlock``
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        seed (defaults to ``SentinelValueForAutoSelectRandomSeed``): forwarded as ``seed`` to ``_RecurrentBlock``
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, prev_c, input) -> (h, c)`` that implements one step of a recurrent LSTM layer.
    '''

    # The overridable options are resolved inline, in the same left-to-right
    # order as the parameter list declares them.
    return _RecurrentBlock(
        'WeightDroppedLSTM',
        shape,
        cell_shape,
        activation=get_default_override(WeightDroppedLSTM,
                                        activation=activation),
        use_peepholes=get_default_override(WeightDroppedLSTM,
                                           use_peepholes=use_peepholes),
        init=get_default_override(WeightDroppedLSTM, init=init),
        init_bias=get_default_override(WeightDroppedLSTM,
                                       init_bias=init_bias),
        enable_self_stabilization=get_default_override(
            WeightDroppedLSTM,
            enable_self_stabilization=enable_self_stabilization),
        dropout_rate=dropout_rate,
        seed=seed,
        name=name)

예제 #4

0

파일 보기

파일: blocks.py 프로젝트: newoneincntk/cntkx

def IndyLSTM(shape,
             activation=default_override_or(tanh),
             init=default_override_or(glorot_uniform()),
             init_bias=default_override_or(0),
             enable_self_stabilization=default_override_or(False),
             name=''):
    """
    Implementation of Independently Recurrent Long Short-term Memory cells: IndyLSTMs by Gonnet and Deselaers.
    Paper can be found at https://arxiv.org/abs/1903.08023

    IndyLSTMs differ from regular LSTM cells in that the recurrent weights are not modeled as a full matrix,
    but as a diagonal matrix, i.e. the output and state of each LSTM cell depend on the inputs and its
    own output/state, as opposed to the input and the outputs/states of all the cells in the layer.
    The number of parameters per IndyLSTM layer, and thus the number of FLOPS per evaluation, is linear in the
    number of nodes in the layer, as opposed to quadratic for regular LSTM layers, resulting in potentially both
    smaller and faster models.

    Example:
     >>> # an IndyLSTM recurrent layer
     >>> from cntkx.layers import *
     >>> indy_lstm_layer = Recurrence(IndyLSTM(500))

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        name (str, defaults to ''): the name of the Function instance in the network

    Note:
        There is no ``cell_shape`` parameter; ``None`` is always passed in its place, together with
        ``use_peepholes=False``, ``dropout_rate=0`` and ``seed=SentinelValueForAutoSelectRandomSeed``.

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, prev_c, input) -> (h, c)`` that implements one step of a recurrent IndyLSTM layer.
    """

    # Options that callers may override through default_options scopes.
    overridable = dict(
        activation=get_default_override(IndyLSTM, activation=activation),
        init=get_default_override(IndyLSTM, init=init),
        init_bias=get_default_override(IndyLSTM, init_bias=init_bias),
        enable_self_stabilization=get_default_override(
            IndyLSTM, enable_self_stabilization=enable_self_stabilization),
    )

    # Settings that are fixed for this cell type.
    fixed = dict(use_peepholes=False,
                 dropout_rate=0,
                 seed=SentinelValueForAutoSelectRandomSeed)

    return _RecurrentBlock('IndyLSTM', shape, None, name=name,
                           **overridable, **fixed)

예제 #5

0

파일 보기

파일: blocks.py 프로젝트: haixpham/cntkx

def IndRNN(shape, activation=default_override_or(relu),
            init=default_override_or(glorot_uniform()), init_bias=default_override_or(0),
            enable_self_stabilization=default_override_or(False), name=''):
    """
    IndRNN implementation found in "Independently Recurrent Neural Network (IndRNN): Building A Longer and Deeper RNN"
    by Li, et al (https://arxiv.org/abs/1803.04831).

    IndRNNs are RNNs where neurons in each layer are independent from each other, and the cross-channel information is
    obtained through stacking multiple layers.

    It has been shown that an IndRNN can be easily regulated to prevent the gradient exploding and vanishing problems
    while allowing the network to learn long-term dependencies. Moreover, an IndRNN can work with non-saturated
    activation functions such as relu (rectified linear unit) and still be trained robustly.
    Multiple IndRNNs can be stacked to construct a network that is deeper than the existing RNNs.
    Experimental results have shown that the proposed IndRNN is able to process very long
    sequences (over 5000 time steps), can be used to construct very deep networks (21 layers used in the experiment)
    and still be trained robustly. Better performances have been achieved on various tasks by using IndRNNs compared
    with the traditional RNN and LSTM.

    IndRNN also enables the use of the relu activation, which is more efficient to compute than sigmoid and leads to
    faster convergence during training. You may consider initialising the recurrent weights using a uniform
    distribution from 0 to 1.

    The original code is available at: https://github.com/Sunnydreamrain/IndRNN_Theano_Lasagne.

    Example:
     >>> # a plain relu RNN layer
     >>> from cntkx.layers import *
     >>> relu_rnn_layer = Recurrence(IndRNN(500))

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        activation (:class:`~cntk.ops.functions.Function`, defaults to `relu`): function to apply at the end
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, input) -> h`` where ``h = activation(input @ W + prev_h * R + b)``
    """

    # Overridable options are resolved inline, left to right, in the same
    # order as they appear in the signature. The cell-shape slot is always
    # None, and peepholes / dropout are disabled for this cell type.
    return _RecurrentBlock(
        'IndRNN',
        shape,
        None,
        activation=get_default_override(IndRNN, activation=activation),
        use_peepholes=False,
        init=get_default_override(IndRNN, init=init),
        init_bias=get_default_override(IndRNN, init_bias=init_bias),
        dropout_rate=0,
        seed=SentinelValueForAutoSelectRandomSeed,
        enable_self_stabilization=get_default_override(
            IndRNN, enable_self_stabilization=enable_self_stabilization),
        name=name)

예제 #6

0

파일 보기

def GRU(shape,
        cell_shape=None,
        activation=default_override_or(tanh),
        init=default_override_or(glorot_uniform()),
        init_bias=default_override_or(0),
        enable_self_stabilization=default_override_or(False),
        name=''):
    '''
    GRU(shape, cell_shape=None, activation=tanh, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, name='')

    Layer factory function to create a GRU block for use inside a recurrence.
    The GRU block implements one step of the recurrence and is stateless. It accepts the previous state as its first argument,
    and outputs its new state.

    Example:
     >>> # a gated recurrent layer
     >>> from cntk.layers import *
     >>> gru_layer = Recurrence(GRU(500))

    Args:
        shape (`int` or `tuple` of `ints`): vector or tensor dimension of the output of this layer
        cell_shape (tuple, defaults to `None`): if given, then the output state is first computed at `cell_shape`
         and linearly projected to `shape`
        activation (:class:`~cntk.ops.functions.Function`, defaults to :func:`~cntk.ops.tanh`): function to apply at the end, e.g. `relu`
        init (scalar or NumPy array or :mod:`cntk.initializer`, defaults to `glorot_uniform`): initial value of weights `W`
        init_bias (scalar or NumPy array or :mod:`cntk.initializer`, defaults to 0): initial value of weights `b`
        enable_self_stabilization (bool, defaults to `False`): if `True` then add a :func:`~cntk.layers.blocks.Stabilizer`
         to all state-related projections (but not the data input)
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function ``(prev_h, input) -> h`` that implements one step of a recurrent GRU layer.
    '''

    # Resolve every overridable option against the GRU scope first.
    options = dict(
        activation=get_default_override(GRU, activation=activation),
        init=get_default_override(GRU, init=init),
        init_bias=get_default_override(GRU, init_bias=init_bias),
        enable_self_stabilization=get_default_override(
            GRU, enable_self_stabilization=enable_self_stabilization),
    )

    # Peepholes are always switched off for this block.
    return _RecurrentBlock('GRU', shape, cell_shape,
                           use_peepholes=False, name=name, **options)

예제 #7

0

파일 보기

파일: tensor.py 프로젝트: gaoxuesong/CNTK

    def __getitem__(self, arg):
        '''
        Slicing of a Variable. E.g. var[2:3] will translate into slice(var, axis=0, begin_index=2, end_index=3)

        ``arg`` may be a single index or a tuple of indices. Each index is handled
        in turn, left to right:

        * ``Ellipsis`` makes the following positions count from the end.
        * an ``int`` ``i`` is treated as the slice ``i:i+1``.
        * a ``slice`` becomes an ``ops.slice`` call, skipped when both bounds are
          0 or missing; a step other than 1 raises ``ValueError``.
        * a ``tuple``/``list`` of ints selects those elements along the axis and
          joins them with ``ops.splice``.
        * any other type raises ``IndexError``.
        '''
        from . import ops
        from cntk.default_options import get_global_option, get_default_override, default_override_or

        # int or slice: normalize into a tuple of int or tuple of slice
        indices = arg if isinstance(arg, tuple) else (arg,)
        result = self
        axis_shift = 0

        # With the global 'align_axis' option set to 1 and at least one dynamic
        # axis present, axes are shifted by the (overridable) 'axis_offset',
        # which defaults to the number of dynamic axes.
        if get_global_option('align_axis', 0) == 1:
            if self.dynamic_axes is not None and len(self.dynamic_axes) > 0:
                axis_shift = -get_default_override(
                    None, axis_offset=default_override_or(len(self.dynamic_axes)))

        for position, index in enumerate(indices):
            if index is Ellipsis:
                # ellipsis means index relative to end after this point
                axis_shift = -len(indices)
                continue

            if isinstance(index, int):
                # int: normalize into a slice
                index = slice(index, index + 1)

            if isinstance(index, slice):
                step = index.step
                if not (step is None or step == 1):
                    # TODO: This is not hard to implement in SliceNode.
                    raise ValueError("slicing with a step other than 1 is "
                                     "currently not supported")
                # implement as a CNTK slice() operation
                first = index.start or 0
                last = index.stop or 0
                if not (first == 0 and last == 0):
                    result = ops.slice(result, axis=position + axis_shift,
                                       begin_index=first, end_index=last)
            elif isinstance(index, (tuple, list)):
                # Select multiple elements from the same dimension. This is
                # different from NumPy's advanced indexing, since we just go
                # axis by axis from left to right and don't do any
                # broadcasting.
                # NOTE(review): unlike the slice branch, this branch does not
                # add the axis shift to the axis -- confirm this is intended.
                pieces = []
                for element in index:
                    if not isinstance(element, int):
                        raise IndexError(
                            'indices have to be of type int and not "%s"' %
                            type(element))
                    pieces.append(ops.slice(result, axis=position,
                                            begin_index=element,
                                            end_index=element + 1))
                result = (ops.splice(*pieces, axis=position)
                          if len(pieces) > 1 else pieces[0])
            else:
                raise IndexError(
                    'type "%s" is not supported as index' % type(index))

        return result

예제 #8

0

파일 보기

    def __getitem__(self, arg):
        '''
        Slicing of a Variable. E.g. var[2:3] will translate into slice(var, axis=0, begin_index=2, end_index=3)

        A single index is treated as a one-element tuple. The indices are then
        applied one per axis, left to right: ints become one-element slices,
        slices map to ``ops.slice`` (only a step of 1 is supported), a
        tuple/list of ints picks individual elements that are re-joined with
        ``ops.splice``, and ``Ellipsis`` makes later positions count from the end.

        Raises:
            ValueError: for a slice with a step other than 1.
            IndexError: for an unsupported index type, or a non-int inside a
             tuple/list index.
        '''
        from . import ops
        from cntk.default_options import get_global_option, get_default_override, default_override_or

        # int or slice: normalize into a tuple of int or tuple of slice
        if isinstance(arg, tuple):
            selectors = arg
        else:
            selectors = (arg,)

        current = self
        base_axis = 0

        # The 'align_axis' global option (value 1) moves the base axis by the
        # overridable 'axis_offset', defaulting to the dynamic-axis count.
        align_axes = get_global_option('align_axis', 0) == 1
        if align_axes and self.dynamic_axes is not None and len(self.dynamic_axes) > 0:
            base_axis = -get_default_override(
                None, axis_offset=default_override_or(len(self.dynamic_axes)))

        for i, sel in enumerate(selectors):
            if sel is Ellipsis:
                # ellipsis means index relative to end after this point
                base_axis = -len(selectors)
                continue

            # int: normalize into a slice
            if isinstance(sel, int):
                sel = slice(sel, sel + 1)

            if isinstance(sel, slice):
                if sel.step is not None and sel.step != 1:
                    # TODO: This is not hard to implement in SliceNode.
                    raise ValueError("slicing with a step other than 1 is "
                                     "currently not supported")
                # implement as a CNTK slice() operation; a missing bound counts as 0
                lo = sel.start or 0
                hi = sel.stop or 0
                if lo != 0 or hi != 0:
                    current = ops.slice(current, axis=i + base_axis,
                                        begin_index=lo, end_index=hi)
                continue

            if not isinstance(sel, (tuple, list)):
                raise IndexError(
                    'type "%s" is not supported as index' % type(sel))

            # Select multiple elements from the same dimension. This is
            # different from NumPy's advanced indexing, since we just go
            # axis by axis from left to right and don't do any
            # broadcasting.
            # NOTE(review): the base axis is not applied here, unlike in the
            # slice case above -- confirm this is intended.
            selected = []
            for member in sel:
                if not isinstance(member, int):
                    raise IndexError(
                        'indices have to be of type int and not "%s"' %
                        type(member))
                selected.append(ops.slice(current, axis=i,
                                          begin_index=member,
                                          end_index=member + 1))
            if len(selected) > 1:
                current = ops.splice(*selected, axis=i)
            else:
                current = selected[0]

        return current

예제 #9

0

파일 보기

파일: sanitize.py 프로젝트: AllanYiin/CNTK

def sanitize_random_args(shape, dtype):
    '''
    Normalize the ``shape`` and ``dtype`` arguments of a random operation.

    ``shape`` is passed through ``sanitize_shape``. ``dtype`` is resolved via
    ``get_default_override`` (no function scope); if that yields ``None`` it
    falls back to ``np.float32``, and the result is passed through
    ``sanitize_dtype_cntk``.

    Returns:
        tuple: ``(sanitized_shape, sanitized_dtype)``
    '''
    from cntk.default_options import get_default_override

    clean_shape = sanitize_shape(shape)

    resolved_dtype = get_default_override(None, dtype=dtype)
    if resolved_dtype is None:
        # No explicit or overridden dtype: use single precision.
        resolved_dtype = np.float32

    return clean_shape, sanitize_dtype_cntk(resolved_dtype)

예제 #10

0

파일 보기

파일: sanitize.py 프로젝트: Shzaidi/CNTK

def sanitize_random_args(shape, dtype):
    '''
    Sanitize the ``shape`` and ``dtype`` arguments of a random operation.

    The shape goes through ``sanitize_shape``. The dtype is looked up with
    ``get_default_override`` (scope ``None``), defaults to ``np.float32`` when
    the lookup returns ``None``, and is then converted by ``sanitize_dtype_cntk``.

    Returns:
        tuple: the sanitized shape followed by the sanitized dtype
    '''
    from cntk.default_options import get_default_override

    sanitized_shape = sanitize_shape(shape)

    effective = get_default_override(None, dtype=dtype)
    # Fall back to single precision when nothing was specified.
    effective = np.float32 if effective is None else effective
    sanitized_dtype = sanitize_dtype_cntk(effective)

    return sanitized_shape, sanitized_dtype

예제 #11

0

파일 보기

파일: IndRNN.py 프로젝트: egg-west/IndRNN_cntk

def IndRNNStep(shape,
               cell_shape=None,
               activation=default_override_or(relu),
               init=default_override_or(glorot_uniform()),
               init_bias=default_override_or(0),
               enable_self_stabilization=default_override_or(False),
               name=''):
    '''
    IndRNNStep(shape, cell_shape=None, activation=relu, init=glorot_uniform(), init_bias=0, enable_self_stabilization=False, name='')

    Layer factory function that builds its result by delegating to
    ``IndRNNBlock('RNNStep', ...)`` with ``use_peepholes=False``.

    Arguments declared with ``default_override_or`` are resolved through
    ``get_default_override``; ``shape``, ``cell_shape`` and ``name`` are
    forwarded unchanged.

    Returns:
        Whatever ``IndRNNBlock('RNNStep', ...)`` returns for these arguments.
    '''

    # NOTE(review): the overrides are scoped to RNNStep and the block is named
    # 'RNNStep', not IndRNNStep -- looks like a copy-paste leftover; confirm
    # whether IndRNNStep was intended before changing it.
    scope = RNNStep
    settings = dict(
        activation=get_default_override(scope, activation=activation),
        init=get_default_override(scope, init=init),
        init_bias=get_default_override(scope, init_bias=init_bias),
        enable_self_stabilization=get_default_override(
            scope, enable_self_stabilization=enable_self_stabilization),
    )

    return IndRNNBlock('RNNStep', shape, cell_shape,
                       use_peepholes=False, name=name, **settings)

예제 #12

0

파일 보기

def Stabilizer(steepness=4,
               enable_self_stabilization=default_override_or(True),
               name=''):
    '''
    Stabilizer(steepness=4, enable_self_stabilization=True, name='')

    Layer factory function to create a `Droppo self-stabilizer <https://www.microsoft.com/en-us/research/wp-content/uploads/2016/11/SelfLR.pdf>`_.
    It multiplies its input with a scalar that is learned.

    This takes `enable_self_stabilization` as a flag that allows it to disable itself. Useful if this is a global default.

    Note:
        Some other layers (specifically, recurrent units like :func:`~cntk.layers.blocks.LSTM`) also have the option to
        use the ``Stabilizer()`` layer internally. That is enabled by passing `enable_self_stabilization=True`
        to those layers. In conjunction with those, the rule is that an explicit ``Stabilizer()`` must be
        inserted by the user for the main data input, whereas the recurrent layer will own the stabilizer(s)
        for the internal recurrent connection(s).

    Note:
        Unlike the original paper, which proposed a linear or exponential scalar,
        CNTK uses a sharpened Softplus: 1/steepness ln(1+e^{steepness*beta}).
        The softplus behaves linearly for weights around and above 1 (like the linear scalar) while guaranteeing
        positiveness (like the exponential variant) but is also more robust by avoiding exploding gradients.

    Example:
     >>> # recurrent model with self-stabilization
     >>> from cntk.layers import *
     >>> with default_options(enable_self_stabilization=True): # enable stabilizers by default for LSTM()
     ...     model = Sequential([
     ...         Embedding(300),
     ...         Stabilizer(),           # stabilizer for main data input of recurrence
     ...         Recurrence(LSTM(512)),  # LSTM owns its own stabilizers for the recurrent connections
     ...         Stabilizer(),
     ...         Dense(10)
     ...     ])

    Args:
        steepness (`int`, defaults to 4): steepness of the softplus applied to the learned scalar
        enable_self_stabilization (bool, defaults to `True`): a flag that allows it to disable itself; when it
         resolves to a false value, ``identity`` is returned instead. Useful if this is a global default
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function that multiplies its input by the learned scalar, or ``identity`` when disabled
    '''

    enabled = get_default_override(
        Stabilizer, enable_self_stabilization=enable_self_stabilization)

    # disabled (typically through global option; otherwise one would not call this in the first place)
    if not enabled:
        return identity

    # parameters bound to this Function
    # initialize so that factor is initially 1 (has no effect)
    alpha_init = np.log(np.exp(steepness) - 1) / steepness
    alpha = Parameter((), init=alpha_init, name='alpha')
    scale = softplus(alpha, steepness=steepness)

    # expression
    @BlockFunction('Stabilizer', name)
    def stabilize(x):
        return scale * x

    return stabilize

예제 #13

0

파일 보기

파일: tensor.py 프로젝트: vnvizitiu/CNTK

    def __getitem__(self, arg):
        '''
        Slicing of a Variable. E.g. var[2:3] will translate into slice(var, axis=0, begin_index=2, end_index=3)

        If this object has an ``outputs`` attribute with more than one entry,
        indexing falls back to ``self.outputs[arg]`` and any failure there is
        reported as a ``KeyError``.

        Otherwise ``arg`` (a single index or a tuple of indices) is applied axis
        by axis, left to right: ints become one-element slices, slices map to
        ``ops.slice`` with the slice step passed as ``strides``, a tuple/list
        of ints selects individual elements that are joined with ``ops.splice``,
        and ``Ellipsis`` makes later positions count from the end.

        Raises:
            KeyError: when the multi-output fallback fails.
            IndexError: for an unsupported index type, or a non-int inside a
             tuple/list index.
        '''
        from . import ops
        from cntk.default_options import get_global_option, get_default_override, default_override_or

        # Multi-output fallback: treat the index as a selector into the outputs.
        if hasattr(self, 'outputs') and len(self.outputs) > 1:
            try:
                return self.outputs[arg]
            except Exception:
                template = ('Slice for multioutput functions is not supported, '
                            'the fallback to select to output requires '
                            'that only one index is provided. arg: {}, self: {}')
                raise KeyError(template.format(arg, self))

        # int or slice: normalize into a tuple of int or tuple of slice
        indices = arg if isinstance(arg, tuple) else (arg,)
        result = self
        axis_shift = 0

        # With the global 'align_axis' option set to 1 and at least one dynamic
        # axis present, axes are shifted by the (overridable) 'axis_offset',
        # which defaults to the number of dynamic axes.
        if get_global_option('align_axis', 0) == 1:
            if self.dynamic_axes is not None and len(self.dynamic_axes) > 0:
                axis_shift = -get_default_override(
                    None, axis_offset=default_override_or(len(self.dynamic_axes)))

        for position, index in enumerate(indices):
            if index is Ellipsis:
                # ellipsis means index relative to end after this point
                axis_shift = -len(indices)
                continue

            if isinstance(index, int):
                # int: normalize into a slice
                index = slice(index, index + 1)

            if isinstance(index, slice):
                # A missing bound counts as 0; nothing is sliced when both are 0.
                first = index.start or 0
                last = index.stop or 0
                if not (first == 0 and last == 0):
                    result = ops.slice(result, axis=position + axis_shift,
                                       begin_index=first, end_index=last,
                                       strides=index.step)
            elif isinstance(index, (tuple, list)):
                # Select multiple elements from the same dimension. This is
                # different from NumPy's advanced indexing, since we just go
                # axis by axis from left to right and don't do any
                # broadcasting.
                # NOTE(review): unlike the slice branch, this branch does not
                # add the axis shift to the axis -- confirm this is intended.
                pieces = []
                for element in index:
                    if not isinstance(element, int):
                        raise IndexError(
                            'indices have to be of type int and not "%s"' %
                            type(element))
                    pieces.append(ops.slice(result, axis=position,
                                            begin_index=element,
                                            end_index=element + 1))
                result = (ops.splice(*pieces, axis=position)
                          if len(pieces) > 1 else pieces[0])
            else:
                raise IndexError(
                    'type "%s" is not supported as index' % type(index))

        return result

예제 #14

0

파일 보기

파일: factorization.py 프로젝트: bigdatasciencegroup/Microsoft-Cognitive-Recognition-Toolkit-CNTK

def dense_factored(
        shapes,  #(shape1, shape2)
        activation=default_override_or(identity),
        init={
            'W1': None,
            'W2': None
        },
        input_rank=None,
        map_rank=None,
        bias=default_override_or(True),
        init_bias=default_override_or(0),
        name=''):
    '''
    Create the new model from the factored inputs W1 and W2.
    The returned function represents the new model: it multiplies its input by
    ``W1`` and then ``W2``, optionally adds a bias and applies the activation.

    Args:
        shapes                  : pair ``(shape1, shape2)`` of ints; output dimensions of ``W1`` and ``W2``.
        activation              : activation function used for the model (override scoped to ``cntk.layers.Dense``).
        init                    : dict with keys ``'W1'`` and ``'W2'`` holding the two matrices of the factorization.
                                  The default dict is only read, never modified.
        input_rank              : must be ``None`` (checked by an assertion).
        map_rank                : must be ``None`` (checked by an assertion).
        bias                    : whether to add a bias (override scoped to ``cntk.layers.Dense``).
        init_bias               : initial bias value (override scoped to ``cntk.layers.Dense``).
        name                    : name of the block function that creates the new model.

    Returns:
        a model that is factored and projected (reduced).
    '''

    # matthaip: Not sure how to handle input tensor of rank > 1
    # or selective flattening of ranks
    ranks_unset = input_rank is None and map_rank is None
    assert (ranks_unset and all(isinstance(dim, int) for dim in shapes))

    activation = get_default_override(cntk.layers.Dense, activation=activation)
    bias = get_default_override(cntk.layers.Dense, bias=bias)
    init_bias = get_default_override(cntk.layers.Dense, init_bias=init_bias)
    # how to use get_default_override for init parameter?

    first_out = _as_tuple(shapes[0])
    second_out = _as_tuple(shapes[1])

    # Only reachable when assertions are disabled (python -O); otherwise the
    # assert above has already rejected any non-None rank.
    both_ranks_given = not (input_rank is None or map_rank is None)
    if both_ranks_given:
        raise ValueError(
            "Dense: input_rank and map_rank cannot be specified at the same time."
        )

    # If input_rank not given then pass a single _INFERRED;
    # map_rank if given will determine the input_rank.
    # The dimension inference may still create multiple axes.
    inferred_in = _INFERRED

    # parameters bound to this Function
    #    init_weights = _initializer_for(init, Record(output_rank=output_rank))
    W1 = Parameter(inferred_in + first_out, init=init['W1'], name='W1')
    W2 = Parameter(first_out + second_out, init=init['W2'], name='W2')
    if bias:
        b = Parameter(second_out, init=init_bias, name='b')
    else:
        b = None

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        projected = times(times(x, W1), W2)
        if b:
            projected = projected + b
        if activation is not None:
            projected = activation(projected)
        return projected

    return dense

예제 #15

0

파일 보기

파일: sequence.py 프로젝트: haixpham/cntkx

def Recurrence(step_function, go_backwards=default_override_or(False), initial_state=default_override_or(0),
               return_full_state=False, dropout_rate_input=None,
               dropout_rate_output=None, seed=SentinelValueForAutoSelectRandomSeed, name=''):
    '''
    Recurrence(step_function, go_backwards=False, initial_state=0, return_full_state=False, dropout_rate_input=None, dropout_rate_output=None, seed=SentinelValueForAutoSelectRandomSeed, name='')

    Layer factory function for a recurrent model (plain RNN, LSTM, GRU, or any custom step),
    with optional variational dropout applied to the layer input and/or the layer output.

    The returned layer applies ``step_function`` once per item of the input sequence. At every
    step the step function receives the state produced by the previous step together with the
    current data item, and it returns the new state.
    In pseudo-code, ``y = Recurrence(step_function)(x)`` behaves like this::

      # x: input sequence of tensors along the dynamic axis
      # y: resulting sequence of outputs along the same dynamic axis
      y = []              # collected outputs
      s = initial_state   # state handed to the first step
      for x_n in x:       # walk the sequence along its dynamic axis
          s = step_function(s, x_n)  # previous state + new data -> new state
          y.append(s)

    Typical step functions are :func:`~cntk.layers.blocks.LSTM`, :func:`~cntk.layers.blocks.GRU`, and
    :func:`~cntk.layers.blocks.RNNStep`, but any :class:`~cntk.ops.functions.Function` or Python function
    with a suitable signature can be used.
    With a single state variable the signature must be ``(h_prev, x) -> h``: ``h_prev`` is the previous
    state, ``x`` is the new data input, and the result is the new state.
    The output sequence has the same length as the input sequence.

    A step function may carry several state variables, e.g. :func:`~cntk.layers.blocks.LSTM`.
    Its first N arguments are then the previous state values, followed by one argument for the data
    input, and it must return a tuple of N values.
    By default only the first state variable is returned (for the LSTM, ``h``); the remaining ones
    (like the LSTM's ``c``) stay internal. Pass ``return_full_state=True`` to get all of them.

    Custom step functions only need to follow the signature above.
    For instance, ``Recurrence(plus)`` computes a cumulative sum over a sequence: every step evaluates
    `plus(s,x_n)`, where `s` is the result of the previous step and therefore the sum of all elements
    before `x_n`.
    A GRU layer with projection can be written as ``Recurrence(GRU(500) >> Dense(200))``; the projection
    is then applied to the hidden state that is fed back into the next step.
    ``F>>G`` is a short-hand for ``Sequential([F, G])``.

    The recurrence can optionally run backwards, which is useful for bidirectional models.

    ``initial_state`` must be a constant. To feed the initial state as a data input, e.g. in a
    sequence-to-sequence model, use :func:`~cntk.layers.sequence.RecurrenceFrom()` instead.

    Note: ``Recurrence()`` corresponds to what functional programming often calls ``scanl()``.

    Example:
     >>> from cntk.layers import Sequential
     >>> from cntk.layers.typing import Tensor, Sequence

     >>> # a recurrent LSTM layer
     >>> lstm_layer = Recurrence(LSTM(500))

     >>> # a bidirectional LSTM layer
     >>> # using function tuples to implement a bidirectional LSTM
     >>> bi_lstm_layer = Sequential([(Recurrence(LSTM(250)),                      # first tuple entry: forward pass
     ...                              Recurrence(LSTM(250), go_backwards=True)),  # second: backward pass
     ...                             splice])                                     # splice both on top of each other
     >>> bi_lstm_layer.update_signature(Sequence[Tensor[13]])
     >>> bi_lstm_layer.shape   # shape reflects concatenation of both output states
     (500,)
     >>> tuple(str(axis.name) for axis in bi_lstm_layer.dynamic_axes)  # (note: str() needed only for Python 2.7)
     ('defaultBatchAxis', 'defaultDynamicAxis')

     >>> # custom step function example: using Recurrence() to
     >>> # compute the cumulative sum over an input sequence
     >>> x = C.input_variable(**Sequence[Tensor[2]])
     >>> x0 = np.array([[   3,    2],
     ...                [  13,   42],
     ...                [-100, +100]])
     >>> cum_sum = Recurrence(C.plus, initial_state=Constant([0, 0.5]))
     >>> y = cum_sum(x)
     >>> y(x0)
     [array([[   3. ,    2.5],
             [  16. ,   44.5],
             [ -84. ,  144.5]], dtype=float32)]

    Args:
     step_function (:class:`~cntk.ops.functions.Function` or equivalent Python function):
      Must take N+1 inputs and produce N outputs, where N is the number of state variables
      (typically 1 for GRU and plain RNNs, and 2 for LSTMs).
     go_backwards (bool, defaults to ``False``): if ``True``, the recurrence runs from the end of the
      sequence towards its start.
     initial_state (scalar or tensor without batch dimension; or a tuple thereof):
      the initial value for the state. A single (non-tuple) value, or a tuple of length one, is
      reused for every state variable of the step function; otherwise the tuple is expected to
      hold one initial state per state variable.
     return_full_state (bool, defaults to ``False``): if ``True`` and the step function has more than one
      state variable, the layer returns all state variables (a tuple of sequences);
      otherwise only the first state variable is returned to the caller.
     dropout_rate_input (float, optional): variational dropout rate applied to the layer input;
      ``None`` or ``0`` disables it.
     dropout_rate_output (float, optional): variational dropout rate applied to the layer output;
      ``None`` or ``0`` disables it.
     seed (int): random seed handed to both variational dropout layers
     name (str, optional): the name of the Function instance in the network

    Returns:
        :class:`~cntk.ops.functions.Function`:
        A function that takes one argument (which must be a sequence) and applies the recurrent operation to it

    Raises:
        TypeError: if the number of outputs of the step function differs from its number of state inputs
    '''

    # BUGBUG: the cum_sum expression in the docstring should really read
    #   cum_sum = Recurrence(C.plus, initial_state=np.array([0, 0.5]))
    # BUGBUG: but a NumPy array fails with "TypeError: cannot convert value of dictionary", hence
    #   cum_sum = Recurrence(C.plus, initial_state=Constant([0, 0.5]))

    # resolve scoped defaults first, then normalise the initial state
    go_backwards = get_default_override(Recurrence, go_backwards=go_backwards)
    initial_state = _get_initial_state_or_default(
        get_default_override(Recurrence, initial_state=initial_state))

    step_function = _santize_step_function(step_function)

    def _make_dropout(rate, layer_name):
        # a falsy rate (None or 0) means "no dropout layer at all"
        if not rate:
            return None
        return VariationalDropout(dropout_rate=rate, seed=seed, name=layer_name)

    dropout_input = _make_dropout(dropout_rate_input, 'variational_dropout_input')
    dropout_output = _make_dropout(dropout_rate_output, 'variational_dropout_output')

    # every argument of the step function except the last one (the data input) is a state variable
    # (Python 3 spelling would be: *prev_state_args, _ = step_function.signature)
    prev_state_args = step_function.signature[0:-1]
    num_state_vars = len(prev_state_args)

    if len(step_function.outputs) != num_state_vars:
        raise TypeError('Recurrence: number of state variables inconsistent between create_placeholder() and recurrent block')

    # the initial state is either one value per state variable or a single value shared by all of them
    if isinstance(initial_state, tuple) and len(initial_state) == 1:
        initial_state = initial_state[0]
    if not isinstance(initial_state, tuple):
        # TODO: if initial_state is a CNTK Function rather than an initializer, then require to pass it multiple times; otherwise broadcast to all
        initial_state = (initial_state,) * num_state_vars

    # the layer is expressed through RecurrenceFrom :: (x, state seq) -> (new state seq)
    recurrence_from = RecurrenceFrom(step_function, go_backwards, return_full_state)

    # function that this layer represents
    @C.Function
    def recurrence(x):
        step_input = dropout_input(x) if dropout_input else x
        call_args = initial_state + (step_input,)
        states = recurrence_from(*call_args)
        return dropout_output(states) if dropout_output else states

    return _inject_name(recurrence, name)

예제 #16

0

파일 보기

def AttentionModel(attention_dim,
                   attention_span=None,
                   attention_axis=None,
                   init=default_override_or(glorot_uniform()),
                   go_backwards=default_override_or(False),
                   enable_self_stabilization=default_override_or(True),
                   name=''):
    '''
    AttentionModel(attention_dim, attention_span=None, attention_axis=None, init=glorot_uniform(), go_backwards=False, enable_self_stabilization=True, name='')

    Layer factory function to create a function object that implements an attention model
    as described in Bahdanau, et al., "Neural machine translation by jointly learning to align and translate."

    The returned function takes the encoder hidden state and the decoder hidden state and
    returns the attention-weighted sum over a fixed-size window of projected encoder states.

    Args:
     attention_dim: output dimension of the encoder and decoder projections
     attention_span: size of the encoder window; required, must be positive
     attention_axis: static axis along which the window is laid out; required
     init: initializer for the projection weights
     go_backwards: forwarded to ``PastValueWindow``
     enable_self_stabilization: forwarded to every ``Stabilizer`` created here
     name: name injected into the returned function

    Raises:
     NotImplementedError: if ``attention_span`` or ``attention_axis`` is ``None``
     ValueError: if ``attention_span`` is not positive
    '''

    init = get_default_override(AttentionModel, init=init)
    go_backwards = get_default_override(AttentionModel, go_backwards=go_backwards)
    enable_self_stabilization = get_default_override(AttentionModel, enable_self_stabilization=enable_self_stabilization)

    # until CNTK can handle multiple nested dynamic loops, a fixed window is required to fake it
    if attention_span is None or attention_axis is None:
        raise NotImplementedError('AttentionModel currently requires a fixed attention_span and a static attention_axis to be specified')
    if attention_span <= 0:
        raise ValueError('attention_span must be a positive value')

    def _stabilized_projection(output_dim):
        # Stabilizer followed by a Dense projection that keeps the span (and beam-search) axes intact
        return Stabilizer(enable_self_stabilization=enable_self_stabilization) >> Dense(output_dim, init=init, input_rank=1)

    # model parameters
    with default_options(bias=False):  # none of the projections has a bias
        attn_proj_enc = _stabilized_projection(attention_dim)   # projects input hidden state
        attn_proj_dec = _stabilized_projection(attention_dim)   # projects decoder hidden state
        attn_proj_tanh = _stabilized_projection(1)              # projects tanh output
    attn_final_stab = Stabilizer(enable_self_stabilization=enable_self_stabilization)

    # attention function
    @Function
    def attention(h_enc, h_dec):
        # h_dec doubles as the carrier of the decoder's dynamic axis for broadcasting
        history_axis = h_dec
        # TODO: pull this apart so that we can compute the encoder window only once and apply it to multiple decoders
        # --- encoder state window
        window = PastValueWindow(attention_span, axis=attention_axis, go_backwards=go_backwards)
        (enc_window, enc_valid) = window(h_enc).outputs
        enc_proj = attn_proj_enc(enc_window)
        # the window must be replicated for every decoder time step
        enc_proj = C.sequence.broadcast_as(enc_proj, history_axis)
        enc_valid = C.sequence.broadcast_as(enc_valid, history_axis)
        # --- decoder state
        dec_proj = attn_proj_dec(h_dec)
        tanh_out = C.tanh(dec_proj + enc_proj)   # (attention_span, attention_dim)
        u = attn_proj_tanh(tanh_out)             # (attention_span, 1)
        # push unused window slots towards log-zero so they vanish from the softmax denominator
        # TODO: use a less arbitrary number than 50
        u_masked = u + (enc_valid - 1) * 50
        attention_weights = C.softmax(u_masked, axis=attention_axis)  #, name='attention_weights')
        attention_weights = Label('attention_weights')(attention_weights)
        # weighted sum over the (projected) encoder state vectors
        weighted = C.element_times(enc_proj, attention_weights)
        h_att = C.reduce_sum(weighted, axis=attention_axis)
        return attn_final_stab(h_att)

    return _inject_name(attention, name)

예제 #17

0

파일 보기

def AttentionModel(attention_dim, attention_span=None, attention_axis=None,
                   init=default_override_or(glorot_uniform()),
                   go_backwards=default_override_or(False),
                   enable_self_stabilization=default_override_or(True), name=''):
    '''
    AttentionModel(attention_dim, attention_span=None, attention_axis=None, init=glorot_uniform(), go_backwards=False, enable_self_stabilization=True, name='')

    Layer factory function to create a function object that implements an attention model
    as described in Bahdanau, et al., "Neural machine translation by jointly learning to align and translate."

    Two implementations are available. If both ``attention_span`` and ``attention_axis`` are left at
    ``None``, attention is computed over the whole unpacked encoder sequence. If both are given, the
    older fixed-window implementation is used and a ``DeprecationWarning`` is issued.

    Args:
     attention_dim: output dimension of the encoder and decoder projections
     attention_span: window size for the deprecated mode; must be positive when given
     attention_axis: static window axis for the deprecated mode
     init: initializer for the projection weights
     go_backwards: forwarded to ``PastValueWindow`` (deprecated mode only)
     enable_self_stabilization: forwarded to every ``Stabilizer`` created here
     name: name injected into the returned function

    Raises:
     ValueError: if only one of ``attention_span`` / ``attention_axis`` is given, or if
      ``attention_span`` is not positive
    '''

    init = get_default_override(AttentionModel, init=init)
    go_backwards = get_default_override(AttentionModel, go_backwards=go_backwards)
    enable_self_stabilization = get_default_override(AttentionModel, enable_self_stabilization=enable_self_stabilization)

    # the deprecated fixed-window mode is selected by passing an attention_span
    compatible_attention_mode = attention_span is not None
    if not compatible_attention_mode:
        if attention_axis is not None:
            raise ValueError('attention_span cannot be None when attention_axis is not None')
    elif attention_span <= 0:
        raise ValueError('attention_span must be a positive value')
    elif attention_axis is None:
        raise ValueError('attention_axis cannot be None when attention_span is not None')

    def _stabilized_projection(output_dim):
        # Stabilizer followed by a Dense projection that keeps the span (and beam-search) axes intact
        return Stabilizer(enable_self_stabilization=enable_self_stabilization) >> Dense(output_dim, init=init, input_rank=1)

    # model parameters
    with default_options(bias=False):  # none of the projections has a bias
        attn_proj_enc = _stabilized_projection(attention_dim)   # projects input hidden state
        attn_proj_dec = _stabilized_projection(attention_dim)   # projects decoder hidden state
        attn_proj_tanh = _stabilized_projection(1)              # projects tanh output
    attn_final_stab = Stabilizer(enable_self_stabilization=enable_self_stabilization)

    if compatible_attention_mode:
        warn('Specifying non-default values for attention_span and attention_axis has been deprecated since version 2.2. '
             'These arguments will be removed in the future.', DeprecationWarning, stacklevel=2)

        # old attention function
        @Function
        def old_attention(h_enc, h_dec):
            # h_dec doubles as the carrier of the decoder's dynamic axis for broadcasting
            history_axis = h_dec
            # TODO: pull this apart so that we can compute the encoder window only once and apply it to multiple decoders
            # --- encoder state window
            window = PastValueWindow(attention_span, axis=attention_axis, go_backwards=go_backwards)
            (enc_window, enc_valid) = window(h_enc).outputs
            enc_proj = attn_proj_enc(enc_window)
            # the window must be replicated for every decoder time step
            enc_proj = C.sequence.broadcast_as(enc_proj, history_axis)
            enc_valid = C.sequence.broadcast_as(enc_valid, history_axis)
            # --- decoder state
            dec_proj = attn_proj_dec(h_dec)
            tanh_out = C.tanh(dec_proj + enc_proj)   # (attention_span, attention_dim)
            u = attn_proj_tanh(tanh_out)             # (attention_span, 1)
            # push unused window slots towards log-zero so they vanish from the softmax denominator
            # TODO: use a less arbitrary number than 50
            u_masked = u + (enc_valid - 1) * 50
            attention_weights = C.softmax(u_masked, axis=attention_axis)  #, name='attention_weights')
            attention_weights = Label('attention_weights')(attention_weights)
            # weighted sum over the (unprojected) encoder state vectors
            enc_broadcast = C.sequence.broadcast_as(enc_window, history_axis)
            h_att = C.reduce_sum(C.element_times(enc_broadcast, attention_weights), axis=attention_axis)
            return attn_final_stab(h_att)

        return _inject_name(old_attention, name)

    # new attention function
    @Function
    def new_attention(encoder_hidden_state, decoder_hidden_state):
        # encoder_hidden_state: [#, e] [h]
        # decoder_hidden_state: [#, d] [H]
        unpacked_enc, valid_mask = C.sequence.unpack(encoder_hidden_state, padding_value=0).outputs
        # unpacked_enc: [#] [*=e, h]
        # valid_mask:   [#] [*=e]
        projected_enc = C.sequence.broadcast_as(attn_proj_enc(unpacked_enc), decoder_hidden_state)
        # projected_enc: [#, d] [*=e, attention_dim]
        broadcast_valid_mask = C.sequence.broadcast_as(C.reshape(valid_mask, (1,), 1), decoder_hidden_state)
        # broadcast_valid_mask: [#, d] [*=e]
        projected_dec = attn_proj_dec(decoder_hidden_state)
        # projected_dec: [#, d] [attention_dim]
        tanh_output = C.tanh(projected_dec + projected_enc)
        # tanh_output: [#, d] [*=e, attention_dim]
        attention_logits = attn_proj_tanh(tanh_output)
        # attention_logits: [#, d] [*=e, 1]
        # padded positions get a very large negative logit so that softmax ignores them
        masked_attention_logits = C.element_select(broadcast_valid_mask, attention_logits, C.constant(-1e+30))
        # masked_attention_logits: [#, d] [*=e]
        attention_weights = C.softmax(masked_attention_logits, axis=0)
        attention_weights = Label('attention_weights')(attention_weights)
        # attention_weights: [#, d] [*=e]
        enc_per_step = C.sequence.broadcast_as(unpacked_enc, attention_weights)
        attended_enc = C.reduce_sum(attention_weights * enc_per_step, axis=0)
        # attended_enc: [#, d] [1, h]
        # drop the reduced axis -> [#, d] [h]
        return attn_final_stab(C.reshape(attended_enc, (), 0, 1))

    return _inject_name(new_attention, name)

예제 #18

0

파일 보기

파일: factorization.py 프로젝트: AllanYiin/CNTK

def dense_factored(shapes, #(shape1, shape2)
                   activation=default_override_or(identity),
                   init={'W1':None, 'W2':None},
                   input_rank=None,
                   map_rank=None,
                   bias=default_override_or(True),
                   init_bias=default_override_or(0),
                   name=''):
    '''
    Create a dense layer whose weight matrix is factored into two matrices, W1 and W2.

    The returned function computes ``activation(times(times(x, W1), W2) + b)``; the bias
    term is only present when ``bias`` is enabled.

    Args:
        shapes                  : pair ``(shape1, shape2)`` of integer output dimensions for W1 and W2.
        activation              : activation applied to the result; ``None`` skips it.
        init                    : mapping with keys ``'W1'`` and ``'W2'`` holding the initial
                                  values of the two factor matrices.
        input_rank              : not supported; must be ``None``.
        map_rank                : not supported; must be ``None``.
        bias                    : whether the layer has a bias parameter.
        init_bias               : initial bias value.
        name                    : name of the block function that creates the new model.

    Returns:
        a model that is factored and projected (reduced).
    '''

    # matthaip: Not sure how to handle input tensor of rank > 1
    # or selective flattening of ranks
    assert (input_rank is None and map_rank is None
            and all(isinstance(dim, int) for dim in list(shapes)))

    # defaults are resolved against cntk.layers.Dense so its scoped options apply here as well
    activation = get_default_override(cntk.layers.Dense, activation=activation)
    bias = get_default_override(cntk.layers.Dense, bias=bias)
    init_bias = get_default_override(cntk.layers.Dense, init_bias=init_bias)
    # how to use get_default_override for the init parameter?

    output_shape1, output_shape2 = _as_tuple(shapes[0]), _as_tuple(shapes[1])
    if input_rank is not None and map_rank is not None:
        raise ValueError("Dense: input_rank and map_rank cannot be specified at the same time.")

    # If input_rank is not given, a single _INFERRED is passed;
    # map_rank, if given, would determine the input_rank.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED

    # parameters bound to this Function
    #    init_weights = _initializer_for(init, Record(output_rank=output_rank))
    W1 = Parameter(input_shape + output_shape1, init=init['W1'], name='W1')
    W2 = Parameter(output_shape1 + output_shape2, init=init['W2'], name='W2')
    if bias:
        b = Parameter(output_shape2, init=init_bias, name='b')
    else:
        b = None

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        r = times(times(x, W1), W2)
        if b:
            r = r + b
        if activation is not None:
            r = activation(r)
        return r

    return dense

예제 #19

0

파일 보기

    def __getitem__(self, arg):
        '''
        Slicing of a Variable. E.g. var[2:3] will translate into slice(var, axis=0, begin_index=2, end_index=3)

        For an object with more than one output, ``arg`` is used to index ``self.outputs`` instead.
        Otherwise every entry of ``arg`` addresses one axis, from left to right; an entry may be an
        int, a slice, an ``Ellipsis``, or a tuple/list of ints.

        Raises:
            KeyError: if indexing ``self.outputs`` of a multi-output object fails.
            IndexError: if an index entry has an unsupported type.
        '''
        from . import ops

        # multi-output case: fall back to selecting one of the outputs
        if hasattr(self, 'outputs') and len(self.outputs) > 1:
            try:
                return self.outputs[arg]
            except Exception:
                raise KeyError(
                    'Slice for multioutput functions is not supported, '
                    'the fallback to select to output requires '
                    'that only one index is provided. arg: {}, self: {}'
                    .format(arg, self))

        # int or slice: normalize into a tuple of int or tuple of slice
        indices = arg if isinstance(arg, tuple) else (arg, )
        result = self
        axis_shift = 0

        from cntk.default_options import get_global_option, get_default_override, default_override_or

        # with the 'align_axis' global option set to 1, static axes are addressed with a negative
        # offset that defaults to the number of dynamic axes
        if (get_global_option('align_axis', 0) == 1
                and getattr(self, 'dynamic_axes') is not None
                and len(self.dynamic_axes) > 0):
            axis_shift = -get_default_override(
                None, axis_offset=default_override_or(len(self.dynamic_axes)))

        for position, item in enumerate(indices):
            if item is Ellipsis:
                # everything after an ellipsis is indexed relative to the end
                axis_shift = -len(indices)
                continue

            if isinstance(item, int):
                # a single int selects a one-element slice
                item = slice(item, item + 1)

            if isinstance(item, slice):
                begin = item.start or 0
                end = item.stop or 0
                # a slice with neither begin nor end leaves this axis untouched
                if begin != 0 or end != 0:
                    result = ops.slice(result,
                                       axis=position + axis_shift,
                                       begin_index=begin,
                                       end_index=end,
                                       strides=item.step)
            elif isinstance(item, (tuple, list)):
                # Select multiple elements from the same dimension. This is
                # different from NumPy's advanced indexing, since we just go
                # axis by axis from left to right and don't do any
                # broadcasting.
                pieces = []
                for idx in item:
                    if not isinstance(idx, int):
                        raise IndexError(
                            'indices have to be of type int and not "%s"' %
                            type(idx))
                    pieces.append(
                        ops.slice(result,
                                  axis=position,
                                  begin_index=idx,
                                  end_index=idx + 1))
                result = (ops.splice(*pieces, axis=position)
                          if len(pieces) > 1 else pieces[0])
            else:
                raise IndexError('type "%s" is not supported as index' %
                                 type(item))

        return result

예제 #20

0

파일 보기

파일: blocks.py 프로젝트: haixpham/cntkx

def LSTM(shape, activation=default_override_or(tanh), weight_drop_rate=None,
         ih_init=default_override_or(glorot_uniform()), ih_bias=default_override_or(0),
         hh_init=default_override_or(glorot_uniform()), hh_bias=default_override_or(0),
         name=''):
    """ LSTM step function with PyTorch-style weight layout, intended for loading PyTorch pretrained models.

    It differs from cntk's own LSTM in how the stacked projection is sliced into gates:
    pytorch is ifgo but cntk is igfo. PyTorch also keeps two biases where cntk has one;
    here both are summed into a single bias parameter to make it a little faster.

    Arguments:
        shape: output shape of the cell
        activation: activation used for the candidate (g) and for the cell output
        weight_drop_rate: if not ``None``, dropout rate applied to the hidden-to-hidden weights
        ih_init: initializer of the input-to-hidden weights ``W``
        ih_bias: input-to-hidden bias; added to ``hh_bias``
        hh_init: initializer of the hidden-to-hidden weights ``H``
        hh_bias: hidden-to-hidden bias; added to ``ih_bias``
        name: name of the block function

    Returns:
        a block function ``(dh, dc, x) -> (ht, ct)``
    """
    activation = get_default_override(LSTM, activation=activation)
    ih_init = get_default_override(LSTM, ih_init=ih_init)
    ih_bias = get_default_override(LSTM, ih_bias=ih_bias)
    hh_init = get_default_override(LSTM, hh_init=hh_init)
    hh_bias = get_default_override(LSTM, hh_bias=hh_bias)

    stack_axis = -1
    shape = _as_tuple(shape)
    stacked_dim = list(shape)[stack_axis]
    # same dims as shape, with the stack axis made four times as large (one part per gate)
    cell_shape_stacked = tuple(shape[:stack_axis]) + (stacked_dim * 4,)
    cell_shape_stacked_H = cell_shape_stacked

    # pytorch's two biases are merged into a single one
    init_bias = ih_bias + hh_bias
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')                # bias
    W = Parameter(_INFERRED + cell_shape_stacked, init=ih_init, name='W')      # input
    H = Parameter(shape + cell_shape_stacked_H, init=hh_init, name='H')        # hidden-to-hidden

    if weight_drop_rate is not None:
        dropout = C.layers.Dropout(dropout_rate=weight_drop_rate, name='h_dropout')
    else:
        dropout = None

    @C.BlockFunction('PT::LSTM', name)
    def lstm(dh, dc, x):
        # projected contribution from input(s), hidden, and bias
        recurrent_weights = dropout(H) if weight_drop_rate is not None else H
        proj4 = b + times(x, W) + times(dh, recurrent_weights)

        # split along stack_axis; the gate order (i, f, g, o) differs from cntk's implementation
        it_proj, ft_proj, bit_proj, ot_proj = [
            slice(proj4, stack_axis, gate * stacked_dim, (gate + 1) * stacked_dim)
            for gate in range(4)
        ]

        input_gate = sigmoid(it_proj)                      # input gate(t)
        candidate = input_gate * activation(bit_proj)      # gated activation of the g projection

        forget_gate = sigmoid(ft_proj)                     # forget gate(t)
        retained = forget_gate * dc                        # applied to cell(t-1)

        ct = retained + candidate                          # c(t) is the sum of both

        output_gate = sigmoid(ot_proj)                     # output gate(t)
        ht = output_gate * activation(ct)                  # applied to the activated cell(t)
        return ht, ct

    return lstm