Example #1
def test_op_batch_normalization(use_cudnn, sample, device_id, precision):
    dtype = PRECISION_TO_TYPE[precision]
    epsilon = 0.00001
    dev = cntk_device(device_id)

    t = AA(sample, dtype=dtype).reshape(-1, 1)
    mean = 1
    var = 2
    init_scale = 3
    init_bias = 4

    forward = [(x - mean) / np.sqrt(var + epsilon) * init_scale + init_bias
               for x in t]

    expected_forward = AA(forward)

    scale = Parameter(init=AA([init_scale], dtype=dtype),
                      dtype=dtype,
                      device=dev)
    bias = Parameter(init=AA([init_bias], dtype=dtype),
                     dtype=dtype,
                     device=dev)
    run_mean = constant(mean, shape=(1), dtype=dtype, device=dev)
    run_variance = constant(var, shape=(1), dtype=dtype, device=dev)
    run_count = constant(0, dtype=dtype, device=dev)

    from cntk import batch_normalization, input

    a = input(shape=(1), dtype=dtype, needs_gradient=False, name='a')

    with pytest.warns(Warning):
        op = batch_normalization(
            a,
            scale,
            bias,
            run_mean,
            run_variance,
            False,
            # no running_count here; this overload is expected to trigger a Warning (see pytest.warns above)
            epsilon=epsilon,
            use_cudnn_engine=use_cudnn)

    op_node = batch_normalization(a,
                                  scale,
                                  bias,
                                  run_mean,
                                  run_variance,
                                  running_count=run_count,
                                  spatial=False,
                                  epsilon=epsilon,
                                  use_cudnn_engine=use_cudnn)

    forward_input = {a: t}

    unittest_helper(op_node,
                    forward_input,
                    expected_forward,
                    expected_backward=None,
                    device_id=device_id,
                    precision=precision)
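The reference values above follow the inference-mode batch-normalization formula y = (x - mean) / sqrt(var + epsilon) * scale + bias. The following stand-alone NumPy sketch (illustrative constants only, independent of the test harness) shows the same computation in isolation:

import numpy as np

x = np.array([1.0, 2.0, 3.0]).reshape(-1, 1)   # a toy input column
mean, var, epsilon = 1.0, 2.0, 1e-5            # fixed running statistics
scale, bias = 3.0, 4.0                         # learned affine parameters

# each element is normalized with the fixed statistics, then scaled and shifted
y = (x - mean) / np.sqrt(var + epsilon) * scale + bias
print(y)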
Example #2
def Stabilizer(steepness=4, enable_self_stabilization=default_override_or(True), name=''):
    '''
    Stabilizer(steepness=4, enable_self_stabilization=True, name='')

    Layer factory function to create a `Droppo self-stabilizer <https://www.microsoft.com/en-us/research/wp-content/uploads/2016/11/SelfLR.pdf>`_.
    It multiplies its input with a scalar that is learned.

    This takes ``enable_self_stabilization`` as a flag that allows it to be disabled. This is useful if the flag is set through a global default.

    Note: Some other layers (specifically, recurrent units like :func:`~cntk.layers.blocks.LSTM`) also have the option to
    use the ``Stabilizer()`` layer internally. That is enabled by passing `enable_self_stabilization=True`
    to those layers. In conjunction with those, the rule is that an explicit ``Stabilizer()`` must be
    inserted by the user for the main data input, whereas the recurrent layer will own the stabilizer(s)
    for the internal recurrent connection(s).
    Note: Unlike the original paper, which proposed a linear or exponential scalar,
    CNTK uses a sharpened Softplus: ``(1/steepness) * ln(1 + e^(steepness * beta))``.
    The Softplus behaves linearly for weights around and above 1 (like the linear scalar) while guaranteeing
    positivity (like the exponential variant), and it is also more robust because it avoids exploding gradients.

    Example:
     >>> # recurrent model with self-stabilization
     >>> from cntk.layers import *
     >>> with default_options(enable_self_stabilization=True): # enable stabilizers by default for LSTM()
     ...     model = Sequential([
     ...         Embedding(300),
     ...         Stabilizer(),           # stabilizer for main data input of recurrence
     ...         Recurrence(LSTM(512)),  # LSTM owns its own stabilizers for the recurrent connections
     ...         Stabilizer(),
     ...         Dense(10)
     ...     ])

    Args:
        steepness (`int`, defaults to 4): sharpness of the sharpened Softplus that produces the learned scalar
        enable_self_stabilization (bool, defaults to `True`): a flag that allows the stabilizer to be disabled. Useful if this is a global default
        name (str, defaults to ''): the name of the Function instance in the network

    Returns:
        cntk.ops.functions.Function:
        A function
    '''

    enable_self_stabilization = get_default_override(Stabilizer, enable_self_stabilization=enable_self_stabilization)

    if not enable_self_stabilization: # disabled (typically through global option; otherwise one would not call this in the first place)
        return identity

    # parameters bound to this Function
    init_param = np.log(np.exp(steepness) - 1) / steepness  # initialize so that the factor is initially 1 (i.e. has no effect)
    param = Parameter((), init=init_param, name='alpha')
    beta = softplus(param, steepness=steepness)

    # expression
    @BlockFunction('Stabilizer', name)
    def stabilize(x):
        return beta * x

    return stabilize
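As a quick sanity check of the initialization above, the following NumPy sketch (independent of CNTK) evaluates the sharpened Softplus at init_param and confirms that the stabilizer starts out as a multiplication by 1, i.e. a no-op at the beginning of training:

import numpy as np

steepness = 4
init_param = np.log(np.exp(steepness) - 1) / steepness        # same expression as in Stabilizer()
beta = np.log1p(np.exp(steepness * init_param)) / steepness   # sharpened Softplus
print(beta)                                                   # -> 1.0 (up to rounding)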
Example #3
def _RecurrentBlock(type,
                    shape,
                    cell_shape,
                    activation,
                    use_peepholes,
                    init,
                    init_bias,
                    enable_self_stabilization,
                    name=''):
    '''
    Helper to create a recurrent block of type 'LSTM', 'GRU', or 'RNNUnit'.
    '''

    has_projection = cell_shape is not None

    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError(
            "%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)
        # otherwise we'd need to fix slicing and Param initializers

    stack_axis = -1  # for efficient computation, we stack multiple variables (along the fastest-changing one, to match BS)
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[stack_axis]
    cell_shape_list[stack_axis] = stacked_dim * {
        'RNNUnit': 1,
        'GRU': 3,
        'LSTM': 4
    }[type]
    cell_shape_stacked = tuple(
        cell_shape_list)  # patched dims: stack_axis multiplied by the number of stacked gates (1/3/4)
    cell_shape_list[stack_axis] = stacked_dim * {
        'RNNUnit': 1,
        'GRU': 2,
        'LSTM': 4
    }[type]
    cell_shape_stacked_H = tuple(
        cell_shape_list)  # patched dims for the hidden-to-hidden projection (1/2/4 stacked gates)

    # parameters
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')  # bias
    W = Parameter(_INFERRED + cell_shape_stacked, init=init, name='W')  # input
    H = Parameter(shape + cell_shape_stacked_H, init=init,
                  name='H')  # hidden-to-hidden
    H1 = Parameter(shape + cell_shape, init=init,
                   name='H1') if type == 'GRU' else None  # hidden-to-hidden
    Ci = Parameter(
        cell_shape, init=init, name='Ci'
    ) if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Cf = Parameter(
        cell_shape, init=init, name='Cf'
    ) if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Co = Parameter(
        cell_shape, init=init, name='Co'
    ) if use_peepholes else None  # cell-to-hidden {note: applied elementwise}

    Wmr = Parameter(cell_shape + shape, init=init,
                    name='P') if has_projection else None  # final projection

    # each use of a stabilizer layer must get its own instance
    Sdh = Stabilizer(enable_self_stabilization=enable_self_stabilization,
                     name='dh_stabilizer')
    Sdc = Stabilizer(enable_self_stabilization=enable_self_stabilization,
                     name='dc_stabilizer')
    Sct = Stabilizer(enable_self_stabilization=enable_self_stabilization,
                     name='c_stabilizer')
    Sht = Stabilizer(enable_self_stabilization=enable_self_stabilization,
                     name='P_stabilizer')

    # define the model function itself
    # general interface for Recurrence():
    #   (all previous outputs delayed, input) --> (outputs and state)
    # where
    #  - the first output is the main output, e.g. 'h' for LSTM
    #  - the remaining outputs, if any, are additional state
    #  - if for some reason output != state, then output is still fed back and should just be ignored by the recurrent block

    # LSTM model function
    # in this case:
    #   (dh, dc, x) --> (h, c)
    def lstm(dh, dc, x):

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + times(dhs, H)

        it_proj = slice(proj4, stack_axis, 0 * stacked_dim,
                        1 * stacked_dim)  # split along stack_axis
        bit_proj = slice(proj4, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
        ft_proj = slice(proj4, stack_axis, 2 * stacked_dim, 3 * stacked_dim)
        ot_proj = slice(proj4, stack_axis, 3 * stacked_dim, 4 * stacked_dim)

        # helper to inject peephole connection if requested
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid(peep(it_proj, dcs, Ci))  # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation(bit_proj)  # applied to tanh of input network

        ft = sigmoid(peep(ft_proj, dcs, Cf))  # forget-me-not gate(t)
        bft = ft * dc  # applied to cell(t-1)

        ct = bft + bit  # c(t) is sum of both

        ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
        ht = ot * activation(ct)  # applied to tanh(cell(t))

        c = ct  # cell value
        h = times(Sht(ht), Wmr) if has_projection else \
            ht

        # return the new state as a tuple; the names are informational, but the order matters
        #return (Function.NamedOutput(h=h), Function.NamedOutput(c=c))
        return (h, c)

    # GRU model function
    # in this case:
    #   (dh, x) --> (h)
    # e.g. https://en.wikipedia.org/wiki/Gated_recurrent_unit
    def gru(dh, x):

        dhs = Sdh(dh)  # previous value, stabilized
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        projx3 = b + times(x, W)
        projh2 = times(dhs, H)

        zt_proj = slice(
            projx3, stack_axis, 0 * stacked_dim, 1 * stacked_dim) + slice(
                projh2, stack_axis, 0 * stacked_dim, 1 * stacked_dim)
        rt_proj = slice(
            projx3, stack_axis, 1 * stacked_dim, 2 * stacked_dim) + slice(
                projh2, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
        ct_proj = slice(projx3, stack_axis, 2 * stacked_dim, 3 * stacked_dim)

        zt = sigmoid(zt_proj)  # update gate z(t)

        rt = sigmoid(rt_proj)  # reset gate r(t)

        rs = dhs * rt  # reset-gated previous hidden state
        ct = activation(ct_proj + times(rs, H1))

        ht = (1 - zt) * ct + zt * dhs  # hidden state ht / output

        # for comparison: CUDNN_GRU
        # i(t) = sigmoid(W_i x(t) +          R_i h(t-1)  + b_Wi + b_Ru)
        # r(t) = sigmoid(W_r x(t) +          R_r h(t-1)  + b_Wr + b_Rr)   --same up to here
        # h'(t) =   tanh(W_h x(t) + r(t) .* (R_h h(t-1)) + b_Wh + b_Rh)   --r applied after projection? Would make life easier!
        # h(t) = (1 - i(t) .* h'(t)) + i(t) .* h(t-1)                     --TODO: need to confirm bracketing with NVIDIA

        h = times(Sht(ht), Wmr) if has_projection else \
            ht

        # return the new state as a tuple; the names are informational, but the order matters
        #return Function.NamedOutput(h=h)
        return h

    def rnn(dh, x):
        dhs = Sdh(dh)  # previous value, stabilized
        ht = activation(times(x, W) + times(dhs, H) + b)
        h = times(Sht(ht), Wmr) if has_projection else \
            ht
        #return Function.NamedOutput(h=h)
        return h

    function = {'RNNUnit': rnn, 'GRU': gru, 'LSTM': lstm}[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)
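A minimal usage sketch (assuming CNTK 2.x and its public layers API): the step function built by _RecurrentBlock is what C.layers.LSTM, GRU and RNNUnit return, and it is normally applied over a sequence through Recurrence():

import cntk as C

x = C.sequence.input_variable(300)      # a sequence of 300-dimensional vectors
step = C.layers.LSTM(512)               # step function: (dh, dc, x) -> (h, c)
model = C.layers.Recurrence(step)(x)    # unrolls the step along the sequence axis
print(model.shape)                      # -> (512,)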
Example #4
def IndRNNBlock(type,
                shape,
                cell_shape,
                activation,
                use_peepholes,
                init,
                init_bias,
                enable_self_stabilization,
                name=''):
    '''
    Helper to create a recurrent block of type 'RNNStep' (an IndRNN-style step with an
    elementwise hidden-to-hidden weight).
    '''

    has_projection = cell_shape is not None

    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError(
            "%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)
        # otherwise we'd need to fix slicing and Param initializers

    stack_axis = -1  # for efficient computation, we stack multiple variables (along the fastest-changing one, to match BS)
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[stack_axis]
    cell_shape_list[stack_axis] = stacked_dim * 1
    cell_shape_stacked = tuple(
        cell_shape_list)  # a single stacked gate, so the dims are unchanged
    cell_shape_list[stack_axis] = stacked_dim * 1
    cell_shape_stacked_H = tuple(
        cell_shape_list)  # same for the hidden-to-hidden weights

    # parameters
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')  # bias
    W = Parameter(_INFERRED + cell_shape_stacked, init=init, name='W')  # input
    H = Parameter(cell_shape_stacked_H, init=init,
                  name='H')  # hidden-to-hidden (applied elementwise in rnn_step)

    Wmr = Parameter(cell_shape + shape, init=init,
                    name='P') if has_projection else None  # final projection

    # each use of a stabilizer layer must get its own instance
    Sdh = Stabilizer(enable_self_stabilization=enable_self_stabilization,
                     name='dh_stabilizer')
    Sdc = Stabilizer(enable_self_stabilization=enable_self_stabilization,
                     name='dc_stabilizer')
    Sct = Stabilizer(enable_self_stabilization=enable_self_stabilization,
                     name='c_stabilizer')
    Sht = Stabilizer(enable_self_stabilization=enable_self_stabilization,
                     name='P_stabilizer')

    def rnn_step(dh, x):
        dhs = Sdh(dh)  # previous value, stabilized
        ht = activation(times(x, W) + dhs * H + b)
        h = times(Sht(ht), Wmr) if has_projection else \
            ht
        return h

    function = {
        'RNNStep': rnn_step,
    }[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)
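For comparison, the following NumPy sketch (shapes and names are illustrative only) spells out the IndRNN-style update that rnn_step implements: the recurrent term is an elementwise product with a weight vector rather than a full matrix multiplication:

import numpy as np

in_dim, hid_dim = 8, 5
rng = np.random.default_rng(0)
W = rng.normal(size=(in_dim, hid_dim))   # input-to-hidden (matrix product)
u = rng.normal(size=(hid_dim,))          # hidden-to-hidden (elementwise weight, the 'H' above)
b = np.zeros(hid_dim)

x_t = rng.normal(size=(in_dim,))
h_prev = np.zeros(hid_dim)
h_t = np.tanh(x_t @ W + u * h_prev + b)  # compare: activation(times(x, W) + dhs * H + b)
print(h_t.shape)                         # -> (5,)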
Example #5
def _RecurrentBlock(type,
                    shape,
                    cell_shape,
                    activation,
                    use_peepholes,
                    init,
                    init_bias,
                    enable_self_stabilization,
                    name=''):

    has_projection = cell_shape is not None
    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError(
            "%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)

    stack_axis = -1
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    cell_shape_list_W = list(cell_shape)
    cell_shape_list_H = list(cell_shape)
    cell_shape_stacked = tuple(cell_shape_list)
    shape_list = list(shape)

    # slot-value pair one-hot vector dimension
    sv_dim = cell_shape_list[stack_axis]
    stacked_dim = shape_list[stack_axis]
    sv_shape_stacked = tuple([sv_dim])

    # 3*hidden_dim
    cell_shape_list[stack_axis] = stacked_dim * {'LSTM': 3}[type]
    cell_shape_stacked = tuple(cell_shape_list)

    # 2*hidden_dim + sv_dim
    cell_shape_list_H[stack_axis] = stacked_dim * {'LSTM': 2}[type] + sv_dim
    cell_shape_stacked_H = tuple(cell_shape_list_H)

    cell_shape_list_W[stack_axis] = stacked_dim * {'LSTM': 2}[type]
    cell_shape_stacked_W = tuple(cell_shape_list_W)

    # parameters
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')  # bias
    brg = Parameter(sv_shape_stacked, init=init_bias, name='brg')  # bias
    W = Parameter(_INFERRED + cell_shape_stacked, init=init, name='W')
    Wrg = Parameter(_INFERRED + sv_shape_stacked, init=init, name='Wrg')
    Wcx = Parameter(_INFERRED + shape, init=init, name='Wcx')

    H = Parameter(shape + cell_shape_stacked, init=init, name='H')
    Hrg = Parameter(shape + sv_shape_stacked, init=init, name='Hrg')
    Hcx = Parameter(shape + shape, init=init, name='Hcx')
    Hsv = Parameter(sv_shape_stacked + cell_shape_stacked,
                    init=init,
                    name='Hsv')
    Hsvrg = Parameter(sv_shape_stacked + sv_shape_stacked,
                      init=init,
                      name='Hsvrg')
    Wfc = Parameter(sv_shape_stacked + shape, init=init, name='Wfc')

    # LSTM model function
    # in this case:
    #   (dh, dc, sv, x) --> (h, c, sv)

    def lstm(dh, dc, sv, x):

        # projected contribution from input(s), hidden, and bias
        proj3 = b + times(x, W) + times(dh, H) + times(sv, Hsv)

        it_proj = slice(proj3, stack_axis, 0 * stacked_dim, 1 * stacked_dim)
        ft_proj = slice(proj3, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
        ot_proj = slice(proj3, stack_axis, 2 * stacked_dim, 3 * stacked_dim)

        it = sigmoid(it_proj)  # input gate(t)
        ft = sigmoid(ft_proj)  # forget-me-not gate(t)
        ot = sigmoid(ot_proj)  # output gate(t)

        # the following is the reading gate
        proj3rg = sigmoid(
            times(x, Wrg) + times(dh, Hrg) + times(sv, Hsvrg) + brg)
        v = proj3rg * sv

        cx_t = tanh(times(x, Wcx) + times(dh, Hcx))

        # TODO: is stabilization needed here?
        # update memory cell
        c = it * cx_t + ft * dc + tanh(times(v, Wfc))

        h = ot * tanh(c)

        return (h, c, v)

    function = {'LSTM': lstm}[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)
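The part that distinguishes this variant from a plain LSTM is the reading gate: it scales the slot-value vector sv, and the scaled vector v feeds the memory-cell update through Wfc. The following NumPy sketch (illustrative shapes, not the CNTK graph) isolates that step:

import numpy as np

hid, sv_dim = 4, 6
rng = np.random.default_rng(1)
sv = rng.random(sv_dim)                              # slot-value vector
rg = 1.0 / (1.0 + np.exp(-rng.normal(size=sv_dim)))  # reading gate, sigmoid(...)
v = rg * sv                                          # gated slot-value vector

Wfc = rng.normal(size=(sv_dim, hid))
i_gate, f_gate = rng.random(hid), rng.random(hid)    # input / forget gates (already in [0, 1])
cx, c_prev = np.tanh(rng.normal(size=hid)), rng.normal(size=hid)

c = i_gate * cx + f_gate * c_prev + np.tanh(v @ Wfc) # mirrors: it * cx_t + ft * dc + tanh(times(v, Wfc))
print(c.shape)                                       # -> (4,)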
def dense_factored(
        shapes,  #(shape1, shape2)
        activation=default_override_or(identity),
        init={
            'W1': None,
            'W2': None
        },
        input_rank=None,
        map_rank=None,
        bias=default_override_or(True),
        init_bias=default_override_or(0),
        name=''):
    '''
    Create a new model using the factored weight matrices W1 and W2.
    The returned function represents the new model.

    Args:
        shapes                  : dimensions of the input matrices.
        activation              : activation function used for the model.
        init                    : the two matrices corresponding to the factorization.
        input_rank              : rank of the input tensor.
        map_rank                : ???
        bias                    : bias for the model.
        init_bias               : initial bias value.
        name                    : name of the block function that creates the new model.

    Returns:
        a model that is factored and projected (reduced).
    '''

    # matthaip: Not sure how to handle input tensor of rank > 1
    # or selective flattening of ranks
    assert (input_rank is None and map_rank is None
            and all(isinstance(s, int) for s in list(shapes)))

    activation = get_default_override(cntk.layers.Dense, activation=activation)
    bias = get_default_override(cntk.layers.Dense, bias=bias)
    init_bias = get_default_override(cntk.layers.Dense, init_bias=init_bias)
    # TODO: how to use get_default_override for the init parameter?

    output_shape1 = _as_tuple(shapes[0])
    output_shape2 = _as_tuple(shapes[1])
    if input_rank is not None and map_rank is not None:
        raise ValueError(
            "Dense: input_rank and map_rank cannot be specified at the same time."
        )

    # If input_rank not given then pass a single _INFERRED;
    # map_rank if given will determine the input_rank.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED

    # parameters bound to this Function
    #    init_weights = _initializer_for(init, Record(output_rank=output_rank))
    init_weights = init
    W1 = Parameter(input_shape + output_shape1,
                   init=init_weights['W1'],
                   name='W1')
    W2 = Parameter(output_shape1 + output_shape2,
                   init=init_weights['W2'],
                   name='W2')
    b = Parameter(output_shape2, init=init_bias, name='b') if bias else None

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        r = times(x, W1)
        r = times(r, W2)
        if b:
            r = r + b
        if activation is not None:
            r = activation(r)
        return r

    return dense
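One way to obtain the two init matrices is a truncated SVD of an existing dense weight matrix, so that W is approximated by W1 @ W2 with far fewer parameters. This NumPy sketch (a plain SVD factorization, not tied to any particular checkpoint) shows the idea; the resulting arrays could then be passed as init={'W1': W1, 'W2': W2}:

import numpy as np

in_dim, out_dim, rank = 1024, 1024, 128
rng = np.random.default_rng(0)
W = rng.normal(size=(in_dim, out_dim))              # the original dense weight

U, s, Vt = np.linalg.svd(W, full_matrices=False)
W1 = U[:, :rank] * s[:rank]                         # (in_dim, rank)
W2 = Vt[:rank, :]                                   # (rank, out_dim)

print(W.size, W1.size + W2.size)                    # parameter count: 1048576 vs. 262144
print(np.linalg.norm(W - W1 @ W2) / np.linalg.norm(W))  # relative approximation error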
Example #7
def LSTM(shape, activation=default_override_or(tanh), weight_drop_rate=None,
         ih_init=default_override_or(glorot_uniform()), ih_bias=default_override_or(0),
         hh_init=default_override_or(glorot_uniform()), hh_bias=default_override_or(0),
         name=''):
    """ PyTorch style implementation of LSTM. Used for loading pytorch pretrained models.

    This difference between this implementation and cntk's one is that the slicing of
    the recurrent weights are different.

    pytorch is ifgo but cntk is igfo. And pytorch has 2 biases, but cntk only has one. In this implementation,
    i kept the biases to one to speed it up a little more.

    """
    activation = get_default_override(LSTM, activation=activation)
    ih_init = get_default_override(LSTM, ih_init=ih_init)
    ih_bias = get_default_override(LSTM, ih_bias=ih_bias)
    hh_init = get_default_override(LSTM, hh_init=hh_init)
    hh_bias = get_default_override(LSTM, hh_bias=hh_bias)

    stack_axis = - 1
    shape = _as_tuple(shape)
    cell_shape = shape
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[stack_axis]
    cell_shape_list[stack_axis] = stacked_dim * 4
    cell_shape_stacked = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times
    cell_shape_list[stack_axis] = stacked_dim * 4
    cell_shape_stacked_H = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times

    init_bias = ih_bias + hh_bias  # combine PyTorch's two biases into one
    b  = Parameter(            cell_shape_stacked,   init=init_bias,    name='b')                    # bias
    W  = Parameter(_INFERRED + cell_shape_stacked,   init=ih_init,      name='W')                    # input
    H  = Parameter(shape     + cell_shape_stacked_H, init=hh_init,      name='H')                    # hidden-to-hidden

    dropout = C.layers.Dropout(dropout_rate=weight_drop_rate, name='h_dropout') if weight_drop_rate is not None else None

    @C.BlockFunction('PT::LSTM', name)
    def lstm(dh, dc, x):
        # projected contribution from input(s), hidden, and bias

        dropped_H = dropout(H) if weight_drop_rate is not None else H
        proj4 = b + times(x, W) + times(dh, dropped_H)

        # slicing layout different from cntk's implementation
        it_proj  = slice(proj4, stack_axis, 0 * stacked_dim, 1 * stacked_dim)  # split along stack_axis
        ft_proj  = slice(proj4, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
        bit_proj = slice(proj4, stack_axis, 2 * stacked_dim, 3 * stacked_dim)  # g gate
        ot_proj  = slice(proj4, stack_axis, 3 * stacked_dim, 4 * stacked_dim)

        it = sigmoid(it_proj)                        # input gate(t)
        bit = it * activation(bit_proj)              # applied to tanh of input network

        ft = sigmoid(ft_proj)                        # forget-me-not gate(t)
        bft = ft * dc                                # applied to cell(t-1)

        ct = bft + bit                               # c(t) is sum of both

        ot = sigmoid(ot_proj)                        # output gate(t)
        ht = ot * activation(ct)                     # applied to tanh(cell(t))
        return ht, ct

    return lstm
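A hedged sketch of how weights from torch.nn.LSTM might be mapped onto the parameters above (assumptions: a single layer and no projection). PyTorch stores weight_ih_l0 and weight_hh_l0 with shape (4*hidden, in_dim) in i, f, g, o order, which is the order this implementation slices in, so only a transpose and a bias sum are needed:

import numpy as np

hidden, in_dim = 512, 300
rng = np.random.default_rng(0)
weight_ih = rng.normal(size=(4 * hidden, in_dim))   # stand-ins for the torch tensors
weight_hh = rng.normal(size=(4 * hidden, hidden))
bias_ih, bias_hh = rng.normal(size=4 * hidden), rng.normal(size=4 * hidden)

ih_init = np.ascontiguousarray(weight_ih.T)         # -> (in_dim, 4*hidden), the layout of W
hh_init = np.ascontiguousarray(weight_hh.T)         # -> (hidden, 4*hidden), the layout of H
init_bias = bias_ih + bias_hh                       # the single combined bias b
print(ih_init.shape, hh_init.shape, init_bias.shape)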
Example #8
def _RecurrentBlock(type, shape, cell_shape, activation, use_peepholes,
                    init, init_bias,
                    enable_self_stabilization, dropout_rate, seed,
                    name=''):
    '''
    Helper to create a recurrent block of type 'WeightDroppedLSTM', 'IndyLSTM', or 'IndRNN'.
    '''

    has_projection = cell_shape is not None

    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError("%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)
        # otherwise we'd need to fix slicing and Param initializers

    stack_axis = -1  # for efficient computation, we stack multiple variables (along the fastest-changing one, to match BS)
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[stack_axis]
    cell_shape_list[stack_axis] = stacked_dim * {
        'IndRNN': 1,
        'IndyLSTM': 4,
        'WeightDroppedLSTM': 4
    }[type]
    cell_shape_stacked = tuple(cell_shape_list)  # patched dims: stack_axis multiplied by the number of stacked gates (1 or 4)
    cell_shape_list[stack_axis] = stacked_dim * {
        'IndRNN': 1,
        'IndyLSTM': 4,
        'WeightDroppedLSTM': 4
    }[type]
    cell_shape_stacked_H = tuple(cell_shape_list)  # same stacking for the hidden-to-hidden weights

    # parameters
    b  = Parameter(            cell_shape_stacked,   init=init_bias, name='b')                              # bias
    W  = Parameter(_INFERRED + cell_shape_stacked,   init=init,      name='W')                              # input
    H  = Parameter(shape     + cell_shape_stacked_H, init=init,      name='H')                              # hidden-to-hidden
    H1 = Parameter(            cell_shape_stacked_H, init=init,      name='H1') if type == 'IndyLSTM' else None  # hidden-to-hidden
    H2 = Parameter(shape                 ,           init=init,      name='H2') if type == 'IndRNN' else None  # hidden-to-hidden
    Ci = Parameter(            cell_shape,           init=init,      name='Ci') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Cf = Parameter(            cell_shape,           init=init,      name='Cf') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Co = Parameter(            cell_shape,           init=init,      name='Co') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}

    Wmr = Parameter(cell_shape + shape, init=init, name='P') if has_projection else None  # final projection

    # each use of a stabilizer layer must get its own instance
    Sdh = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='dh_stabilizer')
    Sdc = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='dc_stabilizer')
    Sct = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='c_stabilizer')
    Sht = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='P_stabilizer')

    # DropConnect
    dropout = C.layers.Dropout(dropout_rate=dropout_rate, seed=seed, name='h_dropout')

    # define the model function itself
    # general interface for Recurrence():
    #   (all previous outputs delayed, input) --> (outputs and state)
    # where
    #  - the first output is the main output, e.g. 'h' for LSTM
    #  - the remaining outputs, if any, are additional state
    #  - if for some reason output != state, then output is still fed back and should just be ignored by the recurrent block

    # LSTM model function
    # in this case:
    #   (dh, dc, x) --> (h, c)
    def weight_dropped_lstm(dh, dc, x):

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + times(dhs, dropout(H))

        it_proj  = slice (proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
        bit_proj = slice (proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ft_proj  = slice (proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
        ot_proj  = slice (proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

        # helper to inject peephole connection if requested
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid(peep(it_proj, dcs, Ci))  # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation(bit_proj)  # applied to tanh of input network

        ft = sigmoid(peep(ft_proj, dcs, Cf))  # forget-me-not gate(t)
        bft = ft * dc  # applied to cell(t-1)

        ct = bft + bit  # c(t) is sum of both

        ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
        ht = ot * activation(ct)  # applied to tanh(cell(t))

        c = ct  # cell value
        h = times(Sht(ht), Wmr) if has_projection else ht

        return h, c

    # IndyLSTM model function
    # in this case:
    #   (dh, dc, x) --> (h, c)
    def indy_lstm(dh, dc, x):

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + C.splice(dhs, dhs, dhs, dhs) * H1  # dhs spliced 4 times to match the 4 stacked gates; H1 applied elementwise

        it_proj  = slice (proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
        bit_proj = slice (proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ft_proj  = slice (proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
        ot_proj  = slice (proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

        # helper to inject peephole connection if requested
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid(peep(it_proj, dcs, Ci))  # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation(bit_proj)  # applied to tanh of input network

        ft = sigmoid(peep(ft_proj, dcs, Cf))  # forget-me-not gate(t)
        bft = ft * dc  # applied to cell(t-1)

        ct = bft + bit  # c(t) is sum of both

        ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
        ht = ot * activation(ct)  # applied to tanh(cell(t))

        c = ct  # cell value
        h = times(Sht(ht), Wmr) if has_projection else ht

        return h, c

    def ind_rnn(dh, x):
        dhs = Sdh(dh)  # previous value, stabilized
        ht = activation(times(x, W) + dhs * H2 + b)
        h = times(Sht(ht), Wmr) if has_projection else ht
        return h

    function = {
        'IndRNN': ind_rnn,
        'IndyLSTM': indy_lstm,
        'WeightDroppedLSTM':    weight_dropped_lstm
    }[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)
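The distinctive piece of WeightDroppedLSTM is DropConnect: dropout is applied to the hidden-to-hidden weight matrix H itself rather than to the activations. The following NumPy sketch (names and shapes are illustrative only) shows the effect of the dropout(H) call inside proj4:

import numpy as np

hidden, rate = 6, 0.5
rng = np.random.default_rng(0)
H = rng.normal(size=(hidden, 4 * hidden))      # hidden-to-hidden weights

mask = rng.random(H.shape) >= rate             # zero out a random subset of the weights
dropped_H = (H * mask) / (1.0 - rate)          # inverted-dropout scaling at training time
dh = rng.normal(size=(hidden,))
recurrent_contrib = dh @ dropped_H             # mirrors: times(dhs, dropout(H))
print(recurrent_contrib.shape)                 # -> (24,)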