Example #1
def _sanitize_minibatch_source(minibatch_source,
                               model_inputs_to_streams,
                               criterion,
                               infinitely_repeat=True):
    '''
    Helper to wrap numpy/scipy data into a minibatch source.
    '''
    from ..io import MinibatchSource, UserMinibatchSource, MinibatchSourceFromData, INFINITELY_REPEAT
    if minibatch_source and not isinstance(
            minibatch_source, (MinibatchSource, UserMinibatchSource)
    ):  # UserMinibatchSource derives from cntk_py.SwigMinibatchSource, not MinibatchSource, for director purposes
        args = _as_tuple(
            minibatch_source
        )  # the minibatch_source is a tuple of numpy or scipy arrays that we construct a source around
        # args can also be a tuple of numpy/scipy arrays; we will construct on the fly
        if criterion is None:
            raise ValueError(
                "when passing data directly in place of a minibatch source, criterion must be given"
            )
        params = criterion.arguments
        if len(params) != len(args):
            raise ValueError(
                "to pass data directly in place of a minibatch source, pass a tuple of {} numpy or scipy arrays, in the order of the arguments of the criterion function. You passed {} value(s)"
                .format(len(params), len(args)))
        param_names = [
            param.name if param.name else "stream_%s" % i
            for i, param in enumerate(params)
        ]  # names are for debugging...
        if len(params) != len(
                set(param_names)
        ):  # ...and for stream names and thus must be unique. If multiple inputs have the same names...
            param_names = ["stream_%s" % i for i, _ in enumerate(params)
                           ]  # ...we fall back to generic names
        param_types = [param._type for param in params]
        max_samples = INFINITELY_REPEAT if infinitely_repeat else len(
            args[0])  # if not infinite then do one data pass
        minibatch_source = MinibatchSourceFromData(
            {
                name: (input, type)
                for name, input, type in zip(param_names, args,
                                             param_types)
            },
            max_samples=max_samples)
        if model_inputs_to_streams is not None:
            raise ValueError(
                "mapping must not be provided when data is passed directly"
            )
        model_inputs_to_streams = {
            param: minibatch_source.streams[name]
            for param, name in zip(params, param_names)
        }
    return minibatch_source, model_inputs_to_streams
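A minimal usage sketch, not part of the original listing, may help show the calling convention this helper expects: raw numpy arrays plus a criterion, with no explicit stream mapping. It assumes a standard CNTK 2.x environment (import cntk as C) and a toy squared-error criterion.

import numpy as np
import cntk as C

# illustrative setup: a tiny model and criterion to drive the helper above
x = C.input_variable(2, name='features')
y = C.input_variable(1, name='labels')
model = C.layers.Dense(1)(x)
criterion = C.squared_error(model, y)      # criterion.arguments determines the stream order

features = np.random.rand(100, 2).astype(np.float32)
labels = np.random.rand(100, 1).astype(np.float32)

# raw arrays in place of a MinibatchSource; the helper wraps them and builds the mapping
source, input_map = _sanitize_minibatch_source((features, labels), None, criterion,
                                               infinitely_repeat=False)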
Example #2
def gru_cell(shape, init=glorot_uniform(), name=''):  # (x, h) -> h
    """ GRU cell function
    """
    shape = _as_tuple(shape)

    if len(shape) != 1:
        raise ValueError("gru_cell: shape must be vectors (rank-1 tensors)")

    # determine stacking dimensions
    cell_shape_stacked = shape * 2  # patched dims with stack_axis duplicated 2 times

    # parameters
    Wz = Parameter(cell_shape_stacked, init=init, name='Wz')
    Wr = Parameter(cell_shape_stacked, init=init, name='Wr')
    Wh = Parameter(cell_shape_stacked, init=init, name='Wh')
    Uz = Parameter(_INFERRED + shape, init=init, name='Uz')
    Ur = Parameter(_INFERRED + shape, init=init, name='Ur')
    Uh = Parameter(_INFERRED + shape, init=init, name='Uh')

    def create_s_placeholder():
        # we pass the known dimensions here, which makes dimension inference easier
        return Placeholder(shape=shape, name='S')  # h (the single GRU state)

    # parameters to model function
    x = Placeholder(name='gru_block_arg')
    prev_status = create_s_placeholder()

    # formula of model function
    Sn_1 = prev_status

    z = sigmoid(times(x, Uz, name='x*Uz') + times(Sn_1, Wz, name='Sprev*Wz'),
                name='z')
    r = sigmoid(times(x, Ur, name='x*Ur') + times(Sn_1, Wr, name='Sprev*Wr'),
                name='r')
    h = tanh(times(x, Uh, name='x*Uh') +
             times(element_times(Sn_1, r, name='Sprev*r'), Wh),
             name='h')
    s = plus(element_times((1 - z), h, name='(1-z)*h'),
             element_times(z, Sn_1, name='z*SPrev'),
             name=name)
    apply_x_s = combine([s])
    apply_x_s.create_placeholder = create_s_placeholder
    return apply_x_s
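For reference, the graph built above corresponds to the standard GRU update; the plain-numpy sketch below (an illustration added here, not part of the original snippet) mirrors the same formula with the same weight names.

import numpy as np

def gru_step_numpy(x, s_prev, Uz, Wz, Ur, Wr, Uh, Wh):
    # illustrative numpy mirror of the symbolic gru_cell above
    sigmoid = lambda v: 1.0 / (1.0 + np.exp(-v))
    z = sigmoid(x @ Uz + s_prev @ Wz)        # update gate
    r = sigmoid(x @ Ur + s_prev @ Wr)        # reset gate
    h = np.tanh(x @ Uh + (s_prev * r) @ Wh)  # candidate state
    return (1 - z) * h + z * s_prev          # new state s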
Example #3
def _sanitize_minibatch_source(minibatch_source, model_inputs_to_streams, criterion, infinitely_repeat=True):
    '''
    Helper to wrap numpy/scipy data into a minibatch source.
    '''
    from ..io import MinibatchSource, UserMinibatchSource, MinibatchSourceFromData, INFINITELY_REPEAT
    if minibatch_source and not isinstance(minibatch_source, (MinibatchSource, UserMinibatchSource)): # UserMinibatchSource derives from cntk_py.SwigMinibatchSource, not MinibatchSource, for director purposes
        args = _as_tuple(minibatch_source) # the minibatch_source is a tuple of numpy or scipy arrays that we construct a source around
        # args can also be a tuple of numpy/scipy arrays; we will construct on the fly
        if criterion is None:
            raise ValueError("when passing data directly in place of a minibatch source, criterion must be given")
        params = criterion.arguments
        if len(params) != len(args):
            raise ValueError("to pass data directly in place of a minibatch source, pass a tuple of {} numpy or scipy arrays, in the order of the arguments of the criterion function. You passed {} value(s)"
                             .format(len(params), len(args)))
        param_names = [param.name if param.name else "stream_%s" % i for i, param in enumerate(params)] # names are for debugging...
        if len(params) != len(set(param_names)): # ...and for stream names and thus must be unique. If multiple inputs have the same names...
            param_names = ["stream_%s" % i for i, _ in enumerate(params)] # ...we fall back to generic names
        param_types = [param._type for param in params]
        max_samples = INFINITELY_REPEAT if infinitely_repeat else len(args[0]) # if not infinite then do one data pass
        minibatch_source = MinibatchSourceFromData({name: (input, type) for name, input, type in zip(param_names, args, param_types)}, max_samples=max_samples)
        if model_inputs_to_streams is not None:
            raise ValueError("mapping must not be provided when data is passed directly")
        model_inputs_to_streams = {param: minibatch_source.streams[name] for param, name in zip(params, param_names)}
    return minibatch_source, model_inputs_to_streams
Example #4
def _RecurrentBlock(type,
                    shape,
                    cell_shape,
                    activation,
                    use_peepholes,
                    init,
                    init_bias,
                    enable_self_stabilization,
                    name=''):
    '''
    Helper to create a recurrent block of type 'LSTM', 'GRU', or RNNUnit.
    '''

    has_projection = cell_shape is not None

    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError(
            "%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)
        # otherwise we'd need to fix slicing and Param initializers

    stack_axis = -1  # for efficient computation, we stack multiple variables (along the fastest-changing one, to match BS)
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[stack_axis]
    cell_shape_list[stack_axis] = stacked_dim * {
        'RNNUnit': 1,
        'GRU': 3,
        'LSTM': 4
    }[type]
    cell_shape_stacked = tuple(
        cell_shape_list)  # patched dims with stack_axis duplicated per gate (1x/3x/4x by type)
    cell_shape_list[stack_axis] = stacked_dim * {
        'RNNUnit': 1,
        'GRU': 2,
        'LSTM': 4
    }[type]
    cell_shape_stacked_H = tuple(
        cell_shape_list)  # patched dims with stack_axis duplicated per gate (1x/2x/4x by type)

    # parameters
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')  # bias
    W = Parameter(_INFERRED + cell_shape_stacked, init=init, name='W')  # input
    H = Parameter(shape + cell_shape_stacked_H, init=init,
                  name='H')  # hidden-to-hidden
    H1 = Parameter(shape + cell_shape, init=init,
                   name='H1') if type == 'GRU' else None  # hidden-to-hidden
    Ci = Parameter(
        cell_shape, init=init, name='Ci'
    ) if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Cf = Parameter(
        cell_shape, init=init, name='Cf'
    ) if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Co = Parameter(
        cell_shape, init=init, name='Co'
    ) if use_peepholes else None  # cell-to-hidden {note: applied elementwise}

    Wmr = Parameter(cell_shape + shape, init=init,
                    name='P') if has_projection else None  # final projection

    # each use of a stabilizer layer must get its own instance
    Sdh = Stabilizer(enable_self_stabilization=enable_self_stabilization,
                     name='dh_stabilizer')
    Sdc = Stabilizer(enable_self_stabilization=enable_self_stabilization,
                     name='dc_stabilizer')
    Sct = Stabilizer(enable_self_stabilization=enable_self_stabilization,
                     name='c_stabilizer')
    Sht = Stabilizer(enable_self_stabilization=enable_self_stabilization,
                     name='P_stabilizer')

    # define the model function itself
    # general interface for Recurrence():
    #   (all previous outputs delayed, input) --> (outputs and state)
    # where
    #  - the first output is the main output, e.g. 'h' for LSTM
    #  - the remaining outputs, if any, are additional state
    #  - if for some reason output != state, then output is still fed back and should just be ignored by the recurrent block

    # LSTM model function
    # in this case:
    #   (dh, dc, x) --> (h, c)
    def lstm(dh, dc, x):

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + times(dhs, H)

        it_proj = slice(proj4, stack_axis, 0 * stacked_dim,
                        1 * stacked_dim)  # split along stack_axis
        bit_proj = slice(proj4, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
        ft_proj = slice(proj4, stack_axis, 2 * stacked_dim, 3 * stacked_dim)
        ot_proj = slice(proj4, stack_axis, 3 * stacked_dim, 4 * stacked_dim)

        # helper to inject peephole connection if requested
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid(peep(it_proj, dcs, Ci))  # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation(bit_proj)  # applied to tanh of input network

        ft = sigmoid(peep(ft_proj, dcs, Cf))  # forget-me-not gate(t)
        bft = ft * dc  # applied to cell(t-1)

        ct = bft + bit  # c(t) is sum of both

        ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
        ht = ot * activation(ct)  # applied to tanh(cell(t))

        c = ct  # cell value
        h = times(Sht(ht), Wmr) if has_projection else \
            ht

        # returns the new state as a tuple with names but order matters
        #return (Function.NamedOutput(h=h), Function.NamedOutput(c=c))
        return (h, c)

    # GRU model function
    # in this case:
    #   (dh, x) --> (h)
    # e.g. https://en.wikipedia.org/wiki/Gated_recurrent_unit
    def gru(dh, x):

        dhs = Sdh(dh)  # previous value, stabilized
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        projx3 = b + times(x, W)
        projh2 = times(dhs, H)

        zt_proj = slice(
            projx3, stack_axis, 0 * stacked_dim, 1 * stacked_dim) + slice(
                projh2, stack_axis, 0 * stacked_dim, 1 * stacked_dim)
        rt_proj = slice(
            projx3, stack_axis, 1 * stacked_dim, 2 * stacked_dim) + slice(
                projh2, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
        ct_proj = slice(projx3, stack_axis, 2 * stacked_dim, 3 * stacked_dim)

        zt = sigmoid(zt_proj)  # update gate z(t)

        rt = sigmoid(rt_proj)  # reset gate r(t)

        rs = dhs * rt  # "cell" c
        ct = activation(ct_proj + times(rs, H1))

        ht = (1 - zt) * ct + zt * dhs  # hidden state ht / output

        # for comparison: CUDNN_GRU
        # i(t) = sigmoid(W_i x(t) +          R_i h(t-1)  + b_Wi + b_Ru)
        # r(t) = sigmoid(W_r x(t) +          R_r h(t-1)  + b_Wr + b_Rr)   --same up to here
        # h'(t) =   tanh(W_h x(t) + r(t) .* (R_h h(t-1)) + b_Wh + b_Rh)   --r applied after projection? Would make life easier!
        # h(t) = (1 - i(t) .* h'(t)) + i(t) .* h(t-1)                     --TODO: need to confirm bracketing with NVIDIA

        h = times(Sht(ht), Wmr) if has_projection else \
            ht

        # returns the new state as a tuple with names but order matters
        #return Function.NamedOutput(h=h)
        return h

    def rnn(dh, x):
        dhs = Sdh(dh)  # previous value, stabilized
        ht = activation(times(x, W) + times(dhs, H) + b)
        h = times(Sht(ht), Wmr) if has_projection else \
            ht
        #return Function.NamedOutput(h=h)
        return h

    function = {'RNNUnit': rnn, 'GRU': gru, 'LSTM': lstm}[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)
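In CNTK, the public LSTM, GRU, and RNNUnit layer factories are thin wrappers around a helper of this shape. A hedged usage sketch (standard cntk.layers API, not taken from this listing) of running such a block over a sequence:

import cntk as C

x = C.sequence.input_variable(10)              # a sequence of 10-dimensional vectors
lstm = C.layers.Recurrence(C.layers.LSTM(64))  # Recurrence wires (dh, dc, x) -> (h, c)
h = lstm(x)                                    # per-step hidden state of dimension 64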
Example #5
def IndRNNBlock(type,
                shape,
                cell_shape,
                activation,
                use_peepholes,
                init,
                init_bias,
                enable_self_stabilization,
                name=''):
    '''
    Helper to create an IndRNN-style recurrent block of type 'RNNStep'.
    '''

    has_projection = cell_shape is not None

    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError(
            "%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)
        # otherwise we'd need to fix slicing and Param initializers

    stack_axis = -1  # for efficient computation, we stack multiple variables (along the fastest-changing one, to match BS)
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[stack_axis]
    cell_shape_list[stack_axis] = stacked_dim * 1
    cell_shape_stacked = tuple(
        cell_shape_list)  # single gate, so no duplication along stack_axis
    cell_shape_list[stack_axis] = stacked_dim * 1
    cell_shape_stacked_H = tuple(
        cell_shape_list)  # single gate, so no duplication along stack_axis

    # parameters
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')  # bias
    W = Parameter(_INFERRED + cell_shape_stacked, init=init, name='W')  # input
    H = Parameter(cell_shape_stacked_H, init=init,
                  name='H')  # hidden-to-hidden

    Wmr = Parameter(cell_shape + shape, init=init,
                    name='P') if has_projection else None  # final projection

    # each use of a stabilizer layer must get its own instance
    Sdh = Stabilizer(enable_self_stabilization=enable_self_stabilization,
                     name='dh_stabilizer')
    Sdc = Stabilizer(enable_self_stabilization=enable_self_stabilization,
                     name='dc_stabilizer')
    Sct = Stabilizer(enable_self_stabilization=enable_self_stabilization,
                     name='c_stabilizer')
    Sht = Stabilizer(enable_self_stabilization=enable_self_stabilization,
                     name='P_stabilizer')

    def rnn_step(dh, x):
        dhs = Sdh(dh)  # previous value, stabilized
        ht = activation(times(x, W) + dhs * H + b)
        h = times(Sht(ht), Wmr) if has_projection else \
            ht
        return h

    function = {
        'RNNStep': rnn_step,
    }[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)
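The rnn_step above applies the recurrent weight H elementwise to the previous state (IndRNN style) instead of through a full matrix product. A small numpy illustration of the same update, added here for clarity:

import numpy as np

def ind_rnn_step_numpy(x, h_prev, W, u, b, activation=np.tanh):
    # illustrative mirror of rnn_step: W is an (input_dim, hidden_dim) input projection,
    # u is a (hidden_dim,) vector applied elementwise to the previous state
    return activation(x @ W + h_prev * u + b)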
Example #6
def _RecurrentBlock(type, shape, cell_shape, activation, use_peepholes,
                    init, init_bias,
                    enable_self_stabilization,
                    name=''):
    '''
    Helper to create a recurrent block of type 'LSTM', 'GRU', or RNNUnit.
    '''

    has_projection = cell_shape is not None

    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError("%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)
        # otherwise we'd need to fix slicing and Param initializers

    stack_axis = -1  # for efficient computation, we stack multiple variables (along the fastest-changing one, to match BS)
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[stack_axis]
    cell_shape_list[stack_axis] = stacked_dim * {
        'RNNUnit': 1,
        'GRU': 3,
        'LSTM': 4
    }[type]
    cell_shape_stacked = tuple(cell_shape_list)  # patched dims with stack_axis duplicated per gate (1x/3x/4x by type)
    cell_shape_list[stack_axis] = stacked_dim * {
        'RNNUnit': 1,
        'GRU': 2,
        'LSTM': 4
    }[type]
    cell_shape_stacked_H = tuple(cell_shape_list)  # patched dims with stack_axis duplicated per gate (1x/2x/4x by type)

    # parameters
    b  = Parameter(            cell_shape_stacked,   init=init_bias, name='b')                              # bias
    W  = Parameter(_INFERRED + cell_shape_stacked,   init=init,      name='W')                              # input
    H  = Parameter(shape     + cell_shape_stacked_H, init=init,      name='H')                              # hidden-to-hidden
    H1 = Parameter(shape     + cell_shape,           init=init,      name='H1') if type == 'GRU' else None  # hidden-to-hidden
    Ci = Parameter(            cell_shape,           init=init,      name='Ci') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Cf = Parameter(            cell_shape,           init=init,      name='Cf') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Co = Parameter(            cell_shape,           init=init,      name='Co') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}

    Wmr = Parameter(cell_shape + shape, init=init, name='P') if has_projection else None  # final projection

    # each use of a stabilizer layer must get its own instance
    Sdh = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='dh_stabilizer')
    Sdc = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='dc_stabilizer')
    Sct = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='c_stabilizer')
    Sht = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='P_stabilizer')

    # define the model function itself
    # general interface for Recurrence():
    #   (all previous outputs delayed, input) --> (outputs and state)
    # where
    #  - the first output is the main output, e.g. 'h' for LSTM
    #  - the remaining outputs, if any, are additional state
    #  - if for some reason output != state, then output is still fed back and should just be ignored by the recurrent block

    # LSTM model function
    # in this case:
    #   (dh, dc, x) --> (h, c)
    def lstm(dh, dc, x):

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + times(dhs, H)

        it_proj  = slice (proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
        bit_proj = slice (proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ft_proj  = slice (proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
        ot_proj  = slice (proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

        # helper to inject peephole connection if requested
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid (peep (it_proj, dcs, Ci))        # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation (bit_proj)              # applied to tanh of input network

        ft = sigmoid (peep (ft_proj, dcs, Cf))        # forget-me-not gate(t)
        bft = ft * dc                                 # applied to cell(t-1)

        ct = bft + bit                                # c(t) is sum of both

        ot = sigmoid (peep (ot_proj, Sct(ct), Co))    # output gate(t)
        ht = ot * activation (ct)                     # applied to tanh(cell(t))

        c = ct                                        # cell value
        h = times(Sht(ht), Wmr) if has_projection else \
            ht

        # returns the new state as a tuple with names but order matters
        return (Function.NamedOutput(h=h), Function.NamedOutput(c=c))

    # GRU model function
    # in this case:
    #   (dh, x) --> (h)
    # e.g. https://en.wikipedia.org/wiki/Gated_recurrent_unit
    def gru(dh, x):

        dhs = Sdh(dh)  # previous value, stabilized
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        projx3 = b + times(x, W)
        projh2  = times(dhs, H)

        zt_proj = slice (projx3, stack_axis, 0*stacked_dim, 1*stacked_dim) + slice (projh2, stack_axis, 0*stacked_dim, 1*stacked_dim)
        rt_proj = slice (projx3, stack_axis, 1*stacked_dim, 2*stacked_dim) + slice (projh2, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ct_proj = slice (projx3, stack_axis, 2*stacked_dim, 3*stacked_dim)

        zt = sigmoid (zt_proj)        # update gate z(t)

        rt = sigmoid (rt_proj)        # reset gate r(t)

        rs = dhs * rt        # "cell" c
        ct = activation (ct_proj + times(rs, H1))

        ht = (1 - zt) * ct + zt * dhs # hidden state ht / output

        # for comparison: CUDNN_GRU
        # i(t) = sigmoid(W_i x(t) +          R_i h(t-1)  + b_Wi + b_Ru)
        # r(t) = sigmoid(W_r x(t) +          R_r h(t-1)  + b_Wr + b_Rr)   --same up to here
        # h'(t) =   tanh(W_h x(t) + r(t) .* (R_h h(t-1)) + b_Wh + b_Rh)   --r applied after projection? Would make life easier!
        # h(t) = (1 - i(t) .* h'(t)) + i(t) .* h(t-1)                     --TODO: need to confirm bracketing with NVIDIA

        h = times(Sht(ht), Wmr) if has_projection else \
            ht

        # returns the new state as a tuple with names but order matters
        return Function.NamedOutput(h=h)

    def rnn(dh, x):
        dhs = Sdh(dh)  # previous value, stabilized
        ht = activation (times(x, W) + times(dhs, H) + b)
        h = times(Sht(ht), Wmr) if has_projection else \
            ht
        return Function.NamedOutput(h=h)

    function = {
        'RNNUnit': rnn,
        'GRU':     gru,
        'LSTM':    lstm
    }[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)
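The lstm function above computes all four gate projections with a single stacked matrix product and then splits proj4 along the stacked axis. The numpy sketch below (illustrative only, without stabilizers, peepholes, or the output projection) shows the same slicing and gate arithmetic:

import numpy as np

def lstm_step_numpy(dh, dc, x, W, H, b, activation=np.tanh):
    # illustrative mirror of the lstm() closure above
    sigmoid = lambda v: 1.0 / (1.0 + np.exp(-v))
    proj4 = b + x @ W + dh @ H                                # stacked projection, width 4*hidden_dim
    it_proj, bit_proj, ft_proj, ot_proj = np.split(proj4, 4)  # CNTK gate order: i, g, f, o
    it = sigmoid(it_proj)            # input gate
    bit = it * activation(bit_proj)  # gated candidate input
    ft = sigmoid(ft_proj)            # forget gate
    ct = ft * dc + bit               # new cell state
    ot = sigmoid(ot_proj)            # output gate
    ht = ot * activation(ct)         # new hidden state
    return ht, ct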
Example #7
def _RecurrentBlock(type,
                    shape,
                    cell_shape,
                    activation,
                    use_peepholes,
                    init,
                    init_bias,
                    enable_self_stabilization,
                    name=''):

    has_projection = cell_shape is not None
    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError(
            "%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)

    stack_axis = -1
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    cell_shape_list_W = list(cell_shape)
    cell_shape_list_H = list(cell_shape)
    cell_shape_stacked = tuple(cell_shape_list)
    shape_list = list(shape)

    # slot value pair onehot vector dimension
    sv_dim = cell_shape_list[stack_axis]
    stacked_dim = shape_list[stack_axis]
    sv_shape_stacked = tuple([sv_dim])

    # 3*hidden_dim
    cell_shape_list[stack_axis] = stacked_dim * {'LSTM': 3}[type]
    cell_shape_stacked = tuple(cell_shape_list)

    # 2*hidden_dim + sv_dim
    cell_shape_list_H[stack_axis] = stacked_dim * {'LSTM': 2}[type] + sv_dim
    cell_shape_stacked_H = tuple(cell_shape_list_H)

    cell_shape_list_W[stack_axis] = stacked_dim * {'LSTM': 2}[type]
    cell_shape_stacked_W = tuple(cell_shape_list_W)

    # parameters
    b = Parameter(cell_shape_stacked, init=init_bias, name='b')  # bias
    brg = Parameter(sv_shape_stacked, init=init_bias, name='brg')  # bias
    W = Parameter(_INFERRED + cell_shape_stacked, init=init, name='W')
    Wrg = Parameter(_INFERRED + sv_shape_stacked, init=init, name='Wrg')
    Wcx = Parameter(_INFERRED + shape, init=init, name='Wcx')

    H = Parameter(shape + cell_shape_stacked, init=init, name='H')
    Hrg = Parameter(shape + sv_shape_stacked, init=init, name='Hrg')
    Hcx = Parameter(shape + shape, init=init, name='Hcx')
    Hsv = Parameter(sv_shape_stacked + cell_shape_stacked,
                    init=init,
                    name='Hsv')
    Hsvrg = Parameter(sv_shape_stacked + sv_shape_stacked,
                      init=init,
                      name='Hsvrg')
    Wfc = Parameter(sv_shape_stacked + shape, init=init, name='Wfc')

    # LSTM model function
    # in this case:
    #   (dh, dc, sv, x) --> (h, c, sv)

    def lstm(dh, dc, sv, x):

        # projected contribution from input(s), hidden, and bias
        proj3 = b + times(x, W) + times(dh, H) + times(sv, Hsv)

        it_proj = slice(proj3, stack_axis, 0 * stacked_dim, 1 * stacked_dim)
        ft_proj = slice(proj3, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
        ot_proj = slice(proj3, stack_axis, 2 * stacked_dim, 3 * stacked_dim)

        it = sigmoid(it_proj)  # input gate(t)
        ft = sigmoid(ft_proj)  # forget-me-not gate(t)
        ot = sigmoid(ot_proj)  # output gate(t)

        # the following is the reading gate
        proj3rg = sigmoid(
            times(x, Wrg) + times(dh, Hrg) + times(sv, Hsvrg) + brg)
        v = proj3rg * sv

        cx_t = tanh(times(x, Wcx) + times(dh, Hcx))

        # need to do stabilization ??
        # update memory cell
        c = it * cx_t + ft * dc + tanh(times(v, Wfc))

        h = ot * tanh(c)

        return (h, c, v)

    function = {'LSTM': lstm}[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)
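This variant extends the LSTM with a slot-value memory sv and a reading gate. The numpy sketch below is a schematic restatement of the update computed by lstm() above, using the same parameter names (it is added for clarity and is not part of the original snippet):

import numpy as np

def sv_lstm_step_numpy(dh, dc, sv, x, p):
    # illustrative mirror; p is a dict holding W, H, Hsv, b, Wrg, Hrg, Hsvrg, brg, Wcx, Hcx, Wfc
    sigmoid = lambda v: 1.0 / (1.0 + np.exp(-v))
    proj3 = p['b'] + x @ p['W'] + dh @ p['H'] + sv @ p['Hsv']
    it, ft, ot = (sigmoid(g) for g in np.split(proj3, 3))   # input / forget / output gates
    rg = sigmoid(x @ p['Wrg'] + dh @ p['Hrg'] + sv @ p['Hsvrg'] + p['brg'])
    v = rg * sv                                             # gated read of the sv memory
    cx = np.tanh(x @ p['Wcx'] + dh @ p['Hcx'])
    c = it * cx + ft * dc + np.tanh(v @ p['Wfc'])           # memory cell update
    h = ot * np.tanh(c)
    return h, c, v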
Example #8
def dense_factored(shapes, #(shape1, shape2)
                  activation=default_override_or(identity),
                  init={'W1':None, 'W2':None},
                  input_rank=None,
                  map_rank=None,
                  bias=default_override_or(True),
                  init_bias=default_override_or(0),
                  name=''):
    '''
    Create the new model using the factored weight matrices W1 and W2.
    The returned function represents the new model.

    Args:
        shapes                  : dimensions of the input matrices.
        activation              : activation function used for the model.
        init                    : the two matrices corresponding to the factorization.
        input_rank              : rank of the input tensor.
        map_rank                : ???
        bias                    : bias for the model.
        init_bias               : initial bias value.
        name                    : name of the block function that creates the new model.
        
    Returns:
        a model that is factored and projected (reduced).
    '''

    # matthaip: Not sure how to handle input tensor of rank > 1
    # or selective flattening of ranks
    assert(input_rank is None and
           map_rank is None and
           all(isinstance(s,int) for s in list(shapes)))

    activation = get_default_override(cntk.layers.Dense, activation=activation)
    bias       = get_default_override(cntk.layers.Dense, bias=bias)
    init_bias  = get_default_override(cntk.layers.Dense, init_bias=init_bias)
    # how to use get_default_override for the init parameter?

    output_shape1 = _as_tuple(shapes[0])
    output_shape2 = _as_tuple(shapes[1])
    if input_rank is not None and map_rank is not None:
        raise ValueError("Dense: input_rank and map_rank cannot be specified at the same time.")


    # If input_rank not given then pass a single _INFERRED; 
    # map_rank if given will determine the input_rank.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED

    # parameters bound to this Function
    #    init_weights = _initializer_for(init, Record(output_rank=output_rank))
    init_weights = init
    W1 = Parameter(input_shape + output_shape1, init=init_weights['W1'], name='W1')
    W2 = Parameter(output_shape1 + output_shape2, init=init_weights['W2'], name='W2')
    b = Parameter(output_shape2, init=init_bias,    name='b') if bias else None

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        r = times(x, W1)
        r = times(r, W2)
        if b:
            r = r + b
        if activation is not None:
            r = activation(r)
        return r
    return dense
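A hedged usage sketch (it assumes import cntk as C in addition to the imports the snippet already relies on): build a 300 -> 64 -> 10 factored projection and apply it to an input.

import cntk as C

layer = dense_factored((64, 10),
                       activation=C.relu,
                       init={'W1': C.glorot_uniform(), 'W2': C.glorot_uniform()})
x = C.input_variable(300)
y = layer(x)   # after shape inference: W1 is 300 x 64, W2 is 64 x 10, output dim 10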
Example #9
def create_model(params: model_params):
    """
  Create ReasoNet model
  Args:
    params (class:`model_params`): The parameters used to create the model
  """
    logger.log(
        "Create model: dropout_rate: {0}, init:{1}, embedding_init: {2}".
        format(params.dropout_rate, params.init, params.embedding_init))
    # Query and Doc/Context/Paragraph inputs to the model
    query_seq_axis = Axis('sourceAxis')
    context_seq_axis = Axis('contextAxis')
    query_sequence = sequence.input(shape=(params.vocab_dim),
                                    is_sparse=True,
                                    sequence_axis=query_seq_axis,
                                    name='query')
    context_sequence = sequence.input(shape=(params.vocab_dim),
                                      is_sparse=True,
                                      sequence_axis=context_seq_axis,
                                      name='context')
    entity_ids_mask = sequence.input(shape=(1, ),
                                     is_sparse=False,
                                     sequence_axis=context_seq_axis,
                                     name='entity_ids_mask')
    # embedding
    if params.embedding_init is None:
        embedding_init = create_random_matrix(params.vocab_dim,
                                              params.embedding_dim)
    else:
        embedding_init = params.embedding_init
    embedding = parameter(shape=(params.vocab_dim, params.embedding_dim),
                          init=None)
    embedding.value = embedding_init
    embedding_matrix = constant(embedding_init,
                                shape=(params.vocab_dim, params.embedding_dim))

    if params.dropout_rate is not None:
        query_embedding = ops.dropout(times(query_sequence, embedding),
                                      params.dropout_rate,
                                      name='query_embedding')
        context_embedding = ops.dropout(times(context_sequence, embedding),
                                        params.dropout_rate,
                                        name='context_embedding')
    else:
        query_embedding = times(query_sequence,
                                embedding,
                                name='query_embedding')
        context_embedding = times(context_sequence,
                                  embedding,
                                  name='context_embedding')

    contextGruW = Parameter(_INFERRED + _as_tuple(params.hidden_dim),
                            init=glorot_uniform(),
                            name='gru_params')
    queryGruW = Parameter(_INFERRED + _as_tuple(params.hidden_dim),
                          init=glorot_uniform(),
                          name='gru_params')

    entity_embedding = ops.times(context_sequence,
                                 embedding_matrix,
                                 name='constant_entity_embedding')
    # Unlike other words in the context, we keep the entity vectors fixed as a random vector so that each vector just means an identifier of different entities in the context and it has no semantic meaning
    full_context_embedding = ops.element_select(entity_ids_mask,
                                                entity_embedding,
                                                context_embedding)
    context_memory = ops.optimized_rnnstack(full_context_embedding,
                                            contextGruW,
                                            params.hidden_dim,
                                            1,
                                            True,
                                            recurrent_op='gru',
                                            name='context_mem')

    query_memory = ops.optimized_rnnstack(query_embedding,
                                          queryGruW,
                                          params.hidden_dim,
                                          1,
                                          True,
                                          recurrent_op='gru',
                                          name='query_mem')
    qfwd = ops.slice(sequence.last(query_memory),
                     -1,
                     0,
                     params.hidden_dim,
                     name='fwd')
    qbwd = ops.slice(sequence.first(query_memory),
                     -1,
                     params.hidden_dim,
                     params.hidden_dim * 2,
                     name='bwd')
    init_status = ops.splice(
        qfwd, qbwd,
        name='Init_Status')  # get last fwd status and first bwd status
    return attention_model(context_memory,
                           query_memory,
                           init_status,
                           params.hidden_dim,
                           params.attention_dim,
                           max_steps=params.max_rl_steps)
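A hedged sketch of how this factory might be invoked. model_params is assumed to expose the fields read above (vocab_dim, embedding_dim, hidden_dim, attention_dim, max_rl_steps, dropout_rate, init, embedding_init); its constructor and the concrete values shown are hypothetical.

# hypothetical parameter object; field names taken from the usages inside create_model
params = model_params(vocab_dim=101000,
                      embedding_dim=300,
                      hidden_dim=256,
                      attention_dim=384,
                      max_rl_steps=5,
                      dropout_rate=0.2,
                      init=glorot_uniform(),
                      embedding_init=None)   # None -> a random embedding matrix is created
model = create_model(params)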
Example #10
def dense_factored(
        shapes,  #(shape1, shape2)
        activation=default_override_or(identity),
        init={
            'W1': None,
            'W2': None
        },
        input_rank=None,
        map_rank=None,
        bias=default_override_or(True),
        init_bias=default_override_or(0),
        name=''):
    '''
    Create the new model using the factored weight matrices W1 and W2.
    The returned function represents the new model.

    Args:
        shapes                  : dimensions of the input matrices.
        activation              : activation function used for the model.
        init                    : the two matrices corresponding to the factorization.
        input_rank              : rank of the input tensor.
        map_rank                : ???
        bias                    : bias for the model.
        init_bias               : initial bias value.
        name                    : name of the block function that creates the new model.
        
    Returns:
        a model that is factored and projected (reduced).
    '''

    # matthaip: Not sure how to handle input tensor of rank > 1
    # or selective flattening of ranks
    assert (input_rank is None and map_rank is None
            and all(isinstance(s, int) for s in list(shapes)))

    activation = get_default_override(cntk.layers.Dense, activation=activation)
    bias = get_default_override(cntk.layers.Dense, bias=bias)
    init_bias = get_default_override(cntk.layers.Dense, init_bias=init_bias)
    # how to use get_default_override for the init parameter?

    output_shape1 = _as_tuple(shapes[0])
    output_shape2 = _as_tuple(shapes[1])
    if input_rank is not None and map_rank is not None:
        raise ValueError(
            "Dense: input_rank and map_rank cannot be specified at the same time."
        )

    # If input_rank not given then pass a single _INFERRED;
    # map_rank if given will determine the input_rank.
    # The dimension inference may still create multiple axes.
    input_shape = _INFERRED

    # parameters bound to this Function
    #    init_weights = _initializer_for(init, Record(output_rank=output_rank))
    init_weights = init
    W1 = Parameter(input_shape + output_shape1,
                   init=init_weights['W1'],
                   name='W1')
    W2 = Parameter(output_shape1 + output_shape2,
                   init=init_weights['W2'],
                   name='W2')
    b = Parameter(output_shape2, init=init_bias, name='b') if bias else None

    # expression of this function
    @BlockFunction('DenseFactored', name)
    def dense(x):
        r = times(x, W1)
        r = times(r, W2)
        if b:
            r = r + b
        if activation is not None:
            r = activation(r)
        return r

    return dense
Example #11
def LSTM(shape, activation=default_override_or(tanh), weight_drop_rate=None,
         ih_init=default_override_or(glorot_uniform()), ih_bias=default_override_or(0),
         hh_init=default_override_or(glorot_uniform()), hh_bias=default_override_or(0),
         name=''):
    """ PyTorch style implementation of LSTM. Used for loading pytorch pretrained models.

    This difference between this implementation and cntk's one is that the slicing of
    the recurrent weights are different.

    pytorch is ifgo but cntk is igfo. And pytorch has 2 biases, but cntk only has one. In this implementation,
    i kept the biases to one to speed it up a little more.

    """
    activation = get_default_override(LSTM, activation=activation)
    ih_init = get_default_override(LSTM, ih_init=ih_init)
    ih_bias = get_default_override(LSTM, ih_bias=ih_bias)
    hh_init = get_default_override(LSTM, hh_init=hh_init)
    hh_bias = get_default_override(LSTM, hh_bias=hh_bias)

    stack_axis = - 1
    shape = _as_tuple(shape)
    cell_shape = shape
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[stack_axis]
    cell_shape_list[stack_axis] = stacked_dim * 4
    cell_shape_stacked = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times
    cell_shape_list[stack_axis] = stacked_dim * 4
    cell_shape_stacked_H = tuple(cell_shape_list)  # patched dims with stack_axis duplicated 4 times

    init_bias = ih_bias + hh_bias  # combine both biases in pytorch into one
    b  = Parameter(            cell_shape_stacked,   init=init_bias,    name='b')                    # bias
    W  = Parameter(_INFERRED + cell_shape_stacked,   init=ih_init,      name='W')                    # input
    H  = Parameter(shape     + cell_shape_stacked_H, init=hh_init,      name='H')                    # hidden-to-hidden

    dropout = C.layers.Dropout(dropout_rate=weight_drop_rate, name='h_dropout') if weight_drop_rate is not None else None

    @C.BlockFunction('PT::LSTM', name)
    def lstm(dh, dc, x):
        # projected contribution from input(s), hidden, and bias

        dropped_H = dropout(H) if weight_drop_rate is not None else H
        proj4 = b + times(x, W) + times(dh, dropped_H)

        # slicing layout different from cntk's implementation
        it_proj  = slice(proj4, stack_axis, 0 * stacked_dim, 1 * stacked_dim)  # split along stack_axis
        ft_proj  = slice(proj4, stack_axis, 1 * stacked_dim, 2 * stacked_dim)
        bit_proj = slice(proj4, stack_axis, 2 * stacked_dim, 3 * stacked_dim)  # g gate
        ot_proj  = slice(proj4, stack_axis, 3 * stacked_dim, 4 * stacked_dim)

        it = sigmoid(it_proj)                        # input gate(t)
        bit = it * activation(bit_proj)              # applied to tanh of input network

        ft = sigmoid(ft_proj)                        # forget-me-not gate(t)
        bft = ft * dc                                # applied to cell(t-1)

        ct = bft + bit                               # c(t) is sum of both

        ot = sigmoid(ot_proj)                        # output gate(t)
        ht = ot * activation(ct)                     # applied to tanh(cell(t))
        return ht, ct

    return lstm
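The docstring's remark about gate order can be made concrete. The block above keeps the PyTorch layout and simply adjusts its slicing; the sketch below (illustrative, not needed by the code) shows what converting a PyTorch-ordered stacked weight into CNTK's order would look like:

import numpy as np

def pt_to_cntk_gate_order(W_stacked, axis=-1):
    # PyTorch stacks the gates as i, f, g, o; CNTK expects i, g, f, o,
    # so converting just swaps the f and g blocks along the stacked axis
    i, f, g, o = np.split(W_stacked, 4, axis=axis)
    return np.concatenate([i, g, f, o], axis=axis)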
Example #12
def _RecurrentBlock(type, shape, cell_shape, activation, use_peepholes,
                    init, init_bias,
                    enable_self_stabilization, dropout_rate, seed,
                    name=''):
    '''
    Helper to create a recurrent block of type 'WeightDroppedLSTM', 'IndyLSTM', or 'IndRNN'.
    '''

    has_projection = cell_shape is not None

    shape = _as_tuple(shape)

    cell_shape = _as_tuple(cell_shape) if cell_shape is not None else shape
    if len(shape) != 1 or len(cell_shape) != 1:
        raise ValueError("%s: shape and cell_shape must be vectors (rank-1 tensors)" % type)
        # otherwise we'd need to fix slicing and Param initializers

    stack_axis = -1  # for efficient computation, we stack multiple variables (along the fastest-changing one, to match BS)
    # determine stacking dimensions
    cell_shape_list = list(cell_shape)
    stacked_dim = cell_shape_list[stack_axis]
    cell_shape_list[stack_axis] = stacked_dim * {
        'IndRNN': 1,
        'IndyLSTM': 4,
        'WeightDroppedLSTM': 4
    }[type]
    cell_shape_stacked = tuple(cell_shape_list)  # patched dims with stack_axis duplicated per gate (1x for IndRNN, 4x otherwise)
    cell_shape_list[stack_axis] = stacked_dim * {
        'IndRNN': 1,
        'IndyLSTM': 4,
        'WeightDroppedLSTM': 4
    }[type]
    cell_shape_stacked_H = tuple(cell_shape_list)  # patched dims with stack_axis duplicated per gate (1x for IndRNN, 4x otherwise)

    # parameters
    b  = Parameter(            cell_shape_stacked,   init=init_bias, name='b')                              # bias
    W  = Parameter(_INFERRED + cell_shape_stacked,   init=init,      name='W')                              # input
    H  = Parameter(shape     + cell_shape_stacked_H, init=init,      name='H')                              # hidden-to-hidden
    H1 = Parameter(            cell_shape_stacked_H, init=init,      name='H1') if type == 'IndyLSTM' else None  # hidden-to-hidden
    H2 = Parameter(shape                 ,           init=init,      name='H2') if type == 'IndRNN' else None  # hidden-to-hidden
    Ci = Parameter(            cell_shape,           init=init,      name='Ci') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Cf = Parameter(            cell_shape,           init=init,      name='Cf') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}
    Co = Parameter(            cell_shape,           init=init,      name='Co') if use_peepholes else None  # cell-to-hidden {note: applied elementwise}

    Wmr = Parameter(cell_shape + shape, init=init, name='P') if has_projection else None  # final projection

    # each use of a stabilizer layer must get its own instance
    Sdh = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='dh_stabilizer')
    Sdc = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='dc_stabilizer')
    Sct = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='c_stabilizer')
    Sht = Stabilizer(enable_self_stabilization=enable_self_stabilization, name='P_stabilizer')

    # DropConnect
    dropout = C.layers.Dropout(dropout_rate=dropout_rate, seed=seed, name='h_dropout')

    # define the model function itself
    # general interface for Recurrence():
    #   (all previous outputs delayed, input) --> (outputs and state)
    # where
    #  - the first output is the main output, e.g. 'h' for LSTM
    #  - the remaining outputs, if any, are additional state
    #  - if for some reason output != state, then output is still fed back and should just be ignored by the recurrent block

    # LSTM model function
    # in this case:
    #   (dh, dc, x) --> (h, c)
    def weight_dropped_lstm(dh, dc, x):

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + times(dhs, dropout(H))

        it_proj  = slice (proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
        bit_proj = slice (proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ft_proj  = slice (proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
        ot_proj  = slice (proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

        # helper to inject peephole connection if requested
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid(peep(it_proj, dcs, Ci))  # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation(bit_proj)  # applied to tanh of input network

        ft = sigmoid(peep(ft_proj, dcs, Cf))  # forget-me-not gate(t)
        bft = ft * dc  # applied to cell(t-1)

        ct = bft + bit  # c(t) is sum of both

        ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
        ht = ot * activation(ct)  # applied to tanh(cell(t))

        c = ct  # cell value
        h = times(Sht(ht), Wmr) if has_projection else ht

        return h, c

    # LSTM model function
    # in this case:
    #   (dh, dc, x) --> (h, c)
    def indy_lstm(dh, dc, x):

        dhs = Sdh(dh)  # previous values, stabilized
        dcs = Sdc(dc)
        # note: input does not get a stabilizer here, user is meant to do that outside

        # projected contribution from input(s), hidden, and bias
        proj4 = b + times(x, W) + C.splice(dhs, dhs, dhs, dhs) * H1  # 4 is the number of stacked dim

        it_proj  = slice (proj4, stack_axis, 0*stacked_dim, 1*stacked_dim)  # split along stack_axis
        bit_proj = slice (proj4, stack_axis, 1*stacked_dim, 2*stacked_dim)
        ft_proj  = slice (proj4, stack_axis, 2*stacked_dim, 3*stacked_dim)
        ot_proj  = slice (proj4, stack_axis, 3*stacked_dim, 4*stacked_dim)

        # helper to inject peephole connection if requested
        def peep(x, c, C):
            return x + C * c if use_peepholes else x

        it = sigmoid(peep(it_proj, dcs, Ci))  # input gate(t)
        # TODO: should both activations be replaced?
        bit = it * activation(bit_proj)  # applied to tanh of input network

        ft = sigmoid(peep(ft_proj, dcs, Cf))  # forget-me-not gate(t)
        bft = ft * dc  # applied to cell(t-1)

        ct = bft + bit  # c(t) is sum of both

        ot = sigmoid(peep(ot_proj, Sct(ct), Co))  # output gate(t)
        ht = ot * activation(ct)  # applied to tanh(cell(t))

        c = ct  # cell value
        h = times(Sht(ht), Wmr) if has_projection else ht

        return h, c

    def ind_rnn(dh, x):
        dhs = Sdh(dh)  # previous value, stabilized
        ht = activation(times(x, W) + dhs * H2 + b)
        h = times(Sht(ht), Wmr) if has_projection else ht
        return h

    function = {
        'IndRNN': ind_rnn,
        'IndyLSTM': indy_lstm,
        'WeightDroppedLSTM':    weight_dropped_lstm
    }[type]

    # return the corresponding lambda as a CNTK Function
    return BlockFunction(type, name)(function)
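A hedged instantiation sketch (argument order taken from the signature above; the dimensions and dropout rate are illustrative): build a weight-dropped LSTM block and run it over a sequence with Recurrence.

import cntk as C

wd_lstm = _RecurrentBlock('WeightDroppedLSTM', shape=256, cell_shape=None,
                          activation=C.tanh, use_peepholes=False,
                          init=C.glorot_uniform(), init_bias=0,
                          enable_self_stabilization=False,
                          dropout_rate=0.2, seed=1)
x = C.sequence.input_variable(128)
h = C.layers.Recurrence(wd_lstm)(x)   # DropConnect: dropout is applied to the recurrent weight H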